Commit 907aed48f65efeecf91575397e3d79335d93a466

Authored by Mel Gorman
Committed by Linus Torvalds
1 parent b37f1dd0f5

mm: allow PF_MEMALLOC from softirq context

This is needed to allow network softirq packet processing to make use of
PF_MEMALLOC.

Currently softirq context cannot use PF_MEMALLOC because it is not
associated with a task and therefore has no task flags to manipulate -
thus the gfp-to-alloc-flags mapping ignores task flags when in
interrupt (hard or soft) context.

Allowing softirqs to make use of PF_MEMALLOC therefore requires some
trickery.  This patch borrows the task flags from whatever process
happens to be preempted by the softirq.  It then modifies the
gfp-to-alloc-flags mapping so that task flags are no longer excluded in
softirq context, and modifies the softirq code to save, clear and
restore the PF_MEMALLOC flag.
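
As an illustration of the trick, here is a minimal sketch of the
save/clear/restore pattern just described (a sketch based on this
description, not the literal hunks of the commit; the helper and its
placement are illustrative):

    static inline void tsk_restore_flags(struct task_struct *task,
                                         unsigned long orig_flags,
                                         unsigned long flags)
    {
            task->flags &= ~flags;             /* drop the bits being restored */
            task->flags |= orig_flags & flags; /* put back their saved values */
    }

    asmlinkage void __do_softirq(void)
    {
            unsigned long old_flags = current->flags;

            /* Clear PF_MEMALLOC so the preempted task's flag cannot leak in. */
            current->flags &= ~PF_MEMALLOC;

            /* ... run the pending softirq handlers ... */

            /* Restore only PF_MEMALLOC so nothing leaks back out. */
            tsk_restore_flags(current, old_flags, PF_MEMALLOC);
    }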

The save and clear ensure that the preempted task's PF_MEMALLOC flag
does not leak into the softirq.  The restore ensures that a softirq's
PF_MEMALLOC flag cannot leak back into the preempted process.  This
should be safe for the following reasons:

Softirqs can run on multiple CPUs, but the same task should not be
        executing the same softirq code.  Neither should a softirq
        handler be preempted by any other softirq handler, so the
        flags should not leak to an unrelated softirq.

Softirqs re-enable hardware interrupts in __do_softirq(), so they can
        be preempted by hardware interrupts, and PF_MEMALLOC is then
        inherited by the hard IRQ.  However, this is no different from
        a process in reclaim being preempted by a hardirq.  While
        PF_MEMALLOC is set, gfp_to_alloc_flags() distinguishes between
        hard and soft IRQ context and avoids giving a hardirq the
        ALLOC_NO_WATERMARKS flag (see the sketch after this list).

If the softirq is deferred to ksoftirqd then its flags may be used
        instead of a normal task's, but as the softirq cannot be
        preempted, the PF_MEMALLOC flag does not leak to other code by
        accident.
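
As a companion to the second point above, a minimal sketch of the
hard-vs-soft IRQ distinction (the helper name is hypothetical and for
illustration only; the real check is part of gfp_to_alloc_flags() in
mm/page_alloc.c):

    static inline int memalloc_no_watermarks(void)
    {
            if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
                    return ALLOC_NO_WATERMARKS; /* softirq that borrowed PF_MEMALLOC */
            if (!in_interrupt() && (current->flags & PF_MEMALLOC))
                    return ALLOC_NO_WATERMARKS; /* process context under reclaim */
            return 0;                           /* otherwise, respect the watermarks */
    }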

[davem@davemloft.net: Document why PF_MEMALLOC is safe]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 21 additions and 1 deletion

include/linux/sched.h
1 #ifndef _LINUX_SCHED_H 1 #ifndef _LINUX_SCHED_H
2 #define _LINUX_SCHED_H 2 #define _LINUX_SCHED_H
3 3
4 /* 4 /*
5 * cloning flags: 5 * cloning flags:
6 */ 6 */
7 #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ 7 #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
8 #define CLONE_VM 0x00000100 /* set if VM shared between processes */ 8 #define CLONE_VM 0x00000100 /* set if VM shared between processes */
9 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ 9 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */
10 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ 10 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */
11 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ 11 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
12 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ 12 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
13 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ 13 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
14 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ 14 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
15 #define CLONE_THREAD 0x00010000 /* Same thread group? */ 15 #define CLONE_THREAD 0x00010000 /* Same thread group? */
16 #define CLONE_NEWNS 0x00020000 /* New namespace group? */ 16 #define CLONE_NEWNS 0x00020000 /* New namespace group? */
17 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ 17 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
18 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ 18 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
19 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ 19 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
20 #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ 20 #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
21 #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ 21 #define CLONE_DETACHED 0x00400000 /* Unused, ignored */
22 #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ 22 #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
23 #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ 23 #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
24 /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) 24 /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
25 and is now available for re-use. */ 25 and is now available for re-use. */
26 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ 26 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */
27 #define CLONE_NEWIPC 0x08000000 /* New ipcs */ 27 #define CLONE_NEWIPC 0x08000000 /* New ipcs */
28 #define CLONE_NEWUSER 0x10000000 /* New user namespace */ 28 #define CLONE_NEWUSER 0x10000000 /* New user namespace */
29 #define CLONE_NEWPID 0x20000000 /* New pid namespace */ 29 #define CLONE_NEWPID 0x20000000 /* New pid namespace */
30 #define CLONE_NEWNET 0x40000000 /* New network namespace */ 30 #define CLONE_NEWNET 0x40000000 /* New network namespace */
31 #define CLONE_IO 0x80000000 /* Clone io context */ 31 #define CLONE_IO 0x80000000 /* Clone io context */
32 32
33 /* 33 /*
34 * Scheduling policies 34 * Scheduling policies
35 */ 35 */
36 #define SCHED_NORMAL 0 36 #define SCHED_NORMAL 0
37 #define SCHED_FIFO 1 37 #define SCHED_FIFO 1
38 #define SCHED_RR 2 38 #define SCHED_RR 2
39 #define SCHED_BATCH 3 39 #define SCHED_BATCH 3
40 /* SCHED_ISO: reserved but not implemented yet */ 40 /* SCHED_ISO: reserved but not implemented yet */
41 #define SCHED_IDLE 5 41 #define SCHED_IDLE 5
42 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ 42 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
43 #define SCHED_RESET_ON_FORK 0x40000000 43 #define SCHED_RESET_ON_FORK 0x40000000
44 44
45 #ifdef __KERNEL__ 45 #ifdef __KERNEL__
46 46
47 struct sched_param { 47 struct sched_param {
48 int sched_priority; 48 int sched_priority;
49 }; 49 };
50 50
51 #include <asm/param.h> /* for HZ */ 51 #include <asm/param.h> /* for HZ */
52 52
53 #include <linux/capability.h> 53 #include <linux/capability.h>
54 #include <linux/threads.h> 54 #include <linux/threads.h>
55 #include <linux/kernel.h> 55 #include <linux/kernel.h>
56 #include <linux/types.h> 56 #include <linux/types.h>
57 #include <linux/timex.h> 57 #include <linux/timex.h>
58 #include <linux/jiffies.h> 58 #include <linux/jiffies.h>
59 #include <linux/rbtree.h> 59 #include <linux/rbtree.h>
60 #include <linux/thread_info.h> 60 #include <linux/thread_info.h>
61 #include <linux/cpumask.h> 61 #include <linux/cpumask.h>
62 #include <linux/errno.h> 62 #include <linux/errno.h>
63 #include <linux/nodemask.h> 63 #include <linux/nodemask.h>
64 #include <linux/mm_types.h> 64 #include <linux/mm_types.h>
65 65
66 #include <asm/page.h> 66 #include <asm/page.h>
67 #include <asm/ptrace.h> 67 #include <asm/ptrace.h>
68 #include <asm/cputime.h> 68 #include <asm/cputime.h>
69 69
70 #include <linux/smp.h> 70 #include <linux/smp.h>
71 #include <linux/sem.h> 71 #include <linux/sem.h>
72 #include <linux/signal.h> 72 #include <linux/signal.h>
73 #include <linux/compiler.h> 73 #include <linux/compiler.h>
74 #include <linux/completion.h> 74 #include <linux/completion.h>
75 #include <linux/pid.h> 75 #include <linux/pid.h>
76 #include <linux/percpu.h> 76 #include <linux/percpu.h>
77 #include <linux/topology.h> 77 #include <linux/topology.h>
78 #include <linux/proportions.h> 78 #include <linux/proportions.h>
79 #include <linux/seccomp.h> 79 #include <linux/seccomp.h>
80 #include <linux/rcupdate.h> 80 #include <linux/rcupdate.h>
81 #include <linux/rculist.h> 81 #include <linux/rculist.h>
82 #include <linux/rtmutex.h> 82 #include <linux/rtmutex.h>
83 83
84 #include <linux/time.h> 84 #include <linux/time.h>
85 #include <linux/param.h> 85 #include <linux/param.h>
86 #include <linux/resource.h> 86 #include <linux/resource.h>
87 #include <linux/timer.h> 87 #include <linux/timer.h>
88 #include <linux/hrtimer.h> 88 #include <linux/hrtimer.h>
89 #include <linux/task_io_accounting.h> 89 #include <linux/task_io_accounting.h>
90 #include <linux/latencytop.h> 90 #include <linux/latencytop.h>
91 #include <linux/cred.h> 91 #include <linux/cred.h>
92 #include <linux/llist.h> 92 #include <linux/llist.h>
93 #include <linux/uidgid.h> 93 #include <linux/uidgid.h>
94 94
95 #include <asm/processor.h> 95 #include <asm/processor.h>
96 96
97 struct exec_domain; 97 struct exec_domain;
98 struct futex_pi_state; 98 struct futex_pi_state;
99 struct robust_list_head; 99 struct robust_list_head;
100 struct bio_list; 100 struct bio_list;
101 struct fs_struct; 101 struct fs_struct;
102 struct perf_event_context; 102 struct perf_event_context;
103 struct blk_plug; 103 struct blk_plug;
104 104
105 /* 105 /*
106 * List of flags we want to share for kernel threads, 106 * List of flags we want to share for kernel threads,
107 * if only because they are not used by them anyway. 107 * if only because they are not used by them anyway.
108 */ 108 */
109 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) 109 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
110 110
111 /* 111 /*
112 * These are the constant used to fake the fixed-point load-average 112 * These are the constant used to fake the fixed-point load-average
113 * counting. Some notes: 113 * counting. Some notes:
114 * - 11 bit fractions expand to 22 bits by the multiplies: this gives 114 * - 11 bit fractions expand to 22 bits by the multiplies: this gives
115 * a load-average precision of 10 bits integer + 11 bits fractional 115 * a load-average precision of 10 bits integer + 11 bits fractional
116 * - if you want to count load-averages more often, you need more 116 * - if you want to count load-averages more often, you need more
117 * precision, or rounding will get you. With 2-second counting freq, 117 * precision, or rounding will get you. With 2-second counting freq,
118 * the EXP_n values would be 1981, 2034 and 2043 if still using only 118 * the EXP_n values would be 1981, 2034 and 2043 if still using only
119 * 11 bit fractions. 119 * 11 bit fractions.
120 */ 120 */
121 extern unsigned long avenrun[]; /* Load averages */ 121 extern unsigned long avenrun[]; /* Load averages */
122 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); 122 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
123 123
124 #define FSHIFT 11 /* nr of bits of precision */ 124 #define FSHIFT 11 /* nr of bits of precision */
125 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ 125 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
126 #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ 126 #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
127 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ 127 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
128 #define EXP_5 2014 /* 1/exp(5sec/5min) */ 128 #define EXP_5 2014 /* 1/exp(5sec/5min) */
129 #define EXP_15 2037 /* 1/exp(5sec/15min) */ 129 #define EXP_15 2037 /* 1/exp(5sec/15min) */
130 130
131 #define CALC_LOAD(load,exp,n) \ 131 #define CALC_LOAD(load,exp,n) \
132 load *= exp; \ 132 load *= exp; \
133 load += n*(FIXED_1-exp); \ 133 load += n*(FIXED_1-exp); \
134 load >>= FSHIFT; 134 load >>= FSHIFT;
135 135
136 extern unsigned long total_forks; 136 extern unsigned long total_forks;
137 extern int nr_threads; 137 extern int nr_threads;
138 DECLARE_PER_CPU(unsigned long, process_counts); 138 DECLARE_PER_CPU(unsigned long, process_counts);
139 extern int nr_processes(void); 139 extern int nr_processes(void);
140 extern unsigned long nr_running(void); 140 extern unsigned long nr_running(void);
141 extern unsigned long nr_uninterruptible(void); 141 extern unsigned long nr_uninterruptible(void);
142 extern unsigned long nr_iowait(void); 142 extern unsigned long nr_iowait(void);
143 extern unsigned long nr_iowait_cpu(int cpu); 143 extern unsigned long nr_iowait_cpu(int cpu);
144 extern unsigned long this_cpu_load(void); 144 extern unsigned long this_cpu_load(void);
145 145
146 146
147 extern void calc_global_load(unsigned long ticks); 147 extern void calc_global_load(unsigned long ticks);
148 extern void update_cpu_load_nohz(void); 148 extern void update_cpu_load_nohz(void);
149 149
150 extern unsigned long get_parent_ip(unsigned long addr); 150 extern unsigned long get_parent_ip(unsigned long addr);
151 151
152 struct seq_file; 152 struct seq_file;
153 struct cfs_rq; 153 struct cfs_rq;
154 struct task_group; 154 struct task_group;
155 #ifdef CONFIG_SCHED_DEBUG 155 #ifdef CONFIG_SCHED_DEBUG
156 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); 156 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
157 extern void proc_sched_set_task(struct task_struct *p); 157 extern void proc_sched_set_task(struct task_struct *p);
158 extern void 158 extern void
159 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); 159 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
160 #else 160 #else
161 static inline void 161 static inline void
162 proc_sched_show_task(struct task_struct *p, struct seq_file *m) 162 proc_sched_show_task(struct task_struct *p, struct seq_file *m)
163 { 163 {
164 } 164 }
165 static inline void proc_sched_set_task(struct task_struct *p) 165 static inline void proc_sched_set_task(struct task_struct *p)
166 { 166 {
167 } 167 }
168 static inline void 168 static inline void
169 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 169 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
170 { 170 {
171 } 171 }
172 #endif 172 #endif
173 173
174 /* 174 /*
175 * Task state bitmask. NOTE! These bits are also 175 * Task state bitmask. NOTE! These bits are also
176 * encoded in fs/proc/array.c: get_task_state(). 176 * encoded in fs/proc/array.c: get_task_state().
177 * 177 *
178 * We have two separate sets of flags: task->state 178 * We have two separate sets of flags: task->state
179 * is about runnability, while task->exit_state are 179 * is about runnability, while task->exit_state are
180 * about the task exiting. Confusing, but this way 180 * about the task exiting. Confusing, but this way
181 * modifying one set can't modify the other one by 181 * modifying one set can't modify the other one by
182 * mistake. 182 * mistake.
183 */ 183 */
184 #define TASK_RUNNING 0 184 #define TASK_RUNNING 0
185 #define TASK_INTERRUPTIBLE 1 185 #define TASK_INTERRUPTIBLE 1
186 #define TASK_UNINTERRUPTIBLE 2 186 #define TASK_UNINTERRUPTIBLE 2
187 #define __TASK_STOPPED 4 187 #define __TASK_STOPPED 4
188 #define __TASK_TRACED 8 188 #define __TASK_TRACED 8
189 /* in tsk->exit_state */ 189 /* in tsk->exit_state */
190 #define EXIT_ZOMBIE 16 190 #define EXIT_ZOMBIE 16
191 #define EXIT_DEAD 32 191 #define EXIT_DEAD 32
192 /* in tsk->state again */ 192 /* in tsk->state again */
193 #define TASK_DEAD 64 193 #define TASK_DEAD 64
194 #define TASK_WAKEKILL 128 194 #define TASK_WAKEKILL 128
195 #define TASK_WAKING 256 195 #define TASK_WAKING 256
196 #define TASK_STATE_MAX 512 196 #define TASK_STATE_MAX 512
197 197
198 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" 198 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
199 199
200 extern char ___assert_task_state[1 - 2*!!( 200 extern char ___assert_task_state[1 - 2*!!(
201 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; 201 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
202 202
203 /* Convenience macros for the sake of set_task_state */ 203 /* Convenience macros for the sake of set_task_state */
204 #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) 204 #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
205 #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) 205 #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
206 #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) 206 #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
207 207
208 /* Convenience macros for the sake of wake_up */ 208 /* Convenience macros for the sake of wake_up */
209 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) 209 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
210 #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) 210 #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
211 211
212 /* get_task_state() */ 212 /* get_task_state() */
213 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ 213 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
214 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ 214 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
215 __TASK_TRACED) 215 __TASK_TRACED)
216 216
217 #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) 217 #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
218 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) 218 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
219 #define task_is_dead(task) ((task)->exit_state != 0) 219 #define task_is_dead(task) ((task)->exit_state != 0)
220 #define task_is_stopped_or_traced(task) \ 220 #define task_is_stopped_or_traced(task) \
221 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) 221 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
222 #define task_contributes_to_load(task) \ 222 #define task_contributes_to_load(task) \
223 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ 223 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
224 (task->flags & PF_FROZEN) == 0) 224 (task->flags & PF_FROZEN) == 0)
225 225
226 #define __set_task_state(tsk, state_value) \ 226 #define __set_task_state(tsk, state_value) \
227 do { (tsk)->state = (state_value); } while (0) 227 do { (tsk)->state = (state_value); } while (0)
228 #define set_task_state(tsk, state_value) \ 228 #define set_task_state(tsk, state_value) \
229 set_mb((tsk)->state, (state_value)) 229 set_mb((tsk)->state, (state_value))
230 230
231 /* 231 /*
232 * set_current_state() includes a barrier so that the write of current->state 232 * set_current_state() includes a barrier so that the write of current->state
233 * is correctly serialised wrt the caller's subsequent test of whether to 233 * is correctly serialised wrt the caller's subsequent test of whether to
234 * actually sleep: 234 * actually sleep:
235 * 235 *
236 * set_current_state(TASK_UNINTERRUPTIBLE); 236 * set_current_state(TASK_UNINTERRUPTIBLE);
237 * if (do_i_need_to_sleep()) 237 * if (do_i_need_to_sleep())
238 * schedule(); 238 * schedule();
239 * 239 *
240 * If the caller does not need such serialisation then use __set_current_state() 240 * If the caller does not need such serialisation then use __set_current_state()
241 */ 241 */
242 #define __set_current_state(state_value) \ 242 #define __set_current_state(state_value) \
243 do { current->state = (state_value); } while (0) 243 do { current->state = (state_value); } while (0)
244 #define set_current_state(state_value) \ 244 #define set_current_state(state_value) \
245 set_mb(current->state, (state_value)) 245 set_mb(current->state, (state_value))
246 246
247 /* Task command name length */ 247 /* Task command name length */
248 #define TASK_COMM_LEN 16 248 #define TASK_COMM_LEN 16
249 249
250 #include <linux/spinlock.h> 250 #include <linux/spinlock.h>
251 251
252 /* 252 /*
253 * This serializes "schedule()" and also protects 253 * This serializes "schedule()" and also protects
254 * the run-queue from deletions/modifications (but 254 * the run-queue from deletions/modifications (but
255 * _adding_ to the beginning of the run-queue has 255 * _adding_ to the beginning of the run-queue has
256 * a separate lock). 256 * a separate lock).
257 */ 257 */
258 extern rwlock_t tasklist_lock; 258 extern rwlock_t tasklist_lock;
259 extern spinlock_t mmlist_lock; 259 extern spinlock_t mmlist_lock;
260 260
261 struct task_struct; 261 struct task_struct;
262 262
263 #ifdef CONFIG_PROVE_RCU 263 #ifdef CONFIG_PROVE_RCU
264 extern int lockdep_tasklist_lock_is_held(void); 264 extern int lockdep_tasklist_lock_is_held(void);
265 #endif /* #ifdef CONFIG_PROVE_RCU */ 265 #endif /* #ifdef CONFIG_PROVE_RCU */
266 266
267 extern void sched_init(void); 267 extern void sched_init(void);
268 extern void sched_init_smp(void); 268 extern void sched_init_smp(void);
269 extern asmlinkage void schedule_tail(struct task_struct *prev); 269 extern asmlinkage void schedule_tail(struct task_struct *prev);
270 extern void init_idle(struct task_struct *idle, int cpu); 270 extern void init_idle(struct task_struct *idle, int cpu);
271 extern void init_idle_bootup_task(struct task_struct *idle); 271 extern void init_idle_bootup_task(struct task_struct *idle);
272 272
273 extern int runqueue_is_locked(int cpu); 273 extern int runqueue_is_locked(int cpu);
274 274
275 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 275 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
276 extern void select_nohz_load_balancer(int stop_tick); 276 extern void select_nohz_load_balancer(int stop_tick);
277 extern void set_cpu_sd_state_idle(void); 277 extern void set_cpu_sd_state_idle(void);
278 extern int get_nohz_timer_target(void); 278 extern int get_nohz_timer_target(void);
279 #else 279 #else
280 static inline void select_nohz_load_balancer(int stop_tick) { } 280 static inline void select_nohz_load_balancer(int stop_tick) { }
281 static inline void set_cpu_sd_state_idle(void) { } 281 static inline void set_cpu_sd_state_idle(void) { }
282 #endif 282 #endif
283 283
284 /* 284 /*
285 * Only dump TASK_* tasks. (0 for all tasks) 285 * Only dump TASK_* tasks. (0 for all tasks)
286 */ 286 */
287 extern void show_state_filter(unsigned long state_filter); 287 extern void show_state_filter(unsigned long state_filter);
288 288
289 static inline void show_state(void) 289 static inline void show_state(void)
290 { 290 {
291 show_state_filter(0); 291 show_state_filter(0);
292 } 292 }
293 293
294 extern void show_regs(struct pt_regs *); 294 extern void show_regs(struct pt_regs *);
295 295
296 /* 296 /*
297 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current 297 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
298 * task), SP is the stack pointer of the first frame that should be shown in the back 298 * task), SP is the stack pointer of the first frame that should be shown in the back
299 * trace (or NULL if the entire call-chain of the task should be shown). 299 * trace (or NULL if the entire call-chain of the task should be shown).
300 */ 300 */
301 extern void show_stack(struct task_struct *task, unsigned long *sp); 301 extern void show_stack(struct task_struct *task, unsigned long *sp);
302 302
303 void io_schedule(void); 303 void io_schedule(void);
304 long io_schedule_timeout(long timeout); 304 long io_schedule_timeout(long timeout);
305 305
306 extern void cpu_init (void); 306 extern void cpu_init (void);
307 extern void trap_init(void); 307 extern void trap_init(void);
308 extern void update_process_times(int user); 308 extern void update_process_times(int user);
309 extern void scheduler_tick(void); 309 extern void scheduler_tick(void);
310 310
311 extern void sched_show_task(struct task_struct *p); 311 extern void sched_show_task(struct task_struct *p);
312 312
313 #ifdef CONFIG_LOCKUP_DETECTOR 313 #ifdef CONFIG_LOCKUP_DETECTOR
314 extern void touch_softlockup_watchdog(void); 314 extern void touch_softlockup_watchdog(void);
315 extern void touch_softlockup_watchdog_sync(void); 315 extern void touch_softlockup_watchdog_sync(void);
316 extern void touch_all_softlockup_watchdogs(void); 316 extern void touch_all_softlockup_watchdogs(void);
317 extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, 317 extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
318 void __user *buffer, 318 void __user *buffer,
319 size_t *lenp, loff_t *ppos); 319 size_t *lenp, loff_t *ppos);
320 extern unsigned int softlockup_panic; 320 extern unsigned int softlockup_panic;
321 void lockup_detector_init(void); 321 void lockup_detector_init(void);
322 #else 322 #else
323 static inline void touch_softlockup_watchdog(void) 323 static inline void touch_softlockup_watchdog(void)
324 { 324 {
325 } 325 }
326 static inline void touch_softlockup_watchdog_sync(void) 326 static inline void touch_softlockup_watchdog_sync(void)
327 { 327 {
328 } 328 }
329 static inline void touch_all_softlockup_watchdogs(void) 329 static inline void touch_all_softlockup_watchdogs(void)
330 { 330 {
331 } 331 }
332 static inline void lockup_detector_init(void) 332 static inline void lockup_detector_init(void)
333 { 333 {
334 } 334 }
335 #endif 335 #endif
336 336
337 #if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND) 337 #if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND)
338 void lockup_detector_bootcpu_resume(void); 338 void lockup_detector_bootcpu_resume(void);
339 #else 339 #else
340 static inline void lockup_detector_bootcpu_resume(void) 340 static inline void lockup_detector_bootcpu_resume(void)
341 { 341 {
342 } 342 }
343 #endif 343 #endif
344 344
345 #ifdef CONFIG_DETECT_HUNG_TASK 345 #ifdef CONFIG_DETECT_HUNG_TASK
346 extern unsigned int sysctl_hung_task_panic; 346 extern unsigned int sysctl_hung_task_panic;
347 extern unsigned long sysctl_hung_task_check_count; 347 extern unsigned long sysctl_hung_task_check_count;
348 extern unsigned long sysctl_hung_task_timeout_secs; 348 extern unsigned long sysctl_hung_task_timeout_secs;
349 extern unsigned long sysctl_hung_task_warnings; 349 extern unsigned long sysctl_hung_task_warnings;
350 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 350 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
351 void __user *buffer, 351 void __user *buffer,
352 size_t *lenp, loff_t *ppos); 352 size_t *lenp, loff_t *ppos);
353 #else 353 #else
354 /* Avoid need for ifdefs elsewhere in the code */ 354 /* Avoid need for ifdefs elsewhere in the code */
355 enum { sysctl_hung_task_timeout_secs = 0 }; 355 enum { sysctl_hung_task_timeout_secs = 0 };
356 #endif 356 #endif
357 357
358 /* Attach to any functions which should be ignored in wchan output. */ 358 /* Attach to any functions which should be ignored in wchan output. */
359 #define __sched __attribute__((__section__(".sched.text"))) 359 #define __sched __attribute__((__section__(".sched.text")))
360 360
361 /* Linker adds these: start and end of __sched functions */ 361 /* Linker adds these: start and end of __sched functions */
362 extern char __sched_text_start[], __sched_text_end[]; 362 extern char __sched_text_start[], __sched_text_end[];
363 363
364 /* Is this address in the __sched functions? */ 364 /* Is this address in the __sched functions? */
365 extern int in_sched_functions(unsigned long addr); 365 extern int in_sched_functions(unsigned long addr);
366 366
367 #define MAX_SCHEDULE_TIMEOUT LONG_MAX 367 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
368 extern signed long schedule_timeout(signed long timeout); 368 extern signed long schedule_timeout(signed long timeout);
369 extern signed long schedule_timeout_interruptible(signed long timeout); 369 extern signed long schedule_timeout_interruptible(signed long timeout);
370 extern signed long schedule_timeout_killable(signed long timeout); 370 extern signed long schedule_timeout_killable(signed long timeout);
371 extern signed long schedule_timeout_uninterruptible(signed long timeout); 371 extern signed long schedule_timeout_uninterruptible(signed long timeout);
372 asmlinkage void schedule(void); 372 asmlinkage void schedule(void);
373 extern void schedule_preempt_disabled(void); 373 extern void schedule_preempt_disabled(void);
374 extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); 374 extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
375 375
376 struct nsproxy; 376 struct nsproxy;
377 struct user_namespace; 377 struct user_namespace;
378 378
379 /* 379 /*
380 * Default maximum number of active map areas, this limits the number of vmas 380 * Default maximum number of active map areas, this limits the number of vmas
381 * per mm struct. Users can overwrite this number by sysctl but there is a 381 * per mm struct. Users can overwrite this number by sysctl but there is a
382 * problem. 382 * problem.
383 * 383 *
384 * When a program's coredump is generated as ELF format, a section is created 384 * When a program's coredump is generated as ELF format, a section is created
385 * per a vma. In ELF, the number of sections is represented in unsigned short. 385 * per a vma. In ELF, the number of sections is represented in unsigned short.
386 * This means the number of sections should be smaller than 65535 at coredump. 386 * This means the number of sections should be smaller than 65535 at coredump.
387 * Because the kernel adds some informative sections to a image of program at 387 * Because the kernel adds some informative sections to a image of program at
388 * generating coredump, we need some margin. The number of extra sections is 388 * generating coredump, we need some margin. The number of extra sections is
389 * 1-3 now and depends on arch. We use "5" as safe margin, here. 389 * 1-3 now and depends on arch. We use "5" as safe margin, here.
390 */ 390 */
391 #define MAPCOUNT_ELF_CORE_MARGIN (5) 391 #define MAPCOUNT_ELF_CORE_MARGIN (5)
392 #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) 392 #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
393 393
394 extern int sysctl_max_map_count; 394 extern int sysctl_max_map_count;
395 395
396 #include <linux/aio.h> 396 #include <linux/aio.h>
397 397
398 #ifdef CONFIG_MMU 398 #ifdef CONFIG_MMU
399 extern void arch_pick_mmap_layout(struct mm_struct *mm); 399 extern void arch_pick_mmap_layout(struct mm_struct *mm);
400 extern unsigned long 400 extern unsigned long
401 arch_get_unmapped_area(struct file *, unsigned long, unsigned long, 401 arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
402 unsigned long, unsigned long); 402 unsigned long, unsigned long);
403 extern unsigned long 403 extern unsigned long
404 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, 404 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
405 unsigned long len, unsigned long pgoff, 405 unsigned long len, unsigned long pgoff,
406 unsigned long flags); 406 unsigned long flags);
407 extern void arch_unmap_area(struct mm_struct *, unsigned long); 407 extern void arch_unmap_area(struct mm_struct *, unsigned long);
408 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); 408 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
409 #else 409 #else
410 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} 410 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
411 #endif 411 #endif
412 412
413 413
414 extern void set_dumpable(struct mm_struct *mm, int value); 414 extern void set_dumpable(struct mm_struct *mm, int value);
415 extern int get_dumpable(struct mm_struct *mm); 415 extern int get_dumpable(struct mm_struct *mm);
416 416
417 /* get/set_dumpable() values */ 417 /* get/set_dumpable() values */
418 #define SUID_DUMPABLE_DISABLED 0 418 #define SUID_DUMPABLE_DISABLED 0
419 #define SUID_DUMPABLE_ENABLED 1 419 #define SUID_DUMPABLE_ENABLED 1
420 #define SUID_DUMPABLE_SAFE 2 420 #define SUID_DUMPABLE_SAFE 2
421 421
422 /* mm flags */ 422 /* mm flags */
423 /* dumpable bits */ 423 /* dumpable bits */
424 #define MMF_DUMPABLE 0 /* core dump is permitted */ 424 #define MMF_DUMPABLE 0 /* core dump is permitted */
425 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ 425 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */
426 426
427 #define MMF_DUMPABLE_BITS 2 427 #define MMF_DUMPABLE_BITS 2
428 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) 428 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
429 429
430 /* coredump filter bits */ 430 /* coredump filter bits */
431 #define MMF_DUMP_ANON_PRIVATE 2 431 #define MMF_DUMP_ANON_PRIVATE 2
432 #define MMF_DUMP_ANON_SHARED 3 432 #define MMF_DUMP_ANON_SHARED 3
433 #define MMF_DUMP_MAPPED_PRIVATE 4 433 #define MMF_DUMP_MAPPED_PRIVATE 4
434 #define MMF_DUMP_MAPPED_SHARED 5 434 #define MMF_DUMP_MAPPED_SHARED 5
435 #define MMF_DUMP_ELF_HEADERS 6 435 #define MMF_DUMP_ELF_HEADERS 6
436 #define MMF_DUMP_HUGETLB_PRIVATE 7 436 #define MMF_DUMP_HUGETLB_PRIVATE 7
437 #define MMF_DUMP_HUGETLB_SHARED 8 437 #define MMF_DUMP_HUGETLB_SHARED 8
438 438
439 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS 439 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
440 #define MMF_DUMP_FILTER_BITS 7 440 #define MMF_DUMP_FILTER_BITS 7
441 #define MMF_DUMP_FILTER_MASK \ 441 #define MMF_DUMP_FILTER_MASK \
442 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) 442 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
443 #define MMF_DUMP_FILTER_DEFAULT \ 443 #define MMF_DUMP_FILTER_DEFAULT \
444 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ 444 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
445 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) 445 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
446 446
447 #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS 447 #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
448 # define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) 448 # define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
449 #else 449 #else
450 # define MMF_DUMP_MASK_DEFAULT_ELF 0 450 # define MMF_DUMP_MASK_DEFAULT_ELF 0
451 #endif 451 #endif
452 /* leave room for more dump flags */ 452 /* leave room for more dump flags */
453 #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ 453 #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
454 #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ 454 #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
455 #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ 455 #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
456 456
457 #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) 457 #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
458 458
459 struct sighand_struct { 459 struct sighand_struct {
460 atomic_t count; 460 atomic_t count;
461 struct k_sigaction action[_NSIG]; 461 struct k_sigaction action[_NSIG];
462 spinlock_t siglock; 462 spinlock_t siglock;
463 wait_queue_head_t signalfd_wqh; 463 wait_queue_head_t signalfd_wqh;
464 }; 464 };
465 465
466 struct pacct_struct { 466 struct pacct_struct {
467 int ac_flag; 467 int ac_flag;
468 long ac_exitcode; 468 long ac_exitcode;
469 unsigned long ac_mem; 469 unsigned long ac_mem;
470 cputime_t ac_utime, ac_stime; 470 cputime_t ac_utime, ac_stime;
471 unsigned long ac_minflt, ac_majflt; 471 unsigned long ac_minflt, ac_majflt;
472 }; 472 };
473 473
474 struct cpu_itimer { 474 struct cpu_itimer {
475 cputime_t expires; 475 cputime_t expires;
476 cputime_t incr; 476 cputime_t incr;
477 u32 error; 477 u32 error;
478 u32 incr_error; 478 u32 incr_error;
479 }; 479 };
480 480
481 /** 481 /**
482 * struct task_cputime - collected CPU time counts 482 * struct task_cputime - collected CPU time counts
483 * @utime: time spent in user mode, in &cputime_t units 483 * @utime: time spent in user mode, in &cputime_t units
484 * @stime: time spent in kernel mode, in &cputime_t units 484 * @stime: time spent in kernel mode, in &cputime_t units
485 * @sum_exec_runtime: total time spent on the CPU, in nanoseconds 485 * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
486 * 486 *
487 * This structure groups together three kinds of CPU time that are 487 * This structure groups together three kinds of CPU time that are
488 * tracked for threads and thread groups. Most things considering 488 * tracked for threads and thread groups. Most things considering
489 * CPU time want to group these counts together and treat all three 489 * CPU time want to group these counts together and treat all three
490 * of them in parallel. 490 * of them in parallel.
491 */ 491 */
492 struct task_cputime { 492 struct task_cputime {
493 cputime_t utime; 493 cputime_t utime;
494 cputime_t stime; 494 cputime_t stime;
495 unsigned long long sum_exec_runtime; 495 unsigned long long sum_exec_runtime;
496 }; 496 };
497 /* Alternate field names when used to cache expirations. */ 497 /* Alternate field names when used to cache expirations. */
498 #define prof_exp stime 498 #define prof_exp stime
499 #define virt_exp utime 499 #define virt_exp utime
500 #define sched_exp sum_exec_runtime 500 #define sched_exp sum_exec_runtime
501 501
502 #define INIT_CPUTIME \ 502 #define INIT_CPUTIME \
503 (struct task_cputime) { \ 503 (struct task_cputime) { \
504 .utime = 0, \ 504 .utime = 0, \
505 .stime = 0, \ 505 .stime = 0, \
506 .sum_exec_runtime = 0, \ 506 .sum_exec_runtime = 0, \
507 } 507 }
508 508
509 /* 509 /*
510 * Disable preemption until the scheduler is running. 510 * Disable preemption until the scheduler is running.
511 * Reset by start_kernel()->sched_init()->init_idle(). 511 * Reset by start_kernel()->sched_init()->init_idle().
512 * 512 *
513 * We include PREEMPT_ACTIVE to avoid cond_resched() from working 513 * We include PREEMPT_ACTIVE to avoid cond_resched() from working
514 * before the scheduler is active -- see should_resched(). 514 * before the scheduler is active -- see should_resched().
515 */ 515 */
516 #define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) 516 #define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE)
517 517
518 /** 518 /**
519 * struct thread_group_cputimer - thread group interval timer counts 519 * struct thread_group_cputimer - thread group interval timer counts
520 * @cputime: thread group interval timers. 520 * @cputime: thread group interval timers.
521 * @running: non-zero when there are timers running and 521 * @running: non-zero when there are timers running and
522 * @cputime receives updates. 522 * @cputime receives updates.
523 * @lock: lock for fields in this struct. 523 * @lock: lock for fields in this struct.
524 * 524 *
525 * This structure contains the version of task_cputime, above, that is 525 * This structure contains the version of task_cputime, above, that is
526 * used for thread group CPU timer calculations. 526 * used for thread group CPU timer calculations.
527 */ 527 */
528 struct thread_group_cputimer { 528 struct thread_group_cputimer {
529 struct task_cputime cputime; 529 struct task_cputime cputime;
530 int running; 530 int running;
531 raw_spinlock_t lock; 531 raw_spinlock_t lock;
532 }; 532 };
533 533
534 #include <linux/rwsem.h> 534 #include <linux/rwsem.h>
535 struct autogroup; 535 struct autogroup;
536 536
537 /* 537 /*
538 * NOTE! "signal_struct" does not have its own 538 * NOTE! "signal_struct" does not have its own
539 * locking, because a shared signal_struct always 539 * locking, because a shared signal_struct always
540 * implies a shared sighand_struct, so locking 540 * implies a shared sighand_struct, so locking
541 * sighand_struct is always a proper superset of 541 * sighand_struct is always a proper superset of
542 * the locking of signal_struct. 542 * the locking of signal_struct.
543 */ 543 */
544 struct signal_struct { 544 struct signal_struct {
545 atomic_t sigcnt; 545 atomic_t sigcnt;
546 atomic_t live; 546 atomic_t live;
547 int nr_threads; 547 int nr_threads;
548 548
549 wait_queue_head_t wait_chldexit; /* for wait4() */ 549 wait_queue_head_t wait_chldexit; /* for wait4() */
550 550
551 /* current thread group signal load-balancing target: */ 551 /* current thread group signal load-balancing target: */
552 struct task_struct *curr_target; 552 struct task_struct *curr_target;
553 553
554 /* shared signal handling: */ 554 /* shared signal handling: */
555 struct sigpending shared_pending; 555 struct sigpending shared_pending;
556 556
557 /* thread group exit support */ 557 /* thread group exit support */
558 int group_exit_code; 558 int group_exit_code;
559 /* overloaded: 559 /* overloaded:
560 * - notify group_exit_task when ->count is equal to notify_count 560 * - notify group_exit_task when ->count is equal to notify_count
561 * - everyone except group_exit_task is stopped during signal delivery 561 * - everyone except group_exit_task is stopped during signal delivery
562 * of fatal signals, group_exit_task processes the signal. 562 * of fatal signals, group_exit_task processes the signal.
563 */ 563 */
564 int notify_count; 564 int notify_count;
565 struct task_struct *group_exit_task; 565 struct task_struct *group_exit_task;
566 566
567 /* thread group stop support, overloads group_exit_code too */ 567 /* thread group stop support, overloads group_exit_code too */
568 int group_stop_count; 568 int group_stop_count;
569 unsigned int flags; /* see SIGNAL_* flags below */ 569 unsigned int flags; /* see SIGNAL_* flags below */
570 570
571 /* 571 /*
572 * PR_SET_CHILD_SUBREAPER marks a process, like a service 572 * PR_SET_CHILD_SUBREAPER marks a process, like a service
573 * manager, to re-parent orphan (double-forking) child processes 573 * manager, to re-parent orphan (double-forking) child processes
574 * to this process instead of 'init'. The service manager is 574 * to this process instead of 'init'. The service manager is
575 * able to receive SIGCHLD signals and is able to investigate 575 * able to receive SIGCHLD signals and is able to investigate
576 * the process until it calls wait(). All children of this 576 * the process until it calls wait(). All children of this
577 * process will inherit a flag if they should look for a 577 * process will inherit a flag if they should look for a
578 * child_subreaper process at exit. 578 * child_subreaper process at exit.
579 */ 579 */
580 unsigned int is_child_subreaper:1; 580 unsigned int is_child_subreaper:1;
581 unsigned int has_child_subreaper:1; 581 unsigned int has_child_subreaper:1;
582 582
583 /* POSIX.1b Interval Timers */ 583 /* POSIX.1b Interval Timers */
584 struct list_head posix_timers; 584 struct list_head posix_timers;
585 585
586 /* ITIMER_REAL timer for the process */ 586 /* ITIMER_REAL timer for the process */
587 struct hrtimer real_timer; 587 struct hrtimer real_timer;
588 struct pid *leader_pid; 588 struct pid *leader_pid;
589 ktime_t it_real_incr; 589 ktime_t it_real_incr;
590 590
591 /* 591 /*
592 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use 592 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
593 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these 593 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
594 * values are defined to 0 and 1 respectively 594 * values are defined to 0 and 1 respectively
595 */ 595 */
596 struct cpu_itimer it[2]; 596 struct cpu_itimer it[2];
597 597
598 /* 598 /*
599 * Thread group totals for process CPU timers. 599 * Thread group totals for process CPU timers.
600 * See thread_group_cputimer(), et al, for details. 600 * See thread_group_cputimer(), et al, for details.
601 */ 601 */
602 struct thread_group_cputimer cputimer; 602 struct thread_group_cputimer cputimer;
603 603
604 /* Earliest-expiration cache. */ 604 /* Earliest-expiration cache. */
605 struct task_cputime cputime_expires; 605 struct task_cputime cputime_expires;
606 606
607 struct list_head cpu_timers[3]; 607 struct list_head cpu_timers[3];
608 608
609 struct pid *tty_old_pgrp; 609 struct pid *tty_old_pgrp;
610 610
611 /* boolean value for session group leader */ 611 /* boolean value for session group leader */
612 int leader; 612 int leader;
613 613
614 struct tty_struct *tty; /* NULL if no tty */ 614 struct tty_struct *tty; /* NULL if no tty */
615 615
616 #ifdef CONFIG_SCHED_AUTOGROUP 616 #ifdef CONFIG_SCHED_AUTOGROUP
617 struct autogroup *autogroup; 617 struct autogroup *autogroup;
618 #endif 618 #endif
619 /* 619 /*
620 * Cumulative resource counters for dead threads in the group, 620 * Cumulative resource counters for dead threads in the group,
621 * and for reaped dead child processes forked by this group. 621 * and for reaped dead child processes forked by this group.
622 * Live threads maintain their own counters and add to these 622 * Live threads maintain their own counters and add to these
623 * in __exit_signal, except for the group leader. 623 * in __exit_signal, except for the group leader.
624 */ 624 */
625 cputime_t utime, stime, cutime, cstime; 625 cputime_t utime, stime, cutime, cstime;
626 cputime_t gtime; 626 cputime_t gtime;
627 cputime_t cgtime; 627 cputime_t cgtime;
628 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 628 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
629 cputime_t prev_utime, prev_stime; 629 cputime_t prev_utime, prev_stime;
630 #endif 630 #endif
631 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; 631 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
632 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; 632 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
633 unsigned long inblock, oublock, cinblock, coublock; 633 unsigned long inblock, oublock, cinblock, coublock;
634 unsigned long maxrss, cmaxrss; 634 unsigned long maxrss, cmaxrss;
635 struct task_io_accounting ioac; 635 struct task_io_accounting ioac;
636 636
637 /* 637 /*
638 * Cumulative ns of schedule CPU time fo dead threads in the 638 * Cumulative ns of schedule CPU time fo dead threads in the
639 * group, not including a zombie group leader, (This only differs 639 * group, not including a zombie group leader, (This only differs
640 * from jiffies_to_ns(utime + stime) if sched_clock uses something 640 * from jiffies_to_ns(utime + stime) if sched_clock uses something
641 * other than jiffies.) 641 * other than jiffies.)
642 */ 642 */
643 unsigned long long sum_sched_runtime; 643 unsigned long long sum_sched_runtime;
644 644
645 /* 645 /*
646 * We don't bother to synchronize most readers of this at all, 646 * We don't bother to synchronize most readers of this at all,
647 * because there is no reader checking a limit that actually needs 647 * because there is no reader checking a limit that actually needs
648 * to get both rlim_cur and rlim_max atomically, and either one 648 * to get both rlim_cur and rlim_max atomically, and either one
649 * alone is a single word that can safely be read normally. 649 * alone is a single word that can safely be read normally.
650 * getrlimit/setrlimit use task_lock(current->group_leader) to 650 * getrlimit/setrlimit use task_lock(current->group_leader) to
651 * protect this instead of the siglock, because they really 651 * protect this instead of the siglock, because they really
652 * have no need to disable irqs. 652 * have no need to disable irqs.
653 */ 653 */
654 struct rlimit rlim[RLIM_NLIMITS]; 654 struct rlimit rlim[RLIM_NLIMITS];
655 655
656 #ifdef CONFIG_BSD_PROCESS_ACCT 656 #ifdef CONFIG_BSD_PROCESS_ACCT
657 struct pacct_struct pacct; /* per-process accounting information */ 657 struct pacct_struct pacct; /* per-process accounting information */
658 #endif 658 #endif
659 #ifdef CONFIG_TASKSTATS 659 #ifdef CONFIG_TASKSTATS
660 struct taskstats *stats; 660 struct taskstats *stats;
661 #endif 661 #endif
662 #ifdef CONFIG_AUDIT 662 #ifdef CONFIG_AUDIT
663 unsigned audit_tty; 663 unsigned audit_tty;
664 struct tty_audit_buf *tty_audit_buf; 664 struct tty_audit_buf *tty_audit_buf;
665 #endif 665 #endif
666 #ifdef CONFIG_CGROUPS 666 #ifdef CONFIG_CGROUPS
667 /* 667 /*
668 * group_rwsem prevents new tasks from entering the threadgroup and 668 * group_rwsem prevents new tasks from entering the threadgroup and
669 * member tasks from exiting,a more specifically, setting of 669 * member tasks from exiting,a more specifically, setting of
670 * PF_EXITING. fork and exit paths are protected with this rwsem 670 * PF_EXITING. fork and exit paths are protected with this rwsem
671 * using threadgroup_change_begin/end(). Users which require 671 * using threadgroup_change_begin/end(). Users which require
672 * threadgroup to remain stable should use threadgroup_[un]lock() 672 * threadgroup to remain stable should use threadgroup_[un]lock()
673 * which also takes care of exec path. Currently, cgroup is the 673 * which also takes care of exec path. Currently, cgroup is the
674 * only user. 674 * only user.
675 */ 675 */
676 struct rw_semaphore group_rwsem; 676 struct rw_semaphore group_rwsem;
677 #endif 677 #endif
678 678
679 int oom_adj; /* OOM kill score adjustment (bit shift) */ 679 int oom_adj; /* OOM kill score adjustment (bit shift) */
680 int oom_score_adj; /* OOM kill score adjustment */ 680 int oom_score_adj; /* OOM kill score adjustment */
681 int oom_score_adj_min; /* OOM kill score adjustment minimum value. 681 int oom_score_adj_min; /* OOM kill score adjustment minimum value.
682 * Only settable by CAP_SYS_RESOURCE. */ 682 * Only settable by CAP_SYS_RESOURCE. */
683 683
684 struct mutex cred_guard_mutex; /* guard against foreign influences on 684 struct mutex cred_guard_mutex; /* guard against foreign influences on
685 * credential calculations 685 * credential calculations
686 * (notably. ptrace) */ 686 * (notably. ptrace) */
687 }; 687 };
688 688
689 /* Context switch must be unlocked if interrupts are to be enabled */ 689 /* Context switch must be unlocked if interrupts are to be enabled */
690 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 690 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
691 # define __ARCH_WANT_UNLOCKED_CTXSW 691 # define __ARCH_WANT_UNLOCKED_CTXSW
692 #endif 692 #endif
693 693
694 /* 694 /*
695 * Bits in flags field of signal_struct. 695 * Bits in flags field of signal_struct.
696 */ 696 */
697 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ 697 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
698 #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ 698 #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */
699 #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ 699 #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */
700 /* 700 /*
701 * Pending notifications to parent. 701 * Pending notifications to parent.
702 */ 702 */
703 #define SIGNAL_CLD_STOPPED 0x00000010 703 #define SIGNAL_CLD_STOPPED 0x00000010
704 #define SIGNAL_CLD_CONTINUED 0x00000020 704 #define SIGNAL_CLD_CONTINUED 0x00000020
705 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) 705 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
706 706
707 #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ 707 #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */
708 708
709 /* If true, all threads except ->group_exit_task have pending SIGKILL */ 709 /* If true, all threads except ->group_exit_task have pending SIGKILL */
710 static inline int signal_group_exit(const struct signal_struct *sig) 710 static inline int signal_group_exit(const struct signal_struct *sig)
711 { 711 {
712 return (sig->flags & SIGNAL_GROUP_EXIT) || 712 return (sig->flags & SIGNAL_GROUP_EXIT) ||
713 (sig->group_exit_task != NULL); 713 (sig->group_exit_task != NULL);
714 } 714 }
715 715
716 /* 716 /*
717 * Some day this will be a full-fledged user tracking system.. 717 * Some day this will be a full-fledged user tracking system..
718 */ 718 */
719 struct user_struct { 719 struct user_struct {
720 atomic_t __count; /* reference count */ 720 atomic_t __count; /* reference count */
721 atomic_t processes; /* How many processes does this user have? */ 721 atomic_t processes; /* How many processes does this user have? */
722 atomic_t files; /* How many open files does this user have? */ 722 atomic_t files; /* How many open files does this user have? */
723 atomic_t sigpending; /* How many pending signals does this user have? */ 723 atomic_t sigpending; /* How many pending signals does this user have? */
724 #ifdef CONFIG_INOTIFY_USER 724 #ifdef CONFIG_INOTIFY_USER
725 atomic_t inotify_watches; /* How many inotify watches does this user have? */ 725 atomic_t inotify_watches; /* How many inotify watches does this user have? */
726 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ 726 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
727 #endif 727 #endif
728 #ifdef CONFIG_FANOTIFY 728 #ifdef CONFIG_FANOTIFY
729 atomic_t fanotify_listeners; 729 atomic_t fanotify_listeners;
730 #endif 730 #endif
731 #ifdef CONFIG_EPOLL 731 #ifdef CONFIG_EPOLL
732 atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ 732 atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
733 #endif 733 #endif
734 #ifdef CONFIG_POSIX_MQUEUE 734 #ifdef CONFIG_POSIX_MQUEUE
735 /* protected by mq_lock */ 735 /* protected by mq_lock */
736 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ 736 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
737 #endif 737 #endif
738 unsigned long locked_shm; /* How many pages of mlocked shm ? */ 738 unsigned long locked_shm; /* How many pages of mlocked shm ? */
739 739
740 #ifdef CONFIG_KEYS 740 #ifdef CONFIG_KEYS
741 struct key *uid_keyring; /* UID specific keyring */ 741 struct key *uid_keyring; /* UID specific keyring */
742 struct key *session_keyring; /* UID's default session keyring */ 742 struct key *session_keyring; /* UID's default session keyring */
743 #endif 743 #endif
744 744
745 /* Hash table maintenance information */ 745 /* Hash table maintenance information */
746 struct hlist_node uidhash_node; 746 struct hlist_node uidhash_node;
747 kuid_t uid; 747 kuid_t uid;
748 748
749 #ifdef CONFIG_PERF_EVENTS 749 #ifdef CONFIG_PERF_EVENTS
750 atomic_long_t locked_vm; 750 atomic_long_t locked_vm;
751 #endif 751 #endif
752 }; 752 };
753 753
754 extern int uids_sysfs_init(void); 754 extern int uids_sysfs_init(void);
755 755
756 extern struct user_struct *find_user(kuid_t); 756 extern struct user_struct *find_user(kuid_t);
757 757
758 extern struct user_struct root_user; 758 extern struct user_struct root_user;
759 #define INIT_USER (&root_user) 759 #define INIT_USER (&root_user)
760 760
761 761
762 struct backing_dev_info; 762 struct backing_dev_info;
763 struct reclaim_state; 763 struct reclaim_state;
764 764
765 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 765 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
766 struct sched_info { 766 struct sched_info {
767 /* cumulative counters */ 767 /* cumulative counters */
768 unsigned long pcount; /* # of times run on this cpu */ 768 unsigned long pcount; /* # of times run on this cpu */
769 unsigned long long run_delay; /* time spent waiting on a runqueue */ 769 unsigned long long run_delay; /* time spent waiting on a runqueue */
770 770
771 /* timestamps */ 771 /* timestamps */
772 unsigned long long last_arrival,/* when we last ran on a cpu */ 772 unsigned long long last_arrival,/* when we last ran on a cpu */
773 last_queued; /* when we were last queued to run */ 773 last_queued; /* when we were last queued to run */
774 }; 774 };
775 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 775 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
776 776
777 #ifdef CONFIG_TASK_DELAY_ACCT 777 #ifdef CONFIG_TASK_DELAY_ACCT
778 struct task_delay_info { 778 struct task_delay_info {
779 spinlock_t lock; 779 spinlock_t lock;
780 unsigned int flags; /* Private per-task flags */ 780 unsigned int flags; /* Private per-task flags */
781 781
782 /* For each stat XXX, add following, aligned appropriately 782 /* For each stat XXX, add following, aligned appropriately
783 * 783 *
784 * struct timespec XXX_start, XXX_end; 784 * struct timespec XXX_start, XXX_end;
785 * u64 XXX_delay; 785 * u64 XXX_delay;
786 * u32 XXX_count; 786 * u32 XXX_count;
787 * 787 *
788 * Atomicity of updates to XXX_delay, XXX_count protected by 788 * Atomicity of updates to XXX_delay, XXX_count protected by
789 * single lock above (split into XXX_lock if contention is an issue). 789 * single lock above (split into XXX_lock if contention is an issue).
790 */ 790 */
791 791
792 /* 792 /*
793 * XXX_count is incremented on every XXX operation, the delay 793 * XXX_count is incremented on every XXX operation, the delay
794 * associated with the operation is added to XXX_delay. 794 * associated with the operation is added to XXX_delay.
795 * XXX_delay contains the accumulated delay time in nanoseconds. 795 * XXX_delay contains the accumulated delay time in nanoseconds.
796 */ 796 */
797 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ 797 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
798 u64 blkio_delay; /* wait for sync block io completion */ 798 u64 blkio_delay; /* wait for sync block io completion */
799 u64 swapin_delay; /* wait for swapin block io completion */ 799 u64 swapin_delay; /* wait for swapin block io completion */
800 u32 blkio_count; /* total count of the number of sync block */ 800 u32 blkio_count; /* total count of the number of sync block */
801 /* io operations performed */ 801 /* io operations performed */
802 u32 swapin_count; /* total count of the number of swapin block */ 802 u32 swapin_count; /* total count of the number of swapin block */
803 /* io operations performed */ 803 /* io operations performed */
804 804
805 struct timespec freepages_start, freepages_end; 805 struct timespec freepages_start, freepages_end;
806 u64 freepages_delay; /* wait for memory reclaim */ 806 u64 freepages_delay; /* wait for memory reclaim */
807 u32 freepages_count; /* total count of memory reclaim */ 807 u32 freepages_count; /* total count of memory reclaim */
808 }; 808 };
809 #endif /* CONFIG_TASK_DELAY_ACCT */ 809 #endif /* CONFIG_TASK_DELAY_ACCT */
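/*
 * A worked instance of the XXX pattern documented above (the stat name
 * "thrashing" is hypothetical and only illustrates the convention, it is
 * not a field of this structure):
 *
 *	struct timespec thrashing_start, thrashing_end;
 *	u64 thrashing_delay;	(accumulated delay, in nanoseconds)
 *	u32 thrashing_count;	(number of thrashing operations)
 *
 * The accounting code would record thrashing_start/thrashing_end around
 * each operation while holding the lock above, add the difference to
 * thrashing_delay and increment thrashing_count.
 */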
810 810
811 static inline int sched_info_on(void) 811 static inline int sched_info_on(void)
812 { 812 {
813 #ifdef CONFIG_SCHEDSTATS 813 #ifdef CONFIG_SCHEDSTATS
814 return 1; 814 return 1;
815 #elif defined(CONFIG_TASK_DELAY_ACCT) 815 #elif defined(CONFIG_TASK_DELAY_ACCT)
816 extern int delayacct_on; 816 extern int delayacct_on;
817 return delayacct_on; 817 return delayacct_on;
818 #else 818 #else
819 return 0; 819 return 0;
820 #endif 820 #endif
821 } 821 }
822 822
823 enum cpu_idle_type { 823 enum cpu_idle_type {
824 CPU_IDLE, 824 CPU_IDLE,
825 CPU_NOT_IDLE, 825 CPU_NOT_IDLE,
826 CPU_NEWLY_IDLE, 826 CPU_NEWLY_IDLE,
827 CPU_MAX_IDLE_TYPES 827 CPU_MAX_IDLE_TYPES
828 }; 828 };
829 829
830 /* 830 /*
831 * Increase resolution of nice-level calculations for 64-bit architectures. 831 * Increase resolution of nice-level calculations for 64-bit architectures.
832 * The extra resolution improves shares distribution and load balancing of 832 * The extra resolution improves shares distribution and load balancing of
833 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup 833 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
834 * hierarchies, especially on larger systems. This is not a user-visible change 834 * hierarchies, especially on larger systems. This is not a user-visible change
835 * and does not change the user-interface for setting shares/weights. 835 * and does not change the user-interface for setting shares/weights.
836 * 836 *
837 * We increase resolution only if we have enough bits to allow this increased 837 * We increase resolution only if we have enough bits to allow this increased
838 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution 838 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
839 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the 839 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
840 * increased costs. 840 * increased costs.
841 */ 841 */
842 #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ 842 #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
843 # define SCHED_LOAD_RESOLUTION 10 843 # define SCHED_LOAD_RESOLUTION 10
844 # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) 844 # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
845 # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) 845 # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
846 #else 846 #else
847 # define SCHED_LOAD_RESOLUTION 0 847 # define SCHED_LOAD_RESOLUTION 0
848 # define scale_load(w) (w) 848 # define scale_load(w) (w)
849 # define scale_load_down(w) (w) 849 # define scale_load_down(w) (w)
850 #endif 850 #endif
851 851
852 #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) 852 #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
853 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) 853 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
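/*
 * Worked example of the scaling above (the numbers follow directly from
 * the definitions; 1024 is the conventional nice-0 load weight): with the
 * extra resolution compiled out, as it currently is, SCHED_LOAD_RESOLUTION
 * is 0, so scale_load(1024) == 1024 and SCHED_LOAD_SCALE == 1 << 10 == 1024.
 * If the BITS_PER_LONG > 32 branch were re-enabled, SCHED_LOAD_RESOLUTION
 * would be 10, giving scale_load(1024) == 1024 << 10 == 1048576,
 * scale_load_down(1048576) == 1024 and SCHED_LOAD_SCALE == 1 << 20.
 */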
854 854
855 /* 855 /*
856 * Increase resolution of cpu_power calculations 856 * Increase resolution of cpu_power calculations
857 */ 857 */
858 #define SCHED_POWER_SHIFT 10 858 #define SCHED_POWER_SHIFT 10
859 #define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) 859 #define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT)
860 860
861 /* 861 /*
862 * sched-domains (multiprocessor balancing) declarations: 862 * sched-domains (multiprocessor balancing) declarations:
863 */ 863 */
864 #ifdef CONFIG_SMP 864 #ifdef CONFIG_SMP
865 #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ 865 #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
866 #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ 866 #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
867 #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ 867 #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
868 #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ 868 #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
869 #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ 869 #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
870 #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ 870 #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
871 #define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ 871 #define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */
872 #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ 872 #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
873 #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ 873 #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
874 #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ 874 #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
875 #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ 875 #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
876 #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ 876 #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
877 #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ 877 #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
878 878
879 extern int __weak arch_sd_sibiling_asym_packing(void); 879 extern int __weak arch_sd_sibiling_asym_packing(void);
880 880
881 struct sched_group_power { 881 struct sched_group_power {
882 atomic_t ref; 882 atomic_t ref;
883 /* 883 /*
884 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 884 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
885 * single CPU. 885 * single CPU.
886 */ 886 */
887 unsigned int power, power_orig; 887 unsigned int power, power_orig;
888 unsigned long next_update; 888 unsigned long next_update;
889 /* 889 /*
890 * Number of busy cpus in this group. 890 * Number of busy cpus in this group.
891 */ 891 */
892 atomic_t nr_busy_cpus; 892 atomic_t nr_busy_cpus;
893 893
894 unsigned long cpumask[0]; /* iteration mask */ 894 unsigned long cpumask[0]; /* iteration mask */
895 }; 895 };
896 896
897 struct sched_group { 897 struct sched_group {
898 struct sched_group *next; /* Must be a circular list */ 898 struct sched_group *next; /* Must be a circular list */
899 atomic_t ref; 899 atomic_t ref;
900 900
901 unsigned int group_weight; 901 unsigned int group_weight;
902 struct sched_group_power *sgp; 902 struct sched_group_power *sgp;
903 903
904 /* 904 /*
905 * The CPUs this group covers. 905 * The CPUs this group covers.
906 * 906 *
907 * NOTE: this field is variable length. (Allocated dynamically 907 * NOTE: this field is variable length. (Allocated dynamically
908 * by attaching extra space to the end of the structure, 908 * by attaching extra space to the end of the structure,
909 * depending on how many CPUs the kernel has booted up with) 909 * depending on how many CPUs the kernel has booted up with)
910 */ 910 */
911 unsigned long cpumask[0]; 911 unsigned long cpumask[0];
912 }; 912 };
913 913
914 static inline struct cpumask *sched_group_cpus(struct sched_group *sg) 914 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
915 { 915 {
916 return to_cpumask(sg->cpumask); 916 return to_cpumask(sg->cpumask);
917 } 917 }
918 918
919 /* 919 /*
920 * cpumask masking which cpus in the group are allowed to iterate up the domain 920 * cpumask masking which cpus in the group are allowed to iterate up the domain
921 * tree. 921 * tree.
922 */ 922 */
923 static inline struct cpumask *sched_group_mask(struct sched_group *sg) 923 static inline struct cpumask *sched_group_mask(struct sched_group *sg)
924 { 924 {
925 return to_cpumask(sg->sgp->cpumask); 925 return to_cpumask(sg->sgp->cpumask);
926 } 926 }
927 927
928 /** 928 /**
929 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 929 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
930 * @group: The group whose first cpu is to be returned. 930 * @group: The group whose first cpu is to be returned.
931 */ 931 */
932 static inline unsigned int group_first_cpu(struct sched_group *group) 932 static inline unsigned int group_first_cpu(struct sched_group *group)
933 { 933 {
934 return cpumask_first(sched_group_cpus(group)); 934 return cpumask_first(sched_group_cpus(group));
935 } 935 }
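/*
 * Minimal usage sketch for the accessors above (the helper name is made
 * up for illustration; for_each_cpu() and printk() come from headers
 * already pulled in elsewhere):
 */
static inline void example_dump_group(struct sched_group *sg)
{
	int cpu;

	/* walk every CPU covered by this balancing group */
	for_each_cpu(cpu, sched_group_cpus(sg))
		printk(KERN_DEBUG "group weight %u, first cpu %u, cpu %d\n",
		       sg->group_weight, group_first_cpu(sg), cpu);
}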
936 936
937 struct sched_domain_attr { 937 struct sched_domain_attr {
938 int relax_domain_level; 938 int relax_domain_level;
939 }; 939 };
940 940
941 #define SD_ATTR_INIT (struct sched_domain_attr) { \ 941 #define SD_ATTR_INIT (struct sched_domain_attr) { \
942 .relax_domain_level = -1, \ 942 .relax_domain_level = -1, \
943 } 943 }
944 944
945 extern int sched_domain_level_max; 945 extern int sched_domain_level_max;
946 946
947 struct sched_domain { 947 struct sched_domain {
948 /* These fields must be setup */ 948 /* These fields must be setup */
949 struct sched_domain *parent; /* top domain must be null terminated */ 949 struct sched_domain *parent; /* top domain must be null terminated */
950 struct sched_domain *child; /* bottom domain must be null terminated */ 950 struct sched_domain *child; /* bottom domain must be null terminated */
951 struct sched_group *groups; /* the balancing groups of the domain */ 951 struct sched_group *groups; /* the balancing groups of the domain */
952 unsigned long min_interval; /* Minimum balance interval ms */ 952 unsigned long min_interval; /* Minimum balance interval ms */
953 unsigned long max_interval; /* Maximum balance interval ms */ 953 unsigned long max_interval; /* Maximum balance interval ms */
954 unsigned int busy_factor; /* less balancing by factor if busy */ 954 unsigned int busy_factor; /* less balancing by factor if busy */
955 unsigned int imbalance_pct; /* No balance until over watermark */ 955 unsigned int imbalance_pct; /* No balance until over watermark */
956 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ 956 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
957 unsigned int busy_idx; 957 unsigned int busy_idx;
958 unsigned int idle_idx; 958 unsigned int idle_idx;
959 unsigned int newidle_idx; 959 unsigned int newidle_idx;
960 unsigned int wake_idx; 960 unsigned int wake_idx;
961 unsigned int forkexec_idx; 961 unsigned int forkexec_idx;
962 unsigned int smt_gain; 962 unsigned int smt_gain;
963 int flags; /* See SD_* */ 963 int flags; /* See SD_* */
964 int level; 964 int level;
965 int idle_buddy; /* cpu assigned to select_idle_sibling() */ 965 int idle_buddy; /* cpu assigned to select_idle_sibling() */
966 966
967 /* Runtime fields. */ 967 /* Runtime fields. */
968 unsigned long last_balance; /* init to jiffies. units in jiffies */ 968 unsigned long last_balance; /* init to jiffies. units in jiffies */
969 unsigned int balance_interval; /* initialise to 1. units in ms. */ 969 unsigned int balance_interval; /* initialise to 1. units in ms. */
970 unsigned int nr_balance_failed; /* initialise to 0 */ 970 unsigned int nr_balance_failed; /* initialise to 0 */
971 971
972 u64 last_update; 972 u64 last_update;
973 973
974 #ifdef CONFIG_SCHEDSTATS 974 #ifdef CONFIG_SCHEDSTATS
975 /* load_balance() stats */ 975 /* load_balance() stats */
976 unsigned int lb_count[CPU_MAX_IDLE_TYPES]; 976 unsigned int lb_count[CPU_MAX_IDLE_TYPES];
977 unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; 977 unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
978 unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; 978 unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
979 unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; 979 unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
980 unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; 980 unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
981 unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; 981 unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
982 unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; 982 unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
983 unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; 983 unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
984 984
985 /* Active load balancing */ 985 /* Active load balancing */
986 unsigned int alb_count; 986 unsigned int alb_count;
987 unsigned int alb_failed; 987 unsigned int alb_failed;
988 unsigned int alb_pushed; 988 unsigned int alb_pushed;
989 989
990 /* SD_BALANCE_EXEC stats */ 990 /* SD_BALANCE_EXEC stats */
991 unsigned int sbe_count; 991 unsigned int sbe_count;
992 unsigned int sbe_balanced; 992 unsigned int sbe_balanced;
993 unsigned int sbe_pushed; 993 unsigned int sbe_pushed;
994 994
995 /* SD_BALANCE_FORK stats */ 995 /* SD_BALANCE_FORK stats */
996 unsigned int sbf_count; 996 unsigned int sbf_count;
997 unsigned int sbf_balanced; 997 unsigned int sbf_balanced;
998 unsigned int sbf_pushed; 998 unsigned int sbf_pushed;
999 999
1000 /* try_to_wake_up() stats */ 1000 /* try_to_wake_up() stats */
1001 unsigned int ttwu_wake_remote; 1001 unsigned int ttwu_wake_remote;
1002 unsigned int ttwu_move_affine; 1002 unsigned int ttwu_move_affine;
1003 unsigned int ttwu_move_balance; 1003 unsigned int ttwu_move_balance;
1004 #endif 1004 #endif
1005 #ifdef CONFIG_SCHED_DEBUG 1005 #ifdef CONFIG_SCHED_DEBUG
1006 char *name; 1006 char *name;
1007 #endif 1007 #endif
1008 union { 1008 union {
1009 void *private; /* used during construction */ 1009 void *private; /* used during construction */
1010 struct rcu_head rcu; /* used during destruction */ 1010 struct rcu_head rcu; /* used during destruction */
1011 }; 1011 };
1012 1012
1013 unsigned int span_weight; 1013 unsigned int span_weight;
1014 /* 1014 /*
1015 * Span of all CPUs in this domain. 1015 * Span of all CPUs in this domain.
1016 * 1016 *
1017 * NOTE: this field is variable length. (Allocated dynamically 1017 * NOTE: this field is variable length. (Allocated dynamically
1018 * by attaching extra space to the end of the structure, 1018 * by attaching extra space to the end of the structure,
1019 * depending on how many CPUs the kernel has booted up with) 1019 * depending on how many CPUs the kernel has booted up with)
1020 */ 1020 */
1021 unsigned long span[0]; 1021 unsigned long span[0];
1022 }; 1022 };
1023 1023
1024 static inline struct cpumask *sched_domain_span(struct sched_domain *sd) 1024 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
1025 { 1025 {
1026 return to_cpumask(sd->span); 1026 return to_cpumask(sd->span);
1027 } 1027 }
1028 1028
1029 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 1029 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1030 struct sched_domain_attr *dattr_new); 1030 struct sched_domain_attr *dattr_new);
1031 1031
1032 /* Allocate an array of sched domains, for partition_sched_domains(). */ 1032 /* Allocate an array of sched domains, for partition_sched_domains(). */
1033 cpumask_var_t *alloc_sched_domains(unsigned int ndoms); 1033 cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
1034 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); 1034 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
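/*
 * Hedged usage sketch for the two helpers above (error paths and the
 * callers' locking rules are omitted; cpu_active_mask comes from
 * <linux/cpumask.h>): build one domain set spanning all active CPUs and
 * install it with partition_sched_domains(), which keeps the array as
 * the current set.
 */
static inline void example_single_partition(void)
{
	cpumask_var_t *doms = alloc_sched_domains(1);

	if (!doms)
		return;
	cpumask_copy(doms[0], cpu_active_mask);
	partition_sched_domains(1, doms, NULL);
}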
1035 1035
1036 /* Test a flag in parent sched domain */ 1036 /* Test a flag in parent sched domain */
1037 static inline int test_sd_parent(struct sched_domain *sd, int flag) 1037 static inline int test_sd_parent(struct sched_domain *sd, int flag)
1038 { 1038 {
1039 if (sd->parent && (sd->parent->flags & flag)) 1039 if (sd->parent && (sd->parent->flags & flag))
1040 return 1; 1040 return 1;
1041 1041
1042 return 0; 1042 return 0;
1043 } 1043 }
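/*
 * Sketch of how the domain hierarchy is typically walked (illustrative
 * helper name; real users do this under rcu_read_lock(), since the tree
 * is freed via the rcu head embedded in struct sched_domain):
 */
static inline int example_count_balanced_levels(struct sched_domain *sd)
{
	int levels = 0;

	for (; sd; sd = sd->parent)	/* bottom up, top domain ends at NULL */
		if (sd->flags & SD_LOAD_BALANCE)
			levels++;
	return levels;
}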
1044 1044
1045 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); 1045 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
1046 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); 1046 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
1047 1047
1048 bool cpus_share_cache(int this_cpu, int that_cpu); 1048 bool cpus_share_cache(int this_cpu, int that_cpu);
1049 1049
1050 #else /* CONFIG_SMP */ 1050 #else /* CONFIG_SMP */
1051 1051
1052 struct sched_domain_attr; 1052 struct sched_domain_attr;
1053 1053
1054 static inline void 1054 static inline void
1055 partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 1055 partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1056 struct sched_domain_attr *dattr_new) 1056 struct sched_domain_attr *dattr_new)
1057 { 1057 {
1058 } 1058 }
1059 1059
1060 static inline bool cpus_share_cache(int this_cpu, int that_cpu) 1060 static inline bool cpus_share_cache(int this_cpu, int that_cpu)
1061 { 1061 {
1062 return true; 1062 return true;
1063 } 1063 }
1064 1064
1065 #endif /* !CONFIG_SMP */ 1065 #endif /* !CONFIG_SMP */
1066 1066
1067 1067
1068 struct io_context; /* See blkdev.h */ 1068 struct io_context; /* See blkdev.h */
1069 1069
1070 1070
1071 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK 1071 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
1072 extern void prefetch_stack(struct task_struct *t); 1072 extern void prefetch_stack(struct task_struct *t);
1073 #else 1073 #else
1074 static inline void prefetch_stack(struct task_struct *t) { } 1074 static inline void prefetch_stack(struct task_struct *t) { }
1075 #endif 1075 #endif
1076 1076
1077 struct audit_context; /* See audit.c */ 1077 struct audit_context; /* See audit.c */
1078 struct mempolicy; 1078 struct mempolicy;
1079 struct pipe_inode_info; 1079 struct pipe_inode_info;
1080 struct uts_namespace; 1080 struct uts_namespace;
1081 1081
1082 struct rq; 1082 struct rq;
1083 struct sched_domain; 1083 struct sched_domain;
1084 1084
1085 /* 1085 /*
1086 * wake flags 1086 * wake flags
1087 */ 1087 */
1088 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ 1088 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
1089 #define WF_FORK 0x02 /* child wakeup after fork */ 1089 #define WF_FORK 0x02 /* child wakeup after fork */
1090 #define WF_MIGRATED 0x04 /* internal use, task got migrated */ 1090 #define WF_MIGRATED 0x04 /* internal use, task got migrated */
1091 1091
1092 #define ENQUEUE_WAKEUP 1 1092 #define ENQUEUE_WAKEUP 1
1093 #define ENQUEUE_HEAD 2 1093 #define ENQUEUE_HEAD 2
1094 #ifdef CONFIG_SMP 1094 #ifdef CONFIG_SMP
1095 #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ 1095 #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
1096 #else 1096 #else
1097 #define ENQUEUE_WAKING 0 1097 #define ENQUEUE_WAKING 0
1098 #endif 1098 #endif
1099 1099
1100 #define DEQUEUE_SLEEP 1 1100 #define DEQUEUE_SLEEP 1
1101 1101
1102 struct sched_class { 1102 struct sched_class {
1103 const struct sched_class *next; 1103 const struct sched_class *next;
1104 1104
1105 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1105 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1106 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1106 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1107 void (*yield_task) (struct rq *rq); 1107 void (*yield_task) (struct rq *rq);
1108 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 1108 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
1109 1109
1110 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1110 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1111 1111
1112 struct task_struct * (*pick_next_task) (struct rq *rq); 1112 struct task_struct * (*pick_next_task) (struct rq *rq);
1113 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1113 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1114 1114
1115 #ifdef CONFIG_SMP 1115 #ifdef CONFIG_SMP
1116 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1116 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
1117 1117
1118 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1118 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1119 void (*post_schedule) (struct rq *this_rq); 1119 void (*post_schedule) (struct rq *this_rq);
1120 void (*task_waking) (struct task_struct *task); 1120 void (*task_waking) (struct task_struct *task);
1121 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1121 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
1122 1122
1123 void (*set_cpus_allowed)(struct task_struct *p, 1123 void (*set_cpus_allowed)(struct task_struct *p,
1124 const struct cpumask *newmask); 1124 const struct cpumask *newmask);
1125 1125
1126 void (*rq_online)(struct rq *rq); 1126 void (*rq_online)(struct rq *rq);
1127 void (*rq_offline)(struct rq *rq); 1127 void (*rq_offline)(struct rq *rq);
1128 #endif 1128 #endif
1129 1129
1130 void (*set_curr_task) (struct rq *rq); 1130 void (*set_curr_task) (struct rq *rq);
1131 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1131 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1132 void (*task_fork) (struct task_struct *p); 1132 void (*task_fork) (struct task_struct *p);
1133 1133
1134 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1134 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1135 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1135 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1136 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1136 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1137 int oldprio); 1137 int oldprio);
1138 1138
1139 unsigned int (*get_rr_interval) (struct rq *rq, 1139 unsigned int (*get_rr_interval) (struct rq *rq,
1140 struct task_struct *task); 1140 struct task_struct *task);
1141 1141
1142 #ifdef CONFIG_FAIR_GROUP_SCHED 1142 #ifdef CONFIG_FAIR_GROUP_SCHED
1143 void (*task_move_group) (struct task_struct *p, int on_rq); 1143 void (*task_move_group) (struct task_struct *p, int on_rq);
1144 #endif 1144 #endif
1145 }; 1145 };
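/*
 * Skeleton of what an implementation of the struct above looks like (the
 * names are hypothetical; the real classes live in kernel/sched/ and
 * chain themselves via ->next from highest to lowest scheduling priority):
 */
#if 0	/* illustration only */
static void enqueue_task_demo(struct rq *rq, struct task_struct *p, int flags)
{
	/* add p to this class's runqueue structures; flags carry ENQUEUE_* */
}

static struct task_struct *pick_next_task_demo(struct rq *rq)
{
	return NULL;	/* nothing runnable in this class */
}

static const struct sched_class demo_sched_class = {
	.next		= NULL,	/* would point at the next lower class */
	.enqueue_task	= enqueue_task_demo,
	.pick_next_task	= pick_next_task_demo,
	/* the remaining hooks are omitted for brevity */
};
#endif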
1146 1146
1147 struct load_weight { 1147 struct load_weight {
1148 unsigned long weight, inv_weight; 1148 unsigned long weight, inv_weight;
1149 }; 1149 };
1150 1150
1151 #ifdef CONFIG_SCHEDSTATS 1151 #ifdef CONFIG_SCHEDSTATS
1152 struct sched_statistics { 1152 struct sched_statistics {
1153 u64 wait_start; 1153 u64 wait_start;
1154 u64 wait_max; 1154 u64 wait_max;
1155 u64 wait_count; 1155 u64 wait_count;
1156 u64 wait_sum; 1156 u64 wait_sum;
1157 u64 iowait_count; 1157 u64 iowait_count;
1158 u64 iowait_sum; 1158 u64 iowait_sum;
1159 1159
1160 u64 sleep_start; 1160 u64 sleep_start;
1161 u64 sleep_max; 1161 u64 sleep_max;
1162 s64 sum_sleep_runtime; 1162 s64 sum_sleep_runtime;
1163 1163
1164 u64 block_start; 1164 u64 block_start;
1165 u64 block_max; 1165 u64 block_max;
1166 u64 exec_max; 1166 u64 exec_max;
1167 u64 slice_max; 1167 u64 slice_max;
1168 1168
1169 u64 nr_migrations_cold; 1169 u64 nr_migrations_cold;
1170 u64 nr_failed_migrations_affine; 1170 u64 nr_failed_migrations_affine;
1171 u64 nr_failed_migrations_running; 1171 u64 nr_failed_migrations_running;
1172 u64 nr_failed_migrations_hot; 1172 u64 nr_failed_migrations_hot;
1173 u64 nr_forced_migrations; 1173 u64 nr_forced_migrations;
1174 1174
1175 u64 nr_wakeups; 1175 u64 nr_wakeups;
1176 u64 nr_wakeups_sync; 1176 u64 nr_wakeups_sync;
1177 u64 nr_wakeups_migrate; 1177 u64 nr_wakeups_migrate;
1178 u64 nr_wakeups_local; 1178 u64 nr_wakeups_local;
1179 u64 nr_wakeups_remote; 1179 u64 nr_wakeups_remote;
1180 u64 nr_wakeups_affine; 1180 u64 nr_wakeups_affine;
1181 u64 nr_wakeups_affine_attempts; 1181 u64 nr_wakeups_affine_attempts;
1182 u64 nr_wakeups_passive; 1182 u64 nr_wakeups_passive;
1183 u64 nr_wakeups_idle; 1183 u64 nr_wakeups_idle;
1184 }; 1184 };
1185 #endif 1185 #endif
1186 1186
1187 struct sched_entity { 1187 struct sched_entity {
1188 struct load_weight load; /* for load-balancing */ 1188 struct load_weight load; /* for load-balancing */
1189 struct rb_node run_node; 1189 struct rb_node run_node;
1190 struct list_head group_node; 1190 struct list_head group_node;
1191 unsigned int on_rq; 1191 unsigned int on_rq;
1192 1192
1193 u64 exec_start; 1193 u64 exec_start;
1194 u64 sum_exec_runtime; 1194 u64 sum_exec_runtime;
1195 u64 vruntime; 1195 u64 vruntime;
1196 u64 prev_sum_exec_runtime; 1196 u64 prev_sum_exec_runtime;
1197 1197
1198 u64 nr_migrations; 1198 u64 nr_migrations;
1199 1199
1200 #ifdef CONFIG_SCHEDSTATS 1200 #ifdef CONFIG_SCHEDSTATS
1201 struct sched_statistics statistics; 1201 struct sched_statistics statistics;
1202 #endif 1202 #endif
1203 1203
1204 #ifdef CONFIG_FAIR_GROUP_SCHED 1204 #ifdef CONFIG_FAIR_GROUP_SCHED
1205 struct sched_entity *parent; 1205 struct sched_entity *parent;
1206 /* rq on which this entity is (to be) queued: */ 1206 /* rq on which this entity is (to be) queued: */
1207 struct cfs_rq *cfs_rq; 1207 struct cfs_rq *cfs_rq;
1208 /* rq "owned" by this entity/group: */ 1208 /* rq "owned" by this entity/group: */
1209 struct cfs_rq *my_q; 1209 struct cfs_rq *my_q;
1210 #endif 1210 #endif
1211 }; 1211 };
1212 1212
1213 struct sched_rt_entity { 1213 struct sched_rt_entity {
1214 struct list_head run_list; 1214 struct list_head run_list;
1215 unsigned long timeout; 1215 unsigned long timeout;
1216 unsigned int time_slice; 1216 unsigned int time_slice;
1217 1217
1218 struct sched_rt_entity *back; 1218 struct sched_rt_entity *back;
1219 #ifdef CONFIG_RT_GROUP_SCHED 1219 #ifdef CONFIG_RT_GROUP_SCHED
1220 struct sched_rt_entity *parent; 1220 struct sched_rt_entity *parent;
1221 /* rq on which this entity is (to be) queued: */ 1221 /* rq on which this entity is (to be) queued: */
1222 struct rt_rq *rt_rq; 1222 struct rt_rq *rt_rq;
1223 /* rq "owned" by this entity/group: */ 1223 /* rq "owned" by this entity/group: */
1224 struct rt_rq *my_q; 1224 struct rt_rq *my_q;
1225 #endif 1225 #endif
1226 }; 1226 };
1227 1227
1228 /* 1228 /*
1229 * default timeslice is 100 msecs (used only for SCHED_RR tasks). 1229 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
1230 * Timeslices get refilled after they expire. 1230 * Timeslices get refilled after they expire.
1231 */ 1231 */
1232 #define RR_TIMESLICE (100 * HZ / 1000) 1232 #define RR_TIMESLICE (100 * HZ / 1000)
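/*
 * Worked example: RR_TIMESLICE is expressed in jiffies, so the 100 msec
 * default comes out as 100 jiffies with HZ=1000, 25 with HZ=250 and 10
 * with HZ=100. The expression uses integer division, so the result
 * truncates for HZ values that are not a multiple of 10.
 */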
1233 1233
1234 struct rcu_node; 1234 struct rcu_node;
1235 1235
1236 enum perf_event_task_context { 1236 enum perf_event_task_context {
1237 perf_invalid_context = -1, 1237 perf_invalid_context = -1,
1238 perf_hw_context = 0, 1238 perf_hw_context = 0,
1239 perf_sw_context, 1239 perf_sw_context,
1240 perf_nr_task_contexts, 1240 perf_nr_task_contexts,
1241 }; 1241 };
1242 1242
1243 struct task_struct { 1243 struct task_struct {
1244 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1244 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
1245 void *stack; 1245 void *stack;
1246 atomic_t usage; 1246 atomic_t usage;
1247 unsigned int flags; /* per process flags, defined below */ 1247 unsigned int flags; /* per process flags, defined below */
1248 unsigned int ptrace; 1248 unsigned int ptrace;
1249 1249
1250 #ifdef CONFIG_SMP 1250 #ifdef CONFIG_SMP
1251 struct llist_node wake_entry; 1251 struct llist_node wake_entry;
1252 int on_cpu; 1252 int on_cpu;
1253 #endif 1253 #endif
1254 int on_rq; 1254 int on_rq;
1255 1255
1256 int prio, static_prio, normal_prio; 1256 int prio, static_prio, normal_prio;
1257 unsigned int rt_priority; 1257 unsigned int rt_priority;
1258 const struct sched_class *sched_class; 1258 const struct sched_class *sched_class;
1259 struct sched_entity se; 1259 struct sched_entity se;
1260 struct sched_rt_entity rt; 1260 struct sched_rt_entity rt;
1261 #ifdef CONFIG_CGROUP_SCHED 1261 #ifdef CONFIG_CGROUP_SCHED
1262 struct task_group *sched_task_group; 1262 struct task_group *sched_task_group;
1263 #endif 1263 #endif
1264 1264
1265 #ifdef CONFIG_PREEMPT_NOTIFIERS 1265 #ifdef CONFIG_PREEMPT_NOTIFIERS
1266 /* list of struct preempt_notifier: */ 1266 /* list of struct preempt_notifier: */
1267 struct hlist_head preempt_notifiers; 1267 struct hlist_head preempt_notifiers;
1268 #endif 1268 #endif
1269 1269
1270 /* 1270 /*
1271 * fpu_counter contains the number of consecutive context switches 1271 * fpu_counter contains the number of consecutive context switches
1272 * during which the FPU is used. If this is over a threshold, the lazy fpu 1272 * during which the FPU is used. If this is over a threshold, the lazy fpu
1273 * saving becomes unlazy to save the trap. This is an unsigned char 1273 * saving becomes unlazy to save the trap. This is an unsigned char
1274 * so that after 256 times the counter wraps and the behavior turns 1274 * so that after 256 times the counter wraps and the behavior turns
1275 * lazy again; this to deal with bursty apps that only use FPU for 1275 * lazy again; this to deal with bursty apps that only use FPU for
1276 * a short time 1276 * a short time
1277 */ 1277 */
1278 unsigned char fpu_counter; 1278 unsigned char fpu_counter;
1279 #ifdef CONFIG_BLK_DEV_IO_TRACE 1279 #ifdef CONFIG_BLK_DEV_IO_TRACE
1280 unsigned int btrace_seq; 1280 unsigned int btrace_seq;
1281 #endif 1281 #endif
1282 1282
1283 unsigned int policy; 1283 unsigned int policy;
1284 int nr_cpus_allowed; 1284 int nr_cpus_allowed;
1285 cpumask_t cpus_allowed; 1285 cpumask_t cpus_allowed;
1286 1286
1287 #ifdef CONFIG_PREEMPT_RCU 1287 #ifdef CONFIG_PREEMPT_RCU
1288 int rcu_read_lock_nesting; 1288 int rcu_read_lock_nesting;
1289 char rcu_read_unlock_special; 1289 char rcu_read_unlock_special;
1290 struct list_head rcu_node_entry; 1290 struct list_head rcu_node_entry;
1291 #endif /* #ifdef CONFIG_PREEMPT_RCU */ 1291 #endif /* #ifdef CONFIG_PREEMPT_RCU */
1292 #ifdef CONFIG_TREE_PREEMPT_RCU 1292 #ifdef CONFIG_TREE_PREEMPT_RCU
1293 struct rcu_node *rcu_blocked_node; 1293 struct rcu_node *rcu_blocked_node;
1294 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1294 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1295 #ifdef CONFIG_RCU_BOOST 1295 #ifdef CONFIG_RCU_BOOST
1296 struct rt_mutex *rcu_boost_mutex; 1296 struct rt_mutex *rcu_boost_mutex;
1297 #endif /* #ifdef CONFIG_RCU_BOOST */ 1297 #endif /* #ifdef CONFIG_RCU_BOOST */
1298 1298
1299 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1299 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1300 struct sched_info sched_info; 1300 struct sched_info sched_info;
1301 #endif 1301 #endif
1302 1302
1303 struct list_head tasks; 1303 struct list_head tasks;
1304 #ifdef CONFIG_SMP 1304 #ifdef CONFIG_SMP
1305 struct plist_node pushable_tasks; 1305 struct plist_node pushable_tasks;
1306 #endif 1306 #endif
1307 1307
1308 struct mm_struct *mm, *active_mm; 1308 struct mm_struct *mm, *active_mm;
1309 #ifdef CONFIG_COMPAT_BRK 1309 #ifdef CONFIG_COMPAT_BRK
1310 unsigned brk_randomized:1; 1310 unsigned brk_randomized:1;
1311 #endif 1311 #endif
1312 #if defined(SPLIT_RSS_COUNTING) 1312 #if defined(SPLIT_RSS_COUNTING)
1313 struct task_rss_stat rss_stat; 1313 struct task_rss_stat rss_stat;
1314 #endif 1314 #endif
1315 /* task state */ 1315 /* task state */
1316 int exit_state; 1316 int exit_state;
1317 int exit_code, exit_signal; 1317 int exit_code, exit_signal;
1318 int pdeath_signal; /* The signal sent when the parent dies */ 1318 int pdeath_signal; /* The signal sent when the parent dies */
1319 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1319 unsigned int jobctl; /* JOBCTL_*, siglock protected */
1320 /* ??? */ 1320 /* ??? */
1321 unsigned int personality; 1321 unsigned int personality;
1322 unsigned did_exec:1; 1322 unsigned did_exec:1;
1323 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1323 unsigned in_execve:1; /* Tell the LSMs that the process is doing an
1324 * execve */ 1324 * execve */
1325 unsigned in_iowait:1; 1325 unsigned in_iowait:1;
1326 1326
1327 /* task may not gain privileges */ 1327 /* task may not gain privileges */
1328 unsigned no_new_privs:1; 1328 unsigned no_new_privs:1;
1329 1329
1330 /* Revert to default priority/policy when forking */ 1330 /* Revert to default priority/policy when forking */
1331 unsigned sched_reset_on_fork:1; 1331 unsigned sched_reset_on_fork:1;
1332 unsigned sched_contributes_to_load:1; 1332 unsigned sched_contributes_to_load:1;
1333 1333
1334 pid_t pid; 1334 pid_t pid;
1335 pid_t tgid; 1335 pid_t tgid;
1336 1336
1337 #ifdef CONFIG_CC_STACKPROTECTOR 1337 #ifdef CONFIG_CC_STACKPROTECTOR
1338 /* Canary value for the -fstack-protector gcc feature */ 1338 /* Canary value for the -fstack-protector gcc feature */
1339 unsigned long stack_canary; 1339 unsigned long stack_canary;
1340 #endif 1340 #endif
1341 /* 1341 /*
1342 * pointers to (original) parent process, youngest child, younger sibling, 1342 * pointers to (original) parent process, youngest child, younger sibling,
1343 * older sibling, respectively. (p->father can be replaced with 1343 * older sibling, respectively. (p->father can be replaced with
1344 * p->real_parent->pid) 1344 * p->real_parent->pid)
1345 */ 1345 */
1346 struct task_struct __rcu *real_parent; /* real parent process */ 1346 struct task_struct __rcu *real_parent; /* real parent process */
1347 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ 1347 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1348 /* 1348 /*
1349 * children/sibling forms the list of my natural children 1349 * children/sibling forms the list of my natural children
1350 */ 1350 */
1351 struct list_head children; /* list of my children */ 1351 struct list_head children; /* list of my children */
1352 struct list_head sibling; /* linkage in my parent's children list */ 1352 struct list_head sibling; /* linkage in my parent's children list */
1353 struct task_struct *group_leader; /* threadgroup leader */ 1353 struct task_struct *group_leader; /* threadgroup leader */
1354 1354
1355 /* 1355 /*
1356 * ptraced is the list of tasks this task is using ptrace on. 1356 * ptraced is the list of tasks this task is using ptrace on.
1357 * This includes both natural children and PTRACE_ATTACH targets. 1357 * This includes both natural children and PTRACE_ATTACH targets.
1358 * p->ptrace_entry is p's link on the p->parent->ptraced list. 1358 * p->ptrace_entry is p's link on the p->parent->ptraced list.
1359 */ 1359 */
1360 struct list_head ptraced; 1360 struct list_head ptraced;
1361 struct list_head ptrace_entry; 1361 struct list_head ptrace_entry;
1362 1362
1363 /* PID/PID hash table linkage. */ 1363 /* PID/PID hash table linkage. */
1364 struct pid_link pids[PIDTYPE_MAX]; 1364 struct pid_link pids[PIDTYPE_MAX];
1365 struct list_head thread_group; 1365 struct list_head thread_group;
1366 1366
1367 struct completion *vfork_done; /* for vfork() */ 1367 struct completion *vfork_done; /* for vfork() */
1368 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1368 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
1369 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1369 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
1370 1370
1371 cputime_t utime, stime, utimescaled, stimescaled; 1371 cputime_t utime, stime, utimescaled, stimescaled;
1372 cputime_t gtime; 1372 cputime_t gtime;
1373 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 1373 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
1374 cputime_t prev_utime, prev_stime; 1374 cputime_t prev_utime, prev_stime;
1375 #endif 1375 #endif
1376 unsigned long nvcsw, nivcsw; /* context switch counts */ 1376 unsigned long nvcsw, nivcsw; /* context switch counts */
1377 struct timespec start_time; /* monotonic time */ 1377 struct timespec start_time; /* monotonic time */
1378 struct timespec real_start_time; /* boot based time */ 1378 struct timespec real_start_time; /* boot based time */
1379 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ 1379 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
1380 unsigned long min_flt, maj_flt; 1380 unsigned long min_flt, maj_flt;
1381 1381
1382 struct task_cputime cputime_expires; 1382 struct task_cputime cputime_expires;
1383 struct list_head cpu_timers[3]; 1383 struct list_head cpu_timers[3];
1384 1384
1385 /* process credentials */ 1385 /* process credentials */
1386 const struct cred __rcu *real_cred; /* objective and real subjective task 1386 const struct cred __rcu *real_cred; /* objective and real subjective task
1387 * credentials (COW) */ 1387 * credentials (COW) */
1388 const struct cred __rcu *cred; /* effective (overridable) subjective task 1388 const struct cred __rcu *cred; /* effective (overridable) subjective task
1389 * credentials (COW) */ 1389 * credentials (COW) */
1390 char comm[TASK_COMM_LEN]; /* executable name excluding path 1390 char comm[TASK_COMM_LEN]; /* executable name excluding path
1391 - access with [gs]et_task_comm (which lock 1391 - access with [gs]et_task_comm (which lock
1392 it with task_lock()) 1392 it with task_lock())
1393 - initialized normally by setup_new_exec */ 1393 - initialized normally by setup_new_exec */
1394 /* file system info */ 1394 /* file system info */
1395 int link_count, total_link_count; 1395 int link_count, total_link_count;
1396 #ifdef CONFIG_SYSVIPC 1396 #ifdef CONFIG_SYSVIPC
1397 /* ipc stuff */ 1397 /* ipc stuff */
1398 struct sysv_sem sysvsem; 1398 struct sysv_sem sysvsem;
1399 #endif 1399 #endif
1400 #ifdef CONFIG_DETECT_HUNG_TASK 1400 #ifdef CONFIG_DETECT_HUNG_TASK
1401 /* hung task detection */ 1401 /* hung task detection */
1402 unsigned long last_switch_count; 1402 unsigned long last_switch_count;
1403 #endif 1403 #endif
1404 /* CPU-specific state of this task */ 1404 /* CPU-specific state of this task */
1405 struct thread_struct thread; 1405 struct thread_struct thread;
1406 /* filesystem information */ 1406 /* filesystem information */
1407 struct fs_struct *fs; 1407 struct fs_struct *fs;
1408 /* open file information */ 1408 /* open file information */
1409 struct files_struct *files; 1409 struct files_struct *files;
1410 /* namespaces */ 1410 /* namespaces */
1411 struct nsproxy *nsproxy; 1411 struct nsproxy *nsproxy;
1412 /* signal handlers */ 1412 /* signal handlers */
1413 struct signal_struct *signal; 1413 struct signal_struct *signal;
1414 struct sighand_struct *sighand; 1414 struct sighand_struct *sighand;
1415 1415
1416 sigset_t blocked, real_blocked; 1416 sigset_t blocked, real_blocked;
1417 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ 1417 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
1418 struct sigpending pending; 1418 struct sigpending pending;
1419 1419
1420 unsigned long sas_ss_sp; 1420 unsigned long sas_ss_sp;
1421 size_t sas_ss_size; 1421 size_t sas_ss_size;
1422 int (*notifier)(void *priv); 1422 int (*notifier)(void *priv);
1423 void *notifier_data; 1423 void *notifier_data;
1424 sigset_t *notifier_mask; 1424 sigset_t *notifier_mask;
1425 struct callback_head *task_works; 1425 struct callback_head *task_works;
1426 1426
1427 struct audit_context *audit_context; 1427 struct audit_context *audit_context;
1428 #ifdef CONFIG_AUDITSYSCALL 1428 #ifdef CONFIG_AUDITSYSCALL
1429 uid_t loginuid; 1429 uid_t loginuid;
1430 unsigned int sessionid; 1430 unsigned int sessionid;
1431 #endif 1431 #endif
1432 struct seccomp seccomp; 1432 struct seccomp seccomp;
1433 1433
1434 /* Thread group tracking */ 1434 /* Thread group tracking */
1435 u32 parent_exec_id; 1435 u32 parent_exec_id;
1436 u32 self_exec_id; 1436 u32 self_exec_id;
1437 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, 1437 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
1438 * mempolicy */ 1438 * mempolicy */
1439 spinlock_t alloc_lock; 1439 spinlock_t alloc_lock;
1440 1440
1441 /* Protection of the PI data structures: */ 1441 /* Protection of the PI data structures: */
1442 raw_spinlock_t pi_lock; 1442 raw_spinlock_t pi_lock;
1443 1443
1444 #ifdef CONFIG_RT_MUTEXES 1444 #ifdef CONFIG_RT_MUTEXES
1445 /* PI waiters blocked on a rt_mutex held by this task */ 1445 /* PI waiters blocked on a rt_mutex held by this task */
1446 struct plist_head pi_waiters; 1446 struct plist_head pi_waiters;
1447 /* Deadlock detection and priority inheritance handling */ 1447 /* Deadlock detection and priority inheritance handling */
1448 struct rt_mutex_waiter *pi_blocked_on; 1448 struct rt_mutex_waiter *pi_blocked_on;
1449 #endif 1449 #endif
1450 1450
1451 #ifdef CONFIG_DEBUG_MUTEXES 1451 #ifdef CONFIG_DEBUG_MUTEXES
1452 /* mutex deadlock detection */ 1452 /* mutex deadlock detection */
1453 struct mutex_waiter *blocked_on; 1453 struct mutex_waiter *blocked_on;
1454 #endif 1454 #endif
1455 #ifdef CONFIG_TRACE_IRQFLAGS 1455 #ifdef CONFIG_TRACE_IRQFLAGS
1456 unsigned int irq_events; 1456 unsigned int irq_events;
1457 unsigned long hardirq_enable_ip; 1457 unsigned long hardirq_enable_ip;
1458 unsigned long hardirq_disable_ip; 1458 unsigned long hardirq_disable_ip;
1459 unsigned int hardirq_enable_event; 1459 unsigned int hardirq_enable_event;
1460 unsigned int hardirq_disable_event; 1460 unsigned int hardirq_disable_event;
1461 int hardirqs_enabled; 1461 int hardirqs_enabled;
1462 int hardirq_context; 1462 int hardirq_context;
1463 unsigned long softirq_disable_ip; 1463 unsigned long softirq_disable_ip;
1464 unsigned long softirq_enable_ip; 1464 unsigned long softirq_enable_ip;
1465 unsigned int softirq_disable_event; 1465 unsigned int softirq_disable_event;
1466 unsigned int softirq_enable_event; 1466 unsigned int softirq_enable_event;
1467 int softirqs_enabled; 1467 int softirqs_enabled;
1468 int softirq_context; 1468 int softirq_context;
1469 #endif 1469 #endif
1470 #ifdef CONFIG_LOCKDEP 1470 #ifdef CONFIG_LOCKDEP
1471 # define MAX_LOCK_DEPTH 48UL 1471 # define MAX_LOCK_DEPTH 48UL
1472 u64 curr_chain_key; 1472 u64 curr_chain_key;
1473 int lockdep_depth; 1473 int lockdep_depth;
1474 unsigned int lockdep_recursion; 1474 unsigned int lockdep_recursion;
1475 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1475 struct held_lock held_locks[MAX_LOCK_DEPTH];
1476 gfp_t lockdep_reclaim_gfp; 1476 gfp_t lockdep_reclaim_gfp;
1477 #endif 1477 #endif
1478 1478
1479 /* journalling filesystem info */ 1479 /* journalling filesystem info */
1480 void *journal_info; 1480 void *journal_info;
1481 1481
1482 /* stacked block device info */ 1482 /* stacked block device info */
1483 struct bio_list *bio_list; 1483 struct bio_list *bio_list;
1484 1484
1485 #ifdef CONFIG_BLOCK 1485 #ifdef CONFIG_BLOCK
1486 /* stack plugging */ 1486 /* stack plugging */
1487 struct blk_plug *plug; 1487 struct blk_plug *plug;
1488 #endif 1488 #endif
1489 1489
1490 /* VM state */ 1490 /* VM state */
1491 struct reclaim_state *reclaim_state; 1491 struct reclaim_state *reclaim_state;
1492 1492
1493 struct backing_dev_info *backing_dev_info; 1493 struct backing_dev_info *backing_dev_info;
1494 1494
1495 struct io_context *io_context; 1495 struct io_context *io_context;
1496 1496
1497 unsigned long ptrace_message; 1497 unsigned long ptrace_message;
1498 siginfo_t *last_siginfo; /* For ptrace use. */ 1498 siginfo_t *last_siginfo; /* For ptrace use. */
1499 struct task_io_accounting ioac; 1499 struct task_io_accounting ioac;
1500 #if defined(CONFIG_TASK_XACCT) 1500 #if defined(CONFIG_TASK_XACCT)
1501 u64 acct_rss_mem1; /* accumulated rss usage */ 1501 u64 acct_rss_mem1; /* accumulated rss usage */
1502 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1502 u64 acct_vm_mem1; /* accumulated virtual memory usage */
1503 cputime_t acct_timexpd; /* stime + utime since last update */ 1503 cputime_t acct_timexpd; /* stime + utime since last update */
1504 #endif 1504 #endif
1505 #ifdef CONFIG_CPUSETS 1505 #ifdef CONFIG_CPUSETS
1506 nodemask_t mems_allowed; /* Protected by alloc_lock */ 1506 nodemask_t mems_allowed; /* Protected by alloc_lock */
1507 seqcount_t mems_allowed_seq; /* Sequence no to catch updates */ 1507 seqcount_t mems_allowed_seq; /* Sequence no to catch updates */
1508 int cpuset_mem_spread_rotor; 1508 int cpuset_mem_spread_rotor;
1509 int cpuset_slab_spread_rotor; 1509 int cpuset_slab_spread_rotor;
1510 #endif 1510 #endif
1511 #ifdef CONFIG_CGROUPS 1511 #ifdef CONFIG_CGROUPS
1512 /* Control Group info protected by css_set_lock */ 1512 /* Control Group info protected by css_set_lock */
1513 struct css_set __rcu *cgroups; 1513 struct css_set __rcu *cgroups;
1514 /* cg_list protected by css_set_lock and tsk->alloc_lock */ 1514 /* cg_list protected by css_set_lock and tsk->alloc_lock */
1515 struct list_head cg_list; 1515 struct list_head cg_list;
1516 #endif 1516 #endif
1517 #ifdef CONFIG_FUTEX 1517 #ifdef CONFIG_FUTEX
1518 struct robust_list_head __user *robust_list; 1518 struct robust_list_head __user *robust_list;
1519 #ifdef CONFIG_COMPAT 1519 #ifdef CONFIG_COMPAT
1520 struct compat_robust_list_head __user *compat_robust_list; 1520 struct compat_robust_list_head __user *compat_robust_list;
1521 #endif 1521 #endif
1522 struct list_head pi_state_list; 1522 struct list_head pi_state_list;
1523 struct futex_pi_state *pi_state_cache; 1523 struct futex_pi_state *pi_state_cache;
1524 #endif 1524 #endif
1525 #ifdef CONFIG_PERF_EVENTS 1525 #ifdef CONFIG_PERF_EVENTS
1526 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; 1526 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
1527 struct mutex perf_event_mutex; 1527 struct mutex perf_event_mutex;
1528 struct list_head perf_event_list; 1528 struct list_head perf_event_list;
1529 #endif 1529 #endif
1530 #ifdef CONFIG_NUMA 1530 #ifdef CONFIG_NUMA
1531 struct mempolicy *mempolicy; /* Protected by alloc_lock */ 1531 struct mempolicy *mempolicy; /* Protected by alloc_lock */
1532 short il_next; 1532 short il_next;
1533 short pref_node_fork; 1533 short pref_node_fork;
1534 #endif 1534 #endif
1535 struct rcu_head rcu; 1535 struct rcu_head rcu;
1536 1536
1537 /* 1537 /*
1538 * cache last used pipe for splice 1538 * cache last used pipe for splice
1539 */ 1539 */
1540 struct pipe_inode_info *splice_pipe; 1540 struct pipe_inode_info *splice_pipe;
1541 #ifdef CONFIG_TASK_DELAY_ACCT 1541 #ifdef CONFIG_TASK_DELAY_ACCT
1542 struct task_delay_info *delays; 1542 struct task_delay_info *delays;
1543 #endif 1543 #endif
1544 #ifdef CONFIG_FAULT_INJECTION 1544 #ifdef CONFIG_FAULT_INJECTION
1545 int make_it_fail; 1545 int make_it_fail;
1546 #endif 1546 #endif
1547 /* 1547 /*
1548 * when (nr_dirtied >= nr_dirtied_pause), it's time to call 1548 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
1549 * balance_dirty_pages() for some dirty throttling pause 1549 * balance_dirty_pages() for some dirty throttling pause
1550 */ 1550 */
1551 int nr_dirtied; 1551 int nr_dirtied;
1552 int nr_dirtied_pause; 1552 int nr_dirtied_pause;
1553 unsigned long dirty_paused_when; /* start of a write-and-pause period */ 1553 unsigned long dirty_paused_when; /* start of a write-and-pause period */
1554 1554
1555 #ifdef CONFIG_LATENCYTOP 1555 #ifdef CONFIG_LATENCYTOP
1556 int latency_record_count; 1556 int latency_record_count;
1557 struct latency_record latency_record[LT_SAVECOUNT]; 1557 struct latency_record latency_record[LT_SAVECOUNT];
1558 #endif 1558 #endif
1559 /* 1559 /*
1560 * time slack values; these are used to round up poll() and 1560 * time slack values; these are used to round up poll() and
1561 * select() etc timeout values. These are in nanoseconds. 1561 * select() etc timeout values. These are in nanoseconds.
1562 */ 1562 */
1563 unsigned long timer_slack_ns; 1563 unsigned long timer_slack_ns;
1564 unsigned long default_timer_slack_ns; 1564 unsigned long default_timer_slack_ns;
1565 1565
1566 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1566 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
1567 /* Index of current stored address in ret_stack */ 1567 /* Index of current stored address in ret_stack */
1568 int curr_ret_stack; 1568 int curr_ret_stack;
1569 /* Stack of return addresses for return function tracing */ 1569 /* Stack of return addresses for return function tracing */
1570 struct ftrace_ret_stack *ret_stack; 1570 struct ftrace_ret_stack *ret_stack;
1571 /* time stamp for last schedule */ 1571 /* time stamp for last schedule */
1572 unsigned long long ftrace_timestamp; 1572 unsigned long long ftrace_timestamp;
1573 /* 1573 /*
1574 * Number of functions that haven't been traced 1574 * Number of functions that haven't been traced
1575 * because of depth overrun. 1575 * because of depth overrun.
1576 */ 1576 */
1577 atomic_t trace_overrun; 1577 atomic_t trace_overrun;
1578 /* Pause for the tracing */ 1578 /* Pause for the tracing */
1579 atomic_t tracing_graph_pause; 1579 atomic_t tracing_graph_pause;
1580 #endif 1580 #endif
1581 #ifdef CONFIG_TRACING 1581 #ifdef CONFIG_TRACING
1582 /* state flags for use by tracers */ 1582 /* state flags for use by tracers */
1583 unsigned long trace; 1583 unsigned long trace;
1584 /* bitmask and counter of trace recursion */ 1584 /* bitmask and counter of trace recursion */
1585 unsigned long trace_recursion; 1585 unsigned long trace_recursion;
1586 #endif /* CONFIG_TRACING */ 1586 #endif /* CONFIG_TRACING */
1587 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ 1587 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
1588 struct memcg_batch_info { 1588 struct memcg_batch_info {
1589 int do_batch; /* incremented when batch uncharge started */ 1589 int do_batch; /* incremented when batch uncharge started */
1590 struct mem_cgroup *memcg; /* target memcg of uncharge */ 1590 struct mem_cgroup *memcg; /* target memcg of uncharge */
1591 unsigned long nr_pages; /* uncharged usage */ 1591 unsigned long nr_pages; /* uncharged usage */
1592 unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ 1592 unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
1593 } memcg_batch; 1593 } memcg_batch;
1594 #endif 1594 #endif
1595 #ifdef CONFIG_HAVE_HW_BREAKPOINT 1595 #ifdef CONFIG_HAVE_HW_BREAKPOINT
1596 atomic_t ptrace_bp_refcnt; 1596 atomic_t ptrace_bp_refcnt;
1597 #endif 1597 #endif
1598 #ifdef CONFIG_UPROBES 1598 #ifdef CONFIG_UPROBES
1599 struct uprobe_task *utask; 1599 struct uprobe_task *utask;
1600 #endif 1600 #endif
1601 }; 1601 };
1602 1602
1603 /* Future-safe accessor for struct task_struct's cpus_allowed. */ 1603 /* Future-safe accessor for struct task_struct's cpus_allowed. */
1604 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) 1604 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
1605 1605
1606 /* 1606 /*
1607 * Priority of a process goes from 0..MAX_PRIO-1, valid RT 1607 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
1608 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH 1608 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
1609 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority 1609 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
1610 * values are inverted: lower p->prio value means higher priority. 1610 * values are inverted: lower p->prio value means higher priority.
1611 * 1611 *
1612 * The MAX_USER_RT_PRIO value allows the actual maximum 1612 * The MAX_USER_RT_PRIO value allows the actual maximum
1613 * RT priority to be separate from the value exported to 1613 * RT priority to be separate from the value exported to
1614 * user-space. This allows kernel threads to set their 1614 * user-space. This allows kernel threads to set their
1615 * priority to a value higher than any user task. Note: 1615 * priority to a value higher than any user task. Note:
1616 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 1616 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
1617 */ 1617 */
1618 1618
1619 #define MAX_USER_RT_PRIO 100 1619 #define MAX_USER_RT_PRIO 100
1620 #define MAX_RT_PRIO MAX_USER_RT_PRIO 1620 #define MAX_RT_PRIO MAX_USER_RT_PRIO
1621 1621
1622 #define MAX_PRIO (MAX_RT_PRIO + 40) 1622 #define MAX_PRIO (MAX_RT_PRIO + 40)
1623 #define DEFAULT_PRIO (MAX_RT_PRIO + 20) 1623 #define DEFAULT_PRIO (MAX_RT_PRIO + 20)
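/*
 * Worked out, the comment above fixes the following layout: prio 0..99
 * (MAX_RT_PRIO-1) is the realtime range and prio 100..139 the
 * SCHED_NORMAL/SCHED_BATCH range; within the latter, nice -20 maps to
 * 100, nice 0 to DEFAULT_PRIO (120) and nice +19 to 139 (MAX_PRIO-1).
 * Lower numbers always mean higher priority.
 */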
1624 1624
1625 static inline int rt_prio(int prio) 1625 static inline int rt_prio(int prio)
1626 { 1626 {
1627 if (unlikely(prio < MAX_RT_PRIO)) 1627 if (unlikely(prio < MAX_RT_PRIO))
1628 return 1; 1628 return 1;
1629 return 0; 1629 return 0;
1630 } 1630 }
1631 1631
1632 static inline int rt_task(struct task_struct *p) 1632 static inline int rt_task(struct task_struct *p)
1633 { 1633 {
1634 return rt_prio(p->prio); 1634 return rt_prio(p->prio);
1635 } 1635 }
1636 1636
1637 static inline struct pid *task_pid(struct task_struct *task) 1637 static inline struct pid *task_pid(struct task_struct *task)
1638 { 1638 {
1639 return task->pids[PIDTYPE_PID].pid; 1639 return task->pids[PIDTYPE_PID].pid;
1640 } 1640 }
1641 1641
1642 static inline struct pid *task_tgid(struct task_struct *task) 1642 static inline struct pid *task_tgid(struct task_struct *task)
1643 { 1643 {
1644 return task->group_leader->pids[PIDTYPE_PID].pid; 1644 return task->group_leader->pids[PIDTYPE_PID].pid;
1645 } 1645 }
1646 1646
1647 /* 1647 /*
1648 * Without tasklist or rcu lock it is not safe to dereference 1648 * Without tasklist or rcu lock it is not safe to dereference
1649 * the result of task_pgrp/task_session even if task == current, 1649 * the result of task_pgrp/task_session even if task == current,
1650 * we can race with another thread doing sys_setsid/sys_setpgid. 1650 * we can race with another thread doing sys_setsid/sys_setpgid.
1651 */ 1651 */
1652 static inline struct pid *task_pgrp(struct task_struct *task) 1652 static inline struct pid *task_pgrp(struct task_struct *task)
1653 { 1653 {
1654 return task->group_leader->pids[PIDTYPE_PGID].pid; 1654 return task->group_leader->pids[PIDTYPE_PGID].pid;
1655 } 1655 }
1656 1656
1657 static inline struct pid *task_session(struct task_struct *task) 1657 static inline struct pid *task_session(struct task_struct *task)
1658 { 1658 {
1659 return task->group_leader->pids[PIDTYPE_SID].pid; 1659 return task->group_leader->pids[PIDTYPE_SID].pid;
1660 } 1660 }
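/*
 * Minimal sketch of the locking rule spelled out above (hypothetical
 * helper; pid_vnr() is declared in <linux/pid.h>): hold rcu_read_lock()
 * across both the lookup and the use of the returned struct pid.
 */
static inline pid_t example_read_pgrp(struct task_struct *task)
{
	pid_t nr;

	rcu_read_lock();
	nr = pid_vnr(task_pgrp(task));
	rcu_read_unlock();
	return nr;
}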
1661 1661
1662 struct pid_namespace; 1662 struct pid_namespace;
1663 1663
1664 /* 1664 /*
1665 * the helpers to get the task's different pids as they are seen 1665 * the helpers to get the task's different pids as they are seen
1666 * from various namespaces 1666 * from various namespaces
1667 * 1667 *
1668 * task_xid_nr() : global id, i.e. the id seen from the init namespace; 1668 * task_xid_nr() : global id, i.e. the id seen from the init namespace;
1669 * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of 1669 * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
1670 * current. 1670 * current.
1671 * task_xid_nr_ns() : id seen from the ns specified; 1671 * task_xid_nr_ns() : id seen from the ns specified;
1672 * 1672 *
1673 * set_task_vxid() : assigns a virtual id to a task; 1673 * set_task_vxid() : assigns a virtual id to a task;
1674 * 1674 *
1675 * see also pid_nr() etc in include/linux/pid.h 1675 * see also pid_nr() etc in include/linux/pid.h
1676 */ 1676 */
1677 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, 1677 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
1678 struct pid_namespace *ns); 1678 struct pid_namespace *ns);
1679 1679
1680 static inline pid_t task_pid_nr(struct task_struct *tsk) 1680 static inline pid_t task_pid_nr(struct task_struct *tsk)
1681 { 1681 {
1682 return tsk->pid; 1682 return tsk->pid;
1683 } 1683 }
1684 1684
1685 static inline pid_t task_pid_nr_ns(struct task_struct *tsk, 1685 static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
1686 struct pid_namespace *ns) 1686 struct pid_namespace *ns)
1687 { 1687 {
1688 return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); 1688 return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
1689 } 1689 }
1690 1690
1691 static inline pid_t task_pid_vnr(struct task_struct *tsk) 1691 static inline pid_t task_pid_vnr(struct task_struct *tsk)
1692 { 1692 {
1693 return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); 1693 return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
1694 } 1694 }
1695 1695
1696 1696
1697 static inline pid_t task_tgid_nr(struct task_struct *tsk) 1697 static inline pid_t task_tgid_nr(struct task_struct *tsk)
1698 { 1698 {
1699 return tsk->tgid; 1699 return tsk->tgid;
1700 } 1700 }
1701 1701
1702 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); 1702 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
1703 1703
1704 static inline pid_t task_tgid_vnr(struct task_struct *tsk) 1704 static inline pid_t task_tgid_vnr(struct task_struct *tsk)
1705 { 1705 {
1706 return pid_vnr(task_tgid(tsk)); 1706 return pid_vnr(task_tgid(tsk));
1707 } 1707 }
1708 1708
1709 1709
1710 static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, 1710 static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
1711 struct pid_namespace *ns) 1711 struct pid_namespace *ns)
1712 { 1712 {
1713 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); 1713 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
1714 } 1714 }
1715 1715
1716 static inline pid_t task_pgrp_vnr(struct task_struct *tsk) 1716 static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
1717 { 1717 {
1718 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); 1718 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
1719 } 1719 }
1720 1720
1721 1721
1722 static inline pid_t task_session_nr_ns(struct task_struct *tsk, 1722 static inline pid_t task_session_nr_ns(struct task_struct *tsk,
1723 struct pid_namespace *ns) 1723 struct pid_namespace *ns)
1724 { 1724 {
1725 return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); 1725 return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
1726 } 1726 }
1727 1727
1728 static inline pid_t task_session_vnr(struct task_struct *tsk) 1728 static inline pid_t task_session_vnr(struct task_struct *tsk)
1729 { 1729 {
1730 return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); 1730 return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
1731 } 1731 }
1732 1732
1733 /* obsolete, do not use */ 1733 /* obsolete, do not use */
1734 static inline pid_t task_pgrp_nr(struct task_struct *tsk) 1734 static inline pid_t task_pgrp_nr(struct task_struct *tsk)
1735 { 1735 {
1736 return task_pgrp_nr_ns(tsk, &init_pid_ns); 1736 return task_pgrp_nr_ns(tsk, &init_pid_ns);
1737 } 1737 }
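
To make the *_nr/*_vnr split documented above concrete, a task can report both views of its own id; a hedged sketch (function name and message are illustrative only), where the two values differ whenever the task runs inside a non-init pid namespace:

#include <linux/sched.h>
#include <linux/printk.h>

static void report_own_pids(void)
{
        pid_t global_id   = task_pid_nr(current);       /* id as seen from the init namespace */
        pid_t ns_local_id = task_pid_vnr(current);      /* id as seen from current's own namespace */

        pr_info("pid %d globally, %d in my pid namespace\n", global_id, ns_local_id);
}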
1738 1738
1739 /** 1739 /**
1740 * pid_alive - check that a task structure is not stale 1740 * pid_alive - check that a task structure is not stale
1741 * @p: Task structure to be checked. 1741 * @p: Task structure to be checked.
1742 * 1742 *
1743 * Test if a process is not yet dead (at most zombie state) 1743 * Test if a process is not yet dead (at most zombie state)
1744 * If pid_alive fails, then pointers within the task structure 1744 * If pid_alive fails, then pointers within the task structure
1745 * can be stale and must not be dereferenced. 1745 * can be stale and must not be dereferenced.
1746 */ 1746 */
1747 static inline int pid_alive(struct task_struct *p) 1747 static inline int pid_alive(struct task_struct *p)
1748 { 1748 {
1749 return p->pids[PIDTYPE_PID].pid != NULL; 1749 return p->pids[PIDTYPE_PID].pid != NULL;
1750 } 1750 }
1751 1751
1752 /** 1752 /**
1753 * is_global_init - check if a task structure is init 1753 * is_global_init - check if a task structure is init
1754 * @tsk: Task structure to be checked. 1754 * @tsk: Task structure to be checked.
1755 * 1755 *
1756 * Check if a task structure is the first user space task the kernel created. 1756 * Check if a task structure is the first user space task the kernel created.
1757 */ 1757 */
1758 static inline int is_global_init(struct task_struct *tsk) 1758 static inline int is_global_init(struct task_struct *tsk)
1759 { 1759 {
1760 return tsk->pid == 1; 1760 return tsk->pid == 1;
1761 } 1761 }
1762 1762
1763 /* 1763 /*
1764 * is_container_init: 1764 * is_container_init:
1765 * check whether the task is init in its own pid namespace. 1765 * check whether the task is init in its own pid namespace.
1766 */ 1766 */
1767 extern int is_container_init(struct task_struct *tsk); 1767 extern int is_container_init(struct task_struct *tsk);
1768 1768
1769 extern struct pid *cad_pid; 1769 extern struct pid *cad_pid;
1770 1770
1771 extern void free_task(struct task_struct *tsk); 1771 extern void free_task(struct task_struct *tsk);
1772 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) 1772 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
1773 1773
1774 extern void __put_task_struct(struct task_struct *t); 1774 extern void __put_task_struct(struct task_struct *t);
1775 1775
1776 static inline void put_task_struct(struct task_struct *t) 1776 static inline void put_task_struct(struct task_struct *t)
1777 { 1777 {
1778 if (atomic_dec_and_test(&t->usage)) 1778 if (atomic_dec_and_test(&t->usage))
1779 __put_task_struct(t); 1779 __put_task_struct(t);
1780 } 1780 }
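
get_task_struct() and put_task_struct() above form the task structure's reference count; a minimal sketch of the usual pairing (function name illustrative), assuming the caller found tsk under a lock or RCU before pinning it:

static void use_task_later(struct task_struct *tsk)
{
        get_task_struct(tsk);   /* pin the task so it cannot be freed underneath us */

        /* ... tsk may be dereferenced here, even after the original lock is dropped ... */

        put_task_struct(tsk);   /* drop the pin; the final put calls __put_task_struct() */
}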
1781 1781
1782 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); 1782 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
1783 extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); 1783 extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
1784 1784
1785 /* 1785 /*
1786 * Per process flags 1786 * Per process flags
1787 */ 1787 */
1788 #define PF_EXITING 0x00000004 /* getting shut down */ 1788 #define PF_EXITING 0x00000004 /* getting shut down */
1789 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1789 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
1790 #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ 1790 #define PF_VCPU 0x00000010 /* I'm a virtual CPU */
1791 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ 1791 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
1792 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ 1792 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
1793 #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ 1793 #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
1794 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ 1794 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
1795 #define PF_DUMPCORE 0x00000200 /* dumped core */ 1795 #define PF_DUMPCORE 0x00000200 /* dumped core */
1796 #define PF_SIGNALED 0x00000400 /* killed by a signal */ 1796 #define PF_SIGNALED 0x00000400 /* killed by a signal */
1797 #define PF_MEMALLOC 0x00000800 /* Allocating memory */ 1797 #define PF_MEMALLOC 0x00000800 /* Allocating memory */
1798 #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ 1798 #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */
1799 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ 1799 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
1800 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ 1800 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
1801 #define PF_FROZEN 0x00010000 /* frozen for system suspend */ 1801 #define PF_FROZEN 0x00010000 /* frozen for system suspend */
1802 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ 1802 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
1803 #define PF_KSWAPD 0x00040000 /* I am kswapd */ 1803 #define PF_KSWAPD 0x00040000 /* I am kswapd */
1804 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ 1804 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
1805 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1805 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
1806 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ 1806 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
1807 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1807 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1808 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1808 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1809 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1809 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1810 #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ 1810 #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
1811 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1811 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1812 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1812 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1813 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1813 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1814 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ 1814 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
1815 1815
1816 /* 1816 /*
1817 * Only the _current_ task can read/write to tsk->flags, but other 1817 * Only the _current_ task can read/write to tsk->flags, but other
1818 * tasks can access tsk->flags in readonly mode for example 1818 * tasks can access tsk->flags in readonly mode for example
1819 * with tsk_used_math (like during threaded core dumping). 1819 * with tsk_used_math (like during threaded core dumping).
1820 * There is however an exception to this rule during ptrace 1820 * There is however an exception to this rule during ptrace
1821 * or during fork: the ptracer task is allowed to write to the 1821 * or during fork: the ptracer task is allowed to write to the
1822 * child->flags of its traced child (same goes for fork, the parent 1822 * child->flags of its traced child (same goes for fork, the parent
1823 * can write to the child->flags), because we're guaranteed the 1823 * can write to the child->flags), because we're guaranteed the
1824 * child is not running and in turn not changing child->flags 1824 * child is not running and in turn not changing child->flags
1825 * at the same time the parent does it. 1825 * at the same time the parent does it.
1826 */ 1826 */
1827 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) 1827 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
1828 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) 1828 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
1829 #define clear_used_math() clear_stopped_child_used_math(current) 1829 #define clear_used_math() clear_stopped_child_used_math(current)
1830 #define set_used_math() set_stopped_child_used_math(current) 1830 #define set_used_math() set_stopped_child_used_math(current)
1831 #define conditional_stopped_child_used_math(condition, child) \ 1831 #define conditional_stopped_child_used_math(condition, child) \
1832 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) 1832 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
1833 #define conditional_used_math(condition) \ 1833 #define conditional_used_math(condition) \
1834 conditional_stopped_child_used_math(condition, current) 1834 conditional_stopped_child_used_math(condition, current)
1835 #define copy_to_stopped_child_used_math(child) \ 1835 #define copy_to_stopped_child_used_math(child) \
1836 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) 1836 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
1837 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ 1837 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
1838 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1838 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1839 #define used_math() tsk_used_math(current) 1839 #define used_math() tsk_used_math(current)
1840 1840
1841 /* 1841 /*
1842 * task->jobctl flags 1842 * task->jobctl flags
1843 */ 1843 */
1844 #define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */ 1844 #define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */
1845 1845
1846 #define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */ 1846 #define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */
1847 #define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ 1847 #define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */
1848 #define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ 1848 #define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */
1849 #define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ 1849 #define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */
1850 #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ 1850 #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */
1851 #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ 1851 #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
1852 #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ 1852 #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
1853 1853
1854 #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) 1854 #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT)
1855 #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) 1855 #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT)
1856 #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) 1856 #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT)
1857 #define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) 1857 #define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT)
1858 #define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) 1858 #define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT)
1859 #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) 1859 #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT)
1860 #define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) 1860 #define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT)
1861 1861
1862 #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) 1862 #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
1863 #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) 1863 #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
1864 1864
1865 extern bool task_set_jobctl_pending(struct task_struct *task, 1865 extern bool task_set_jobctl_pending(struct task_struct *task,
1866 unsigned int mask); 1866 unsigned int mask);
1867 extern void task_clear_jobctl_trapping(struct task_struct *task); 1867 extern void task_clear_jobctl_trapping(struct task_struct *task);
1868 extern void task_clear_jobctl_pending(struct task_struct *task, 1868 extern void task_clear_jobctl_pending(struct task_struct *task,
1869 unsigned int mask); 1869 unsigned int mask);
1870 1870
1871 #ifdef CONFIG_PREEMPT_RCU 1871 #ifdef CONFIG_PREEMPT_RCU
1872 1872
1873 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ 1873 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1874 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ 1874 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
1875 1875
1876 static inline void rcu_copy_process(struct task_struct *p) 1876 static inline void rcu_copy_process(struct task_struct *p)
1877 { 1877 {
1878 p->rcu_read_lock_nesting = 0; 1878 p->rcu_read_lock_nesting = 0;
1879 p->rcu_read_unlock_special = 0; 1879 p->rcu_read_unlock_special = 0;
1880 #ifdef CONFIG_TREE_PREEMPT_RCU 1880 #ifdef CONFIG_TREE_PREEMPT_RCU
1881 p->rcu_blocked_node = NULL; 1881 p->rcu_blocked_node = NULL;
1882 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1882 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1883 #ifdef CONFIG_RCU_BOOST 1883 #ifdef CONFIG_RCU_BOOST
1884 p->rcu_boost_mutex = NULL; 1884 p->rcu_boost_mutex = NULL;
1885 #endif /* #ifdef CONFIG_RCU_BOOST */ 1885 #endif /* #ifdef CONFIG_RCU_BOOST */
1886 INIT_LIST_HEAD(&p->rcu_node_entry); 1886 INIT_LIST_HEAD(&p->rcu_node_entry);
1887 } 1887 }
1888 1888
1889 #else 1889 #else
1890 1890
1891 static inline void rcu_copy_process(struct task_struct *p) 1891 static inline void rcu_copy_process(struct task_struct *p)
1892 { 1892 {
1893 } 1893 }
1894 1894
1895 #endif 1895 #endif
1896 1896
1897 static inline void tsk_restore_flags(struct task_struct *task,
1898 unsigned long orig_flags, unsigned long flags)
1899 {
1900 task->flags &= ~flags;
1901 task->flags |= orig_flags & flags;
1902 }
1903
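The helper added above clears the bits named in flags and then copies back only those bits from orig_flags, so every other task flag is left exactly as it was. As a minimal sketch (the function name is illustrative, not part of this patch), a caller that must temporarily run with PF_MEMALLOC cleared could use it like this:

static void run_without_memalloc(void)
{
        unsigned long old_flags = current->flags;       /* snapshot the caller's flags */

        current->flags &= ~PF_MEMALLOC;                 /* drop only the bit of interest */

        /* ... code that must not see PF_MEMALLOC runs here ... */

        /* put PF_MEMALLOC back to its saved state; all other bits keep their current values */
        tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}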
1897 #ifdef CONFIG_SMP 1904 #ifdef CONFIG_SMP
1898 extern void do_set_cpus_allowed(struct task_struct *p, 1905 extern void do_set_cpus_allowed(struct task_struct *p,
1899 const struct cpumask *new_mask); 1906 const struct cpumask *new_mask);
1900 1907
1901 extern int set_cpus_allowed_ptr(struct task_struct *p, 1908 extern int set_cpus_allowed_ptr(struct task_struct *p,
1902 const struct cpumask *new_mask); 1909 const struct cpumask *new_mask);
1903 #else 1910 #else
1904 static inline void do_set_cpus_allowed(struct task_struct *p, 1911 static inline void do_set_cpus_allowed(struct task_struct *p,
1905 const struct cpumask *new_mask) 1912 const struct cpumask *new_mask)
1906 { 1913 {
1907 } 1914 }
1908 static inline int set_cpus_allowed_ptr(struct task_struct *p, 1915 static inline int set_cpus_allowed_ptr(struct task_struct *p,
1909 const struct cpumask *new_mask) 1916 const struct cpumask *new_mask)
1910 { 1917 {
1911 if (!cpumask_test_cpu(0, new_mask)) 1918 if (!cpumask_test_cpu(0, new_mask))
1912 return -EINVAL; 1919 return -EINVAL;
1913 return 0; 1920 return 0;
1914 } 1921 }
1915 #endif 1922 #endif
1916 1923
1917 #ifdef CONFIG_NO_HZ 1924 #ifdef CONFIG_NO_HZ
1918 void calc_load_enter_idle(void); 1925 void calc_load_enter_idle(void);
1919 void calc_load_exit_idle(void); 1926 void calc_load_exit_idle(void);
1920 #else 1927 #else
1921 static inline void calc_load_enter_idle(void) { } 1928 static inline void calc_load_enter_idle(void) { }
1922 static inline void calc_load_exit_idle(void) { } 1929 static inline void calc_load_exit_idle(void) { }
1923 #endif /* CONFIG_NO_HZ */ 1930 #endif /* CONFIG_NO_HZ */
1924 1931
1925 #ifndef CONFIG_CPUMASK_OFFSTACK 1932 #ifndef CONFIG_CPUMASK_OFFSTACK
1926 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 1933 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1927 { 1934 {
1928 return set_cpus_allowed_ptr(p, &new_mask); 1935 return set_cpus_allowed_ptr(p, &new_mask);
1929 } 1936 }
1930 #endif 1937 #endif
1931 1938
1932 /* 1939 /*
1933 * Do not use outside of architecture code which knows its limitations. 1940 * Do not use outside of architecture code which knows its limitations.
1934 * 1941 *
1935 * sched_clock() has no promise of monotonicity or bounded drift between 1942 * sched_clock() has no promise of monotonicity or bounded drift between
1936 * CPUs; its use (which you should not) requires disabling IRQs. 1943 * CPUs; its use (which you should not) requires disabling IRQs.
1937 * 1944 *
1938 * Please use one of the three interfaces below. 1945 * Please use one of the three interfaces below.
1939 */ 1946 */
1940 extern unsigned long long notrace sched_clock(void); 1947 extern unsigned long long notrace sched_clock(void);
1941 /* 1948 /*
1942 * See the comment in kernel/sched/clock.c 1949 * See the comment in kernel/sched/clock.c
1943 */ 1950 */
1944 extern u64 cpu_clock(int cpu); 1951 extern u64 cpu_clock(int cpu);
1945 extern u64 local_clock(void); 1952 extern u64 local_clock(void);
1946 extern u64 sched_clock_cpu(int cpu); 1953 extern u64 sched_clock_cpu(int cpu);
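
Of the three interfaces above, local_clock() is the usual choice for ad-hoc timing; a hedged sketch (illustrative only), remembering that the value is in nanoseconds and only comparable on the CPU it was read on:

static u64 time_one_section(void)
{
        u64 start = local_clock();      /* nanoseconds, valid on this CPU only */

        /* ... section being timed ... */

        return local_clock() - start;   /* elapsed nanoseconds */
}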
1947 1954
1948 1955
1949 extern void sched_clock_init(void); 1956 extern void sched_clock_init(void);
1950 1957
1951 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 1958 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
1952 static inline void sched_clock_tick(void) 1959 static inline void sched_clock_tick(void)
1953 { 1960 {
1954 } 1961 }
1955 1962
1956 static inline void sched_clock_idle_sleep_event(void) 1963 static inline void sched_clock_idle_sleep_event(void)
1957 { 1964 {
1958 } 1965 }
1959 1966
1960 static inline void sched_clock_idle_wakeup_event(u64 delta_ns) 1967 static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
1961 { 1968 {
1962 } 1969 }
1963 #else 1970 #else
1964 /* 1971 /*
1965 * Architectures can set this to 1 if they have specified 1972 * Architectures can set this to 1 if they have specified
1966 * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, 1973 * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
1967 * but then during bootup it turns out that sched_clock() 1974 * but then during bootup it turns out that sched_clock()
1968 * is reliable after all: 1975 * is reliable after all:
1969 */ 1976 */
1970 extern int sched_clock_stable; 1977 extern int sched_clock_stable;
1971 1978
1972 extern void sched_clock_tick(void); 1979 extern void sched_clock_tick(void);
1973 extern void sched_clock_idle_sleep_event(void); 1980 extern void sched_clock_idle_sleep_event(void);
1974 extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1981 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1975 #endif 1982 #endif
1976 1983
1977 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 1984 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1978 /* 1985 /*
1979 * An i/f to runtime opt-in for irq time accounting based off of sched_clock. 1986 * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
1980 * The reason for this explicit opt-in is not to have perf penalty with 1987 * The reason for this explicit opt-in is not to have perf penalty with
1981 * slow sched_clocks. 1988 * slow sched_clocks.
1982 */ 1989 */
1983 extern void enable_sched_clock_irqtime(void); 1990 extern void enable_sched_clock_irqtime(void);
1984 extern void disable_sched_clock_irqtime(void); 1991 extern void disable_sched_clock_irqtime(void);
1985 #else 1992 #else
1986 static inline void enable_sched_clock_irqtime(void) {} 1993 static inline void enable_sched_clock_irqtime(void) {}
1987 static inline void disable_sched_clock_irqtime(void) {} 1994 static inline void disable_sched_clock_irqtime(void) {}
1988 #endif 1995 #endif
1989 1996
1990 extern unsigned long long 1997 extern unsigned long long
1991 task_sched_runtime(struct task_struct *task); 1998 task_sched_runtime(struct task_struct *task);
1992 1999
1993 /* sched_exec is called by processes performing an exec */ 2000 /* sched_exec is called by processes performing an exec */
1994 #ifdef CONFIG_SMP 2001 #ifdef CONFIG_SMP
1995 extern void sched_exec(void); 2002 extern void sched_exec(void);
1996 #else 2003 #else
1997 #define sched_exec() {} 2004 #define sched_exec() {}
1998 #endif 2005 #endif
1999 2006
2000 extern void sched_clock_idle_sleep_event(void); 2007 extern void sched_clock_idle_sleep_event(void);
2001 extern void sched_clock_idle_wakeup_event(u64 delta_ns); 2008 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
2002 2009
2003 #ifdef CONFIG_HOTPLUG_CPU 2010 #ifdef CONFIG_HOTPLUG_CPU
2004 extern void idle_task_exit(void); 2011 extern void idle_task_exit(void);
2005 #else 2012 #else
2006 static inline void idle_task_exit(void) {} 2013 static inline void idle_task_exit(void) {}
2007 #endif 2014 #endif
2008 2015
2009 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 2016 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
2010 extern void wake_up_idle_cpu(int cpu); 2017 extern void wake_up_idle_cpu(int cpu);
2011 #else 2018 #else
2012 static inline void wake_up_idle_cpu(int cpu) { } 2019 static inline void wake_up_idle_cpu(int cpu) { }
2013 #endif 2020 #endif
2014 2021
2015 extern unsigned int sysctl_sched_latency; 2022 extern unsigned int sysctl_sched_latency;
2016 extern unsigned int sysctl_sched_min_granularity; 2023 extern unsigned int sysctl_sched_min_granularity;
2017 extern unsigned int sysctl_sched_wakeup_granularity; 2024 extern unsigned int sysctl_sched_wakeup_granularity;
2018 extern unsigned int sysctl_sched_child_runs_first; 2025 extern unsigned int sysctl_sched_child_runs_first;
2019 2026
2020 enum sched_tunable_scaling { 2027 enum sched_tunable_scaling {
2021 SCHED_TUNABLESCALING_NONE, 2028 SCHED_TUNABLESCALING_NONE,
2022 SCHED_TUNABLESCALING_LOG, 2029 SCHED_TUNABLESCALING_LOG,
2023 SCHED_TUNABLESCALING_LINEAR, 2030 SCHED_TUNABLESCALING_LINEAR,
2024 SCHED_TUNABLESCALING_END, 2031 SCHED_TUNABLESCALING_END,
2025 }; 2032 };
2026 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; 2033 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
2027 2034
2028 #ifdef CONFIG_SCHED_DEBUG 2035 #ifdef CONFIG_SCHED_DEBUG
2029 extern unsigned int sysctl_sched_migration_cost; 2036 extern unsigned int sysctl_sched_migration_cost;
2030 extern unsigned int sysctl_sched_nr_migrate; 2037 extern unsigned int sysctl_sched_nr_migrate;
2031 extern unsigned int sysctl_sched_time_avg; 2038 extern unsigned int sysctl_sched_time_avg;
2032 extern unsigned int sysctl_timer_migration; 2039 extern unsigned int sysctl_timer_migration;
2033 extern unsigned int sysctl_sched_shares_window; 2040 extern unsigned int sysctl_sched_shares_window;
2034 2041
2035 int sched_proc_update_handler(struct ctl_table *table, int write, 2042 int sched_proc_update_handler(struct ctl_table *table, int write,
2036 void __user *buffer, size_t *length, 2043 void __user *buffer, size_t *length,
2037 loff_t *ppos); 2044 loff_t *ppos);
2038 #endif 2045 #endif
2039 #ifdef CONFIG_SCHED_DEBUG 2046 #ifdef CONFIG_SCHED_DEBUG
2040 static inline unsigned int get_sysctl_timer_migration(void) 2047 static inline unsigned int get_sysctl_timer_migration(void)
2041 { 2048 {
2042 return sysctl_timer_migration; 2049 return sysctl_timer_migration;
2043 } 2050 }
2044 #else 2051 #else
2045 static inline unsigned int get_sysctl_timer_migration(void) 2052 static inline unsigned int get_sysctl_timer_migration(void)
2046 { 2053 {
2047 return 1; 2054 return 1;
2048 } 2055 }
2049 #endif 2056 #endif
2050 extern unsigned int sysctl_sched_rt_period; 2057 extern unsigned int sysctl_sched_rt_period;
2051 extern int sysctl_sched_rt_runtime; 2058 extern int sysctl_sched_rt_runtime;
2052 2059
2053 int sched_rt_handler(struct ctl_table *table, int write, 2060 int sched_rt_handler(struct ctl_table *table, int write,
2054 void __user *buffer, size_t *lenp, 2061 void __user *buffer, size_t *lenp,
2055 loff_t *ppos); 2062 loff_t *ppos);
2056 2063
2057 #ifdef CONFIG_SCHED_AUTOGROUP 2064 #ifdef CONFIG_SCHED_AUTOGROUP
2058 extern unsigned int sysctl_sched_autogroup_enabled; 2065 extern unsigned int sysctl_sched_autogroup_enabled;
2059 2066
2060 extern void sched_autogroup_create_attach(struct task_struct *p); 2067 extern void sched_autogroup_create_attach(struct task_struct *p);
2061 extern void sched_autogroup_detach(struct task_struct *p); 2068 extern void sched_autogroup_detach(struct task_struct *p);
2062 extern void sched_autogroup_fork(struct signal_struct *sig); 2069 extern void sched_autogroup_fork(struct signal_struct *sig);
2063 extern void sched_autogroup_exit(struct signal_struct *sig); 2070 extern void sched_autogroup_exit(struct signal_struct *sig);
2064 #ifdef CONFIG_PROC_FS 2071 #ifdef CONFIG_PROC_FS
2065 extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); 2072 extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
2066 extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); 2073 extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
2067 #endif 2074 #endif
2068 #else 2075 #else
2069 static inline void sched_autogroup_create_attach(struct task_struct *p) { } 2076 static inline void sched_autogroup_create_attach(struct task_struct *p) { }
2070 static inline void sched_autogroup_detach(struct task_struct *p) { } 2077 static inline void sched_autogroup_detach(struct task_struct *p) { }
2071 static inline void sched_autogroup_fork(struct signal_struct *sig) { } 2078 static inline void sched_autogroup_fork(struct signal_struct *sig) { }
2072 static inline void sched_autogroup_exit(struct signal_struct *sig) { } 2079 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
2073 #endif 2080 #endif
2074 2081
2075 #ifdef CONFIG_CFS_BANDWIDTH 2082 #ifdef CONFIG_CFS_BANDWIDTH
2076 extern unsigned int sysctl_sched_cfs_bandwidth_slice; 2083 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
2077 #endif 2084 #endif
2078 2085
2079 #ifdef CONFIG_RT_MUTEXES 2086 #ifdef CONFIG_RT_MUTEXES
2080 extern int rt_mutex_getprio(struct task_struct *p); 2087 extern int rt_mutex_getprio(struct task_struct *p);
2081 extern void rt_mutex_setprio(struct task_struct *p, int prio); 2088 extern void rt_mutex_setprio(struct task_struct *p, int prio);
2082 extern void rt_mutex_adjust_pi(struct task_struct *p); 2089 extern void rt_mutex_adjust_pi(struct task_struct *p);
2083 static inline bool tsk_is_pi_blocked(struct task_struct *tsk) 2090 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
2084 { 2091 {
2085 return tsk->pi_blocked_on != NULL; 2092 return tsk->pi_blocked_on != NULL;
2086 } 2093 }
2087 #else 2094 #else
2088 static inline int rt_mutex_getprio(struct task_struct *p) 2095 static inline int rt_mutex_getprio(struct task_struct *p)
2089 { 2096 {
2090 return p->normal_prio; 2097 return p->normal_prio;
2091 } 2098 }
2092 # define rt_mutex_adjust_pi(p) do { } while (0) 2099 # define rt_mutex_adjust_pi(p) do { } while (0)
2093 static inline bool tsk_is_pi_blocked(struct task_struct *tsk) 2100 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
2094 { 2101 {
2095 return false; 2102 return false;
2096 } 2103 }
2097 #endif 2104 #endif
2098 2105
2099 extern bool yield_to(struct task_struct *p, bool preempt); 2106 extern bool yield_to(struct task_struct *p, bool preempt);
2100 extern void set_user_nice(struct task_struct *p, long nice); 2107 extern void set_user_nice(struct task_struct *p, long nice);
2101 extern int task_prio(const struct task_struct *p); 2108 extern int task_prio(const struct task_struct *p);
2102 extern int task_nice(const struct task_struct *p); 2109 extern int task_nice(const struct task_struct *p);
2103 extern int can_nice(const struct task_struct *p, const int nice); 2110 extern int can_nice(const struct task_struct *p, const int nice);
2104 extern int task_curr(const struct task_struct *p); 2111 extern int task_curr(const struct task_struct *p);
2105 extern int idle_cpu(int cpu); 2112 extern int idle_cpu(int cpu);
2106 extern int sched_setscheduler(struct task_struct *, int, 2113 extern int sched_setscheduler(struct task_struct *, int,
2107 const struct sched_param *); 2114 const struct sched_param *);
2108 extern int sched_setscheduler_nocheck(struct task_struct *, int, 2115 extern int sched_setscheduler_nocheck(struct task_struct *, int,
2109 const struct sched_param *); 2116 const struct sched_param *);
2110 extern struct task_struct *idle_task(int cpu); 2117 extern struct task_struct *idle_task(int cpu);
2111 /** 2118 /**
2112 * is_idle_task - is the specified task an idle task? 2119 * is_idle_task - is the specified task an idle task?
2113 * @p: the task in question. 2120 * @p: the task in question.
2114 */ 2121 */
2115 static inline bool is_idle_task(const struct task_struct *p) 2122 static inline bool is_idle_task(const struct task_struct *p)
2116 { 2123 {
2117 return p->pid == 0; 2124 return p->pid == 0;
2118 } 2125 }
2119 extern struct task_struct *curr_task(int cpu); 2126 extern struct task_struct *curr_task(int cpu);
2120 extern void set_curr_task(int cpu, struct task_struct *p); 2127 extern void set_curr_task(int cpu, struct task_struct *p);
2121 2128
2122 void yield(void); 2129 void yield(void);
2123 2130
2124 /* 2131 /*
2125 * The default (Linux) execution domain. 2132 * The default (Linux) execution domain.
2126 */ 2133 */
2127 extern struct exec_domain default_exec_domain; 2134 extern struct exec_domain default_exec_domain;
2128 2135
2129 union thread_union { 2136 union thread_union {
2130 struct thread_info thread_info; 2137 struct thread_info thread_info;
2131 unsigned long stack[THREAD_SIZE/sizeof(long)]; 2138 unsigned long stack[THREAD_SIZE/sizeof(long)];
2132 }; 2139 };
2133 2140
2134 #ifndef __HAVE_ARCH_KSTACK_END 2141 #ifndef __HAVE_ARCH_KSTACK_END
2135 static inline int kstack_end(void *addr) 2142 static inline int kstack_end(void *addr)
2136 { 2143 {
2137 /* Reliable end of stack detection: 2144 /* Reliable end of stack detection:
2138 * Some APM bios versions misalign the stack 2145 * Some APM bios versions misalign the stack
2139 */ 2146 */
2140 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); 2147 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
2141 } 2148 }
2142 #endif 2149 #endif
2143 2150
2144 extern union thread_union init_thread_union; 2151 extern union thread_union init_thread_union;
2145 extern struct task_struct init_task; 2152 extern struct task_struct init_task;
2146 2153
2147 extern struct mm_struct init_mm; 2154 extern struct mm_struct init_mm;
2148 2155
2149 extern struct pid_namespace init_pid_ns; 2156 extern struct pid_namespace init_pid_ns;
2150 2157
2151 /* 2158 /*
2152 * find a task by one of its numerical ids 2159 * find a task by one of its numerical ids
2153 * 2160 *
2154 * find_task_by_pid_ns(): 2161 * find_task_by_pid_ns():
2155 * finds a task by its pid in the specified namespace 2162 * finds a task by its pid in the specified namespace
2156 * find_task_by_vpid(): 2163 * find_task_by_vpid():
2157 * finds a task by its virtual pid 2164 * finds a task by its virtual pid
2158 * 2165 *
2159 * see also find_vpid() etc in include/linux/pid.h 2166 * see also find_vpid() etc in include/linux/pid.h
2160 */ 2167 */
2161 2168
2162 extern struct task_struct *find_task_by_vpid(pid_t nr); 2169 extern struct task_struct *find_task_by_vpid(pid_t nr);
2163 extern struct task_struct *find_task_by_pid_ns(pid_t nr, 2170 extern struct task_struct *find_task_by_pid_ns(pid_t nr,
2164 struct pid_namespace *ns); 2171 struct pid_namespace *ns);
2165 2172
2166 extern void __set_special_pids(struct pid *pid); 2173 extern void __set_special_pids(struct pid *pid);
2167 2174
2168 /* per-UID process charging. */ 2175 /* per-UID process charging. */
2169 extern struct user_struct * alloc_uid(kuid_t); 2176 extern struct user_struct * alloc_uid(kuid_t);
2170 static inline struct user_struct *get_uid(struct user_struct *u) 2177 static inline struct user_struct *get_uid(struct user_struct *u)
2171 { 2178 {
2172 atomic_inc(&u->__count); 2179 atomic_inc(&u->__count);
2173 return u; 2180 return u;
2174 } 2181 }
2175 extern void free_uid(struct user_struct *); 2182 extern void free_uid(struct user_struct *);
2176 2183
2177 #include <asm/current.h> 2184 #include <asm/current.h>
2178 2185
2179 extern void xtime_update(unsigned long ticks); 2186 extern void xtime_update(unsigned long ticks);
2180 2187
2181 extern int wake_up_state(struct task_struct *tsk, unsigned int state); 2188 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
2182 extern int wake_up_process(struct task_struct *tsk); 2189 extern int wake_up_process(struct task_struct *tsk);
2183 extern void wake_up_new_task(struct task_struct *tsk); 2190 extern void wake_up_new_task(struct task_struct *tsk);
2184 #ifdef CONFIG_SMP 2191 #ifdef CONFIG_SMP
2185 extern void kick_process(struct task_struct *tsk); 2192 extern void kick_process(struct task_struct *tsk);
2186 #else 2193 #else
2187 static inline void kick_process(struct task_struct *tsk) { } 2194 static inline void kick_process(struct task_struct *tsk) { }
2188 #endif 2195 #endif
2189 extern void sched_fork(struct task_struct *p); 2196 extern void sched_fork(struct task_struct *p);
2190 extern void sched_dead(struct task_struct *p); 2197 extern void sched_dead(struct task_struct *p);
2191 2198
2192 extern void proc_caches_init(void); 2199 extern void proc_caches_init(void);
2193 extern void flush_signals(struct task_struct *); 2200 extern void flush_signals(struct task_struct *);
2194 extern void __flush_signals(struct task_struct *); 2201 extern void __flush_signals(struct task_struct *);
2195 extern void ignore_signals(struct task_struct *); 2202 extern void ignore_signals(struct task_struct *);
2196 extern void flush_signal_handlers(struct task_struct *, int force_default); 2203 extern void flush_signal_handlers(struct task_struct *, int force_default);
2197 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); 2204 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
2198 2205
2199 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 2206 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
2200 { 2207 {
2201 unsigned long flags; 2208 unsigned long flags;
2202 int ret; 2209 int ret;
2203 2210
2204 spin_lock_irqsave(&tsk->sighand->siglock, flags); 2211 spin_lock_irqsave(&tsk->sighand->siglock, flags);
2205 ret = dequeue_signal(tsk, mask, info); 2212 ret = dequeue_signal(tsk, mask, info);
2206 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 2213 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
2207 2214
2208 return ret; 2215 return ret;
2209 } 2216 }
2210 2217
2211 extern void block_all_signals(int (*notifier)(void *priv), void *priv, 2218 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
2212 sigset_t *mask); 2219 sigset_t *mask);
2213 extern void unblock_all_signals(void); 2220 extern void unblock_all_signals(void);
2214 extern void release_task(struct task_struct * p); 2221 extern void release_task(struct task_struct * p);
2215 extern int send_sig_info(int, struct siginfo *, struct task_struct *); 2222 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
2216 extern int force_sigsegv(int, struct task_struct *); 2223 extern int force_sigsegv(int, struct task_struct *);
2217 extern int force_sig_info(int, struct siginfo *, struct task_struct *); 2224 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
2218 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); 2225 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
2219 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); 2226 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
2220 extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, 2227 extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
2221 const struct cred *, u32); 2228 const struct cred *, u32);
2222 extern int kill_pgrp(struct pid *pid, int sig, int priv); 2229 extern int kill_pgrp(struct pid *pid, int sig, int priv);
2223 extern int kill_pid(struct pid *pid, int sig, int priv); 2230 extern int kill_pid(struct pid *pid, int sig, int priv);
2224 extern int kill_proc_info(int, struct siginfo *, pid_t); 2231 extern int kill_proc_info(int, struct siginfo *, pid_t);
2225 extern __must_check bool do_notify_parent(struct task_struct *, int); 2232 extern __must_check bool do_notify_parent(struct task_struct *, int);
2226 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); 2233 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
2227 extern void force_sig(int, struct task_struct *); 2234 extern void force_sig(int, struct task_struct *);
2228 extern int send_sig(int, struct task_struct *, int); 2235 extern int send_sig(int, struct task_struct *, int);
2229 extern int zap_other_threads(struct task_struct *p); 2236 extern int zap_other_threads(struct task_struct *p);
2230 extern struct sigqueue *sigqueue_alloc(void); 2237 extern struct sigqueue *sigqueue_alloc(void);
2231 extern void sigqueue_free(struct sigqueue *); 2238 extern void sigqueue_free(struct sigqueue *);
2232 extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); 2239 extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group);
2233 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); 2240 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
2234 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); 2241 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
2235 2242
2236 static inline void restore_saved_sigmask(void) 2243 static inline void restore_saved_sigmask(void)
2237 { 2244 {
2238 if (test_and_clear_restore_sigmask()) 2245 if (test_and_clear_restore_sigmask())
2239 __set_current_blocked(&current->saved_sigmask); 2246 __set_current_blocked(&current->saved_sigmask);
2240 } 2247 }
2241 2248
2242 static inline sigset_t *sigmask_to_save(void) 2249 static inline sigset_t *sigmask_to_save(void)
2243 { 2250 {
2244 sigset_t *res = &current->blocked; 2251 sigset_t *res = &current->blocked;
2245 if (unlikely(test_restore_sigmask())) 2252 if (unlikely(test_restore_sigmask()))
2246 res = &current->saved_sigmask; 2253 res = &current->saved_sigmask;
2247 return res; 2254 return res;
2248 } 2255 }
2249 2256
2250 static inline int kill_cad_pid(int sig, int priv) 2257 static inline int kill_cad_pid(int sig, int priv)
2251 { 2258 {
2252 return kill_pid(cad_pid, sig, priv); 2259 return kill_pid(cad_pid, sig, priv);
2253 } 2260 }
2254 2261
2255 /* These can be the second arg to send_sig_info/send_group_sig_info. */ 2262 /* These can be the second arg to send_sig_info/send_group_sig_info. */
2256 #define SEND_SIG_NOINFO ((struct siginfo *) 0) 2263 #define SEND_SIG_NOINFO ((struct siginfo *) 0)
2257 #define SEND_SIG_PRIV ((struct siginfo *) 1) 2264 #define SEND_SIG_PRIV ((struct siginfo *) 1)
2258 #define SEND_SIG_FORCED ((struct siginfo *) 2) 2265 #define SEND_SIG_FORCED ((struct siginfo *) 2)
2259 2266
2260 /* 2267 /*
2261 * True if we are on the alternate signal stack. 2268 * True if we are on the alternate signal stack.
2262 */ 2269 */
2263 static inline int on_sig_stack(unsigned long sp) 2270 static inline int on_sig_stack(unsigned long sp)
2264 { 2271 {
2265 #ifdef CONFIG_STACK_GROWSUP 2272 #ifdef CONFIG_STACK_GROWSUP
2266 return sp >= current->sas_ss_sp && 2273 return sp >= current->sas_ss_sp &&
2267 sp - current->sas_ss_sp < current->sas_ss_size; 2274 sp - current->sas_ss_sp < current->sas_ss_size;
2268 #else 2275 #else
2269 return sp > current->sas_ss_sp && 2276 return sp > current->sas_ss_sp &&
2270 sp - current->sas_ss_sp <= current->sas_ss_size; 2277 sp - current->sas_ss_sp <= current->sas_ss_size;
2271 #endif 2278 #endif
2272 } 2279 }
2273 2280
2274 static inline int sas_ss_flags(unsigned long sp) 2281 static inline int sas_ss_flags(unsigned long sp)
2275 { 2282 {
2276 return (current->sas_ss_size == 0 ? SS_DISABLE 2283 return (current->sas_ss_size == 0 ? SS_DISABLE
2277 : on_sig_stack(sp) ? SS_ONSTACK : 0); 2284 : on_sig_stack(sp) ? SS_ONSTACK : 0);
2278 } 2285 }
2279 2286
2280 /* 2287 /*
2281 * Routines for handling mm_structs 2288 * Routines for handling mm_structs
2282 */ 2289 */
2283 extern struct mm_struct * mm_alloc(void); 2290 extern struct mm_struct * mm_alloc(void);
2284 2291
2285 /* mmdrop drops the mm and the page tables */ 2292 /* mmdrop drops the mm and the page tables */
2286 extern void __mmdrop(struct mm_struct *); 2293 extern void __mmdrop(struct mm_struct *);
2287 static inline void mmdrop(struct mm_struct * mm) 2294 static inline void mmdrop(struct mm_struct * mm)
2288 { 2295 {
2289 if (unlikely(atomic_dec_and_test(&mm->mm_count))) 2296 if (unlikely(atomic_dec_and_test(&mm->mm_count)))
2290 __mmdrop(mm); 2297 __mmdrop(mm);
2291 } 2298 }
2292 2299
2293 /* mmput gets rid of the mappings and all user-space */ 2300 /* mmput gets rid of the mappings and all user-space */
2294 extern void mmput(struct mm_struct *); 2301 extern void mmput(struct mm_struct *);
2295 /* Grab a reference to a task's mm, if it is not already going away */ 2302 /* Grab a reference to a task's mm, if it is not already going away */
2296 extern struct mm_struct *get_task_mm(struct task_struct *task); 2303 extern struct mm_struct *get_task_mm(struct task_struct *task);
2297 /* 2304 /*
2298 * Grab a reference to a task's mm, if it is not already going away 2305 * Grab a reference to a task's mm, if it is not already going away
2299 * and ptrace_may_access with the mode parameter passed to it 2306 * and ptrace_may_access with the mode parameter passed to it
2300 * succeeds. 2307 * succeeds.
2301 */ 2308 */
2302 extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); 2309 extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
2303 /* Remove the current task's stale references to the old mm_struct */ 2310 /* Remove the current task's stale references to the old mm_struct */
2304 extern void mm_release(struct task_struct *, struct mm_struct *); 2311 extern void mm_release(struct task_struct *, struct mm_struct *);
2305 /* Allocate a new mm structure and copy contents from tsk->mm */ 2312 /* Allocate a new mm structure and copy contents from tsk->mm */
2306 extern struct mm_struct *dup_mm(struct task_struct *tsk); 2313 extern struct mm_struct *dup_mm(struct task_struct *tsk);
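
A minimal sketch of how get_task_mm() pairs with mmput() (function name illustrative), assuming the caller only needs the mm to stay around while it looks at it:

static void inspect_mm(struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);       /* NULL if the mm is already being torn down */

        if (!mm)
                return;

        /* ... read mm fields here under mm's own locking rules ... */

        mmput(mm);      /* drop the reference taken by get_task_mm() */
}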
2307 2314
2308 extern int copy_thread(unsigned long, unsigned long, unsigned long, 2315 extern int copy_thread(unsigned long, unsigned long, unsigned long,
2309 struct task_struct *, struct pt_regs *); 2316 struct task_struct *, struct pt_regs *);
2310 extern void flush_thread(void); 2317 extern void flush_thread(void);
2311 extern void exit_thread(void); 2318 extern void exit_thread(void);
2312 2319
2313 extern void exit_files(struct task_struct *); 2320 extern void exit_files(struct task_struct *);
2314 extern void __cleanup_sighand(struct sighand_struct *); 2321 extern void __cleanup_sighand(struct sighand_struct *);
2315 2322
2316 extern void exit_itimers(struct signal_struct *); 2323 extern void exit_itimers(struct signal_struct *);
2317 extern void flush_itimer_signals(void); 2324 extern void flush_itimer_signals(void);
2318 2325
2319 extern void do_group_exit(int); 2326 extern void do_group_exit(int);
2320 2327
2321 extern void daemonize(const char *, ...); 2328 extern void daemonize(const char *, ...);
2322 extern int allow_signal(int); 2329 extern int allow_signal(int);
2323 extern int disallow_signal(int); 2330 extern int disallow_signal(int);
2324 2331
2325 extern int do_execve(const char *, 2332 extern int do_execve(const char *,
2326 const char __user * const __user *, 2333 const char __user * const __user *,
2327 const char __user * const __user *, struct pt_regs *); 2334 const char __user * const __user *, struct pt_regs *);
2328 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); 2335 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
2329 struct task_struct *fork_idle(int); 2336 struct task_struct *fork_idle(int);
2330 2337
2331 extern void set_task_comm(struct task_struct *tsk, char *from); 2338 extern void set_task_comm(struct task_struct *tsk, char *from);
2332 extern char *get_task_comm(char *to, struct task_struct *tsk); 2339 extern char *get_task_comm(char *to, struct task_struct *tsk);
2333 2340
2334 #ifdef CONFIG_SMP 2341 #ifdef CONFIG_SMP
2335 void scheduler_ipi(void); 2342 void scheduler_ipi(void);
2336 extern unsigned long wait_task_inactive(struct task_struct *, long match_state); 2343 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
2337 #else 2344 #else
2338 static inline void scheduler_ipi(void) { } 2345 static inline void scheduler_ipi(void) { }
2339 static inline unsigned long wait_task_inactive(struct task_struct *p, 2346 static inline unsigned long wait_task_inactive(struct task_struct *p,
2340 long match_state) 2347 long match_state)
2341 { 2348 {
2342 return 1; 2349 return 1;
2343 } 2350 }
2344 #endif 2351 #endif
2345 2352
2346 #define next_task(p) \ 2353 #define next_task(p) \
2347 list_entry_rcu((p)->tasks.next, struct task_struct, tasks) 2354 list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
2348 2355
2349 #define for_each_process(p) \ 2356 #define for_each_process(p) \
2350 for (p = &init_task ; (p = next_task(p)) != &init_task ; ) 2357 for (p = &init_task ; (p = next_task(p)) != &init_task ; )
2351 2358
2352 extern bool current_is_single_threaded(void); 2359 extern bool current_is_single_threaded(void);
2353 2360
2354 /* 2361 /*
2355 * Careful: do_each_thread/while_each_thread is a double loop so 2362 * Careful: do_each_thread/while_each_thread is a double loop so
2356 * 'break' will not work as expected - use goto instead. 2363 * 'break' will not work as expected - use goto instead.
2357 */ 2364 */
2358 #define do_each_thread(g, t) \ 2365 #define do_each_thread(g, t) \
2359 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do 2366 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
2360 2367
2361 #define while_each_thread(g, t) \ 2368 #define while_each_thread(g, t) \
2362 while ((t = next_thread(t)) != g) 2369 while ((t = next_thread(t)) != g)
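
A hedged sketch of walking every thread with these macros, using goto rather than break as the comment above warns, and assuming the caller takes tasklist_lock for the walk (names are illustrative):

static struct task_struct *find_any_kthread(void)
{
        struct task_struct *g, *t, *found = NULL;

        read_lock(&tasklist_lock);
        do_each_thread(g, t) {
                if (t->flags & PF_KTHREAD) {
                        found = t;
                        goto out;       /* 'break' would only leave the inner loop */
                }
        } while_each_thread(g, t);
out:
        read_unlock(&tasklist_lock);
        return found;   /* illustrative: the task is not pinned here */
}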
2363 2370
2364 static inline int get_nr_threads(struct task_struct *tsk) 2371 static inline int get_nr_threads(struct task_struct *tsk)
2365 { 2372 {
2366 return tsk->signal->nr_threads; 2373 return tsk->signal->nr_threads;
2367 } 2374 }
2368 2375
2369 static inline bool thread_group_leader(struct task_struct *p) 2376 static inline bool thread_group_leader(struct task_struct *p)
2370 { 2377 {
2371 return p->exit_signal >= 0; 2378 return p->exit_signal >= 0;
2372 } 2379 }
2373 2380
2374 /* Due to the insanities of de_thread it is possible for a process 2381 /* Due to the insanities of de_thread it is possible for a process
2375 * to have the pid of the thread group leader without actually being 2382 * to have the pid of the thread group leader without actually being
2376 * the thread group leader. For iteration through the pids in proc 2383 * the thread group leader. For iteration through the pids in proc
2377 * all we care about is that we have a task with the appropriate 2384 * all we care about is that we have a task with the appropriate
2378 * pid, we don't actually care if we have the right task. 2385 * pid, we don't actually care if we have the right task.
2379 */ 2386 */
2380 static inline int has_group_leader_pid(struct task_struct *p) 2387 static inline int has_group_leader_pid(struct task_struct *p)
2381 { 2388 {
2382 return p->pid == p->tgid; 2389 return p->pid == p->tgid;
2383 } 2390 }
2384 2391
2385 static inline 2392 static inline
2386 int same_thread_group(struct task_struct *p1, struct task_struct *p2) 2393 int same_thread_group(struct task_struct *p1, struct task_struct *p2)
2387 { 2394 {
2388 return p1->tgid == p2->tgid; 2395 return p1->tgid == p2->tgid;
2389 } 2396 }
2390 2397
2391 static inline struct task_struct *next_thread(const struct task_struct *p) 2398 static inline struct task_struct *next_thread(const struct task_struct *p)
2392 { 2399 {
2393 return list_entry_rcu(p->thread_group.next, 2400 return list_entry_rcu(p->thread_group.next,
2394 struct task_struct, thread_group); 2401 struct task_struct, thread_group);
2395 } 2402 }
2396 2403
2397 static inline int thread_group_empty(struct task_struct *p) 2404 static inline int thread_group_empty(struct task_struct *p)
2398 { 2405 {
2399 return list_empty(&p->thread_group); 2406 return list_empty(&p->thread_group);
2400 } 2407 }
2401 2408
2402 #define delay_group_leader(p) \ 2409 #define delay_group_leader(p) \
2403 (thread_group_leader(p) && !thread_group_empty(p)) 2410 (thread_group_leader(p) && !thread_group_empty(p))
2404 2411
2405 /* 2412 /*
2406 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring 2413 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
2407 * subscriptions and synchronises with wait4(). Also used in procfs. Also 2414 * subscriptions and synchronises with wait4(). Also used in procfs. Also
2408 * pins the final release of task.io_context. Also protects ->cpuset and 2415 * pins the final release of task.io_context. Also protects ->cpuset and
2409 * ->cgroup.subsys[]. And ->vfork_done. 2416 * ->cgroup.subsys[]. And ->vfork_done.
2410 * 2417 *
2411 * Nests both inside and outside of read_lock(&tasklist_lock). 2418 * Nests both inside and outside of read_lock(&tasklist_lock).
2412 * It must not be nested with write_lock_irq(&tasklist_lock), 2419 * It must not be nested with write_lock_irq(&tasklist_lock),
2413 * neither inside nor outside. 2420 * neither inside nor outside.
2414 */ 2421 */
2415 static inline void task_lock(struct task_struct *p) 2422 static inline void task_lock(struct task_struct *p)
2416 { 2423 {
2417 spin_lock(&p->alloc_lock); 2424 spin_lock(&p->alloc_lock);
2418 } 2425 }
2419 2426
2420 static inline void task_unlock(struct task_struct *p) 2427 static inline void task_unlock(struct task_struct *p)
2421 { 2428 {
2422 spin_unlock(&p->alloc_lock); 2429 spin_unlock(&p->alloc_lock);
2423 } 2430 }
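
A minimal sketch of bracketing one of the fields listed in the comment above with task_lock()/task_unlock() (function name illustrative; buf is assumed to hold TASK_COMM_LEN bytes); get_task_comm(), declared further down, wraps essentially this pattern:

static void copy_task_comm(struct task_struct *p, char *buf)
{
        task_lock(p);           /* p->comm cannot change while the lock is held */
        strncpy(buf, p->comm, TASK_COMM_LEN);
        buf[TASK_COMM_LEN - 1] = '\0';
        task_unlock(p);
}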
2424 2431
2425 extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, 2432 extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
2426 unsigned long *flags); 2433 unsigned long *flags);
2427 2434
2428 static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, 2435 static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
2429 unsigned long *flags) 2436 unsigned long *flags)
2430 { 2437 {
2431 struct sighand_struct *ret; 2438 struct sighand_struct *ret;
2432 2439
2433 ret = __lock_task_sighand(tsk, flags); 2440 ret = __lock_task_sighand(tsk, flags);
2434 (void)__cond_lock(&tsk->sighand->siglock, ret); 2441 (void)__cond_lock(&tsk->sighand->siglock, ret);
2435 return ret; 2442 return ret;
2436 } 2443 }
2437 2444
2438 static inline void unlock_task_sighand(struct task_struct *tsk, 2445 static inline void unlock_task_sighand(struct task_struct *tsk,
2439 unsigned long *flags) 2446 unsigned long *flags)
2440 { 2447 {
2441 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); 2448 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
2442 } 2449 }
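
A hedged sketch of the conditional-lock pattern these helpers implement (function name illustrative); lock_task_sighand() returns NULL once the task has lost its sighand, i.e. it is already exiting:

static void with_siglock(struct task_struct *tsk)
{
        unsigned long flags;

        if (!lock_task_sighand(tsk, &flags))
                return;         /* task is past exit; there is nothing to lock */

        /* ... tsk->sighand->siglock is held here with interrupts disabled ... */

        unlock_task_sighand(tsk, &flags);
}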
2443 2450
2444 #ifdef CONFIG_CGROUPS 2451 #ifdef CONFIG_CGROUPS
2445 static inline void threadgroup_change_begin(struct task_struct *tsk) 2452 static inline void threadgroup_change_begin(struct task_struct *tsk)
2446 { 2453 {
2447 down_read(&tsk->signal->group_rwsem); 2454 down_read(&tsk->signal->group_rwsem);
2448 } 2455 }
2449 static inline void threadgroup_change_end(struct task_struct *tsk) 2456 static inline void threadgroup_change_end(struct task_struct *tsk)
2450 { 2457 {
2451 up_read(&tsk->signal->group_rwsem); 2458 up_read(&tsk->signal->group_rwsem);
2452 } 2459 }
2453 2460
2454 /** 2461 /**
2455 * threadgroup_lock - lock threadgroup 2462 * threadgroup_lock - lock threadgroup
2456 * @tsk: member task of the threadgroup to lock 2463 * @tsk: member task of the threadgroup to lock
2457 * 2464 *
2458 * Lock the threadgroup @tsk belongs to. No new task is allowed to enter 2465 * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
2459 * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or 2466 * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
2460 * perform exec. This is useful for cases where the threadgroup needs to 2467 * perform exec. This is useful for cases where the threadgroup needs to
2461 * stay stable across blockable operations. 2468 * stay stable across blockable operations.
2462 * 2469 *
2463 * fork and exit paths explicitly call threadgroup_change_{begin|end}() for 2470 * fork and exit paths explicitly call threadgroup_change_{begin|end}() for
2464 * synchronization. While held, no new task will be added to threadgroup 2471 * synchronization. While held, no new task will be added to threadgroup
2465 * and no existing live task will have its PF_EXITING set. 2472 * and no existing live task will have its PF_EXITING set.
2466 * 2473 *
2467 * During exec, a task goes and puts its thread group through unusual 2474 * During exec, a task goes and puts its thread group through unusual
2468 * changes. After de-threading, exclusive access is assumed to resources 2475 * changes. After de-threading, exclusive access is assumed to resources
2469 * which are usually shared by tasks in the same group - e.g. sighand may 2476 * which are usually shared by tasks in the same group - e.g. sighand may
2470 * be replaced with a new one. Also, the exec'ing task takes over group 2477 * be replaced with a new one. Also, the exec'ing task takes over group
2471 * leader role including its pid. Exclude these changes while locked by 2478 * leader role including its pid. Exclude these changes while locked by
2472 * grabbing cred_guard_mutex which is used to synchronize exec path. 2479 * grabbing cred_guard_mutex which is used to synchronize exec path.
2473 */ 2480 */
2474 static inline void threadgroup_lock(struct task_struct *tsk) 2481 static inline void threadgroup_lock(struct task_struct *tsk)
2475 { 2482 {
2476 /* 2483 /*
2477 * exec uses exit for de-threading, nesting group_rwsem inside 2484 * exec uses exit for de-threading, nesting group_rwsem inside
2478 * cred_guard_mutex. Grab cred_guard_mutex first. 2485 * cred_guard_mutex. Grab cred_guard_mutex first.
2479 */ 2486 */
2480 mutex_lock(&tsk->signal->cred_guard_mutex); 2487 mutex_lock(&tsk->signal->cred_guard_mutex);
2481 down_write(&tsk->signal->group_rwsem); 2488 down_write(&tsk->signal->group_rwsem);
2482 } 2489 }
2483 2490
2484 /** 2491 /**
2485 * threadgroup_unlock - unlock threadgroup 2492 * threadgroup_unlock - unlock threadgroup
2486 * @tsk: member task of the threadgroup to unlock 2493 * @tsk: member task of the threadgroup to unlock
2487 * 2494 *
2488 * Reverse threadgroup_lock(). 2495 * Reverse threadgroup_lock().
2489 */ 2496 */
2490 static inline void threadgroup_unlock(struct task_struct *tsk) 2497 static inline void threadgroup_unlock(struct task_struct *tsk)
2491 { 2498 {
2492 up_write(&tsk->signal->group_rwsem); 2499 up_write(&tsk->signal->group_rwsem);
2493 mutex_unlock(&tsk->signal->cred_guard_mutex); 2500 mutex_unlock(&tsk->signal->cred_guard_mutex);
2494 } 2501 }
2495 #else 2502 #else
2496 static inline void threadgroup_change_begin(struct task_struct *tsk) {} 2503 static inline void threadgroup_change_begin(struct task_struct *tsk) {}
2497 static inline void threadgroup_change_end(struct task_struct *tsk) {} 2504 static inline void threadgroup_change_end(struct task_struct *tsk) {}
2498 static inline void threadgroup_lock(struct task_struct *tsk) {} 2505 static inline void threadgroup_lock(struct task_struct *tsk) {}
2499 static inline void threadgroup_unlock(struct task_struct *tsk) {} 2506 static inline void threadgroup_unlock(struct task_struct *tsk) {}
2500 #endif 2507 #endif
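Editor's note: as a hedged sketch of the intended pattern (hypothetical helper; the notable consumer at this point in history is cgroup attach), walking every thread of a group without racing against fork, exit or exec looks roughly like this:

	/* Sketch: visit each live thread of @leader's group.  With
	 * threadgroup_lock() held no thread can be added, set PF_EXITING,
	 * or take over leadership via exec while we iterate.  @fn must not
	 * sleep, since the walk also sits inside rcu_read_lock(). */
	static void visit_threadgroup(struct task_struct *leader,
				      void (*fn)(struct task_struct *))
	{
		struct task_struct *t = leader;

		threadgroup_lock(leader);
		rcu_read_lock();		/* next_thread() walks an RCU list */
		do {
			fn(t);
		} while_each_thread(leader, t);
		rcu_read_unlock();
		threadgroup_unlock(leader);
	}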
2501 2508
2502 #ifndef __HAVE_THREAD_FUNCTIONS 2509 #ifndef __HAVE_THREAD_FUNCTIONS
2503 2510
2504 #define task_thread_info(task) ((struct thread_info *)(task)->stack) 2511 #define task_thread_info(task) ((struct thread_info *)(task)->stack)
2505 #define task_stack_page(task) ((task)->stack) 2512 #define task_stack_page(task) ((task)->stack)
2506 2513
2507 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) 2514 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
2508 { 2515 {
2509 *task_thread_info(p) = *task_thread_info(org); 2516 *task_thread_info(p) = *task_thread_info(org);
2510 task_thread_info(p)->task = p; 2517 task_thread_info(p)->task = p;
2511 } 2518 }
2512 2519
2513 static inline unsigned long *end_of_stack(struct task_struct *p) 2520 static inline unsigned long *end_of_stack(struct task_struct *p)
2514 { 2521 {
2515 return (unsigned long *)(task_thread_info(p) + 1); 2522 return (unsigned long *)(task_thread_info(p) + 1);
2516 } 2523 }
2517 2524
2518 #endif 2525 #endif
2519 2526
2520 static inline int object_is_on_stack(void *obj) 2527 static inline int object_is_on_stack(void *obj)
2521 { 2528 {
2522 void *stack = task_stack_page(current); 2529 void *stack = task_stack_page(current);
2523 2530
2524 return (obj >= stack) && (obj < (stack + THREAD_SIZE)); 2531 return (obj >= stack) && (obj < (stack + THREAD_SIZE));
2525 } 2532 }
2526 2533
2527 extern void thread_info_cache_init(void); 2534 extern void thread_info_cache_init(void);
2528 2535
2529 #ifdef CONFIG_DEBUG_STACK_USAGE 2536 #ifdef CONFIG_DEBUG_STACK_USAGE
2530 static inline unsigned long stack_not_used(struct task_struct *p) 2537 static inline unsigned long stack_not_used(struct task_struct *p)
2531 { 2538 {
2532 unsigned long *n = end_of_stack(p); 2539 unsigned long *n = end_of_stack(p);
2533 2540
2534 do { /* Skip over canary */ 2541 do { /* Skip over canary */
2535 n++; 2542 n++;
2536 } while (!*n); 2543 } while (!*n);
2537 2544
2538 return (unsigned long)n - (unsigned long)end_of_stack(p); 2545 return (unsigned long)n - (unsigned long)end_of_stack(p);
2539 } 2546 }
2540 #endif 2547 #endif
2541 2548
2542 /* set thread flags in other task's structures 2549 /* set thread flags in other task's structures
2543 * - see asm/thread_info.h for TIF_xxxx flags available 2550 * - see asm/thread_info.h for TIF_xxxx flags available
2544 */ 2551 */
2545 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) 2552 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
2546 { 2553 {
2547 set_ti_thread_flag(task_thread_info(tsk), flag); 2554 set_ti_thread_flag(task_thread_info(tsk), flag);
2548 } 2555 }
2549 2556
2550 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) 2557 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
2551 { 2558 {
2552 clear_ti_thread_flag(task_thread_info(tsk), flag); 2559 clear_ti_thread_flag(task_thread_info(tsk), flag);
2553 } 2560 }
2554 2561
2555 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) 2562 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
2556 { 2563 {
2557 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); 2564 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
2558 } 2565 }
2559 2566
2560 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) 2567 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
2561 { 2568 {
2562 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); 2569 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
2563 } 2570 }
2564 2571
2565 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) 2572 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
2566 { 2573 {
2567 return test_ti_thread_flag(task_thread_info(tsk), flag); 2574 return test_ti_thread_flag(task_thread_info(tsk), flag);
2568 } 2575 }
2569 2576
2570 static inline void set_tsk_need_resched(struct task_struct *tsk) 2577 static inline void set_tsk_need_resched(struct task_struct *tsk)
2571 { 2578 {
2572 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 2579 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
2573 } 2580 }
2574 2581
2575 static inline void clear_tsk_need_resched(struct task_struct *tsk) 2582 static inline void clear_tsk_need_resched(struct task_struct *tsk)
2576 { 2583 {
2577 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 2584 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
2578 } 2585 }
2579 2586
2580 static inline int test_tsk_need_resched(struct task_struct *tsk) 2587 static inline int test_tsk_need_resched(struct task_struct *tsk)
2581 { 2588 {
2582 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); 2589 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
2583 } 2590 }
2584 2591
2585 static inline int restart_syscall(void) 2592 static inline int restart_syscall(void)
2586 { 2593 {
2587 set_tsk_thread_flag(current, TIF_SIGPENDING); 2594 set_tsk_thread_flag(current, TIF_SIGPENDING);
2588 return -ERESTARTNOINTR; 2595 return -ERESTARTNOINTR;
2589 } 2596 }
2590 2597
2591 static inline int signal_pending(struct task_struct *p) 2598 static inline int signal_pending(struct task_struct *p)
2592 { 2599 {
2593 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); 2600 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
2594 } 2601 }
2595 2602
2596 static inline int __fatal_signal_pending(struct task_struct *p) 2603 static inline int __fatal_signal_pending(struct task_struct *p)
2597 { 2604 {
2598 return unlikely(sigismember(&p->pending.signal, SIGKILL)); 2605 return unlikely(sigismember(&p->pending.signal, SIGKILL));
2599 } 2606 }
2600 2607
2601 static inline int fatal_signal_pending(struct task_struct *p) 2608 static inline int fatal_signal_pending(struct task_struct *p)
2602 { 2609 {
2603 return signal_pending(p) && __fatal_signal_pending(p); 2610 return signal_pending(p) && __fatal_signal_pending(p);
2604 } 2611 }
2605 2612
2606 static inline int signal_pending_state(long state, struct task_struct *p) 2613 static inline int signal_pending_state(long state, struct task_struct *p)
2607 { 2614 {
2608 if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) 2615 if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
2609 return 0; 2616 return 0;
2610 if (!signal_pending(p)) 2617 if (!signal_pending(p))
2611 return 0; 2618 return 0;
2612 2619
2613 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); 2620 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
2614 } 2621 }
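Editor's note: these helpers are typically consumed from a hand-rolled wait loop; a minimal sketch of the interruptible form (hypothetical condition and function name):

	/* Sketch: sleep until *done becomes true, bailing out with
	 * -ERESTARTSYS if a signal arrives first. */
	static int wait_until_done(int *done)
	{
		int ret = 0;

		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (*done)
				break;
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			schedule();
		}
		__set_current_state(TASK_RUNNING);
		return ret;
	}

A TASK_KILLABLE variant would test fatal_signal_pending() instead, which is exactly the distinction signal_pending_state() encodes for the scheduler.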
2615 2622
2616 static inline int need_resched(void) 2623 static inline int need_resched(void)
2617 { 2624 {
2618 return unlikely(test_thread_flag(TIF_NEED_RESCHED)); 2625 return unlikely(test_thread_flag(TIF_NEED_RESCHED));
2619 } 2626 }
2620 2627
2621 /* 2628 /*
2622 * cond_resched() and cond_resched_lock(): latency reduction via 2629 * cond_resched() and cond_resched_lock(): latency reduction via
2623 * explicit rescheduling in places that are safe. The return 2630 * explicit rescheduling in places that are safe. The return
2624 * value indicates whether a reschedule was done in fact. 2631 * value indicates whether a reschedule was done in fact.
2625 * cond_resched_lock() will drop the spinlock before scheduling, 2632 * cond_resched_lock() will drop the spinlock before scheduling,
2626 * cond_resched_softirq() will enable bhs before scheduling. 2633 * cond_resched_softirq() will enable bhs before scheduling.
2627 */ 2634 */
2628 extern int _cond_resched(void); 2635 extern int _cond_resched(void);
2629 2636
2630 #define cond_resched() ({ \ 2637 #define cond_resched() ({ \
2631 __might_sleep(__FILE__, __LINE__, 0); \ 2638 __might_sleep(__FILE__, __LINE__, 0); \
2632 _cond_resched(); \ 2639 _cond_resched(); \
2633 }) 2640 })
2634 2641
2635 extern int __cond_resched_lock(spinlock_t *lock); 2642 extern int __cond_resched_lock(spinlock_t *lock);
2636 2643
2637 #ifdef CONFIG_PREEMPT_COUNT 2644 #ifdef CONFIG_PREEMPT_COUNT
2638 #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET 2645 #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
2639 #else 2646 #else
2640 #define PREEMPT_LOCK_OFFSET 0 2647 #define PREEMPT_LOCK_OFFSET 0
2641 #endif 2648 #endif
2642 2649
2643 #define cond_resched_lock(lock) ({ \ 2650 #define cond_resched_lock(lock) ({ \
2644 __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ 2651 __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
2645 __cond_resched_lock(lock); \ 2652 __cond_resched_lock(lock); \
2646 }) 2653 })
2647 2654
2648 extern int __cond_resched_softirq(void); 2655 extern int __cond_resched_softirq(void);
2649 2656
2650 #define cond_resched_softirq() ({ \ 2657 #define cond_resched_softirq() ({ \
2651 __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ 2658 __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
2652 __cond_resched_softirq(); \ 2659 __cond_resched_softirq(); \
2653 }) 2660 })
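Editor's note: a brief sketch of the plain process-context form (hypothetical helper); cond_resched() is the common case, while cond_resched_lock() additionally drops and retakes the given spinlock when it does reschedule:

	/* Sketch: bound the scheduling latency of a long page-zeroing loop. */
	static void zero_pages(struct page **pages, unsigned long nr)
	{
		unsigned long i;

		for (i = 0; i < nr; i++) {
			clear_highpage(pages[i]);
			cond_resched();	/* may sleep; __might_sleep() checks the context */
		}
	}

Inside a spinlocked walk the same spot would call cond_resched_lock(lock) and treat a nonzero return as "the lock was dropped, revalidate the data structure before continuing".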
2654 2661
2655 /* 2662 /*
2656 * Does a critical section need to be broken due to another 2663 * Does a critical section need to be broken due to another
2657 * task waiting?: (technically does not depend on CONFIG_PREEMPT, 2664 * task waiting?: (technically does not depend on CONFIG_PREEMPT,
2658 * but a general need for low latency) 2665 * but a general need for low latency)
2659 */ 2666 */
2660 static inline int spin_needbreak(spinlock_t *lock) 2667 static inline int spin_needbreak(spinlock_t *lock)
2661 { 2668 {
2662 #ifdef CONFIG_PREEMPT 2669 #ifdef CONFIG_PREEMPT
2663 return spin_is_contended(lock); 2670 return spin_is_contended(lock);
2664 #else 2671 #else
2665 return 0; 2672 return 0;
2666 #endif 2673 #endif
2667 } 2674 }
2668 2675
2669 /* 2676 /*
2670 * Thread group CPU time accounting. 2677 * Thread group CPU time accounting.
2671 */ 2678 */
2672 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); 2679 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
2673 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); 2680 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
2674 2681
2675 static inline void thread_group_cputime_init(struct signal_struct *sig) 2682 static inline void thread_group_cputime_init(struct signal_struct *sig)
2676 { 2683 {
2677 raw_spin_lock_init(&sig->cputimer.lock); 2684 raw_spin_lock_init(&sig->cputimer.lock);
2678 } 2685 }
2679 2686
2680 /* 2687 /*
2681 * Reevaluate whether the task has signals pending delivery. 2688 * Reevaluate whether the task has signals pending delivery.
2682 * Wake the task if so. 2689 * Wake the task if so.
2683 * This is required every time the blocked sigset_t changes. 2690 * This is required every time the blocked sigset_t changes.
2684 * callers must hold sighand->siglock. 2691 * callers must hold sighand->siglock.
2685 */ 2692 */
2686 extern void recalc_sigpending_and_wake(struct task_struct *t); 2693 extern void recalc_sigpending_and_wake(struct task_struct *t);
2687 extern void recalc_sigpending(void); 2694 extern void recalc_sigpending(void);
2688 2695
2689 extern void signal_wake_up(struct task_struct *t, int resume_stopped); 2696 extern void signal_wake_up(struct task_struct *t, int resume_stopped);
2690 2697
2691 /* 2698 /*
2692 * Wrappers for p->thread_info->cpu access. No-op on UP. 2699 * Wrappers for p->thread_info->cpu access. No-op on UP.
2693 */ 2700 */
2694 #ifdef CONFIG_SMP 2701 #ifdef CONFIG_SMP
2695 2702
2696 static inline unsigned int task_cpu(const struct task_struct *p) 2703 static inline unsigned int task_cpu(const struct task_struct *p)
2697 { 2704 {
2698 return task_thread_info(p)->cpu; 2705 return task_thread_info(p)->cpu;
2699 } 2706 }
2700 2707
2701 extern void set_task_cpu(struct task_struct *p, unsigned int cpu); 2708 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
2702 2709
2703 #else 2710 #else
2704 2711
2705 static inline unsigned int task_cpu(const struct task_struct *p) 2712 static inline unsigned int task_cpu(const struct task_struct *p)
2706 { 2713 {
2707 return 0; 2714 return 0;
2708 } 2715 }
2709 2716
2710 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) 2717 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
2711 { 2718 {
2712 } 2719 }
2713 2720
2714 #endif /* CONFIG_SMP */ 2721 #endif /* CONFIG_SMP */
2715 2722
2716 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); 2723 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
2717 extern long sched_getaffinity(pid_t pid, struct cpumask *mask); 2724 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
2718 2725
2719 extern void normalize_rt_tasks(void); 2726 extern void normalize_rt_tasks(void);
2720 2727
2721 #ifdef CONFIG_CGROUP_SCHED 2728 #ifdef CONFIG_CGROUP_SCHED
2722 2729
2723 extern struct task_group root_task_group; 2730 extern struct task_group root_task_group;
2724 2731
2725 extern struct task_group *sched_create_group(struct task_group *parent); 2732 extern struct task_group *sched_create_group(struct task_group *parent);
2726 extern void sched_destroy_group(struct task_group *tg); 2733 extern void sched_destroy_group(struct task_group *tg);
2727 extern void sched_move_task(struct task_struct *tsk); 2734 extern void sched_move_task(struct task_struct *tsk);
2728 #ifdef CONFIG_FAIR_GROUP_SCHED 2735 #ifdef CONFIG_FAIR_GROUP_SCHED
2729 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 2736 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
2730 extern unsigned long sched_group_shares(struct task_group *tg); 2737 extern unsigned long sched_group_shares(struct task_group *tg);
2731 #endif 2738 #endif
2732 #ifdef CONFIG_RT_GROUP_SCHED 2739 #ifdef CONFIG_RT_GROUP_SCHED
2733 extern int sched_group_set_rt_runtime(struct task_group *tg, 2740 extern int sched_group_set_rt_runtime(struct task_group *tg,
2734 long rt_runtime_us); 2741 long rt_runtime_us);
2735 extern long sched_group_rt_runtime(struct task_group *tg); 2742 extern long sched_group_rt_runtime(struct task_group *tg);
2736 extern int sched_group_set_rt_period(struct task_group *tg, 2743 extern int sched_group_set_rt_period(struct task_group *tg,
2737 long rt_period_us); 2744 long rt_period_us);
2738 extern long sched_group_rt_period(struct task_group *tg); 2745 extern long sched_group_rt_period(struct task_group *tg);
2739 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); 2746 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
2740 #endif 2747 #endif
2741 #endif /* CONFIG_CGROUP_SCHED */ 2748 #endif /* CONFIG_CGROUP_SCHED */
2742 2749
2743 extern int task_can_switch_user(struct user_struct *up, 2750 extern int task_can_switch_user(struct user_struct *up,
2744 struct task_struct *tsk); 2751 struct task_struct *tsk);
2745 2752
2746 #ifdef CONFIG_TASK_XACCT 2753 #ifdef CONFIG_TASK_XACCT
2747 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 2754 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
2748 { 2755 {
2749 tsk->ioac.rchar += amt; 2756 tsk->ioac.rchar += amt;
2750 } 2757 }
2751 2758
2752 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 2759 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
2753 { 2760 {
2754 tsk->ioac.wchar += amt; 2761 tsk->ioac.wchar += amt;
2755 } 2762 }
2756 2763
2757 static inline void inc_syscr(struct task_struct *tsk) 2764 static inline void inc_syscr(struct task_struct *tsk)
2758 { 2765 {
2759 tsk->ioac.syscr++; 2766 tsk->ioac.syscr++;
2760 } 2767 }
2761 2768
2762 static inline void inc_syscw(struct task_struct *tsk) 2769 static inline void inc_syscw(struct task_struct *tsk)
2763 { 2770 {
2764 tsk->ioac.syscw++; 2771 tsk->ioac.syscw++;
2765 } 2772 }
2766 #else 2773 #else
2767 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 2774 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
2768 { 2775 {
2769 } 2776 }
2770 2777
2771 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 2778 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
2772 { 2779 {
2773 } 2780 }
2774 2781
2775 static inline void inc_syscr(struct task_struct *tsk) 2782 static inline void inc_syscr(struct task_struct *tsk)
2776 { 2783 {
2777 } 2784 }
2778 2785
2779 static inline void inc_syscw(struct task_struct *tsk) 2786 static inline void inc_syscw(struct task_struct *tsk)
2780 { 2787 {
2781 } 2788 }
2782 #endif 2789 #endif
2783 2790
2784 #ifndef TASK_SIZE_OF 2791 #ifndef TASK_SIZE_OF
2785 #define TASK_SIZE_OF(tsk) TASK_SIZE 2792 #define TASK_SIZE_OF(tsk) TASK_SIZE
2786 #endif 2793 #endif
2787 2794
2788 #ifdef CONFIG_MM_OWNER 2795 #ifdef CONFIG_MM_OWNER
2789 extern void mm_update_next_owner(struct mm_struct *mm); 2796 extern void mm_update_next_owner(struct mm_struct *mm);
2790 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2797 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
2791 #else 2798 #else
2792 static inline void mm_update_next_owner(struct mm_struct *mm) 2799 static inline void mm_update_next_owner(struct mm_struct *mm)
2793 { 2800 {
2794 } 2801 }
2795 2802
2796 static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 2803 static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
2797 { 2804 {
2798 } 2805 }
2799 #endif /* CONFIG_MM_OWNER */ 2806 #endif /* CONFIG_MM_OWNER */
2800 2807
2801 static inline unsigned long task_rlimit(const struct task_struct *tsk, 2808 static inline unsigned long task_rlimit(const struct task_struct *tsk,
2802 unsigned int limit) 2809 unsigned int limit)
2803 { 2810 {
2804 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); 2811 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
2805 } 2812 }
2806 2813
2807 static inline unsigned long task_rlimit_max(const struct task_struct *tsk, 2814 static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
2808 unsigned int limit) 2815 unsigned int limit)
2809 { 2816 {
2810 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); 2817 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
2811 } 2818 }
2812 2819
2813 static inline unsigned long rlimit(unsigned int limit) 2820 static inline unsigned long rlimit(unsigned int limit)
2814 { 2821 {
2815 return task_rlimit(current, limit); 2822 return task_rlimit(current, limit);
2816 } 2823 }
2817 2824
2818 static inline unsigned long rlimit_max(unsigned int limit) 2825 static inline unsigned long rlimit_max(unsigned int limit)
2819 { 2826 {
2820 return task_rlimit_max(current, limit); 2827 return task_rlimit_max(current, limit);
2821 } 2828 }
2822 2829
2823 #endif /* __KERNEL__ */ 2830 #endif /* __KERNEL__ */
2824 2831
2825 #endif 2832 #endif
2826 2833
1 /* 1 /*
2 * linux/kernel/softirq.c 2 * linux/kernel/softirq.c
3 * 3 *
4 * Copyright (C) 1992 Linus Torvalds 4 * Copyright (C) 1992 Linus Torvalds
5 * 5 *
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 * 9 *
10 * Remote softirq infrastructure is by Jens Axboe. 10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 11 */
12 12
13 #include <linux/export.h> 13 #include <linux/export.h>
14 #include <linux/kernel_stat.h> 14 #include <linux/kernel_stat.h>
15 #include <linux/interrupt.h> 15 #include <linux/interrupt.h>
16 #include <linux/init.h> 16 #include <linux/init.h>
17 #include <linux/mm.h> 17 #include <linux/mm.h>
18 #include <linux/notifier.h> 18 #include <linux/notifier.h>
19 #include <linux/percpu.h> 19 #include <linux/percpu.h>
20 #include <linux/cpu.h> 20 #include <linux/cpu.h>
21 #include <linux/freezer.h> 21 #include <linux/freezer.h>
22 #include <linux/kthread.h> 22 #include <linux/kthread.h>
23 #include <linux/rcupdate.h> 23 #include <linux/rcupdate.h>
24 #include <linux/ftrace.h> 24 #include <linux/ftrace.h>
25 #include <linux/smp.h> 25 #include <linux/smp.h>
26 #include <linux/tick.h> 26 #include <linux/tick.h>
27 27
28 #define CREATE_TRACE_POINTS 28 #define CREATE_TRACE_POINTS
29 #include <trace/events/irq.h> 29 #include <trace/events/irq.h>
30 30
31 #include <asm/irq.h> 31 #include <asm/irq.h>
32 /* 32 /*
33 - No shared variables, all the data are CPU local. 33 - No shared variables, all the data are CPU local.
34 - If a softirq needs serialization, let it serialize itself 34 - If a softirq needs serialization, let it serialize itself
35 by its own spinlocks. 35 by its own spinlocks.
36 - Even if softirq is serialized, only local cpu is marked for 36 - Even if softirq is serialized, only local cpu is marked for
37 execution. Hence, we get something sort of weak cpu binding. 37 execution. Hence, we get something sort of weak cpu binding.
38 Though it is still not clear whether it will result in better 38 Though it is still not clear whether it will result in better
39 locality or not. 39 locality or not.
40 40
41 Examples: 41 Examples:
42 - NET RX softirq. It is multithreaded and does not require 42 - NET RX softirq. It is multithreaded and does not require
43 any global serialization. 43 any global serialization.
44 - NET TX softirq. It kicks software netdevice queues, hence 44 - NET TX softirq. It kicks software netdevice queues, hence
45 it is logically serialized per device, but this serialization 45 it is logically serialized per device, but this serialization
46 is invisible to common code. 46 is invisible to common code.
47 - Tasklets: serialized wrt itself. 47 - Tasklets: serialized wrt itself.
48 */ 48 */
49 49
50 #ifndef __ARCH_IRQ_STAT 50 #ifndef __ARCH_IRQ_STAT
51 irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; 51 irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
52 EXPORT_SYMBOL(irq_stat); 52 EXPORT_SYMBOL(irq_stat);
53 #endif 53 #endif
54 54
55 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57 DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59 char *softirq_to_name[NR_SOFTIRQS] = { 59 char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62 }; 62 };
63 63
64 /* 64 /*
65 * we cannot loop indefinitely here to avoid userspace starvation, 65 * we cannot loop indefinitely here to avoid userspace starvation,
66 * but we also don't want to introduce a worst case 1/HZ latency 66 * but we also don't want to introduce a worst case 1/HZ latency
67 * to the pending events, so let the scheduler balance 67 * to the pending events, so let the scheduler balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70 static void wakeup_softirqd(void) 70 static void wakeup_softirqd(void)
71 { 71 {
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __this_cpu_read(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
77 } 77 }
78 78
79 /* 79 /*
80 * preempt_count and SOFTIRQ_OFFSET usage: 80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving 81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing. 82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) 83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable. 84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing 85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled. 86 * softirq and whether we just have bh disabled.
87 */ 87 */
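Editor's note: for reference, the consumers of this distinction are the context helpers in include/linux/hardirq.h, roughly as follows (paraphrased, not part of this patch):

	/* in_softirq() is nonzero both while handlers run (SOFTIRQ_OFFSET step)
	 * and inside a local_bh_disable() section (SOFTIRQ_DISABLE_OFFSET step);
	 * in_serving_softirq() isolates the "actually processing" case. */
	#define in_softirq()		(softirq_count())
	#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)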
88 88
89 /* 89 /*
90 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
91 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
92 */ 92 */
93 #ifdef CONFIG_TRACE_IRQFLAGS 93 #ifdef CONFIG_TRACE_IRQFLAGS
94 static void __local_bh_disable(unsigned long ip, unsigned int cnt) 94 static void __local_bh_disable(unsigned long ip, unsigned int cnt)
95 { 95 {
96 unsigned long flags; 96 unsigned long flags;
97 97
98 WARN_ON_ONCE(in_irq()); 98 WARN_ON_ONCE(in_irq());
99 99
100 raw_local_irq_save(flags); 100 raw_local_irq_save(flags);
101 /* 101 /*
102 * The preempt tracer hooks into add_preempt_count and will break 102 * The preempt tracer hooks into add_preempt_count and will break
103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
104 * is set and before current->softirq_enabled is cleared. 104 * is set and before current->softirq_enabled is cleared.
105 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
106 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
107 */ 107 */
108 preempt_count() += cnt; 108 preempt_count() += cnt;
109 /* 109 /*
110 * Were softirqs turned off above: 110 * Were softirqs turned off above:
111 */ 111 */
112 if (softirq_count() == cnt) 112 if (softirq_count() == cnt)
113 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
114 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
115 115
116 if (preempt_count() == cnt) 116 if (preempt_count() == cnt)
117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
118 } 118 }
119 #else /* !CONFIG_TRACE_IRQFLAGS */ 119 #else /* !CONFIG_TRACE_IRQFLAGS */
120 static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 120 static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
121 { 121 {
122 add_preempt_count(cnt); 122 add_preempt_count(cnt);
123 barrier(); 123 barrier();
124 } 124 }
125 #endif /* CONFIG_TRACE_IRQFLAGS */ 125 #endif /* CONFIG_TRACE_IRQFLAGS */
126 126
127 void local_bh_disable(void) 127 void local_bh_disable(void)
128 { 128 {
129 __local_bh_disable((unsigned long)__builtin_return_address(0), 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET); 130 SOFTIRQ_DISABLE_OFFSET);
131 } 131 }
132 132
133 EXPORT_SYMBOL(local_bh_disable); 133 EXPORT_SYMBOL(local_bh_disable);
134 134
135 static void __local_bh_enable(unsigned int cnt) 135 static void __local_bh_enable(unsigned int cnt)
136 { 136 {
137 WARN_ON_ONCE(in_irq()); 137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled()); 138 WARN_ON_ONCE(!irqs_disabled());
139 139
140 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0)); 141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt); 142 sub_preempt_count(cnt);
143 } 143 }
144 144
145 /* 145 /*
146 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
147 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
148 * without processing still-pending softirqs: 148 * without processing still-pending softirqs:
149 */ 149 */
150 void _local_bh_enable(void) 150 void _local_bh_enable(void)
151 { 151 {
152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
153 } 153 }
154 154
155 EXPORT_SYMBOL(_local_bh_enable); 155 EXPORT_SYMBOL(_local_bh_enable);
156 156
157 static inline void _local_bh_enable_ip(unsigned long ip) 157 static inline void _local_bh_enable_ip(unsigned long ip)
158 { 158 {
159 WARN_ON_ONCE(in_irq() || irqs_disabled()); 159 WARN_ON_ONCE(in_irq() || irqs_disabled());
160 #ifdef CONFIG_TRACE_IRQFLAGS 160 #ifdef CONFIG_TRACE_IRQFLAGS
161 local_irq_disable(); 161 local_irq_disable();
162 #endif 162 #endif
163 /* 163 /*
164 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
165 */ 165 */
166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
167 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
168 /* 168 /*
169 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
170 * softirq processing: 170 * softirq processing:
171 */ 171 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
173 173
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
175 do_softirq(); 175 do_softirq();
176 176
177 dec_preempt_count(); 177 dec_preempt_count();
178 #ifdef CONFIG_TRACE_IRQFLAGS 178 #ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 179 local_irq_enable();
180 #endif 180 #endif
181 preempt_check_resched(); 181 preempt_check_resched();
182 } 182 }
183 183
184 void local_bh_enable(void) 184 void local_bh_enable(void)
185 { 185 {
186 _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 186 _local_bh_enable_ip((unsigned long)__builtin_return_address(0));
187 } 187 }
188 EXPORT_SYMBOL(local_bh_enable); 188 EXPORT_SYMBOL(local_bh_enable);
189 189
190 void local_bh_enable_ip(unsigned long ip) 190 void local_bh_enable_ip(unsigned long ip)
191 { 191 {
192 _local_bh_enable_ip(ip); 192 _local_bh_enable_ip(ip);
193 } 193 }
194 EXPORT_SYMBOL(local_bh_enable_ip); 194 EXPORT_SYMBOL(local_bh_enable_ip);
195 195
196 /* 196 /*
197 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 197 * We restart softirq processing MAX_SOFTIRQ_RESTART times,
198 * and we fall back to softirqd after that. 198 * and we fall back to softirqd after that.
199 * 199 *
200 * This number has been established via experimentation. 200 * This number has been established via experimentation.
201 * The two things to balance are latency and fairness - 201 * The two things to balance are latency and fairness -
202 * we want to handle softirqs as soon as possible, but they 202 * we want to handle softirqs as soon as possible, but they
203 * should not be able to lock up the box. 203 * should not be able to lock up the box.
204 */ 204 */
205 #define MAX_SOFTIRQ_RESTART 10 205 #define MAX_SOFTIRQ_RESTART 10
206 206
207 asmlinkage void __do_softirq(void) 207 asmlinkage void __do_softirq(void)
208 { 208 {
209 struct softirq_action *h; 209 struct softirq_action *h;
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 unsigned long old_flags = current->flags;
213 214
215 /*
216 * Mask out PF_MEMALLOC as the current task context is borrowed for the
217 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
218 * again if the socket is related to swap.
219 */
220 current->flags &= ~PF_MEMALLOC;
221
214 pending = local_softirq_pending(); 222 pending = local_softirq_pending();
215 account_system_vtime(current); 223 account_system_vtime(current);
216 224
217 __local_bh_disable((unsigned long)__builtin_return_address(0), 225 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET); 226 SOFTIRQ_OFFSET);
219 lockdep_softirq_enter(); 227 lockdep_softirq_enter();
220 228
221 cpu = smp_processor_id(); 229 cpu = smp_processor_id();
222 restart: 230 restart:
223 /* Reset the pending bitmask before enabling irqs */ 231 /* Reset the pending bitmask before enabling irqs */
224 set_softirq_pending(0); 232 set_softirq_pending(0);
225 233
226 local_irq_enable(); 234 local_irq_enable();
227 235
228 h = softirq_vec; 236 h = softirq_vec;
229 237
230 do { 238 do {
231 if (pending & 1) { 239 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec; 240 unsigned int vec_nr = h - softirq_vec;
233 int prev_count = preempt_count(); 241 int prev_count = preempt_count();
234 242
235 kstat_incr_softirqs_this_cpu(vec_nr); 243 kstat_incr_softirqs_this_cpu(vec_nr);
236 244
237 trace_softirq_entry(vec_nr); 245 trace_softirq_entry(vec_nr);
238 h->action(h); 246 h->action(h);
239 trace_softirq_exit(vec_nr); 247 trace_softirq_exit(vec_nr);
240 if (unlikely(prev_count != preempt_count())) { 248 if (unlikely(prev_count != preempt_count())) {
241 printk(KERN_ERR "huh, entered softirq %u %s %p" 249 printk(KERN_ERR "huh, entered softirq %u %s %p"
242 "with preempt_count %08x," 250 "with preempt_count %08x,"
243 " exited with %08x?\n", vec_nr, 251 " exited with %08x?\n", vec_nr,
244 softirq_to_name[vec_nr], h->action, 252 softirq_to_name[vec_nr], h->action,
245 prev_count, preempt_count()); 253 prev_count, preempt_count());
246 preempt_count() = prev_count; 254 preempt_count() = prev_count;
247 } 255 }
248 256
249 rcu_bh_qs(cpu); 257 rcu_bh_qs(cpu);
250 } 258 }
251 h++; 259 h++;
252 pending >>= 1; 260 pending >>= 1;
253 } while (pending); 261 } while (pending);
254 262
255 local_irq_disable(); 263 local_irq_disable();
256 264
257 pending = local_softirq_pending(); 265 pending = local_softirq_pending();
258 if (pending && --max_restart) 266 if (pending && --max_restart)
259 goto restart; 267 goto restart;
260 268
261 if (pending) 269 if (pending)
262 wakeup_softirqd(); 270 wakeup_softirqd();
263 271
264 lockdep_softirq_exit(); 272 lockdep_softirq_exit();
265 273
266 account_system_vtime(current); 274 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 275 __local_bh_enable(SOFTIRQ_OFFSET);
276 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
268 } 277 }
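Editor's note: the save/clear/restore added above leans on tsk_restore_flags() from the parent commit b37f1dd0f5; its approximate shape (a sketch, the authoritative definition lives in include/linux/sched.h) is:

	/* Sketch: put back only the bits named in @flags (here PF_MEMALLOC),
	 * exactly as the preempted task had them before the softirq ran. */
	static inline void tsk_restore_flags(struct task_struct *task,
					     unsigned long orig_flags, unsigned long flags)
	{
		task->flags &= ~flags;
		task->flags |= orig_flags & flags;
	}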
269 278
270 #ifndef __ARCH_HAS_DO_SOFTIRQ 279 #ifndef __ARCH_HAS_DO_SOFTIRQ
271 280
272 asmlinkage void do_softirq(void) 281 asmlinkage void do_softirq(void)
273 { 282 {
274 __u32 pending; 283 __u32 pending;
275 unsigned long flags; 284 unsigned long flags;
276 285
277 if (in_interrupt()) 286 if (in_interrupt())
278 return; 287 return;
279 288
280 local_irq_save(flags); 289 local_irq_save(flags);
281 290
282 pending = local_softirq_pending(); 291 pending = local_softirq_pending();
283 292
284 if (pending) 293 if (pending)
285 __do_softirq(); 294 __do_softirq();
286 295
287 local_irq_restore(flags); 296 local_irq_restore(flags);
288 } 297 }
289 298
290 #endif 299 #endif
291 300
292 /* 301 /*
293 * Enter an interrupt context. 302 * Enter an interrupt context.
294 */ 303 */
295 void irq_enter(void) 304 void irq_enter(void)
296 { 305 {
297 int cpu = smp_processor_id(); 306 int cpu = smp_processor_id();
298 307
299 rcu_irq_enter(); 308 rcu_irq_enter();
300 if (is_idle_task(current) && !in_interrupt()) { 309 if (is_idle_task(current) && !in_interrupt()) {
301 /* 310 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd 311 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt. 312 * here, as softirq will be serviced on return from interrupt.
304 */ 313 */
305 local_bh_disable(); 314 local_bh_disable();
306 tick_check_idle(cpu); 315 tick_check_idle(cpu);
307 _local_bh_enable(); 316 _local_bh_enable();
308 } 317 }
309 318
310 __irq_enter(); 319 __irq_enter();
311 } 320 }
312 321
313 static inline void invoke_softirq(void) 322 static inline void invoke_softirq(void)
314 { 323 {
315 if (!force_irqthreads) { 324 if (!force_irqthreads) {
316 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 325 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
317 __do_softirq(); 326 __do_softirq();
318 #else 327 #else
319 do_softirq(); 328 do_softirq();
320 #endif 329 #endif
321 } else { 330 } else {
322 __local_bh_disable((unsigned long)__builtin_return_address(0), 331 __local_bh_disable((unsigned long)__builtin_return_address(0),
323 SOFTIRQ_OFFSET); 332 SOFTIRQ_OFFSET);
324 wakeup_softirqd(); 333 wakeup_softirqd();
325 __local_bh_enable(SOFTIRQ_OFFSET); 334 __local_bh_enable(SOFTIRQ_OFFSET);
326 } 335 }
327 } 336 }
328 337
329 /* 338 /*
330 * Exit an interrupt context. Process softirqs if needed and possible: 339 * Exit an interrupt context. Process softirqs if needed and possible:
331 */ 340 */
332 void irq_exit(void) 341 void irq_exit(void)
333 { 342 {
334 account_system_vtime(current); 343 account_system_vtime(current);
335 trace_hardirq_exit(); 344 trace_hardirq_exit();
336 sub_preempt_count(IRQ_EXIT_OFFSET); 345 sub_preempt_count(IRQ_EXIT_OFFSET);
337 if (!in_interrupt() && local_softirq_pending()) 346 if (!in_interrupt() && local_softirq_pending())
338 invoke_softirq(); 347 invoke_softirq();
339 348
340 #ifdef CONFIG_NO_HZ 349 #ifdef CONFIG_NO_HZ
341 /* Make sure that timer wheel updates are propagated */ 350 /* Make sure that timer wheel updates are propagated */
342 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 351 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
343 tick_nohz_irq_exit(); 352 tick_nohz_irq_exit();
344 #endif 353 #endif
345 rcu_irq_exit(); 354 rcu_irq_exit();
346 sched_preempt_enable_no_resched(); 355 sched_preempt_enable_no_resched();
347 } 356 }
348 357
349 /* 358 /*
350 * This function must run with irqs disabled! 359 * This function must run with irqs disabled!
351 */ 360 */
352 inline void raise_softirq_irqoff(unsigned int nr) 361 inline void raise_softirq_irqoff(unsigned int nr)
353 { 362 {
354 __raise_softirq_irqoff(nr); 363 __raise_softirq_irqoff(nr);
355 364
356 /* 365 /*
357 * If we're in an interrupt or softirq, we're done 366 * If we're in an interrupt or softirq, we're done
358 * (this also catches softirq-disabled code). We will 367 * (this also catches softirq-disabled code). We will
359 * actually run the softirq once we return from 368 * actually run the softirq once we return from
360 * the irq or softirq. 369 * the irq or softirq.
361 * 370 *
362 * Otherwise we wake up ksoftirqd to make sure we 371 * Otherwise we wake up ksoftirqd to make sure we
363 * schedule the softirq soon. 372 * schedule the softirq soon.
364 */ 373 */
365 if (!in_interrupt()) 374 if (!in_interrupt())
366 wakeup_softirqd(); 375 wakeup_softirqd();
367 } 376 }
368 377
369 void raise_softirq(unsigned int nr) 378 void raise_softirq(unsigned int nr)
370 { 379 {
371 unsigned long flags; 380 unsigned long flags;
372 381
373 local_irq_save(flags); 382 local_irq_save(flags);
374 raise_softirq_irqoff(nr); 383 raise_softirq_irqoff(nr);
375 local_irq_restore(flags); 384 local_irq_restore(flags);
376 } 385 }
377 386
378 void __raise_softirq_irqoff(unsigned int nr) 387 void __raise_softirq_irqoff(unsigned int nr)
379 { 388 {
380 trace_softirq_raise(nr); 389 trace_softirq_raise(nr);
381 or_softirq_pending(1UL << nr); 390 or_softirq_pending(1UL << nr);
382 } 391 }
383 392
384 void open_softirq(int nr, void (*action)(struct softirq_action *)) 393 void open_softirq(int nr, void (*action)(struct softirq_action *))
385 { 394 {
386 softirq_vec[nr].action = action; 395 softirq_vec[nr].action = action;
387 } 396 }
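Editor's note: a hedged sketch of how a subsystem wires these two calls together. MY_SOFTIRQ and the helper names are hypothetical; real slots are the fixed entries of the NR_SOFTIRQS enum such as NET_RX_SOFTIRQ:

	/* Sketch: register an action for a (hypothetical) softirq slot at
	 * init time, then raise it from interrupt paths to defer work. */
	static void my_softirq_action(struct softirq_action *h)
	{
		/* runs with hardirqs enabled, bottom halves marked as active */
	}

	static void my_subsys_init(void)
	{
		open_softirq(MY_SOFTIRQ, my_softirq_action);	/* MY_SOFTIRQ: hypothetical */
	}

	static void my_device_event(void)
	{
		raise_softirq(MY_SOFTIRQ);	/* or raise_softirq_irqoff() with irqs off */
	}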
388 397
389 /* 398 /*
390 * Tasklets 399 * Tasklets
391 */ 400 */
392 struct tasklet_head 401 struct tasklet_head
393 { 402 {
394 struct tasklet_struct *head; 403 struct tasklet_struct *head;
395 struct tasklet_struct **tail; 404 struct tasklet_struct **tail;
396 }; 405 };
397 406
398 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); 407 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
399 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); 408 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
400 409
401 void __tasklet_schedule(struct tasklet_struct *t) 410 void __tasklet_schedule(struct tasklet_struct *t)
402 { 411 {
403 unsigned long flags; 412 unsigned long flags;
404 413
405 local_irq_save(flags); 414 local_irq_save(flags);
406 t->next = NULL; 415 t->next = NULL;
407 *__this_cpu_read(tasklet_vec.tail) = t; 416 *__this_cpu_read(tasklet_vec.tail) = t;
408 __this_cpu_write(tasklet_vec.tail, &(t->next)); 417 __this_cpu_write(tasklet_vec.tail, &(t->next));
409 raise_softirq_irqoff(TASKLET_SOFTIRQ); 418 raise_softirq_irqoff(TASKLET_SOFTIRQ);
410 local_irq_restore(flags); 419 local_irq_restore(flags);
411 } 420 }
412 421
413 EXPORT_SYMBOL(__tasklet_schedule); 422 EXPORT_SYMBOL(__tasklet_schedule);
414 423
415 void __tasklet_hi_schedule(struct tasklet_struct *t) 424 void __tasklet_hi_schedule(struct tasklet_struct *t)
416 { 425 {
417 unsigned long flags; 426 unsigned long flags;
418 427
419 local_irq_save(flags); 428 local_irq_save(flags);
420 t->next = NULL; 429 t->next = NULL;
421 *__this_cpu_read(tasklet_hi_vec.tail) = t; 430 *__this_cpu_read(tasklet_hi_vec.tail) = t;
422 __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); 431 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
423 raise_softirq_irqoff(HI_SOFTIRQ); 432 raise_softirq_irqoff(HI_SOFTIRQ);
424 local_irq_restore(flags); 433 local_irq_restore(flags);
425 } 434 }
426 435
427 EXPORT_SYMBOL(__tasklet_hi_schedule); 436 EXPORT_SYMBOL(__tasklet_hi_schedule);
428 437
429 void __tasklet_hi_schedule_first(struct tasklet_struct *t) 438 void __tasklet_hi_schedule_first(struct tasklet_struct *t)
430 { 439 {
431 BUG_ON(!irqs_disabled()); 440 BUG_ON(!irqs_disabled());
432 441
433 t->next = __this_cpu_read(tasklet_hi_vec.head); 442 t->next = __this_cpu_read(tasklet_hi_vec.head);
434 __this_cpu_write(tasklet_hi_vec.head, t); 443 __this_cpu_write(tasklet_hi_vec.head, t);
435 __raise_softirq_irqoff(HI_SOFTIRQ); 444 __raise_softirq_irqoff(HI_SOFTIRQ);
436 } 445 }
437 446
438 EXPORT_SYMBOL(__tasklet_hi_schedule_first); 447 EXPORT_SYMBOL(__tasklet_hi_schedule_first);
439 448
440 static void tasklet_action(struct softirq_action *a) 449 static void tasklet_action(struct softirq_action *a)
441 { 450 {
442 struct tasklet_struct *list; 451 struct tasklet_struct *list;
443 452
444 local_irq_disable(); 453 local_irq_disable();
445 list = __this_cpu_read(tasklet_vec.head); 454 list = __this_cpu_read(tasklet_vec.head);
446 __this_cpu_write(tasklet_vec.head, NULL); 455 __this_cpu_write(tasklet_vec.head, NULL);
447 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); 456 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
448 local_irq_enable(); 457 local_irq_enable();
449 458
450 while (list) { 459 while (list) {
451 struct tasklet_struct *t = list; 460 struct tasklet_struct *t = list;
452 461
453 list = list->next; 462 list = list->next;
454 463
455 if (tasklet_trylock(t)) { 464 if (tasklet_trylock(t)) {
456 if (!atomic_read(&t->count)) { 465 if (!atomic_read(&t->count)) {
457 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) 466 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
458 BUG(); 467 BUG();
459 t->func(t->data); 468 t->func(t->data);
460 tasklet_unlock(t); 469 tasklet_unlock(t);
461 continue; 470 continue;
462 } 471 }
463 tasklet_unlock(t); 472 tasklet_unlock(t);
464 } 473 }
465 474
466 local_irq_disable(); 475 local_irq_disable();
467 t->next = NULL; 476 t->next = NULL;
468 *__this_cpu_read(tasklet_vec.tail) = t; 477 *__this_cpu_read(tasklet_vec.tail) = t;
469 __this_cpu_write(tasklet_vec.tail, &(t->next)); 478 __this_cpu_write(tasklet_vec.tail, &(t->next));
470 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 479 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
471 local_irq_enable(); 480 local_irq_enable();
472 } 481 }
473 } 482 }
474 483
475 static void tasklet_hi_action(struct softirq_action *a) 484 static void tasklet_hi_action(struct softirq_action *a)
476 { 485 {
477 struct tasklet_struct *list; 486 struct tasklet_struct *list;
478 487
479 local_irq_disable(); 488 local_irq_disable();
480 list = __this_cpu_read(tasklet_hi_vec.head); 489 list = __this_cpu_read(tasklet_hi_vec.head);
481 __this_cpu_write(tasklet_hi_vec.head, NULL); 490 __this_cpu_write(tasklet_hi_vec.head, NULL);
482 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); 491 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
483 local_irq_enable(); 492 local_irq_enable();
484 493
485 while (list) { 494 while (list) {
486 struct tasklet_struct *t = list; 495 struct tasklet_struct *t = list;
487 496
488 list = list->next; 497 list = list->next;
489 498
490 if (tasklet_trylock(t)) { 499 if (tasklet_trylock(t)) {
491 if (!atomic_read(&t->count)) { 500 if (!atomic_read(&t->count)) {
492 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) 501 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
493 BUG(); 502 BUG();
494 t->func(t->data); 503 t->func(t->data);
495 tasklet_unlock(t); 504 tasklet_unlock(t);
496 continue; 505 continue;
497 } 506 }
498 tasklet_unlock(t); 507 tasklet_unlock(t);
499 } 508 }
500 509
501 local_irq_disable(); 510 local_irq_disable();
502 t->next = NULL; 511 t->next = NULL;
503 *__this_cpu_read(tasklet_hi_vec.tail) = t; 512 *__this_cpu_read(tasklet_hi_vec.tail) = t;
504 __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); 513 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
505 __raise_softirq_irqoff(HI_SOFTIRQ); 514 __raise_softirq_irqoff(HI_SOFTIRQ);
506 local_irq_enable(); 515 local_irq_enable();
507 } 516 }
508 } 517 }
509 518
510 519
511 void tasklet_init(struct tasklet_struct *t, 520 void tasklet_init(struct tasklet_struct *t,
512 void (*func)(unsigned long), unsigned long data) 521 void (*func)(unsigned long), unsigned long data)
513 { 522 {
514 t->next = NULL; 523 t->next = NULL;
515 t->state = 0; 524 t->state = 0;
516 atomic_set(&t->count, 0); 525 atomic_set(&t->count, 0);
517 t->func = func; 526 t->func = func;
518 t->data = data; 527 t->data = data;
519 } 528 }
520 529
521 EXPORT_SYMBOL(tasklet_init); 530 EXPORT_SYMBOL(tasklet_init);
522 531
523 void tasklet_kill(struct tasklet_struct *t) 532 void tasklet_kill(struct tasklet_struct *t)
524 { 533 {
525 if (in_interrupt()) 534 if (in_interrupt())
526 printk("Attempt to kill tasklet from interrupt\n"); 535 printk("Attempt to kill tasklet from interrupt\n");
527 536
528 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { 537 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
529 do { 538 do {
530 yield(); 539 yield();
531 } while (test_bit(TASKLET_STATE_SCHED, &t->state)); 540 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
532 } 541 }
533 tasklet_unlock_wait(t); 542 tasklet_unlock_wait(t);
534 clear_bit(TASKLET_STATE_SCHED, &t->state); 543 clear_bit(TASKLET_STATE_SCHED, &t->state);
535 } 544 }
536 545
537 EXPORT_SYMBOL(tasklet_kill); 546 EXPORT_SYMBOL(tasklet_kill);
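Editor's note: a minimal driver-side sketch of the tasklet API above (all my_* names are hypothetical):

	/* Sketch: defer per-interrupt work to TASKLET_SOFTIRQ context. */
	static void my_tasklet_fn(unsigned long data)
	{
		/* bottom-half work for the device identified by @data goes here */
	}

	static struct tasklet_struct my_tasklet;

	static void my_setup(void *dev)
	{
		tasklet_init(&my_tasklet, my_tasklet_fn, (unsigned long)dev);
	}

	static void my_hardirq_path(void)
	{
		tasklet_schedule(&my_tasklet);	/* queues onto this CPU's tasklet_vec */
	}

	static void my_teardown(void)
	{
		tasklet_kill(&my_tasklet);	/* never from interrupt context, see above */
	}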
538 547
539 /* 548 /*
540 * tasklet_hrtimer 549 * tasklet_hrtimer
541 */ 550 */
542 551
543 /* 552 /*
544 * The trampoline is called when the hrtimer expires. It schedules a tasklet 553 * The trampoline is called when the hrtimer expires. It schedules a tasklet
545 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended 554 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
546 * hrtimer callback, but from softirq context. 555 * hrtimer callback, but from softirq context.
547 */ 556 */
548 static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 557 static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
549 { 558 {
550 struct tasklet_hrtimer *ttimer = 559 struct tasklet_hrtimer *ttimer =
551 container_of(timer, struct tasklet_hrtimer, timer); 560 container_of(timer, struct tasklet_hrtimer, timer);
552 561
553 tasklet_hi_schedule(&ttimer->tasklet); 562 tasklet_hi_schedule(&ttimer->tasklet);
554 return HRTIMER_NORESTART; 563 return HRTIMER_NORESTART;
555 } 564 }
556 565
557 /* 566 /*
558 * Helper function which calls the hrtimer callback from 567 * Helper function which calls the hrtimer callback from
559 * tasklet/softirq context 568 * tasklet/softirq context
560 */ 569 */
561 static void __tasklet_hrtimer_trampoline(unsigned long data) 570 static void __tasklet_hrtimer_trampoline(unsigned long data)
562 { 571 {
563 struct tasklet_hrtimer *ttimer = (void *)data; 572 struct tasklet_hrtimer *ttimer = (void *)data;
564 enum hrtimer_restart restart; 573 enum hrtimer_restart restart;
565 574
566 restart = ttimer->function(&ttimer->timer); 575 restart = ttimer->function(&ttimer->timer);
567 if (restart != HRTIMER_NORESTART) 576 if (restart != HRTIMER_NORESTART)
568 hrtimer_restart(&ttimer->timer); 577 hrtimer_restart(&ttimer->timer);
569 } 578 }
570 579
571 /** 580 /**
572 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 581 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
573 * @ttimer: tasklet_hrtimer which is initialized 582 * @ttimer: tasklet_hrtimer which is initialized
574 * @function: hrtimer callback function which gets called from softirq context 583 * @function: hrtimer callback function which gets called from softirq context
575 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 584 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
576 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 585 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
577 */ 586 */
578 void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, 587 void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
579 enum hrtimer_restart (*function)(struct hrtimer *), 588 enum hrtimer_restart (*function)(struct hrtimer *),
580 clockid_t which_clock, enum hrtimer_mode mode) 589 clockid_t which_clock, enum hrtimer_mode mode)
581 { 590 {
582 hrtimer_init(&ttimer->timer, which_clock, mode); 591 hrtimer_init(&ttimer->timer, which_clock, mode);
583 ttimer->timer.function = __hrtimer_tasklet_trampoline; 592 ttimer->timer.function = __hrtimer_tasklet_trampoline;
584 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, 593 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
585 (unsigned long)ttimer); 594 (unsigned long)ttimer);
586 ttimer->function = function; 595 ttimer->function = function;
587 } 596 }
588 EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); 597 EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
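Editor's note: a hedged usage sketch (hypothetical names; tasklet_hrtimer_start() is the companion helper in include/linux/interrupt.h). It arms a callback that fires 10ms later in HI_SOFTIRQ context via the trampolines above:

	static enum hrtimer_restart my_softirq_timer_fn(struct hrtimer *timer)
	{
		/* invoked from tasklet/softirq context, not hardirq context */
		return HRTIMER_NORESTART;
	}

	static struct tasklet_hrtimer my_ttimer;

	static void my_arm_timer(void)
	{
		tasklet_hrtimer_init(&my_ttimer, my_softirq_timer_fn,
				     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		tasklet_hrtimer_start(&my_ttimer, ktime_set(0, 10 * NSEC_PER_MSEC),
				      HRTIMER_MODE_REL);
	}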
589 598
590 /* 599 /*
591 * Remote softirq bits 600 * Remote softirq bits
592 */ 601 */
593 602
594 DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 603 DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
595 EXPORT_PER_CPU_SYMBOL(softirq_work_list); 604 EXPORT_PER_CPU_SYMBOL(softirq_work_list);
596 605
597 static void __local_trigger(struct call_single_data *cp, int softirq) 606 static void __local_trigger(struct call_single_data *cp, int softirq)
598 { 607 {
599 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); 608 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
600 609
601 list_add_tail(&cp->list, head); 610 list_add_tail(&cp->list, head);
602 611
603 /* Trigger the softirq only if the list was previously empty. */ 612 /* Trigger the softirq only if the list was previously empty. */
604 if (head->next == &cp->list) 613 if (head->next == &cp->list)
605 raise_softirq_irqoff(softirq); 614 raise_softirq_irqoff(softirq);
606 } 615 }
607 616
608 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS 617 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
609 static void remote_softirq_receive(void *data) 618 static void remote_softirq_receive(void *data)
610 { 619 {
611 struct call_single_data *cp = data; 620 struct call_single_data *cp = data;
612 unsigned long flags; 621 unsigned long flags;
613 int softirq; 622 int softirq;
614 623
615 softirq = cp->priv; 624 softirq = cp->priv;
616 625
617 local_irq_save(flags); 626 local_irq_save(flags);
618 __local_trigger(cp, softirq); 627 __local_trigger(cp, softirq);
619 local_irq_restore(flags); 628 local_irq_restore(flags);
620 } 629 }
621 630
622 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 631 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
623 { 632 {
624 if (cpu_online(cpu)) { 633 if (cpu_online(cpu)) {
625 cp->func = remote_softirq_receive; 634 cp->func = remote_softirq_receive;
626 cp->info = cp; 635 cp->info = cp;
627 cp->flags = 0; 636 cp->flags = 0;
628 cp->priv = softirq; 637 cp->priv = softirq;
629 638
630 __smp_call_function_single(cpu, cp, 0); 639 __smp_call_function_single(cpu, cp, 0);
631 return 0; 640 return 0;
632 } 641 }
633 return 1; 642 return 1;
634 } 643 }
635 #else /* CONFIG_USE_GENERIC_SMP_HELPERS */ 644 #else /* CONFIG_USE_GENERIC_SMP_HELPERS */
636 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 645 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
637 { 646 {
638 return 1; 647 return 1;
639 } 648 }
640 #endif 649 #endif
641 650
642 /** 651 /**
643 * __send_remote_softirq - try to schedule softirq work on a remote cpu 652 * __send_remote_softirq - try to schedule softirq work on a remote cpu
644 * @cp: private SMP call function data area 653 * @cp: private SMP call function data area
645 * @cpu: the remote cpu 654 * @cpu: the remote cpu
646 * @this_cpu: the currently executing cpu 655 * @this_cpu: the currently executing cpu
647 * @softirq: the softirq for the work 656 * @softirq: the softirq for the work
648 * 657 *
649 * Attempt to schedule softirq work on a remote cpu. If this cannot be 658 * Attempt to schedule softirq work on a remote cpu. If this cannot be
650 * done, the work is instead queued up on the local cpu. 659 * done, the work is instead queued up on the local cpu.
651 * 660 *
652 * Interrupts must be disabled. 661 * Interrupts must be disabled.
653 */ 662 */
654 void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) 663 void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
655 { 664 {
656 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) 665 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
657 __local_trigger(cp, softirq); 666 __local_trigger(cp, softirq);
658 } 667 }
659 EXPORT_SYMBOL(__send_remote_softirq); 668 EXPORT_SYMBOL(__send_remote_softirq);
660 669
661 /** 670 /**
662 * send_remote_softirq - try to schedule softirq work on a remote cpu 671 * send_remote_softirq - try to schedule softirq work on a remote cpu
663 * @cp: private SMP call function data area 672 * @cp: private SMP call function data area
664 * @cpu: the remote cpu 673 * @cpu: the remote cpu
665 * @softirq: the softirq for the work 674 * @softirq: the softirq for the work
666 * 675 *
667 * Like __send_remote_softirq except that disabling interrupts and 676 * Like __send_remote_softirq except that disabling interrupts and
668 * computing the current cpu is done for the caller. 677 * computing the current cpu is done for the caller.
669 */ 678 */
670 void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 679 void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
671 { 680 {
672 unsigned long flags; 681 unsigned long flags;
673 int this_cpu; 682 int this_cpu;
674 683
675 local_irq_save(flags); 684 local_irq_save(flags);
676 this_cpu = smp_processor_id(); 685 this_cpu = smp_processor_id();
677 __send_remote_softirq(cp, cpu, this_cpu, softirq); 686 __send_remote_softirq(cp, cpu, this_cpu, softirq);
678 local_irq_restore(flags); 687 local_irq_restore(flags);
679 } 688 }
680 EXPORT_SYMBOL(send_remote_softirq); 689 EXPORT_SYMBOL(send_remote_softirq);
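The two exported helpers above only queue the call_single_data on the target CPU's softirq_work_list and raise the softirq there; the softirq action itself is expected to drain that per-cpu list. A hedged sketch of a client, not part of this commit: MY_SOFTIRQ, struct my_work and my_softirq_action are invented names (a real user would need its own slot in the softirq enum, registered with open_softirq(MY_SOFTIRQ, my_softirq_action)); only the call_single_data plumbing mirrors the code above.

	#include <linux/interrupt.h>
	#include <linux/smp.h>
	#include <linux/list.h>
	#include <linux/percpu.h>

	struct my_work {
		struct call_single_data csd;	/* must stay valid until the action runs */
		int payload;
	};

	/* Producer: run @w on @target_cpu, or locally if that CPU is offline. */
	static void my_queue_on(struct my_work *w, int target_cpu)
	{
		send_remote_softirq(&w->csd, target_cpu, MY_SOFTIRQ);
	}

	/* Consumer: the softirq action drains this CPU's softirq_work_list entry. */
	static void my_softirq_action(struct softirq_action *h)
	{
		LIST_HEAD(local);
		struct list_head *head;

		local_irq_disable();
		head = &__get_cpu_var(softirq_work_list[MY_SOFTIRQ]);
		list_splice_init(head, &local);
		local_irq_enable();

		while (!list_empty(&local)) {
			struct my_work *w = list_first_entry(&local, struct my_work,
							     csd.list);

			list_del(&w->csd.list);
			/* process w->payload here */
		}
	}

Only the embedded list pointers travel to the remote CPU, so the structure holding the call_single_data must not be freed or reused until the action has taken it off the list.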
681 690
682 static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, 691 static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
683 unsigned long action, void *hcpu) 692 unsigned long action, void *hcpu)
684 { 693 {
685 /* 694 /*
686 * If a CPU goes away, splice its entries to the current CPU 695 * If a CPU goes away, splice its entries to the current CPU
687 * and trigger a run of the softirq 696 * and trigger a run of the softirq
688 */ 697 */
689 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 698 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
690 int cpu = (unsigned long) hcpu; 699 int cpu = (unsigned long) hcpu;
691 int i; 700 int i;
692 701
693 local_irq_disable(); 702 local_irq_disable();
694 for (i = 0; i < NR_SOFTIRQS; i++) { 703 for (i = 0; i < NR_SOFTIRQS; i++) {
695 struct list_head *head = &per_cpu(softirq_work_list[i], cpu); 704 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
696 struct list_head *local_head; 705 struct list_head *local_head;
697 706
698 if (list_empty(head)) 707 if (list_empty(head))
699 continue; 708 continue;
700 709
701 local_head = &__get_cpu_var(softirq_work_list[i]); 710 local_head = &__get_cpu_var(softirq_work_list[i]);
702 list_splice_init(head, local_head); 711 list_splice_init(head, local_head);
703 raise_softirq_irqoff(i); 712 raise_softirq_irqoff(i);
704 } 713 }
705 local_irq_enable(); 714 local_irq_enable();
706 } 715 }
707 716
708 return NOTIFY_OK; 717 return NOTIFY_OK;
709 } 718 }
710 719
711 static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { 720 static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
712 .notifier_call = remote_softirq_cpu_notify, 721 .notifier_call = remote_softirq_cpu_notify,
713 }; 722 };
714 723
715 void __init softirq_init(void) 724 void __init softirq_init(void)
716 { 725 {
717 int cpu; 726 int cpu;
718 727
719 for_each_possible_cpu(cpu) { 728 for_each_possible_cpu(cpu) {
720 int i; 729 int i;
721 730
722 per_cpu(tasklet_vec, cpu).tail = 731 per_cpu(tasklet_vec, cpu).tail =
723 &per_cpu(tasklet_vec, cpu).head; 732 &per_cpu(tasklet_vec, cpu).head;
724 per_cpu(tasklet_hi_vec, cpu).tail = 733 per_cpu(tasklet_hi_vec, cpu).tail =
725 &per_cpu(tasklet_hi_vec, cpu).head; 734 &per_cpu(tasklet_hi_vec, cpu).head;
726 for (i = 0; i < NR_SOFTIRQS; i++) 735 for (i = 0; i < NR_SOFTIRQS; i++)
727 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); 736 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
728 } 737 }
729 738
730 register_hotcpu_notifier(&remote_softirq_cpu_notifier); 739 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
731 740
732 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 741 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
733 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 742 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
734 } 743 }
735 744
736 static int run_ksoftirqd(void * __bind_cpu) 745 static int run_ksoftirqd(void * __bind_cpu)
737 { 746 {
738 set_current_state(TASK_INTERRUPTIBLE); 747 set_current_state(TASK_INTERRUPTIBLE);
739 748
740 while (!kthread_should_stop()) { 749 while (!kthread_should_stop()) {
741 preempt_disable(); 750 preempt_disable();
742 if (!local_softirq_pending()) { 751 if (!local_softirq_pending()) {
743 schedule_preempt_disabled(); 752 schedule_preempt_disabled();
744 } 753 }
745 754
746 __set_current_state(TASK_RUNNING); 755 __set_current_state(TASK_RUNNING);
747 756
748 while (local_softirq_pending()) { 757 while (local_softirq_pending()) {
749 /* Preempt disable stops cpu going offline. 758 /* Preempt disable stops cpu going offline.
750 If already offline, we'll be on wrong CPU: 759 If already offline, we'll be on wrong CPU:
751 don't process */ 760 don't process */
752 if (cpu_is_offline((long)__bind_cpu)) 761 if (cpu_is_offline((long)__bind_cpu))
753 goto wait_to_die; 762 goto wait_to_die;
754 local_irq_disable(); 763 local_irq_disable();
755 if (local_softirq_pending()) 764 if (local_softirq_pending())
756 __do_softirq(); 765 __do_softirq();
757 local_irq_enable(); 766 local_irq_enable();
758 sched_preempt_enable_no_resched(); 767 sched_preempt_enable_no_resched();
759 cond_resched(); 768 cond_resched();
760 preempt_disable(); 769 preempt_disable();
761 rcu_note_context_switch((long)__bind_cpu); 770 rcu_note_context_switch((long)__bind_cpu);
762 } 771 }
763 preempt_enable(); 772 preempt_enable();
764 set_current_state(TASK_INTERRUPTIBLE); 773 set_current_state(TASK_INTERRUPTIBLE);
765 } 774 }
766 __set_current_state(TASK_RUNNING); 775 __set_current_state(TASK_RUNNING);
767 return 0; 776 return 0;
768 777
769 wait_to_die: 778 wait_to_die:
770 preempt_enable(); 779 preempt_enable();
771 /* Wait for kthread_stop */ 780 /* Wait for kthread_stop */
772 set_current_state(TASK_INTERRUPTIBLE); 781 set_current_state(TASK_INTERRUPTIBLE);
773 while (!kthread_should_stop()) { 782 while (!kthread_should_stop()) {
774 schedule(); 783 schedule();
775 set_current_state(TASK_INTERRUPTIBLE); 784 set_current_state(TASK_INTERRUPTIBLE);
776 } 785 }
777 __set_current_state(TASK_RUNNING); 786 __set_current_state(TASK_RUNNING);
778 return 0; 787 return 0;
779 } 788 }
780 789
781 #ifdef CONFIG_HOTPLUG_CPU 790 #ifdef CONFIG_HOTPLUG_CPU
782 /* 791 /*
783 * tasklet_kill_immediate is called to remove a tasklet which can already be 792 * tasklet_kill_immediate is called to remove a tasklet which can already be
784 * scheduled for execution on @cpu. 793 * scheduled for execution on @cpu.
785 * 794 *
786 * Unlike tasklet_kill, this function removes the tasklet 795 * Unlike tasklet_kill, this function removes the tasklet
787 * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. 796 * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
788 * 797 *
789 * When this function is called, @cpu must be in the CPU_DEAD state. 798 * When this function is called, @cpu must be in the CPU_DEAD state.
790 */ 799 */
791 void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) 800 void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
792 { 801 {
793 struct tasklet_struct **i; 802 struct tasklet_struct **i;
794 803
795 BUG_ON(cpu_online(cpu)); 804 BUG_ON(cpu_online(cpu));
796 BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); 805 BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
797 806
798 if (!test_bit(TASKLET_STATE_SCHED, &t->state)) 807 if (!test_bit(TASKLET_STATE_SCHED, &t->state))
799 return; 808 return;
800 809
801 /* CPU is dead, so no lock needed. */ 810 /* CPU is dead, so no lock needed. */
802 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { 811 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
803 if (*i == t) { 812 if (*i == t) {
804 *i = t->next; 813 *i = t->next;
805 /* If this was the tail element, move the tail ptr */ 814 /* If this was the tail element, move the tail ptr */
806 if (*i == NULL) 815 if (*i == NULL)
807 per_cpu(tasklet_vec, cpu).tail = i; 816 per_cpu(tasklet_vec, cpu).tail = i;
808 return; 817 return;
809 } 818 }
810 } 819 }
811 BUG(); 820 BUG();
812 } 821 }
813 822
814 static void takeover_tasklets(unsigned int cpu) 823 static void takeover_tasklets(unsigned int cpu)
815 { 824 {
816 /* CPU is dead, so no lock needed. */ 825 /* CPU is dead, so no lock needed. */
817 local_irq_disable(); 826 local_irq_disable();
818 827
819 /* Find end, append list for that CPU. */ 828 /* Find end, append list for that CPU. */
820 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 829 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
821 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; 830 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
822 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); 831 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
823 per_cpu(tasklet_vec, cpu).head = NULL; 832 per_cpu(tasklet_vec, cpu).head = NULL;
824 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 833 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
825 } 834 }
826 raise_softirq_irqoff(TASKLET_SOFTIRQ); 835 raise_softirq_irqoff(TASKLET_SOFTIRQ);
827 836
828 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 837 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
829 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; 838 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
830 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); 839 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
831 per_cpu(tasklet_hi_vec, cpu).head = NULL; 840 per_cpu(tasklet_hi_vec, cpu).head = NULL;
832 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 841 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
833 } 842 }
834 raise_softirq_irqoff(HI_SOFTIRQ); 843 raise_softirq_irqoff(HI_SOFTIRQ);
835 844
836 local_irq_enable(); 845 local_irq_enable();
837 } 846 }
838 #endif /* CONFIG_HOTPLUG_CPU */ 847 #endif /* CONFIG_HOTPLUG_CPU */
839 848
840 static int __cpuinit cpu_callback(struct notifier_block *nfb, 849 static int __cpuinit cpu_callback(struct notifier_block *nfb,
841 unsigned long action, 850 unsigned long action,
842 void *hcpu) 851 void *hcpu)
843 { 852 {
844 int hotcpu = (unsigned long)hcpu; 853 int hotcpu = (unsigned long)hcpu;
845 struct task_struct *p; 854 struct task_struct *p;
846 855
847 switch (action) { 856 switch (action) {
848 case CPU_UP_PREPARE: 857 case CPU_UP_PREPARE:
849 case CPU_UP_PREPARE_FROZEN: 858 case CPU_UP_PREPARE_FROZEN:
850 p = kthread_create_on_node(run_ksoftirqd, 859 p = kthread_create_on_node(run_ksoftirqd,
851 hcpu, 860 hcpu,
852 cpu_to_node(hotcpu), 861 cpu_to_node(hotcpu),
853 "ksoftirqd/%d", hotcpu); 862 "ksoftirqd/%d", hotcpu);
854 if (IS_ERR(p)) { 863 if (IS_ERR(p)) {
855 printk("ksoftirqd for %i failed\n", hotcpu); 864 printk("ksoftirqd for %i failed\n", hotcpu);
856 return notifier_from_errno(PTR_ERR(p)); 865 return notifier_from_errno(PTR_ERR(p));
857 } 866 }
858 kthread_bind(p, hotcpu); 867 kthread_bind(p, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = p; 868 per_cpu(ksoftirqd, hotcpu) = p;
860 break; 869 break;
861 case CPU_ONLINE: 870 case CPU_ONLINE:
862 case CPU_ONLINE_FROZEN: 871 case CPU_ONLINE_FROZEN:
863 wake_up_process(per_cpu(ksoftirqd, hotcpu)); 872 wake_up_process(per_cpu(ksoftirqd, hotcpu));
864 break; 873 break;
865 #ifdef CONFIG_HOTPLUG_CPU 874 #ifdef CONFIG_HOTPLUG_CPU
866 case CPU_UP_CANCELED: 875 case CPU_UP_CANCELED:
867 case CPU_UP_CANCELED_FROZEN: 876 case CPU_UP_CANCELED_FROZEN:
868 if (!per_cpu(ksoftirqd, hotcpu)) 877 if (!per_cpu(ksoftirqd, hotcpu))
869 break; 878 break;
870 /* Unbind so it can run. Fall thru. */ 879 /* Unbind so it can run. Fall thru. */
871 kthread_bind(per_cpu(ksoftirqd, hotcpu), 880 kthread_bind(per_cpu(ksoftirqd, hotcpu),
872 cpumask_any(cpu_online_mask)); 881 cpumask_any(cpu_online_mask));
873 case CPU_DEAD: 882 case CPU_DEAD:
874 case CPU_DEAD_FROZEN: { 883 case CPU_DEAD_FROZEN: {
875 static const struct sched_param param = { 884 static const struct sched_param param = {
876 .sched_priority = MAX_RT_PRIO-1 885 .sched_priority = MAX_RT_PRIO-1
877 }; 886 };
878 887
879 p = per_cpu(ksoftirqd, hotcpu); 888 p = per_cpu(ksoftirqd, hotcpu);
880 per_cpu(ksoftirqd, hotcpu) = NULL; 889 per_cpu(ksoftirqd, hotcpu) = NULL;
881 sched_setscheduler_nocheck(p, SCHED_FIFO, &param); 890 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
882 kthread_stop(p); 891 kthread_stop(p);
883 takeover_tasklets(hotcpu); 892 takeover_tasklets(hotcpu);
884 break; 893 break;
885 } 894 }
886 #endif /* CONFIG_HOTPLUG_CPU */ 895 #endif /* CONFIG_HOTPLUG_CPU */
887 } 896 }
888 return NOTIFY_OK; 897 return NOTIFY_OK;
889 } 898 }
890 899
891 static struct notifier_block __cpuinitdata cpu_nfb = { 900 static struct notifier_block __cpuinitdata cpu_nfb = {
892 .notifier_call = cpu_callback 901 .notifier_call = cpu_callback
893 }; 902 };
894 903
895 static __init int spawn_ksoftirqd(void) 904 static __init int spawn_ksoftirqd(void)
896 { 905 {
897 void *cpu = (void *)(long)smp_processor_id(); 906 void *cpu = (void *)(long)smp_processor_id();
898 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 907 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
899 908
900 BUG_ON(err != NOTIFY_OK); 909 BUG_ON(err != NOTIFY_OK);
901 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 910 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
902 register_cpu_notifier(&cpu_nfb); 911 register_cpu_notifier(&cpu_nfb);
903 return 0; 912 return 0;
904 } 913 }
905 early_initcall(spawn_ksoftirqd); 914 early_initcall(spawn_ksoftirqd);
906 915
907 /* 916 /*
908 * [ These __weak aliases are kept in a separate compilation unit, so that 917 * [ These __weak aliases are kept in a separate compilation unit, so that
909 * GCC does not inline them incorrectly. ] 918 * GCC does not inline them incorrectly. ]
910 */ 919 */
911 920
912 int __init __weak early_irq_init(void) 921 int __init __weak early_irq_init(void)
913 { 922 {
914 return 0; 923 return 0;
915 } 924 }
916 925
917 #ifdef CONFIG_GENERIC_HARDIRQS 926 #ifdef CONFIG_GENERIC_HARDIRQS
918 int __init __weak arch_probe_nr_irqs(void) 927 int __init __weak arch_probe_nr_irqs(void)
919 { 928 {
920 return NR_IRQS_LEGACY; 929 return NR_IRQS_LEGACY;
921 } 930 }
922 931
923 int __init __weak arch_early_irq_init(void) 932 int __init __weak arch_early_irq_init(void)
924 { 933 {
925 return 0; 934 return 0;
926 } 935 }
927 #endif 936 #endif
928 937
mm/page_alloc.c
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/migrate.h> 59 #include <linux/migrate.h>
60 #include <linux/page-debug-flags.h> 60 #include <linux/page-debug-flags.h>
61 61
62 #include <asm/tlbflush.h> 62 #include <asm/tlbflush.h>
63 #include <asm/div64.h> 63 #include <asm/div64.h>
64 #include "internal.h" 64 #include "internal.h"
65 65
66 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 66 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
67 DEFINE_PER_CPU(int, numa_node); 67 DEFINE_PER_CPU(int, numa_node);
68 EXPORT_PER_CPU_SYMBOL(numa_node); 68 EXPORT_PER_CPU_SYMBOL(numa_node);
69 #endif 69 #endif
70 70
71 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 71 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
72 /* 72 /*
73 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 73 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
74 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 74 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
75 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 75 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
76 * defined in <linux/topology.h>. 76 * defined in <linux/topology.h>.
77 */ 77 */
78 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 78 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
79 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 79 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
80 #endif 80 #endif
81 81
82 /* 82 /*
83 * Array of node states. 83 * Array of node states.
84 */ 84 */
85 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 85 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
86 [N_POSSIBLE] = NODE_MASK_ALL, 86 [N_POSSIBLE] = NODE_MASK_ALL,
87 [N_ONLINE] = { { [0] = 1UL } }, 87 [N_ONLINE] = { { [0] = 1UL } },
88 #ifndef CONFIG_NUMA 88 #ifndef CONFIG_NUMA
89 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 89 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
90 #ifdef CONFIG_HIGHMEM 90 #ifdef CONFIG_HIGHMEM
91 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 91 [N_HIGH_MEMORY] = { { [0] = 1UL } },
92 #endif 92 #endif
93 [N_CPU] = { { [0] = 1UL } }, 93 [N_CPU] = { { [0] = 1UL } },
94 #endif /* NUMA */ 94 #endif /* NUMA */
95 }; 95 };
96 EXPORT_SYMBOL(node_states); 96 EXPORT_SYMBOL(node_states);
97 97
98 unsigned long totalram_pages __read_mostly; 98 unsigned long totalram_pages __read_mostly;
99 unsigned long totalreserve_pages __read_mostly; 99 unsigned long totalreserve_pages __read_mostly;
100 /* 100 /*
101 * When calculating the number of globally allowed dirty pages, there 101 * When calculating the number of globally allowed dirty pages, there
102 * is a certain number of per-zone reserves that should not be 102 * is a certain number of per-zone reserves that should not be
103 * considered dirtyable memory. This is the sum of those reserves 103 * considered dirtyable memory. This is the sum of those reserves
104 * over all existing zones that contribute dirtyable memory. 104 * over all existing zones that contribute dirtyable memory.
105 */ 105 */
106 unsigned long dirty_balance_reserve __read_mostly; 106 unsigned long dirty_balance_reserve __read_mostly;
107 107
108 int percpu_pagelist_fraction; 108 int percpu_pagelist_fraction;
109 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 109 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
110 110
111 #ifdef CONFIG_PM_SLEEP 111 #ifdef CONFIG_PM_SLEEP
112 /* 112 /*
113 * The following functions are used by the suspend/hibernate code to temporarily 113 * The following functions are used by the suspend/hibernate code to temporarily
114 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 114 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
115 * while devices are suspended. To avoid races with the suspend/hibernate code, 115 * while devices are suspended. To avoid races with the suspend/hibernate code,
116 * they should always be called with pm_mutex held (gfp_allowed_mask also should 116 * they should always be called with pm_mutex held (gfp_allowed_mask also should
117 * only be modified with pm_mutex held, unless the suspend/hibernate code is 117 * only be modified with pm_mutex held, unless the suspend/hibernate code is
118 * guaranteed not to run in parallel with that modification). 118 * guaranteed not to run in parallel with that modification).
119 */ 119 */
120 120
121 static gfp_t saved_gfp_mask; 121 static gfp_t saved_gfp_mask;
122 122
123 void pm_restore_gfp_mask(void) 123 void pm_restore_gfp_mask(void)
124 { 124 {
125 WARN_ON(!mutex_is_locked(&pm_mutex)); 125 WARN_ON(!mutex_is_locked(&pm_mutex));
126 if (saved_gfp_mask) { 126 if (saved_gfp_mask) {
127 gfp_allowed_mask = saved_gfp_mask; 127 gfp_allowed_mask = saved_gfp_mask;
128 saved_gfp_mask = 0; 128 saved_gfp_mask = 0;
129 } 129 }
130 } 130 }
131 131
132 void pm_restrict_gfp_mask(void) 132 void pm_restrict_gfp_mask(void)
133 { 133 {
134 WARN_ON(!mutex_is_locked(&pm_mutex)); 134 WARN_ON(!mutex_is_locked(&pm_mutex));
135 WARN_ON(saved_gfp_mask); 135 WARN_ON(saved_gfp_mask);
136 saved_gfp_mask = gfp_allowed_mask; 136 saved_gfp_mask = gfp_allowed_mask;
137 gfp_allowed_mask &= ~GFP_IOFS; 137 gfp_allowed_mask &= ~GFP_IOFS;
138 } 138 }
139 139
140 bool pm_suspended_storage(void) 140 bool pm_suspended_storage(void)
141 { 141 {
142 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 142 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
143 return false; 143 return false;
144 return true; 144 return true;
145 } 145 }
146 #endif /* CONFIG_PM_SLEEP */ 146 #endif /* CONFIG_PM_SLEEP */
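To make the locking rule in the comment above concrete, a hedged sketch of how a caller would bracket the restrict/restore pair: my_write_image() is hypothetical, while lock_system_sleep()/unlock_system_sleep() are the existing wrappers that take and release pm_mutex.

	#include <linux/suspend.h>
	#include <linux/gfp.h>

	/* Hypothetical caller; illustrates only the required bracketing. */
	static int my_hibernate_step(int (*my_write_image)(void))
	{
		int error;

		lock_system_sleep();		/* acquires pm_mutex */
		pm_restrict_gfp_mask();		/* clears __GFP_IO and __GFP_FS in gfp_allowed_mask */
		error = my_write_image();	/* allocations here cannot recurse into I/O or the FS */
		pm_restore_gfp_mask();
		unlock_system_sleep();
		return error;
	}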
147 147
148 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 148 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
149 int pageblock_order __read_mostly; 149 int pageblock_order __read_mostly;
150 #endif 150 #endif
151 151
152 static void __free_pages_ok(struct page *page, unsigned int order); 152 static void __free_pages_ok(struct page *page, unsigned int order);
153 153
154 /* 154 /*
155 * results with 256, 32 in the lowmem_reserve sysctl: 155 * results with 256, 32 in the lowmem_reserve sysctl:
156 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 156 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
157 * 1G machine -> (16M dma, 784M normal, 224M high) 157 * 1G machine -> (16M dma, 784M normal, 224M high)
158 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 158 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
159 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 159 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
160 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 160 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
161 * 161 *
162 * TBD: should special case ZONE_DMA32 machines here - in those we normally 162 * TBD: should special case ZONE_DMA32 machines here - in those we normally
163 * don't need any ZONE_NORMAL reservation 163 * don't need any ZONE_NORMAL reservation
164 */ 164 */
165 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 165 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
166 #ifdef CONFIG_ZONE_DMA 166 #ifdef CONFIG_ZONE_DMA
167 256, 167 256,
168 #endif 168 #endif
169 #ifdef CONFIG_ZONE_DMA32 169 #ifdef CONFIG_ZONE_DMA32
170 256, 170 256,
171 #endif 171 #endif
172 #ifdef CONFIG_HIGHMEM 172 #ifdef CONFIG_HIGHMEM
173 32, 173 32,
174 #endif 174 #endif
175 32, 175 32,
176 }; 176 };
177 177
178 EXPORT_SYMBOL(totalram_pages); 178 EXPORT_SYMBOL(totalram_pages);
179 179
180 static char * const zone_names[MAX_NR_ZONES] = { 180 static char * const zone_names[MAX_NR_ZONES] = {
181 #ifdef CONFIG_ZONE_DMA 181 #ifdef CONFIG_ZONE_DMA
182 "DMA", 182 "DMA",
183 #endif 183 #endif
184 #ifdef CONFIG_ZONE_DMA32 184 #ifdef CONFIG_ZONE_DMA32
185 "DMA32", 185 "DMA32",
186 #endif 186 #endif
187 "Normal", 187 "Normal",
188 #ifdef CONFIG_HIGHMEM 188 #ifdef CONFIG_HIGHMEM
189 "HighMem", 189 "HighMem",
190 #endif 190 #endif
191 "Movable", 191 "Movable",
192 }; 192 };
193 193
194 int min_free_kbytes = 1024; 194 int min_free_kbytes = 1024;
195 195
196 static unsigned long __meminitdata nr_kernel_pages; 196 static unsigned long __meminitdata nr_kernel_pages;
197 static unsigned long __meminitdata nr_all_pages; 197 static unsigned long __meminitdata nr_all_pages;
198 static unsigned long __meminitdata dma_reserve; 198 static unsigned long __meminitdata dma_reserve;
199 199
200 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 200 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
201 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 201 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
202 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 202 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
203 static unsigned long __initdata required_kernelcore; 203 static unsigned long __initdata required_kernelcore;
204 static unsigned long __initdata required_movablecore; 204 static unsigned long __initdata required_movablecore;
205 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 205 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
206 206
207 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 207 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
208 int movable_zone; 208 int movable_zone;
209 EXPORT_SYMBOL(movable_zone); 209 EXPORT_SYMBOL(movable_zone);
210 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 210 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
211 211
212 #if MAX_NUMNODES > 1 212 #if MAX_NUMNODES > 1
213 int nr_node_ids __read_mostly = MAX_NUMNODES; 213 int nr_node_ids __read_mostly = MAX_NUMNODES;
214 int nr_online_nodes __read_mostly = 1; 214 int nr_online_nodes __read_mostly = 1;
215 EXPORT_SYMBOL(nr_node_ids); 215 EXPORT_SYMBOL(nr_node_ids);
216 EXPORT_SYMBOL(nr_online_nodes); 216 EXPORT_SYMBOL(nr_online_nodes);
217 #endif 217 #endif
218 218
219 int page_group_by_mobility_disabled __read_mostly; 219 int page_group_by_mobility_disabled __read_mostly;
220 220
221 /* 221 /*
222 * NOTE: 222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. 223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate. 224 * Instead, use {un}set_pageblock_isolate.
225 */ 225 */
226 void set_pageblock_migratetype(struct page *page, int migratetype) 226 void set_pageblock_migratetype(struct page *page, int migratetype)
227 { 227 {
228 228
229 if (unlikely(page_group_by_mobility_disabled)) 229 if (unlikely(page_group_by_mobility_disabled))
230 migratetype = MIGRATE_UNMOVABLE; 230 migratetype = MIGRATE_UNMOVABLE;
231 231
232 set_pageblock_flags_group(page, (unsigned long)migratetype, 232 set_pageblock_flags_group(page, (unsigned long)migratetype,
233 PB_migrate, PB_migrate_end); 233 PB_migrate, PB_migrate_end);
234 } 234 }
235 235
236 bool oom_killer_disabled __read_mostly; 236 bool oom_killer_disabled __read_mostly;
237 237
238 #ifdef CONFIG_DEBUG_VM 238 #ifdef CONFIG_DEBUG_VM
239 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 239 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
240 { 240 {
241 int ret = 0; 241 int ret = 0;
242 unsigned seq; 242 unsigned seq;
243 unsigned long pfn = page_to_pfn(page); 243 unsigned long pfn = page_to_pfn(page);
244 244
245 do { 245 do {
246 seq = zone_span_seqbegin(zone); 246 seq = zone_span_seqbegin(zone);
247 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 247 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
248 ret = 1; 248 ret = 1;
249 else if (pfn < zone->zone_start_pfn) 249 else if (pfn < zone->zone_start_pfn)
250 ret = 1; 250 ret = 1;
251 } while (zone_span_seqretry(zone, seq)); 251 } while (zone_span_seqretry(zone, seq));
252 252
253 return ret; 253 return ret;
254 } 254 }
255 255
256 static int page_is_consistent(struct zone *zone, struct page *page) 256 static int page_is_consistent(struct zone *zone, struct page *page)
257 { 257 {
258 if (!pfn_valid_within(page_to_pfn(page))) 258 if (!pfn_valid_within(page_to_pfn(page)))
259 return 0; 259 return 0;
260 if (zone != page_zone(page)) 260 if (zone != page_zone(page))
261 return 0; 261 return 0;
262 262
263 return 1; 263 return 1;
264 } 264 }
265 /* 265 /*
266 * Temporary debugging check for pages not lying within a given zone. 266 * Temporary debugging check for pages not lying within a given zone.
267 */ 267 */
268 static int bad_range(struct zone *zone, struct page *page) 268 static int bad_range(struct zone *zone, struct page *page)
269 { 269 {
270 if (page_outside_zone_boundaries(zone, page)) 270 if (page_outside_zone_boundaries(zone, page))
271 return 1; 271 return 1;
272 if (!page_is_consistent(zone, page)) 272 if (!page_is_consistent(zone, page))
273 return 1; 273 return 1;
274 274
275 return 0; 275 return 0;
276 } 276 }
277 #else 277 #else
278 static inline int bad_range(struct zone *zone, struct page *page) 278 static inline int bad_range(struct zone *zone, struct page *page)
279 { 279 {
280 return 0; 280 return 0;
281 } 281 }
282 #endif 282 #endif
283 283
284 static void bad_page(struct page *page) 284 static void bad_page(struct page *page)
285 { 285 {
286 static unsigned long resume; 286 static unsigned long resume;
287 static unsigned long nr_shown; 287 static unsigned long nr_shown;
288 static unsigned long nr_unshown; 288 static unsigned long nr_unshown;
289 289
290 /* Don't complain about poisoned pages */ 290 /* Don't complain about poisoned pages */
291 if (PageHWPoison(page)) { 291 if (PageHWPoison(page)) {
292 reset_page_mapcount(page); /* remove PageBuddy */ 292 reset_page_mapcount(page); /* remove PageBuddy */
293 return; 293 return;
294 } 294 }
295 295
296 /* 296 /*
297 * Allow a burst of 60 reports, then keep quiet for that minute; 297 * Allow a burst of 60 reports, then keep quiet for that minute;
298 * or allow a steady drip of one report per second. 298 * or allow a steady drip of one report per second.
299 */ 299 */
300 if (nr_shown == 60) { 300 if (nr_shown == 60) {
301 if (time_before(jiffies, resume)) { 301 if (time_before(jiffies, resume)) {
302 nr_unshown++; 302 nr_unshown++;
303 goto out; 303 goto out;
304 } 304 }
305 if (nr_unshown) { 305 if (nr_unshown) {
306 printk(KERN_ALERT 306 printk(KERN_ALERT
307 "BUG: Bad page state: %lu messages suppressed\n", 307 "BUG: Bad page state: %lu messages suppressed\n",
308 nr_unshown); 308 nr_unshown);
309 nr_unshown = 0; 309 nr_unshown = 0;
310 } 310 }
311 nr_shown = 0; 311 nr_shown = 0;
312 } 312 }
313 if (nr_shown++ == 0) 313 if (nr_shown++ == 0)
314 resume = jiffies + 60 * HZ; 314 resume = jiffies + 60 * HZ;
315 315
316 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 316 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
317 current->comm, page_to_pfn(page)); 317 current->comm, page_to_pfn(page));
318 dump_page(page); 318 dump_page(page);
319 319
320 print_modules(); 320 print_modules();
321 dump_stack(); 321 dump_stack();
322 out: 322 out:
323 /* Leave bad fields for debug, except PageBuddy could make trouble */ 323 /* Leave bad fields for debug, except PageBuddy could make trouble */
324 reset_page_mapcount(page); /* remove PageBuddy */ 324 reset_page_mapcount(page); /* remove PageBuddy */
325 add_taint(TAINT_BAD_PAGE); 325 add_taint(TAINT_BAD_PAGE);
326 } 326 }
327 327
328 /* 328 /*
329 * Higher-order pages are called "compound pages". They are structured thusly: 329 * Higher-order pages are called "compound pages". They are structured thusly:
330 * 330 *
331 * The first PAGE_SIZE page is called the "head page". 331 * The first PAGE_SIZE page is called the "head page".
332 * 332 *
333 * The remaining PAGE_SIZE pages are called "tail pages". 333 * The remaining PAGE_SIZE pages are called "tail pages".
334 * 334 *
335 * All pages have PG_compound set. All tail pages have their ->first_page 335 * All pages have PG_compound set. All tail pages have their ->first_page
336 * pointing at the head page. 336 * pointing at the head page.
337 * 337 *
338 * The first tail page's ->lru.next holds the address of the compound page's 338 * The first tail page's ->lru.next holds the address of the compound page's
339 * put_page() function. Its ->lru.prev holds the order of allocation. 339 * put_page() function. Its ->lru.prev holds the order of allocation.
340 * This usage means that zero-order pages may not be compound. 340 * This usage means that zero-order pages may not be compound.
341 */ 341 */
342 342
343 static void free_compound_page(struct page *page) 343 static void free_compound_page(struct page *page)
344 { 344 {
345 __free_pages_ok(page, compound_order(page)); 345 __free_pages_ok(page, compound_order(page));
346 } 346 }
347 347
348 void prep_compound_page(struct page *page, unsigned long order) 348 void prep_compound_page(struct page *page, unsigned long order)
349 { 349 {
350 int i; 350 int i;
351 int nr_pages = 1 << order; 351 int nr_pages = 1 << order;
352 352
353 set_compound_page_dtor(page, free_compound_page); 353 set_compound_page_dtor(page, free_compound_page);
354 set_compound_order(page, order); 354 set_compound_order(page, order);
355 __SetPageHead(page); 355 __SetPageHead(page);
356 for (i = 1; i < nr_pages; i++) { 356 for (i = 1; i < nr_pages; i++) {
357 struct page *p = page + i; 357 struct page *p = page + i;
358 __SetPageTail(p); 358 __SetPageTail(p);
359 set_page_count(p, 0); 359 set_page_count(p, 0);
360 p->first_page = page; 360 p->first_page = page;
361 } 361 }
362 } 362 }
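For example, prep_compound_page(page, 2) covers four pages: page itself becomes the head (PG_head set, the order and the destructor stored on it), while page+1 .. page+3 each get PG_tail, a zero reference count, and ->first_page pointing back at the head.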
363 363
364 /* update __split_huge_page_refcount if you change this function */ 364 /* update __split_huge_page_refcount if you change this function */
365 static int destroy_compound_page(struct page *page, unsigned long order) 365 static int destroy_compound_page(struct page *page, unsigned long order)
366 { 366 {
367 int i; 367 int i;
368 int nr_pages = 1 << order; 368 int nr_pages = 1 << order;
369 int bad = 0; 369 int bad = 0;
370 370
371 if (unlikely(compound_order(page) != order) || 371 if (unlikely(compound_order(page) != order) ||
372 unlikely(!PageHead(page))) { 372 unlikely(!PageHead(page))) {
373 bad_page(page); 373 bad_page(page);
374 bad++; 374 bad++;
375 } 375 }
376 376
377 __ClearPageHead(page); 377 __ClearPageHead(page);
378 378
379 for (i = 1; i < nr_pages; i++) { 379 for (i = 1; i < nr_pages; i++) {
380 struct page *p = page + i; 380 struct page *p = page + i;
381 381
382 if (unlikely(!PageTail(p) || (p->first_page != page))) { 382 if (unlikely(!PageTail(p) || (p->first_page != page))) {
383 bad_page(page); 383 bad_page(page);
384 bad++; 384 bad++;
385 } 385 }
386 __ClearPageTail(p); 386 __ClearPageTail(p);
387 } 387 }
388 388
389 return bad; 389 return bad;
390 } 390 }
391 391
392 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 392 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
393 { 393 {
394 int i; 394 int i;
395 395
396 /* 396 /*
397 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 397 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
398 * and __GFP_HIGHMEM from hard or soft interrupt context. 398 * and __GFP_HIGHMEM from hard or soft interrupt context.
399 */ 399 */
400 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 400 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
401 for (i = 0; i < (1 << order); i++) 401 for (i = 0; i < (1 << order); i++)
402 clear_highpage(page + i); 402 clear_highpage(page + i);
403 } 403 }
404 404
405 #ifdef CONFIG_DEBUG_PAGEALLOC 405 #ifdef CONFIG_DEBUG_PAGEALLOC
406 unsigned int _debug_guardpage_minorder; 406 unsigned int _debug_guardpage_minorder;
407 407
408 static int __init debug_guardpage_minorder_setup(char *buf) 408 static int __init debug_guardpage_minorder_setup(char *buf)
409 { 409 {
410 unsigned long res; 410 unsigned long res;
411 411
412 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 412 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
413 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 413 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
414 return 0; 414 return 0;
415 } 415 }
416 _debug_guardpage_minorder = res; 416 _debug_guardpage_minorder = res;
417 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 417 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
418 return 0; 418 return 0;
419 } 419 }
420 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 420 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
421 421
422 static inline void set_page_guard_flag(struct page *page) 422 static inline void set_page_guard_flag(struct page *page)
423 { 423 {
424 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 424 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
425 } 425 }
426 426
427 static inline void clear_page_guard_flag(struct page *page) 427 static inline void clear_page_guard_flag(struct page *page)
428 { 428 {
429 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 429 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
430 } 430 }
431 #else 431 #else
432 static inline void set_page_guard_flag(struct page *page) { } 432 static inline void set_page_guard_flag(struct page *page) { }
433 static inline void clear_page_guard_flag(struct page *page) { } 433 static inline void clear_page_guard_flag(struct page *page) { }
434 #endif 434 #endif
435 435
436 static inline void set_page_order(struct page *page, int order) 436 static inline void set_page_order(struct page *page, int order)
437 { 437 {
438 set_page_private(page, order); 438 set_page_private(page, order);
439 __SetPageBuddy(page); 439 __SetPageBuddy(page);
440 } 440 }
441 441
442 static inline void rmv_page_order(struct page *page) 442 static inline void rmv_page_order(struct page *page)
443 { 443 {
444 __ClearPageBuddy(page); 444 __ClearPageBuddy(page);
445 set_page_private(page, 0); 445 set_page_private(page, 0);
446 } 446 }
447 447
448 /* 448 /*
449 * Locate the struct page for both the matching buddy in our 449 * Locate the struct page for both the matching buddy in our
450 * pair (buddy1) and the combined O(n+1) page they form (page). 450 * pair (buddy1) and the combined O(n+1) page they form (page).
451 * 451 *
452 * 1) Any buddy B1 will have an order O twin B2 which satisfies 452 * 1) Any buddy B1 will have an order O twin B2 which satisfies
453 * the following equation: 453 * the following equation:
454 * B2 = B1 ^ (1 << O) 454 * B2 = B1 ^ (1 << O)
455 * For example, if the starting buddy (buddy2) is #8 its order 455 * For example, if the starting buddy (buddy2) is #8 its order
456 * 1 buddy is #10: 456 * 1 buddy is #10:
457 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 457 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
458 * 458 *
459 * 2) Any buddy B will have an order O+1 parent P which 459 * 2) Any buddy B will have an order O+1 parent P which
460 * satisfies the following equation: 460 * satisfies the following equation:
461 * P = B & ~(1 << O) 461 * P = B & ~(1 << O)
462 * 462 *
463 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 463 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
464 */ 464 */
465 static inline unsigned long 465 static inline unsigned long
466 __find_buddy_index(unsigned long page_idx, unsigned int order) 466 __find_buddy_index(unsigned long page_idx, unsigned int order)
467 { 467 {
468 return page_idx ^ (1 << order); 468 return page_idx ^ (1 << order);
469 } 469 }
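The index relations in the comment above can be checked with plain integer arithmetic. The following is a userspace illustration only (not kernel code); find_buddy_index() is a local copy of the helper above.

	#include <assert.h>
	#include <stdio.h>

	/* Same arithmetic as __find_buddy_index(), lifted out for illustration. */
	static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
	{
		return page_idx ^ (1UL << order);
	}

	int main(void)
	{
		unsigned long b1 = 8;
		unsigned int order = 1;
		unsigned long b2 = find_buddy_index(b1, order);	/* 8 ^ 2 = 10 */
		unsigned long parent = b1 & ~(1UL << order);	/* P = B & ~(1 << O) = 8 */

		assert(b2 == 10);
		assert(parent == (b2 & ~(1UL << order)));	/* both buddies share the parent */
		assert(parent == (b1 & b2));			/* combined_idx = buddy_idx & page_idx */
		printf("order-%u buddy of %lu is %lu; combined block starts at %lu\n",
		       order, b1, b2, parent);
		return 0;
	}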
470 470
471 /* 471 /*
472 * This function checks whether a page is free && is the buddy 472 * This function checks whether a page is free && is the buddy
473 * we can coalesce a page and its buddy if 473 * we can coalesce a page and its buddy if
474 * (a) the buddy is not in a hole && 474 * (a) the buddy is not in a hole &&
475 * (b) the buddy is in the buddy system && 475 * (b) the buddy is in the buddy system &&
476 * (c) a page and its buddy have the same order && 476 * (c) a page and its buddy have the same order &&
477 * (d) a page and its buddy are in the same zone. 477 * (d) a page and its buddy are in the same zone.
478 * 478 *
479 * For recording whether a page is in the buddy system, we set ->_mapcount -2. 479 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
480 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. 480 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
481 * 481 *
482 * For recording page's order, we use page_private(page). 482 * For recording page's order, we use page_private(page).
483 */ 483 */
484 static inline int page_is_buddy(struct page *page, struct page *buddy, 484 static inline int page_is_buddy(struct page *page, struct page *buddy,
485 int order) 485 int order)
486 { 486 {
487 if (!pfn_valid_within(page_to_pfn(buddy))) 487 if (!pfn_valid_within(page_to_pfn(buddy)))
488 return 0; 488 return 0;
489 489
490 if (page_zone_id(page) != page_zone_id(buddy)) 490 if (page_zone_id(page) != page_zone_id(buddy))
491 return 0; 491 return 0;
492 492
493 if (page_is_guard(buddy) && page_order(buddy) == order) { 493 if (page_is_guard(buddy) && page_order(buddy) == order) {
494 VM_BUG_ON(page_count(buddy) != 0); 494 VM_BUG_ON(page_count(buddy) != 0);
495 return 1; 495 return 1;
496 } 496 }
497 497
498 if (PageBuddy(buddy) && page_order(buddy) == order) { 498 if (PageBuddy(buddy) && page_order(buddy) == order) {
499 VM_BUG_ON(page_count(buddy) != 0); 499 VM_BUG_ON(page_count(buddy) != 0);
500 return 1; 500 return 1;
501 } 501 }
502 return 0; 502 return 0;
503 } 503 }
504 504
505 /* 505 /*
506 * Freeing function for a buddy system allocator. 506 * Freeing function for a buddy system allocator.
507 * 507 *
508 * The concept of a buddy system is to maintain direct-mapped table 508 * The concept of a buddy system is to maintain direct-mapped table
509 * (containing bit values) for memory blocks of various "orders". 509 * (containing bit values) for memory blocks of various "orders".
510 * The bottom level table contains the map for the smallest allocatable 510 * The bottom level table contains the map for the smallest allocatable
511 * units of memory (here, pages), and each level above it describes 511 * units of memory (here, pages), and each level above it describes
512 * pairs of units from the levels below, hence, "buddies". 512 * pairs of units from the levels below, hence, "buddies".
513 * At a high level, all that happens here is marking the table entry 513 * At a high level, all that happens here is marking the table entry
514 * at the bottom level available, and propagating the changes upward 514 * at the bottom level available, and propagating the changes upward
515 * as necessary, plus some accounting needed to play nicely with other 515 * as necessary, plus some accounting needed to play nicely with other
516 * parts of the VM system. 516 * parts of the VM system.
517 * At each level, we keep a list of pages, which are heads of continuous 517 * At each level, we keep a list of pages, which are heads of continuous
518 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 518 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
519 * order is recorded in page_private(page) field. 519 * order is recorded in page_private(page) field.
520 * So when we are allocating or freeing one, we can derive the state of the 520 * So when we are allocating or freeing one, we can derive the state of the
521 * other. That is, if we allocate a small block, and both were 521 * other. That is, if we allocate a small block, and both were
522 * free, the remainder of the region must be split into blocks. 522 * free, the remainder of the region must be split into blocks.
523 * If a block is freed, and its buddy is also free, then this 523 * If a block is freed, and its buddy is also free, then this
524 * triggers coalescing into a block of larger size. 524 * triggers coalescing into a block of larger size.
525 * 525 *
526 * -- wli 526 * -- wli
527 */ 527 */
528 528
529 static inline void __free_one_page(struct page *page, 529 static inline void __free_one_page(struct page *page,
530 struct zone *zone, unsigned int order, 530 struct zone *zone, unsigned int order,
531 int migratetype) 531 int migratetype)
532 { 532 {
533 unsigned long page_idx; 533 unsigned long page_idx;
534 unsigned long combined_idx; 534 unsigned long combined_idx;
535 unsigned long uninitialized_var(buddy_idx); 535 unsigned long uninitialized_var(buddy_idx);
536 struct page *buddy; 536 struct page *buddy;
537 537
538 if (unlikely(PageCompound(page))) 538 if (unlikely(PageCompound(page)))
539 if (unlikely(destroy_compound_page(page, order))) 539 if (unlikely(destroy_compound_page(page, order)))
540 return; 540 return;
541 541
542 VM_BUG_ON(migratetype == -1); 542 VM_BUG_ON(migratetype == -1);
543 543
544 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 544 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
545 545
546 VM_BUG_ON(page_idx & ((1 << order) - 1)); 546 VM_BUG_ON(page_idx & ((1 << order) - 1));
547 VM_BUG_ON(bad_range(zone, page)); 547 VM_BUG_ON(bad_range(zone, page));
548 548
549 while (order < MAX_ORDER-1) { 549 while (order < MAX_ORDER-1) {
550 buddy_idx = __find_buddy_index(page_idx, order); 550 buddy_idx = __find_buddy_index(page_idx, order);
551 buddy = page + (buddy_idx - page_idx); 551 buddy = page + (buddy_idx - page_idx);
552 if (!page_is_buddy(page, buddy, order)) 552 if (!page_is_buddy(page, buddy, order))
553 break; 553 break;
554 /* 554 /*
555 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 555 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
556 * merge with it and move up one order. 556 * merge with it and move up one order.
557 */ 557 */
558 if (page_is_guard(buddy)) { 558 if (page_is_guard(buddy)) {
559 clear_page_guard_flag(buddy); 559 clear_page_guard_flag(buddy);
560 set_page_private(page, 0); 560 set_page_private(page, 0);
561 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 561 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
562 } else { 562 } else {
563 list_del(&buddy->lru); 563 list_del(&buddy->lru);
564 zone->free_area[order].nr_free--; 564 zone->free_area[order].nr_free--;
565 rmv_page_order(buddy); 565 rmv_page_order(buddy);
566 } 566 }
567 combined_idx = buddy_idx & page_idx; 567 combined_idx = buddy_idx & page_idx;
568 page = page + (combined_idx - page_idx); 568 page = page + (combined_idx - page_idx);
569 page_idx = combined_idx; 569 page_idx = combined_idx;
570 order++; 570 order++;
571 } 571 }
572 set_page_order(page, order); 572 set_page_order(page, order);
573 573
574 /* 574 /*
575 * If this is not the largest possible page, check if the buddy 575 * If this is not the largest possible page, check if the buddy
576 * of the next-highest order is free. If it is, it's possible 576 * of the next-highest order is free. If it is, it's possible
577 * that pages are being freed that will coalesce soon. In case 577 * that pages are being freed that will coalesce soon. In case
578 * that is happening, add the free page to the tail of the list 578 * that is happening, add the free page to the tail of the list
579 * so it's less likely to be used soon and more likely to be merged 579 * so it's less likely to be used soon and more likely to be merged
580 * as a higher order page 580 * as a higher order page
581 */ 581 */
582 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 582 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
583 struct page *higher_page, *higher_buddy; 583 struct page *higher_page, *higher_buddy;
584 combined_idx = buddy_idx & page_idx; 584 combined_idx = buddy_idx & page_idx;
585 higher_page = page + (combined_idx - page_idx); 585 higher_page = page + (combined_idx - page_idx);
586 buddy_idx = __find_buddy_index(combined_idx, order + 1); 586 buddy_idx = __find_buddy_index(combined_idx, order + 1);
587 higher_buddy = page + (buddy_idx - combined_idx); 587 higher_buddy = page + (buddy_idx - combined_idx);
588 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 588 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
589 list_add_tail(&page->lru, 589 list_add_tail(&page->lru,
590 &zone->free_area[order].free_list[migratetype]); 590 &zone->free_area[order].free_list[migratetype]);
591 goto out; 591 goto out;
592 } 592 }
593 } 593 }
594 594
595 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 595 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
596 out: 596 out:
597 zone->free_area[order].nr_free++; 597 zone->free_area[order].nr_free++;
598 } 598 }
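As a worked trace of the merge loop above: freeing an order-0 page at index 10 whose buddy 10 ^ 1 = 11 is free merges them into an order-1 block at 10 & 11 = 10; if the order-1 block at 10 ^ 2 = 8 is also free, the next pass merges into an order-2 block at 8 & 10 = 8, and the loop keeps climbing until a buddy is found busy or order MAX_ORDER-1 is reached.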
599 599
600 /* 600 /*
601 * free_page_mlock() -- clean up attempts to free an mlocked() page. 601 * free_page_mlock() -- clean up attempts to free an mlocked() page.
602 * Page should not be on lru, so no need to fix that up. 602 * Page should not be on lru, so no need to fix that up.
603 * free_pages_check() will verify... 603 * free_pages_check() will verify...
604 */ 604 */
605 static inline void free_page_mlock(struct page *page) 605 static inline void free_page_mlock(struct page *page)
606 { 606 {
607 __dec_zone_page_state(page, NR_MLOCK); 607 __dec_zone_page_state(page, NR_MLOCK);
608 __count_vm_event(UNEVICTABLE_MLOCKFREED); 608 __count_vm_event(UNEVICTABLE_MLOCKFREED);
609 } 609 }
610 610
611 static inline int free_pages_check(struct page *page) 611 static inline int free_pages_check(struct page *page)
612 { 612 {
613 if (unlikely(page_mapcount(page) | 613 if (unlikely(page_mapcount(page) |
614 (page->mapping != NULL) | 614 (page->mapping != NULL) |
615 (atomic_read(&page->_count) != 0) | 615 (atomic_read(&page->_count) != 0) |
616 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 616 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
617 (mem_cgroup_bad_page_check(page)))) { 617 (mem_cgroup_bad_page_check(page)))) {
618 bad_page(page); 618 bad_page(page);
619 return 1; 619 return 1;
620 } 620 }
621 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 621 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
622 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 622 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
623 return 0; 623 return 0;
624 } 624 }
625 625
626 /* 626 /*
627 * Frees a number of pages from the PCP lists 627 * Frees a number of pages from the PCP lists
628 * Assumes all pages on list are in same zone, and of same order. 628 * Assumes all pages on list are in same zone, and of same order.
629 * count is the number of pages to free. 629 * count is the number of pages to free.
630 * 630 *
631 * If the zone was previously in an "all pages pinned" state then look to 631 * If the zone was previously in an "all pages pinned" state then look to
632 * see if this freeing clears that state. 632 * see if this freeing clears that state.
633 * 633 *
634 * And clear the zone's pages_scanned counter, to hold off the "all pages are 634 * And clear the zone's pages_scanned counter, to hold off the "all pages are
635 * pinned" detection logic. 635 * pinned" detection logic.
636 */ 636 */
637 static void free_pcppages_bulk(struct zone *zone, int count, 637 static void free_pcppages_bulk(struct zone *zone, int count,
638 struct per_cpu_pages *pcp) 638 struct per_cpu_pages *pcp)
639 { 639 {
640 int migratetype = 0; 640 int migratetype = 0;
641 int batch_free = 0; 641 int batch_free = 0;
642 int to_free = count; 642 int to_free = count;
643 643
644 spin_lock(&zone->lock); 644 spin_lock(&zone->lock);
645 zone->all_unreclaimable = 0; 645 zone->all_unreclaimable = 0;
646 zone->pages_scanned = 0; 646 zone->pages_scanned = 0;
647 647
648 while (to_free) { 648 while (to_free) {
649 struct page *page; 649 struct page *page;
650 struct list_head *list; 650 struct list_head *list;
651 651
652 /* 652 /*
653 * Remove pages from lists in a round-robin fashion. A 653 * Remove pages from lists in a round-robin fashion. A
654 * batch_free count is maintained that is incremented when an 654 * batch_free count is maintained that is incremented when an
655 * empty list is encountered. This is so more pages are freed 655 * empty list is encountered. This is so more pages are freed
656 * off fuller lists instead of spinning excessively around empty 656 * off fuller lists instead of spinning excessively around empty
657 * lists 657 * lists
658 */ 658 */
659 do { 659 do {
660 batch_free++; 660 batch_free++;
661 if (++migratetype == MIGRATE_PCPTYPES) 661 if (++migratetype == MIGRATE_PCPTYPES)
662 migratetype = 0; 662 migratetype = 0;
663 list = &pcp->lists[migratetype]; 663 list = &pcp->lists[migratetype];
664 } while (list_empty(list)); 664 } while (list_empty(list));
665 665
666 /* This is the only non-empty list. Free them all. */ 666 /* This is the only non-empty list. Free them all. */
667 if (batch_free == MIGRATE_PCPTYPES) 667 if (batch_free == MIGRATE_PCPTYPES)
668 batch_free = to_free; 668 batch_free = to_free;
669 669
670 do { 670 do {
671 page = list_entry(list->prev, struct page, lru); 671 page = list_entry(list->prev, struct page, lru);
672 /* must delete as __free_one_page list manipulates */ 672 /* must delete as __free_one_page list manipulates */
673 list_del(&page->lru); 673 list_del(&page->lru);
674 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 674 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
675 __free_one_page(page, zone, 0, page_private(page)); 675 __free_one_page(page, zone, 0, page_private(page));
676 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 676 trace_mm_page_pcpu_drain(page, 0, page_private(page));
677 } while (--to_free && --batch_free && !list_empty(list)); 677 } while (--to_free && --batch_free && !list_empty(list));
678 } 678 }
679 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 679 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
680 spin_unlock(&zone->lock); 680 spin_unlock(&zone->lock);
681 } 681 }
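As a rough aside, the round-robin selection in free_pcppages_bulk() above can be sketched in plain userspace C. The counters below stand in for the three pcp free lists, and the names (pcp_counts, drain_round_robin) are invented for the illustration; only the batching logic is mirrored, not the real list handling or locking.

#include <stdio.h>

#define MIGRATE_PCPTYPES 3      /* unmovable, reclaimable, movable */

/*
 * Drain 'count' pages, rotating across the per-migratetype counters the
 * way free_pcppages_bulk() rotates across pcp->lists[].  The caller must
 * not ask for more pages than the counters hold in total.
 */
static void drain_round_robin(int pcp_counts[MIGRATE_PCPTYPES], int count)
{
        int migratetype = 0;
        int batch_free = 0;

        while (count) {
                /*
                 * Advance to the next non-empty list, growing the batch
                 * size for every empty list that had to be skipped.
                 */
                do {
                        batch_free++;
                        if (++migratetype == MIGRATE_PCPTYPES)
                                migratetype = 0;
                } while (pcp_counts[migratetype] == 0);

                /* Only one list left non-empty: take everything from it. */
                if (batch_free == MIGRATE_PCPTYPES)
                        batch_free = count;

                do {
                        pcp_counts[migratetype]--;
                        printf("freed one page from list %d\n", migratetype);
                } while (--count && --batch_free && pcp_counts[migratetype]);
        }
}

int main(void)
{
        int pcp_counts[MIGRATE_PCPTYPES] = { 5, 0, 2 };

        drain_round_robin(pcp_counts, 6);
        return 0;
}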
682 682
683 static void free_one_page(struct zone *zone, struct page *page, int order, 683 static void free_one_page(struct zone *zone, struct page *page, int order,
684 int migratetype) 684 int migratetype)
685 { 685 {
686 spin_lock(&zone->lock); 686 spin_lock(&zone->lock);
687 zone->all_unreclaimable = 0; 687 zone->all_unreclaimable = 0;
688 zone->pages_scanned = 0; 688 zone->pages_scanned = 0;
689 689
690 __free_one_page(page, zone, order, migratetype); 690 __free_one_page(page, zone, order, migratetype);
691 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 691 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
692 spin_unlock(&zone->lock); 692 spin_unlock(&zone->lock);
693 } 693 }
694 694
695 static bool free_pages_prepare(struct page *page, unsigned int order) 695 static bool free_pages_prepare(struct page *page, unsigned int order)
696 { 696 {
697 int i; 697 int i;
698 int bad = 0; 698 int bad = 0;
699 699
700 trace_mm_page_free(page, order); 700 trace_mm_page_free(page, order);
701 kmemcheck_free_shadow(page, order); 701 kmemcheck_free_shadow(page, order);
702 702
703 if (PageAnon(page)) 703 if (PageAnon(page))
704 page->mapping = NULL; 704 page->mapping = NULL;
705 for (i = 0; i < (1 << order); i++) 705 for (i = 0; i < (1 << order); i++)
706 bad += free_pages_check(page + i); 706 bad += free_pages_check(page + i);
707 if (bad) 707 if (bad)
708 return false; 708 return false;
709 709
710 if (!PageHighMem(page)) { 710 if (!PageHighMem(page)) {
711 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 711 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
712 debug_check_no_obj_freed(page_address(page), 712 debug_check_no_obj_freed(page_address(page),
713 PAGE_SIZE << order); 713 PAGE_SIZE << order);
714 } 714 }
715 arch_free_page(page, order); 715 arch_free_page(page, order);
716 kernel_map_pages(page, 1 << order, 0); 716 kernel_map_pages(page, 1 << order, 0);
717 717
718 return true; 718 return true;
719 } 719 }
720 720
721 static void __free_pages_ok(struct page *page, unsigned int order) 721 static void __free_pages_ok(struct page *page, unsigned int order)
722 { 722 {
723 unsigned long flags; 723 unsigned long flags;
724 int wasMlocked = __TestClearPageMlocked(page); 724 int wasMlocked = __TestClearPageMlocked(page);
725 725
726 if (!free_pages_prepare(page, order)) 726 if (!free_pages_prepare(page, order))
727 return; 727 return;
728 728
729 local_irq_save(flags); 729 local_irq_save(flags);
730 if (unlikely(wasMlocked)) 730 if (unlikely(wasMlocked))
731 free_page_mlock(page); 731 free_page_mlock(page);
732 __count_vm_events(PGFREE, 1 << order); 732 __count_vm_events(PGFREE, 1 << order);
733 free_one_page(page_zone(page), page, order, 733 free_one_page(page_zone(page), page, order,
734 get_pageblock_migratetype(page)); 734 get_pageblock_migratetype(page));
735 local_irq_restore(flags); 735 local_irq_restore(flags);
736 } 736 }
737 737
738 void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 738 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
739 { 739 {
740 unsigned int nr_pages = 1 << order; 740 unsigned int nr_pages = 1 << order;
741 unsigned int loop; 741 unsigned int loop;
742 742
743 prefetchw(page); 743 prefetchw(page);
744 for (loop = 0; loop < nr_pages; loop++) { 744 for (loop = 0; loop < nr_pages; loop++) {
745 struct page *p = &page[loop]; 745 struct page *p = &page[loop];
746 746
747 if (loop + 1 < nr_pages) 747 if (loop + 1 < nr_pages)
748 prefetchw(p + 1); 748 prefetchw(p + 1);
749 __ClearPageReserved(p); 749 __ClearPageReserved(p);
750 set_page_count(p, 0); 750 set_page_count(p, 0);
751 } 751 }
752 752
753 set_page_refcounted(page); 753 set_page_refcounted(page);
754 __free_pages(page, order); 754 __free_pages(page, order);
755 } 755 }
756 756
757 #ifdef CONFIG_CMA 757 #ifdef CONFIG_CMA
758 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 758 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
759 void __init init_cma_reserved_pageblock(struct page *page) 759 void __init init_cma_reserved_pageblock(struct page *page)
760 { 760 {
761 unsigned i = pageblock_nr_pages; 761 unsigned i = pageblock_nr_pages;
762 struct page *p = page; 762 struct page *p = page;
763 763
764 do { 764 do {
765 __ClearPageReserved(p); 765 __ClearPageReserved(p);
766 set_page_count(p, 0); 766 set_page_count(p, 0);
767 } while (++p, --i); 767 } while (++p, --i);
768 768
769 set_page_refcounted(page); 769 set_page_refcounted(page);
770 set_pageblock_migratetype(page, MIGRATE_CMA); 770 set_pageblock_migratetype(page, MIGRATE_CMA);
771 __free_pages(page, pageblock_order); 771 __free_pages(page, pageblock_order);
772 totalram_pages += pageblock_nr_pages; 772 totalram_pages += pageblock_nr_pages;
773 } 773 }
774 #endif 774 #endif
775 775
776 /* 776 /*
777 * The order of subdivision here is critical for the IO subsystem. 777 * The order of subdivision here is critical for the IO subsystem.
778 * Please do not alter this order without good reasons and regression 778 * Please do not alter this order without good reasons and regression
779 * testing. Specifically, as large blocks of memory are subdivided, 779 * testing. Specifically, as large blocks of memory are subdivided,
780 * the order in which smaller blocks are delivered depends on the order 780 * the order in which smaller blocks are delivered depends on the order
781 * they're subdivided in this function. This is the primary factor 781 * they're subdivided in this function. This is the primary factor
782 * influencing the order in which pages are delivered to the IO 782 * influencing the order in which pages are delivered to the IO
783 * subsystem according to empirical testing, and this is also justified 783 * subsystem according to empirical testing, and this is also justified
784 * by considering the behavior of a buddy system containing a single 784 * by considering the behavior of a buddy system containing a single
785 * large block of memory acted on by a series of small allocations. 785 * large block of memory acted on by a series of small allocations.
786 * This behavior is a critical factor in sglist merging's success. 786 * This behavior is a critical factor in sglist merging's success.
787 * 787 *
788 * -- wli 788 * -- wli
789 */ 789 */
790 static inline void expand(struct zone *zone, struct page *page, 790 static inline void expand(struct zone *zone, struct page *page,
791 int low, int high, struct free_area *area, 791 int low, int high, struct free_area *area,
792 int migratetype) 792 int migratetype)
793 { 793 {
794 unsigned long size = 1 << high; 794 unsigned long size = 1 << high;
795 795
796 while (high > low) { 796 while (high > low) {
797 area--; 797 area--;
798 high--; 798 high--;
799 size >>= 1; 799 size >>= 1;
800 VM_BUG_ON(bad_range(zone, &page[size])); 800 VM_BUG_ON(bad_range(zone, &page[size]));
801 801
802 #ifdef CONFIG_DEBUG_PAGEALLOC 802 #ifdef CONFIG_DEBUG_PAGEALLOC
803 if (high < debug_guardpage_minorder()) { 803 if (high < debug_guardpage_minorder()) {
804 /* 804 /*
805 * Mark as guard pages (or page) so they can be 805 * Mark as guard pages (or page) so they can be
806 * merged back into the allocator when the buddy is freed. 806 * merged back into the allocator when the buddy is freed.
807 * Corresponding page table entries will not be touched; 807 * Corresponding page table entries will not be touched;
808 * the pages will stay not present in the virtual address space. 808 * the pages will stay not present in the virtual address space.
809 */ 809 */
810 INIT_LIST_HEAD(&page[size].lru); 810 INIT_LIST_HEAD(&page[size].lru);
811 set_page_guard_flag(&page[size]); 811 set_page_guard_flag(&page[size]);
812 set_page_private(&page[size], high); 812 set_page_private(&page[size], high);
813 /* Guard pages are not available for any usage */ 813 /* Guard pages are not available for any usage */
814 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); 814 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
815 continue; 815 continue;
816 } 816 }
817 #endif 817 #endif
818 list_add(&page[size].lru, &area->free_list[migratetype]); 818 list_add(&page[size].lru, &area->free_list[migratetype]);
819 area->nr_free++; 819 area->nr_free++;
820 set_page_order(&page[size], high); 820 set_page_order(&page[size], high);
821 } 821 }
822 } 822 }
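To see what expand() above actually hands back, here is a small standalone sketch (illustrative only; show_expand() is a made-up helper) that prints the buddy halves returned to the free lists when an order-'high' block is trimmed down to an order-'low' allocation.

#include <stdio.h>

/*
 * Print which buddy halves expand() would put back on the free lists when
 * a block of order 'high' is trimmed down to order 'low'.  Offsets are in
 * pages, relative to the start of the original block.
 */
static void show_expand(unsigned int low, unsigned int high)
{
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                printf("free half at page offset %lu, order %u (%lu pages)\n",
                       size, high, size);
        }
        printf("caller keeps pages [0..%lu), order %u\n", 1UL << low, low);
}

int main(void)
{
        /* e.g. an order-0 allocation satisfied from an order-3 block */
        show_expand(0, 3);
        return 0;
}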
823 823
824 /* 824 /*
825 * This page is about to be returned from the page allocator 825 * This page is about to be returned from the page allocator
826 */ 826 */
827 static inline int check_new_page(struct page *page) 827 static inline int check_new_page(struct page *page)
828 { 828 {
829 if (unlikely(page_mapcount(page) | 829 if (unlikely(page_mapcount(page) |
830 (page->mapping != NULL) | 830 (page->mapping != NULL) |
831 (atomic_read(&page->_count) != 0) | 831 (atomic_read(&page->_count) != 0) |
832 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 832 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
833 (mem_cgroup_bad_page_check(page)))) { 833 (mem_cgroup_bad_page_check(page)))) {
834 bad_page(page); 834 bad_page(page);
835 return 1; 835 return 1;
836 } 836 }
837 return 0; 837 return 0;
838 } 838 }
839 839
840 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 840 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
841 { 841 {
842 int i; 842 int i;
843 843
844 for (i = 0; i < (1 << order); i++) { 844 for (i = 0; i < (1 << order); i++) {
845 struct page *p = page + i; 845 struct page *p = page + i;
846 if (unlikely(check_new_page(p))) 846 if (unlikely(check_new_page(p)))
847 return 1; 847 return 1;
848 } 848 }
849 849
850 set_page_private(page, 0); 850 set_page_private(page, 0);
851 set_page_refcounted(page); 851 set_page_refcounted(page);
852 852
853 arch_alloc_page(page, order); 853 arch_alloc_page(page, order);
854 kernel_map_pages(page, 1 << order, 1); 854 kernel_map_pages(page, 1 << order, 1);
855 855
856 if (gfp_flags & __GFP_ZERO) 856 if (gfp_flags & __GFP_ZERO)
857 prep_zero_page(page, order, gfp_flags); 857 prep_zero_page(page, order, gfp_flags);
858 858
859 if (order && (gfp_flags & __GFP_COMP)) 859 if (order && (gfp_flags & __GFP_COMP))
860 prep_compound_page(page, order); 860 prep_compound_page(page, order);
861 861
862 return 0; 862 return 0;
863 } 863 }
864 864
865 /* 865 /*
866 * Go through the free lists for the given migratetype and remove 866 * Go through the free lists for the given migratetype and remove
867 * the smallest available page from the freelists 867 * the smallest available page from the freelists
868 */ 868 */
869 static inline 869 static inline
870 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 870 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
871 int migratetype) 871 int migratetype)
872 { 872 {
873 unsigned int current_order; 873 unsigned int current_order;
874 struct free_area * area; 874 struct free_area * area;
875 struct page *page; 875 struct page *page;
876 876
877 /* Find a page of the appropriate size in the preferred list */ 877 /* Find a page of the appropriate size in the preferred list */
878 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 878 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
879 area = &(zone->free_area[current_order]); 879 area = &(zone->free_area[current_order]);
880 if (list_empty(&area->free_list[migratetype])) 880 if (list_empty(&area->free_list[migratetype]))
881 continue; 881 continue;
882 882
883 page = list_entry(area->free_list[migratetype].next, 883 page = list_entry(area->free_list[migratetype].next,
884 struct page, lru); 884 struct page, lru);
885 list_del(&page->lru); 885 list_del(&page->lru);
886 rmv_page_order(page); 886 rmv_page_order(page);
887 area->nr_free--; 887 area->nr_free--;
888 expand(zone, page, order, current_order, area, migratetype); 888 expand(zone, page, order, current_order, area, migratetype);
889 return page; 889 return page;
890 } 890 }
891 891
892 return NULL; 892 return NULL;
893 } 893 }
894 894
895 895
896 /* 896 /*
897 * This array describes the order lists are fallen back to when 897 * This array describes the order lists are fallen back to when
898 * the free lists for the desirable migrate type are depleted 898 * the free lists for the desirable migrate type are depleted
899 */ 899 */
900 static int fallbacks[MIGRATE_TYPES][4] = { 900 static int fallbacks[MIGRATE_TYPES][4] = {
901 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 901 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
902 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 902 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
903 #ifdef CONFIG_CMA 903 #ifdef CONFIG_CMA
904 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 904 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
905 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 905 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
906 #else 906 #else
907 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 907 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
908 #endif 908 #endif
909 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 909 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
910 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 910 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
911 }; 911 };
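A standalone sketch of how __rmqueue_fallback() (further below) walks this table. The MIGRATE_* values are re-declared locally so the snippet compiles on its own, and CONFIG_CMA is assumed to be disabled, so the non-CMA rows apply; treat the exact enum values as assumptions, not a copy of mmzone.h.

#include <stdio.h>

/* Local re-declaration for illustration only (CMA assumed disabled). */
enum {
        MIGRATE_UNMOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_MOVABLE,
        MIGRATE_PCPTYPES,       /* number of types on the pcp lists */
        MIGRATE_RESERVE = MIGRATE_PCPTYPES,
        MIGRATE_ISOLATE,
        MIGRATE_TYPES
};

static const char *names[] = {
        "UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE", "ISOLATE",
};

static int fallbacks[MIGRATE_TYPES][4] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
        [MIGRATE_RESERVE]     = { MIGRATE_RESERVE },    /* Never used */
        [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE },    /* Never used */
};

int main(void)
{
        int start = MIGRATE_UNMOVABLE;
        int i;

        /*
         * __rmqueue_fallback() tries these lists in order until it finds a
         * free page or hits MIGRATE_RESERVE, which is handled separately.
         */
        for (i = 0; fallbacks[start][i] != MIGRATE_RESERVE; i++)
                printf("fallback %d for %s: %s\n",
                       i, names[start], names[fallbacks[start][i]]);
        return 0;
}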
912 912
913 /* 913 /*
914 * Move the free pages in a range to the free lists of the requested type. 914 * Move the free pages in a range to the free lists of the requested type.
915 * Note that start_page and end_page are not aligned on a pageblock 915 * Note that start_page and end_page are not aligned on a pageblock
916 * boundary. If alignment is required, use move_freepages_block() 916 * boundary. If alignment is required, use move_freepages_block()
917 */ 917 */
918 static int move_freepages(struct zone *zone, 918 static int move_freepages(struct zone *zone,
919 struct page *start_page, struct page *end_page, 919 struct page *start_page, struct page *end_page,
920 int migratetype) 920 int migratetype)
921 { 921 {
922 struct page *page; 922 struct page *page;
923 unsigned long order; 923 unsigned long order;
924 int pages_moved = 0; 924 int pages_moved = 0;
925 925
926 #ifndef CONFIG_HOLES_IN_ZONE 926 #ifndef CONFIG_HOLES_IN_ZONE
927 /* 927 /*
928 * page_zone is not safe to call in this context when 928 * page_zone is not safe to call in this context when
929 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 929 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
930 * anyway as we check zone boundaries in move_freepages_block(). 930 * anyway as we check zone boundaries in move_freepages_block().
931 * Remove at a later date when no bug reports exist related to 931 * Remove at a later date when no bug reports exist related to
932 * grouping pages by mobility 932 * grouping pages by mobility
933 */ 933 */
934 BUG_ON(page_zone(start_page) != page_zone(end_page)); 934 BUG_ON(page_zone(start_page) != page_zone(end_page));
935 #endif 935 #endif
936 936
937 for (page = start_page; page <= end_page;) { 937 for (page = start_page; page <= end_page;) {
938 /* Make sure we are not inadvertently changing nodes */ 938 /* Make sure we are not inadvertently changing nodes */
939 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 939 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
940 940
941 if (!pfn_valid_within(page_to_pfn(page))) { 941 if (!pfn_valid_within(page_to_pfn(page))) {
942 page++; 942 page++;
943 continue; 943 continue;
944 } 944 }
945 945
946 if (!PageBuddy(page)) { 946 if (!PageBuddy(page)) {
947 page++; 947 page++;
948 continue; 948 continue;
949 } 949 }
950 950
951 order = page_order(page); 951 order = page_order(page);
952 list_move(&page->lru, 952 list_move(&page->lru,
953 &zone->free_area[order].free_list[migratetype]); 953 &zone->free_area[order].free_list[migratetype]);
954 page += 1 << order; 954 page += 1 << order;
955 pages_moved += 1 << order; 955 pages_moved += 1 << order;
956 } 956 }
957 957
958 return pages_moved; 958 return pages_moved;
959 } 959 }
960 960
961 int move_freepages_block(struct zone *zone, struct page *page, 961 int move_freepages_block(struct zone *zone, struct page *page,
962 int migratetype) 962 int migratetype)
963 { 963 {
964 unsigned long start_pfn, end_pfn; 964 unsigned long start_pfn, end_pfn;
965 struct page *start_page, *end_page; 965 struct page *start_page, *end_page;
966 966
967 start_pfn = page_to_pfn(page); 967 start_pfn = page_to_pfn(page);
968 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 968 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
969 start_page = pfn_to_page(start_pfn); 969 start_page = pfn_to_page(start_pfn);
970 end_page = start_page + pageblock_nr_pages - 1; 970 end_page = start_page + pageblock_nr_pages - 1;
971 end_pfn = start_pfn + pageblock_nr_pages - 1; 971 end_pfn = start_pfn + pageblock_nr_pages - 1;
972 972
973 /* Do not cross zone boundaries */ 973 /* Do not cross zone boundaries */
974 if (start_pfn < zone->zone_start_pfn) 974 if (start_pfn < zone->zone_start_pfn)
975 start_page = page; 975 start_page = page;
976 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 976 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
977 return 0; 977 return 0;
978 978
979 return move_freepages(zone, start_page, end_page, migratetype); 979 return move_freepages(zone, start_page, end_page, migratetype);
980 } 980 }
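The pageblock rounding in move_freepages_block() above is plain bit arithmetic. The sketch below reproduces it in userspace, assuming order-9 pageblocks (512 pages), which is a common but not universal value.

#include <stdio.h>

/* Assumed pageblock size: order 9, i.e. 512 base pages. */
#define PAGEBLOCK_NR_PAGES 512UL

int main(void)
{
        unsigned long pfn = 262733;     /* arbitrary pfn inside some pageblock */
        unsigned long start_pfn, end_pfn;

        /*
         * Same rounding as move_freepages_block(): clear the low bits to
         * land on the first pfn of the enclosing pageblock.
         */
        start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
        end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

        printf("pfn %lu lives in pageblock [%lu, %lu]\n",
               pfn, start_pfn, end_pfn);
        return 0;
}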
981 981
982 static void change_pageblock_range(struct page *pageblock_page, 982 static void change_pageblock_range(struct page *pageblock_page,
983 int start_order, int migratetype) 983 int start_order, int migratetype)
984 { 984 {
985 int nr_pageblocks = 1 << (start_order - pageblock_order); 985 int nr_pageblocks = 1 << (start_order - pageblock_order);
986 986
987 while (nr_pageblocks--) { 987 while (nr_pageblocks--) {
988 set_pageblock_migratetype(pageblock_page, migratetype); 988 set_pageblock_migratetype(pageblock_page, migratetype);
989 pageblock_page += pageblock_nr_pages; 989 pageblock_page += pageblock_nr_pages;
990 } 990 }
991 } 991 }
992 992
993 /* Remove an element from the buddy allocator from the fallback list */ 993 /* Remove an element from the buddy allocator from the fallback list */
994 static inline struct page * 994 static inline struct page *
995 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 995 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
996 { 996 {
997 struct free_area * area; 997 struct free_area * area;
998 int current_order; 998 int current_order;
999 struct page *page; 999 struct page *page;
1000 int migratetype, i; 1000 int migratetype, i;
1001 1001
1002 /* Find the largest possible block of pages in the other list */ 1002 /* Find the largest possible block of pages in the other list */
1003 for (current_order = MAX_ORDER-1; current_order >= order; 1003 for (current_order = MAX_ORDER-1; current_order >= order;
1004 --current_order) { 1004 --current_order) {
1005 for (i = 0;; i++) { 1005 for (i = 0;; i++) {
1006 migratetype = fallbacks[start_migratetype][i]; 1006 migratetype = fallbacks[start_migratetype][i];
1007 1007
1008 /* MIGRATE_RESERVE handled later if necessary */ 1008 /* MIGRATE_RESERVE handled later if necessary */
1009 if (migratetype == MIGRATE_RESERVE) 1009 if (migratetype == MIGRATE_RESERVE)
1010 break; 1010 break;
1011 1011
1012 area = &(zone->free_area[current_order]); 1012 area = &(zone->free_area[current_order]);
1013 if (list_empty(&area->free_list[migratetype])) 1013 if (list_empty(&area->free_list[migratetype]))
1014 continue; 1014 continue;
1015 1015
1016 page = list_entry(area->free_list[migratetype].next, 1016 page = list_entry(area->free_list[migratetype].next,
1017 struct page, lru); 1017 struct page, lru);
1018 area->nr_free--; 1018 area->nr_free--;
1019 1019
1020 /* 1020 /*
1021 * If breaking a large block of pages, move all free 1021 * If breaking a large block of pages, move all free
1022 * pages to the preferred allocation list. If falling 1022 * pages to the preferred allocation list. If falling
1023 * back for a reclaimable kernel allocation, be more 1023 * back for a reclaimable kernel allocation, be more
1024 * aggressive about taking ownership of free pages 1024 * aggressive about taking ownership of free pages
1025 * 1025 *
1026 * On the other hand, never change migration 1026 * On the other hand, never change migration
1027 * type of MIGRATE_CMA pageblocks nor move CMA 1027 * type of MIGRATE_CMA pageblocks nor move CMA
1028 * pages to different free lists. We don't 1028 * pages to different free lists. We don't
1029 * want unmovable pages to be allocated from 1029 * want unmovable pages to be allocated from
1030 * MIGRATE_CMA areas. 1030 * MIGRATE_CMA areas.
1031 */ 1031 */
1032 if (!is_migrate_cma(migratetype) && 1032 if (!is_migrate_cma(migratetype) &&
1033 (unlikely(current_order >= pageblock_order / 2) || 1033 (unlikely(current_order >= pageblock_order / 2) ||
1034 start_migratetype == MIGRATE_RECLAIMABLE || 1034 start_migratetype == MIGRATE_RECLAIMABLE ||
1035 page_group_by_mobility_disabled)) { 1035 page_group_by_mobility_disabled)) {
1036 int pages; 1036 int pages;
1037 pages = move_freepages_block(zone, page, 1037 pages = move_freepages_block(zone, page,
1038 start_migratetype); 1038 start_migratetype);
1039 1039
1040 /* Claim the whole block if over half of it is free */ 1040 /* Claim the whole block if over half of it is free */
1041 if (pages >= (1 << (pageblock_order-1)) || 1041 if (pages >= (1 << (pageblock_order-1)) ||
1042 page_group_by_mobility_disabled) 1042 page_group_by_mobility_disabled)
1043 set_pageblock_migratetype(page, 1043 set_pageblock_migratetype(page,
1044 start_migratetype); 1044 start_migratetype);
1045 1045
1046 migratetype = start_migratetype; 1046 migratetype = start_migratetype;
1047 } 1047 }
1048 1048
1049 /* Remove the page from the freelists */ 1049 /* Remove the page from the freelists */
1050 list_del(&page->lru); 1050 list_del(&page->lru);
1051 rmv_page_order(page); 1051 rmv_page_order(page);
1052 1052
1053 /* Take ownership for orders >= pageblock_order */ 1053 /* Take ownership for orders >= pageblock_order */
1054 if (current_order >= pageblock_order && 1054 if (current_order >= pageblock_order &&
1055 !is_migrate_cma(migratetype)) 1055 !is_migrate_cma(migratetype))
1056 change_pageblock_range(page, current_order, 1056 change_pageblock_range(page, current_order,
1057 start_migratetype); 1057 start_migratetype);
1058 1058
1059 expand(zone, page, order, current_order, area, 1059 expand(zone, page, order, current_order, area,
1060 is_migrate_cma(migratetype) 1060 is_migrate_cma(migratetype)
1061 ? migratetype : start_migratetype); 1061 ? migratetype : start_migratetype);
1062 1062
1063 trace_mm_page_alloc_extfrag(page, order, current_order, 1063 trace_mm_page_alloc_extfrag(page, order, current_order,
1064 start_migratetype, migratetype); 1064 start_migratetype, migratetype);
1065 1065
1066 return page; 1066 return page;
1067 } 1067 }
1068 } 1068 }
1069 1069
1070 return NULL; 1070 return NULL;
1071 } 1071 }
1072 1072
1073 /* 1073 /*
1074 * Do the hard work of removing an element from the buddy allocator. 1074 * Do the hard work of removing an element from the buddy allocator.
1075 * Call me with the zone->lock already held. 1075 * Call me with the zone->lock already held.
1076 */ 1076 */
1077 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1077 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1078 int migratetype) 1078 int migratetype)
1079 { 1079 {
1080 struct page *page; 1080 struct page *page;
1081 1081
1082 retry_reserve: 1082 retry_reserve:
1083 page = __rmqueue_smallest(zone, order, migratetype); 1083 page = __rmqueue_smallest(zone, order, migratetype);
1084 1084
1085 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1085 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1086 page = __rmqueue_fallback(zone, order, migratetype); 1086 page = __rmqueue_fallback(zone, order, migratetype);
1087 1087
1088 /* 1088 /*
1089 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1089 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1090 * is used because __rmqueue_smallest is an inline function 1090 * is used because __rmqueue_smallest is an inline function
1091 * and we want just one call site 1091 * and we want just one call site
1092 */ 1092 */
1093 if (!page) { 1093 if (!page) {
1094 migratetype = MIGRATE_RESERVE; 1094 migratetype = MIGRATE_RESERVE;
1095 goto retry_reserve; 1095 goto retry_reserve;
1096 } 1096 }
1097 } 1097 }
1098 1098
1099 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1099 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1100 return page; 1100 return page;
1101 } 1101 }
1102 1102
1103 /* 1103 /*
1104 * Obtain a specified number of elements from the buddy allocator, all under 1104 * Obtain a specified number of elements from the buddy allocator, all under
1105 * a single hold of the lock, for efficiency. Add them to the supplied list. 1105 * a single hold of the lock, for efficiency. Add them to the supplied list.
1106 * Returns the number of new pages which were placed at *list. 1106 * Returns the number of new pages which were placed at *list.
1107 */ 1107 */
1108 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1108 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1109 unsigned long count, struct list_head *list, 1109 unsigned long count, struct list_head *list,
1110 int migratetype, int cold) 1110 int migratetype, int cold)
1111 { 1111 {
1112 int mt = migratetype, i; 1112 int mt = migratetype, i;
1113 1113
1114 spin_lock(&zone->lock); 1114 spin_lock(&zone->lock);
1115 for (i = 0; i < count; ++i) { 1115 for (i = 0; i < count; ++i) {
1116 struct page *page = __rmqueue(zone, order, migratetype); 1116 struct page *page = __rmqueue(zone, order, migratetype);
1117 if (unlikely(page == NULL)) 1117 if (unlikely(page == NULL))
1118 break; 1118 break;
1119 1119
1120 /* 1120 /*
1121 * Split buddy pages returned by expand() are received here 1121 * Split buddy pages returned by expand() are received here
1122 * in physical page order. The page is added to the caller's 1122 * in physical page order. The page is added to the caller's
1123 * list and the list head then moves forward. From the caller's 1123 * list and the list head then moves forward. From the caller's
1124 * perspective, the linked list is ordered by page number in 1124 * perspective, the linked list is ordered by page number in
1125 * some conditions. This is useful for IO devices that can 1125 * some conditions. This is useful for IO devices that can
1126 * merge IO requests if the physical pages are ordered 1126 * merge IO requests if the physical pages are ordered
1127 * properly. 1127 * properly.
1128 */ 1128 */
1129 if (likely(cold == 0)) 1129 if (likely(cold == 0))
1130 list_add(&page->lru, list); 1130 list_add(&page->lru, list);
1131 else 1131 else
1132 list_add_tail(&page->lru, list); 1132 list_add_tail(&page->lru, list);
1133 if (IS_ENABLED(CONFIG_CMA)) { 1133 if (IS_ENABLED(CONFIG_CMA)) {
1134 mt = get_pageblock_migratetype(page); 1134 mt = get_pageblock_migratetype(page);
1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1136 mt = migratetype; 1136 mt = migratetype;
1137 } 1137 }
1138 set_page_private(page, mt); 1138 set_page_private(page, mt);
1139 list = &page->lru; 1139 list = &page->lru;
1140 } 1140 }
1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1142 spin_unlock(&zone->lock); 1142 spin_unlock(&zone->lock);
1143 return i; 1143 return i;
1144 } 1144 }
1145 1145
1146 #ifdef CONFIG_NUMA 1146 #ifdef CONFIG_NUMA
1147 /* 1147 /*
1148 * Called from the vmstat counter updater to drain pagesets of this 1148 * Called from the vmstat counter updater to drain pagesets of this
1149 * currently executing processor on remote nodes after they have 1149 * currently executing processor on remote nodes after they have
1150 * expired. 1150 * expired.
1151 * 1151 *
1152 * Note that this function must be called with the thread pinned to 1152 * Note that this function must be called with the thread pinned to
1153 * a single processor. 1153 * a single processor.
1154 */ 1154 */
1155 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1155 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1156 { 1156 {
1157 unsigned long flags; 1157 unsigned long flags;
1158 int to_drain; 1158 int to_drain;
1159 1159
1160 local_irq_save(flags); 1160 local_irq_save(flags);
1161 if (pcp->count >= pcp->batch) 1161 if (pcp->count >= pcp->batch)
1162 to_drain = pcp->batch; 1162 to_drain = pcp->batch;
1163 else 1163 else
1164 to_drain = pcp->count; 1164 to_drain = pcp->count;
1165 if (to_drain > 0) { 1165 if (to_drain > 0) {
1166 free_pcppages_bulk(zone, to_drain, pcp); 1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain; 1167 pcp->count -= to_drain;
1168 } 1168 }
1169 local_irq_restore(flags); 1169 local_irq_restore(flags);
1170 } 1170 }
1171 #endif 1171 #endif
1172 1172
1173 /* 1173 /*
1174 * Drain pages of the indicated processor. 1174 * Drain pages of the indicated processor.
1175 * 1175 *
1176 * The processor must either be the current processor and the 1176 * The processor must either be the current processor and the
1177 * thread pinned to the current processor or a processor that 1177 * thread pinned to the current processor or a processor that
1178 * is not online. 1178 * is not online.
1179 */ 1179 */
1180 static void drain_pages(unsigned int cpu) 1180 static void drain_pages(unsigned int cpu)
1181 { 1181 {
1182 unsigned long flags; 1182 unsigned long flags;
1183 struct zone *zone; 1183 struct zone *zone;
1184 1184
1185 for_each_populated_zone(zone) { 1185 for_each_populated_zone(zone) {
1186 struct per_cpu_pageset *pset; 1186 struct per_cpu_pageset *pset;
1187 struct per_cpu_pages *pcp; 1187 struct per_cpu_pages *pcp;
1188 1188
1189 local_irq_save(flags); 1189 local_irq_save(flags);
1190 pset = per_cpu_ptr(zone->pageset, cpu); 1190 pset = per_cpu_ptr(zone->pageset, cpu);
1191 1191
1192 pcp = &pset->pcp; 1192 pcp = &pset->pcp;
1193 if (pcp->count) { 1193 if (pcp->count) {
1194 free_pcppages_bulk(zone, pcp->count, pcp); 1194 free_pcppages_bulk(zone, pcp->count, pcp);
1195 pcp->count = 0; 1195 pcp->count = 0;
1196 } 1196 }
1197 local_irq_restore(flags); 1197 local_irq_restore(flags);
1198 } 1198 }
1199 } 1199 }
1200 1200
1201 /* 1201 /*
1202 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1202 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1203 */ 1203 */
1204 void drain_local_pages(void *arg) 1204 void drain_local_pages(void *arg)
1205 { 1205 {
1206 drain_pages(smp_processor_id()); 1206 drain_pages(smp_processor_id());
1207 } 1207 }
1208 1208
1209 /* 1209 /*
1210 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1210 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1211 * 1211 *
1212 * Note that this code is protected against sending an IPI to an offline 1212 * Note that this code is protected against sending an IPI to an offline
1213 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1213 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1214 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1214 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1215 * nothing keeps CPUs from showing up after we populated the cpumask and 1215 * nothing keeps CPUs from showing up after we populated the cpumask and
1216 * before the call to on_each_cpu_mask(). 1216 * before the call to on_each_cpu_mask().
1217 */ 1217 */
1218 void drain_all_pages(void) 1218 void drain_all_pages(void)
1219 { 1219 {
1220 int cpu; 1220 int cpu;
1221 struct per_cpu_pageset *pcp; 1221 struct per_cpu_pageset *pcp;
1222 struct zone *zone; 1222 struct zone *zone;
1223 1223
1224 /* 1224 /*
1225 * Allocate in the BSS so we won't require allocation in the 1225 * Allocate in the BSS so we won't require allocation in the
1226 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1226 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1227 */ 1227 */
1228 static cpumask_t cpus_with_pcps; 1228 static cpumask_t cpus_with_pcps;
1229 1229
1230 /* 1230 /*
1231 * We don't care about racing with CPU hotplug events, 1231 * We don't care about racing with CPU hotplug events,
1232 * as the offline notification will cause the notified 1232 * as the offline notification will cause the notified
1233 * CPU to drain its own pcps, and on_each_cpu_mask() 1233 * CPU to drain its own pcps, and on_each_cpu_mask()
1234 * disables preemption as part of its processing. 1234 * disables preemption as part of its processing.
1235 */ 1235 */
1236 for_each_online_cpu(cpu) { 1236 for_each_online_cpu(cpu) {
1237 bool has_pcps = false; 1237 bool has_pcps = false;
1238 for_each_populated_zone(zone) { 1238 for_each_populated_zone(zone) {
1239 pcp = per_cpu_ptr(zone->pageset, cpu); 1239 pcp = per_cpu_ptr(zone->pageset, cpu);
1240 if (pcp->pcp.count) { 1240 if (pcp->pcp.count) {
1241 has_pcps = true; 1241 has_pcps = true;
1242 break; 1242 break;
1243 } 1243 }
1244 } 1244 }
1245 if (has_pcps) 1245 if (has_pcps)
1246 cpumask_set_cpu(cpu, &cpus_with_pcps); 1246 cpumask_set_cpu(cpu, &cpus_with_pcps);
1247 else 1247 else
1248 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1248 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1249 } 1249 }
1250 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1250 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1251 } 1251 }
1252 1252
1253 #ifdef CONFIG_HIBERNATION 1253 #ifdef CONFIG_HIBERNATION
1254 1254
1255 void mark_free_pages(struct zone *zone) 1255 void mark_free_pages(struct zone *zone)
1256 { 1256 {
1257 unsigned long pfn, max_zone_pfn; 1257 unsigned long pfn, max_zone_pfn;
1258 unsigned long flags; 1258 unsigned long flags;
1259 int order, t; 1259 int order, t;
1260 struct list_head *curr; 1260 struct list_head *curr;
1261 1261
1262 if (!zone->spanned_pages) 1262 if (!zone->spanned_pages)
1263 return; 1263 return;
1264 1264
1265 spin_lock_irqsave(&zone->lock, flags); 1265 spin_lock_irqsave(&zone->lock, flags);
1266 1266
1267 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1267 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1268 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1268 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1269 if (pfn_valid(pfn)) { 1269 if (pfn_valid(pfn)) {
1270 struct page *page = pfn_to_page(pfn); 1270 struct page *page = pfn_to_page(pfn);
1271 1271
1272 if (!swsusp_page_is_forbidden(page)) 1272 if (!swsusp_page_is_forbidden(page))
1273 swsusp_unset_page_free(page); 1273 swsusp_unset_page_free(page);
1274 } 1274 }
1275 1275
1276 for_each_migratetype_order(order, t) { 1276 for_each_migratetype_order(order, t) {
1277 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1277 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1278 unsigned long i; 1278 unsigned long i;
1279 1279
1280 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1280 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1281 for (i = 0; i < (1UL << order); i++) 1281 for (i = 0; i < (1UL << order); i++)
1282 swsusp_set_page_free(pfn_to_page(pfn + i)); 1282 swsusp_set_page_free(pfn_to_page(pfn + i));
1283 } 1283 }
1284 } 1284 }
1285 spin_unlock_irqrestore(&zone->lock, flags); 1285 spin_unlock_irqrestore(&zone->lock, flags);
1286 } 1286 }
1287 #endif /* CONFIG_HIBERNATION */ 1287 #endif /* CONFIG_HIBERNATION */
1288 1288
1289 /* 1289 /*
1290 * Free a 0-order page 1290 * Free a 0-order page
1291 * cold == 1 ? free a cold page : free a hot page 1291 * cold == 1 ? free a cold page : free a hot page
1292 */ 1292 */
1293 void free_hot_cold_page(struct page *page, int cold) 1293 void free_hot_cold_page(struct page *page, int cold)
1294 { 1294 {
1295 struct zone *zone = page_zone(page); 1295 struct zone *zone = page_zone(page);
1296 struct per_cpu_pages *pcp; 1296 struct per_cpu_pages *pcp;
1297 unsigned long flags; 1297 unsigned long flags;
1298 int migratetype; 1298 int migratetype;
1299 int wasMlocked = __TestClearPageMlocked(page); 1299 int wasMlocked = __TestClearPageMlocked(page);
1300 1300
1301 if (!free_pages_prepare(page, 0)) 1301 if (!free_pages_prepare(page, 0))
1302 return; 1302 return;
1303 1303
1304 migratetype = get_pageblock_migratetype(page); 1304 migratetype = get_pageblock_migratetype(page);
1305 set_page_private(page, migratetype); 1305 set_page_private(page, migratetype);
1306 local_irq_save(flags); 1306 local_irq_save(flags);
1307 if (unlikely(wasMlocked)) 1307 if (unlikely(wasMlocked))
1308 free_page_mlock(page); 1308 free_page_mlock(page);
1309 __count_vm_event(PGFREE); 1309 __count_vm_event(PGFREE);
1310 1310
1311 /* 1311 /*
1312 * We only track unmovable, reclaimable and movable on pcp lists. 1312 * We only track unmovable, reclaimable and movable on pcp lists.
1313 * Free ISOLATE pages back to the allocator because they are being 1313 * Free ISOLATE pages back to the allocator because they are being
1314 * offlined but treat RESERVE as movable pages so we can get those 1314 * offlined but treat RESERVE as movable pages so we can get those
1315 * areas back if necessary. Otherwise, we may have to free 1315 * areas back if necessary. Otherwise, we may have to free
1316 * excessively into the page allocator 1316 * excessively into the page allocator
1317 */ 1317 */
1318 if (migratetype >= MIGRATE_PCPTYPES) { 1318 if (migratetype >= MIGRATE_PCPTYPES) {
1319 if (unlikely(migratetype == MIGRATE_ISOLATE)) { 1319 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1320 free_one_page(zone, page, 0, migratetype); 1320 free_one_page(zone, page, 0, migratetype);
1321 goto out; 1321 goto out;
1322 } 1322 }
1323 migratetype = MIGRATE_MOVABLE; 1323 migratetype = MIGRATE_MOVABLE;
1324 } 1324 }
1325 1325
1326 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1326 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1327 if (cold) 1327 if (cold)
1328 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1328 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1329 else 1329 else
1330 list_add(&page->lru, &pcp->lists[migratetype]); 1330 list_add(&page->lru, &pcp->lists[migratetype]);
1331 pcp->count++; 1331 pcp->count++;
1332 if (pcp->count >= pcp->high) { 1332 if (pcp->count >= pcp->high) {
1333 free_pcppages_bulk(zone, pcp->batch, pcp); 1333 free_pcppages_bulk(zone, pcp->batch, pcp);
1334 pcp->count -= pcp->batch; 1334 pcp->count -= pcp->batch;
1335 } 1335 }
1336 1336
1337 out: 1337 out:
1338 local_irq_restore(flags); 1338 local_irq_restore(flags);
1339 } 1339 }
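The high/batch trimming at the end of free_hot_cold_page() above can be shown with a toy simulation; struct pcp_sim and its numbers are invented for the example and merely stand in for the real per-cpu pageset.

#include <stdio.h>

/*
 * Hypothetical per-cpu list bookkeeping mirroring the trim in
 * free_hot_cold_page(): once the list grows to 'high', a whole 'batch'
 * is handed back to the buddy allocator in one go.
 */
struct pcp_sim {
        int count;
        int high;
        int batch;
};

static void sim_free_one(struct pcp_sim *pcp)
{
        pcp->count++;
        if (pcp->count >= pcp->high) {
                printf("count hit %d: returning a batch of %d pages\n",
                       pcp->count, pcp->batch);
                pcp->count -= pcp->batch;
        }
}

int main(void)
{
        struct pcp_sim pcp = { .count = 0, .high = 6, .batch = 2 };
        int i;

        for (i = 0; i < 10; i++)
                sim_free_one(&pcp);
        printf("final count: %d\n", pcp.count);
        return 0;
}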
1340 1340
1341 /* 1341 /*
1342 * Free a list of 0-order pages 1342 * Free a list of 0-order pages
1343 */ 1343 */
1344 void free_hot_cold_page_list(struct list_head *list, int cold) 1344 void free_hot_cold_page_list(struct list_head *list, int cold)
1345 { 1345 {
1346 struct page *page, *next; 1346 struct page *page, *next;
1347 1347
1348 list_for_each_entry_safe(page, next, list, lru) { 1348 list_for_each_entry_safe(page, next, list, lru) {
1349 trace_mm_page_free_batched(page, cold); 1349 trace_mm_page_free_batched(page, cold);
1350 free_hot_cold_page(page, cold); 1350 free_hot_cold_page(page, cold);
1351 } 1351 }
1352 } 1352 }
1353 1353
1354 /* 1354 /*
1355 * split_page takes a non-compound higher-order page, and splits it into 1355 * split_page takes a non-compound higher-order page, and splits it into
1356 * n (1<<order) sub-pages: page[0..n-1]. 1356 * n (1<<order) sub-pages: page[0..n-1].
1357 * Each sub-page must be freed individually. 1357 * Each sub-page must be freed individually.
1358 * 1358 *
1359 * Note: this is probably too low level an operation for use in drivers. 1359 * Note: this is probably too low level an operation for use in drivers.
1360 * Please consult with lkml before using this in your driver. 1360 * Please consult with lkml before using this in your driver.
1361 */ 1361 */
1362 void split_page(struct page *page, unsigned int order) 1362 void split_page(struct page *page, unsigned int order)
1363 { 1363 {
1364 int i; 1364 int i;
1365 1365
1366 VM_BUG_ON(PageCompound(page)); 1366 VM_BUG_ON(PageCompound(page));
1367 VM_BUG_ON(!page_count(page)); 1367 VM_BUG_ON(!page_count(page));
1368 1368
1369 #ifdef CONFIG_KMEMCHECK 1369 #ifdef CONFIG_KMEMCHECK
1370 /* 1370 /*
1371 * Split shadow pages too, because free(page[0]) would 1371 * Split shadow pages too, because free(page[0]) would
1372 * otherwise free the whole shadow. 1372 * otherwise free the whole shadow.
1373 */ 1373 */
1374 if (kmemcheck_page_is_tracked(page)) 1374 if (kmemcheck_page_is_tracked(page))
1375 split_page(virt_to_page(page[0].shadow), order); 1375 split_page(virt_to_page(page[0].shadow), order);
1376 #endif 1376 #endif
1377 1377
1378 for (i = 1; i < (1 << order); i++) 1378 for (i = 1; i < (1 << order); i++)
1379 set_page_refcounted(page + i); 1379 set_page_refcounted(page + i);
1380 } 1380 }
1381 1381
1382 /* 1382 /*
1383 * Similar to split_page except the page is already free. As this is only 1383 * Similar to split_page except the page is already free. As this is only
1384 * being used for migration, the migratetype of the block also changes. 1384 * being used for migration, the migratetype of the block also changes.
1385 * As this is called with interrupts disabled, the caller is responsible 1385 * As this is called with interrupts disabled, the caller is responsible
1386 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1386 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1387 * are enabled. 1387 * are enabled.
1388 * 1388 *
1389 * Note: this is probably too low level an operation for use in drivers. 1389 * Note: this is probably too low level an operation for use in drivers.
1390 * Please consult with lkml before using this in your driver. 1390 * Please consult with lkml before using this in your driver.
1391 */ 1391 */
1392 int split_free_page(struct page *page) 1392 int split_free_page(struct page *page)
1393 { 1393 {
1394 unsigned int order; 1394 unsigned int order;
1395 unsigned long watermark; 1395 unsigned long watermark;
1396 struct zone *zone; 1396 struct zone *zone;
1397 1397
1398 BUG_ON(!PageBuddy(page)); 1398 BUG_ON(!PageBuddy(page));
1399 1399
1400 zone = page_zone(page); 1400 zone = page_zone(page);
1401 order = page_order(page); 1401 order = page_order(page);
1402 1402
1403 /* Obey watermarks as if the page was being allocated */ 1403 /* Obey watermarks as if the page was being allocated */
1404 watermark = low_wmark_pages(zone) + (1 << order); 1404 watermark = low_wmark_pages(zone) + (1 << order);
1405 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1405 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1406 return 0; 1406 return 0;
1407 1407
1408 /* Remove page from free list */ 1408 /* Remove page from free list */
1409 list_del(&page->lru); 1409 list_del(&page->lru);
1410 zone->free_area[order].nr_free--; 1410 zone->free_area[order].nr_free--;
1411 rmv_page_order(page); 1411 rmv_page_order(page);
1412 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); 1412 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1413 1413
1414 /* Split into individual pages */ 1414 /* Split into individual pages */
1415 set_page_refcounted(page); 1415 set_page_refcounted(page);
1416 split_page(page, order); 1416 split_page(page, order);
1417 1417
1418 if (order >= pageblock_order - 1) { 1418 if (order >= pageblock_order - 1) {
1419 struct page *endpage = page + (1 << order) - 1; 1419 struct page *endpage = page + (1 << order) - 1;
1420 for (; page < endpage; page += pageblock_nr_pages) { 1420 for (; page < endpage; page += pageblock_nr_pages) {
1421 int mt = get_pageblock_migratetype(page); 1421 int mt = get_pageblock_migratetype(page);
1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) 1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
1423 set_pageblock_migratetype(page, 1423 set_pageblock_migratetype(page,
1424 MIGRATE_MOVABLE); 1424 MIGRATE_MOVABLE);
1425 } 1425 }
1426 } 1426 }
1427 1427
1428 return 1 << order; 1428 return 1 << order;
1429 } 1429 }
1430 1430
1431 /* 1431 /*
1432 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1432 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1433 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1433 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1434 * or two. 1434 * or two.
1435 */ 1435 */
1436 static inline 1436 static inline
1437 struct page *buffered_rmqueue(struct zone *preferred_zone, 1437 struct page *buffered_rmqueue(struct zone *preferred_zone,
1438 struct zone *zone, int order, gfp_t gfp_flags, 1438 struct zone *zone, int order, gfp_t gfp_flags,
1439 int migratetype) 1439 int migratetype)
1440 { 1440 {
1441 unsigned long flags; 1441 unsigned long flags;
1442 struct page *page; 1442 struct page *page;
1443 int cold = !!(gfp_flags & __GFP_COLD); 1443 int cold = !!(gfp_flags & __GFP_COLD);
1444 1444
1445 again: 1445 again:
1446 if (likely(order == 0)) { 1446 if (likely(order == 0)) {
1447 struct per_cpu_pages *pcp; 1447 struct per_cpu_pages *pcp;
1448 struct list_head *list; 1448 struct list_head *list;
1449 1449
1450 local_irq_save(flags); 1450 local_irq_save(flags);
1451 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1451 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1452 list = &pcp->lists[migratetype]; 1452 list = &pcp->lists[migratetype];
1453 if (list_empty(list)) { 1453 if (list_empty(list)) {
1454 pcp->count += rmqueue_bulk(zone, 0, 1454 pcp->count += rmqueue_bulk(zone, 0,
1455 pcp->batch, list, 1455 pcp->batch, list,
1456 migratetype, cold); 1456 migratetype, cold);
1457 if (unlikely(list_empty(list))) 1457 if (unlikely(list_empty(list)))
1458 goto failed; 1458 goto failed;
1459 } 1459 }
1460 1460
1461 if (cold) 1461 if (cold)
1462 page = list_entry(list->prev, struct page, lru); 1462 page = list_entry(list->prev, struct page, lru);
1463 else 1463 else
1464 page = list_entry(list->next, struct page, lru); 1464 page = list_entry(list->next, struct page, lru);
1465 1465
1466 list_del(&page->lru); 1466 list_del(&page->lru);
1467 pcp->count--; 1467 pcp->count--;
1468 } else { 1468 } else {
1469 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1469 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1470 /* 1470 /*
1471 * __GFP_NOFAIL is not to be used in new code. 1471 * __GFP_NOFAIL is not to be used in new code.
1472 * 1472 *
1473 * All __GFP_NOFAIL callers should be fixed so that they 1473 * All __GFP_NOFAIL callers should be fixed so that they
1474 * properly detect and handle allocation failures. 1474 * properly detect and handle allocation failures.
1475 * 1475 *
1476 * We most definitely don't want callers attempting to 1476 * We most definitely don't want callers attempting to
1477 * allocate greater than order-1 page units with 1477 * allocate greater than order-1 page units with
1478 * __GFP_NOFAIL. 1478 * __GFP_NOFAIL.
1479 */ 1479 */
1480 WARN_ON_ONCE(order > 1); 1480 WARN_ON_ONCE(order > 1);
1481 } 1481 }
1482 spin_lock_irqsave(&zone->lock, flags); 1482 spin_lock_irqsave(&zone->lock, flags);
1483 page = __rmqueue(zone, order, migratetype); 1483 page = __rmqueue(zone, order, migratetype);
1484 spin_unlock(&zone->lock); 1484 spin_unlock(&zone->lock);
1485 if (!page) 1485 if (!page)
1486 goto failed; 1486 goto failed;
1487 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1487 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1488 } 1488 }
1489 1489
1490 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1490 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1491 zone_statistics(preferred_zone, zone, gfp_flags); 1491 zone_statistics(preferred_zone, zone, gfp_flags);
1492 local_irq_restore(flags); 1492 local_irq_restore(flags);
1493 1493
1494 VM_BUG_ON(bad_range(zone, page)); 1494 VM_BUG_ON(bad_range(zone, page));
1495 if (prep_new_page(page, order, gfp_flags)) 1495 if (prep_new_page(page, order, gfp_flags))
1496 goto again; 1496 goto again;
1497 return page; 1497 return page;
1498 1498
1499 failed: 1499 failed:
1500 local_irq_restore(flags); 1500 local_irq_restore(flags);
1501 return NULL; 1501 return NULL;
1502 } 1502 }
1503 1503
1504 /* The ALLOC_WMARK bits are used as an index to zone->watermark */ 1504 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
1505 #define ALLOC_WMARK_MIN WMARK_MIN 1505 #define ALLOC_WMARK_MIN WMARK_MIN
1506 #define ALLOC_WMARK_LOW WMARK_LOW 1506 #define ALLOC_WMARK_LOW WMARK_LOW
1507 #define ALLOC_WMARK_HIGH WMARK_HIGH 1507 #define ALLOC_WMARK_HIGH WMARK_HIGH
1508 #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ 1508 #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1509 1509
1510 /* Mask to get the watermark bits */ 1510 /* Mask to get the watermark bits */
1511 #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) 1511 #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1512 1512
1513 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 1513 #define ALLOC_HARDER 0x10 /* try to alloc harder */
1514 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1514 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1515 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1515 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1516 1516
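A minimal standalone illustration of how these bits are combined and decoded. The WMARK_* values are re-declared here on the assumption that they match enum zone_watermarks in mmzone.h, and the ALLOC_* macros are copied from the definitions above so the snippet compiles on its own.

#include <stdio.h>

enum { WMARK_MIN, WMARK_LOW, WMARK_HIGH };      /* assumed ordering */

#define ALLOC_WMARK_MIN         WMARK_MIN
#define ALLOC_WMARK_LOW         WMARK_LOW
#define ALLOC_WMARK_HIGH        WMARK_HIGH
#define ALLOC_NO_WATERMARKS     0x04
#define ALLOC_WMARK_MASK        (ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HARDER            0x10
#define ALLOC_HIGH              0x20

int main(void)
{
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HIGH;

        /* The low bits index zone->watermark[]; the rest are behaviour bits. */
        printf("watermark index: %d\n", alloc_flags & ALLOC_WMARK_MASK);
        printf("alloc harder?    %d\n", !!(alloc_flags & ALLOC_HARDER));
        return 0;
}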
1517 #ifdef CONFIG_FAIL_PAGE_ALLOC 1517 #ifdef CONFIG_FAIL_PAGE_ALLOC
1518 1518
1519 static struct { 1519 static struct {
1520 struct fault_attr attr; 1520 struct fault_attr attr;
1521 1521
1522 u32 ignore_gfp_highmem; 1522 u32 ignore_gfp_highmem;
1523 u32 ignore_gfp_wait; 1523 u32 ignore_gfp_wait;
1524 u32 min_order; 1524 u32 min_order;
1525 } fail_page_alloc = { 1525 } fail_page_alloc = {
1526 .attr = FAULT_ATTR_INITIALIZER, 1526 .attr = FAULT_ATTR_INITIALIZER,
1527 .ignore_gfp_wait = 1, 1527 .ignore_gfp_wait = 1,
1528 .ignore_gfp_highmem = 1, 1528 .ignore_gfp_highmem = 1,
1529 .min_order = 1, 1529 .min_order = 1,
1530 }; 1530 };
1531 1531
1532 static int __init setup_fail_page_alloc(char *str) 1532 static int __init setup_fail_page_alloc(char *str)
1533 { 1533 {
1534 return setup_fault_attr(&fail_page_alloc.attr, str); 1534 return setup_fault_attr(&fail_page_alloc.attr, str);
1535 } 1535 }
1536 __setup("fail_page_alloc=", setup_fail_page_alloc); 1536 __setup("fail_page_alloc=", setup_fail_page_alloc);
1537 1537
1538 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1538 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1539 { 1539 {
1540 if (order < fail_page_alloc.min_order) 1540 if (order < fail_page_alloc.min_order)
1541 return false; 1541 return false;
1542 if (gfp_mask & __GFP_NOFAIL) 1542 if (gfp_mask & __GFP_NOFAIL)
1543 return false; 1543 return false;
1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1545 return false; 1545 return false;
1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1547 return false; 1547 return false;
1548 1548
1549 return should_fail(&fail_page_alloc.attr, 1 << order); 1549 return should_fail(&fail_page_alloc.attr, 1 << order);
1550 } 1550 }
1551 1551
1552 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1552 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1553 1553
1554 static int __init fail_page_alloc_debugfs(void) 1554 static int __init fail_page_alloc_debugfs(void)
1555 { 1555 {
1556 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1556 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1557 struct dentry *dir; 1557 struct dentry *dir;
1558 1558
1559 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1559 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1560 &fail_page_alloc.attr); 1560 &fail_page_alloc.attr);
1561 if (IS_ERR(dir)) 1561 if (IS_ERR(dir))
1562 return PTR_ERR(dir); 1562 return PTR_ERR(dir);
1563 1563
1564 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1564 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1565 &fail_page_alloc.ignore_gfp_wait)) 1565 &fail_page_alloc.ignore_gfp_wait))
1566 goto fail; 1566 goto fail;
1567 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1567 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1568 &fail_page_alloc.ignore_gfp_highmem)) 1568 &fail_page_alloc.ignore_gfp_highmem))
1569 goto fail; 1569 goto fail;
1570 if (!debugfs_create_u32("min-order", mode, dir, 1570 if (!debugfs_create_u32("min-order", mode, dir,
1571 &fail_page_alloc.min_order)) 1571 &fail_page_alloc.min_order))
1572 goto fail; 1572 goto fail;
1573 1573
1574 return 0; 1574 return 0;
1575 fail: 1575 fail:
1576 debugfs_remove_recursive(dir); 1576 debugfs_remove_recursive(dir);
1577 1577
1578 return -ENOMEM; 1578 return -ENOMEM;
1579 } 1579 }
1580 1580
1581 late_initcall(fail_page_alloc_debugfs); 1581 late_initcall(fail_page_alloc_debugfs);
1582 1582
1583 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1583 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1584 1584
1585 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1585 #else /* CONFIG_FAIL_PAGE_ALLOC */
1586 1586
1587 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1587 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1588 { 1588 {
1589 return false; 1589 return false;
1590 } 1590 }
1591 1591
1592 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1592 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1593 1593
1594 /* 1594 /*
1595 * Return true if free pages are above 'mark'. This takes into account the order 1595 * Return true if free pages are above 'mark'. This takes into account the order
1596 * of the allocation. 1596 * of the allocation.
1597 */ 1597 */
1598 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1598 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1599 int classzone_idx, int alloc_flags, long free_pages) 1599 int classzone_idx, int alloc_flags, long free_pages)
1600 { 1600 {
1601 /* free_pages may go negative - that's OK */ 1601 /* free_pages may go negative - that's OK */
1602 long min = mark; 1602 long min = mark;
1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1604 int o; 1604 int o;
1605 1605
1606 free_pages -= (1 << order) - 1; 1606 free_pages -= (1 << order) - 1;
1607 if (alloc_flags & ALLOC_HIGH) 1607 if (alloc_flags & ALLOC_HIGH)
1608 min -= min / 2; 1608 min -= min / 2;
1609 if (alloc_flags & ALLOC_HARDER) 1609 if (alloc_flags & ALLOC_HARDER)
1610 min -= min / 4; 1610 min -= min / 4;
1611 1611
1612 if (free_pages <= min + lowmem_reserve) 1612 if (free_pages <= min + lowmem_reserve)
1613 return false; 1613 return false;
1614 for (o = 0; o < order; o++) { 1614 for (o = 0; o < order; o++) {
1615 /* At the next order, this order's pages become unavailable */ 1615 /* At the next order, this order's pages become unavailable */
1616 free_pages -= z->free_area[o].nr_free << o; 1616 free_pages -= z->free_area[o].nr_free << o;
1617 1617
1618 /* Require fewer higher order pages to be free */ 1618 /* Require fewer higher order pages to be free */
1619 min >>= 1; 1619 min >>= 1;
1620 1620
1621 if (free_pages <= min) 1621 if (free_pages <= min)
1622 return false; 1622 return false;
1623 } 1623 }
1624 return true; 1624 return true;
1625 } 1625 }
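The loop above demands progressively fewer free pages for each order already consumed: the reserve is halved per step while blocks too small to serve the request are subtracted from the free count. A minimal userspace sketch of that check follows; the toy_zone layout, the numbers, and the reduction of ALLOC_HIGH/ALLOC_HARDER to booleans are all invented for illustration.

/*
 * Minimal userspace sketch of the order-aware watermark test above.
 * The zone contents are made up; ALLOC_HIGH and ALLOC_HARDER are
 * reduced to plain booleans.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_MAX_ORDER 11

struct toy_zone {
	long nr_free[TOY_MAX_ORDER];	/* free blocks per order */
	long lowmem_reserve;
};

static bool toy_watermark_ok(const struct toy_zone *z, int order, long mark,
			     long free_pages, bool high, bool harder)
{
	long min = mark;

	free_pages -= (1 << order) - 1;	/* worst-case rounding, as above */
	if (high)
		min -= min / 2;		/* __GFP_HIGH may dip below the mark */
	if (harder)
		min -= min / 4;		/* atomic/realtime callers dip further */

	if (free_pages <= min + z->lowmem_reserve)
		return false;

	for (int o = 0; o < order; o++) {
		/* blocks of this order cannot serve a larger request */
		free_pages -= z->nr_free[o] << o;
		min >>= 1;		/* require fewer higher-order pages */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	struct toy_zone z = { .nr_free = { 512, 128, 32, 8 }, .lowmem_reserve = 32 };
	long free = 512 + 128 * 2 + 32 * 4 + 8 * 8;	/* 960 pages total */

	printf("order-0 ok: %d\n", toy_watermark_ok(&z, 0, 256, free, false, false));
	printf("order-3 ok: %d\n", toy_watermark_ok(&z, 3, 256, free, false, false));
	return 0;
}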
1626 1626
1627 #ifdef CONFIG_MEMORY_ISOLATION 1627 #ifdef CONFIG_MEMORY_ISOLATION
1628 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) 1628 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629 { 1629 {
1630 if (unlikely(zone->nr_pageblock_isolate)) 1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages; 1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0; 1632 return 0;
1633 } 1633 }
1634 #else 1634 #else
1635 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) 1635 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636 { 1636 {
1637 return 0; 1637 return 0;
1638 } 1638 }
1639 #endif 1639 #endif
1640 1640
1641 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1642 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1643 { 1643 {
1644 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1644 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1645 zone_page_state(z, NR_FREE_PAGES)); 1645 zone_page_state(z, NR_FREE_PAGES));
1646 } 1646 }
1647 1647
1648 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1648 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1649 int classzone_idx, int alloc_flags) 1649 int classzone_idx, int alloc_flags)
1650 { 1650 {
1651 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1651 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1652 1652
1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1655 1655
1656 /* 1656 /*
1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider 1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not 1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1659 * sleep although it could do so. But this is more desirable for memory 1659 * sleep although it could do so. But this is more desirable for memory
1660 * hotplug than sleeping which can cause a livelock in the direct 1660 * hotplug than sleeping which can cause a livelock in the direct
1661 * reclaim path. 1661 * reclaim path.
1662 */ 1662 */
1663 free_pages -= nr_zone_isolate_freepages(z); 1663 free_pages -= nr_zone_isolate_freepages(z);
1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1665 free_pages); 1665 free_pages);
1666 } 1666 }
1667 1667
1668 #ifdef CONFIG_NUMA 1668 #ifdef CONFIG_NUMA
1669 /* 1669 /*
1670 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1670 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1671 * skip over zones that are not allowed by the cpuset, or that have 1671 * skip over zones that are not allowed by the cpuset, or that have
1672 * been recently (in last second) found to be nearly full. See further 1672 * been recently (in last second) found to be nearly full. See further
1673 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1673 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1674 * that have to skip over a lot of full or unallowed zones. 1674 * that have to skip over a lot of full or unallowed zones.
1675 * 1675 *
1676 * If the zonelist cache is present in the passed in zonelist, then 1676 * If the zonelist cache is present in the passed in zonelist, then
1677 * returns a pointer to the allowed node mask (either the current 1677 * returns a pointer to the allowed node mask (either the current
1678 * task's mems_allowed, or node_states[N_HIGH_MEMORY].) 1678 * task's mems_allowed, or node_states[N_HIGH_MEMORY].)
1679 * 1679 *
1680 * If the zonelist cache is not available for this zonelist, does 1680 * If the zonelist cache is not available for this zonelist, does
1681 * nothing and returns NULL. 1681 * nothing and returns NULL.
1682 * 1682 *
1683 * If the fullzones BITMAP in the zonelist cache is stale (more than 1683 * If the fullzones BITMAP in the zonelist cache is stale (more than
1684 * a second since last zap'd) then we zap it out (clear its bits.) 1684 * a second since last zap'd) then we zap it out (clear its bits.)
1685 * 1685 *
1686 * We hold off even calling zlc_setup, until after we've checked the 1686 * We hold off even calling zlc_setup, until after we've checked the
1687 * first zone in the zonelist, on the theory that most allocations will 1687 * first zone in the zonelist, on the theory that most allocations will
1688 * be satisfied from that first zone, so best to examine that zone as 1688 * be satisfied from that first zone, so best to examine that zone as
1689 * quickly as we can. 1689 * quickly as we can.
1690 */ 1690 */
1691 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1691 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1692 { 1692 {
1693 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1693 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1694 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1694 nodemask_t *allowednodes; /* zonelist_cache approximation */
1695 1695
1696 zlc = zonelist->zlcache_ptr; 1696 zlc = zonelist->zlcache_ptr;
1697 if (!zlc) 1697 if (!zlc)
1698 return NULL; 1698 return NULL;
1699 1699
1700 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1700 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1701 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1701 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1702 zlc->last_full_zap = jiffies; 1702 zlc->last_full_zap = jiffies;
1703 } 1703 }
1704 1704
1705 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1705 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1706 &cpuset_current_mems_allowed : 1706 &cpuset_current_mems_allowed :
1707 &node_states[N_HIGH_MEMORY]; 1707 &node_states[N_HIGH_MEMORY];
1708 return allowednodes; 1708 return allowednodes;
1709 } 1709 }
1710 1710
1711 /* 1711 /*
1712 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1712 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1713 * if it is worth looking at further for free memory: 1713 * if it is worth looking at further for free memory:
1714 * 1) Check that the zone isn't thought to be full (doesn't have its 1714 * 1) Check that the zone isn't thought to be full (doesn't have its
1715 * bit set in the zonelist_cache fullzones BITMAP). 1715 * bit set in the zonelist_cache fullzones BITMAP).
1716 * 2) Check that the zones node (obtained from the zonelist_cache 1716 * 2) Check that the zones node (obtained from the zonelist_cache
1717 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1717 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1718 * Return true (non-zero) if zone is worth looking at further, or 1718 * Return true (non-zero) if zone is worth looking at further, or
1719 * else return false (zero) if it is not. 1719 * else return false (zero) if it is not.
1720 * 1720 *
1721 * This check -ignores- the distinction between various watermarks, 1721 * This check -ignores- the distinction between various watermarks,
1722 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1722 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1723 * found to be full for any variation of these watermarks, it will 1723 * found to be full for any variation of these watermarks, it will
1724 * be considered full for up to one second by all requests, unless 1724 * be considered full for up to one second by all requests, unless
1725 * we are so low on memory on all allowed nodes that we are forced 1725 * we are so low on memory on all allowed nodes that we are forced
1726 * into the second scan of the zonelist. 1726 * into the second scan of the zonelist.
1727 * 1727 *
1728 * In the second scan we ignore this zonelist cache and exactly 1728 * In the second scan we ignore this zonelist cache and exactly
1729 * apply the watermarks to all zones, even if it is slower to do so. 1729 * apply the watermarks to all zones, even if it is slower to do so.
1730 * We are low on memory in the second scan, and should leave no stone 1730 * We are low on memory in the second scan, and should leave no stone
1731 * unturned looking for a free page. 1731 * unturned looking for a free page.
1732 */ 1732 */
1733 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1733 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1734 nodemask_t *allowednodes) 1734 nodemask_t *allowednodes)
1735 { 1735 {
1736 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1736 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1737 int i; /* index of *z in zonelist zones */ 1737 int i; /* index of *z in zonelist zones */
1738 int n; /* node that zone *z is on */ 1738 int n; /* node that zone *z is on */
1739 1739
1740 zlc = zonelist->zlcache_ptr; 1740 zlc = zonelist->zlcache_ptr;
1741 if (!zlc) 1741 if (!zlc)
1742 return 1; 1742 return 1;
1743 1743
1744 i = z - zonelist->_zonerefs; 1744 i = z - zonelist->_zonerefs;
1745 n = zlc->z_to_n[i]; 1745 n = zlc->z_to_n[i];
1746 1746
1747 /* This zone is worth trying if it is allowed but not full */ 1747 /* This zone is worth trying if it is allowed but not full */
1748 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1748 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1749 } 1749 }
1750 1750
1751 /* 1751 /*
1752 * Given 'z' scanning a zonelist, set the corresponding bit in 1752 * Given 'z' scanning a zonelist, set the corresponding bit in
1753 * zlc->fullzones, so that subsequent attempts to allocate a page 1753 * zlc->fullzones, so that subsequent attempts to allocate a page
1754 * from that zone don't waste time re-examining it. 1754 * from that zone don't waste time re-examining it.
1755 */ 1755 */
1756 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1756 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1757 { 1757 {
1758 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1758 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1759 int i; /* index of *z in zonelist zones */ 1759 int i; /* index of *z in zonelist zones */
1760 1760
1761 zlc = zonelist->zlcache_ptr; 1761 zlc = zonelist->zlcache_ptr;
1762 if (!zlc) 1762 if (!zlc)
1763 return; 1763 return;
1764 1764
1765 i = z - zonelist->_zonerefs; 1765 i = z - zonelist->_zonerefs;
1766 1766
1767 set_bit(i, zlc->fullzones); 1767 set_bit(i, zlc->fullzones);
1768 } 1768 }
1769 1769
1770 /* 1770 /*
1771 * clear all zones full, called after direct reclaim makes progress so that 1771 * clear all zones full, called after direct reclaim makes progress so that
1772 * a zone that was recently full is not skipped over for up to a second 1772 * a zone that was recently full is not skipped over for up to a second
1773 */ 1773 */
1774 static void zlc_clear_zones_full(struct zonelist *zonelist) 1774 static void zlc_clear_zones_full(struct zonelist *zonelist)
1775 { 1775 {
1776 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1776 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1777 1777
1778 zlc = zonelist->zlcache_ptr; 1778 zlc = zonelist->zlcache_ptr;
1779 if (!zlc) 1779 if (!zlc)
1780 return; 1780 return;
1781 1781
1782 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1782 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1783 } 1783 }
1784 1784
1785 #else /* CONFIG_NUMA */ 1785 #else /* CONFIG_NUMA */
1786 1786
1787 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1787 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1788 { 1788 {
1789 return NULL; 1789 return NULL;
1790 } 1790 }
1791 1791
1792 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1792 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1793 nodemask_t *allowednodes) 1793 nodemask_t *allowednodes)
1794 { 1794 {
1795 return 1; 1795 return 1;
1796 } 1796 }
1797 1797
1798 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1798 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1799 { 1799 {
1800 } 1800 }
1801 1801
1802 static void zlc_clear_zones_full(struct zonelist *zonelist) 1802 static void zlc_clear_zones_full(struct zonelist *zonelist)
1803 { 1803 {
1804 } 1804 }
1805 #endif /* CONFIG_NUMA */ 1805 #endif /* CONFIG_NUMA */
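The zonelist cache above is just a per-zonelist bitmap of "looked full recently" zones that is wiped roughly once a second. A hedged userspace sketch of that idea is shown below; time() stands in for jiffies, a bool array stands in for the fullzones bitmap, and the allowed-nodes side is omitted, so this is an illustration of the caching policy rather than the kernel data structure.

/*
 * Userspace sketch of the zonelist cache: remember which zones looked
 * full and skip them until the next zap.  time() replaces jiffies and a
 * bool array replaces the real bitmap; names are invented.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define TOY_MAX_ZONES 8

struct toy_zlc {
	time_t last_full_zap;		/* when the bitmap was last cleared */
	bool full[TOY_MAX_ZONES];	/* zones believed to be full */
};

static void toy_zlc_setup(struct toy_zlc *zlc)
{
	time_t now = time(NULL);

	if (now > zlc->last_full_zap + 1) {	/* stale: clear everything */
		memset(zlc->full, 0, sizeof(zlc->full));
		zlc->last_full_zap = now;
	}
}

static bool toy_zone_worth_trying(const struct toy_zlc *zlc, int zone)
{
	return !zlc->full[zone];	/* allowed-nodes check omitted */
}

static void toy_mark_zone_full(struct toy_zlc *zlc, int zone)
{
	zlc->full[zone] = true;
}

int main(void)
{
	struct toy_zlc zlc = { 0 };

	toy_zlc_setup(&zlc);
	toy_mark_zone_full(&zlc, 0);
	printf("zone 0 worth trying: %d\n", toy_zone_worth_trying(&zlc, 0));
	printf("zone 1 worth trying: %d\n", toy_zone_worth_trying(&zlc, 1));
	return 0;
}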
1806 1806
1807 /* 1807 /*
1808 * get_page_from_freelist goes through the zonelist trying to allocate 1808 * get_page_from_freelist goes through the zonelist trying to allocate
1809 * a page. 1809 * a page.
1810 */ 1810 */
1811 static struct page * 1811 static struct page *
1812 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1812 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1813 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1813 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1814 struct zone *preferred_zone, int migratetype) 1814 struct zone *preferred_zone, int migratetype)
1815 { 1815 {
1816 struct zoneref *z; 1816 struct zoneref *z;
1817 struct page *page = NULL; 1817 struct page *page = NULL;
1818 int classzone_idx; 1818 int classzone_idx;
1819 struct zone *zone; 1819 struct zone *zone;
1820 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1820 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1821 int zlc_active = 0; /* set if using zonelist_cache */ 1821 int zlc_active = 0; /* set if using zonelist_cache */
1822 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1822 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1823 1823
1824 classzone_idx = zone_idx(preferred_zone); 1824 classzone_idx = zone_idx(preferred_zone);
1825 zonelist_scan: 1825 zonelist_scan:
1826 /* 1826 /*
1827 * Scan zonelist, looking for a zone with enough free. 1827 * Scan zonelist, looking for a zone with enough free.
1828 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1828 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1829 */ 1829 */
1830 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1830 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1831 high_zoneidx, nodemask) { 1831 high_zoneidx, nodemask) {
1832 if (NUMA_BUILD && zlc_active && 1832 if (NUMA_BUILD && zlc_active &&
1833 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1833 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1834 continue; 1834 continue;
1835 if ((alloc_flags & ALLOC_CPUSET) && 1835 if ((alloc_flags & ALLOC_CPUSET) &&
1836 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1836 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1837 continue; 1837 continue;
1838 /* 1838 /*
1839 * When allocating a page cache page for writing, we 1839 * When allocating a page cache page for writing, we
1840 * want to get it from a zone that is within its dirty 1840 * want to get it from a zone that is within its dirty
1841 * limit, such that no single zone holds more than its 1841 * limit, such that no single zone holds more than its
1842 * proportional share of globally allowed dirty pages. 1842 * proportional share of globally allowed dirty pages.
1843 * The dirty limits take into account the zone's 1843 * The dirty limits take into account the zone's
1844 * lowmem reserves and high watermark so that kswapd 1844 * lowmem reserves and high watermark so that kswapd
1845 * should be able to balance it without having to 1845 * should be able to balance it without having to
1846 * write pages from its LRU list. 1846 * write pages from its LRU list.
1847 * 1847 *
1848 * This may look like it could increase pressure on 1848 * This may look like it could increase pressure on
1849 * lower zones by failing allocations in higher zones 1849 * lower zones by failing allocations in higher zones
1850 * before they are full. But the pages that do spill 1850 * before they are full. But the pages that do spill
1851 * over are limited as the lower zones are protected 1851 * over are limited as the lower zones are protected
1852 * by this very same mechanism. It should not become 1852 * by this very same mechanism. It should not become
1853 * a practical burden to them. 1853 * a practical burden to them.
1854 * 1854 *
1855 * XXX: For now, allow allocations to potentially 1855 * XXX: For now, allow allocations to potentially
1856 * exceed the per-zone dirty limit in the slowpath 1856 * exceed the per-zone dirty limit in the slowpath
1857 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1857 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1858 * which is important when on a NUMA setup the allowed 1858 * which is important when on a NUMA setup the allowed
1859 * zones are together not big enough to reach the 1859 * zones are together not big enough to reach the
1860 * global limit. The proper fix for these situations 1860 * global limit. The proper fix for these situations
1861 * will require awareness of zones in the 1861 * will require awareness of zones in the
1862 * dirty-throttling and the flusher threads. 1862 * dirty-throttling and the flusher threads.
1863 */ 1863 */
1864 if ((alloc_flags & ALLOC_WMARK_LOW) && 1864 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1865 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1865 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1866 goto this_zone_full; 1866 goto this_zone_full;
1867 1867
1868 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1868 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1869 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1869 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1870 unsigned long mark; 1870 unsigned long mark;
1871 int ret; 1871 int ret;
1872 1872
1873 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1873 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1874 if (zone_watermark_ok(zone, order, mark, 1874 if (zone_watermark_ok(zone, order, mark,
1875 classzone_idx, alloc_flags)) 1875 classzone_idx, alloc_flags))
1876 goto try_this_zone; 1876 goto try_this_zone;
1877 1877
1878 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { 1878 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1879 /* 1879 /*
1880 * we do zlc_setup if there are multiple nodes 1880 * we do zlc_setup if there are multiple nodes
1881 * and before considering the first zone allowed 1881 * and before considering the first zone allowed
1882 * by the cpuset. 1882 * by the cpuset.
1883 */ 1883 */
1884 allowednodes = zlc_setup(zonelist, alloc_flags); 1884 allowednodes = zlc_setup(zonelist, alloc_flags);
1885 zlc_active = 1; 1885 zlc_active = 1;
1886 did_zlc_setup = 1; 1886 did_zlc_setup = 1;
1887 } 1887 }
1888 1888
1889 if (zone_reclaim_mode == 0) 1889 if (zone_reclaim_mode == 0)
1890 goto this_zone_full; 1890 goto this_zone_full;
1891 1891
1892 /* 1892 /*
1893 * As we may have just activated ZLC, check if the first 1893 * As we may have just activated ZLC, check if the first
1894 * eligible zone has failed zone_reclaim recently. 1894 * eligible zone has failed zone_reclaim recently.
1895 */ 1895 */
1896 if (NUMA_BUILD && zlc_active && 1896 if (NUMA_BUILD && zlc_active &&
1897 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1897 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1898 continue; 1898 continue;
1899 1899
1900 ret = zone_reclaim(zone, gfp_mask, order); 1900 ret = zone_reclaim(zone, gfp_mask, order);
1901 switch (ret) { 1901 switch (ret) {
1902 case ZONE_RECLAIM_NOSCAN: 1902 case ZONE_RECLAIM_NOSCAN:
1903 /* did not scan */ 1903 /* did not scan */
1904 continue; 1904 continue;
1905 case ZONE_RECLAIM_FULL: 1905 case ZONE_RECLAIM_FULL:
1906 /* scanned but unreclaimable */ 1906 /* scanned but unreclaimable */
1907 continue; 1907 continue;
1908 default: 1908 default:
1909 /* did we reclaim enough */ 1909 /* did we reclaim enough */
1910 if (!zone_watermark_ok(zone, order, mark, 1910 if (!zone_watermark_ok(zone, order, mark,
1911 classzone_idx, alloc_flags)) 1911 classzone_idx, alloc_flags))
1912 goto this_zone_full; 1912 goto this_zone_full;
1913 } 1913 }
1914 } 1914 }
1915 1915
1916 try_this_zone: 1916 try_this_zone:
1917 page = buffered_rmqueue(preferred_zone, zone, order, 1917 page = buffered_rmqueue(preferred_zone, zone, order,
1918 gfp_mask, migratetype); 1918 gfp_mask, migratetype);
1919 if (page) 1919 if (page)
1920 break; 1920 break;
1921 this_zone_full: 1921 this_zone_full:
1922 if (NUMA_BUILD) 1922 if (NUMA_BUILD)
1923 zlc_mark_zone_full(zonelist, z); 1923 zlc_mark_zone_full(zonelist, z);
1924 } 1924 }
1925 1925
1926 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1926 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1927 /* Disable zlc cache for second zonelist scan */ 1927 /* Disable zlc cache for second zonelist scan */
1928 zlc_active = 0; 1928 zlc_active = 0;
1929 goto zonelist_scan; 1929 goto zonelist_scan;
1930 } 1930 }
1931 return page; 1931 return page;
1932 } 1932 }
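get_page_from_freelist() above is essentially a two-pass scan: the first pass trusts the zonelist cache and skips zones it believes are full, and only if that yields nothing is the cache disabled and the scan repeated exactly. The sketch below keeps only that control flow; the per-zone watermark, cpuset and dirty-limit checks are collapsed into invented toy arrays.

/*
 * Control-flow sketch of the two-pass zonelist scan above.  The per-zone
 * checks are collapsed into a toy has_pages array and the zonelist cache
 * into a cached_full array; both are made up for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_NR_ZONES 3

static bool cached_full[TOY_NR_ZONES] = { false, true, false };
static bool has_pages[TOY_NR_ZONES]   = { false, true, false };

static int toy_get_page_from_freelist(void)
{
	bool zlc_active = true;

zonelist_scan:
	for (int i = 0; i < TOY_NR_ZONES; i++) {
		if (zlc_active && cached_full[i])
			continue;	/* believed full: skip on the first pass */
		if (has_pages[i])
			return i;	/* "allocated" from zone i */
		cached_full[i] = true;	/* remember the failure */
	}
	if (zlc_active) {
		zlc_active = false;	/* second pass: check every zone exactly */
		goto zonelist_scan;
	}
	return -1;			/* nothing available in any zone */
}

int main(void)
{
	/* zone 1 has memory but was cached as full, so only pass two finds it */
	printf("allocated from zone %d\n", toy_get_page_from_freelist());
	return 0;
}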
1933 1933
1934 /* 1934 /*
1935 * Large machines with many possible nodes should not always dump per-node 1935 * Large machines with many possible nodes should not always dump per-node
1936 * meminfo in irq context. 1936 * meminfo in irq context.
1937 */ 1937 */
1938 static inline bool should_suppress_show_mem(void) 1938 static inline bool should_suppress_show_mem(void)
1939 { 1939 {
1940 bool ret = false; 1940 bool ret = false;
1941 1941
1942 #if NODES_SHIFT > 8 1942 #if NODES_SHIFT > 8
1943 ret = in_interrupt(); 1943 ret = in_interrupt();
1944 #endif 1944 #endif
1945 return ret; 1945 return ret;
1946 } 1946 }
1947 1947
1948 static DEFINE_RATELIMIT_STATE(nopage_rs, 1948 static DEFINE_RATELIMIT_STATE(nopage_rs,
1949 DEFAULT_RATELIMIT_INTERVAL, 1949 DEFAULT_RATELIMIT_INTERVAL,
1950 DEFAULT_RATELIMIT_BURST); 1950 DEFAULT_RATELIMIT_BURST);
1951 1951
1952 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 1952 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1953 { 1953 {
1954 unsigned int filter = SHOW_MEM_FILTER_NODES; 1954 unsigned int filter = SHOW_MEM_FILTER_NODES;
1955 1955
1956 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 1956 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
1957 debug_guardpage_minorder() > 0) 1957 debug_guardpage_minorder() > 0)
1958 return; 1958 return;
1959 1959
1960 /* 1960 /*
1961 * This documents exceptions given to allocations in certain 1961 * This documents exceptions given to allocations in certain
1962 * contexts that are allowed to allocate outside current's set 1962 * contexts that are allowed to allocate outside current's set
1963 * of allowed nodes. 1963 * of allowed nodes.
1964 */ 1964 */
1965 if (!(gfp_mask & __GFP_NOMEMALLOC)) 1965 if (!(gfp_mask & __GFP_NOMEMALLOC))
1966 if (test_thread_flag(TIF_MEMDIE) || 1966 if (test_thread_flag(TIF_MEMDIE) ||
1967 (current->flags & (PF_MEMALLOC | PF_EXITING))) 1967 (current->flags & (PF_MEMALLOC | PF_EXITING)))
1968 filter &= ~SHOW_MEM_FILTER_NODES; 1968 filter &= ~SHOW_MEM_FILTER_NODES;
1969 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 1969 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1970 filter &= ~SHOW_MEM_FILTER_NODES; 1970 filter &= ~SHOW_MEM_FILTER_NODES;
1971 1971
1972 if (fmt) { 1972 if (fmt) {
1973 struct va_format vaf; 1973 struct va_format vaf;
1974 va_list args; 1974 va_list args;
1975 1975
1976 va_start(args, fmt); 1976 va_start(args, fmt);
1977 1977
1978 vaf.fmt = fmt; 1978 vaf.fmt = fmt;
1979 vaf.va = &args; 1979 vaf.va = &args;
1980 1980
1981 pr_warn("%pV", &vaf); 1981 pr_warn("%pV", &vaf);
1982 1982
1983 va_end(args); 1983 va_end(args);
1984 } 1984 }
1985 1985
1986 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 1986 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
1987 current->comm, order, gfp_mask); 1987 current->comm, order, gfp_mask);
1988 1988
1989 dump_stack(); 1989 dump_stack();
1990 if (!should_suppress_show_mem()) 1990 if (!should_suppress_show_mem())
1991 show_mem(filter); 1991 show_mem(filter);
1992 } 1992 }
1993 1993
1994 static inline int 1994 static inline int
1995 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1995 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1996 unsigned long did_some_progress, 1996 unsigned long did_some_progress,
1997 unsigned long pages_reclaimed) 1997 unsigned long pages_reclaimed)
1998 { 1998 {
1999 /* Do not loop if specifically requested */ 1999 /* Do not loop if specifically requested */
2000 if (gfp_mask & __GFP_NORETRY) 2000 if (gfp_mask & __GFP_NORETRY)
2001 return 0; 2001 return 0;
2002 2002
2003 /* Always retry if specifically requested */ 2003 /* Always retry if specifically requested */
2004 if (gfp_mask & __GFP_NOFAIL) 2004 if (gfp_mask & __GFP_NOFAIL)
2005 return 1; 2005 return 1;
2006 2006
2007 /* 2007 /*
2008 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2008 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2009 * making forward progress without invoking OOM. Suspend also disables 2009 * making forward progress without invoking OOM. Suspend also disables
2010 * storage devices so kswapd will not help. Bail if we are suspending. 2010 * storage devices so kswapd will not help. Bail if we are suspending.
2011 */ 2011 */
2012 if (!did_some_progress && pm_suspended_storage()) 2012 if (!did_some_progress && pm_suspended_storage())
2013 return 0; 2013 return 0;
2014 2014
2015 /* 2015 /*
2016 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2016 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2017 * means __GFP_NOFAIL, but that may not be true in other 2017 * means __GFP_NOFAIL, but that may not be true in other
2018 * implementations. 2018 * implementations.
2019 */ 2019 */
2020 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2020 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2021 return 1; 2021 return 1;
2022 2022
2023 /* 2023 /*
2024 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2024 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2025 * specified, then we retry until we no longer reclaim any pages 2025 * specified, then we retry until we no longer reclaim any pages
2026 * (above), or we've reclaimed an order of pages at least as 2026 * (above), or we've reclaimed an order of pages at least as
2027 * large as the allocation's order. In both cases, if the 2027 * large as the allocation's order. In both cases, if the
2028 * allocation still fails, we stop retrying. 2028 * allocation still fails, we stop retrying.
2029 */ 2029 */
2030 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2030 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2031 return 1; 2031 return 1;
2032 2032
2033 return 0; 2033 return 0;
2034 } 2034 }
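The comments above encode a small decision table: never retry __GFP_NORETRY, always retry __GFP_NOFAIL, treat low orders as implicitly nofail, and for costly orders keep going under __GFP_REPEAT only until roughly 1 << order pages have been reclaimed in total. A worked example of that last cutoff, with invented reclaim numbers, is sketched here.

/*
 * Worked example of the __GFP_REPEAT cutoff described above: an order-4
 * request (16 pages) stops retrying once at least 1 << 4 pages have been
 * reclaimed in total.  Numbers are invented for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

static bool toy_should_retry_costly_repeat(unsigned int order,
					   unsigned long pages_reclaimed)
{
	return pages_reclaimed < (1UL << order);
}

int main(void)
{
	unsigned long reclaimed = 0;
	unsigned int order = 4;

	while (toy_should_retry_costly_repeat(order, reclaimed)) {
		reclaimed += 5;		/* pretend each pass reclaims 5 pages */
		printf("reclaimed so far: %lu\n", reclaimed);
	}
	printf("stop retrying after %lu pages (>= %u)\n", reclaimed, 1U << order);
	return 0;
}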
2035 2035
2036 static inline struct page * 2036 static inline struct page *
2037 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2037 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2038 struct zonelist *zonelist, enum zone_type high_zoneidx, 2038 struct zonelist *zonelist, enum zone_type high_zoneidx,
2039 nodemask_t *nodemask, struct zone *preferred_zone, 2039 nodemask_t *nodemask, struct zone *preferred_zone,
2040 int migratetype) 2040 int migratetype)
2041 { 2041 {
2042 struct page *page; 2042 struct page *page;
2043 2043
2044 /* Acquire the OOM killer lock for the zones in zonelist */ 2044 /* Acquire the OOM killer lock for the zones in zonelist */
2045 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2045 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2046 schedule_timeout_uninterruptible(1); 2046 schedule_timeout_uninterruptible(1);
2047 return NULL; 2047 return NULL;
2048 } 2048 }
2049 2049
2050 /* 2050 /*
2051 * Go through the zonelist yet one more time, keep very high watermark 2051 * Go through the zonelist yet one more time, keep very high watermark
2052 * here, this is only to catch a parallel oom killing, we must fail if 2052 * here, this is only to catch a parallel oom killing, we must fail if
2053 * we're still under heavy pressure. 2053 * we're still under heavy pressure.
2054 */ 2054 */
2055 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2055 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2056 order, zonelist, high_zoneidx, 2056 order, zonelist, high_zoneidx,
2057 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2057 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2058 preferred_zone, migratetype); 2058 preferred_zone, migratetype);
2059 if (page) 2059 if (page)
2060 goto out; 2060 goto out;
2061 2061
2062 if (!(gfp_mask & __GFP_NOFAIL)) { 2062 if (!(gfp_mask & __GFP_NOFAIL)) {
2063 /* The OOM killer will not help higher order allocs */ 2063 /* The OOM killer will not help higher order allocs */
2064 if (order > PAGE_ALLOC_COSTLY_ORDER) 2064 if (order > PAGE_ALLOC_COSTLY_ORDER)
2065 goto out; 2065 goto out;
2066 /* The OOM killer does not needlessly kill tasks for lowmem */ 2066 /* The OOM killer does not needlessly kill tasks for lowmem */
2067 if (high_zoneidx < ZONE_NORMAL) 2067 if (high_zoneidx < ZONE_NORMAL)
2068 goto out; 2068 goto out;
2069 /* 2069 /*
2070 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2070 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2071 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2071 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2072 * The caller should handle page allocation failure by itself if 2072 * The caller should handle page allocation failure by itself if
2073 * it specifies __GFP_THISNODE. 2073 * it specifies __GFP_THISNODE.
2074 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2074 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2075 */ 2075 */
2076 if (gfp_mask & __GFP_THISNODE) 2076 if (gfp_mask & __GFP_THISNODE)
2077 goto out; 2077 goto out;
2078 } 2078 }
2079 /* Exhausted what can be done so it's blamo time */ 2079 /* Exhausted what can be done so it's blamo time */
2080 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2080 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2081 2081
2082 out: 2082 out:
2083 clear_zonelist_oom(zonelist, gfp_mask); 2083 clear_zonelist_oom(zonelist, gfp_mask);
2084 return page; 2084 return page;
2085 } 2085 }
2086 2086
2087 #ifdef CONFIG_COMPACTION 2087 #ifdef CONFIG_COMPACTION
2088 /* Try memory compaction for high-order allocations before reclaim */ 2088 /* Try memory compaction for high-order allocations before reclaim */
2089 static struct page * 2089 static struct page *
2090 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2090 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2091 struct zonelist *zonelist, enum zone_type high_zoneidx, 2091 struct zonelist *zonelist, enum zone_type high_zoneidx,
2092 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2092 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2093 int migratetype, bool sync_migration, 2093 int migratetype, bool sync_migration,
2094 bool *deferred_compaction, 2094 bool *deferred_compaction,
2095 unsigned long *did_some_progress) 2095 unsigned long *did_some_progress)
2096 { 2096 {
2097 struct page *page; 2097 struct page *page;
2098 2098
2099 if (!order) 2099 if (!order)
2100 return NULL; 2100 return NULL;
2101 2101
2102 if (compaction_deferred(preferred_zone, order)) { 2102 if (compaction_deferred(preferred_zone, order)) {
2103 *deferred_compaction = true; 2103 *deferred_compaction = true;
2104 return NULL; 2104 return NULL;
2105 } 2105 }
2106 2106
2107 current->flags |= PF_MEMALLOC; 2107 current->flags |= PF_MEMALLOC;
2108 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2108 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2109 nodemask, sync_migration); 2109 nodemask, sync_migration);
2110 current->flags &= ~PF_MEMALLOC; 2110 current->flags &= ~PF_MEMALLOC;
2111 if (*did_some_progress != COMPACT_SKIPPED) { 2111 if (*did_some_progress != COMPACT_SKIPPED) {
2112 2112
2113 /* Page migration frees to the PCP lists but we want merging */ 2113 /* Page migration frees to the PCP lists but we want merging */
2114 drain_pages(get_cpu()); 2114 drain_pages(get_cpu());
2115 put_cpu(); 2115 put_cpu();
2116 2116
2117 page = get_page_from_freelist(gfp_mask, nodemask, 2117 page = get_page_from_freelist(gfp_mask, nodemask,
2118 order, zonelist, high_zoneidx, 2118 order, zonelist, high_zoneidx,
2119 alloc_flags, preferred_zone, 2119 alloc_flags, preferred_zone,
2120 migratetype); 2120 migratetype);
2121 if (page) { 2121 if (page) {
2122 preferred_zone->compact_considered = 0; 2122 preferred_zone->compact_considered = 0;
2123 preferred_zone->compact_defer_shift = 0; 2123 preferred_zone->compact_defer_shift = 0;
2124 if (order >= preferred_zone->compact_order_failed) 2124 if (order >= preferred_zone->compact_order_failed)
2125 preferred_zone->compact_order_failed = order + 1; 2125 preferred_zone->compact_order_failed = order + 1;
2126 count_vm_event(COMPACTSUCCESS); 2126 count_vm_event(COMPACTSUCCESS);
2127 return page; 2127 return page;
2128 } 2128 }
2129 2129
2130 /* 2130 /*
2131 * It's bad if compaction run occurs and fails. 2131 * It's bad if compaction run occurs and fails.
2132 * The most likely reason is that pages exist, 2132 * The most likely reason is that pages exist,
2133 * but not enough to satisfy watermarks. 2133 * but not enough to satisfy watermarks.
2134 */ 2134 */
2135 count_vm_event(COMPACTFAIL); 2135 count_vm_event(COMPACTFAIL);
2136 2136
2137 /* 2137 /*
2138 * As async compaction considers a subset of pageblocks, only 2138 * As async compaction considers a subset of pageblocks, only
2139 * defer if the failure was a sync compaction failure. 2139 * defer if the failure was a sync compaction failure.
2140 */ 2140 */
2141 if (sync_migration) 2141 if (sync_migration)
2142 defer_compaction(preferred_zone, order); 2142 defer_compaction(preferred_zone, order);
2143 2143
2144 cond_resched(); 2144 cond_resched();
2145 } 2145 }
2146 2146
2147 return NULL; 2147 return NULL;
2148 } 2148 }
2149 #else 2149 #else
2150 static inline struct page * 2150 static inline struct page *
2151 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2151 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2152 struct zonelist *zonelist, enum zone_type high_zoneidx, 2152 struct zonelist *zonelist, enum zone_type high_zoneidx,
2153 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2153 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2154 int migratetype, bool sync_migration, 2154 int migratetype, bool sync_migration,
2155 bool *deferred_compaction, 2155 bool *deferred_compaction,
2156 unsigned long *did_some_progress) 2156 unsigned long *did_some_progress)
2157 { 2157 {
2158 return NULL; 2158 return NULL;
2159 } 2159 }
2160 #endif /* CONFIG_COMPACTION */ 2160 #endif /* CONFIG_COMPACTION */
2161 2161
2162 /* Perform direct synchronous page reclaim */ 2162 /* Perform direct synchronous page reclaim */
2163 static int 2163 static int
2164 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2164 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2165 nodemask_t *nodemask) 2165 nodemask_t *nodemask)
2166 { 2166 {
2167 struct reclaim_state reclaim_state; 2167 struct reclaim_state reclaim_state;
2168 int progress; 2168 int progress;
2169 2169
2170 cond_resched(); 2170 cond_resched();
2171 2171
2172 /* We now go into synchronous reclaim */ 2172 /* We now go into synchronous reclaim */
2173 cpuset_memory_pressure_bump(); 2173 cpuset_memory_pressure_bump();
2174 current->flags |= PF_MEMALLOC; 2174 current->flags |= PF_MEMALLOC;
2175 lockdep_set_current_reclaim_state(gfp_mask); 2175 lockdep_set_current_reclaim_state(gfp_mask);
2176 reclaim_state.reclaimed_slab = 0; 2176 reclaim_state.reclaimed_slab = 0;
2177 current->reclaim_state = &reclaim_state; 2177 current->reclaim_state = &reclaim_state;
2178 2178
2179 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2179 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2180 2180
2181 current->reclaim_state = NULL; 2181 current->reclaim_state = NULL;
2182 lockdep_clear_current_reclaim_state(); 2182 lockdep_clear_current_reclaim_state();
2183 current->flags &= ~PF_MEMALLOC; 2183 current->flags &= ~PF_MEMALLOC;
2184 2184
2185 cond_resched(); 2185 cond_resched();
2186 2186
2187 return progress; 2187 return progress;
2188 } 2188 }
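The PF_MEMALLOC set/clear bracket around try_to_free_pages() above is what allows the slowpath's "current->flags & PF_MEMALLOC" test further down to stop reclaim from recursing into itself. A hedged userspace model of that bracket follows; a plain global bool stands in for the task flag and the helpers are invented, so this only shows the recursion guard, not the kernel mechanics.

/*
 * Userspace model of the PF_MEMALLOC bracket above.  A global bool
 * stands in for PF_MEMALLOC in current->flags; the point is how the
 * flag stops reclaim from re-entering reclaim.
 */
#include <stdbool.h>
#include <stdio.h>

static bool toy_pf_memalloc;		/* stand-in for the task flag */

static int toy_allocate(void);

static int toy_reclaim(void)
{
	/* reclaim itself may need memory; the flag keeps that from recursing */
	printf("nested allocation during reclaim: %d\n", toy_allocate());
	return 42;			/* pretend some pages were freed */
}

static int toy_allocate(void)
{
	if (toy_pf_memalloc)
		return 0;		/* mirrors the "goto nopage" recursion check */

	toy_pf_memalloc = true;		/* current->flags |= PF_MEMALLOC */
	int progress = toy_reclaim();
	toy_pf_memalloc = false;	/* current->flags &= ~PF_MEMALLOC */

	return progress > 0;
}

int main(void)
{
	printf("outer allocation made progress: %d\n", toy_allocate());
	return 0;
}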
2189 2189
2190 /* The really slow allocator path where we enter direct reclaim */ 2190 /* The really slow allocator path where we enter direct reclaim */
2191 static inline struct page * 2191 static inline struct page *
2192 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2192 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2193 struct zonelist *zonelist, enum zone_type high_zoneidx, 2193 struct zonelist *zonelist, enum zone_type high_zoneidx,
2194 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2194 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2195 int migratetype, unsigned long *did_some_progress) 2195 int migratetype, unsigned long *did_some_progress)
2196 { 2196 {
2197 struct page *page = NULL; 2197 struct page *page = NULL;
2198 bool drained = false; 2198 bool drained = false;
2199 2199
2200 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2200 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2201 nodemask); 2201 nodemask);
2202 if (unlikely(!(*did_some_progress))) 2202 if (unlikely(!(*did_some_progress)))
2203 return NULL; 2203 return NULL;
2204 2204
2205 /* After successful reclaim, reconsider all zones for allocation */ 2205 /* After successful reclaim, reconsider all zones for allocation */
2206 if (NUMA_BUILD) 2206 if (NUMA_BUILD)
2207 zlc_clear_zones_full(zonelist); 2207 zlc_clear_zones_full(zonelist);
2208 2208
2209 retry: 2209 retry:
2210 page = get_page_from_freelist(gfp_mask, nodemask, order, 2210 page = get_page_from_freelist(gfp_mask, nodemask, order,
2211 zonelist, high_zoneidx, 2211 zonelist, high_zoneidx,
2212 alloc_flags, preferred_zone, 2212 alloc_flags, preferred_zone,
2213 migratetype); 2213 migratetype);
2214 2214
2215 /* 2215 /*
2216 * If an allocation failed after direct reclaim, it could be because 2216 * If an allocation failed after direct reclaim, it could be because
2217 * pages are pinned on the per-cpu lists. Drain them and try again 2217 * pages are pinned on the per-cpu lists. Drain them and try again
2218 */ 2218 */
2219 if (!page && !drained) { 2219 if (!page && !drained) {
2220 drain_all_pages(); 2220 drain_all_pages();
2221 drained = true; 2221 drained = true;
2222 goto retry; 2222 goto retry;
2223 } 2223 }
2224 2224
2225 return page; 2225 return page;
2226 } 2226 }
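The drained flag above buys exactly one extra pass: if the post-reclaim allocation fails, the per-cpu lists are flushed back to the buddy allocator and the freelist is tried once more. A minimal sketch of that one-shot retry pattern is below; the allocation and drain helpers are invented stand-ins.

/*
 * One-shot drain-and-retry pattern from above, as a userspace sketch.
 * toy_try_alloc() fails until toy_drain() returns the "pinned" pages;
 * both helpers are invented for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

static int pcp_pages = 4;	/* pages "stuck" on per-cpu lists */
static int buddy_pages;		/* pages visible to the allocator */

static bool toy_try_alloc(void)
{
	if (buddy_pages > 0) {
		buddy_pages--;
		return true;
	}
	return false;
}

static void toy_drain(void)
{
	buddy_pages += pcp_pages;	/* give pinned pages back */
	pcp_pages = 0;
}

int main(void)
{
	bool drained = false;
	bool ok;

retry:
	ok = toy_try_alloc();
	if (!ok && !drained) {
		toy_drain();
		drained = true;		/* only retry once after draining */
		goto retry;
	}
	printf("allocation %s\n", ok ? "succeeded after drain" : "failed");
	return 0;
}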
2227 2227
2228 /* 2228 /*
2229 * This is called in the allocator slow-path if the allocation request is of 2229 * This is called in the allocator slow-path if the allocation request is of
2230 * sufficient urgency to ignore watermarks and take other desperate measures 2230 * sufficient urgency to ignore watermarks and take other desperate measures
2231 */ 2231 */
2232 static inline struct page * 2232 static inline struct page *
2233 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2233 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2234 struct zonelist *zonelist, enum zone_type high_zoneidx, 2234 struct zonelist *zonelist, enum zone_type high_zoneidx,
2235 nodemask_t *nodemask, struct zone *preferred_zone, 2235 nodemask_t *nodemask, struct zone *preferred_zone,
2236 int migratetype) 2236 int migratetype)
2237 { 2237 {
2238 struct page *page; 2238 struct page *page;
2239 2239
2240 do { 2240 do {
2241 page = get_page_from_freelist(gfp_mask, nodemask, order, 2241 page = get_page_from_freelist(gfp_mask, nodemask, order,
2242 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2242 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2243 preferred_zone, migratetype); 2243 preferred_zone, migratetype);
2244 2244
2245 if (!page && gfp_mask & __GFP_NOFAIL) 2245 if (!page && gfp_mask & __GFP_NOFAIL)
2246 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2246 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2247 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2247 } while (!page && (gfp_mask & __GFP_NOFAIL));
2248 2248
2249 return page; 2249 return page;
2250 } 2250 }
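For __GFP_NOFAIL callers the function above simply loops on the reserves, backing off briefly when the preferred zone is congested. The sketch below shows only that retry-with-backoff shape; usleep() stands in for wait_iff_congested(..., HZ/50) and the allocation helper is invented.

/*
 * Sketch of the __GFP_NOFAIL retry loop above.  usleep() replaces
 * wait_iff_congested() and the allocation helper is a stub that
 * succeeds on the third attempt.
 */
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static int attempts_left = 3;

static bool toy_alloc_no_watermarks(void)
{
	return --attempts_left == 0;	/* "succeeds" on the third try */
}

int main(void)
{
	bool page;

	do {
		page = toy_alloc_no_watermarks();
		if (!page)
			usleep(20000);	/* roughly HZ/50 worth of backoff */
	} while (!page);

	printf("nofail allocation eventually succeeded\n");
	return 0;
}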
2251 2251
2252 static inline 2252 static inline
2253 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2253 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
2254 enum zone_type high_zoneidx, 2254 enum zone_type high_zoneidx,
2255 enum zone_type classzone_idx) 2255 enum zone_type classzone_idx)
2256 { 2256 {
2257 struct zoneref *z; 2257 struct zoneref *z;
2258 struct zone *zone; 2258 struct zone *zone;
2259 2259
2260 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2260 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2261 wakeup_kswapd(zone, order, classzone_idx); 2261 wakeup_kswapd(zone, order, classzone_idx);
2262 } 2262 }
2263 2263
2264 static inline int 2264 static inline int
2265 gfp_to_alloc_flags(gfp_t gfp_mask) 2265 gfp_to_alloc_flags(gfp_t gfp_mask)
2266 { 2266 {
2267 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2267 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2268 const gfp_t wait = gfp_mask & __GFP_WAIT; 2268 const gfp_t wait = gfp_mask & __GFP_WAIT;
2269 2269
2270 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2270 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2271 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2271 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2272 2272
2273 /* 2273 /*
2274 * The caller may dip into page reserves a bit more if the caller 2274 * The caller may dip into page reserves a bit more if the caller
2275 * cannot run direct reclaim, or if the caller has realtime scheduling 2275 * cannot run direct reclaim, or if the caller has realtime scheduling
2276 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2276 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2277 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2277 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
2278 */ 2278 */
2279 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2279 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2280 2280
2281 if (!wait) { 2281 if (!wait) {
2282 /* 2282 /*
2283 * Not worth trying to allocate harder for 2283 * Not worth trying to allocate harder for
2284 * __GFP_NOMEMALLOC even if it can't schedule. 2284 * __GFP_NOMEMALLOC even if it can't schedule.
2285 */ 2285 */
2286 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2286 if (!(gfp_mask & __GFP_NOMEMALLOC))
2287 alloc_flags |= ALLOC_HARDER; 2287 alloc_flags |= ALLOC_HARDER;
2288 /* 2288 /*
2289 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2289 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
2290 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2290 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
2291 */ 2291 */
2292 alloc_flags &= ~ALLOC_CPUSET; 2292 alloc_flags &= ~ALLOC_CPUSET;
2293 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2293 } else if (unlikely(rt_task(current)) && !in_interrupt())
2294 alloc_flags |= ALLOC_HARDER; 2294 alloc_flags |= ALLOC_HARDER;
2295 2295
2296 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2296 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2297 if (gfp_mask & __GFP_MEMALLOC) 2297 if (gfp_mask & __GFP_MEMALLOC)
2298 alloc_flags |= ALLOC_NO_WATERMARKS; 2298 alloc_flags |= ALLOC_NO_WATERMARKS;
2299 else if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt()) 2299 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2300 alloc_flags |= ALLOC_NO_WATERMARKS;
2301 else if (!in_interrupt() &&
2302 ((current->flags & PF_MEMALLOC) ||
2303 unlikely(test_thread_flag(TIF_MEMDIE))))
2300 alloc_flags |= ALLOC_NO_WATERMARKS; 2304 alloc_flags |= ALLOC_NO_WATERMARKS;
2301 } 2305 }
2302 2306
2303 return alloc_flags; 2307 return alloc_flags;
2304 } 2308 }
2305 2309
2306 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2310 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2307 { 2311 {
2308 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2312 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2309 } 2313 }
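The hunk above is the page-allocator side of this patch: a task that is servicing a softirq now qualifies for ALLOC_NO_WATERMARKS when the borrowed task flags carry PF_MEMALLOC, alongside the existing __GFP_MEMALLOC case and the !in_interrupt() PF_MEMALLOC/TIF_MEMDIE case, and gfp_pfmemalloc_allowed() just reports that bit. A hedged userspace model of only this decision is shown below, with the in_interrupt()/in_serving_softirq() tests and the task flags reduced to booleans in an invented context struct.

/*
 * Userspace model of the ALLOC_NO_WATERMARKS decision above.  Context
 * tests and task flags become booleans; this mirrors the logic of the
 * hunk for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_ctx {
	bool gfp_nomemalloc;		/* __GFP_NOMEMALLOC */
	bool gfp_memalloc;		/* __GFP_MEMALLOC */
	bool in_interrupt;		/* hard or soft irq context */
	bool in_serving_softirq;
	bool pf_memalloc;		/* (possibly borrowed) PF_MEMALLOC */
	bool tif_memdie;		/* task selected by the OOM killer */
};

static bool toy_no_watermarks(const struct toy_ctx *c)
{
	if (c->gfp_nomemalloc)
		return false;
	if (c->gfp_memalloc)
		return true;
	if (c->in_serving_softirq && c->pf_memalloc)
		return true;		/* new case: softirq with borrowed flag */
	if (!c->in_interrupt && (c->pf_memalloc || c->tif_memdie))
		return true;
	return false;
}

int main(void)
{
	struct toy_ctx softirq = { .in_interrupt = true,
				   .in_serving_softirq = true,
				   .pf_memalloc = true };
	struct toy_ctx hardirq = { .in_interrupt = true,
				   .pf_memalloc = true };

	printf("softirq + PF_MEMALLOC: %d\n", toy_no_watermarks(&softirq));
	printf("hardirq + PF_MEMALLOC: %d\n", toy_no_watermarks(&hardirq));
	return 0;
}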
2310 2314
2311 static inline struct page * 2315 static inline struct page *
2312 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2316 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2313 struct zonelist *zonelist, enum zone_type high_zoneidx, 2317 struct zonelist *zonelist, enum zone_type high_zoneidx,
2314 nodemask_t *nodemask, struct zone *preferred_zone, 2318 nodemask_t *nodemask, struct zone *preferred_zone,
2315 int migratetype) 2319 int migratetype)
2316 { 2320 {
2317 const gfp_t wait = gfp_mask & __GFP_WAIT; 2321 const gfp_t wait = gfp_mask & __GFP_WAIT;
2318 struct page *page = NULL; 2322 struct page *page = NULL;
2319 int alloc_flags; 2323 int alloc_flags;
2320 unsigned long pages_reclaimed = 0; 2324 unsigned long pages_reclaimed = 0;
2321 unsigned long did_some_progress; 2325 unsigned long did_some_progress;
2322 bool sync_migration = false; 2326 bool sync_migration = false;
2323 bool deferred_compaction = false; 2327 bool deferred_compaction = false;
2324 2328
2325 /* 2329 /*
2326 * In the slowpath, we sanity check order to avoid ever trying to 2330 * In the slowpath, we sanity check order to avoid ever trying to
2327 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2331 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2328 * be using allocators in order of preference for an area that is 2332 * be using allocators in order of preference for an area that is
2329 * too large. 2333 * too large.
2330 */ 2334 */
2331 if (order >= MAX_ORDER) { 2335 if (order >= MAX_ORDER) {
2332 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2336 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2333 return NULL; 2337 return NULL;
2334 } 2338 }
2335 2339
2336 /* 2340 /*
2337 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2341 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2338 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2342 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2339 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2343 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2340 * using a larger set of nodes after it has established that the 2344 * using a larger set of nodes after it has established that the
2341 * allowed per node queues are empty and that nodes are 2345 * allowed per node queues are empty and that nodes are
2342 * over allocated. 2346 * over allocated.
2343 */ 2347 */
2344 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2348 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2345 goto nopage; 2349 goto nopage;
2346 2350
2347 restart: 2351 restart:
2348 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2352 if (!(gfp_mask & __GFP_NO_KSWAPD))
2349 wake_all_kswapd(order, zonelist, high_zoneidx, 2353 wake_all_kswapd(order, zonelist, high_zoneidx,
2350 zone_idx(preferred_zone)); 2354 zone_idx(preferred_zone));
2351 2355
2352 /* 2356 /*
2353 * OK, we're below the kswapd watermark and have kicked background 2357 * OK, we're below the kswapd watermark and have kicked background
2354 * reclaim. Now things get more complex, so set up alloc_flags according 2358 * reclaim. Now things get more complex, so set up alloc_flags according
2355 * to how we want to proceed. 2359 * to how we want to proceed.
2356 */ 2360 */
2357 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2361 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2358 2362
2359 /* 2363 /*
2360 * Find the true preferred zone if the allocation is unconstrained by 2364 * Find the true preferred zone if the allocation is unconstrained by
2361 * cpusets. 2365 * cpusets.
2362 */ 2366 */
2363 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2367 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2364 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2368 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2365 &preferred_zone); 2369 &preferred_zone);
2366 2370
2367 rebalance: 2371 rebalance:
2368 /* This is the last chance, in general, before the goto nopage. */ 2372 /* This is the last chance, in general, before the goto nopage. */
2369 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2373 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2370 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2374 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2371 preferred_zone, migratetype); 2375 preferred_zone, migratetype);
2372 if (page) 2376 if (page)
2373 goto got_pg; 2377 goto got_pg;
2374 2378
2375 /* Allocate without watermarks if the context allows */ 2379 /* Allocate without watermarks if the context allows */
2376 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2380 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2377 page = __alloc_pages_high_priority(gfp_mask, order, 2381 page = __alloc_pages_high_priority(gfp_mask, order,
2378 zonelist, high_zoneidx, nodemask, 2382 zonelist, high_zoneidx, nodemask,
2379 preferred_zone, migratetype); 2383 preferred_zone, migratetype);
2380 if (page) 2384 if (page)
2381 goto got_pg; 2385 goto got_pg;
2382 } 2386 }
2383 2387
2384 /* Atomic allocations - we can't balance anything */ 2388 /* Atomic allocations - we can't balance anything */
2385 if (!wait) 2389 if (!wait)
2386 goto nopage; 2390 goto nopage;
2387 2391
2388 /* Avoid recursion of direct reclaim */ 2392 /* Avoid recursion of direct reclaim */
2389 if (current->flags & PF_MEMALLOC) 2393 if (current->flags & PF_MEMALLOC)
2390 goto nopage; 2394 goto nopage;
2391 2395
2392 /* Avoid allocations with no watermarks from looping endlessly */ 2396 /* Avoid allocations with no watermarks from looping endlessly */
2393 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2397 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2394 goto nopage; 2398 goto nopage;
2395 2399
2396 /* 2400 /*
2397 * Try direct compaction. The first pass is asynchronous. Subsequent 2401 * Try direct compaction. The first pass is asynchronous. Subsequent
2398 * attempts after direct reclaim are synchronous 2402 * attempts after direct reclaim are synchronous
2399 */ 2403 */
2400 page = __alloc_pages_direct_compact(gfp_mask, order, 2404 page = __alloc_pages_direct_compact(gfp_mask, order,
2401 zonelist, high_zoneidx, 2405 zonelist, high_zoneidx,
2402 nodemask, 2406 nodemask,
2403 alloc_flags, preferred_zone, 2407 alloc_flags, preferred_zone,
2404 migratetype, sync_migration, 2408 migratetype, sync_migration,
2405 &deferred_compaction, 2409 &deferred_compaction,
2406 &did_some_progress); 2410 &did_some_progress);
2407 if (page) 2411 if (page)
2408 goto got_pg; 2412 goto got_pg;
2409 sync_migration = true; 2413 sync_migration = true;
2410 2414
2411 /* 2415 /*
2412 * If compaction is deferred for high-order allocations, it is because 2416 * If compaction is deferred for high-order allocations, it is because
2413 * sync compaction recently failed. If this is the case and the caller 2417 * sync compaction recently failed. If this is the case and the caller
2414 * has requested the system not be heavily disrupted, fail the 2418 * has requested the system not be heavily disrupted, fail the
2415 * allocation now instead of entering direct reclaim 2419 * allocation now instead of entering direct reclaim
2416 */ 2420 */
2417 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) 2421 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2418 goto nopage; 2422 goto nopage;
2419 2423
2420 /* Try direct reclaim and then allocating */ 2424 /* Try direct reclaim and then allocating */
2421 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2425 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2422 zonelist, high_zoneidx, 2426 zonelist, high_zoneidx,
2423 nodemask, 2427 nodemask,
2424 alloc_flags, preferred_zone, 2428 alloc_flags, preferred_zone,
2425 migratetype, &did_some_progress); 2429 migratetype, &did_some_progress);
2426 if (page) 2430 if (page)
2427 goto got_pg; 2431 goto got_pg;
2428 2432
2429 /* 2433 /*
2430 * If we failed to make any progress reclaiming, then we are 2434 * If we failed to make any progress reclaiming, then we are
2431 * running out of options and have to consider going OOM 2435 * running out of options and have to consider going OOM
2432 */ 2436 */
2433 if (!did_some_progress) { 2437 if (!did_some_progress) {
2434 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2438 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2435 if (oom_killer_disabled) 2439 if (oom_killer_disabled)
2436 goto nopage; 2440 goto nopage;
2437 /* Coredumps can quickly deplete all memory reserves */ 2441 /* Coredumps can quickly deplete all memory reserves */
2438 if ((current->flags & PF_DUMPCORE) && 2442 if ((current->flags & PF_DUMPCORE) &&
2439 !(gfp_mask & __GFP_NOFAIL)) 2443 !(gfp_mask & __GFP_NOFAIL))
2440 goto nopage; 2444 goto nopage;
2441 page = __alloc_pages_may_oom(gfp_mask, order, 2445 page = __alloc_pages_may_oom(gfp_mask, order,
2442 zonelist, high_zoneidx, 2446 zonelist, high_zoneidx,
2443 nodemask, preferred_zone, 2447 nodemask, preferred_zone,
2444 migratetype); 2448 migratetype);
2445 if (page) 2449 if (page)
2446 goto got_pg; 2450 goto got_pg;
2447 2451
2448 if (!(gfp_mask & __GFP_NOFAIL)) { 2452 if (!(gfp_mask & __GFP_NOFAIL)) {
2449 /* 2453 /*
2450 * The oom killer is not called for high-order 2454 * The oom killer is not called for high-order
2451 * allocations that may fail, so if no progress 2455 * allocations that may fail, so if no progress
2452 * is being made, there are no other options and 2456 * is being made, there are no other options and
2453 * retrying is unlikely to help. 2457 * retrying is unlikely to help.
2454 */ 2458 */
2455 if (order > PAGE_ALLOC_COSTLY_ORDER) 2459 if (order > PAGE_ALLOC_COSTLY_ORDER)
2456 goto nopage; 2460 goto nopage;
2457 /* 2461 /*
2458 * The oom killer is not called for lowmem 2462 * The oom killer is not called for lowmem
2459 * allocations to prevent needlessly killing 2463 * allocations to prevent needlessly killing
2460 * innocent tasks. 2464 * innocent tasks.
2461 */ 2465 */
2462 if (high_zoneidx < ZONE_NORMAL) 2466 if (high_zoneidx < ZONE_NORMAL)
2463 goto nopage; 2467 goto nopage;
2464 } 2468 }
2465 2469
2466 goto restart; 2470 goto restart;
2467 } 2471 }
2468 } 2472 }
2469 2473
2470 /* Check if we should retry the allocation */ 2474 /* Check if we should retry the allocation */
2471 pages_reclaimed += did_some_progress; 2475 pages_reclaimed += did_some_progress;
2472 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2476 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2473 pages_reclaimed)) { 2477 pages_reclaimed)) {
2474 /* Wait for some write requests to complete then retry */ 2478 /* Wait for some write requests to complete then retry */
2475 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2479 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2476 goto rebalance; 2480 goto rebalance;
2477 } else { 2481 } else {
2478 /* 2482 /*
2479 * High-order allocations do not necessarily loop after 2483 * High-order allocations do not necessarily loop after
2480 * direct reclaim and reclaim/compaction depends on compaction 2484 * direct reclaim and reclaim/compaction depends on compaction
2481 * being called after reclaim so call directly if necessary 2485 * being called after reclaim so call directly if necessary
2482 */ 2486 */
2483 page = __alloc_pages_direct_compact(gfp_mask, order, 2487 page = __alloc_pages_direct_compact(gfp_mask, order,
2484 zonelist, high_zoneidx, 2488 zonelist, high_zoneidx,
2485 nodemask, 2489 nodemask,
2486 alloc_flags, preferred_zone, 2490 alloc_flags, preferred_zone,
2487 migratetype, sync_migration, 2491 migratetype, sync_migration,
2488 &deferred_compaction, 2492 &deferred_compaction,
2489 &did_some_progress); 2493 &did_some_progress);
2490 if (page) 2494 if (page)
2491 goto got_pg; 2495 goto got_pg;
2492 } 2496 }
2493 2497
2494 nopage: 2498 nopage:
2495 warn_alloc_failed(gfp_mask, order, NULL); 2499 warn_alloc_failed(gfp_mask, order, NULL);
2496 return page; 2500 return page;
2497 got_pg: 2501 got_pg:
2498 /* 2502 /*
2499 * page->pfmemalloc is set when the caller had PFMEMALLOC set, has 2503 * page->pfmemalloc is set when the caller had PFMEMALLOC set, has
2500 * been OOM killed or specified __GFP_MEMALLOC. The expectation is 2504 * been OOM killed or specified __GFP_MEMALLOC. The expectation is
2501 * that the caller is taking steps that will free more memory. The 2505 * that the caller is taking steps that will free more memory. The
2502 * caller should avoid the page being used for !PFMEMALLOC purposes. 2506 * caller should avoid the page being used for !PFMEMALLOC purposes.
2503 */ 2507 */
2504 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2508 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2505 2509
2506 if (kmemcheck_enabled) 2510 if (kmemcheck_enabled)
2507 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2511 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2508 2512
2509 return page; 2513 return page;
2510 } 2514 }
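A hypothetical consumer-side sketch (not part of this patch) of the page->pfmemalloc contract described in the comment above. Both helper names are invented placeholders, and the snippet assumes struct page from the same kernel tree.

/* Hypothetical consumer of page->pfmemalloc; helpers are placeholders */
static void example_consume_page(struct page *page)
{
	if (page->pfmemalloc) {
		/*
		 * The page was allocated from the emergency reserves; use it
		 * only for work that helps free memory, never for ordinary
		 * caching or speculative allocations.
		 */
		treat_as_emergency_only(page);	/* placeholder */
		return;
	}
	use_page_normally(page);		/* placeholder */
}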
2511 2515
2512 /* 2516 /*
2513 * This is the 'heart' of the zoned buddy allocator. 2517 * This is the 'heart' of the zoned buddy allocator.
2514 */ 2518 */
2515 struct page * 2519 struct page *
2516 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2520 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2517 struct zonelist *zonelist, nodemask_t *nodemask) 2521 struct zonelist *zonelist, nodemask_t *nodemask)
2518 { 2522 {
2519 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2523 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2520 struct zone *preferred_zone; 2524 struct zone *preferred_zone;
2521 struct page *page = NULL; 2525 struct page *page = NULL;
2522 int migratetype = allocflags_to_migratetype(gfp_mask); 2526 int migratetype = allocflags_to_migratetype(gfp_mask);
2523 unsigned int cpuset_mems_cookie; 2527 unsigned int cpuset_mems_cookie;
2524 2528
2525 gfp_mask &= gfp_allowed_mask; 2529 gfp_mask &= gfp_allowed_mask;
2526 2530
2527 lockdep_trace_alloc(gfp_mask); 2531 lockdep_trace_alloc(gfp_mask);
2528 2532
2529 might_sleep_if(gfp_mask & __GFP_WAIT); 2533 might_sleep_if(gfp_mask & __GFP_WAIT);
2530 2534
2531 if (should_fail_alloc_page(gfp_mask, order)) 2535 if (should_fail_alloc_page(gfp_mask, order))
2532 return NULL; 2536 return NULL;
2533 2537
2534 /* 2538 /*
2535 * Check the zones suitable for the gfp_mask contain at least one 2539 * Check the zones suitable for the gfp_mask contain at least one
2536 * valid zone. It's possible to have an empty zonelist as a result 2540 * valid zone. It's possible to have an empty zonelist as a result
2537 * of GFP_THISNODE and a memoryless node 2541 * of GFP_THISNODE and a memoryless node
2538 */ 2542 */
2539 if (unlikely(!zonelist->_zonerefs->zone)) 2543 if (unlikely(!zonelist->_zonerefs->zone))
2540 return NULL; 2544 return NULL;
2541 2545
2542 retry_cpuset: 2546 retry_cpuset:
2543 cpuset_mems_cookie = get_mems_allowed(); 2547 cpuset_mems_cookie = get_mems_allowed();
2544 2548
2545 /* The preferred zone is used for statistics later */ 2549 /* The preferred zone is used for statistics later */
2546 first_zones_zonelist(zonelist, high_zoneidx, 2550 first_zones_zonelist(zonelist, high_zoneidx,
2547 nodemask ? : &cpuset_current_mems_allowed, 2551 nodemask ? : &cpuset_current_mems_allowed,
2548 &preferred_zone); 2552 &preferred_zone);
2549 if (!preferred_zone) 2553 if (!preferred_zone)
2550 goto out; 2554 goto out;
2551 2555
2552 /* First allocation attempt */ 2556 /* First allocation attempt */
2553 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2557 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2554 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, 2558 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
2555 preferred_zone, migratetype); 2559 preferred_zone, migratetype);
2556 if (unlikely(!page)) 2560 if (unlikely(!page))
2557 page = __alloc_pages_slowpath(gfp_mask, order, 2561 page = __alloc_pages_slowpath(gfp_mask, order,
2558 zonelist, high_zoneidx, nodemask, 2562 zonelist, high_zoneidx, nodemask,
2559 preferred_zone, migratetype); 2563 preferred_zone, migratetype);
2560 else 2564 else
2561 page->pfmemalloc = false; 2565 page->pfmemalloc = false;
2562 2566
2563 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2567 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2564 2568
2565 out: 2569 out:
2566 /* 2570 /*
2567 * When updating a task's mems_allowed, it is possible to race with 2571 * When updating a task's mems_allowed, it is possible to race with
2568 * parallel threads in such a way that an allocation can fail while 2572 * parallel threads in such a way that an allocation can fail while
2569 * the mask is being updated. If a page allocation is about to fail, 2573 * the mask is being updated. If a page allocation is about to fail,
2570 * check if the cpuset changed during allocation and if so, retry. 2574 * check if the cpuset changed during allocation and if so, retry.
2571 */ 2575 */
2572 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2576 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2573 goto retry_cpuset; 2577 goto retry_cpuset;
2574 2578
2575 return page; 2579 return page;
2576 } 2580 }
2577 EXPORT_SYMBOL(__alloc_pages_nodemask); 2581 EXPORT_SYMBOL(__alloc_pages_nodemask);
2578 2582
2579 /* 2583 /*
2580 * Common helper functions. 2584 * Common helper functions.
2581 */ 2585 */
2582 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2586 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2583 { 2587 {
2584 struct page *page; 2588 struct page *page;
2585 2589
2586 /* 2590 /*
2587 * __get_free_pages() returns a 32-bit address, which cannot represent 2591 * __get_free_pages() returns a 32-bit address, which cannot represent
2588 * a highmem page 2592 * a highmem page
2589 */ 2593 */
2590 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2594 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2591 2595
2592 page = alloc_pages(gfp_mask, order); 2596 page = alloc_pages(gfp_mask, order);
2593 if (!page) 2597 if (!page)
2594 return 0; 2598 return 0;
2595 return (unsigned long) page_address(page); 2599 return (unsigned long) page_address(page);
2596 } 2600 }
2597 EXPORT_SYMBOL(__get_free_pages); 2601 EXPORT_SYMBOL(__get_free_pages);
2598 2602
2599 unsigned long get_zeroed_page(gfp_t gfp_mask) 2603 unsigned long get_zeroed_page(gfp_t gfp_mask)
2600 { 2604 {
2601 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2605 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2602 } 2606 }
2603 EXPORT_SYMBOL(get_zeroed_page); 2607 EXPORT_SYMBOL(get_zeroed_page);
2604 2608
2605 void __free_pages(struct page *page, unsigned int order) 2609 void __free_pages(struct page *page, unsigned int order)
2606 { 2610 {
2607 if (put_page_testzero(page)) { 2611 if (put_page_testzero(page)) {
2608 if (order == 0) 2612 if (order == 0)
2609 free_hot_cold_page(page, 0); 2613 free_hot_cold_page(page, 0);
2610 else 2614 else
2611 __free_pages_ok(page, order); 2615 __free_pages_ok(page, order);
2612 } 2616 }
2613 } 2617 }
2614 2618
2615 EXPORT_SYMBOL(__free_pages); 2619 EXPORT_SYMBOL(__free_pages);
2616 2620
2617 void free_pages(unsigned long addr, unsigned int order) 2621 void free_pages(unsigned long addr, unsigned int order)
2618 { 2622 {
2619 if (addr != 0) { 2623 if (addr != 0) {
2620 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2624 VM_BUG_ON(!virt_addr_valid((void *)addr));
2621 __free_pages(virt_to_page((void *)addr), order); 2625 __free_pages(virt_to_page((void *)addr), order);
2622 } 2626 }
2623 } 2627 }
2624 2628
2625 EXPORT_SYMBOL(free_pages); 2629 EXPORT_SYMBOL(free_pages);
2626 2630
2627 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2631 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2628 { 2632 {
2629 if (addr) { 2633 if (addr) {
2630 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2634 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2631 unsigned long used = addr + PAGE_ALIGN(size); 2635 unsigned long used = addr + PAGE_ALIGN(size);
2632 2636
2633 split_page(virt_to_page((void *)addr), order); 2637 split_page(virt_to_page((void *)addr), order);
2634 while (used < alloc_end) { 2638 while (used < alloc_end) {
2635 free_page(used); 2639 free_page(used);
2636 used += PAGE_SIZE; 2640 used += PAGE_SIZE;
2637 } 2641 }
2638 } 2642 }
2639 return (void *)addr; 2643 return (void *)addr;
2640 } 2644 }
2641 2645
2642 /** 2646 /**
2643 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 2647 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
2644 * @size: the number of bytes to allocate 2648 * @size: the number of bytes to allocate
2645 * @gfp_mask: GFP flags for the allocation 2649 * @gfp_mask: GFP flags for the allocation
2646 * 2650 *
2647 * This function is similar to alloc_pages(), except that it allocates the 2651 * This function is similar to alloc_pages(), except that it allocates the
2648 * minimum number of pages to satisfy the request. alloc_pages() can only 2652 * minimum number of pages to satisfy the request. alloc_pages() can only
2649 * allocate memory in power-of-two pages. 2653 * allocate memory in power-of-two pages.
2650 * 2654 *
2651 * This function is also limited by MAX_ORDER. 2655 * This function is also limited by MAX_ORDER.
2652 * 2656 *
2653 * Memory allocated by this function must be released by free_pages_exact(). 2657 * Memory allocated by this function must be released by free_pages_exact().
2654 */ 2658 */
2655 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2659 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2656 { 2660 {
2657 unsigned int order = get_order(size); 2661 unsigned int order = get_order(size);
2658 unsigned long addr; 2662 unsigned long addr;
2659 2663
2660 addr = __get_free_pages(gfp_mask, order); 2664 addr = __get_free_pages(gfp_mask, order);
2661 return make_alloc_exact(addr, order, size); 2665 return make_alloc_exact(addr, order, size);
2662 } 2666 }
2663 EXPORT_SYMBOL(alloc_pages_exact); 2667 EXPORT_SYMBOL(alloc_pages_exact);
2664 2668
2665 /** 2669 /**
2666 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2670 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2667 * pages on a node. 2671 * pages on a node.
2668 * @nid: the preferred node ID where memory should be allocated 2672 * @nid: the preferred node ID where memory should be allocated
2669 * @size: the number of bytes to allocate 2673 * @size: the number of bytes to allocate
2670 * @gfp_mask: GFP flags for the allocation 2674 * @gfp_mask: GFP flags for the allocation
2671 * 2675 *
2672 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2676 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2673 * back. 2677 * back.
2674 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2678 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2675 * but is not exact. 2679 * but is not exact.
2676 */ 2680 */
2677 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2681 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2678 { 2682 {
2679 unsigned order = get_order(size); 2683 unsigned order = get_order(size);
2680 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2684 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2681 if (!p) 2685 if (!p)
2682 return NULL; 2686 return NULL;
2683 return make_alloc_exact((unsigned long)page_address(p), order, size); 2687 return make_alloc_exact((unsigned long)page_address(p), order, size);
2684 } 2688 }
2685 EXPORT_SYMBOL(alloc_pages_exact_nid); 2689 EXPORT_SYMBOL(alloc_pages_exact_nid);
2686 2690
2687 /** 2691 /**
2688 * free_pages_exact - release memory allocated via alloc_pages_exact() 2692 * free_pages_exact - release memory allocated via alloc_pages_exact()
2689 * @virt: the value returned by alloc_pages_exact. 2693 * @virt: the value returned by alloc_pages_exact.
2690 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2694 * @size: size of allocation, same value as passed to alloc_pages_exact().
2691 * 2695 *
2692 * Release the memory allocated by a previous call to alloc_pages_exact. 2696 * Release the memory allocated by a previous call to alloc_pages_exact.
2693 */ 2697 */
2694 void free_pages_exact(void *virt, size_t size) 2698 void free_pages_exact(void *virt, size_t size)
2695 { 2699 {
2696 unsigned long addr = (unsigned long)virt; 2700 unsigned long addr = (unsigned long)virt;
2697 unsigned long end = addr + PAGE_ALIGN(size); 2701 unsigned long end = addr + PAGE_ALIGN(size);
2698 2702
2699 while (addr < end) { 2703 while (addr < end) {
2700 free_page(addr); 2704 free_page(addr);
2701 addr += PAGE_SIZE; 2705 addr += PAGE_SIZE;
2702 } 2706 }
2703 } 2707 }
2704 EXPORT_SYMBOL(free_pages_exact); 2708 EXPORT_SYMBOL(free_pages_exact);
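A short usage sketch (not part of the diff) pairing alloc_pages_exact() with free_pages_exact(); the function name and size are made up.

#include <linux/gfp.h>		/* alloc_pages_exact(), free_pages_exact() */
#include <linux/errno.h>

/* Hypothetical caller: ~5.1 pages, which alloc_pages() would round to order-3 */
static int example_exact_alloc(void)
{
	size_t sz = 5 * PAGE_SIZE + 100;
	void *buf = alloc_pages_exact(sz, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	/* ... use sz bytes of physically contiguous, page-aligned memory ... */
	free_pages_exact(buf, sz);	/* must pass the same size back */
	return 0;
}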
2705 2709
2706 static unsigned int nr_free_zone_pages(int offset) 2710 static unsigned int nr_free_zone_pages(int offset)
2707 { 2711 {
2708 struct zoneref *z; 2712 struct zoneref *z;
2709 struct zone *zone; 2713 struct zone *zone;
2710 2714
2711 /* Just pick one node, since fallback list is circular */ 2715 /* Just pick one node, since fallback list is circular */
2712 unsigned int sum = 0; 2716 unsigned int sum = 0;
2713 2717
2714 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2718 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2715 2719
2716 for_each_zone_zonelist(zone, z, zonelist, offset) { 2720 for_each_zone_zonelist(zone, z, zonelist, offset) {
2717 unsigned long size = zone->present_pages; 2721 unsigned long size = zone->present_pages;
2718 unsigned long high = high_wmark_pages(zone); 2722 unsigned long high = high_wmark_pages(zone);
2719 if (size > high) 2723 if (size > high)
2720 sum += size - high; 2724 sum += size - high;
2721 } 2725 }
2722 2726
2723 return sum; 2727 return sum;
2724 } 2728 }
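A worked illustration (hypothetical numbers) of the per-zone sum computed above.

/*
 * Hypothetical numbers: a zone with 262144 present pages and a high
 * watermark of 1024 pages contributes 262144 - 1024 = 261120 pages to
 * the sum; a zone at or below its high watermark contributes nothing.
 */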
2725 2729
2726 /* 2730 /*
2727 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 2731 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
2728 */ 2732 */
2729 unsigned int nr_free_buffer_pages(void) 2733 unsigned int nr_free_buffer_pages(void)
2730 { 2734 {
2731 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2735 return nr_free_zone_pages(gfp_zone(GFP_USER));
2732 } 2736 }
2733 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2737 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2734 2738
2735 /* 2739 /*
2736 * Amount of free RAM allocatable within all zones 2740 * Amount of free RAM allocatable within all zones
2737 */ 2741 */
2738 unsigned int nr_free_pagecache_pages(void) 2742 unsigned int nr_free_pagecache_pages(void)
2739 { 2743 {
2740 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2744 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2741 } 2745 }
2742 2746
2743 static inline void show_node(struct zone *zone) 2747 static inline void show_node(struct zone *zone)
2744 { 2748 {
2745 if (NUMA_BUILD) 2749 if (NUMA_BUILD)
2746 printk("Node %d ", zone_to_nid(zone)); 2750 printk("Node %d ", zone_to_nid(zone));
2747 } 2751 }
2748 2752
2749 void si_meminfo(struct sysinfo *val) 2753 void si_meminfo(struct sysinfo *val)
2750 { 2754 {
2751 val->totalram = totalram_pages; 2755 val->totalram = totalram_pages;
2752 val->sharedram = 0; 2756 val->sharedram = 0;
2753 val->freeram = global_page_state(NR_FREE_PAGES); 2757 val->freeram = global_page_state(NR_FREE_PAGES);
2754 val->bufferram = nr_blockdev_pages(); 2758 val->bufferram = nr_blockdev_pages();
2755 val->totalhigh = totalhigh_pages; 2759 val->totalhigh = totalhigh_pages;
2756 val->freehigh = nr_free_highpages(); 2760 val->freehigh = nr_free_highpages();
2757 val->mem_unit = PAGE_SIZE; 2761 val->mem_unit = PAGE_SIZE;
2758 } 2762 }
2759 2763
2760 EXPORT_SYMBOL(si_meminfo); 2764 EXPORT_SYMBOL(si_meminfo);
2761 2765
2762 #ifdef CONFIG_NUMA 2766 #ifdef CONFIG_NUMA
2763 void si_meminfo_node(struct sysinfo *val, int nid) 2767 void si_meminfo_node(struct sysinfo *val, int nid)
2764 { 2768 {
2765 pg_data_t *pgdat = NODE_DATA(nid); 2769 pg_data_t *pgdat = NODE_DATA(nid);
2766 2770
2767 val->totalram = pgdat->node_present_pages; 2771 val->totalram = pgdat->node_present_pages;
2768 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2772 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2769 #ifdef CONFIG_HIGHMEM 2773 #ifdef CONFIG_HIGHMEM
2770 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 2774 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
2771 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 2775 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2772 NR_FREE_PAGES); 2776 NR_FREE_PAGES);
2773 #else 2777 #else
2774 val->totalhigh = 0; 2778 val->totalhigh = 0;
2775 val->freehigh = 0; 2779 val->freehigh = 0;
2776 #endif 2780 #endif
2777 val->mem_unit = PAGE_SIZE; 2781 val->mem_unit = PAGE_SIZE;
2778 } 2782 }
2779 #endif 2783 #endif
2780 2784
2781 /* 2785 /*
2782 * Determine whether the node should be displayed or not, depending on whether 2786 * Determine whether the node should be displayed or not, depending on whether
2783 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 2787 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2784 */ 2788 */
2785 bool skip_free_areas_node(unsigned int flags, int nid) 2789 bool skip_free_areas_node(unsigned int flags, int nid)
2786 { 2790 {
2787 bool ret = false; 2791 bool ret = false;
2788 unsigned int cpuset_mems_cookie; 2792 unsigned int cpuset_mems_cookie;
2789 2793
2790 if (!(flags & SHOW_MEM_FILTER_NODES)) 2794 if (!(flags & SHOW_MEM_FILTER_NODES))
2791 goto out; 2795 goto out;
2792 2796
2793 do { 2797 do {
2794 cpuset_mems_cookie = get_mems_allowed(); 2798 cpuset_mems_cookie = get_mems_allowed();
2795 ret = !node_isset(nid, cpuset_current_mems_allowed); 2799 ret = !node_isset(nid, cpuset_current_mems_allowed);
2796 } while (!put_mems_allowed(cpuset_mems_cookie)); 2800 } while (!put_mems_allowed(cpuset_mems_cookie));
2797 out: 2801 out:
2798 return ret; 2802 return ret;
2799 } 2803 }
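A minimal sketch (not part of the diff) of the same get_mems_allowed()/put_mems_allowed() cookie pattern used above, for any reader of cpuset_current_mems_allowed; the helper name is hypothetical.

#include <linux/cpuset.h>	/* get_mems_allowed(), put_mems_allowed() */
#include <linux/nodemask.h>

/* Hypothetical helper using the same retry-on-cookie-change pattern */
static bool example_nid_allowed(int nid)
{
	unsigned int cookie;
	bool allowed;

	do {
		cookie = get_mems_allowed();
		allowed = node_isset(nid, cpuset_current_mems_allowed);
	} while (!put_mems_allowed(cookie));

	return allowed;
}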
2800 2804
2801 #define K(x) ((x) << (PAGE_SHIFT-10)) 2805 #define K(x) ((x) << (PAGE_SHIFT-10))
2802 2806
2803 /* 2807 /*
2804 * Show free area list (used inside shift_scroll-lock stuff) 2808 * Show free area list (used inside shift_scroll-lock stuff)
2805 * We also calculate the percentage fragmentation. We do this by counting the 2809 * We also calculate the percentage fragmentation. We do this by counting the
2806 * memory on each free list with the exception of the first item on the list. 2810 * memory on each free list with the exception of the first item on the list.
2807 * Suppresses nodes that are not allowed by current's cpuset if 2811 * Suppresses nodes that are not allowed by current's cpuset if
2808 * SHOW_MEM_FILTER_NODES is passed. 2812 * SHOW_MEM_FILTER_NODES is passed.
2809 */ 2813 */
2810 void show_free_areas(unsigned int filter) 2814 void show_free_areas(unsigned int filter)
2811 { 2815 {
2812 int cpu; 2816 int cpu;
2813 struct zone *zone; 2817 struct zone *zone;
2814 2818
2815 for_each_populated_zone(zone) { 2819 for_each_populated_zone(zone) {
2816 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2820 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2817 continue; 2821 continue;
2818 show_node(zone); 2822 show_node(zone);
2819 printk("%s per-cpu:\n", zone->name); 2823 printk("%s per-cpu:\n", zone->name);
2820 2824
2821 for_each_online_cpu(cpu) { 2825 for_each_online_cpu(cpu) {
2822 struct per_cpu_pageset *pageset; 2826 struct per_cpu_pageset *pageset;
2823 2827
2824 pageset = per_cpu_ptr(zone->pageset, cpu); 2828 pageset = per_cpu_ptr(zone->pageset, cpu);
2825 2829
2826 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2830 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2827 cpu, pageset->pcp.high, 2831 cpu, pageset->pcp.high,
2828 pageset->pcp.batch, pageset->pcp.count); 2832 pageset->pcp.batch, pageset->pcp.count);
2829 } 2833 }
2830 } 2834 }
2831 2835
2832 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 2836 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2833 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 2837 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2834 " unevictable:%lu" 2838 " unevictable:%lu"
2835 " dirty:%lu writeback:%lu unstable:%lu\n" 2839 " dirty:%lu writeback:%lu unstable:%lu\n"
2836 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2840 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2837 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2841 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2838 global_page_state(NR_ACTIVE_ANON), 2842 global_page_state(NR_ACTIVE_ANON),
2839 global_page_state(NR_INACTIVE_ANON), 2843 global_page_state(NR_INACTIVE_ANON),
2840 global_page_state(NR_ISOLATED_ANON), 2844 global_page_state(NR_ISOLATED_ANON),
2841 global_page_state(NR_ACTIVE_FILE), 2845 global_page_state(NR_ACTIVE_FILE),
2842 global_page_state(NR_INACTIVE_FILE), 2846 global_page_state(NR_INACTIVE_FILE),
2843 global_page_state(NR_ISOLATED_FILE), 2847 global_page_state(NR_ISOLATED_FILE),
2844 global_page_state(NR_UNEVICTABLE), 2848 global_page_state(NR_UNEVICTABLE),
2845 global_page_state(NR_FILE_DIRTY), 2849 global_page_state(NR_FILE_DIRTY),
2846 global_page_state(NR_WRITEBACK), 2850 global_page_state(NR_WRITEBACK),
2847 global_page_state(NR_UNSTABLE_NFS), 2851 global_page_state(NR_UNSTABLE_NFS),
2848 global_page_state(NR_FREE_PAGES), 2852 global_page_state(NR_FREE_PAGES),
2849 global_page_state(NR_SLAB_RECLAIMABLE), 2853 global_page_state(NR_SLAB_RECLAIMABLE),
2850 global_page_state(NR_SLAB_UNRECLAIMABLE), 2854 global_page_state(NR_SLAB_UNRECLAIMABLE),
2851 global_page_state(NR_FILE_MAPPED), 2855 global_page_state(NR_FILE_MAPPED),
2852 global_page_state(NR_SHMEM), 2856 global_page_state(NR_SHMEM),
2853 global_page_state(NR_PAGETABLE), 2857 global_page_state(NR_PAGETABLE),
2854 global_page_state(NR_BOUNCE)); 2858 global_page_state(NR_BOUNCE));
2855 2859
2856 for_each_populated_zone(zone) { 2860 for_each_populated_zone(zone) {
2857 int i; 2861 int i;
2858 2862
2859 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2863 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2860 continue; 2864 continue;
2861 show_node(zone); 2865 show_node(zone);
2862 printk("%s" 2866 printk("%s"
2863 " free:%lukB" 2867 " free:%lukB"
2864 " min:%lukB" 2868 " min:%lukB"
2865 " low:%lukB" 2869 " low:%lukB"
2866 " high:%lukB" 2870 " high:%lukB"
2867 " active_anon:%lukB" 2871 " active_anon:%lukB"
2868 " inactive_anon:%lukB" 2872 " inactive_anon:%lukB"
2869 " active_file:%lukB" 2873 " active_file:%lukB"
2870 " inactive_file:%lukB" 2874 " inactive_file:%lukB"
2871 " unevictable:%lukB" 2875 " unevictable:%lukB"
2872 " isolated(anon):%lukB" 2876 " isolated(anon):%lukB"
2873 " isolated(file):%lukB" 2877 " isolated(file):%lukB"
2874 " present:%lukB" 2878 " present:%lukB"
2875 " mlocked:%lukB" 2879 " mlocked:%lukB"
2876 " dirty:%lukB" 2880 " dirty:%lukB"
2877 " writeback:%lukB" 2881 " writeback:%lukB"
2878 " mapped:%lukB" 2882 " mapped:%lukB"
2879 " shmem:%lukB" 2883 " shmem:%lukB"
2880 " slab_reclaimable:%lukB" 2884 " slab_reclaimable:%lukB"
2881 " slab_unreclaimable:%lukB" 2885 " slab_unreclaimable:%lukB"
2882 " kernel_stack:%lukB" 2886 " kernel_stack:%lukB"
2883 " pagetables:%lukB" 2887 " pagetables:%lukB"
2884 " unstable:%lukB" 2888 " unstable:%lukB"
2885 " bounce:%lukB" 2889 " bounce:%lukB"
2886 " writeback_tmp:%lukB" 2890 " writeback_tmp:%lukB"
2887 " pages_scanned:%lu" 2891 " pages_scanned:%lu"
2888 " all_unreclaimable? %s" 2892 " all_unreclaimable? %s"
2889 "\n", 2893 "\n",
2890 zone->name, 2894 zone->name,
2891 K(zone_page_state(zone, NR_FREE_PAGES)), 2895 K(zone_page_state(zone, NR_FREE_PAGES)),
2892 K(min_wmark_pages(zone)), 2896 K(min_wmark_pages(zone)),
2893 K(low_wmark_pages(zone)), 2897 K(low_wmark_pages(zone)),
2894 K(high_wmark_pages(zone)), 2898 K(high_wmark_pages(zone)),
2895 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2899 K(zone_page_state(zone, NR_ACTIVE_ANON)),
2896 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2900 K(zone_page_state(zone, NR_INACTIVE_ANON)),
2897 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2901 K(zone_page_state(zone, NR_ACTIVE_FILE)),
2898 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2902 K(zone_page_state(zone, NR_INACTIVE_FILE)),
2899 K(zone_page_state(zone, NR_UNEVICTABLE)), 2903 K(zone_page_state(zone, NR_UNEVICTABLE)),
2900 K(zone_page_state(zone, NR_ISOLATED_ANON)), 2904 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2901 K(zone_page_state(zone, NR_ISOLATED_FILE)), 2905 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2902 K(zone->present_pages), 2906 K(zone->present_pages),
2903 K(zone_page_state(zone, NR_MLOCK)), 2907 K(zone_page_state(zone, NR_MLOCK)),
2904 K(zone_page_state(zone, NR_FILE_DIRTY)), 2908 K(zone_page_state(zone, NR_FILE_DIRTY)),
2905 K(zone_page_state(zone, NR_WRITEBACK)), 2909 K(zone_page_state(zone, NR_WRITEBACK)),
2906 K(zone_page_state(zone, NR_FILE_MAPPED)), 2910 K(zone_page_state(zone, NR_FILE_MAPPED)),
2907 K(zone_page_state(zone, NR_SHMEM)), 2911 K(zone_page_state(zone, NR_SHMEM)),
2908 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 2912 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2909 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 2913 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2910 zone_page_state(zone, NR_KERNEL_STACK) * 2914 zone_page_state(zone, NR_KERNEL_STACK) *
2911 THREAD_SIZE / 1024, 2915 THREAD_SIZE / 1024,
2912 K(zone_page_state(zone, NR_PAGETABLE)), 2916 K(zone_page_state(zone, NR_PAGETABLE)),
2913 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 2917 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2914 K(zone_page_state(zone, NR_BOUNCE)), 2918 K(zone_page_state(zone, NR_BOUNCE)),
2915 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2919 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2916 zone->pages_scanned, 2920 zone->pages_scanned,
2917 (zone->all_unreclaimable ? "yes" : "no") 2921 (zone->all_unreclaimable ? "yes" : "no")
2918 ); 2922 );
2919 printk("lowmem_reserve[]:"); 2923 printk("lowmem_reserve[]:");
2920 for (i = 0; i < MAX_NR_ZONES; i++) 2924 for (i = 0; i < MAX_NR_ZONES; i++)
2921 printk(" %lu", zone->lowmem_reserve[i]); 2925 printk(" %lu", zone->lowmem_reserve[i]);
2922 printk("\n"); 2926 printk("\n");
2923 } 2927 }
2924 2928
2925 for_each_populated_zone(zone) { 2929 for_each_populated_zone(zone) {
2926 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2930 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2927 2931
2928 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2932 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2929 continue; 2933 continue;
2930 show_node(zone); 2934 show_node(zone);
2931 printk("%s: ", zone->name); 2935 printk("%s: ", zone->name);
2932 2936
2933 spin_lock_irqsave(&zone->lock, flags); 2937 spin_lock_irqsave(&zone->lock, flags);
2934 for (order = 0; order < MAX_ORDER; order++) { 2938 for (order = 0; order < MAX_ORDER; order++) {
2935 nr[order] = zone->free_area[order].nr_free; 2939 nr[order] = zone->free_area[order].nr_free;
2936 total += nr[order] << order; 2940 total += nr[order] << order;
2937 } 2941 }
2938 spin_unlock_irqrestore(&zone->lock, flags); 2942 spin_unlock_irqrestore(&zone->lock, flags);
2939 for (order = 0; order < MAX_ORDER; order++) 2943 for (order = 0; order < MAX_ORDER; order++)
2940 printk("%lu*%lukB ", nr[order], K(1UL) << order); 2944 printk("%lu*%lukB ", nr[order], K(1UL) << order);
2941 printk("= %lukB\n", K(total)); 2945 printk("= %lukB\n", K(total));
2942 } 2946 }
2943 2947
2944 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 2948 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
2945 2949
2946 show_swap_cache_info(); 2950 show_swap_cache_info();
2947 } 2951 }
2948 2952
2949 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 2953 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
2950 { 2954 {
2951 zoneref->zone = zone; 2955 zoneref->zone = zone;
2952 zoneref->zone_idx = zone_idx(zone); 2956 zoneref->zone_idx = zone_idx(zone);
2953 } 2957 }
2954 2958
2955 /* 2959 /*
2956 * Builds allocation fallback zone lists. 2960 * Builds allocation fallback zone lists.
2957 * 2961 *
2958 * Add all populated zones of a node to the zonelist. 2962 * Add all populated zones of a node to the zonelist.
2959 */ 2963 */
2960 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 2964 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
2961 int nr_zones, enum zone_type zone_type) 2965 int nr_zones, enum zone_type zone_type)
2962 { 2966 {
2963 struct zone *zone; 2967 struct zone *zone;
2964 2968
2965 BUG_ON(zone_type >= MAX_NR_ZONES); 2969 BUG_ON(zone_type >= MAX_NR_ZONES);
2966 zone_type++; 2970 zone_type++;
2967 2971
2968 do { 2972 do {
2969 zone_type--; 2973 zone_type--;
2970 zone = pgdat->node_zones + zone_type; 2974 zone = pgdat->node_zones + zone_type;
2971 if (populated_zone(zone)) { 2975 if (populated_zone(zone)) {
2972 zoneref_set_zone(zone, 2976 zoneref_set_zone(zone,
2973 &zonelist->_zonerefs[nr_zones++]); 2977 &zonelist->_zonerefs[nr_zones++]);
2974 check_highest_zone(zone_type); 2978 check_highest_zone(zone_type);
2975 } 2979 }
2976 2980
2977 } while (zone_type); 2981 } while (zone_type);
2978 return nr_zones; 2982 return nr_zones;
2979 } 2983 }
2980 2984
2981 2985
2982 /* 2986 /*
2983 * zonelist_order: 2987 * zonelist_order:
2984 * 0 = automatic detection of better ordering. 2988 * 0 = automatic detection of better ordering.
2985 * 1 = order by ([node] distance, -zonetype) 2989 * 1 = order by ([node] distance, -zonetype)
2986 * 2 = order by (-zonetype, [node] distance) 2990 * 2 = order by (-zonetype, [node] distance)
2987 * 2991 *
2988 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 2992 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
2989 * the same zonelist. So only NUMA can configure this param. 2993 * the same zonelist. So only NUMA can configure this param.
2990 */ 2994 */
2991 #define ZONELIST_ORDER_DEFAULT 0 2995 #define ZONELIST_ORDER_DEFAULT 0
2992 #define ZONELIST_ORDER_NODE 1 2996 #define ZONELIST_ORDER_NODE 1
2993 #define ZONELIST_ORDER_ZONE 2 2997 #define ZONELIST_ORDER_ZONE 2
2994 2998
2995 /* zonelist order in the kernel. 2999 /* zonelist order in the kernel.
2996 * set_zonelist_order() will set this to NODE or ZONE. 3000 * set_zonelist_order() will set this to NODE or ZONE.
2997 */ 3001 */
2998 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3002 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
2999 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3003 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3000 3004
3001 3005
3002 #ifdef CONFIG_NUMA 3006 #ifdef CONFIG_NUMA
3003 /* The value user specified ....changed by config */ 3007 /* The value user specified ....changed by config */
3004 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3008 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3005 /* string for sysctl */ 3009 /* string for sysctl */
3006 #define NUMA_ZONELIST_ORDER_LEN 16 3010 #define NUMA_ZONELIST_ORDER_LEN 16
3007 char numa_zonelist_order[16] = "default"; 3011 char numa_zonelist_order[16] = "default";
3008 3012
3009 /* 3013 /*
3010 * interface for configuring zonelist ordering. 3014 * interface for configuring zonelist ordering.
3011 * command line option "numa_zonelist_order" 3015 * command line option "numa_zonelist_order"
3012 * = "[dD]efault - default, automatic configuration. 3016 * = "[dD]efault - default, automatic configuration.
3013 * = "[nN]ode - order by node locality, then by zone within node 3017 * = "[nN]ode - order by node locality, then by zone within node
3014 * = "[zZ]one - order by zone, then by locality within zone 3018 * = "[zZ]one - order by zone, then by locality within zone
3015 */ 3019 */
3016 3020
3017 static int __parse_numa_zonelist_order(char *s) 3021 static int __parse_numa_zonelist_order(char *s)
3018 { 3022 {
3019 if (*s == 'd' || *s == 'D') { 3023 if (*s == 'd' || *s == 'D') {
3020 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3024 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3021 } else if (*s == 'n' || *s == 'N') { 3025 } else if (*s == 'n' || *s == 'N') {
3022 user_zonelist_order = ZONELIST_ORDER_NODE; 3026 user_zonelist_order = ZONELIST_ORDER_NODE;
3023 } else if (*s == 'z' || *s == 'Z') { 3027 } else if (*s == 'z' || *s == 'Z') {
3024 user_zonelist_order = ZONELIST_ORDER_ZONE; 3028 user_zonelist_order = ZONELIST_ORDER_ZONE;
3025 } else { 3029 } else {
3026 printk(KERN_WARNING 3030 printk(KERN_WARNING
3027 "Ignoring invalid numa_zonelist_order value: " 3031 "Ignoring invalid numa_zonelist_order value: "
3028 "%s\n", s); 3032 "%s\n", s);
3029 return -EINVAL; 3033 return -EINVAL;
3030 } 3034 }
3031 return 0; 3035 return 0;
3032 } 3036 }
3033 3037
3034 static __init int setup_numa_zonelist_order(char *s) 3038 static __init int setup_numa_zonelist_order(char *s)
3035 { 3039 {
3036 int ret; 3040 int ret;
3037 3041
3038 if (!s) 3042 if (!s)
3039 return 0; 3043 return 0;
3040 3044
3041 ret = __parse_numa_zonelist_order(s); 3045 ret = __parse_numa_zonelist_order(s);
3042 if (ret == 0) 3046 if (ret == 0)
3043 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3047 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3044 3048
3045 return ret; 3049 return ret;
3046 } 3050 }
3047 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3051 early_param("numa_zonelist_order", setup_numa_zonelist_order);
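Assumed usage of the option documented above, following standard kernel conventions; it is not shown in this diff.

/*
 * Assumed usage (standard kernel conventions):
 *   numa_zonelist_order=zone                        on the kernel command line
 *   echo zone > /proc/sys/vm/numa_zonelist_order    at runtime, handled by the
 *                                                   sysctl handler below
 */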
3048 3052
3049 /* 3053 /*
3050 * sysctl handler for numa_zonelist_order 3054 * sysctl handler for numa_zonelist_order
3051 */ 3055 */
3052 int numa_zonelist_order_handler(ctl_table *table, int write, 3056 int numa_zonelist_order_handler(ctl_table *table, int write,
3053 void __user *buffer, size_t *length, 3057 void __user *buffer, size_t *length,
3054 loff_t *ppos) 3058 loff_t *ppos)
3055 { 3059 {
3056 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3060 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3057 int ret; 3061 int ret;
3058 static DEFINE_MUTEX(zl_order_mutex); 3062 static DEFINE_MUTEX(zl_order_mutex);
3059 3063
3060 mutex_lock(&zl_order_mutex); 3064 mutex_lock(&zl_order_mutex);
3061 if (write) 3065 if (write)
3062 strcpy(saved_string, (char*)table->data); 3066 strcpy(saved_string, (char*)table->data);
3063 ret = proc_dostring(table, write, buffer, length, ppos); 3067 ret = proc_dostring(table, write, buffer, length, ppos);
3064 if (ret) 3068 if (ret)
3065 goto out; 3069 goto out;
3066 if (write) { 3070 if (write) {
3067 int oldval = user_zonelist_order; 3071 int oldval = user_zonelist_order;
3068 if (__parse_numa_zonelist_order((char*)table->data)) { 3072 if (__parse_numa_zonelist_order((char*)table->data)) {
3069 /* 3073 /*
3070 * bogus value. restore saved string 3074 * bogus value. restore saved string
3071 */ 3075 */
3072 strncpy((char*)table->data, saved_string, 3076 strncpy((char*)table->data, saved_string,
3073 NUMA_ZONELIST_ORDER_LEN); 3077 NUMA_ZONELIST_ORDER_LEN);
3074 user_zonelist_order = oldval; 3078 user_zonelist_order = oldval;
3075 } else if (oldval != user_zonelist_order) { 3079 } else if (oldval != user_zonelist_order) {
3076 mutex_lock(&zonelists_mutex); 3080 mutex_lock(&zonelists_mutex);
3077 build_all_zonelists(NULL, NULL); 3081 build_all_zonelists(NULL, NULL);
3078 mutex_unlock(&zonelists_mutex); 3082 mutex_unlock(&zonelists_mutex);
3079 } 3083 }
3080 } 3084 }
3081 out: 3085 out:
3082 mutex_unlock(&zl_order_mutex); 3086 mutex_unlock(&zl_order_mutex);
3083 return ret; 3087 return ret;
3084 } 3088 }
3085 3089
3086 3090
3087 #define MAX_NODE_LOAD (nr_online_nodes) 3091 #define MAX_NODE_LOAD (nr_online_nodes)
3088 static int node_load[MAX_NUMNODES]; 3092 static int node_load[MAX_NUMNODES];
3089 3093
3090 /** 3094 /**
3091 * find_next_best_node - find the next node that should appear in a given node's fallback list 3095 * find_next_best_node - find the next node that should appear in a given node's fallback list
3092 * @node: node whose fallback list we're appending 3096 * @node: node whose fallback list we're appending
3093 * @used_node_mask: nodemask_t of already used nodes 3097 * @used_node_mask: nodemask_t of already used nodes
3094 * 3098 *
3095 * We use a number of factors to determine which is the next node that should 3099 * We use a number of factors to determine which is the next node that should
3096 * appear on a given node's fallback list. The node should not have appeared 3100 * appear on a given node's fallback list. The node should not have appeared
3097 * already in @node's fallback list, and it should be the next closest node 3101 * already in @node's fallback list, and it should be the next closest node
3098 * according to the distance array (which contains arbitrary distance values 3102 * according to the distance array (which contains arbitrary distance values
3099 * from each node to each node in the system), and should also prefer nodes 3103 * from each node to each node in the system), and should also prefer nodes
3100 * with no CPUs, since presumably they'll have very little allocation pressure 3104 * with no CPUs, since presumably they'll have very little allocation pressure
3101 * on them otherwise. 3105 * on them otherwise.
3102 * It returns -1 if no node is found. 3106 * It returns -1 if no node is found.
3103 */ 3107 */
3104 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3108 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3105 { 3109 {
3106 int n, val; 3110 int n, val;
3107 int min_val = INT_MAX; 3111 int min_val = INT_MAX;
3108 int best_node = -1; 3112 int best_node = -1;
3109 const struct cpumask *tmp = cpumask_of_node(0); 3113 const struct cpumask *tmp = cpumask_of_node(0);
3110 3114
3111 /* Use the local node if we haven't already */ 3115 /* Use the local node if we haven't already */
3112 if (!node_isset(node, *used_node_mask)) { 3116 if (!node_isset(node, *used_node_mask)) {
3113 node_set(node, *used_node_mask); 3117 node_set(node, *used_node_mask);
3114 return node; 3118 return node;
3115 } 3119 }
3116 3120
3117 for_each_node_state(n, N_HIGH_MEMORY) { 3121 for_each_node_state(n, N_HIGH_MEMORY) {
3118 3122
3119 /* Don't want a node to appear more than once */ 3123 /* Don't want a node to appear more than once */
3120 if (node_isset(n, *used_node_mask)) 3124 if (node_isset(n, *used_node_mask))
3121 continue; 3125 continue;
3122 3126
3123 /* Use the distance array to find the distance */ 3127 /* Use the distance array to find the distance */
3124 val = node_distance(node, n); 3128 val = node_distance(node, n);
3125 3129
3126 /* Penalize nodes under us ("prefer the next node") */ 3130 /* Penalize nodes under us ("prefer the next node") */
3127 val += (n < node); 3131 val += (n < node);
3128 3132
3129 /* Give preference to headless and unused nodes */ 3133 /* Give preference to headless and unused nodes */
3130 tmp = cpumask_of_node(n); 3134 tmp = cpumask_of_node(n);
3131 if (!cpumask_empty(tmp)) 3135 if (!cpumask_empty(tmp))
3132 val += PENALTY_FOR_NODE_WITH_CPUS; 3136 val += PENALTY_FOR_NODE_WITH_CPUS;
3133 3137
3134 /* Slight preference for less loaded node */ 3138 /* Slight preference for less loaded node */
3135 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3139 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3136 val += node_load[n]; 3140 val += node_load[n];
3137 3141
3138 if (val < min_val) { 3142 if (val < min_val) {
3139 min_val = val; 3143 min_val = val;
3140 best_node = n; 3144 best_node = n;
3141 } 3145 }
3142 } 3146 }
3143 3147
3144 if (best_node >= 0) 3148 if (best_node >= 0)
3145 node_set(best_node, *used_node_mask); 3149 node_set(best_node, *used_node_mask);
3146 3150
3147 return best_node; 3151 return best_node;
3148 } 3152 }
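A worked illustration (hypothetical values, not from the diff) of how the score above is composed for one candidate node n.

/*
 * Hypothetical values: node_distance(node, n) = 20, n > node (no
 * "prefer the next node" penalty), n has CPUs, node_load[n] = 3:
 *
 *   val  = 20                            distance
 *   val += 0                             n >= node
 *   val += PENALTY_FOR_NODE_WITH_CPUS    candidate has CPUs
 *   val *= MAX_NODE_LOAD * MAX_NUMNODES
 *   val += 3                             node_load[n]
 *
 * The candidate with the smallest val becomes the next fallback node.
 */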
3149 3153
3150 3154
3151 /* 3155 /*
3152 * Build zonelists ordered by node and zones within node. 3156 * Build zonelists ordered by node and zones within node.
3153 * This results in maximum locality--normal zone overflows into local 3157 * This results in maximum locality--normal zone overflows into local
3154 * DMA zone, if any--but risks exhausting DMA zone. 3158 * DMA zone, if any--but risks exhausting DMA zone.
3155 */ 3159 */
3156 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3160 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3157 { 3161 {
3158 int j; 3162 int j;
3159 struct zonelist *zonelist; 3163 struct zonelist *zonelist;
3160 3164
3161 zonelist = &pgdat->node_zonelists[0]; 3165 zonelist = &pgdat->node_zonelists[0];
3162 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3166 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3163 ; 3167 ;
3164 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3168 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3165 MAX_NR_ZONES - 1); 3169 MAX_NR_ZONES - 1);
3166 zonelist->_zonerefs[j].zone = NULL; 3170 zonelist->_zonerefs[j].zone = NULL;
3167 zonelist->_zonerefs[j].zone_idx = 0; 3171 zonelist->_zonerefs[j].zone_idx = 0;
3168 } 3172 }
3169 3173
3170 /* 3174 /*
3171 * Build gfp_thisnode zonelists 3175 * Build gfp_thisnode zonelists
3172 */ 3176 */
3173 static void build_thisnode_zonelists(pg_data_t *pgdat) 3177 static void build_thisnode_zonelists(pg_data_t *pgdat)
3174 { 3178 {
3175 int j; 3179 int j;
3176 struct zonelist *zonelist; 3180 struct zonelist *zonelist;
3177 3181
3178 zonelist = &pgdat->node_zonelists[1]; 3182 zonelist = &pgdat->node_zonelists[1];
3179 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3183 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3180 zonelist->_zonerefs[j].zone = NULL; 3184 zonelist->_zonerefs[j].zone = NULL;
3181 zonelist->_zonerefs[j].zone_idx = 0; 3185 zonelist->_zonerefs[j].zone_idx = 0;
3182 } 3186 }
3183 3187
3184 /* 3188 /*
3185 * Build zonelists ordered by zone and nodes within zones. 3189 * Build zonelists ordered by zone and nodes within zones.
3186 * This results in conserving DMA zone[s] until all Normal memory is 3190 * This results in conserving DMA zone[s] until all Normal memory is
3187 * exhausted, but results in overflowing to remote node while memory 3191 * exhausted, but results in overflowing to remote node while memory
3188 * may still exist in local DMA zone. 3192 * may still exist in local DMA zone.
3189 */ 3193 */
3190 static int node_order[MAX_NUMNODES]; 3194 static int node_order[MAX_NUMNODES];
3191 3195
3192 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3196 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3193 { 3197 {
3194 int pos, j, node; 3198 int pos, j, node;
3195 int zone_type; /* needs to be signed */ 3199 int zone_type; /* needs to be signed */
3196 struct zone *z; 3200 struct zone *z;
3197 struct zonelist *zonelist; 3201 struct zonelist *zonelist;
3198 3202
3199 zonelist = &pgdat->node_zonelists[0]; 3203 zonelist = &pgdat->node_zonelists[0];
3200 pos = 0; 3204 pos = 0;
3201 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3205 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3202 for (j = 0; j < nr_nodes; j++) { 3206 for (j = 0; j < nr_nodes; j++) {
3203 node = node_order[j]; 3207 node = node_order[j];
3204 z = &NODE_DATA(node)->node_zones[zone_type]; 3208 z = &NODE_DATA(node)->node_zones[zone_type];
3205 if (populated_zone(z)) { 3209 if (populated_zone(z)) {
3206 zoneref_set_zone(z, 3210 zoneref_set_zone(z,
3207 &zonelist->_zonerefs[pos++]); 3211 &zonelist->_zonerefs[pos++]);
3208 check_highest_zone(zone_type); 3212 check_highest_zone(zone_type);
3209 } 3213 }
3210 } 3214 }
3211 } 3215 }
3212 zonelist->_zonerefs[pos].zone = NULL; 3216 zonelist->_zonerefs[pos].zone = NULL;
3213 zonelist->_zonerefs[pos].zone_idx = 0; 3217 zonelist->_zonerefs[pos].zone_idx = 0;
3214 } 3218 }
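A hypothetical two-node illustration (not from the diff) of the orderings built by the two helpers above, assuming each node has a Normal and a DMA zone.

/*
 * Zonelist as seen from node 0:
 *   node order (build_zonelists_in_node_order):
 *       N0_Normal, N0_DMA, N1_Normal, N1_DMA
 *   zone order (build_zonelists_in_zone_order):
 *       N0_Normal, N1_Normal, N0_DMA, N1_DMA
 * Node order maximises locality but can exhaust the local DMA zone;
 * zone order conserves DMA zones at the cost of going off-node earlier.
 */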
3215 3219
3216 static int default_zonelist_order(void) 3220 static int default_zonelist_order(void)
3217 { 3221 {
3218 int nid, zone_type; 3222 int nid, zone_type;
3219 unsigned long low_kmem_size,total_size; 3223 unsigned long low_kmem_size,total_size;
3220 struct zone *z; 3224 struct zone *z;
3221 int average_size; 3225 int average_size;
3222 /* 3226 /*
3223 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 3227 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3224 * If they are really small and used heavily, the system can fall 3228 * If they are really small and used heavily, the system can fall
3225 * into OOM very easily. 3229 * into OOM very easily.
3226 * This function detects ZONE_DMA/DMA32 size and configures zone order. 3230 * This function detects ZONE_DMA/DMA32 size and configures zone order.
3227 */ 3231 */
3228 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 3232 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3229 low_kmem_size = 0; 3233 low_kmem_size = 0;
3230 total_size = 0; 3234 total_size = 0;
3231 for_each_online_node(nid) { 3235 for_each_online_node(nid) {
3232 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3236 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3233 z = &NODE_DATA(nid)->node_zones[zone_type]; 3237 z = &NODE_DATA(nid)->node_zones[zone_type];
3234 if (populated_zone(z)) { 3238 if (populated_zone(z)) {
3235 if (zone_type < ZONE_NORMAL) 3239 if (zone_type < ZONE_NORMAL)
3236 low_kmem_size += z->present_pages; 3240 low_kmem_size += z->present_pages;
3237 total_size += z->present_pages; 3241 total_size += z->present_pages;
3238 } else if (zone_type == ZONE_NORMAL) { 3242 } else if (zone_type == ZONE_NORMAL) {
3239 /* 3243 /*
3240 * If any node has only lowmem, then node order 3244 * If any node has only lowmem, then node order
3241 * is preferred to allow kernel allocations 3245 * is preferred to allow kernel allocations
3242 * locally; otherwise, they can easily infringe 3246 * locally; otherwise, they can easily infringe
3243 * on other nodes when there is an abundance of 3247 * on other nodes when there is an abundance of
3244 * lowmem available to allocate from. 3248 * lowmem available to allocate from.
3245 */ 3249 */
3246 return ZONELIST_ORDER_NODE; 3250 return ZONELIST_ORDER_NODE;
3247 } 3251 }
3248 } 3252 }
3249 } 3253 }
3250 if (!low_kmem_size || /* there are no DMA area. */ 3254 if (!low_kmem_size || /* there are no DMA area. */
3251 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3255 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3252 return ZONELIST_ORDER_NODE; 3256 return ZONELIST_ORDER_NODE;
3253 /* 3257 /*
3254 * look into each node's config. 3258 * look into each node's config.
3255 * If there is a node whose DMA/DMA32 memory is very big area on 3259 * If there is a node whose DMA/DMA32 memory is very big area on
3256 * local memory, NODE_ORDER may be suitable. 3260 * local memory, NODE_ORDER may be suitable.
3257 */ 3261 */
3258 average_size = total_size / 3262 average_size = total_size /
3259 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 3263 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
3260 for_each_online_node(nid) { 3264 for_each_online_node(nid) {
3261 low_kmem_size = 0; 3265 low_kmem_size = 0;
3262 total_size = 0; 3266 total_size = 0;
3263 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3267 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3264 z = &NODE_DATA(nid)->node_zones[zone_type]; 3268 z = &NODE_DATA(nid)->node_zones[zone_type];
3265 if (populated_zone(z)) { 3269 if (populated_zone(z)) {
3266 if (zone_type < ZONE_NORMAL) 3270 if (zone_type < ZONE_NORMAL)
3267 low_kmem_size += z->present_pages; 3271 low_kmem_size += z->present_pages;
3268 total_size += z->present_pages; 3272 total_size += z->present_pages;
3269 } 3273 }
3270 } 3274 }
3271 if (low_kmem_size && 3275 if (low_kmem_size &&
3272 total_size > average_size && /* ignore small node */ 3276 total_size > average_size && /* ignore small node */
3273 low_kmem_size > total_size * 70/100) 3277 low_kmem_size > total_size * 70/100)
3274 return ZONELIST_ORDER_NODE; 3278 return ZONELIST_ORDER_NODE;
3275 } 3279 }
3276 return ZONELIST_ORDER_ZONE; 3280 return ZONELIST_ORDER_ZONE;
3277 } 3281 }
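A worked example (hypothetical sizes) of the 70% heuristic applied above.

/*
 * Hypothetical node: 1048576 present pages in total, 786432 of them in
 * DMA/DMA32. 786432 > 1048576 * 70/100 = 734003, so if this node is
 * also larger than the per-node average, node ordering is selected to
 * keep kernel allocations local to their node.
 */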
3278 3282
3279 static void set_zonelist_order(void) 3283 static void set_zonelist_order(void)
3280 { 3284 {
3281 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3285 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3282 current_zonelist_order = default_zonelist_order(); 3286 current_zonelist_order = default_zonelist_order();
3283 else 3287 else
3284 current_zonelist_order = user_zonelist_order; 3288 current_zonelist_order = user_zonelist_order;
3285 } 3289 }
3286 3290
3287 static void build_zonelists(pg_data_t *pgdat) 3291 static void build_zonelists(pg_data_t *pgdat)
3288 { 3292 {
3289 int j, node, load; 3293 int j, node, load;
3290 enum zone_type i; 3294 enum zone_type i;
3291 nodemask_t used_mask; 3295 nodemask_t used_mask;
3292 int local_node, prev_node; 3296 int local_node, prev_node;
3293 struct zonelist *zonelist; 3297 struct zonelist *zonelist;
3294 int order = current_zonelist_order; 3298 int order = current_zonelist_order;
3295 3299
3296 /* initialize zonelists */ 3300 /* initialize zonelists */
3297 for (i = 0; i < MAX_ZONELISTS; i++) { 3301 for (i = 0; i < MAX_ZONELISTS; i++) {
3298 zonelist = pgdat->node_zonelists + i; 3302 zonelist = pgdat->node_zonelists + i;
3299 zonelist->_zonerefs[0].zone = NULL; 3303 zonelist->_zonerefs[0].zone = NULL;
3300 zonelist->_zonerefs[0].zone_idx = 0; 3304 zonelist->_zonerefs[0].zone_idx = 0;
3301 } 3305 }
3302 3306
3303 /* NUMA-aware ordering of nodes */ 3307 /* NUMA-aware ordering of nodes */
3304 local_node = pgdat->node_id; 3308 local_node = pgdat->node_id;
3305 load = nr_online_nodes; 3309 load = nr_online_nodes;
3306 prev_node = local_node; 3310 prev_node = local_node;
3307 nodes_clear(used_mask); 3311 nodes_clear(used_mask);
3308 3312
3309 memset(node_order, 0, sizeof(node_order)); 3313 memset(node_order, 0, sizeof(node_order));
3310 j = 0; 3314 j = 0;
3311 3315
3312 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3316 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3313 int distance = node_distance(local_node, node); 3317 int distance = node_distance(local_node, node);
3314 3318
3315 /* 3319 /*
3316 * If another node is sufficiently far away then it is better 3320 * If another node is sufficiently far away then it is better
3317 * to reclaim pages in a zone before going off node. 3321 * to reclaim pages in a zone before going off node.
3318 */ 3322 */
3319 if (distance > RECLAIM_DISTANCE) 3323 if (distance > RECLAIM_DISTANCE)
3320 zone_reclaim_mode = 1; 3324 zone_reclaim_mode = 1;
3321 3325
3322 /* 3326 /*
3323 * We don't want to pressure a particular node. 3327 * We don't want to pressure a particular node.
3324 * So adding penalty to the first node in same 3328 * So adding penalty to the first node in same
3325 * distance group to make it round-robin. 3329 * distance group to make it round-robin.
3326 */ 3330 */
3327 if (distance != node_distance(local_node, prev_node)) 3331 if (distance != node_distance(local_node, prev_node))
3328 node_load[node] = load; 3332 node_load[node] = load;
3329 3333
3330 prev_node = node; 3334 prev_node = node;
3331 load--; 3335 load--;
3332 if (order == ZONELIST_ORDER_NODE) 3336 if (order == ZONELIST_ORDER_NODE)
3333 build_zonelists_in_node_order(pgdat, node); 3337 build_zonelists_in_node_order(pgdat, node);
3334 else 3338 else
3335 node_order[j++] = node; /* remember order */ 3339 node_order[j++] = node; /* remember order */
3336 } 3340 }
3337 3341
3338 if (order == ZONELIST_ORDER_ZONE) { 3342 if (order == ZONELIST_ORDER_ZONE) {
3339 /* calculate node order -- i.e., DMA last! */ 3343 /* calculate node order -- i.e., DMA last! */
3340 build_zonelists_in_zone_order(pgdat, j); 3344 build_zonelists_in_zone_order(pgdat, j);
3341 } 3345 }
3342 3346
3343 build_thisnode_zonelists(pgdat); 3347 build_thisnode_zonelists(pgdat);
3344 } 3348 }
3345 3349
3346 /* Construct the zonelist performance cache - see further mmzone.h */ 3350 /* Construct the zonelist performance cache - see further mmzone.h */
3347 static void build_zonelist_cache(pg_data_t *pgdat) 3351 static void build_zonelist_cache(pg_data_t *pgdat)
3348 { 3352 {
3349 struct zonelist *zonelist; 3353 struct zonelist *zonelist;
3350 struct zonelist_cache *zlc; 3354 struct zonelist_cache *zlc;
3351 struct zoneref *z; 3355 struct zoneref *z;
3352 3356
3353 zonelist = &pgdat->node_zonelists[0]; 3357 zonelist = &pgdat->node_zonelists[0];
3354 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3358 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3355 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3359 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3356 for (z = zonelist->_zonerefs; z->zone; z++) 3360 for (z = zonelist->_zonerefs; z->zone; z++)
3357 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3361 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3358 } 3362 }
3359 3363
3360 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3364 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3361 /* 3365 /*
3362 * Return node id of node used for "local" allocations. 3366 * Return node id of node used for "local" allocations.
3363 * I.e., first node id of first zone in arg node's generic zonelist. 3367 * I.e., first node id of first zone in arg node's generic zonelist.
3364 * Used for initializing percpu 'numa_mem', which is used primarily 3368 * Used for initializing percpu 'numa_mem', which is used primarily
3365 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3369 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3366 */ 3370 */
3367 int local_memory_node(int node) 3371 int local_memory_node(int node)
3368 { 3372 {
3369 struct zone *zone; 3373 struct zone *zone;
3370 3374
3371 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3375 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3372 gfp_zone(GFP_KERNEL), 3376 gfp_zone(GFP_KERNEL),
3373 NULL, 3377 NULL,
3374 &zone); 3378 &zone);
3375 return zone->node; 3379 return zone->node;
3376 } 3380 }
3377 #endif 3381 #endif
3378 3382
3379 #else /* CONFIG_NUMA */ 3383 #else /* CONFIG_NUMA */
3380 3384
3381 static void set_zonelist_order(void) 3385 static void set_zonelist_order(void)
3382 { 3386 {
3383 current_zonelist_order = ZONELIST_ORDER_ZONE; 3387 current_zonelist_order = ZONELIST_ORDER_ZONE;
3384 } 3388 }
3385 3389
3386 static void build_zonelists(pg_data_t *pgdat) 3390 static void build_zonelists(pg_data_t *pgdat)
3387 { 3391 {
3388 int node, local_node; 3392 int node, local_node;
3389 enum zone_type j; 3393 enum zone_type j;
3390 struct zonelist *zonelist; 3394 struct zonelist *zonelist;
3391 3395
3392 local_node = pgdat->node_id; 3396 local_node = pgdat->node_id;
3393 3397
3394 zonelist = &pgdat->node_zonelists[0]; 3398 zonelist = &pgdat->node_zonelists[0];
3395 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3399 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3396 3400
3397 /* 3401 /*
3398 * Now we build the zonelist so that it contains the zones 3402 * Now we build the zonelist so that it contains the zones
3399 * of all the other nodes. 3403 * of all the other nodes.
3400 * We don't want to pressure a particular node, so when 3404 * We don't want to pressure a particular node, so when
3401 * building the zones for node N, we make sure that the 3405 * building the zones for node N, we make sure that the
3402 * zones coming right after the local ones are those from 3406 * zones coming right after the local ones are those from
3403 * node N+1 (modulo N) 3407 * node N+1 (modulo N)
3404 */ 3408 */
3405 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3409 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3406 if (!node_online(node)) 3410 if (!node_online(node))
3407 continue; 3411 continue;
3408 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3412 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3409 MAX_NR_ZONES - 1); 3413 MAX_NR_ZONES - 1);
3410 } 3414 }
3411 for (node = 0; node < local_node; node++) { 3415 for (node = 0; node < local_node; node++) {
3412 if (!node_online(node)) 3416 if (!node_online(node))
3413 continue; 3417 continue;
3414 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3418 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3415 MAX_NR_ZONES - 1); 3419 MAX_NR_ZONES - 1);
3416 } 3420 }
3417 3421
3418 zonelist->_zonerefs[j].zone = NULL; 3422 zonelist->_zonerefs[j].zone = NULL;
3419 zonelist->_zonerefs[j].zone_idx = 0; 3423 zonelist->_zonerefs[j].zone_idx = 0;
3420 } 3424 }
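
As a sanity check on the interleaving in the non-NUMA build_zonelists() above, the standalone userspace sketch below (the node count and the always-true online test are made up for illustration) prints the order in which remote nodes are appended after the local node: N+1, N+2, ..., wrapping around to 0, ..., N-1.

#include <stdio.h>

#define MAX_NUMNODES 8          /* hypothetical node count for the example */

static int node_online(int node)
{
        return 1;               /* pretend every node is online */
}

int main(void)
{
        int local_node = 3;     /* pretend we build the zonelist for node 3 */
        int node;

        printf("zonelist order for node %d:", local_node);
        printf(" %d", local_node);              /* local zones come first */
        for (node = local_node + 1; node < MAX_NUMNODES; node++)
                if (node_online(node))
                        printf(" %d", node);    /* N+1 .. MAX_NUMNODES-1 */
        for (node = 0; node < local_node; node++)
                if (node_online(node))
                        printf(" %d", node);    /* wrap around: 0 .. N-1 */
        printf("\n");                           /* prints: 3 4 5 6 7 0 1 2 */
        return 0;
}
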
3421 3425
3422 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3426 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3423 static void build_zonelist_cache(pg_data_t *pgdat) 3427 static void build_zonelist_cache(pg_data_t *pgdat)
3424 { 3428 {
3425 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3429 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3426 } 3430 }
3427 3431
3428 #endif /* CONFIG_NUMA */ 3432 #endif /* CONFIG_NUMA */
3429 3433
3430 /* 3434 /*
3431 * Boot pageset table. One per cpu which is going to be used for all 3435 * Boot pageset table. One per cpu which is going to be used for all
3432 * zones and all nodes. The parameters will be set in such a way 3436 * zones and all nodes. The parameters will be set in such a way
3433 * that an item put on a list will immediately be handed over to 3437 * that an item put on a list will immediately be handed over to
3434 * the buddy list. This is safe since pageset manipulation is done 3438 * the buddy list. This is safe since pageset manipulation is done
3435 * with interrupts disabled. 3439 * with interrupts disabled.
3436 * 3440 *
3437 * The boot_pagesets must be kept even after bootup is complete for 3441 * The boot_pagesets must be kept even after bootup is complete for
3438 * unused processors and/or zones. They do play a role for bootstrapping 3442 * unused processors and/or zones. They do play a role for bootstrapping
3439 * hotplugged processors. 3443 * hotplugged processors.
3440 * 3444 *
3441 * zoneinfo_show() and maybe other functions do 3445 * zoneinfo_show() and maybe other functions do
3442 * not check if the processor is online before following the pageset pointer. 3446 * not check if the processor is online before following the pageset pointer.
3443 * Other parts of the kernel may not check if the zone is available. 3447 * Other parts of the kernel may not check if the zone is available.
3444 */ 3448 */
3445 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3449 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3446 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3450 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3447 static void setup_zone_pageset(struct zone *zone); 3451 static void setup_zone_pageset(struct zone *zone);
3448 3452
3449 /* 3453 /*
3450 * Global mutex to protect against size modification of zonelists 3454 * Global mutex to protect against size modification of zonelists
3451 * as well as to serialize pageset setup for the new populated zone. 3455 * as well as to serialize pageset setup for the new populated zone.
3452 */ 3456 */
3453 DEFINE_MUTEX(zonelists_mutex); 3457 DEFINE_MUTEX(zonelists_mutex);
3454 3458
3455 /* the int return value is just for stop_machine() */ 3459 /* the int return value is just for stop_machine() */
3456 static int __build_all_zonelists(void *data) 3460 static int __build_all_zonelists(void *data)
3457 { 3461 {
3458 int nid; 3462 int nid;
3459 int cpu; 3463 int cpu;
3460 pg_data_t *self = data; 3464 pg_data_t *self = data;
3461 3465
3462 #ifdef CONFIG_NUMA 3466 #ifdef CONFIG_NUMA
3463 memset(node_load, 0, sizeof(node_load)); 3467 memset(node_load, 0, sizeof(node_load));
3464 #endif 3468 #endif
3465 3469
3466 if (self && !node_online(self->node_id)) { 3470 if (self && !node_online(self->node_id)) {
3467 build_zonelists(self); 3471 build_zonelists(self);
3468 build_zonelist_cache(self); 3472 build_zonelist_cache(self);
3469 } 3473 }
3470 3474
3471 for_each_online_node(nid) { 3475 for_each_online_node(nid) {
3472 pg_data_t *pgdat = NODE_DATA(nid); 3476 pg_data_t *pgdat = NODE_DATA(nid);
3473 3477
3474 build_zonelists(pgdat); 3478 build_zonelists(pgdat);
3475 build_zonelist_cache(pgdat); 3479 build_zonelist_cache(pgdat);
3476 } 3480 }
3477 3481
3478 /* 3482 /*
3479 * Initialize the boot_pagesets that are going to be used 3483 * Initialize the boot_pagesets that are going to be used
3480 * for bootstrapping processors. The real pagesets for 3484 * for bootstrapping processors. The real pagesets for
3481 * each zone will be allocated later when the per cpu 3485 * each zone will be allocated later when the per cpu
3482 * allocator is available. 3486 * allocator is available.
3483 * 3487 *
3484 * boot_pagesets are used also for bootstrapping offline 3488 * boot_pagesets are used also for bootstrapping offline
3485 * cpus if the system is already booted because the pagesets 3489 * cpus if the system is already booted because the pagesets
3486 * are needed to initialize allocators on a specific cpu too. 3490 * are needed to initialize allocators on a specific cpu too.
3487 * F.e. the percpu allocator needs the page allocator which 3491 * F.e. the percpu allocator needs the page allocator which
3488 * needs the percpu allocator in order to allocate its pagesets 3492 * needs the percpu allocator in order to allocate its pagesets
3489 * (a chicken-egg dilemma). 3493 * (a chicken-egg dilemma).
3490 */ 3494 */
3491 for_each_possible_cpu(cpu) { 3495 for_each_possible_cpu(cpu) {
3492 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3496 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3493 3497
3494 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3498 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3495 /* 3499 /*
3496 * We now know the "local memory node" for each node-- 3500 * We now know the "local memory node" for each node--
3497 * i.e., the node of the first zone in the generic zonelist. 3501 * i.e., the node of the first zone in the generic zonelist.
3498 * Set up numa_mem percpu variable for on-line cpus. During 3502 * Set up numa_mem percpu variable for on-line cpus. During
3499 * boot, only the boot cpu should be on-line; we'll init the 3503 * boot, only the boot cpu should be on-line; we'll init the
3500 * secondary cpus' numa_mem as they come on-line. During 3504 * secondary cpus' numa_mem as they come on-line. During
3501 * node/memory hotplug, we'll fixup all on-line cpus. 3505 * node/memory hotplug, we'll fixup all on-line cpus.
3502 */ 3506 */
3503 if (cpu_online(cpu)) 3507 if (cpu_online(cpu))
3504 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3508 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3505 #endif 3509 #endif
3506 } 3510 }
3507 3511
3508 return 0; 3512 return 0;
3509 } 3513 }
3510 3514
3511 /* 3515 /*
3512 * Called with zonelists_mutex held always 3516 * Called with zonelists_mutex held always
3513 * unless system_state == SYSTEM_BOOTING. 3517 * unless system_state == SYSTEM_BOOTING.
3514 */ 3518 */
3515 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3519 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3516 { 3520 {
3517 set_zonelist_order(); 3521 set_zonelist_order();
3518 3522
3519 if (system_state == SYSTEM_BOOTING) { 3523 if (system_state == SYSTEM_BOOTING) {
3520 __build_all_zonelists(NULL); 3524 __build_all_zonelists(NULL);
3521 mminit_verify_zonelist(); 3525 mminit_verify_zonelist();
3522 cpuset_init_current_mems_allowed(); 3526 cpuset_init_current_mems_allowed();
3523 } else { 3527 } else {
3524 /* we have to stop all cpus to guarantee there is no user 3528 /* we have to stop all cpus to guarantee there is no user
3525 of zonelist */ 3529 of zonelist */
3526 #ifdef CONFIG_MEMORY_HOTPLUG 3530 #ifdef CONFIG_MEMORY_HOTPLUG
3527 if (zone) 3531 if (zone)
3528 setup_zone_pageset(zone); 3532 setup_zone_pageset(zone);
3529 #endif 3533 #endif
3530 stop_machine(__build_all_zonelists, pgdat, NULL); 3534 stop_machine(__build_all_zonelists, pgdat, NULL);
3531 /* cpuset refresh routine should be here */ 3535 /* cpuset refresh routine should be here */
3532 } 3536 }
3533 vm_total_pages = nr_free_pagecache_pages(); 3537 vm_total_pages = nr_free_pagecache_pages();
3534 /* 3538 /*
3535 * Disable grouping by mobility if the number of pages in the 3539 * Disable grouping by mobility if the number of pages in the
3536 * system is too low to allow the mechanism to work. It would be 3540 * system is too low to allow the mechanism to work. It would be
3537 * more accurate, but expensive to check per-zone. This check is 3541 * more accurate, but expensive to check per-zone. This check is
3538 * made on memory-hotadd so a system can start with mobility 3542 * made on memory-hotadd so a system can start with mobility
3539 * disabled and enable it later 3543 * disabled and enable it later
3540 */ 3544 */
3541 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3545 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3542 page_group_by_mobility_disabled = 1; 3546 page_group_by_mobility_disabled = 1;
3543 else 3547 else
3544 page_group_by_mobility_disabled = 0; 3548 page_group_by_mobility_disabled = 0;
3545 3549
3546 printk("Built %i zonelists in %s order, mobility grouping %s. " 3550 printk("Built %i zonelists in %s order, mobility grouping %s. "
3547 "Total pages: %ld\n", 3551 "Total pages: %ld\n",
3548 nr_online_nodes, 3552 nr_online_nodes,
3549 zonelist_order_name[current_zonelist_order], 3553 zonelist_order_name[current_zonelist_order],
3550 page_group_by_mobility_disabled ? "off" : "on", 3554 page_group_by_mobility_disabled ? "off" : "on",
3551 vm_total_pages); 3555 vm_total_pages);
3552 #ifdef CONFIG_NUMA 3556 #ifdef CONFIG_NUMA
3553 printk("Policy zone: %s\n", zone_names[policy_zone]); 3557 printk("Policy zone: %s\n", zone_names[policy_zone]);
3554 #endif 3558 #endif
3555 } 3559 }
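
The mobility-grouping cutoff above (vm_total_pages < pageblock_nr_pages * MIGRATE_TYPES) is small in practice. A rough standalone calculation is sketched below; the 4K page size, order-9 pageblocks and the migrate-type count of five are assumptions for a typical x86-64 build without CMA, not values taken from this file.

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096;                 /* assumed */
        unsigned long pageblock_order = 9;              /* assumed: 2MB blocks */
        unsigned long pageblock_nr_pages = 1UL << pageblock_order;
        unsigned long migrate_types = 5;                /* assumed count */

        unsigned long threshold = pageblock_nr_pages * migrate_types;

        printf("grouping by mobility is disabled below %lu pages (%lu KB)\n",
               threshold, threshold * page_size / 1024);
        return 0;
}
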
3556 3560
3557 /* 3561 /*
3558 * Helper functions to size the waitqueue hash table. 3562 * Helper functions to size the waitqueue hash table.
3559 * Essentially these want to choose hash table sizes sufficiently 3563 * Essentially these want to choose hash table sizes sufficiently
3560 * large so that collisions trying to wait on pages are rare. 3564 * large so that collisions trying to wait on pages are rare.
3561 * But in fact, the number of active page waitqueues on typical 3565 * But in fact, the number of active page waitqueues on typical
3562 * systems is ridiculously low, less than 200. So this is even 3566 * systems is ridiculously low, less than 200. So this is even
3563 * conservative, even though it seems large. 3567 * conservative, even though it seems large.
3564 * 3568 *
3565 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3569 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3566 * waitqueues, i.e. the size of the waitq table given the number of pages. 3570 * waitqueues, i.e. the size of the waitq table given the number of pages.
3567 */ 3571 */
3568 #define PAGES_PER_WAITQUEUE 256 3572 #define PAGES_PER_WAITQUEUE 256
3569 3573
3570 #ifndef CONFIG_MEMORY_HOTPLUG 3574 #ifndef CONFIG_MEMORY_HOTPLUG
3571 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3575 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3572 { 3576 {
3573 unsigned long size = 1; 3577 unsigned long size = 1;
3574 3578
3575 pages /= PAGES_PER_WAITQUEUE; 3579 pages /= PAGES_PER_WAITQUEUE;
3576 3580
3577 while (size < pages) 3581 while (size < pages)
3578 size <<= 1; 3582 size <<= 1;
3579 3583
3580 /* 3584 /*
3581 * Once we have dozens or even hundreds of threads sleeping 3585 * Once we have dozens or even hundreds of threads sleeping
3582 * on IO we've got bigger problems than wait queue collision. 3586 * on IO we've got bigger problems than wait queue collision.
3583 * Limit the size of the wait table to a reasonable size. 3587 * Limit the size of the wait table to a reasonable size.
3584 */ 3588 */
3585 size = min(size, 4096UL); 3589 size = min(size, 4096UL);
3586 3590
3587 return max(size, 4UL); 3591 return max(size, 4UL);
3588 } 3592 }
3589 #else 3593 #else
3590 /* 3594 /*
3591 * A zone's size might be changed by hot-add, so it is not possible to determine 3595 * A zone's size might be changed by hot-add, so it is not possible to determine
3592 * a suitable size for its wait_table. So we use the maximum size now. 3596 * a suitable size for its wait_table. So we use the maximum size now.
3593 * 3597 *
3594 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3598 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3595 * 3599 *
3596 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3600 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3597 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3601 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3598 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3602 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3599 * 3603 *
3600 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3604 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3601 * or more by the traditional way. (See above). It equals: 3605 * or more by the traditional way. (See above). It equals:
3602 * 3606 *
3603 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3607 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3604 * ia64(16K page size) : = ( 8G + 4M)byte. 3608 * ia64(16K page size) : = ( 8G + 4M)byte.
3605 * powerpc (64K page size) : = (32G +16M)byte. 3609 * powerpc (64K page size) : = (32G +16M)byte.
3606 */ 3610 */
3607 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3611 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3608 { 3612 {
3609 return 4096UL; 3613 return 4096UL;
3610 } 3614 }
3611 #endif 3615 #endif
3612 3616
3613 /* 3617 /*
3614 * This is an integer logarithm so that shifts can be used later 3618 * This is an integer logarithm so that shifts can be used later
3615 * to extract the more random high bits from the multiplicative 3619 * to extract the more random high bits from the multiplicative
3616 * hash function before the remainder is taken. 3620 * hash function before the remainder is taken.
3617 */ 3621 */
3618 static inline unsigned long wait_table_bits(unsigned long size) 3622 static inline unsigned long wait_table_bits(unsigned long size)
3619 { 3623 {
3620 return ffz(~size); 3624 return ffz(~size);
3621 } 3625 }
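
Taken together, the two helpers above amount to: round pages / PAGES_PER_WAITQUEUE up to a power of two, clamp the result to [4, 4096], then take its log2. A userspace restatement of the same arithmetic is sketched below; __builtin_ctzl stands in for ffz(~size), which gives the same answer when size is a power of two.

#include <stdio.h>

static unsigned long wait_table_entries(unsigned long pages)
{
        unsigned long size = 1;

        pages /= 256;                   /* PAGES_PER_WAITQUEUE */
        while (size < pages)
                size <<= 1;             /* round up to a power of two */
        if (size > 4096UL)
                size = 4096UL;          /* cap the table size */
        if (size < 4UL)
                size = 4UL;             /* but keep at least 4 entries */
        return size;
}

int main(void)
{
        unsigned long pages = 1048576UL;        /* e.g. a 4GB zone of 4K pages */
        unsigned long size = wait_table_entries(pages);

        /* For a power of two, ffz(~size) is simply the index of its set bit. */
        printf("%lu pages -> %lu entries, %d bits\n",
               pages, size, __builtin_ctzl(size));
        return 0;
}
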
3622 3626
3623 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3627 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3624 3628
3625 /* 3629 /*
3626 * Check if a pageblock contains reserved pages 3630 * Check if a pageblock contains reserved pages
3627 */ 3631 */
3628 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3632 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3629 { 3633 {
3630 unsigned long pfn; 3634 unsigned long pfn;
3631 3635
3632 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3636 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3633 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3637 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3634 return 1; 3638 return 1;
3635 } 3639 }
3636 return 0; 3640 return 0;
3637 } 3641 }
3638 3642
3639 /* 3643 /*
3640 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3644 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3641 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3645 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3642 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3646 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3643 * higher will lead to a bigger reserve which will get freed as contiguous 3647 * higher will lead to a bigger reserve which will get freed as contiguous
3644 * blocks as reclaim kicks in 3648 * blocks as reclaim kicks in
3645 */ 3649 */
3646 static void setup_zone_migrate_reserve(struct zone *zone) 3650 static void setup_zone_migrate_reserve(struct zone *zone)
3647 { 3651 {
3648 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3652 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3649 struct page *page; 3653 struct page *page;
3650 unsigned long block_migratetype; 3654 unsigned long block_migratetype;
3651 int reserve; 3655 int reserve;
3652 3656
3653 /* 3657 /*
3654 * Get the start pfn, end pfn and the number of blocks to reserve 3658 * Get the start pfn, end pfn and the number of blocks to reserve
3655 * We have to be careful to be aligned to pageblock_nr_pages to 3659 * We have to be careful to be aligned to pageblock_nr_pages to
3656 * make sure that we always check pfn_valid for the first page in 3660 * make sure that we always check pfn_valid for the first page in
3657 * the block. 3661 * the block.
3658 */ 3662 */
3659 start_pfn = zone->zone_start_pfn; 3663 start_pfn = zone->zone_start_pfn;
3660 end_pfn = start_pfn + zone->spanned_pages; 3664 end_pfn = start_pfn + zone->spanned_pages;
3661 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3665 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3662 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3666 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3663 pageblock_order; 3667 pageblock_order;
3664 3668
3665 /* 3669 /*
3666 * Reserve blocks are generally in place to help high-order atomic 3670 * Reserve blocks are generally in place to help high-order atomic
3667 * allocations that are short-lived. A min_free_kbytes value that 3671 * allocations that are short-lived. A min_free_kbytes value that
3668 * would result in more than 2 reserve blocks for atomic allocations 3672 * would result in more than 2 reserve blocks for atomic allocations
3669 * is assumed to be in place to help anti-fragmentation for the 3673 * is assumed to be in place to help anti-fragmentation for the
3670 * future allocation of hugepages at runtime. 3674 * future allocation of hugepages at runtime.
3671 */ 3675 */
3672 reserve = min(2, reserve); 3676 reserve = min(2, reserve);
3673 3677
3674 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3678 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3675 if (!pfn_valid(pfn)) 3679 if (!pfn_valid(pfn))
3676 continue; 3680 continue;
3677 page = pfn_to_page(pfn); 3681 page = pfn_to_page(pfn);
3678 3682
3679 /* Watch out for overlapping nodes */ 3683 /* Watch out for overlapping nodes */
3680 if (page_to_nid(page) != zone_to_nid(zone)) 3684 if (page_to_nid(page) != zone_to_nid(zone))
3681 continue; 3685 continue;
3682 3686
3683 block_migratetype = get_pageblock_migratetype(page); 3687 block_migratetype = get_pageblock_migratetype(page);
3684 3688
3685 /* Only test what is necessary when the reserves are not met */ 3689 /* Only test what is necessary when the reserves are not met */
3686 if (reserve > 0) { 3690 if (reserve > 0) {
3687 /* 3691 /*
3688 * Blocks with reserved pages will never free, skip 3692 * Blocks with reserved pages will never free, skip
3689 * them. 3693 * them.
3690 */ 3694 */
3691 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3695 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3692 if (pageblock_is_reserved(pfn, block_end_pfn)) 3696 if (pageblock_is_reserved(pfn, block_end_pfn))
3693 continue; 3697 continue;
3694 3698
3695 /* If this block is reserved, account for it */ 3699 /* If this block is reserved, account for it */
3696 if (block_migratetype == MIGRATE_RESERVE) { 3700 if (block_migratetype == MIGRATE_RESERVE) {
3697 reserve--; 3701 reserve--;
3698 continue; 3702 continue;
3699 } 3703 }
3700 3704
3701 /* Suitable for reserving if this block is movable */ 3705 /* Suitable for reserving if this block is movable */
3702 if (block_migratetype == MIGRATE_MOVABLE) { 3706 if (block_migratetype == MIGRATE_MOVABLE) {
3703 set_pageblock_migratetype(page, 3707 set_pageblock_migratetype(page,
3704 MIGRATE_RESERVE); 3708 MIGRATE_RESERVE);
3705 move_freepages_block(zone, page, 3709 move_freepages_block(zone, page,
3706 MIGRATE_RESERVE); 3710 MIGRATE_RESERVE);
3707 reserve--; 3711 reserve--;
3708 continue; 3712 continue;
3709 } 3713 }
3710 } 3714 }
3711 3715
3712 /* 3716 /*
3713 * If the reserve is met and this is a previous reserved block, 3717 * If the reserve is met and this is a previous reserved block,
3714 * take it back 3718 * take it back
3715 */ 3719 */
3716 if (block_migratetype == MIGRATE_RESERVE) { 3720 if (block_migratetype == MIGRATE_RESERVE) {
3717 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3721 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3718 move_freepages_block(zone, page, MIGRATE_MOVABLE); 3722 move_freepages_block(zone, page, MIGRATE_MOVABLE);
3719 } 3723 }
3720 } 3724 }
3721 } 3725 }
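
The reserve count chosen above is just the zone's min watermark expressed in whole pageblocks, capped at two. A worked standalone example follows; the 1500-page watermark and the order-9 (512-page) pageblocks are invented for illustration.

#include <stdio.h>

int main(void)
{
        unsigned long min_wmark_pages = 1500;           /* assumed watermark */
        unsigned long pageblock_order = 9;              /* assumed block order */
        unsigned long pageblock_nr_pages = 1UL << pageblock_order;
        unsigned long reserve;

        /* roundup(min_wmark_pages, pageblock_nr_pages) >> pageblock_order */
        reserve = (min_wmark_pages + pageblock_nr_pages - 1) /
                  pageblock_nr_pages;

        if (reserve > 2)
                reserve = 2;    /* never more than two blocks for atomics */

        printf("reserve = %lu pageblock(s)\n", reserve);        /* 2 here */
        return 0;
}
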
3722 3726
3723 /* 3727 /*
3724 * Initially all pages are reserved - free ones are freed 3728 * Initially all pages are reserved - free ones are freed
3725 * up by free_all_bootmem() once the early boot process is 3729 * up by free_all_bootmem() once the early boot process is
3726 * done. Non-atomic initialization, single-pass. 3730 * done. Non-atomic initialization, single-pass.
3727 */ 3731 */
3728 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 3732 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3729 unsigned long start_pfn, enum memmap_context context) 3733 unsigned long start_pfn, enum memmap_context context)
3730 { 3734 {
3731 struct page *page; 3735 struct page *page;
3732 unsigned long end_pfn = start_pfn + size; 3736 unsigned long end_pfn = start_pfn + size;
3733 unsigned long pfn; 3737 unsigned long pfn;
3734 struct zone *z; 3738 struct zone *z;
3735 3739
3736 if (highest_memmap_pfn < end_pfn - 1) 3740 if (highest_memmap_pfn < end_pfn - 1)
3737 highest_memmap_pfn = end_pfn - 1; 3741 highest_memmap_pfn = end_pfn - 1;
3738 3742
3739 z = &NODE_DATA(nid)->node_zones[zone]; 3743 z = &NODE_DATA(nid)->node_zones[zone];
3740 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3744 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3741 /* 3745 /*
3742 * There can be holes in boot-time mem_map[]s 3746 * There can be holes in boot-time mem_map[]s
3743 * handed to this function. They do not 3747 * handed to this function. They do not
3744 * exist on hotplugged memory. 3748 * exist on hotplugged memory.
3745 */ 3749 */
3746 if (context == MEMMAP_EARLY) { 3750 if (context == MEMMAP_EARLY) {
3747 if (!early_pfn_valid(pfn)) 3751 if (!early_pfn_valid(pfn))
3748 continue; 3752 continue;
3749 if (!early_pfn_in_nid(pfn, nid)) 3753 if (!early_pfn_in_nid(pfn, nid))
3750 continue; 3754 continue;
3751 } 3755 }
3752 page = pfn_to_page(pfn); 3756 page = pfn_to_page(pfn);
3753 set_page_links(page, zone, nid, pfn); 3757 set_page_links(page, zone, nid, pfn);
3754 mminit_verify_page_links(page, zone, nid, pfn); 3758 mminit_verify_page_links(page, zone, nid, pfn);
3755 init_page_count(page); 3759 init_page_count(page);
3756 reset_page_mapcount(page); 3760 reset_page_mapcount(page);
3757 SetPageReserved(page); 3761 SetPageReserved(page);
3758 /* 3762 /*
3759 * Mark the block movable so that blocks are reserved for 3763 * Mark the block movable so that blocks are reserved for
3760 * movable at startup. This will force kernel allocations 3764 * movable at startup. This will force kernel allocations
3761 * to reserve their blocks rather than leaking throughout 3765 * to reserve their blocks rather than leaking throughout
3762 * the address space during boot when many long-lived 3766 * the address space during boot when many long-lived
3763 * kernel allocations are made. Later some blocks near 3767 * kernel allocations are made. Later some blocks near
3764 * the start are marked MIGRATE_RESERVE by 3768 * the start are marked MIGRATE_RESERVE by
3765 * setup_zone_migrate_reserve() 3769 * setup_zone_migrate_reserve()
3766 * 3770 *
3767 * The bitmap is created for the zone's valid pfn range, but the memmap 3771 * The bitmap is created for the zone's valid pfn range, but the memmap
3768 * can be created for invalid pages (for alignment). 3772 * can be created for invalid pages (for alignment).
3769 * Check here so that set_pageblock_migratetype() is not called against a 3773 * Check here so that set_pageblock_migratetype() is not called against a
3770 * pfn out of zone. 3774 * pfn out of zone.
3771 */ 3775 */
3772 if ((z->zone_start_pfn <= pfn) 3776 if ((z->zone_start_pfn <= pfn)
3773 && (pfn < z->zone_start_pfn + z->spanned_pages) 3777 && (pfn < z->zone_start_pfn + z->spanned_pages)
3774 && !(pfn & (pageblock_nr_pages - 1))) 3778 && !(pfn & (pageblock_nr_pages - 1)))
3775 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3779 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3776 3780
3777 INIT_LIST_HEAD(&page->lru); 3781 INIT_LIST_HEAD(&page->lru);
3778 #ifdef WANT_PAGE_VIRTUAL 3782 #ifdef WANT_PAGE_VIRTUAL
3779 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 3783 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
3780 if (!is_highmem_idx(zone)) 3784 if (!is_highmem_idx(zone))
3781 set_page_address(page, __va(pfn << PAGE_SHIFT)); 3785 set_page_address(page, __va(pfn << PAGE_SHIFT));
3782 #endif 3786 #endif
3783 } 3787 }
3784 } 3788 }
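
The migratetype above is set only once per pageblock: because pageblock_nr_pages is a power of two, pfn & (pageblock_nr_pages - 1) is zero exactly at pageblock boundaries. A quick standalone illustration, assuming 512-page blocks:

#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr_pages = 512; /* assumed order-9 blocks */
        unsigned long pfn;

        for (pfn = 1020; pfn <= 1030; pfn++)
                if (!(pfn & (pageblock_nr_pages - 1)))
                        printf("pfn %lu starts a pageblock\n", pfn);
        return 0;                               /* only pfn 1024 matches */
}
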
3785 3789
3786 static void __meminit zone_init_free_lists(struct zone *zone) 3790 static void __meminit zone_init_free_lists(struct zone *zone)
3787 { 3791 {
3788 int order, t; 3792 int order, t;
3789 for_each_migratetype_order(order, t) { 3793 for_each_migratetype_order(order, t) {
3790 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 3794 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3791 zone->free_area[order].nr_free = 0; 3795 zone->free_area[order].nr_free = 0;
3792 } 3796 }
3793 } 3797 }
3794 3798
3795 #ifndef __HAVE_ARCH_MEMMAP_INIT 3799 #ifndef __HAVE_ARCH_MEMMAP_INIT
3796 #define memmap_init(size, nid, zone, start_pfn) \ 3800 #define memmap_init(size, nid, zone, start_pfn) \
3797 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3801 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3798 #endif 3802 #endif
3799 3803
3800 static int __meminit zone_batchsize(struct zone *zone) 3804 static int __meminit zone_batchsize(struct zone *zone)
3801 { 3805 {
3802 #ifdef CONFIG_MMU 3806 #ifdef CONFIG_MMU
3803 int batch; 3807 int batch;
3804 3808
3805 /* 3809 /*
3806 * The per-cpu-pages pools are set to around 1000th of the 3810 * The per-cpu-pages pools are set to around 1000th of the
3807 * size of the zone. But no more than 1/2 of a meg. 3811 * size of the zone. But no more than 1/2 of a meg.
3808 * 3812 *
3809 * OK, so we don't know how big the cache is. So guess. 3813 * OK, so we don't know how big the cache is. So guess.
3810 */ 3814 */
3811 batch = zone->present_pages / 1024; 3815 batch = zone->present_pages / 1024;
3812 if (batch * PAGE_SIZE > 512 * 1024) 3816 if (batch * PAGE_SIZE > 512 * 1024)
3813 batch = (512 * 1024) / PAGE_SIZE; 3817 batch = (512 * 1024) / PAGE_SIZE;
3814 batch /= 4; /* We effectively *= 4 below */ 3818 batch /= 4; /* We effectively *= 4 below */
3815 if (batch < 1) 3819 if (batch < 1)
3816 batch = 1; 3820 batch = 1;
3817 3821
3818 /* 3822 /*
3819 * Clamp the batch to a 2^n - 1 value. Having a power 3823 * Clamp the batch to a 2^n - 1 value. Having a power
3820 * of 2 value was found to be more likely to have 3824 * of 2 value was found to be more likely to have
3821 * suboptimal cache aliasing properties in some cases. 3825 * suboptimal cache aliasing properties in some cases.
3822 * 3826 *
3823 * For example if 2 tasks are alternately allocating 3827 * For example if 2 tasks are alternately allocating
3824 * batches of pages, one task can end up with a lot 3828 * batches of pages, one task can end up with a lot
3825 * of pages of one half of the possible page colors 3829 * of pages of one half of the possible page colors
3826 * and the other with pages of the other colors. 3830 * and the other with pages of the other colors.
3827 */ 3831 */
3828 batch = rounddown_pow_of_two(batch + batch/2) - 1; 3832 batch = rounddown_pow_of_two(batch + batch/2) - 1;
3829 3833
3830 return batch; 3834 return batch;
3831 3835
3832 #else 3836 #else
3833 /* The deferral and batching of frees should be suppressed under NOMMU 3837 /* The deferral and batching of frees should be suppressed under NOMMU
3834 * conditions. 3838 * conditions.
3835 * 3839 *
3836 * The problem is that NOMMU needs to be able to allocate large chunks 3840 * The problem is that NOMMU needs to be able to allocate large chunks
3837 * of contiguous memory as there's no hardware page translation to 3841 * of contiguous memory as there's no hardware page translation to
3838 * assemble apparent contiguous memory from discontiguous pages. 3842 * assemble apparent contiguous memory from discontiguous pages.
3839 * 3843 *
3840 * Queueing large contiguous runs of pages for batching, however, 3844 * Queueing large contiguous runs of pages for batching, however,
3841 * causes the pages to actually be freed in smaller chunks. As there 3845 * causes the pages to actually be freed in smaller chunks. As there
3842 * can be a significant delay between the individual batches being 3846 * can be a significant delay between the individual batches being
3843 * recycled, this leads to the once large chunks of space being 3847 * recycled, this leads to the once large chunks of space being
3844 * fragmented and becoming unavailable for high-order allocations. 3848 * fragmented and becoming unavailable for high-order allocations.
3845 */ 3849 */
3846 return 0; 3850 return 0;
3847 #endif 3851 #endif
3848 } 3852 }
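
The MMU case above boils down to: take roughly 1/1024th of the zone, cap it at 512KB worth of pages, quarter it, then round down to one below a power of two. A standalone restatement of the same arithmetic, with an arbitrarily chosen 2GB zone of 4K pages:

#include <stdio.h>

/* Round down to the nearest power of two (for x > 0). */
static unsigned long rounddown_pow_of_two(unsigned long x)
{
        while (x & (x - 1))
                x &= x - 1;     /* clear low bits until one bit remains */
        return x;
}

int main(void)
{
        unsigned long page_size = 4096;
        unsigned long present_pages = (2UL << 30) / page_size;  /* 2GB zone */
        unsigned long batch;

        batch = present_pages / 1024;                /* ~1/1000th of the zone */
        if (batch * page_size > 512 * 1024)
                batch = (512 * 1024) / page_size;    /* cap at half a meg */
        batch /= 4;                                  /* quartered, see above */
        if (batch < 1)
                batch = 1;

        batch = rounddown_pow_of_two(batch + batch / 2) - 1;

        printf("batch = %lu pages\n", batch);        /* 31 for this example */
        return 0;
}
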
3849 3853
3850 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3854 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3851 { 3855 {
3852 struct per_cpu_pages *pcp; 3856 struct per_cpu_pages *pcp;
3853 int migratetype; 3857 int migratetype;
3854 3858
3855 memset(p, 0, sizeof(*p)); 3859 memset(p, 0, sizeof(*p));
3856 3860
3857 pcp = &p->pcp; 3861 pcp = &p->pcp;
3858 pcp->count = 0; 3862 pcp->count = 0;
3859 pcp->high = 6 * batch; 3863 pcp->high = 6 * batch;
3860 pcp->batch = max(1UL, 1 * batch); 3864 pcp->batch = max(1UL, 1 * batch);
3861 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 3865 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3862 INIT_LIST_HEAD(&pcp->lists[migratetype]); 3866 INIT_LIST_HEAD(&pcp->lists[migratetype]);
3863 } 3867 }
3864 3868
3865 /* 3869 /*
3866 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 3870 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
3867 * to the value high for the pageset p. 3871 * to the value high for the pageset p.
3868 */ 3872 */
3869 3873
3870 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 3874 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3871 unsigned long high) 3875 unsigned long high)
3872 { 3876 {
3873 struct per_cpu_pages *pcp; 3877 struct per_cpu_pages *pcp;
3874 3878
3875 pcp = &p->pcp; 3879 pcp = &p->pcp;
3876 pcp->high = high; 3880 pcp->high = high;
3877 pcp->batch = max(1UL, high/4); 3881 pcp->batch = max(1UL, high/4);
3878 if ((high/4) > (PAGE_SHIFT * 8)) 3882 if ((high/4) > (PAGE_SHIFT * 8))
3879 pcp->batch = PAGE_SHIFT * 8; 3883 pcp->batch = PAGE_SHIFT * 8;
3880 } 3884 }
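
Combined with setup_pageset() above there are two sizing regimes: by default high = 6 * batch, and when percpu_pagelist_fraction is configured, high = present_pages / fraction with batch = high / 4 capped at PAGE_SHIFT * 8. A standalone sketch of both, using invented zone and batch values:

#include <stdio.h>

int main(void)
{
        unsigned long page_shift = 12;                  /* 4K pages assumed */
        unsigned long zone_present_pages = 262144;      /* hypothetical 1GB zone */
        unsigned long batch = 31;                       /* as if from zone_batchsize() */
        unsigned long percpu_pagelist_fraction = 8;     /* hypothetical sysctl value */
        unsigned long high, frac_batch;

        /* Default sizing, as in setup_pageset(). */
        high = 6 * batch;
        printf("default:    high=%lu batch=%lu\n", high, batch);

        /* Sizing as in setup_pagelist_highmark() when the sysctl is set. */
        high = zone_present_pages / percpu_pagelist_fraction;
        frac_batch = high / 4;
        if (frac_batch < 1)
                frac_batch = 1;
        if (frac_batch > page_shift * 8)
                frac_batch = page_shift * 8;            /* capped at 96 here */
        printf("fraction=8: high=%lu batch=%lu\n", high, frac_batch);
        return 0;
}
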
3881 3885
3882 static void __meminit setup_zone_pageset(struct zone *zone) 3886 static void __meminit setup_zone_pageset(struct zone *zone)
3883 { 3887 {
3884 int cpu; 3888 int cpu;
3885 3889
3886 zone->pageset = alloc_percpu(struct per_cpu_pageset); 3890 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3887 3891
3888 for_each_possible_cpu(cpu) { 3892 for_each_possible_cpu(cpu) {
3889 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 3893 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3890 3894
3891 setup_pageset(pcp, zone_batchsize(zone)); 3895 setup_pageset(pcp, zone_batchsize(zone));
3892 3896
3893 if (percpu_pagelist_fraction) 3897 if (percpu_pagelist_fraction)
3894 setup_pagelist_highmark(pcp, 3898 setup_pagelist_highmark(pcp,
3895 (zone->present_pages / 3899 (zone->present_pages /
3896 percpu_pagelist_fraction)); 3900 percpu_pagelist_fraction));
3897 } 3901 }
3898 } 3902 }
3899 3903
3900 /* 3904 /*
3901 * Allocate per cpu pagesets and initialize them. 3905 * Allocate per cpu pagesets and initialize them.
3902 * Before this call only boot pagesets were available. 3906 * Before this call only boot pagesets were available.
3903 */ 3907 */
3904 void __init setup_per_cpu_pageset(void) 3908 void __init setup_per_cpu_pageset(void)
3905 { 3909 {
3906 struct zone *zone; 3910 struct zone *zone;
3907 3911
3908 for_each_populated_zone(zone) 3912 for_each_populated_zone(zone)
3909 setup_zone_pageset(zone); 3913 setup_zone_pageset(zone);
3910 } 3914 }
3911 3915
3912 static noinline __init_refok 3916 static noinline __init_refok
3913 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3917 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3914 { 3918 {
3915 int i; 3919 int i;
3916 struct pglist_data *pgdat = zone->zone_pgdat; 3920 struct pglist_data *pgdat = zone->zone_pgdat;
3917 size_t alloc_size; 3921 size_t alloc_size;
3918 3922
3919 /* 3923 /*
3920 * The per-page waitqueue mechanism uses hashed waitqueues 3924 * The per-page waitqueue mechanism uses hashed waitqueues
3921 * per zone. 3925 * per zone.
3922 */ 3926 */
3923 zone->wait_table_hash_nr_entries = 3927 zone->wait_table_hash_nr_entries =
3924 wait_table_hash_nr_entries(zone_size_pages); 3928 wait_table_hash_nr_entries(zone_size_pages);
3925 zone->wait_table_bits = 3929 zone->wait_table_bits =
3926 wait_table_bits(zone->wait_table_hash_nr_entries); 3930 wait_table_bits(zone->wait_table_hash_nr_entries);
3927 alloc_size = zone->wait_table_hash_nr_entries 3931 alloc_size = zone->wait_table_hash_nr_entries
3928 * sizeof(wait_queue_head_t); 3932 * sizeof(wait_queue_head_t);
3929 3933
3930 if (!slab_is_available()) { 3934 if (!slab_is_available()) {
3931 zone->wait_table = (wait_queue_head_t *) 3935 zone->wait_table = (wait_queue_head_t *)
3932 alloc_bootmem_node_nopanic(pgdat, alloc_size); 3936 alloc_bootmem_node_nopanic(pgdat, alloc_size);
3933 } else { 3937 } else {
3934 /* 3938 /*
3935 * This case means that a zone whose size was 0 gets new memory 3939 * This case means that a zone whose size was 0 gets new memory
3936 * via memory hot-add. 3940 * via memory hot-add.
3937 * But it may be the case that a new node was hot-added. In 3941 * But it may be the case that a new node was hot-added. In
3938 * this case vmalloc() will not be able to use this new node's 3942 * this case vmalloc() will not be able to use this new node's
3939 * memory - this wait_table must be initialized to use this new 3943 * memory - this wait_table must be initialized to use this new
3940 * node itself as well. 3944 * node itself as well.
3941 * To use this new node's memory, further consideration will be 3945 * To use this new node's memory, further consideration will be
3942 * necessary. 3946 * necessary.
3943 */ 3947 */
3944 zone->wait_table = vmalloc(alloc_size); 3948 zone->wait_table = vmalloc(alloc_size);
3945 } 3949 }
3946 if (!zone->wait_table) 3950 if (!zone->wait_table)
3947 return -ENOMEM; 3951 return -ENOMEM;
3948 3952
3949 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 3953 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
3950 init_waitqueue_head(zone->wait_table + i); 3954 init_waitqueue_head(zone->wait_table + i);
3951 3955
3952 return 0; 3956 return 0;
3953 } 3957 }
3954 3958
3955 static __meminit void zone_pcp_init(struct zone *zone) 3959 static __meminit void zone_pcp_init(struct zone *zone)
3956 { 3960 {
3957 /* 3961 /*
3958 * per cpu subsystem is not up at this point. The following code 3962 * per cpu subsystem is not up at this point. The following code
3959 * relies on the ability of the linker to provide the 3963 * relies on the ability of the linker to provide the
3960 * offset of a (static) per cpu variable into the per cpu area. 3964 * offset of a (static) per cpu variable into the per cpu area.
3961 */ 3965 */
3962 zone->pageset = &boot_pageset; 3966 zone->pageset = &boot_pageset;
3963 3967
3964 if (zone->present_pages) 3968 if (zone->present_pages)
3965 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 3969 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3966 zone->name, zone->present_pages, 3970 zone->name, zone->present_pages,
3967 zone_batchsize(zone)); 3971 zone_batchsize(zone));
3968 } 3972 }
3969 3973
3970 int __meminit init_currently_empty_zone(struct zone *zone, 3974 int __meminit init_currently_empty_zone(struct zone *zone,
3971 unsigned long zone_start_pfn, 3975 unsigned long zone_start_pfn,
3972 unsigned long size, 3976 unsigned long size,
3973 enum memmap_context context) 3977 enum memmap_context context)
3974 { 3978 {
3975 struct pglist_data *pgdat = zone->zone_pgdat; 3979 struct pglist_data *pgdat = zone->zone_pgdat;
3976 int ret; 3980 int ret;
3977 ret = zone_wait_table_init(zone, size); 3981 ret = zone_wait_table_init(zone, size);
3978 if (ret) 3982 if (ret)
3979 return ret; 3983 return ret;
3980 pgdat->nr_zones = zone_idx(zone) + 1; 3984 pgdat->nr_zones = zone_idx(zone) + 1;
3981 3985
3982 zone->zone_start_pfn = zone_start_pfn; 3986 zone->zone_start_pfn = zone_start_pfn;
3983 3987
3984 mminit_dprintk(MMINIT_TRACE, "memmap_init", 3988 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3985 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 3989 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
3986 pgdat->node_id, 3990 pgdat->node_id,
3987 (unsigned long)zone_idx(zone), 3991 (unsigned long)zone_idx(zone),
3988 zone_start_pfn, (zone_start_pfn + size)); 3992 zone_start_pfn, (zone_start_pfn + size));
3989 3993
3990 zone_init_free_lists(zone); 3994 zone_init_free_lists(zone);
3991 3995
3992 return 0; 3996 return 0;
3993 } 3997 }
3994 3998
3995 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 3999 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
3996 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4000 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
3997 /* 4001 /*
3998 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4002 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
3999 * Architectures may implement their own version but if add_active_range() 4003 * Architectures may implement their own version but if add_active_range()
4000 * was used and there are no special requirements, this is a convenient 4004 * was used and there are no special requirements, this is a convenient
4001 * alternative 4005 * alternative
4002 */ 4006 */
4003 int __meminit __early_pfn_to_nid(unsigned long pfn) 4007 int __meminit __early_pfn_to_nid(unsigned long pfn)
4004 { 4008 {
4005 unsigned long start_pfn, end_pfn; 4009 unsigned long start_pfn, end_pfn;
4006 int i, nid; 4010 int i, nid;
4007 4011
4008 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4012 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4009 if (start_pfn <= pfn && pfn < end_pfn) 4013 if (start_pfn <= pfn && pfn < end_pfn)
4010 return nid; 4014 return nid;
4011 /* This is a memory hole */ 4015 /* This is a memory hole */
4012 return -1; 4016 return -1;
4013 } 4017 }
4014 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4018 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4015 4019
4016 int __meminit early_pfn_to_nid(unsigned long pfn) 4020 int __meminit early_pfn_to_nid(unsigned long pfn)
4017 { 4021 {
4018 int nid; 4022 int nid;
4019 4023
4020 nid = __early_pfn_to_nid(pfn); 4024 nid = __early_pfn_to_nid(pfn);
4021 if (nid >= 0) 4025 if (nid >= 0)
4022 return nid; 4026 return nid;
4023 /* just returns 0 */ 4027 /* just returns 0 */
4024 return 0; 4028 return 0;
4025 } 4029 }
4026 4030
4027 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4031 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4028 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4032 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4029 { 4033 {
4030 int nid; 4034 int nid;
4031 4035
4032 nid = __early_pfn_to_nid(pfn); 4036 nid = __early_pfn_to_nid(pfn);
4033 if (nid >= 0 && nid != node) 4037 if (nid >= 0 && nid != node)
4034 return false; 4038 return false;
4035 return true; 4039 return true;
4036 } 4040 }
4037 #endif 4041 #endif
4038 4042
4039 /** 4043 /**
4040 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4044 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4041 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4045 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4042 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4046 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4043 * 4047 *
4044 * If an architecture guarantees that all ranges registered with 4048 * If an architecture guarantees that all ranges registered with
4045 * add_active_ranges() contain no holes and may be freed, 4049 * add_active_ranges() contain no holes and may be freed,
4046 * this function may be used instead of calling free_bootmem() manually. 4050 * this function may be used instead of calling free_bootmem() manually.
4047 */ 4051 */
4048 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4052 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4049 { 4053 {
4050 unsigned long start_pfn, end_pfn; 4054 unsigned long start_pfn, end_pfn;
4051 int i, this_nid; 4055 int i, this_nid;
4052 4056
4053 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4057 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4054 start_pfn = min(start_pfn, max_low_pfn); 4058 start_pfn = min(start_pfn, max_low_pfn);
4055 end_pfn = min(end_pfn, max_low_pfn); 4059 end_pfn = min(end_pfn, max_low_pfn);
4056 4060
4057 if (start_pfn < end_pfn) 4061 if (start_pfn < end_pfn)
4058 free_bootmem_node(NODE_DATA(this_nid), 4062 free_bootmem_node(NODE_DATA(this_nid),
4059 PFN_PHYS(start_pfn), 4063 PFN_PHYS(start_pfn),
4060 (end_pfn - start_pfn) << PAGE_SHIFT); 4064 (end_pfn - start_pfn) << PAGE_SHIFT);
4061 } 4065 }
4062 } 4066 }
4063 4067
4064 /** 4068 /**
4065 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4069 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4066 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4070 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4067 * 4071 *
4068 * If an architecture guarantees that all ranges registered with 4072 * If an architecture guarantees that all ranges registered with
4069 * add_active_ranges() contain no holes and may be freed, this 4073 * add_active_ranges() contain no holes and may be freed, this
4070 * function may be used instead of calling memory_present() manually. 4074 * function may be used instead of calling memory_present() manually.
4071 */ 4075 */
4072 void __init sparse_memory_present_with_active_regions(int nid) 4076 void __init sparse_memory_present_with_active_regions(int nid)
4073 { 4077 {
4074 unsigned long start_pfn, end_pfn; 4078 unsigned long start_pfn, end_pfn;
4075 int i, this_nid; 4079 int i, this_nid;
4076 4080
4077 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4081 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4078 memory_present(this_nid, start_pfn, end_pfn); 4082 memory_present(this_nid, start_pfn, end_pfn);
4079 } 4083 }
4080 4084
4081 /** 4085 /**
4082 * get_pfn_range_for_nid - Return the start and end page frames for a node 4086 * get_pfn_range_for_nid - Return the start and end page frames for a node
4083 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4087 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4084 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4088 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4085 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4089 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4086 * 4090 *
4087 * It returns the start and end page frame of a node based on information 4091 * It returns the start and end page frame of a node based on information
4088 * provided by an arch calling add_active_range(). If called for a node 4092 * provided by an arch calling add_active_range(). If called for a node
4089 * with no available memory, a warning is printed and the start and end 4093 * with no available memory, a warning is printed and the start and end
4090 * PFNs will be 0. 4094 * PFNs will be 0.
4091 */ 4095 */
4092 void __meminit get_pfn_range_for_nid(unsigned int nid, 4096 void __meminit get_pfn_range_for_nid(unsigned int nid,
4093 unsigned long *start_pfn, unsigned long *end_pfn) 4097 unsigned long *start_pfn, unsigned long *end_pfn)
4094 { 4098 {
4095 unsigned long this_start_pfn, this_end_pfn; 4099 unsigned long this_start_pfn, this_end_pfn;
4096 int i; 4100 int i;
4097 4101
4098 *start_pfn = -1UL; 4102 *start_pfn = -1UL;
4099 *end_pfn = 0; 4103 *end_pfn = 0;
4100 4104
4101 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4105 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4102 *start_pfn = min(*start_pfn, this_start_pfn); 4106 *start_pfn = min(*start_pfn, this_start_pfn);
4103 *end_pfn = max(*end_pfn, this_end_pfn); 4107 *end_pfn = max(*end_pfn, this_end_pfn);
4104 } 4108 }
4105 4109
4106 if (*start_pfn == -1UL) 4110 if (*start_pfn == -1UL)
4107 *start_pfn = 0; 4111 *start_pfn = 0;
4108 } 4112 }
4109 4113
4110 /* 4114 /*
4111 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4115 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4112 * assumption is made that zones within a node are ordered in monotonically 4116 * assumption is made that zones within a node are ordered in monotonically
4113 * increasing memory addresses so that the "highest" populated zone is used 4117 * increasing memory addresses so that the "highest" populated zone is used
4114 */ 4118 */
4115 static void __init find_usable_zone_for_movable(void) 4119 static void __init find_usable_zone_for_movable(void)
4116 { 4120 {
4117 int zone_index; 4121 int zone_index;
4118 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4122 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4119 if (zone_index == ZONE_MOVABLE) 4123 if (zone_index == ZONE_MOVABLE)
4120 continue; 4124 continue;
4121 4125
4122 if (arch_zone_highest_possible_pfn[zone_index] > 4126 if (arch_zone_highest_possible_pfn[zone_index] >
4123 arch_zone_lowest_possible_pfn[zone_index]) 4127 arch_zone_lowest_possible_pfn[zone_index])
4124 break; 4128 break;
4125 } 4129 }
4126 4130
4127 VM_BUG_ON(zone_index == -1); 4131 VM_BUG_ON(zone_index == -1);
4128 movable_zone = zone_index; 4132 movable_zone = zone_index;
4129 } 4133 }
4130 4134
4131 /* 4135 /*
4132 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4136 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4133 * because it is sized independent of architecture. Unlike the other zones, 4137 * because it is sized independent of architecture. Unlike the other zones,
4134 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4138 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4135 * in each node depending on the size of each node and how evenly kernelcore 4139 * in each node depending on the size of each node and how evenly kernelcore
4136 * is distributed. This helper function adjusts the zone ranges 4140 * is distributed. This helper function adjusts the zone ranges
4137 * provided by the architecture for a given node by using the end of the 4141 * provided by the architecture for a given node by using the end of the
4138 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4142 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4139 * zones within a node are in order of monotonically increasing memory addresses 4143 * zones within a node are in order of monotonically increasing memory addresses
4140 */ 4144 */
4141 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4145 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4142 unsigned long zone_type, 4146 unsigned long zone_type,
4143 unsigned long node_start_pfn, 4147 unsigned long node_start_pfn,
4144 unsigned long node_end_pfn, 4148 unsigned long node_end_pfn,
4145 unsigned long *zone_start_pfn, 4149 unsigned long *zone_start_pfn,
4146 unsigned long *zone_end_pfn) 4150 unsigned long *zone_end_pfn)
4147 { 4151 {
4148 /* Only adjust if ZONE_MOVABLE is on this node */ 4152 /* Only adjust if ZONE_MOVABLE is on this node */
4149 if (zone_movable_pfn[nid]) { 4153 if (zone_movable_pfn[nid]) {
4150 /* Size ZONE_MOVABLE */ 4154 /* Size ZONE_MOVABLE */
4151 if (zone_type == ZONE_MOVABLE) { 4155 if (zone_type == ZONE_MOVABLE) {
4152 *zone_start_pfn = zone_movable_pfn[nid]; 4156 *zone_start_pfn = zone_movable_pfn[nid];
4153 *zone_end_pfn = min(node_end_pfn, 4157 *zone_end_pfn = min(node_end_pfn,
4154 arch_zone_highest_possible_pfn[movable_zone]); 4158 arch_zone_highest_possible_pfn[movable_zone]);
4155 4159
4156 /* Adjust for ZONE_MOVABLE starting within this range */ 4160 /* Adjust for ZONE_MOVABLE starting within this range */
4157 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4161 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4158 *zone_end_pfn > zone_movable_pfn[nid]) { 4162 *zone_end_pfn > zone_movable_pfn[nid]) {
4159 *zone_end_pfn = zone_movable_pfn[nid]; 4163 *zone_end_pfn = zone_movable_pfn[nid];
4160 4164
4161 /* Check if this whole range is within ZONE_MOVABLE */ 4165 /* Check if this whole range is within ZONE_MOVABLE */
4162 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4166 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4163 *zone_start_pfn = *zone_end_pfn; 4167 *zone_start_pfn = *zone_end_pfn;
4164 } 4168 }
4165 } 4169 }
4166 4170
4167 /* 4171 /*
4168 * Return the number of pages a zone spans in a node, including holes 4172 * Return the number of pages a zone spans in a node, including holes
4169 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4173 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4170 */ 4174 */
4171 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4175 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4172 unsigned long zone_type, 4176 unsigned long zone_type,
4173 unsigned long *ignored) 4177 unsigned long *ignored)
4174 { 4178 {
4175 unsigned long node_start_pfn, node_end_pfn; 4179 unsigned long node_start_pfn, node_end_pfn;
4176 unsigned long zone_start_pfn, zone_end_pfn; 4180 unsigned long zone_start_pfn, zone_end_pfn;
4177 4181
4178 /* Get the start and end of the node and zone */ 4182 /* Get the start and end of the node and zone */
4179 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4183 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4180 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4184 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4181 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4185 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4182 adjust_zone_range_for_zone_movable(nid, zone_type, 4186 adjust_zone_range_for_zone_movable(nid, zone_type,
4183 node_start_pfn, node_end_pfn, 4187 node_start_pfn, node_end_pfn,
4184 &zone_start_pfn, &zone_end_pfn); 4188 &zone_start_pfn, &zone_end_pfn);
4185 4189
4186 /* Check that this node has pages within the zone's required range */ 4190 /* Check that this node has pages within the zone's required range */
4187 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4191 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4188 return 0; 4192 return 0;
4189 4193
4190 /* Move the zone boundaries inside the node if necessary */ 4194 /* Move the zone boundaries inside the node if necessary */
4191 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4195 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4192 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4196 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4193 4197
4194 /* Return the spanned pages */ 4198 /* Return the spanned pages */
4195 return zone_end_pfn - zone_start_pfn; 4199 return zone_end_pfn - zone_start_pfn;
4196 } 4200 }
4197 4201
4198 /* 4202 /*
4199 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4203 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4200 * then all holes in the requested range will be accounted for. 4204 * then all holes in the requested range will be accounted for.
4201 */ 4205 */
4202 unsigned long __meminit __absent_pages_in_range(int nid, 4206 unsigned long __meminit __absent_pages_in_range(int nid,
4203 unsigned long range_start_pfn, 4207 unsigned long range_start_pfn,
4204 unsigned long range_end_pfn) 4208 unsigned long range_end_pfn)
4205 { 4209 {
4206 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4210 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4207 unsigned long start_pfn, end_pfn; 4211 unsigned long start_pfn, end_pfn;
4208 int i; 4212 int i;
4209 4213
4210 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4214 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4211 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4215 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4212 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4216 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4213 nr_absent -= end_pfn - start_pfn; 4217 nr_absent -= end_pfn - start_pfn;
4214 } 4218 }
4215 return nr_absent; 4219 return nr_absent;
4216 } 4220 }
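
The hole accounting above starts from the length of the whole span and subtracts each registered memory range after clamping it to that span; whatever remains is counted as holes. A self-contained illustration with two made-up ranges standing in for the memblock data:

#include <stdio.h>

struct mem_range { unsigned long start_pfn, end_pfn; };

static unsigned long clamp(unsigned long v, unsigned long lo, unsigned long hi)
{
        if (v < lo)
                return lo;
        if (v > hi)
                return hi;
        return v;
}

int main(void)
{
        /* Hypothetical registered ranges, with a hole at 0x400-0x600. */
        struct mem_range ranges[] = {
                { 0x000, 0x400 },       /* 1024 pages present */
                { 0x600, 0x800 },       /*  512 pages present */
        };
        unsigned long range_start_pfn = 0x000, range_end_pfn = 0x800;
        unsigned long nr_absent = range_end_pfn - range_start_pfn;
        unsigned int i;

        for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
                unsigned long s = clamp(ranges[i].start_pfn,
                                        range_start_pfn, range_end_pfn);
                unsigned long e = clamp(ranges[i].end_pfn,
                                        range_start_pfn, range_end_pfn);
                nr_absent -= e - s;     /* subtract the covered part */
        }
        printf("absent pages: %lu\n", nr_absent);       /* 512, the hole */
        return 0;
}
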
4217 4221
4218 /** 4222 /**
4219 * absent_pages_in_range - Return number of page frames in holes within a range 4223 * absent_pages_in_range - Return number of page frames in holes within a range
4220 * @start_pfn: The start PFN to start searching for holes 4224 * @start_pfn: The start PFN to start searching for holes
4221 * @end_pfn: The end PFN to stop searching for holes 4225 * @end_pfn: The end PFN to stop searching for holes
4222 * 4226 *
4223 * It returns the number of page frames in memory holes within a range. 4227 * It returns the number of page frames in memory holes within a range.
4224 */ 4228 */
4225 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4229 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4226 unsigned long end_pfn) 4230 unsigned long end_pfn)
4227 { 4231 {
4228 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4232 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4229 } 4233 }
4230 4234
4231 /* Return the number of page frames in holes in a zone on a node */ 4235 /* Return the number of page frames in holes in a zone on a node */
4232 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4236 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4233 unsigned long zone_type, 4237 unsigned long zone_type,
4234 unsigned long *ignored) 4238 unsigned long *ignored)
4235 { 4239 {
4236 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4240 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4237 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4241 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4238 unsigned long node_start_pfn, node_end_pfn; 4242 unsigned long node_start_pfn, node_end_pfn;
4239 unsigned long zone_start_pfn, zone_end_pfn; 4243 unsigned long zone_start_pfn, zone_end_pfn;
4240 4244
4241 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4245 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4242 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4246 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4243 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4247 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4244 4248
4245 adjust_zone_range_for_zone_movable(nid, zone_type, 4249 adjust_zone_range_for_zone_movable(nid, zone_type,
4246 node_start_pfn, node_end_pfn, 4250 node_start_pfn, node_end_pfn,
4247 &zone_start_pfn, &zone_end_pfn); 4251 &zone_start_pfn, &zone_end_pfn);
4248 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4252 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4249 } 4253 }
4250 4254
4251 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4255 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4252 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4256 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4253 unsigned long zone_type, 4257 unsigned long zone_type,
4254 unsigned long *zones_size) 4258 unsigned long *zones_size)
4255 { 4259 {
4256 return zones_size[zone_type]; 4260 return zones_size[zone_type];
4257 } 4261 }
4258 4262
4259 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4263 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4260 unsigned long zone_type, 4264 unsigned long zone_type,
4261 unsigned long *zholes_size) 4265 unsigned long *zholes_size)
4262 { 4266 {
4263 if (!zholes_size) 4267 if (!zholes_size)
4264 return 0; 4268 return 0;
4265 4269
4266 return zholes_size[zone_type]; 4270 return zholes_size[zone_type];
4267 } 4271 }
4268 4272
4269 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4273 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4270 4274
4271 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4275 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4272 unsigned long *zones_size, unsigned long *zholes_size) 4276 unsigned long *zones_size, unsigned long *zholes_size)
4273 { 4277 {
4274 unsigned long realtotalpages, totalpages = 0; 4278 unsigned long realtotalpages, totalpages = 0;
4275 enum zone_type i; 4279 enum zone_type i;
4276 4280
4277 for (i = 0; i < MAX_NR_ZONES; i++) 4281 for (i = 0; i < MAX_NR_ZONES; i++)
4278 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4282 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4279 zones_size); 4283 zones_size);
4280 pgdat->node_spanned_pages = totalpages; 4284 pgdat->node_spanned_pages = totalpages;
4281 4285
4282 realtotalpages = totalpages; 4286 realtotalpages = totalpages;
4283 for (i = 0; i < MAX_NR_ZONES; i++) 4287 for (i = 0; i < MAX_NR_ZONES; i++)
4284 realtotalpages -= 4288 realtotalpages -=
4285 zone_absent_pages_in_node(pgdat->node_id, i, 4289 zone_absent_pages_in_node(pgdat->node_id, i,
4286 zholes_size); 4290 zholes_size);
4287 pgdat->node_present_pages = realtotalpages; 4291 pgdat->node_present_pages = realtotalpages;
4288 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4292 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4289 realtotalpages); 4293 realtotalpages);
4290 } 4294 }
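To make the spanned versus present distinction computed above concrete, a quick standalone calculation with invented per-zone figures (spanned pages include holes, present pages do not):

#include <stdio.h>

int main(void)
{
        /* Hypothetical per-zone page counts for one node (invented numbers). */
        unsigned long spanned[] = { 4096, 221184, 786432 };  /* DMA, DMA32, Normal */
        unsigned long holes[]   = {  100,   8192,      0 };
        unsigned long total = 0, real = 0;

        for (int i = 0; i < 3; i++) {
                total += spanned[i];                 /* node_spanned_pages */
                real  += spanned[i] - holes[i];      /* node_present_pages */
        }
        printf("On node 0 totalpages: %lu (spanned %lu)\n", real, total);
        return 0;
}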
4291 4295
4292 #ifndef CONFIG_SPARSEMEM 4296 #ifndef CONFIG_SPARSEMEM
4293 /* 4297 /*
4294 * Calculate the size of the zone->blockflags rounded to an unsigned long 4298 * Calculate the size of the zone->blockflags rounded to an unsigned long
4295 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4299 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4296 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4300 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4297 * round what is now in bits to nearest long in bits, then return it in 4301 * round what is now in bits to nearest long in bits, then return it in
4298 * bytes. 4302 * bytes.
4299 */ 4303 */
4300 static unsigned long __init usemap_size(unsigned long zonesize) 4304 static unsigned long __init usemap_size(unsigned long zonesize)
4301 { 4305 {
4302 unsigned long usemapsize; 4306 unsigned long usemapsize;
4303 4307
4304 usemapsize = roundup(zonesize, pageblock_nr_pages); 4308 usemapsize = roundup(zonesize, pageblock_nr_pages);
4305 usemapsize = usemapsize >> pageblock_order; 4309 usemapsize = usemapsize >> pageblock_order;
4306 usemapsize *= NR_PAGEBLOCK_BITS; 4310 usemapsize *= NR_PAGEBLOCK_BITS;
4307 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4311 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4308 4312
4309 return usemapsize / 8; 4313 return usemapsize / 8;
4310 } 4314 }
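To see the arithmetic concretely, here is a small userspace model of usemap_size(); pageblock_order = 9 and NR_PAGEBLOCK_BITS = 4 are assumed values chosen to mirror a common x86 configuration, not read from any particular kernel build:

#include <stdio.h>

#define PAGEBLOCK_ORDER     9                        /* assumed */
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS   4                        /* assumed */

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

/* Model of usemap_size(): bits per pageblock, rounded to whole longs, in bytes. */
static unsigned long usemap_size_model(unsigned long zonesize)
{
        unsigned long usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);

        usemapsize >>= PAGEBLOCK_ORDER;              /* number of pageblocks */
        usemapsize *= NR_PAGEBLOCK_BITS;             /* bits needed */
        usemapsize = roundup_ul(usemapsize, 8 * sizeof(unsigned long));
        return usemapsize / 8;                       /* bytes */
}

int main(void)
{
        /* A 1GiB zone of 4KiB pages spans 262144 PFNs: 512 pageblocks,
         * 2048 flag bits, i.e. 256 bytes of pageblock flags. */
        printf("%lu\n", usemap_size_model(262144));
        return 0;
}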
4311 4315
4312 static void __init setup_usemap(struct pglist_data *pgdat, 4316 static void __init setup_usemap(struct pglist_data *pgdat,
4313 struct zone *zone, unsigned long zonesize) 4317 struct zone *zone, unsigned long zonesize)
4314 { 4318 {
4315 unsigned long usemapsize = usemap_size(zonesize); 4319 unsigned long usemapsize = usemap_size(zonesize);
4316 zone->pageblock_flags = NULL; 4320 zone->pageblock_flags = NULL;
4317 if (usemapsize) 4321 if (usemapsize)
4318 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4322 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4319 usemapsize); 4323 usemapsize);
4320 } 4324 }
4321 #else 4325 #else
4322 static inline void setup_usemap(struct pglist_data *pgdat, 4326 static inline void setup_usemap(struct pglist_data *pgdat,
4323 struct zone *zone, unsigned long zonesize) {} 4327 struct zone *zone, unsigned long zonesize) {}
4324 #endif /* CONFIG_SPARSEMEM */ 4328 #endif /* CONFIG_SPARSEMEM */
4325 4329
4326 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4330 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4327 4331
4328 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4332 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4329 void __init set_pageblock_order(void) 4333 void __init set_pageblock_order(void)
4330 { 4334 {
4331 unsigned int order; 4335 unsigned int order;
4332 4336
4333 /* Check that pageblock_nr_pages has not already been setup */ 4337 /* Check that pageblock_nr_pages has not already been setup */
4334 if (pageblock_order) 4338 if (pageblock_order)
4335 return; 4339 return;
4336 4340
4337 if (HPAGE_SHIFT > PAGE_SHIFT) 4341 if (HPAGE_SHIFT > PAGE_SHIFT)
4338 order = HUGETLB_PAGE_ORDER; 4342 order = HUGETLB_PAGE_ORDER;
4339 else 4343 else
4340 order = MAX_ORDER - 1; 4344 order = MAX_ORDER - 1;
4341 4345
4342 /* 4346 /*
4343 * Assume the largest contiguous order of interest is a huge page. 4347 * Assume the largest contiguous order of interest is a huge page.
4344 * This value may be variable depending on boot parameters on IA64 and 4348 * This value may be variable depending on boot parameters on IA64 and
4345 * powerpc. 4349 * powerpc.
4346 */ 4350 */
4347 pageblock_order = order; 4351 pageblock_order = order;
4348 } 4352 }
4349 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4353 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4350 4354
4351 /* 4355 /*
4352 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4356 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4353 * is unused as pageblock_order is set at compile-time. See 4357 * is unused as pageblock_order is set at compile-time. See
4354 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4358 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4355 * the kernel config 4359 * the kernel config
4356 */ 4360 */
4357 void __init set_pageblock_order(void) 4361 void __init set_pageblock_order(void)
4358 { 4362 {
4359 } 4363 }
4360 4364
4361 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4365 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4362 4366
4363 /* 4367 /*
4364 * Set up the zone data structures: 4368 * Set up the zone data structures:
4365 * - mark all pages reserved 4369 * - mark all pages reserved
4366 * - mark all memory queues empty 4370 * - mark all memory queues empty
4367 * - clear the memory bitmaps 4371 * - clear the memory bitmaps
4368 */ 4372 */
4369 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4373 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4370 unsigned long *zones_size, unsigned long *zholes_size) 4374 unsigned long *zones_size, unsigned long *zholes_size)
4371 { 4375 {
4372 enum zone_type j; 4376 enum zone_type j;
4373 int nid = pgdat->node_id; 4377 int nid = pgdat->node_id;
4374 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4378 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4375 int ret; 4379 int ret;
4376 4380
4377 pgdat_resize_init(pgdat); 4381 pgdat_resize_init(pgdat);
4378 pgdat->nr_zones = 0; 4382 pgdat->nr_zones = 0;
4379 init_waitqueue_head(&pgdat->kswapd_wait); 4383 init_waitqueue_head(&pgdat->kswapd_wait);
4380 pgdat->kswapd_max_order = 0; 4384 pgdat->kswapd_max_order = 0;
4381 pgdat_page_cgroup_init(pgdat); 4385 pgdat_page_cgroup_init(pgdat);
4382 4386
4383 for (j = 0; j < MAX_NR_ZONES; j++) { 4387 for (j = 0; j < MAX_NR_ZONES; j++) {
4384 struct zone *zone = pgdat->node_zones + j; 4388 struct zone *zone = pgdat->node_zones + j;
4385 unsigned long size, realsize, memmap_pages; 4389 unsigned long size, realsize, memmap_pages;
4386 4390
4387 size = zone_spanned_pages_in_node(nid, j, zones_size); 4391 size = zone_spanned_pages_in_node(nid, j, zones_size);
4388 realsize = size - zone_absent_pages_in_node(nid, j, 4392 realsize = size - zone_absent_pages_in_node(nid, j,
4389 zholes_size); 4393 zholes_size);
4390 4394
4391 /* 4395 /*
4392 * Adjust realsize so that it accounts for how much memory 4396 * Adjust realsize so that it accounts for how much memory
4393 * is used by this zone for memmap. This affects the watermark 4397 * is used by this zone for memmap. This affects the watermark
4394 * and per-cpu initialisations 4398 * and per-cpu initialisations
4395 */ 4399 */
4396 memmap_pages = 4400 memmap_pages =
4397 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 4401 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
4398 if (realsize >= memmap_pages) { 4402 if (realsize >= memmap_pages) {
4399 realsize -= memmap_pages; 4403 realsize -= memmap_pages;
4400 if (memmap_pages) 4404 if (memmap_pages)
4401 printk(KERN_DEBUG 4405 printk(KERN_DEBUG
4402 " %s zone: %lu pages used for memmap\n", 4406 " %s zone: %lu pages used for memmap\n",
4403 zone_names[j], memmap_pages); 4407 zone_names[j], memmap_pages);
4404 } else 4408 } else
4405 printk(KERN_WARNING 4409 printk(KERN_WARNING
4406 " %s zone: %lu pages exceeds realsize %lu\n", 4410 " %s zone: %lu pages exceeds realsize %lu\n",
4407 zone_names[j], memmap_pages, realsize); 4411 zone_names[j], memmap_pages, realsize);
4408 4412
4409 /* Account for reserved pages */ 4413 /* Account for reserved pages */
4410 if (j == 0 && realsize > dma_reserve) { 4414 if (j == 0 && realsize > dma_reserve) {
4411 realsize -= dma_reserve; 4415 realsize -= dma_reserve;
4412 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4416 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4413 zone_names[0], dma_reserve); 4417 zone_names[0], dma_reserve);
4414 } 4418 }
4415 4419
4416 if (!is_highmem_idx(j)) 4420 if (!is_highmem_idx(j))
4417 nr_kernel_pages += realsize; 4421 nr_kernel_pages += realsize;
4418 nr_all_pages += realsize; 4422 nr_all_pages += realsize;
4419 4423
4420 zone->spanned_pages = size; 4424 zone->spanned_pages = size;
4421 zone->present_pages = realsize; 4425 zone->present_pages = realsize;
4422 #if defined CONFIG_COMPACTION || defined CONFIG_CMA 4426 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
4423 zone->compact_cached_free_pfn = zone->zone_start_pfn + 4427 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4424 zone->spanned_pages; 4428 zone->spanned_pages;
4425 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); 4429 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4426 #endif 4430 #endif
4427 #ifdef CONFIG_NUMA 4431 #ifdef CONFIG_NUMA
4428 zone->node = nid; 4432 zone->node = nid;
4429 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4433 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
4430 / 100; 4434 / 100;
4431 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 4435 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
4432 #endif 4436 #endif
4433 zone->name = zone_names[j]; 4437 zone->name = zone_names[j];
4434 spin_lock_init(&zone->lock); 4438 spin_lock_init(&zone->lock);
4435 spin_lock_init(&zone->lru_lock); 4439 spin_lock_init(&zone->lru_lock);
4436 zone_seqlock_init(zone); 4440 zone_seqlock_init(zone);
4437 zone->zone_pgdat = pgdat; 4441 zone->zone_pgdat = pgdat;
4438 4442
4439 zone_pcp_init(zone); 4443 zone_pcp_init(zone);
4440 lruvec_init(&zone->lruvec, zone); 4444 lruvec_init(&zone->lruvec, zone);
4441 zap_zone_vm_stats(zone); 4445 zap_zone_vm_stats(zone);
4442 zone->flags = 0; 4446 zone->flags = 0;
4443 #ifdef CONFIG_MEMORY_ISOLATION 4447 #ifdef CONFIG_MEMORY_ISOLATION
4444 zone->nr_pageblock_isolate = 0; 4448 zone->nr_pageblock_isolate = 0;
4445 #endif 4449 #endif
4446 if (!size) 4450 if (!size)
4447 continue; 4451 continue;
4448 4452
4449 set_pageblock_order(); 4453 set_pageblock_order();
4450 setup_usemap(pgdat, zone, size); 4454 setup_usemap(pgdat, zone, size);
4451 ret = init_currently_empty_zone(zone, zone_start_pfn, 4455 ret = init_currently_empty_zone(zone, zone_start_pfn,
4452 size, MEMMAP_EARLY); 4456 size, MEMMAP_EARLY);
4453 BUG_ON(ret); 4457 BUG_ON(ret);
4454 memmap_init(size, nid, j, zone_start_pfn); 4458 memmap_init(size, nid, j, zone_start_pfn);
4455 zone_start_pfn += size; 4459 zone_start_pfn += size;
4456 } 4460 }
4457 } 4461 }
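The memmap adjustment in free_area_init_core() can be sanity-checked with a back-of-the-envelope calculation; the 64-byte sizeof(struct page) used below is an assumption, since the real size depends on the kernel configuration:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_ALIGN_UL(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long size = 262144;                 /* 1GiB zone in 4KiB pages */
        unsigned long struct_page_size = 64;         /* assumed; config dependent */
        unsigned long memmap_pages =
                PAGE_ALIGN_UL(size * struct_page_size) >> PAGE_SHIFT;

        /* roughly 1.5% of the zone is consumed by its own struct page array */
        printf("%lu pages used for memmap (%.2f%% of the zone)\n",
               memmap_pages, 100.0 * memmap_pages / size);
        return 0;
}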
4458 4462
4459 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4463 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4460 { 4464 {
4461 /* Skip empty nodes */ 4465 /* Skip empty nodes */
4462 if (!pgdat->node_spanned_pages) 4466 if (!pgdat->node_spanned_pages)
4463 return; 4467 return;
4464 4468
4465 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4469 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4466 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4470 /* ia64 gets its own node_mem_map, before this, without bootmem */
4467 if (!pgdat->node_mem_map) { 4471 if (!pgdat->node_mem_map) {
4468 unsigned long size, start, end; 4472 unsigned long size, start, end;
4469 struct page *map; 4473 struct page *map;
4470 4474
4471 /* 4475 /*
4472 * The zone's endpoints aren't required to be MAX_ORDER 4476 * The zone's endpoints aren't required to be MAX_ORDER
4473 * aligned but the node_mem_map endpoints must be in order 4477 * aligned but the node_mem_map endpoints must be in order
4474 * for the buddy allocator to function correctly. 4478 * for the buddy allocator to function correctly.
4475 */ 4479 */
4476 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4480 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4477 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 4481 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
4478 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4482 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4479 size = (end - start) * sizeof(struct page); 4483 size = (end - start) * sizeof(struct page);
4480 map = alloc_remap(pgdat->node_id, size); 4484 map = alloc_remap(pgdat->node_id, size);
4481 if (!map) 4485 if (!map)
4482 map = alloc_bootmem_node_nopanic(pgdat, size); 4486 map = alloc_bootmem_node_nopanic(pgdat, size);
4483 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4487 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4484 } 4488 }
4485 #ifndef CONFIG_NEED_MULTIPLE_NODES 4489 #ifndef CONFIG_NEED_MULTIPLE_NODES
4486 /* 4490 /*
4487 * With no DISCONTIG, the global mem_map is just set as node 0's 4491 * With no DISCONTIG, the global mem_map is just set as node 0's
4488 */ 4492 */
4489 if (pgdat == NODE_DATA(0)) { 4493 if (pgdat == NODE_DATA(0)) {
4490 mem_map = NODE_DATA(0)->node_mem_map; 4494 mem_map = NODE_DATA(0)->node_mem_map;
4491 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4495 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4492 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4496 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4493 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4497 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4494 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4498 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4495 } 4499 }
4496 #endif 4500 #endif
4497 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4501 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4498 } 4502 }
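A small model of the endpoint widening performed above: the node_mem_map is stretched outwards to MAX_ORDER boundaries even though the node itself need not be aligned. MAX_ORDER_NR_PAGES = 1024 and a 64-byte struct page are assumptions made only for this illustration:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL       /* assumed: MAX_ORDER 11 -> 2^10 pages */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        /* Invented node: starts at PFN 0x1234 and spans 300000 pages. */
        unsigned long node_start_pfn = 0x1234, node_spanned_pages = 300000;
        unsigned long struct_page_size = 64;    /* assumed; config dependent */

        unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
        unsigned long end   = ALIGN_UP(node_start_pfn + node_spanned_pages,
                                       MAX_ORDER_NR_PAGES);
        unsigned long size  = (end - start) * struct_page_size;

        printf("mem_map covers PFNs [%#lx, %#lx): %lu bytes\n", start, end, size);
        return 0;
}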
4499 4503
4500 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4504 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4501 unsigned long node_start_pfn, unsigned long *zholes_size) 4505 unsigned long node_start_pfn, unsigned long *zholes_size)
4502 { 4506 {
4503 pg_data_t *pgdat = NODE_DATA(nid); 4507 pg_data_t *pgdat = NODE_DATA(nid);
4504 4508
4505 pgdat->node_id = nid; 4509 pgdat->node_id = nid;
4506 pgdat->node_start_pfn = node_start_pfn; 4510 pgdat->node_start_pfn = node_start_pfn;
4507 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4511 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4508 4512
4509 alloc_node_mem_map(pgdat); 4513 alloc_node_mem_map(pgdat);
4510 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4514 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4511 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4515 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4512 nid, (unsigned long)pgdat, 4516 nid, (unsigned long)pgdat,
4513 (unsigned long)pgdat->node_mem_map); 4517 (unsigned long)pgdat->node_mem_map);
4514 #endif 4518 #endif
4515 4519
4516 free_area_init_core(pgdat, zones_size, zholes_size); 4520 free_area_init_core(pgdat, zones_size, zholes_size);
4517 } 4521 }
4518 4522
4519 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4523 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4520 4524
4521 #if MAX_NUMNODES > 1 4525 #if MAX_NUMNODES > 1
4522 /* 4526 /*
4523 * Figure out the number of possible node ids. 4527 * Figure out the number of possible node ids.
4524 */ 4528 */
4525 static void __init setup_nr_node_ids(void) 4529 static void __init setup_nr_node_ids(void)
4526 { 4530 {
4527 unsigned int node; 4531 unsigned int node;
4528 unsigned int highest = 0; 4532 unsigned int highest = 0;
4529 4533
4530 for_each_node_mask(node, node_possible_map) 4534 for_each_node_mask(node, node_possible_map)
4531 highest = node; 4535 highest = node;
4532 nr_node_ids = highest + 1; 4536 nr_node_ids = highest + 1;
4533 } 4537 }
4534 #else 4538 #else
4535 static inline void setup_nr_node_ids(void) 4539 static inline void setup_nr_node_ids(void)
4536 { 4540 {
4537 } 4541 }
4538 #endif 4542 #endif
4539 4543
4540 /** 4544 /**
4541 * node_map_pfn_alignment - determine the maximum internode alignment 4545 * node_map_pfn_alignment - determine the maximum internode alignment
4542 * 4546 *
4543 * This function should be called after node map is populated and sorted. 4547 * This function should be called after node map is populated and sorted.
4544 * It calculates the maximum power of two alignment which can distinguish 4548 * It calculates the maximum power of two alignment which can distinguish
4545 * all the nodes. 4549 * all the nodes.
4546 * 4550 *
4547 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4551 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4548 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4552 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4549 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4553 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4550 * shifted, 1GiB is enough and this function will indicate so. 4554 * shifted, 1GiB is enough and this function will indicate so.
4551 * 4555 *
4552 * This is used to test whether pfn -> nid mapping of the chosen memory 4556 * This is used to test whether pfn -> nid mapping of the chosen memory
4553 * model has fine enough granularity to avoid incorrect mapping for the 4557 * model has fine enough granularity to avoid incorrect mapping for the
4554 * populated node map. 4558 * populated node map.
4555 * 4559 *
4556 * Returns the determined alignment in pfn's. 0 if there is no alignment 4560 * Returns the determined alignment in pfn's. 0 if there is no alignment
4557 * requirement (single node). 4561 * requirement (single node).
4558 */ 4562 */
4559 unsigned long __init node_map_pfn_alignment(void) 4563 unsigned long __init node_map_pfn_alignment(void)
4560 { 4564 {
4561 unsigned long accl_mask = 0, last_end = 0; 4565 unsigned long accl_mask = 0, last_end = 0;
4562 unsigned long start, end, mask; 4566 unsigned long start, end, mask;
4563 int last_nid = -1; 4567 int last_nid = -1;
4564 int i, nid; 4568 int i, nid;
4565 4569
4566 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4570 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4567 if (!start || last_nid < 0 || last_nid == nid) { 4571 if (!start || last_nid < 0 || last_nid == nid) {
4568 last_nid = nid; 4572 last_nid = nid;
4569 last_end = end; 4573 last_end = end;
4570 continue; 4574 continue;
4571 } 4575 }
4572 4576
4573 /* 4577 /*
4574 * Start with a mask granular enough to pin-point to the 4578 * Start with a mask granular enough to pin-point to the
4575 * start pfn and tick off bits one-by-one until it becomes 4579 * start pfn and tick off bits one-by-one until it becomes
4576 * too coarse to separate the current node from the last. 4580 * too coarse to separate the current node from the last.
4577 */ 4581 */
4578 mask = ~((1 << __ffs(start)) - 1); 4582 mask = ~((1 << __ffs(start)) - 1);
4579 while (mask && last_end <= (start & (mask << 1))) 4583 while (mask && last_end <= (start & (mask << 1)))
4580 mask <<= 1; 4584 mask <<= 1;
4581 4585
4582 /* accumulate all internode masks */ 4586 /* accumulate all internode masks */
4583 accl_mask |= mask; 4587 accl_mask |= mask;
4584 } 4588 }
4585 4589
4586 /* convert mask to number of pages */ 4590 /* convert mask to number of pages */
4587 return ~accl_mask + 1; 4591 return ~accl_mask + 1;
4588 } 4592 }
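The mask trick is easier to follow with concrete numbers. Below is a userspace model of the same loop; the two memory ranges are invented, and __builtin_ctzl stands in for the kernel's __ffs():

#include <stdio.h>

struct memrange { unsigned long start, end; int nid; };

/* Userspace model of node_map_pfn_alignment(). */
static unsigned long pfn_alignment(const struct memrange *r, int n)
{
        unsigned long accl_mask = 0, last_end = 0;
        int last_nid = -1;

        for (int i = 0; i < n; i++) {
                unsigned long start = r[i].start, mask;

                if (!start || last_nid < 0 || last_nid == r[i].nid) {
                        last_nid = r[i].nid;
                        last_end = r[i].end;
                        continue;
                }

                /* finest mask that pinpoints 'start', coarsened until it can
                 * no longer separate this node from the previous one */
                mask = ~((1UL << __builtin_ctzl(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }
        return ~accl_mask + 1;          /* alignment in pages */
}

int main(void)
{
        /* Two invented 1GiB nodes (4KiB pages) meeting at PFN 0x40000. */
        struct memrange map[] = {
                { 0x00000, 0x40000, 0 },
                { 0x40000, 0x80000, 1 },
        };
        printf("alignment: %#lx pages\n", pfn_alignment(map, 2)); /* 0x40000 */
        return 0;
}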
4589 4593
4590 /* Find the lowest pfn for a node */ 4594 /* Find the lowest pfn for a node */
4591 static unsigned long __init find_min_pfn_for_node(int nid) 4595 static unsigned long __init find_min_pfn_for_node(int nid)
4592 { 4596 {
4593 unsigned long min_pfn = ULONG_MAX; 4597 unsigned long min_pfn = ULONG_MAX;
4594 unsigned long start_pfn; 4598 unsigned long start_pfn;
4595 int i; 4599 int i;
4596 4600
4597 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 4601 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4598 min_pfn = min(min_pfn, start_pfn); 4602 min_pfn = min(min_pfn, start_pfn);
4599 4603
4600 if (min_pfn == ULONG_MAX) { 4604 if (min_pfn == ULONG_MAX) {
4601 printk(KERN_WARNING 4605 printk(KERN_WARNING
4602 "Could not find start_pfn for node %d\n", nid); 4606 "Could not find start_pfn for node %d\n", nid);
4603 return 0; 4607 return 0;
4604 } 4608 }
4605 4609
4606 return min_pfn; 4610 return min_pfn;
4607 } 4611 }
4608 4612
4609 /** 4613 /**
4610 * find_min_pfn_with_active_regions - Find the minimum PFN registered 4614 * find_min_pfn_with_active_regions - Find the minimum PFN registered
4611 * 4615 *
4612 * It returns the minimum PFN based on information provided via 4616 * It returns the minimum PFN based on information provided via
4613 * add_active_range(). 4617 * add_active_range().
4614 */ 4618 */
4615 unsigned long __init find_min_pfn_with_active_regions(void) 4619 unsigned long __init find_min_pfn_with_active_regions(void)
4616 { 4620 {
4617 return find_min_pfn_for_node(MAX_NUMNODES); 4621 return find_min_pfn_for_node(MAX_NUMNODES);
4618 } 4622 }
4619 4623
4620 /* 4624 /*
4621 * early_calculate_totalpages() 4625 * early_calculate_totalpages()
4622 * Sum pages in active regions for movable zone. 4626 * Sum pages in active regions for movable zone.
4623 * Populate N_HIGH_MEMORY for calculating usable_nodes. 4627 * Populate N_HIGH_MEMORY for calculating usable_nodes.
4624 */ 4628 */
4625 static unsigned long __init early_calculate_totalpages(void) 4629 static unsigned long __init early_calculate_totalpages(void)
4626 { 4630 {
4627 unsigned long totalpages = 0; 4631 unsigned long totalpages = 0;
4628 unsigned long start_pfn, end_pfn; 4632 unsigned long start_pfn, end_pfn;
4629 int i, nid; 4633 int i, nid;
4630 4634
4631 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 4635 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4632 unsigned long pages = end_pfn - start_pfn; 4636 unsigned long pages = end_pfn - start_pfn;
4633 4637
4634 totalpages += pages; 4638 totalpages += pages;
4635 if (pages) 4639 if (pages)
4636 node_set_state(nid, N_HIGH_MEMORY); 4640 node_set_state(nid, N_HIGH_MEMORY);
4637 } 4641 }
4638 return totalpages; 4642 return totalpages;
4639 } 4643 }
4640 4644
4641 /* 4645 /*
4642 * Find the PFN the Movable zone begins in each node. Kernel memory 4646 * Find the PFN the Movable zone begins in each node. Kernel memory
4643 * is spread evenly between nodes as long as the nodes have enough 4647 * is spread evenly between nodes as long as the nodes have enough
4644 * memory. When they don't, some nodes will have more kernelcore than 4648 * memory. When they don't, some nodes will have more kernelcore than
4645 * others 4649 * others
4646 */ 4650 */
4647 static void __init find_zone_movable_pfns_for_nodes(void) 4651 static void __init find_zone_movable_pfns_for_nodes(void)
4648 { 4652 {
4649 int i, nid; 4653 int i, nid;
4650 unsigned long usable_startpfn; 4654 unsigned long usable_startpfn;
4651 unsigned long kernelcore_node, kernelcore_remaining; 4655 unsigned long kernelcore_node, kernelcore_remaining;
4652 /* save the state before borrow the nodemask */ 4656 /* save the state before borrow the nodemask */
4653 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; 4657 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
4654 unsigned long totalpages = early_calculate_totalpages(); 4658 unsigned long totalpages = early_calculate_totalpages();
4655 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4659 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
4656 4660
4657 /* 4661 /*
4658 * If movablecore was specified, calculate what size of 4662 * If movablecore was specified, calculate what size of
4659 * kernelcore that corresponds so that memory usable for 4663 * kernelcore that corresponds so that memory usable for
4660 * any allocation type is evenly spread. If both kernelcore 4664 * any allocation type is evenly spread. If both kernelcore
4661 * and movablecore are specified, then the value of kernelcore 4665 * and movablecore are specified, then the value of kernelcore
4662 * will be used for required_kernelcore if it's greater than 4666 * will be used for required_kernelcore if it's greater than
4663 * what movablecore would have allowed. 4667 * what movablecore would have allowed.
4664 */ 4668 */
4665 if (required_movablecore) { 4669 if (required_movablecore) {
4666 unsigned long corepages; 4670 unsigned long corepages;
4667 4671
4668 /* 4672 /*
4669 * Round-up so that ZONE_MOVABLE is at least as large as what 4673 * Round-up so that ZONE_MOVABLE is at least as large as what
4670 * was requested by the user 4674 * was requested by the user
4671 */ 4675 */
4672 required_movablecore = 4676 required_movablecore =
4673 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 4677 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
4674 corepages = totalpages - required_movablecore; 4678 corepages = totalpages - required_movablecore;
4675 4679
4676 required_kernelcore = max(required_kernelcore, corepages); 4680 required_kernelcore = max(required_kernelcore, corepages);
4677 } 4681 }
4678 4682
4679 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4683 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
4680 if (!required_kernelcore) 4684 if (!required_kernelcore)
4681 goto out; 4685 goto out;
4682 4686
4683 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4687 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4684 find_usable_zone_for_movable(); 4688 find_usable_zone_for_movable();
4685 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4689 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4686 4690
4687 restart: 4691 restart:
4688 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4692 /* Spread kernelcore memory as evenly as possible throughout nodes */
4689 kernelcore_node = required_kernelcore / usable_nodes; 4693 kernelcore_node = required_kernelcore / usable_nodes;
4690 for_each_node_state(nid, N_HIGH_MEMORY) { 4694 for_each_node_state(nid, N_HIGH_MEMORY) {
4691 unsigned long start_pfn, end_pfn; 4695 unsigned long start_pfn, end_pfn;
4692 4696
4693 /* 4697 /*
4694 * Recalculate kernelcore_node if the division per node 4698 * Recalculate kernelcore_node if the division per node
4695 * now exceeds what is necessary to satisfy the requested 4699 * now exceeds what is necessary to satisfy the requested
4696 * amount of memory for the kernel 4700 * amount of memory for the kernel
4697 */ 4701 */
4698 if (required_kernelcore < kernelcore_node) 4702 if (required_kernelcore < kernelcore_node)
4699 kernelcore_node = required_kernelcore / usable_nodes; 4703 kernelcore_node = required_kernelcore / usable_nodes;
4700 4704
4701 /* 4705 /*
4702 * As the map is walked, we track how much memory is usable 4706 * As the map is walked, we track how much memory is usable
4703 * by the kernel using kernelcore_remaining. When it is 4707 * by the kernel using kernelcore_remaining. When it is
4704 * 0, the rest of the node is usable by ZONE_MOVABLE 4708 * 0, the rest of the node is usable by ZONE_MOVABLE
4705 */ 4709 */
4706 kernelcore_remaining = kernelcore_node; 4710 kernelcore_remaining = kernelcore_node;
4707 4711
4708 /* Go through each range of PFNs within this node */ 4712 /* Go through each range of PFNs within this node */
4709 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4713 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4710 unsigned long size_pages; 4714 unsigned long size_pages;
4711 4715
4712 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4716 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4713 if (start_pfn >= end_pfn) 4717 if (start_pfn >= end_pfn)
4714 continue; 4718 continue;
4715 4719
4716 /* Account for what is only usable for kernelcore */ 4720 /* Account for what is only usable for kernelcore */
4717 if (start_pfn < usable_startpfn) { 4721 if (start_pfn < usable_startpfn) {
4718 unsigned long kernel_pages; 4722 unsigned long kernel_pages;
4719 kernel_pages = min(end_pfn, usable_startpfn) 4723 kernel_pages = min(end_pfn, usable_startpfn)
4720 - start_pfn; 4724 - start_pfn;
4721 4725
4722 kernelcore_remaining -= min(kernel_pages, 4726 kernelcore_remaining -= min(kernel_pages,
4723 kernelcore_remaining); 4727 kernelcore_remaining);
4724 required_kernelcore -= min(kernel_pages, 4728 required_kernelcore -= min(kernel_pages,
4725 required_kernelcore); 4729 required_kernelcore);
4726 4730
4727 /* Continue if range is now fully accounted */ 4731 /* Continue if range is now fully accounted */
4728 if (end_pfn <= usable_startpfn) { 4732 if (end_pfn <= usable_startpfn) {
4729 4733
4730 /* 4734 /*
4731 * Push zone_movable_pfn to the end so 4735 * Push zone_movable_pfn to the end so
4732 * that if we have to rebalance 4736 * that if we have to rebalance
4733 * kernelcore across nodes, we will 4737 * kernelcore across nodes, we will
4734 * not double account here 4738 * not double account here
4735 */ 4739 */
4736 zone_movable_pfn[nid] = end_pfn; 4740 zone_movable_pfn[nid] = end_pfn;
4737 continue; 4741 continue;
4738 } 4742 }
4739 start_pfn = usable_startpfn; 4743 start_pfn = usable_startpfn;
4740 } 4744 }
4741 4745
4742 /* 4746 /*
4743 * The usable PFN range for ZONE_MOVABLE is from 4747 * The usable PFN range for ZONE_MOVABLE is from
4744 * start_pfn->end_pfn. Calculate size_pages as the 4748 * start_pfn->end_pfn. Calculate size_pages as the
4745 * number of pages used as kernelcore 4749 * number of pages used as kernelcore
4746 */ 4750 */
4747 size_pages = end_pfn - start_pfn; 4751 size_pages = end_pfn - start_pfn;
4748 if (size_pages > kernelcore_remaining) 4752 if (size_pages > kernelcore_remaining)
4749 size_pages = kernelcore_remaining; 4753 size_pages = kernelcore_remaining;
4750 zone_movable_pfn[nid] = start_pfn + size_pages; 4754 zone_movable_pfn[nid] = start_pfn + size_pages;
4751 4755
4752 /* 4756 /*
4753 * Some kernelcore has been met, update counts and 4757 * Some kernelcore has been met, update counts and
4754 * break if the kernelcore for this node has been 4758 * break if the kernelcore for this node has been
4755 * satisfied 4759 * satisfied
4756 */ 4760 */
4757 required_kernelcore -= min(required_kernelcore, 4761 required_kernelcore -= min(required_kernelcore,
4758 size_pages); 4762 size_pages);
4759 kernelcore_remaining -= size_pages; 4763 kernelcore_remaining -= size_pages;
4760 if (!kernelcore_remaining) 4764 if (!kernelcore_remaining)
4761 break; 4765 break;
4762 } 4766 }
4763 } 4767 }
4764 4768
4765 /* 4769 /*
4766 * If there is still required_kernelcore, we do another pass with one 4770 * If there is still required_kernelcore, we do another pass with one
4767 * less node in the count. This will push zone_movable_pfn[nid] further 4771 * less node in the count. This will push zone_movable_pfn[nid] further
4768 * along on the nodes that still have memory until kernelcore is 4772 * along on the nodes that still have memory until kernelcore is
4769 * satisfied 4773 * satisfied
4770 */ 4774 */
4771 usable_nodes--; 4775 usable_nodes--;
4772 if (usable_nodes && required_kernelcore > usable_nodes) 4776 if (usable_nodes && required_kernelcore > usable_nodes)
4773 goto restart; 4777 goto restart;
4774 4778
4775 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 4779 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4776 for (nid = 0; nid < MAX_NUMNODES; nid++) 4780 for (nid = 0; nid < MAX_NUMNODES; nid++)
4777 zone_movable_pfn[nid] = 4781 zone_movable_pfn[nid] =
4778 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4782 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4779 4783
4780 out: 4784 out:
4781 /* restore the node_state */ 4785 /* restore the node_state */
4782 node_states[N_HIGH_MEMORY] = saved_node_state; 4786 node_states[N_HIGH_MEMORY] = saved_node_state;
4783 } 4787 }
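A rough arithmetic model of the even-spreading loop above (not the kernel code itself): each pass hands every usable node an equal share of the outstanding kernelcore, a node too small to hold its share keeps only what it has, and the shortfall is redistributed over one fewer node on the next pass. All page counts below are invented:

#include <stdio.h>

int main(void)
{
        /* Invented layout: kernelcore=2G (524288 pages of 4KiB) requested
         * across three nodes, one of which has only 65536 pages. */
        unsigned long node_pages[] = { 1048576, 1048576, 65536 };
        unsigned long required_kernelcore = 524288;
        int usable_nodes = 3;

        while (required_kernelcore && usable_nodes) {
                unsigned long share = required_kernelcore / usable_nodes;

                for (int nid = 0; nid < 3; nid++) {
                        unsigned long got = node_pages[nid] < share ?
                                            node_pages[nid] : share;

                        node_pages[nid] -= got;
                        required_kernelcore -= got < required_kernelcore ?
                                               got : required_kernelcore;
                }
                /* mirrors the restart label: retry with one less node */
                usable_nodes--;
                printf("kernelcore still unsatisfied: %lu pages\n",
                       required_kernelcore);
        }
        return 0;
}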
4784 4788
4785 /* Any regular memory on that node ? */ 4789 /* Any regular memory on that node ? */
4786 static void __init check_for_regular_memory(pg_data_t *pgdat) 4790 static void __init check_for_regular_memory(pg_data_t *pgdat)
4787 { 4791 {
4788 #ifdef CONFIG_HIGHMEM 4792 #ifdef CONFIG_HIGHMEM
4789 enum zone_type zone_type; 4793 enum zone_type zone_type;
4790 4794
4791 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4795 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
4792 struct zone *zone = &pgdat->node_zones[zone_type]; 4796 struct zone *zone = &pgdat->node_zones[zone_type];
4793 if (zone->present_pages) { 4797 if (zone->present_pages) {
4794 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4798 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
4795 break; 4799 break;
4796 } 4800 }
4797 } 4801 }
4798 #endif 4802 #endif
4799 } 4803 }
4800 4804
4801 /** 4805 /**
4802 * free_area_init_nodes - Initialise all pg_data_t and zone data 4806 * free_area_init_nodes - Initialise all pg_data_t and zone data
4803 * @max_zone_pfn: an array of max PFNs for each zone 4807 * @max_zone_pfn: an array of max PFNs for each zone
4804 * 4808 *
4805 * This will call free_area_init_node() for each active node in the system. 4809 * This will call free_area_init_node() for each active node in the system.
4806 * Using the page ranges provided by add_active_range(), the size of each 4810 * Using the page ranges provided by add_active_range(), the size of each
4807 * zone in each node and their holes is calculated. If the maximum PFN 4811 * zone in each node and their holes is calculated. If the maximum PFN
4808 * between two adjacent zones match, it is assumed that the zone is empty. 4812 * between two adjacent zones match, it is assumed that the zone is empty.
4809 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 4813 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
4810 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 4814 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
4811 * starts where the previous one ended. For example, ZONE_DMA32 starts 4815 * starts where the previous one ended. For example, ZONE_DMA32 starts
4812 * at arch_max_dma_pfn. 4816 * at arch_max_dma_pfn.
4813 */ 4817 */
4814 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 4818 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4815 { 4819 {
4816 unsigned long start_pfn, end_pfn; 4820 unsigned long start_pfn, end_pfn;
4817 int i, nid; 4821 int i, nid;
4818 4822
4819 /* Record where the zone boundaries are */ 4823 /* Record where the zone boundaries are */
4820 memset(arch_zone_lowest_possible_pfn, 0, 4824 memset(arch_zone_lowest_possible_pfn, 0,
4821 sizeof(arch_zone_lowest_possible_pfn)); 4825 sizeof(arch_zone_lowest_possible_pfn));
4822 memset(arch_zone_highest_possible_pfn, 0, 4826 memset(arch_zone_highest_possible_pfn, 0,
4823 sizeof(arch_zone_highest_possible_pfn)); 4827 sizeof(arch_zone_highest_possible_pfn));
4824 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 4828 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
4825 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 4829 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
4826 for (i = 1; i < MAX_NR_ZONES; i++) { 4830 for (i = 1; i < MAX_NR_ZONES; i++) {
4827 if (i == ZONE_MOVABLE) 4831 if (i == ZONE_MOVABLE)
4828 continue; 4832 continue;
4829 arch_zone_lowest_possible_pfn[i] = 4833 arch_zone_lowest_possible_pfn[i] =
4830 arch_zone_highest_possible_pfn[i-1]; 4834 arch_zone_highest_possible_pfn[i-1];
4831 arch_zone_highest_possible_pfn[i] = 4835 arch_zone_highest_possible_pfn[i] =
4832 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 4836 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
4833 } 4837 }
4834 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 4838 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
4835 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 4839 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
4836 4840
4837 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4841 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4838 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4842 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4839 find_zone_movable_pfns_for_nodes(); 4843 find_zone_movable_pfns_for_nodes();
4840 4844
4841 /* Print out the zone ranges */ 4845 /* Print out the zone ranges */
4842 printk("Zone ranges:\n"); 4846 printk("Zone ranges:\n");
4843 for (i = 0; i < MAX_NR_ZONES; i++) { 4847 for (i = 0; i < MAX_NR_ZONES; i++) {
4844 if (i == ZONE_MOVABLE) 4848 if (i == ZONE_MOVABLE)
4845 continue; 4849 continue;
4846 printk(KERN_CONT " %-8s ", zone_names[i]); 4850 printk(KERN_CONT " %-8s ", zone_names[i]);
4847 if (arch_zone_lowest_possible_pfn[i] == 4851 if (arch_zone_lowest_possible_pfn[i] ==
4848 arch_zone_highest_possible_pfn[i]) 4852 arch_zone_highest_possible_pfn[i])
4849 printk(KERN_CONT "empty\n"); 4853 printk(KERN_CONT "empty\n");
4850 else 4854 else
4851 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 4855 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
4852 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 4856 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
4853 (arch_zone_highest_possible_pfn[i] 4857 (arch_zone_highest_possible_pfn[i]
4854 << PAGE_SHIFT) - 1); 4858 << PAGE_SHIFT) - 1);
4855 } 4859 }
4856 4860
4857 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 4861 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
4858 printk("Movable zone start for each node\n"); 4862 printk("Movable zone start for each node\n");
4859 for (i = 0; i < MAX_NUMNODES; i++) { 4863 for (i = 0; i < MAX_NUMNODES; i++) {
4860 if (zone_movable_pfn[i]) 4864 if (zone_movable_pfn[i])
4861 printk(" Node %d: %#010lx\n", i, 4865 printk(" Node %d: %#010lx\n", i,
4862 zone_movable_pfn[i] << PAGE_SHIFT); 4866 zone_movable_pfn[i] << PAGE_SHIFT);
4863 } 4867 }
4864 4868
4865 /* Print out the early_node_map[] */ 4869 /* Print out the early_node_map[] */
4866 printk("Early memory node ranges\n"); 4870 printk("Early memory node ranges\n");
4867 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4871 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4868 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 4872 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
4869 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 4873 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
4870 4874
4871 /* Initialise every node */ 4875 /* Initialise every node */
4872 mminit_verify_pageflags_layout(); 4876 mminit_verify_pageflags_layout();
4873 setup_nr_node_ids(); 4877 setup_nr_node_ids();
4874 for_each_online_node(nid) { 4878 for_each_online_node(nid) {
4875 pg_data_t *pgdat = NODE_DATA(nid); 4879 pg_data_t *pgdat = NODE_DATA(nid);
4876 free_area_init_node(nid, NULL, 4880 free_area_init_node(nid, NULL,
4877 find_min_pfn_for_node(nid), NULL); 4881 find_min_pfn_for_node(nid), NULL);
4878 4882
4879 /* Any memory on that node */ 4883 /* Any memory on that node */
4880 if (pgdat->node_present_pages) 4884 if (pgdat->node_present_pages)
4881 node_set_state(nid, N_HIGH_MEMORY); 4885 node_set_state(nid, N_HIGH_MEMORY);
4882 check_for_regular_memory(pgdat); 4886 check_for_regular_memory(pgdat);
4883 } 4887 }
4884 } 4888 }
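free_area_init_nodes() derives each zone's span purely from the cumulative max_zone_pfn[] array: every zone starts where the previous one ended, and matching endpoints mean an empty zone. A simplified model of that derivation (it ignores the ZONE_MOVABLE carving; the x86_64-style boundaries and the first usable PFN are invented):

#include <stdio.h>

int main(void)
{
        /* Invented max_zone_pfn[]: DMA up to 16MiB, DMA32 up to 4GiB,
         * Normal up to 8GiB (PFNs of 4KiB pages). */
        const char *names[] = { "DMA", "DMA32", "Normal" };
        unsigned long max_zone_pfn[] = { 0x1000, 0x100000, 0x200000 };
        unsigned long lowest[3], highest[3];
        unsigned long min_pfn = 0x10;           /* first usable PFN, invented */

        lowest[0] = min_pfn;
        highest[0] = max_zone_pfn[0];
        for (int i = 1; i < 3; i++) {
                lowest[i] = highest[i - 1];     /* starts where the last ended */
                highest[i] = max_zone_pfn[i] > lowest[i] ?
                             max_zone_pfn[i] : lowest[i];
        }

        for (int i = 0; i < 3; i++)
                printf("%-7s [mem %#010lx-%#010lx]\n", names[i],
                       lowest[i] << 12, (highest[i] << 12) - 1);
        return 0;
}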
4885 4889
4886 static int __init cmdline_parse_core(char *p, unsigned long *core) 4890 static int __init cmdline_parse_core(char *p, unsigned long *core)
4887 { 4891 {
4888 unsigned long long coremem; 4892 unsigned long long coremem;
4889 if (!p) 4893 if (!p)
4890 return -EINVAL; 4894 return -EINVAL;
4891 4895
4892 coremem = memparse(p, &p); 4896 coremem = memparse(p, &p);
4893 *core = coremem >> PAGE_SHIFT; 4897 *core = coremem >> PAGE_SHIFT;
4894 4898
4895 /* Paranoid check that UL is enough for the coremem value */ 4899 /* Paranoid check that UL is enough for the coremem value */
4896 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 4900 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
4897 4901
4898 return 0; 4902 return 0;
4899 } 4903 }
4900 4904
4901 /* 4905 /*
4902 * kernelcore=size sets the amount of memory for use for allocations that 4906 * kernelcore=size sets the amount of memory for use for allocations that
4903 * cannot be reclaimed or migrated. 4907 * cannot be reclaimed or migrated.
4904 */ 4908 */
4905 static int __init cmdline_parse_kernelcore(char *p) 4909 static int __init cmdline_parse_kernelcore(char *p)
4906 { 4910 {
4907 return cmdline_parse_core(p, &required_kernelcore); 4911 return cmdline_parse_core(p, &required_kernelcore);
4908 } 4912 }
4909 4913
4910 /* 4914 /*
4911 * movablecore=size sets the amount of memory for use for allocations that 4915 * movablecore=size sets the amount of memory for use for allocations that
4912 * can be reclaimed or migrated. 4916 * can be reclaimed or migrated.
4913 */ 4917 */
4914 static int __init cmdline_parse_movablecore(char *p) 4918 static int __init cmdline_parse_movablecore(char *p)
4915 { 4919 {
4916 return cmdline_parse_core(p, &required_movablecore); 4920 return cmdline_parse_core(p, &required_movablecore);
4917 } 4921 }
4918 4922
4919 early_param("kernelcore", cmdline_parse_kernelcore); 4923 early_param("kernelcore", cmdline_parse_kernelcore);
4920 early_param("movablecore", cmdline_parse_movablecore); 4924 early_param("movablecore", cmdline_parse_movablecore);
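Both parameters accept the usual size suffixes via memparse() and are then converted to pages by the PAGE_SHIFT shift in cmdline_parse_core(). A userspace stand-in for that parsing (memparse_model is a hypothetical helper written for this sketch, not the kernel function):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

/* Stand-in for the kernel's memparse(): number plus optional K/M/G suffix. */
static unsigned long long memparse_model(const char *p)
{
        char *end;
        unsigned long long v = strtoull(p, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10;   break;
        default: break;
        }
        return v;
}

int main(void)
{
        /* kernelcore=512M on a 4KiB-page system reserves 131072 pages. */
        printf("%llu pages\n", memparse_model("512M") >> PAGE_SHIFT);
        return 0;
}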
4921 4925
4922 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4926 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4923 4927
4924 /** 4928 /**
4925 * set_dma_reserve - set the specified number of pages reserved in the first zone 4929 * set_dma_reserve - set the specified number of pages reserved in the first zone
4926 * @new_dma_reserve: The number of pages to mark reserved 4930 * @new_dma_reserve: The number of pages to mark reserved
4927 * 4931 *
4928 * The per-cpu batchsize and zone watermarks are determined by present_pages. 4932 * The per-cpu batchsize and zone watermarks are determined by present_pages.
4929 * In the DMA zone, a significant percentage may be consumed by kernel image 4933 * In the DMA zone, a significant percentage may be consumed by kernel image
4930 * and other unfreeable allocations which can skew the watermarks badly. This 4934 * and other unfreeable allocations which can skew the watermarks badly. This
4931 * function may optionally be used to account for unfreeable pages in the 4935 * function may optionally be used to account for unfreeable pages in the
4932 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 4936 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
4933 * smaller per-cpu batchsize. 4937 * smaller per-cpu batchsize.
4934 */ 4938 */
4935 void __init set_dma_reserve(unsigned long new_dma_reserve) 4939 void __init set_dma_reserve(unsigned long new_dma_reserve)
4936 { 4940 {
4937 dma_reserve = new_dma_reserve; 4941 dma_reserve = new_dma_reserve;
4938 } 4942 }
4939 4943
4940 void __init free_area_init(unsigned long *zones_size) 4944 void __init free_area_init(unsigned long *zones_size)
4941 { 4945 {
4942 free_area_init_node(0, zones_size, 4946 free_area_init_node(0, zones_size,
4943 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4947 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4944 } 4948 }
4945 4949
4946 static int page_alloc_cpu_notify(struct notifier_block *self, 4950 static int page_alloc_cpu_notify(struct notifier_block *self,
4947 unsigned long action, void *hcpu) 4951 unsigned long action, void *hcpu)
4948 { 4952 {
4949 int cpu = (unsigned long)hcpu; 4953 int cpu = (unsigned long)hcpu;
4950 4954
4951 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4955 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4952 lru_add_drain_cpu(cpu); 4956 lru_add_drain_cpu(cpu);
4953 drain_pages(cpu); 4957 drain_pages(cpu);
4954 4958
4955 /* 4959 /*
4956 * Spill the event counters of the dead processor 4960 * Spill the event counters of the dead processor
4957 * into the current processor's event counters. 4961 * into the current processor's event counters.
4958 * This artificially elevates the count of the current 4962 * This artificially elevates the count of the current
4959 * processor. 4963 * processor.
4960 */ 4964 */
4961 vm_events_fold_cpu(cpu); 4965 vm_events_fold_cpu(cpu);
4962 4966
4963 /* 4967 /*
4964 * Zero the differential counters of the dead processor 4968 * Zero the differential counters of the dead processor
4965 * so that the vm statistics are consistent. 4969 * so that the vm statistics are consistent.
4966 * 4970 *
4967 * This is only okay since the processor is dead and cannot 4971 * This is only okay since the processor is dead and cannot
4968 * race with what we are doing. 4972 * race with what we are doing.
4969 */ 4973 */
4970 refresh_cpu_vm_stats(cpu); 4974 refresh_cpu_vm_stats(cpu);
4971 } 4975 }
4972 return NOTIFY_OK; 4976 return NOTIFY_OK;
4973 } 4977 }
4974 4978
4975 void __init page_alloc_init(void) 4979 void __init page_alloc_init(void)
4976 { 4980 {
4977 hotcpu_notifier(page_alloc_cpu_notify, 0); 4981 hotcpu_notifier(page_alloc_cpu_notify, 0);
4978 } 4982 }
4979 4983
4980 /* 4984 /*
4981 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 4985 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
4982 * or min_free_kbytes changes. 4986 * or min_free_kbytes changes.
4983 */ 4987 */
4984 static void calculate_totalreserve_pages(void) 4988 static void calculate_totalreserve_pages(void)
4985 { 4989 {
4986 struct pglist_data *pgdat; 4990 struct pglist_data *pgdat;
4987 unsigned long reserve_pages = 0; 4991 unsigned long reserve_pages = 0;
4988 enum zone_type i, j; 4992 enum zone_type i, j;
4989 4993
4990 for_each_online_pgdat(pgdat) { 4994 for_each_online_pgdat(pgdat) {
4991 for (i = 0; i < MAX_NR_ZONES; i++) { 4995 for (i = 0; i < MAX_NR_ZONES; i++) {
4992 struct zone *zone = pgdat->node_zones + i; 4996 struct zone *zone = pgdat->node_zones + i;
4993 unsigned long max = 0; 4997 unsigned long max = 0;
4994 4998
4995 /* Find valid and maximum lowmem_reserve in the zone */ 4999 /* Find valid and maximum lowmem_reserve in the zone */
4996 for (j = i; j < MAX_NR_ZONES; j++) { 5000 for (j = i; j < MAX_NR_ZONES; j++) {
4997 if (zone->lowmem_reserve[j] > max) 5001 if (zone->lowmem_reserve[j] > max)
4998 max = zone->lowmem_reserve[j]; 5002 max = zone->lowmem_reserve[j];
4999 } 5003 }
5000 5004
5001 /* we treat the high watermark as reserved pages. */ 5005 /* we treat the high watermark as reserved pages. */
5002 max += high_wmark_pages(zone); 5006 max += high_wmark_pages(zone);
5003 5007
5004 if (max > zone->present_pages) 5008 if (max > zone->present_pages)
5005 max = zone->present_pages; 5009 max = zone->present_pages;
5006 reserve_pages += max; 5010 reserve_pages += max;
5007 /* 5011 /*
5008 * Lowmem reserves are not available to 5012 * Lowmem reserves are not available to
5009 * GFP_HIGHUSER page cache allocations and 5013 * GFP_HIGHUSER page cache allocations and
5010 * kswapd tries to balance zones to their high 5014 * kswapd tries to balance zones to their high
5011 * watermark. As a result, neither should be 5015 * watermark. As a result, neither should be
5012 * regarded as dirtyable memory, to prevent a 5016 * regarded as dirtyable memory, to prevent a
5013 * situation where reclaim has to clean pages 5017 * situation where reclaim has to clean pages
5014 * in order to balance the zones. 5018 * in order to balance the zones.
5015 */ 5019 */
5016 zone->dirty_balance_reserve = max; 5020 zone->dirty_balance_reserve = max;
5017 } 5021 }
5018 } 5022 }
5019 dirty_balance_reserve = reserve_pages; 5023 dirty_balance_reserve = reserve_pages;
5020 totalreserve_pages = reserve_pages; 5024 totalreserve_pages = reserve_pages;
5021 } 5025 }
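Per zone, the contribution to totalreserve_pages is the largest lowmem_reserve target plus the high watermark, capped at what the zone actually has. A tiny worked example with invented numbers:

#include <stdio.h>

int main(void)
{
        /* One invented zone: reserve targets held against higher zones,
         * its own high watermark, and its present page count. */
        unsigned long lowmem_reserve[] = { 0, 3840, 3968 };
        unsigned long high_wmark = 1024;
        unsigned long present_pages = 221184;
        unsigned long max = 0;

        for (int j = 0; j < 3; j++)
                if (lowmem_reserve[j] > max)
                        max = lowmem_reserve[j];

        max += high_wmark;              /* high watermark counts as reserved */
        if (max > present_pages)
                max = present_pages;

        printf("reserve contribution from this zone: %lu pages\n", max);
        return 0;
}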
5022 5026
5023 /* 5027 /*
5024 * setup_per_zone_lowmem_reserve - called whenever 5028 * setup_per_zone_lowmem_reserve - called whenever
5025 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 5029 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
5026 * has a correct pages reserved value, so an adequate number of 5030 * has a correct pages reserved value, so an adequate number of
5027 * pages are left in the zone after a successful __alloc_pages(). 5031 * pages are left in the zone after a successful __alloc_pages().
5028 */ 5032 */
5029 static void setup_per_zone_lowmem_reserve(void) 5033 static void setup_per_zone_lowmem_reserve(void)
5030 { 5034 {
5031 struct pglist_data *pgdat; 5035 struct pglist_data *pgdat;
5032 enum zone_type j, idx; 5036 enum zone_type j, idx;
5033 5037
5034 for_each_online_pgdat(pgdat) { 5038 for_each_online_pgdat(pgdat) {
5035 for (j = 0; j < MAX_NR_ZONES; j++) { 5039 for (j = 0; j < MAX_NR_ZONES; j++) {
5036 struct zone *zone = pgdat->node_zones + j; 5040 struct zone *zone = pgdat->node_zones + j;
5037 unsigned long present_pages = zone->present_pages; 5041 unsigned long present_pages = zone->present_pages;
5038 5042
5039 zone->lowmem_reserve[j] = 0; 5043 zone->lowmem_reserve[j] = 0;
5040 5044
5041 idx = j; 5045 idx = j;
5042 while (idx) { 5046 while (idx) {
5043 struct zone *lower_zone; 5047 struct zone *lower_zone;
5044 5048
5045 idx--; 5049 idx--;
5046 5050
5047 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5051 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5048 sysctl_lowmem_reserve_ratio[idx] = 1; 5052 sysctl_lowmem_reserve_ratio[idx] = 1;
5049 5053
5050 lower_zone = pgdat->node_zones + idx; 5054 lower_zone = pgdat->node_zones + idx;
5051 lower_zone->lowmem_reserve[j] = present_pages / 5055 lower_zone->lowmem_reserve[j] = present_pages /
5052 sysctl_lowmem_reserve_ratio[idx]; 5056 sysctl_lowmem_reserve_ratio[idx];
5053 present_pages += lower_zone->present_pages; 5057 present_pages += lower_zone->present_pages;
5054 } 5058 }
5055 } 5059 }
5056 } 5060 }
5057 5061
5058 /* update totalreserve_pages */ 5062 /* update totalreserve_pages */
5059 calculate_totalreserve_pages(); 5063 calculate_totalreserve_pages();
5060 } 5064 }
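A sketch of how the inner loop fills lowmem_reserve[] for a three-zone node: each lower zone reserves the pages of all higher zones up to j, divided by that lower zone's ratio. The ratio values 256/256/32 are assumptions matching the usual sysctl_lowmem_reserve_ratio defaults, and the zone sizes are invented:

#include <stdio.h>

int main(void)
{
        /* Invented node: DMA, DMA32, Normal present pages. */
        unsigned long present[] = { 3996, 487296, 3670016 };
        unsigned long ratio[]   = { 256, 256, 32 };     /* assumed defaults */

        for (int j = 0; j < 3; j++) {
                unsigned long pages = present[j];

                /* walk downwards, accumulating higher-zone pages as we go,
                 * mirroring the idx loop above */
                for (int idx = j - 1; idx >= 0; idx--) {
                        printf("zone %d reserves %lu pages against zone %d allocations\n",
                               idx, pages / ratio[idx], j);
                        pages += present[idx];
                }
        }
        return 0;
}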
5061 5065
5062 static void __setup_per_zone_wmarks(void) 5066 static void __setup_per_zone_wmarks(void)
5063 { 5067 {
5064 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5068 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5065 unsigned long lowmem_pages = 0; 5069 unsigned long lowmem_pages = 0;
5066 struct zone *zone; 5070 struct zone *zone;
5067 unsigned long flags; 5071 unsigned long flags;
5068 5072
5069 /* Calculate total number of !ZONE_HIGHMEM pages */ 5073 /* Calculate total number of !ZONE_HIGHMEM pages */
5070 for_each_zone(zone) { 5074 for_each_zone(zone) {
5071 if (!is_highmem(zone)) 5075 if (!is_highmem(zone))
5072 lowmem_pages += zone->present_pages; 5076 lowmem_pages += zone->present_pages;
5073 } 5077 }
5074 5078
5075 for_each_zone(zone) { 5079 for_each_zone(zone) {
5076 u64 tmp; 5080 u64 tmp;
5077 5081
5078 spin_lock_irqsave(&zone->lock, flags); 5082 spin_lock_irqsave(&zone->lock, flags);
5079 tmp = (u64)pages_min * zone->present_pages; 5083 tmp = (u64)pages_min * zone->present_pages;
5080 do_div(tmp, lowmem_pages); 5084 do_div(tmp, lowmem_pages);
5081 if (is_highmem(zone)) { 5085 if (is_highmem(zone)) {
5082 /* 5086 /*
5083 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5087 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5084 * need highmem pages, so cap pages_min to a small 5088 * need highmem pages, so cap pages_min to a small
5085 * value here. 5089 * value here.
5086 * 5090 *
5087 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5091 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5088 * deltas control async page reclaim, and so should 5092 * deltas control async page reclaim, and so should
5089 * not be capped for highmem. 5093 * not be capped for highmem.
5090 */ 5094 */
5091 int min_pages; 5095 int min_pages;
5092 5096
5093 min_pages = zone->present_pages / 1024; 5097 min_pages = zone->present_pages / 1024;
5094 if (min_pages < SWAP_CLUSTER_MAX) 5098 if (min_pages < SWAP_CLUSTER_MAX)
5095 min_pages = SWAP_CLUSTER_MAX; 5099 min_pages = SWAP_CLUSTER_MAX;
5096 if (min_pages > 128) 5100 if (min_pages > 128)
5097 min_pages = 128; 5101 min_pages = 128;
5098 zone->watermark[WMARK_MIN] = min_pages; 5102 zone->watermark[WMARK_MIN] = min_pages;
5099 } else { 5103 } else {
5100 /* 5104 /*
5101 * If it's a lowmem zone, reserve a number of pages 5105 * If it's a lowmem zone, reserve a number of pages
5102 * proportionate to the zone's size. 5106 * proportionate to the zone's size.
5103 */ 5107 */
5104 zone->watermark[WMARK_MIN] = tmp; 5108 zone->watermark[WMARK_MIN] = tmp;
5105 } 5109 }
5106 5110
5107 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5111 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5108 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5112 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5109 5113
5110 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); 5114 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5111 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); 5115 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5112 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); 5116 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5113 5117
5114 setup_zone_migrate_reserve(zone); 5118 setup_zone_migrate_reserve(zone);
5115 spin_unlock_irqrestore(&zone->lock, flags); 5119 spin_unlock_irqrestore(&zone->lock, flags);
5116 } 5120 }
5117 5121
5118 /* update totalreserve_pages */ 5122 /* update totalreserve_pages */
5119 calculate_totalreserve_pages(); 5123 calculate_totalreserve_pages();
5120 } 5124 }
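For a lowmem zone, the watermark arithmetic above reduces to a proportional share of pages_min plus fixed fractions for the low and high marks (the CMA adjustment is ignored here). A standalone calculation with an invented min_free_kbytes and invented zone sizes:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        /* Invented setup: min_free_kbytes=65536, one 2GiB lowmem zone out of
         * 4GiB of total lowmem (all counts in 4KiB pages). */
        unsigned long min_free_kbytes = 65536;
        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
        unsigned long zone_present = 524288, lowmem_pages = 1048576;

        /* each lowmem zone gets a share of pages_min proportional to its size */
        unsigned long long tmp = (unsigned long long)pages_min * zone_present;
        tmp /= lowmem_pages;

        unsigned long wmark_min  = tmp;
        unsigned long wmark_low  = wmark_min + (tmp >> 2);   /* min + 25% */
        unsigned long wmark_high = wmark_min + (tmp >> 1);   /* min + 50% */

        printf("min=%lu low=%lu high=%lu pages\n", wmark_min, wmark_low, wmark_high);
        return 0;
}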
5121 5125
5122 /** 5126 /**
5123 * setup_per_zone_wmarks - called when min_free_kbytes changes 5127 * setup_per_zone_wmarks - called when min_free_kbytes changes
5124 * or when memory is hot-{added|removed} 5128 * or when memory is hot-{added|removed}
5125 * 5129 *
5126 * Ensures that the watermark[min,low,high] values for each zone are set 5130 * Ensures that the watermark[min,low,high] values for each zone are set
5127 * correctly with respect to min_free_kbytes. 5131 * correctly with respect to min_free_kbytes.
5128 */ 5132 */
5129 void setup_per_zone_wmarks(void) 5133 void setup_per_zone_wmarks(void)
5130 { 5134 {
5131 mutex_lock(&zonelists_mutex); 5135 mutex_lock(&zonelists_mutex);
5132 __setup_per_zone_wmarks(); 5136 __setup_per_zone_wmarks();
5133 mutex_unlock(&zonelists_mutex); 5137 mutex_unlock(&zonelists_mutex);
5134 } 5138 }
5135 5139
5136 /* 5140 /*
5137 * The inactive anon list should be small enough that the VM never has to 5141 * The inactive anon list should be small enough that the VM never has to
5138 * do too much work, but large enough that each inactive page has a chance 5142 * do too much work, but large enough that each inactive page has a chance
5139 * to be referenced again before it is swapped out. 5143 * to be referenced again before it is swapped out.
5140 * 5144 *
5141 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5145 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5142 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5146 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5143 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5147 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5144 * the anonymous pages are kept on the inactive list. 5148 * the anonymous pages are kept on the inactive list.
5145 * 5149 *
5146 * total target max 5150 * total target max
5147 * memory ratio inactive anon 5151 * memory ratio inactive anon
5148 * ------------------------------------- 5152 * -------------------------------------
5149 * 10MB 1 5MB 5153 * 10MB 1 5MB
5150 * 100MB 1 50MB 5154 * 100MB 1 50MB
5151 * 1GB 3 250MB 5155 * 1GB 3 250MB
5152 * 10GB 10 0.9GB 5156 * 10GB 10 0.9GB
5153 * 100GB 31 3GB 5157 * 100GB 31 3GB
5154 * 1TB 101 10GB 5158 * 1TB 101 10GB
5155 * 10TB 320 32GB 5159 * 10TB 320 32GB
5156 */ 5160 */
5157 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5161 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5158 { 5162 {
5159 unsigned int gb, ratio; 5163 unsigned int gb, ratio;
5160 5164
5161 /* Zone size in gigabytes */ 5165 /* Zone size in gigabytes */
5162 gb = zone->present_pages >> (30 - PAGE_SHIFT); 5166 gb = zone->present_pages >> (30 - PAGE_SHIFT);
5163 if (gb) 5167 if (gb)
5164 ratio = int_sqrt(10 * gb); 5168 ratio = int_sqrt(10 * gb);
5165 else 5169 else
5166 ratio = 1; 5170 ratio = 1;
5167 5171
5168 zone->inactive_ratio = ratio; 5172 zone->inactive_ratio = ratio;
5169 } 5173 }
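/*
 * Illustrative worked example (a sketch, not part of this file): for a 1TB
 * zone with 4KB pages, present_pages is roughly 1 << 28, so
 *
 *   gb    = (1 << 28) >> (30 - 12) = 1024
 *   ratio = int_sqrt(10 * 1024)    = int_sqrt(10240) = 101
 *
 * which matches the "1TB / 101" row in the table above.
 */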
5170 5174
5171 static void __meminit setup_per_zone_inactive_ratio(void) 5175 static void __meminit setup_per_zone_inactive_ratio(void)
5172 { 5176 {
5173 struct zone *zone; 5177 struct zone *zone;
5174 5178
5175 for_each_zone(zone) 5179 for_each_zone(zone)
5176 calculate_zone_inactive_ratio(zone); 5180 calculate_zone_inactive_ratio(zone);
5177 } 5181 }
5178 5182
5179 /* 5183 /*
5180 * Initialise min_free_kbytes. 5184 * Initialise min_free_kbytes.
5181 * 5185 *
5182 * For small machines we want it small (128k min). For large machines 5186 * For small machines we want it small (128k min). For large machines
5183 * we want it large (64MB max). But it is not linear, because network 5187 * we want it large (64MB max). But it is not linear, because network
5184 * bandwidth does not increase linearly with machine size. We use 5188 * bandwidth does not increase linearly with machine size. We use
5185 * 5189 *
5186 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5190 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5187 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5191 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5188 * 5192 *
5189 * which yields 5193 * which yields
5190 * 5194 *
5191 * 16MB: 512k 5195 * 16MB: 512k
5192 * 32MB: 724k 5196 * 32MB: 724k
5193 * 64MB: 1024k 5197 * 64MB: 1024k
5194 * 128MB: 1448k 5198 * 128MB: 1448k
5195 * 256MB: 2048k 5199 * 256MB: 2048k
5196 * 512MB: 2896k 5200 * 512MB: 2896k
5197 * 1024MB: 4096k 5201 * 1024MB: 4096k
5198 * 2048MB: 5792k 5202 * 2048MB: 5792k
5199 * 4096MB: 8192k 5203 * 4096MB: 8192k
5200 * 8192MB: 11584k 5204 * 8192MB: 11584k
5201 * 16384MB: 16384k 5205 * 16384MB: 16384k
5202 */ 5206 */
5203 int __meminit init_per_zone_wmark_min(void) 5207 int __meminit init_per_zone_wmark_min(void)
5204 { 5208 {
5205 unsigned long lowmem_kbytes; 5209 unsigned long lowmem_kbytes;
5206 5210
5207 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5211 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5208 5212
5209 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5213 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5210 if (min_free_kbytes < 128) 5214 if (min_free_kbytes < 128)
5211 min_free_kbytes = 128; 5215 min_free_kbytes = 128;
5212 if (min_free_kbytes > 65536) 5216 if (min_free_kbytes > 65536)
5213 min_free_kbytes = 65536; 5217 min_free_kbytes = 65536;
5214 setup_per_zone_wmarks(); 5218 setup_per_zone_wmarks();
5215 refresh_zone_stat_thresholds(); 5219 refresh_zone_stat_thresholds();
5216 setup_per_zone_lowmem_reserve(); 5220 setup_per_zone_lowmem_reserve();
5217 setup_per_zone_inactive_ratio(); 5221 setup_per_zone_inactive_ratio();
5218 return 0; 5222 return 0;
5219 } 5223 }
5220 module_init(init_per_zone_wmark_min) 5224 module_init(init_per_zone_wmark_min)
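/*
 * Illustrative worked example (a sketch, not part of this file): with
 * roughly 1024MB of lowmem, lowmem_kbytes comes to about 1048576, so
 *
 *   min_free_kbytes = int_sqrt(1048576 * 16) = int_sqrt(16777216) = 4096
 *
 * matching the "1024MB: 4096k" row in the table above; the result is then
 * clamped to the [128, 65536] range.
 */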
5221 5225
5222 /* 5226 /*
5223 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5227 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5224 * that we can call two helper functions whenever min_free_kbytes 5228 * that we can call two helper functions whenever min_free_kbytes
5225 * changes. 5229 * changes.
5226 */ 5230 */
5227 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5231 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5228 void __user *buffer, size_t *length, loff_t *ppos) 5232 void __user *buffer, size_t *length, loff_t *ppos)
5229 { 5233 {
5230 proc_dointvec(table, write, buffer, length, ppos); 5234 proc_dointvec(table, write, buffer, length, ppos);
5231 if (write) 5235 if (write)
5232 setup_per_zone_wmarks(); 5236 setup_per_zone_wmarks();
5233 return 0; 5237 return 0;
5234 } 5238 }
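/*
 * Illustrative usage (a sketch, not part of this file): the handler above is
 * what runs when an administrator writes the sysctl, e.g.
 *
 *   echo 65536 > /proc/sys/vm/min_free_kbytes
 *
 * proc_dointvec() stores the new value and, because write != 0,
 * setup_per_zone_wmarks() recomputes every zone's min/low/high watermarks.
 */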
5235 5239
5236 #ifdef CONFIG_NUMA 5240 #ifdef CONFIG_NUMA
5237 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5241 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5238 void __user *buffer, size_t *length, loff_t *ppos) 5242 void __user *buffer, size_t *length, loff_t *ppos)
5239 { 5243 {
5240 struct zone *zone; 5244 struct zone *zone;
5241 int rc; 5245 int rc;
5242 5246
5243 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5247 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5244 if (rc) 5248 if (rc)
5245 return rc; 5249 return rc;
5246 5250
5247 for_each_zone(zone) 5251 for_each_zone(zone)
5248 zone->min_unmapped_pages = (zone->present_pages * 5252 zone->min_unmapped_pages = (zone->present_pages *
5249 sysctl_min_unmapped_ratio) / 100; 5253 sysctl_min_unmapped_ratio) / 100;
5250 return 0; 5254 return 0;
5251 } 5255 }
5252 5256
5253 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5257 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5254 void __user *buffer, size_t *length, loff_t *ppos) 5258 void __user *buffer, size_t *length, loff_t *ppos)
5255 { 5259 {
5256 struct zone *zone; 5260 struct zone *zone;
5257 int rc; 5261 int rc;
5258 5262
5259 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5263 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5260 if (rc) 5264 if (rc)
5261 return rc; 5265 return rc;
5262 5266
5263 for_each_zone(zone) 5267 for_each_zone(zone)
5264 zone->min_slab_pages = (zone->present_pages * 5268 zone->min_slab_pages = (zone->present_pages *
5265 sysctl_min_slab_ratio) / 100; 5269 sysctl_min_slab_ratio) / 100;
5266 return 0; 5270 return 0;
5267 } 5271 }
5268 #endif 5272 #endif
5269 5273
5270 /* 5274 /*
5271 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5275 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5272 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5276 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5273 * whenever sysctl_lowmem_reserve_ratio changes. 5277 * whenever sysctl_lowmem_reserve_ratio changes.
5274 * 5278 *
5275 * The reserve ratio obviously has absolutely no relation to the 5279 * The reserve ratio obviously has absolutely no relation to the
5276 * minimum watermarks. The lowmem reserve ratio only makes sense 5280 * minimum watermarks. The lowmem reserve ratio only makes sense
5277 * as a function of the boot-time zone sizes. 5281 * as a function of the boot-time zone sizes.
5278 */ 5282 */
5279 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5283 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5280 void __user *buffer, size_t *length, loff_t *ppos) 5284 void __user *buffer, size_t *length, loff_t *ppos)
5281 { 5285 {
5282 proc_dointvec_minmax(table, write, buffer, length, ppos); 5286 proc_dointvec_minmax(table, write, buffer, length, ppos);
5283 setup_per_zone_lowmem_reserve(); 5287 setup_per_zone_lowmem_reserve();
5284 return 0; 5288 return 0;
5285 } 5289 }
5286 5290
5287 /* 5291 /*
5288 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5292 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5289 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist 5293 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist
5290 * can have before it gets flushed back to the buddy allocator. 5294 * can have before it gets flushed back to the buddy allocator.
5291 */ 5295 */
5292 5296
5293 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5297 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5294 void __user *buffer, size_t *length, loff_t *ppos) 5298 void __user *buffer, size_t *length, loff_t *ppos)
5295 { 5299 {
5296 struct zone *zone; 5300 struct zone *zone;
5297 unsigned int cpu; 5301 unsigned int cpu;
5298 int ret; 5302 int ret;
5299 5303
5300 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5304 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5301 if (!write || (ret < 0)) 5305 if (!write || (ret < 0))
5302 return ret; 5306 return ret;
5303 for_each_populated_zone(zone) { 5307 for_each_populated_zone(zone) {
5304 for_each_possible_cpu(cpu) { 5308 for_each_possible_cpu(cpu) {
5305 unsigned long high; 5309 unsigned long high;
5306 high = zone->present_pages / percpu_pagelist_fraction; 5310 high = zone->present_pages / percpu_pagelist_fraction;
5307 setup_pagelist_highmark( 5311 setup_pagelist_highmark(
5308 per_cpu_ptr(zone->pageset, cpu), high); 5312 per_cpu_ptr(zone->pageset, cpu), high);
5309 } 5313 }
5310 } 5314 }
5311 return 0; 5315 return 0;
5312 } 5316 }
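/*
 * Illustrative worked example (a sketch, not part of this file): writing 8
 * to /proc/sys/vm/percpu_pagelist_fraction on a zone with 262144 present
 * pages (1GB of 4KB pages) sets each per-cpu pagelist high mark to
 *
 *   high = 262144 / 8 = 32768 pages (128MB)
 *
 * before that pagelist is drained back to the buddy allocator.
 */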
5313 5317
5314 int hashdist = HASHDIST_DEFAULT; 5318 int hashdist = HASHDIST_DEFAULT;
5315 5319
5316 #ifdef CONFIG_NUMA 5320 #ifdef CONFIG_NUMA
5317 static int __init set_hashdist(char *str) 5321 static int __init set_hashdist(char *str)
5318 { 5322 {
5319 if (!str) 5323 if (!str)
5320 return 0; 5324 return 0;
5321 hashdist = simple_strtoul(str, &str, 0); 5325 hashdist = simple_strtoul(str, &str, 0);
5322 return 1; 5326 return 1;
5323 } 5327 }
5324 __setup("hashdist=", set_hashdist); 5328 __setup("hashdist=", set_hashdist);
5325 #endif 5329 #endif
5326 5330
5327 /* 5331 /*
5328 * allocate a large system hash table from bootmem 5332 * allocate a large system hash table from bootmem
5329 * - it is assumed that the hash table must contain an exact power-of-2 5333 * - it is assumed that the hash table must contain an exact power-of-2
5330 * quantity of entries 5334 * quantity of entries
5331 * - limit is the number of hash buckets, not the total allocation size 5335 * - limit is the number of hash buckets, not the total allocation size
5332 */ 5336 */
5333 void *__init alloc_large_system_hash(const char *tablename, 5337 void *__init alloc_large_system_hash(const char *tablename,
5334 unsigned long bucketsize, 5338 unsigned long bucketsize,
5335 unsigned long numentries, 5339 unsigned long numentries,
5336 int scale, 5340 int scale,
5337 int flags, 5341 int flags,
5338 unsigned int *_hash_shift, 5342 unsigned int *_hash_shift,
5339 unsigned int *_hash_mask, 5343 unsigned int *_hash_mask,
5340 unsigned long low_limit, 5344 unsigned long low_limit,
5341 unsigned long high_limit) 5345 unsigned long high_limit)
5342 { 5346 {
5343 unsigned long long max = high_limit; 5347 unsigned long long max = high_limit;
5344 unsigned long log2qty, size; 5348 unsigned long log2qty, size;
5345 void *table = NULL; 5349 void *table = NULL;
5346 5350
5347 /* allow the kernel cmdline to have a say */ 5351 /* allow the kernel cmdline to have a say */
5348 if (!numentries) { 5352 if (!numentries) {
5349 /* round applicable memory size up to nearest megabyte */ 5353 /* round applicable memory size up to nearest megabyte */
5350 numentries = nr_kernel_pages; 5354 numentries = nr_kernel_pages;
5351 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 5355 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
5352 numentries >>= 20 - PAGE_SHIFT; 5356 numentries >>= 20 - PAGE_SHIFT;
5353 numentries <<= 20 - PAGE_SHIFT; 5357 numentries <<= 20 - PAGE_SHIFT;
5354 5358
5355 /* limit to 1 bucket per 2^scale bytes of low memory */ 5359 /* limit to 1 bucket per 2^scale bytes of low memory */
5356 if (scale > PAGE_SHIFT) 5360 if (scale > PAGE_SHIFT)
5357 numentries >>= (scale - PAGE_SHIFT); 5361 numentries >>= (scale - PAGE_SHIFT);
5358 else 5362 else
5359 numentries <<= (PAGE_SHIFT - scale); 5363 numentries <<= (PAGE_SHIFT - scale);
5360 5364
5361 /* Make sure we've got at least a 0-order allocation.. */ 5365 /* Make sure we've got at least a 0-order allocation.. */
5362 if (unlikely(flags & HASH_SMALL)) { 5366 if (unlikely(flags & HASH_SMALL)) {
5363 /* Makes no sense without HASH_EARLY */ 5367 /* Makes no sense without HASH_EARLY */
5364 WARN_ON(!(flags & HASH_EARLY)); 5368 WARN_ON(!(flags & HASH_EARLY));
5365 if (!(numentries >> *_hash_shift)) { 5369 if (!(numentries >> *_hash_shift)) {
5366 numentries = 1UL << *_hash_shift; 5370 numentries = 1UL << *_hash_shift;
5367 BUG_ON(!numentries); 5371 BUG_ON(!numentries);
5368 } 5372 }
5369 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5373 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5370 numentries = PAGE_SIZE / bucketsize; 5374 numentries = PAGE_SIZE / bucketsize;
5371 } 5375 }
5372 numentries = roundup_pow_of_two(numentries); 5376 numentries = roundup_pow_of_two(numentries);
5373 5377
5374 /* limit allocation size to 1/16 total memory by default */ 5378 /* limit allocation size to 1/16 total memory by default */
5375 if (max == 0) { 5379 if (max == 0) {
5376 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5380 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5377 do_div(max, bucketsize); 5381 do_div(max, bucketsize);
5378 } 5382 }
5379 max = min(max, 0x80000000ULL); 5383 max = min(max, 0x80000000ULL);
5380 5384
5381 if (numentries < low_limit) 5385 if (numentries < low_limit)
5382 numentries = low_limit; 5386 numentries = low_limit;
5383 if (numentries > max) 5387 if (numentries > max)
5384 numentries = max; 5388 numentries = max;
5385 5389
5386 log2qty = ilog2(numentries); 5390 log2qty = ilog2(numentries);
5387 5391
5388 do { 5392 do {
5389 size = bucketsize << log2qty; 5393 size = bucketsize << log2qty;
5390 if (flags & HASH_EARLY) 5394 if (flags & HASH_EARLY)
5391 table = alloc_bootmem_nopanic(size); 5395 table = alloc_bootmem_nopanic(size);
5392 else if (hashdist) 5396 else if (hashdist)
5393 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5397 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5394 else { 5398 else {
5395 /* 5399 /*
5396 * If bucketsize is not a power of two, we may free 5400 * If bucketsize is not a power of two, we may free
5397 * some pages at the end of the hash table, which 5401 * some pages at the end of the hash table, which
5398 * alloc_pages_exact() does automatically 5402 * alloc_pages_exact() does automatically
5399 */ 5403 */
5400 if (get_order(size) < MAX_ORDER) { 5404 if (get_order(size) < MAX_ORDER) {
5401 table = alloc_pages_exact(size, GFP_ATOMIC); 5405 table = alloc_pages_exact(size, GFP_ATOMIC);
5402 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5406 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5403 } 5407 }
5404 } 5408 }
5405 } while (!table && size > PAGE_SIZE && --log2qty); 5409 } while (!table && size > PAGE_SIZE && --log2qty);
5406 5410
5407 if (!table) 5411 if (!table)
5408 panic("Failed to allocate %s hash table\n", tablename); 5412 panic("Failed to allocate %s hash table\n", tablename);
5409 5413
5410 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5414 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5411 tablename, 5415 tablename,
5412 (1UL << log2qty), 5416 (1UL << log2qty),
5413 ilog2(size) - PAGE_SHIFT, 5417 ilog2(size) - PAGE_SHIFT,
5414 size); 5418 size);
5415 5419
5416 if (_hash_shift) 5420 if (_hash_shift)
5417 *_hash_shift = log2qty; 5421 *_hash_shift = log2qty;
5418 if (_hash_mask) 5422 if (_hash_mask)
5419 *_hash_mask = (1 << log2qty) - 1; 5423 *_hash_mask = (1 << log2qty) - 1;
5420 5424
5421 return table; 5425 return table;
5422 } 5426 }
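/*
 * Hypothetical caller sketch (illustration only; "example_table",
 * example_shift and example_mask are placeholder names, not kernel
 * symbols). It mirrors how boot-time hash tables are typically sized:
 * numentries = 0 lets the kernel derive the size from nr_kernel_pages,
 * scale = 14 asks for roughly one bucket per 16KB of low memory, and
 * high_limit = 0 keeps the default cap of 1/16 of total memory.
 */
static struct hlist_head *example_table __read_mostly;
static unsigned int example_shift, example_mask;

static void __init example_table_init(void)
{
	example_table = alloc_large_system_hash("Example cache",
						sizeof(struct hlist_head),
						0,	/* numentries: auto */
						14,	/* scale */
						HASH_EARLY,
						&example_shift,
						&example_mask,
						0,	/* low_limit */
						0);	/* high_limit: default */
}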
5423 5427
5424 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5428 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5425 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5429 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5426 unsigned long pfn) 5430 unsigned long pfn)
5427 { 5431 {
5428 #ifdef CONFIG_SPARSEMEM 5432 #ifdef CONFIG_SPARSEMEM
5429 return __pfn_to_section(pfn)->pageblock_flags; 5433 return __pfn_to_section(pfn)->pageblock_flags;
5430 #else 5434 #else
5431 return zone->pageblock_flags; 5435 return zone->pageblock_flags;
5432 #endif /* CONFIG_SPARSEMEM */ 5436 #endif /* CONFIG_SPARSEMEM */
5433 } 5437 }
5434 5438
5435 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5439 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5436 { 5440 {
5437 #ifdef CONFIG_SPARSEMEM 5441 #ifdef CONFIG_SPARSEMEM
5438 pfn &= (PAGES_PER_SECTION-1); 5442 pfn &= (PAGES_PER_SECTION-1);
5439 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5443 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5440 #else 5444 #else
5441 pfn = pfn - zone->zone_start_pfn; 5445 pfn = pfn - zone->zone_start_pfn;
5442 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5446 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5443 #endif /* CONFIG_SPARSEMEM */ 5447 #endif /* CONFIG_SPARSEMEM */
5444 } 5448 }
5445 5449
5446 /** 5450 /**
5447 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 5451 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5448 * @page: The page within the block of interest 5452 * @page: The page within the block of interest
5449 * @start_bitidx: The first bit of interest to retrieve 5453 * @start_bitidx: The first bit of interest to retrieve
5450 * @end_bitidx: The last bit of interest 5454 * @end_bitidx: The last bit of interest
5451 * returns pageblock_bits flags 5455 * returns pageblock_bits flags
5452 */ 5456 */
5453 unsigned long get_pageblock_flags_group(struct page *page, 5457 unsigned long get_pageblock_flags_group(struct page *page,
5454 int start_bitidx, int end_bitidx) 5458 int start_bitidx, int end_bitidx)
5455 { 5459 {
5456 struct zone *zone; 5460 struct zone *zone;
5457 unsigned long *bitmap; 5461 unsigned long *bitmap;
5458 unsigned long pfn, bitidx; 5462 unsigned long pfn, bitidx;
5459 unsigned long flags = 0; 5463 unsigned long flags = 0;
5460 unsigned long value = 1; 5464 unsigned long value = 1;
5461 5465
5462 zone = page_zone(page); 5466 zone = page_zone(page);
5463 pfn = page_to_pfn(page); 5467 pfn = page_to_pfn(page);
5464 bitmap = get_pageblock_bitmap(zone, pfn); 5468 bitmap = get_pageblock_bitmap(zone, pfn);
5465 bitidx = pfn_to_bitidx(zone, pfn); 5469 bitidx = pfn_to_bitidx(zone, pfn);
5466 5470
5467 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5471 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5468 if (test_bit(bitidx + start_bitidx, bitmap)) 5472 if (test_bit(bitidx + start_bitidx, bitmap))
5469 flags |= value; 5473 flags |= value;
5470 5474
5471 return flags; 5475 return flags;
5472 } 5476 }
5473 5477
5474 /** 5478 /**
5475 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 5479 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
5476 * @page: The page within the block of interest 5480 * @page: The page within the block of interest
5477 * @start_bitidx: The first bit of interest 5481 * @start_bitidx: The first bit of interest
5478 * @end_bitidx: The last bit of interest 5482 * @end_bitidx: The last bit of interest
5479 * @flags: The flags to set 5483 * @flags: The flags to set
5480 */ 5484 */
5481 void set_pageblock_flags_group(struct page *page, unsigned long flags, 5485 void set_pageblock_flags_group(struct page *page, unsigned long flags,
5482 int start_bitidx, int end_bitidx) 5486 int start_bitidx, int end_bitidx)
5483 { 5487 {
5484 struct zone *zone; 5488 struct zone *zone;
5485 unsigned long *bitmap; 5489 unsigned long *bitmap;
5486 unsigned long pfn, bitidx; 5490 unsigned long pfn, bitidx;
5487 unsigned long value = 1; 5491 unsigned long value = 1;
5488 5492
5489 zone = page_zone(page); 5493 zone = page_zone(page);
5490 pfn = page_to_pfn(page); 5494 pfn = page_to_pfn(page);
5491 bitmap = get_pageblock_bitmap(zone, pfn); 5495 bitmap = get_pageblock_bitmap(zone, pfn);
5492 bitidx = pfn_to_bitidx(zone, pfn); 5496 bitidx = pfn_to_bitidx(zone, pfn);
5493 VM_BUG_ON(pfn < zone->zone_start_pfn); 5497 VM_BUG_ON(pfn < zone->zone_start_pfn);
5494 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); 5498 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5495 5499
5496 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5500 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5497 if (flags & value) 5501 if (flags & value)
5498 __set_bit(bitidx + start_bitidx, bitmap); 5502 __set_bit(bitidx + start_bitidx, bitmap);
5499 else 5503 else
5500 __clear_bit(bitidx + start_bitidx, bitmap); 5504 __clear_bit(bitidx + start_bitidx, bitmap);
5501 } 5505 }
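/*
 * For context (paraphrased from pageblock-flags.h of this era, not quoted):
 * the migratetype accessors are thin wrappers around the two bit-range
 * helpers above, roughly
 *
 *   #define get_pageblock_migratetype(page) \
 *           get_pageblock_flags_group(page, PB_migrate, PB_migrate_end)
 *
 * and set_pageblock_migratetype() likewise forwards to
 * set_pageblock_flags_group() with the PB_migrate bit range.
 */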
5502 5506
5503 /* 5507 /*
5504 * This function checks whether the pageblock includes unmovable pages or not. 5508 * This function checks whether the pageblock includes unmovable pages or not.
5505 * If @count is not zero, it is okay to include fewer than @count unmovable pages 5509 * If @count is not zero, it is okay to include fewer than @count unmovable pages
5506 * 5510 *
5507 * A PageLRU check without isolation or lru_lock could race so that a 5511 * A PageLRU check without isolation or lru_lock could race so that a
5508 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5512 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5509 * expect this function to be exact. 5513 * expect this function to be exact.
5510 */ 5514 */
5511 bool has_unmovable_pages(struct zone *zone, struct page *page, int count) 5515 bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5512 { 5516 {
5513 unsigned long pfn, iter, found; 5517 unsigned long pfn, iter, found;
5514 int mt; 5518 int mt;
5515 5519
5516 /* 5520 /*
5517 * To avoid noisy data, lru_add_drain_all() should be called first. 5521 * To avoid noisy data, lru_add_drain_all() should be called first.
5518 * If the zone is ZONE_MOVABLE, it never contains unmovable pages. 5522 * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
5519 */ 5523 */
5520 if (zone_idx(zone) == ZONE_MOVABLE) 5524 if (zone_idx(zone) == ZONE_MOVABLE)
5521 return false; 5525 return false;
5522 mt = get_pageblock_migratetype(page); 5526 mt = get_pageblock_migratetype(page);
5523 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5527 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5524 return false; 5528 return false;
5525 5529
5526 pfn = page_to_pfn(page); 5530 pfn = page_to_pfn(page);
5527 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5531 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5528 unsigned long check = pfn + iter; 5532 unsigned long check = pfn + iter;
5529 5533
5530 if (!pfn_valid_within(check)) 5534 if (!pfn_valid_within(check))
5531 continue; 5535 continue;
5532 5536
5533 page = pfn_to_page(check); 5537 page = pfn_to_page(check);
5534 /* 5538 /*
5535 * We can't use page_count without pinning the page 5539 * We can't use page_count without pinning the page
5536 * because another CPU can free the compound page. 5540 * because another CPU can free the compound page.
5537 * This check already skips compound tails of THP 5541 * This check already skips compound tails of THP
5538 * because their page->_count is zero at all times. 5542 * because their page->_count is zero at all times.
5539 */ 5543 */
5540 if (!atomic_read(&page->_count)) { 5544 if (!atomic_read(&page->_count)) {
5541 if (PageBuddy(page)) 5545 if (PageBuddy(page))
5542 iter += (1 << page_order(page)) - 1; 5546 iter += (1 << page_order(page)) - 1;
5543 continue; 5547 continue;
5544 } 5548 }
5545 5549
5546 if (!PageLRU(page)) 5550 if (!PageLRU(page))
5547 found++; 5551 found++;
5548 /* 5552 /*
5549 * If there are RECLAIMABLE pages, we need to check them. 5553 * If there are RECLAIMABLE pages, we need to check them.
5550 * But for now, memory offline itself doesn't call shrink_slab() 5554 * But for now, memory offline itself doesn't call shrink_slab()
5551 * and this still needs to be fixed. 5555 * and this still needs to be fixed.
5552 */ 5556 */
5553 /* 5557 /*
5554 * If the page is not RAM, page_count() should be 0. 5558 * If the page is not RAM, page_count() should be 0.
5555 * We don't need any more checks. This is a _used_ non-movable page. 5559 * We don't need any more checks. This is a _used_ non-movable page.
5556 * 5560 *
5557 * The problematic thing here is PG_reserved pages. PG_reserved 5561 * The problematic thing here is PG_reserved pages. PG_reserved
5558 * is set on both a memory hole page and a _used_ kernel 5562 * is set on both a memory hole page and a _used_ kernel
5559 * page at boot. 5563 * page at boot.
5560 */ 5564 */
5561 if (found > count) 5565 if (found > count)
5562 return true; 5566 return true;
5563 } 5567 }
5564 return false; 5568 return false;
5565 } 5569 }
5566 5570
5567 bool is_pageblock_removable_nolock(struct page *page) 5571 bool is_pageblock_removable_nolock(struct page *page)
5568 { 5572 {
5569 struct zone *zone; 5573 struct zone *zone;
5570 unsigned long pfn; 5574 unsigned long pfn;
5571 5575
5572 /* 5576 /*
5573 * We have to be careful here because we are iterating over memory 5577 * We have to be careful here because we are iterating over memory
5574 * sections which are not zone aware so we might end up outside of 5578 * sections which are not zone aware so we might end up outside of
5575 * the zone but still within the section. 5579 * the zone but still within the section.
5576 * We have to take care about the node as well. If the node is offline 5580 * We have to take care about the node as well. If the node is offline
5577 * its NODE_DATA will be NULL - see page_zone. 5581 * its NODE_DATA will be NULL - see page_zone.
5578 */ 5582 */
5579 if (!node_online(page_to_nid(page))) 5583 if (!node_online(page_to_nid(page)))
5580 return false; 5584 return false;
5581 5585
5582 zone = page_zone(page); 5586 zone = page_zone(page);
5583 pfn = page_to_pfn(page); 5587 pfn = page_to_pfn(page);
5584 if (zone->zone_start_pfn > pfn || 5588 if (zone->zone_start_pfn > pfn ||
5585 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5589 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5586 return false; 5590 return false;
5587 5591
5588 return !has_unmovable_pages(zone, page, 0); 5592 return !has_unmovable_pages(zone, page, 0);
5589 } 5593 }
5590 5594
5591 #ifdef CONFIG_CMA 5595 #ifdef CONFIG_CMA
5592 5596
5593 static unsigned long pfn_max_align_down(unsigned long pfn) 5597 static unsigned long pfn_max_align_down(unsigned long pfn)
5594 { 5598 {
5595 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 5599 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
5596 pageblock_nr_pages) - 1); 5600 pageblock_nr_pages) - 1);
5597 } 5601 }
5598 5602
5599 static unsigned long pfn_max_align_up(unsigned long pfn) 5603 static unsigned long pfn_max_align_up(unsigned long pfn)
5600 { 5604 {
5601 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 5605 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
5602 pageblock_nr_pages)); 5606 pageblock_nr_pages));
5603 } 5607 }
5604 5608
5605 static struct page * 5609 static struct page *
5606 __alloc_contig_migrate_alloc(struct page *page, unsigned long private, 5610 __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5607 int **resultp) 5611 int **resultp)
5608 { 5612 {
5609 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 5613 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5610 5614
5611 if (PageHighMem(page)) 5615 if (PageHighMem(page))
5612 gfp_mask |= __GFP_HIGHMEM; 5616 gfp_mask |= __GFP_HIGHMEM;
5613 5617
5614 return alloc_page(gfp_mask); 5618 return alloc_page(gfp_mask);
5615 } 5619 }
5616 5620
5617 /* [start, end) must belong to a single zone. */ 5621 /* [start, end) must belong to a single zone. */
5618 static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) 5622 static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5619 { 5623 {
5620 /* This function is based on compact_zone() from compaction.c. */ 5624 /* This function is based on compact_zone() from compaction.c. */
5621 5625
5622 unsigned long pfn = start; 5626 unsigned long pfn = start;
5623 unsigned int tries = 0; 5627 unsigned int tries = 0;
5624 int ret = 0; 5628 int ret = 0;
5625 5629
5626 struct compact_control cc = { 5630 struct compact_control cc = {
5627 .nr_migratepages = 0, 5631 .nr_migratepages = 0,
5628 .order = -1, 5632 .order = -1,
5629 .zone = page_zone(pfn_to_page(start)), 5633 .zone = page_zone(pfn_to_page(start)),
5630 .sync = true, 5634 .sync = true,
5631 }; 5635 };
5632 INIT_LIST_HEAD(&cc.migratepages); 5636 INIT_LIST_HEAD(&cc.migratepages);
5633 5637
5634 migrate_prep_local(); 5638 migrate_prep_local();
5635 5639
5636 while (pfn < end || !list_empty(&cc.migratepages)) { 5640 while (pfn < end || !list_empty(&cc.migratepages)) {
5637 if (fatal_signal_pending(current)) { 5641 if (fatal_signal_pending(current)) {
5638 ret = -EINTR; 5642 ret = -EINTR;
5639 break; 5643 break;
5640 } 5644 }
5641 5645
5642 if (list_empty(&cc.migratepages)) { 5646 if (list_empty(&cc.migratepages)) {
5643 cc.nr_migratepages = 0; 5647 cc.nr_migratepages = 0;
5644 pfn = isolate_migratepages_range(cc.zone, &cc, 5648 pfn = isolate_migratepages_range(cc.zone, &cc,
5645 pfn, end); 5649 pfn, end);
5646 if (!pfn) { 5650 if (!pfn) {
5647 ret = -EINTR; 5651 ret = -EINTR;
5648 break; 5652 break;
5649 } 5653 }
5650 tries = 0; 5654 tries = 0;
5651 } else if (++tries == 5) { 5655 } else if (++tries == 5) {
5652 ret = ret < 0 ? ret : -EBUSY; 5656 ret = ret < 0 ? ret : -EBUSY;
5653 break; 5657 break;
5654 } 5658 }
5655 5659
5656 ret = migrate_pages(&cc.migratepages, 5660 ret = migrate_pages(&cc.migratepages,
5657 __alloc_contig_migrate_alloc, 5661 __alloc_contig_migrate_alloc,
5658 0, false, MIGRATE_SYNC); 5662 0, false, MIGRATE_SYNC);
5659 } 5663 }
5660 5664
5661 putback_lru_pages(&cc.migratepages); 5665 putback_lru_pages(&cc.migratepages);
5662 return ret > 0 ? 0 : ret; 5666 return ret > 0 ? 0 : ret;
5663 } 5667 }
5664 5668
5665 /* 5669 /*
5666 * Update the zone's CMA page counter used for watermark level calculation. 5670 * Update the zone's CMA page counter used for watermark level calculation.
5667 */ 5671 */
5668 static inline void __update_cma_watermarks(struct zone *zone, int count) 5672 static inline void __update_cma_watermarks(struct zone *zone, int count)
5669 { 5673 {
5670 unsigned long flags; 5674 unsigned long flags;
5671 spin_lock_irqsave(&zone->lock, flags); 5675 spin_lock_irqsave(&zone->lock, flags);
5672 zone->min_cma_pages += count; 5676 zone->min_cma_pages += count;
5673 spin_unlock_irqrestore(&zone->lock, flags); 5677 spin_unlock_irqrestore(&zone->lock, flags);
5674 setup_per_zone_wmarks(); 5678 setup_per_zone_wmarks();
5675 } 5679 }
5676 5680
5677 /* 5681 /*
5678 * Trigger a memory pressure bump to reclaim some pages in order to be able to 5682 * Trigger a memory pressure bump to reclaim some pages in order to be able to
5679 * allocate 'count' pages in single page units. Does similar work to the 5683 * allocate 'count' pages in single page units. Does similar work to the
5680 * __alloc_pages_slowpath() function. 5684 * __alloc_pages_slowpath() function.
5681 */ 5685 */
5682 static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) 5686 static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5683 { 5687 {
5684 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 5688 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5685 struct zonelist *zonelist = node_zonelist(0, gfp_mask); 5689 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5686 int did_some_progress = 0; 5690 int did_some_progress = 0;
5687 int order = 1; 5691 int order = 1;
5688 5692
5689 /* 5693 /*
5690 * Increase the watermark levels to force kswapd to do its job 5694 * Increase the watermark levels to force kswapd to do its job
5691 * and stabilise at the new watermark level. 5695 * and stabilise at the new watermark level.
5692 */ 5696 */
5693 __update_cma_watermarks(zone, count); 5697 __update_cma_watermarks(zone, count);
5694 5698
5695 /* Obey watermarks as if the page was being allocated */ 5699 /* Obey watermarks as if the page was being allocated */
5696 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { 5700 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5697 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); 5701 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5698 5702
5699 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 5703 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5700 NULL); 5704 NULL);
5701 if (!did_some_progress) { 5705 if (!did_some_progress) {
5702 /* Exhausted what can be done so it's blamo time */ 5706 /* Exhausted what can be done so it's blamo time */
5703 out_of_memory(zonelist, gfp_mask, order, NULL, false); 5707 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5704 } 5708 }
5705 } 5709 }
5706 5710
5707 /* Restore original watermark levels. */ 5711 /* Restore original watermark levels. */
5708 __update_cma_watermarks(zone, -count); 5712 __update_cma_watermarks(zone, -count);
5709 5713
5710 return count; 5714 return count;
5711 } 5715 }
5712 5716
5713 /** 5717 /**
5714 * alloc_contig_range() -- tries to allocate given range of pages 5718 * alloc_contig_range() -- tries to allocate given range of pages
5715 * @start: start PFN to allocate 5719 * @start: start PFN to allocate
5716 * @end: one-past-the-last PFN to allocate 5720 * @end: one-past-the-last PFN to allocate
5717 * @migratetype: migratetype of the underlying pageblocks (either 5721 * @migratetype: migratetype of the underlying pageblocks (either
5718 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 5722 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
5719 * in range must have the same migratetype and it must 5723 * in range must have the same migratetype and it must
5720 * be either of the two. 5724 * be either of the two.
5721 * 5725 *
5722 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 5726 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
5723 * aligned, however it's the caller's responsibility to guarantee that 5727 * aligned, however it's the caller's responsibility to guarantee that
5724 * we are the only thread that changes migrate type of pageblocks the 5728 * we are the only thread that changes migrate type of pageblocks the
5725 * pages fall in. 5729 * pages fall in.
5726 * 5730 *
5727 * The PFN range must belong to a single zone. 5731 * The PFN range must belong to a single zone.
5728 * 5732 *
5729 * Returns zero on success or negative error code. On success all 5733 * Returns zero on success or negative error code. On success all
5730 * pages which PFN is in [start, end) are allocated for the caller and 5734 * pages which PFN is in [start, end) are allocated for the caller and
5731 * need to be freed with free_contig_range(). 5735 * need to be freed with free_contig_range().
5732 */ 5736 */
5733 int alloc_contig_range(unsigned long start, unsigned long end, 5737 int alloc_contig_range(unsigned long start, unsigned long end,
5734 unsigned migratetype) 5738 unsigned migratetype)
5735 { 5739 {
5736 struct zone *zone = page_zone(pfn_to_page(start)); 5740 struct zone *zone = page_zone(pfn_to_page(start));
5737 unsigned long outer_start, outer_end; 5741 unsigned long outer_start, outer_end;
5738 int ret = 0, order; 5742 int ret = 0, order;
5739 5743
5740 /* 5744 /*
5741 * What we do here is mark all pageblocks in the range as 5745 * What we do here is mark all pageblocks in the range as
5742 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5746 * MIGRATE_ISOLATE. Because pageblock and max order pages may
5743 * have different sizes, and due to the way the page allocator 5747 * have different sizes, and due to the way the page allocator
5744 * works, we align the range to the bigger of the two sizes so 5748 * works, we align the range to the bigger of the two sizes so
5745 * that the page allocator won't try to merge buddies from 5749 * that the page allocator won't try to merge buddies from
5746 * different pageblocks and change MIGRATE_ISOLATE to some 5750 * different pageblocks and change MIGRATE_ISOLATE to some
5747 * other migration type. 5751 * other migration type.
5748 * 5752 *
5749 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 5753 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
5750 * migrate the pages from an unaligned range (i.e. pages that 5754 * migrate the pages from an unaligned range (i.e. pages that
5751 * we are interested in). This will put all the pages in 5755 * we are interested in). This will put all the pages in
5752 * range back to the page allocator as MIGRATE_ISOLATE. 5756 * range back to the page allocator as MIGRATE_ISOLATE.
5753 * 5757 *
5754 * When this is done, we take the pages in range from the page 5758 * When this is done, we take the pages in range from the page
5755 * allocator, removing them from the buddy system. This way the 5759 * allocator, removing them from the buddy system. This way the
5756 * page allocator will never consider using them. 5760 * page allocator will never consider using them.
5757 * 5761 *
5758 * This lets us mark the pageblocks back as 5762 * This lets us mark the pageblocks back as
5759 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 5763 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
5760 * aligned range but not in the unaligned, original range are 5764 * aligned range but not in the unaligned, original range are
5761 * put back to page allocator so that buddy can use them. 5765 * put back to page allocator so that buddy can use them.
5762 */ 5766 */
5763 5767
5764 ret = start_isolate_page_range(pfn_max_align_down(start), 5768 ret = start_isolate_page_range(pfn_max_align_down(start),
5765 pfn_max_align_up(end), migratetype); 5769 pfn_max_align_up(end), migratetype);
5766 if (ret) 5770 if (ret)
5767 goto done; 5771 goto done;
5768 5772
5769 ret = __alloc_contig_migrate_range(start, end); 5773 ret = __alloc_contig_migrate_range(start, end);
5770 if (ret) 5774 if (ret)
5771 goto done; 5775 goto done;
5772 5776
5773 /* 5777 /*
5774 * Pages from [start, end) are within MAX_ORDER_NR_PAGES 5778 * Pages from [start, end) are within MAX_ORDER_NR_PAGES
5775 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 5779 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
5776 * more, all pages in [start, end) are free in the page allocator. 5780 * more, all pages in [start, end) are free in the page allocator.
5777 * What we are going to do is allocate all pages from 5781 * What we are going to do is allocate all pages from
5778 * [start, end) (that is, remove them from the page allocator). 5782 * [start, end) (that is, remove them from the page allocator).
5779 * 5783 *
5780 * The only problem is that pages at the beginning and at the 5784 * The only problem is that pages at the beginning and at the
5781 * end of the interesting range may not be aligned with pages that 5785 * end of the interesting range may not be aligned with pages that
5782 * the page allocator holds, i.e. they can be part of higher order 5786 * the page allocator holds, i.e. they can be part of higher order
5783 * pages. Because of this, we reserve the bigger range and 5787 * pages. Because of this, we reserve the bigger range and
5784 * once this is done free the pages we are not interested in. 5788 * once this is done free the pages we are not interested in.
5785 * 5789 *
5786 * We don't have to hold zone->lock here because the pages are 5790 * We don't have to hold zone->lock here because the pages are
5787 * isolated and thus won't get removed from the buddy allocator. 5791 * isolated and thus won't get removed from the buddy allocator.
5788 */ 5792 */
5789 5793
5790 lru_add_drain_all(); 5794 lru_add_drain_all();
5791 drain_all_pages(); 5795 drain_all_pages();
5792 5796
5793 order = 0; 5797 order = 0;
5794 outer_start = start; 5798 outer_start = start;
5795 while (!PageBuddy(pfn_to_page(outer_start))) { 5799 while (!PageBuddy(pfn_to_page(outer_start))) {
5796 if (++order >= MAX_ORDER) { 5800 if (++order >= MAX_ORDER) {
5797 ret = -EBUSY; 5801 ret = -EBUSY;
5798 goto done; 5802 goto done;
5799 } 5803 }
5800 outer_start &= ~0UL << order; 5804 outer_start &= ~0UL << order;
5801 } 5805 }
5802 5806
5803 /* Make sure the range is really isolated. */ 5807 /* Make sure the range is really isolated. */
5804 if (test_pages_isolated(outer_start, end)) { 5808 if (test_pages_isolated(outer_start, end)) {
5805 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5809 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5806 outer_start, end); 5810 outer_start, end);
5807 ret = -EBUSY; 5811 ret = -EBUSY;
5808 goto done; 5812 goto done;
5809 } 5813 }
5810 5814
5811 /* 5815 /*
5812 * Reclaim enough pages to make sure that contiguous allocation 5816 * Reclaim enough pages to make sure that contiguous allocation
5813 * will not starve the system. 5817 * will not starve the system.
5814 */ 5818 */
5815 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); 5819 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5816 5820
5817 /* Grab isolated pages from freelists. */ 5821 /* Grab isolated pages from freelists. */
5818 outer_end = isolate_freepages_range(outer_start, end); 5822 outer_end = isolate_freepages_range(outer_start, end);
5819 if (!outer_end) { 5823 if (!outer_end) {
5820 ret = -EBUSY; 5824 ret = -EBUSY;
5821 goto done; 5825 goto done;
5822 } 5826 }
5823 5827
5824 /* Free head and tail (if any) */ 5828 /* Free head and tail (if any) */
5825 if (start != outer_start) 5829 if (start != outer_start)
5826 free_contig_range(outer_start, start - outer_start); 5830 free_contig_range(outer_start, start - outer_start);
5827 if (end != outer_end) 5831 if (end != outer_end)
5828 free_contig_range(end, outer_end - end); 5832 free_contig_range(end, outer_end - end);
5829 5833
5830 done: 5834 done:
5831 undo_isolate_page_range(pfn_max_align_down(start), 5835 undo_isolate_page_range(pfn_max_align_down(start),
5832 pfn_max_align_up(end), migratetype); 5836 pfn_max_align_up(end), migratetype);
5833 return ret; 5837 return ret;
5834 } 5838 }
5835 5839
5836 void free_contig_range(unsigned long pfn, unsigned nr_pages) 5840 void free_contig_range(unsigned long pfn, unsigned nr_pages)
5837 { 5841 {
5838 for (; nr_pages--; ++pfn) 5842 for (; nr_pages--; ++pfn)
5839 __free_page(pfn_to_page(pfn)); 5843 __free_page(pfn_to_page(pfn));
5840 } 5844 }
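/*
 * Hypothetical usage sketch (illustration only; the example_* names are
 * placeholders, not kernel symbols): a CMA-style user grabs a contiguous
 * run of pages from an area whose pageblocks were reserved as MIGRATE_CMA
 * at boot, and later releases it with free_contig_range().
 */
static struct page *example_contig_alloc(unsigned long base_pfn,
					 unsigned long count)
{
	if (alloc_contig_range(base_pfn, base_pfn + count, MIGRATE_CMA))
		return NULL;
	return pfn_to_page(base_pfn);
}

static void example_contig_release(struct page *page, unsigned long count)
{
	free_contig_range(page_to_pfn(page), count);
}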
5841 #endif 5845 #endif
5842 5846
5843 #ifdef CONFIG_MEMORY_HOTPLUG 5847 #ifdef CONFIG_MEMORY_HOTPLUG
5844 static int __meminit __zone_pcp_update(void *data) 5848 static int __meminit __zone_pcp_update(void *data)
5845 { 5849 {
5846 struct zone *zone = data; 5850 struct zone *zone = data;
5847 int cpu; 5851 int cpu;
5848 unsigned long batch = zone_batchsize(zone), flags; 5852 unsigned long batch = zone_batchsize(zone), flags;
5849 5853
5850 for_each_possible_cpu(cpu) { 5854 for_each_possible_cpu(cpu) {
5851 struct per_cpu_pageset *pset; 5855 struct per_cpu_pageset *pset;
5852 struct per_cpu_pages *pcp; 5856 struct per_cpu_pages *pcp;
5853 5857
5854 pset = per_cpu_ptr(zone->pageset, cpu); 5858 pset = per_cpu_ptr(zone->pageset, cpu);
5855 pcp = &pset->pcp; 5859 pcp = &pset->pcp;
5856 5860
5857 local_irq_save(flags); 5861 local_irq_save(flags);
5858 if (pcp->count > 0) 5862 if (pcp->count > 0)
5859 free_pcppages_bulk(zone, pcp->count, pcp); 5863 free_pcppages_bulk(zone, pcp->count, pcp);
5860 setup_pageset(pset, batch); 5864 setup_pageset(pset, batch);
5861 local_irq_restore(flags); 5865 local_irq_restore(flags);
5862 } 5866 }
5863 return 0; 5867 return 0;
5864 } 5868 }
5865 5869
5866 void __meminit zone_pcp_update(struct zone *zone) 5870 void __meminit zone_pcp_update(struct zone *zone)
5867 { 5871 {
5868 stop_machine(__zone_pcp_update, zone, NULL); 5872 stop_machine(__zone_pcp_update, zone, NULL);
5869 } 5873 }
5870 #endif 5874 #endif
5871 5875
5872 #ifdef CONFIG_MEMORY_HOTREMOVE 5876 #ifdef CONFIG_MEMORY_HOTREMOVE
5873 void zone_pcp_reset(struct zone *zone) 5877 void zone_pcp_reset(struct zone *zone)
5874 { 5878 {
5875 unsigned long flags; 5879 unsigned long flags;
5876 5880
5877 /* avoid races with drain_pages() */ 5881 /* avoid races with drain_pages() */
5878 local_irq_save(flags); 5882 local_irq_save(flags);
5879 if (zone->pageset != &boot_pageset) { 5883 if (zone->pageset != &boot_pageset) {
5880 free_percpu(zone->pageset); 5884 free_percpu(zone->pageset);
5881 zone->pageset = &boot_pageset; 5885 zone->pageset = &boot_pageset;
5882 } 5886 }
5883 local_irq_restore(flags); 5887 local_irq_restore(flags);
5884 } 5888 }
5885 5889
5886 /* 5890 /*
5887 * All pages in the range must be isolated before calling this. 5891 * All pages in the range must be isolated before calling this.
5888 */ 5892 */
5889 void 5893 void
5890 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 5894 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5891 { 5895 {
5892 struct page *page; 5896 struct page *page;
5893 struct zone *zone; 5897 struct zone *zone;
5894 int order, i; 5898 int order, i;
5895 unsigned long pfn; 5899 unsigned long pfn;
5896 unsigned long flags; 5900 unsigned long flags;
5897 /* find the first valid pfn */ 5901 /* find the first valid pfn */
5898 for (pfn = start_pfn; pfn < end_pfn; pfn++) 5902 for (pfn = start_pfn; pfn < end_pfn; pfn++)
5899 if (pfn_valid(pfn)) 5903 if (pfn_valid(pfn))
5900 break; 5904 break;
5901 if (pfn == end_pfn) 5905 if (pfn == end_pfn)
5902 return; 5906 return;
5903 zone = page_zone(pfn_to_page(pfn)); 5907 zone = page_zone(pfn_to_page(pfn));
5904 spin_lock_irqsave(&zone->lock, flags); 5908 spin_lock_irqsave(&zone->lock, flags);
5905 pfn = start_pfn; 5909 pfn = start_pfn;
5906 while (pfn < end_pfn) { 5910 while (pfn < end_pfn) {
5907 if (!pfn_valid(pfn)) { 5911 if (!pfn_valid(pfn)) {
5908 pfn++; 5912 pfn++;
5909 continue; 5913 continue;
5910 } 5914 }
5911 page = pfn_to_page(pfn); 5915 page = pfn_to_page(pfn);
5912 BUG_ON(page_count(page)); 5916 BUG_ON(page_count(page));
5913 BUG_ON(!PageBuddy(page)); 5917 BUG_ON(!PageBuddy(page));
5914 order = page_order(page); 5918 order = page_order(page);
5915 #ifdef CONFIG_DEBUG_VM 5919 #ifdef CONFIG_DEBUG_VM
5916 printk(KERN_INFO "remove from free list %lx %d %lx\n", 5920 printk(KERN_INFO "remove from free list %lx %d %lx\n",
5917 pfn, 1 << order, end_pfn); 5921 pfn, 1 << order, end_pfn);
5918 #endif 5922 #endif
5919 list_del(&page->lru); 5923 list_del(&page->lru);
5920 rmv_page_order(page); 5924 rmv_page_order(page);
5921 zone->free_area[order].nr_free--; 5925 zone->free_area[order].nr_free--;
5922 __mod_zone_page_state(zone, NR_FREE_PAGES, 5926 __mod_zone_page_state(zone, NR_FREE_PAGES,
5923 - (1UL << order)); 5927 - (1UL << order));
5924 for (i = 0; i < (1 << order); i++) 5928 for (i = 0; i < (1 << order); i++)
5925 SetPageReserved((page+i)); 5929 SetPageReserved((page+i));
5926 pfn += (1 << order); 5930 pfn += (1 << order);
5927 } 5931 }
5928 spin_unlock_irqrestore(&zone->lock, flags); 5932 spin_unlock_irqrestore(&zone->lock, flags);
5929 } 5933 }
5930 #endif 5934 #endif
5931 5935
5932 #ifdef CONFIG_MEMORY_FAILURE 5936 #ifdef CONFIG_MEMORY_FAILURE
5933 bool is_free_buddy_page(struct page *page) 5937 bool is_free_buddy_page(struct page *page)
5934 { 5938 {
5935 struct zone *zone = page_zone(page); 5939 struct zone *zone = page_zone(page);
5936 unsigned long pfn = page_to_pfn(page); 5940 unsigned long pfn = page_to_pfn(page);
5937 unsigned long flags; 5941 unsigned long flags;
5938 int order; 5942 int order;
5939 5943
5940 spin_lock_irqsave(&zone->lock, flags); 5944 spin_lock_irqsave(&zone->lock, flags);
5941 for (order = 0; order < MAX_ORDER; order++) { 5945 for (order = 0; order < MAX_ORDER; order++) {
5942 struct page *page_head = page - (pfn & ((1 << order) - 1)); 5946 struct page *page_head = page - (pfn & ((1 << order) - 1));
5943 5947
5944 if (PageBuddy(page_head) && page_order(page_head) >= order) 5948 if (PageBuddy(page_head) && page_order(page_head) >= order)
5945 break; 5949 break;
5946 } 5950 }
5947 spin_unlock_irqrestore(&zone->lock, flags); 5951 spin_unlock_irqrestore(&zone->lock, flags);
5948 5952
5949 return order < MAX_ORDER; 5953 return order < MAX_ORDER;
5950 } 5954 }
5951 #endif 5955 #endif
5952 5956
5953 static const struct trace_print_flags pageflag_names[] = { 5957 static const struct trace_print_flags pageflag_names[] = {
5954 {1UL << PG_locked, "locked" }, 5958 {1UL << PG_locked, "locked" },
5955 {1UL << PG_error, "error" }, 5959 {1UL << PG_error, "error" },
5956 {1UL << PG_referenced, "referenced" }, 5960 {1UL << PG_referenced, "referenced" },
5957 {1UL << PG_uptodate, "uptodate" }, 5961 {1UL << PG_uptodate, "uptodate" },
5958 {1UL << PG_dirty, "dirty" }, 5962 {1UL << PG_dirty, "dirty" },
5959 {1UL << PG_lru, "lru" }, 5963 {1UL << PG_lru, "lru" },
5960 {1UL << PG_active, "active" }, 5964 {1UL << PG_active, "active" },
5961 {1UL << PG_slab, "slab" }, 5965 {1UL << PG_slab, "slab" },
5962 {1UL << PG_owner_priv_1, "owner_priv_1" }, 5966 {1UL << PG_owner_priv_1, "owner_priv_1" },
5963 {1UL << PG_arch_1, "arch_1" }, 5967 {1UL << PG_arch_1, "arch_1" },
5964 {1UL << PG_reserved, "reserved" }, 5968 {1UL << PG_reserved, "reserved" },
5965 {1UL << PG_private, "private" }, 5969 {1UL << PG_private, "private" },
5966 {1UL << PG_private_2, "private_2" }, 5970 {1UL << PG_private_2, "private_2" },
5967 {1UL << PG_writeback, "writeback" }, 5971 {1UL << PG_writeback, "writeback" },
5968 #ifdef CONFIG_PAGEFLAGS_EXTENDED 5972 #ifdef CONFIG_PAGEFLAGS_EXTENDED
5969 {1UL << PG_head, "head" }, 5973 {1UL << PG_head, "head" },
5970 {1UL << PG_tail, "tail" }, 5974 {1UL << PG_tail, "tail" },
5971 #else 5975 #else
5972 {1UL << PG_compound, "compound" }, 5976 {1UL << PG_compound, "compound" },
5973 #endif 5977 #endif
5974 {1UL << PG_swapcache, "swapcache" }, 5978 {1UL << PG_swapcache, "swapcache" },
5975 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5979 {1UL << PG_mappedtodisk, "mappedtodisk" },
5976 {1UL << PG_reclaim, "reclaim" }, 5980 {1UL << PG_reclaim, "reclaim" },
5977 {1UL << PG_swapbacked, "swapbacked" }, 5981 {1UL << PG_swapbacked, "swapbacked" },
5978 {1UL << PG_unevictable, "unevictable" }, 5982 {1UL << PG_unevictable, "unevictable" },
5979 #ifdef CONFIG_MMU 5983 #ifdef CONFIG_MMU
5980 {1UL << PG_mlocked, "mlocked" }, 5984 {1UL << PG_mlocked, "mlocked" },
5981 #endif 5985 #endif
5982 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 5986 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
5983 {1UL << PG_uncached, "uncached" }, 5987 {1UL << PG_uncached, "uncached" },
5984 #endif 5988 #endif
5985 #ifdef CONFIG_MEMORY_FAILURE 5989 #ifdef CONFIG_MEMORY_FAILURE
5986 {1UL << PG_hwpoison, "hwpoison" }, 5990 {1UL << PG_hwpoison, "hwpoison" },
5987 #endif 5991 #endif
5988 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5992 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5989 {1UL << PG_compound_lock, "compound_lock" }, 5993 {1UL << PG_compound_lock, "compound_lock" },
5990 #endif 5994 #endif
5991 }; 5995 };
5992 5996
5993 static void dump_page_flags(unsigned long flags) 5997 static void dump_page_flags(unsigned long flags)
5994 { 5998 {
5995 const char *delim = ""; 5999 const char *delim = "";
5996 unsigned long mask; 6000 unsigned long mask;
5997 int i; 6001 int i;
5998 6002
5999 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6003 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6000 6004
6001 printk(KERN_ALERT "page flags: %#lx(", flags); 6005 printk(KERN_ALERT "page flags: %#lx(", flags);
6002 6006
6003 /* remove zone id */ 6007 /* remove zone id */
6004 flags &= (1UL << NR_PAGEFLAGS) - 1; 6008 flags &= (1UL << NR_PAGEFLAGS) - 1;
6005 6009
6006 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6010 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6007 6011
6008 mask = pageflag_names[i].mask; 6012 mask = pageflag_names[i].mask;
6009 if ((flags & mask) != mask) 6013 if ((flags & mask) != mask)
6010 continue; 6014 continue;
6011 6015
6012 flags &= ~mask; 6016 flags &= ~mask;
6013 printk("%s%s", delim, pageflag_names[i].name); 6017 printk("%s%s", delim, pageflag_names[i].name);
6014 delim = "|"; 6018 delim = "|";
6015 } 6019 }
6016 6020
6017 /* check for left over flags */ 6021 /* check for left over flags */
6018 if (flags) 6022 if (flags)
6019 printk("%s%#lx", delim, flags); 6023 printk("%s%#lx", delim, flags);
6020 6024
6021 printk(")\n"); 6025 printk(")\n");
6022 } 6026 }
6023 6027
6024 void dump_page(struct page *page) 6028 void dump_page(struct page *page)
6025 { 6029 {
6026 printk(KERN_ALERT 6030 printk(KERN_ALERT
6027 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6031 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6028 page, atomic_read(&page->_count), page_mapcount(page), 6032 page, atomic_read(&page->_count), page_mapcount(page),
6029 page->mapping, page->index); 6033 page->mapping, page->index);
6030 dump_page_flags(page->flags); 6034 dump_page_flags(page->flags);
6031 mem_cgroup_print_bad_page(page); 6035 mem_cgroup_print_bad_page(page);
6032 } 6036 }
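/*
 * Illustrative output sketch (values are invented for the example) in the
 * shape produced by the two printk formats above, here for an idle
 * PG_reserved boot page:
 *
 *   page:ffffea0000348000 count:1 mapcount:0 mapping:(null) index:0x0
 *   page flags: 0x400(reserved)
 *
 * Any flag bits without an entry in pageflag_names[] would be appended in
 * hex after the named flags.
 */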
6033 6037