Commit 907aed48f65efeecf91575397e3d79335d93a466
Committed by Linus Torvalds
1 parent b37f1dd0f5
Exists in smarc-l5.0.0_1.0.0-ga and in 5 other branches
mm: allow PF_MEMALLOC from softirq context
This is needed to allow network softirq packet processing to make use of PF_MEMALLOC.

Currently softirq context cannot use PF_MEMALLOC due to it not being associated with a task, and therefore not having task flags to fiddle with; thus the gfp to alloc flags mapping ignores the task flags when in interrupt (hard or soft) context.

Allowing softirqs to make use of PF_MEMALLOC therefore requires some trickery. This patch borrows the task flags from whatever process happens to be preempted by the softirq. It then modifies the gfp to alloc flags mapping to not exclude task flags in softirq context, and modifies the softirq code to save, clear and restore the PF_MEMALLOC flag. The save and clear ensure the preempted task's PF_MEMALLOC flag doesn't leak into the softirq. The restore ensures a softirq's PF_MEMALLOC flag cannot leak back into the preempted process.

This should be safe for the following reasons:

Softirqs can run on multiple CPUs, but the same task should not be executing the same softirq code. Nor should the softirq handler be preempted by any other softirq handler, so the flags should not leak to an unrelated softirq.

Softirqs re-enable hardware interrupts in __do_softirq(), so they can be preempted by hardware interrupts and PF_MEMALLOC is inherited by the hard IRQ. However, this is similar to a process in reclaim being preempted by a hardirq. While PF_MEMALLOC is set, gfp_to_alloc_flags() distinguishes between hard and soft IRQs and avoids giving a hardirq the ALLOC_NO_WATERMARKS flag.

If the softirq is deferred to ksoftirqd then its flags may be used instead of a normal task's, but as the softirq cannot be preempted, the PF_MEMALLOC flag does not leak to other code by accident.

[davem@davemloft.net: Document why PF_MEMALLOC is safe]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
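For readers skimming the diff below, the save/clear/restore trick described above amounts to a small pattern in __do_softirq(). The following is a minimal sketch of that pattern, not the literal hunks of this commit; the helper name and exact call site are assumptions made for illustration.

	/*
	 * Sketch of the save/clear/restore pattern (illustrative only).
	 * Restore only the bits named in 'flags' to the values saved in
	 * 'orig_flags', leaving all other task flags untouched.
	 */
	static inline void tsk_restore_flags(struct task_struct *task,
					     unsigned long orig_flags,
					     unsigned long flags)
	{
		task->flags &= ~flags;
		task->flags |= orig_flags & flags;
	}

	asmlinkage void __do_softirq(void)
	{
		/* Borrow the preempted task's flags, but hide its PF_MEMALLOC
		 * so it cannot leak into the softirq handlers. */
		unsigned long old_flags = current->flags;

		current->flags &= ~PF_MEMALLOC;

		/* ... run the pending softirq handlers here; they may set
		 * PF_MEMALLOC themselves, e.g. network RX for a socket that
		 * is backing swap ... */

		/* Put PF_MEMALLOC back exactly as the preempted task had it,
		 * so a softirq's PF_MEMALLOC cannot leak into the task. */
		tsk_restore_flags(current, old_flags, PF_MEMALLOC);
	}

Combined with the gfp to alloc flags change, an allocation made while servicing a softirq can then see PF_MEMALLOC on the borrowed task and be granted ALLOC_NO_WATERMARKS, while a hard IRQ that preempts the softirq is still refused it.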
Showing 3 changed files with 21 additions and 1 deletion
Inline Diff
include/linux/sched.h
1 | #ifndef _LINUX_SCHED_H | 1 | #ifndef _LINUX_SCHED_H |
2 | #define _LINUX_SCHED_H | 2 | #define _LINUX_SCHED_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * cloning flags: | 5 | * cloning flags: |
6 | */ | 6 | */ |
7 | #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ | 7 | #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ |
8 | #define CLONE_VM 0x00000100 /* set if VM shared between processes */ | 8 | #define CLONE_VM 0x00000100 /* set if VM shared between processes */ |
9 | #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ | 9 | #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ |
10 | #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ | 10 | #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ |
11 | #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ | 11 | #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ |
12 | #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ | 12 | #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ |
13 | #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ | 13 | #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ |
14 | #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ | 14 | #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ |
15 | #define CLONE_THREAD 0x00010000 /* Same thread group? */ | 15 | #define CLONE_THREAD 0x00010000 /* Same thread group? */ |
16 | #define CLONE_NEWNS 0x00020000 /* New namespace group? */ | 16 | #define CLONE_NEWNS 0x00020000 /* New namespace group? */ |
17 | #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ | 17 | #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ |
18 | #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ | 18 | #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ |
19 | #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ | 19 | #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ |
20 | #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ | 20 | #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ |
21 | #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ | 21 | #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ |
22 | #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ | 22 | #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ |
23 | #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ | 23 | #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ |
24 | /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) | 24 | /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) |
25 | and is now available for re-use. */ | 25 | and is now available for re-use. */ |
26 | #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ | 26 | #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ |
27 | #define CLONE_NEWIPC 0x08000000 /* New ipcs */ | 27 | #define CLONE_NEWIPC 0x08000000 /* New ipcs */ |
28 | #define CLONE_NEWUSER 0x10000000 /* New user namespace */ | 28 | #define CLONE_NEWUSER 0x10000000 /* New user namespace */ |
29 | #define CLONE_NEWPID 0x20000000 /* New pid namespace */ | 29 | #define CLONE_NEWPID 0x20000000 /* New pid namespace */ |
30 | #define CLONE_NEWNET 0x40000000 /* New network namespace */ | 30 | #define CLONE_NEWNET 0x40000000 /* New network namespace */ |
31 | #define CLONE_IO 0x80000000 /* Clone io context */ | 31 | #define CLONE_IO 0x80000000 /* Clone io context */ |
32 | 32 | ||
33 | /* | 33 | /* |
34 | * Scheduling policies | 34 | * Scheduling policies |
35 | */ | 35 | */ |
36 | #define SCHED_NORMAL 0 | 36 | #define SCHED_NORMAL 0 |
37 | #define SCHED_FIFO 1 | 37 | #define SCHED_FIFO 1 |
38 | #define SCHED_RR 2 | 38 | #define SCHED_RR 2 |
39 | #define SCHED_BATCH 3 | 39 | #define SCHED_BATCH 3 |
40 | /* SCHED_ISO: reserved but not implemented yet */ | 40 | /* SCHED_ISO: reserved but not implemented yet */ |
41 | #define SCHED_IDLE 5 | 41 | #define SCHED_IDLE 5 |
42 | /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ | 42 | /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ |
43 | #define SCHED_RESET_ON_FORK 0x40000000 | 43 | #define SCHED_RESET_ON_FORK 0x40000000 |
44 | 44 | ||
45 | #ifdef __KERNEL__ | 45 | #ifdef __KERNEL__ |
46 | 46 | ||
47 | struct sched_param { | 47 | struct sched_param { |
48 | int sched_priority; | 48 | int sched_priority; |
49 | }; | 49 | }; |
50 | 50 | ||
51 | #include <asm/param.h> /* for HZ */ | 51 | #include <asm/param.h> /* for HZ */ |
52 | 52 | ||
53 | #include <linux/capability.h> | 53 | #include <linux/capability.h> |
54 | #include <linux/threads.h> | 54 | #include <linux/threads.h> |
55 | #include <linux/kernel.h> | 55 | #include <linux/kernel.h> |
56 | #include <linux/types.h> | 56 | #include <linux/types.h> |
57 | #include <linux/timex.h> | 57 | #include <linux/timex.h> |
58 | #include <linux/jiffies.h> | 58 | #include <linux/jiffies.h> |
59 | #include <linux/rbtree.h> | 59 | #include <linux/rbtree.h> |
60 | #include <linux/thread_info.h> | 60 | #include <linux/thread_info.h> |
61 | #include <linux/cpumask.h> | 61 | #include <linux/cpumask.h> |
62 | #include <linux/errno.h> | 62 | #include <linux/errno.h> |
63 | #include <linux/nodemask.h> | 63 | #include <linux/nodemask.h> |
64 | #include <linux/mm_types.h> | 64 | #include <linux/mm_types.h> |
65 | 65 | ||
66 | #include <asm/page.h> | 66 | #include <asm/page.h> |
67 | #include <asm/ptrace.h> | 67 | #include <asm/ptrace.h> |
68 | #include <asm/cputime.h> | 68 | #include <asm/cputime.h> |
69 | 69 | ||
70 | #include <linux/smp.h> | 70 | #include <linux/smp.h> |
71 | #include <linux/sem.h> | 71 | #include <linux/sem.h> |
72 | #include <linux/signal.h> | 72 | #include <linux/signal.h> |
73 | #include <linux/compiler.h> | 73 | #include <linux/compiler.h> |
74 | #include <linux/completion.h> | 74 | #include <linux/completion.h> |
75 | #include <linux/pid.h> | 75 | #include <linux/pid.h> |
76 | #include <linux/percpu.h> | 76 | #include <linux/percpu.h> |
77 | #include <linux/topology.h> | 77 | #include <linux/topology.h> |
78 | #include <linux/proportions.h> | 78 | #include <linux/proportions.h> |
79 | #include <linux/seccomp.h> | 79 | #include <linux/seccomp.h> |
80 | #include <linux/rcupdate.h> | 80 | #include <linux/rcupdate.h> |
81 | #include <linux/rculist.h> | 81 | #include <linux/rculist.h> |
82 | #include <linux/rtmutex.h> | 82 | #include <linux/rtmutex.h> |
83 | 83 | ||
84 | #include <linux/time.h> | 84 | #include <linux/time.h> |
85 | #include <linux/param.h> | 85 | #include <linux/param.h> |
86 | #include <linux/resource.h> | 86 | #include <linux/resource.h> |
87 | #include <linux/timer.h> | 87 | #include <linux/timer.h> |
88 | #include <linux/hrtimer.h> | 88 | #include <linux/hrtimer.h> |
89 | #include <linux/task_io_accounting.h> | 89 | #include <linux/task_io_accounting.h> |
90 | #include <linux/latencytop.h> | 90 | #include <linux/latencytop.h> |
91 | #include <linux/cred.h> | 91 | #include <linux/cred.h> |
92 | #include <linux/llist.h> | 92 | #include <linux/llist.h> |
93 | #include <linux/uidgid.h> | 93 | #include <linux/uidgid.h> |
94 | 94 | ||
95 | #include <asm/processor.h> | 95 | #include <asm/processor.h> |
96 | 96 | ||
97 | struct exec_domain; | 97 | struct exec_domain; |
98 | struct futex_pi_state; | 98 | struct futex_pi_state; |
99 | struct robust_list_head; | 99 | struct robust_list_head; |
100 | struct bio_list; | 100 | struct bio_list; |
101 | struct fs_struct; | 101 | struct fs_struct; |
102 | struct perf_event_context; | 102 | struct perf_event_context; |
103 | struct blk_plug; | 103 | struct blk_plug; |
104 | 104 | ||
105 | /* | 105 | /* |
106 | * List of flags we want to share for kernel threads, | 106 | * List of flags we want to share for kernel threads, |
107 | * if only because they are not used by them anyway. | 107 | * if only because they are not used by them anyway. |
108 | */ | 108 | */ |
109 | #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) | 109 | #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) |
110 | 110 | ||
111 | /* | 111 | /* |
112 | * These are the constant used to fake the fixed-point load-average | 112 | * These are the constant used to fake the fixed-point load-average |
113 | * counting. Some notes: | 113 | * counting. Some notes: |
114 | * - 11 bit fractions expand to 22 bits by the multiplies: this gives | 114 | * - 11 bit fractions expand to 22 bits by the multiplies: this gives |
115 | * a load-average precision of 10 bits integer + 11 bits fractional | 115 | * a load-average precision of 10 bits integer + 11 bits fractional |
116 | * - if you want to count load-averages more often, you need more | 116 | * - if you want to count load-averages more often, you need more |
117 | * precision, or rounding will get you. With 2-second counting freq, | 117 | * precision, or rounding will get you. With 2-second counting freq, |
118 | * the EXP_n values would be 1981, 2034 and 2043 if still using only | 118 | * the EXP_n values would be 1981, 2034 and 2043 if still using only |
119 | * 11 bit fractions. | 119 | * 11 bit fractions. |
120 | */ | 120 | */ |
121 | extern unsigned long avenrun[]; /* Load averages */ | 121 | extern unsigned long avenrun[]; /* Load averages */ |
122 | extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); | 122 | extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); |
123 | 123 | ||
124 | #define FSHIFT 11 /* nr of bits of precision */ | 124 | #define FSHIFT 11 /* nr of bits of precision */ |
125 | #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ | 125 | #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ |
126 | #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ | 126 | #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ |
127 | #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ | 127 | #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ |
128 | #define EXP_5 2014 /* 1/exp(5sec/5min) */ | 128 | #define EXP_5 2014 /* 1/exp(5sec/5min) */ |
129 | #define EXP_15 2037 /* 1/exp(5sec/15min) */ | 129 | #define EXP_15 2037 /* 1/exp(5sec/15min) */ |
130 | 130 | ||
131 | #define CALC_LOAD(load,exp,n) \ | 131 | #define CALC_LOAD(load,exp,n) \ |
132 | load *= exp; \ | 132 | load *= exp; \ |
133 | load += n*(FIXED_1-exp); \ | 133 | load += n*(FIXED_1-exp); \ |
134 | load >>= FSHIFT; | 134 | load >>= FSHIFT; |
135 | 135 | ||
136 | extern unsigned long total_forks; | 136 | extern unsigned long total_forks; |
137 | extern int nr_threads; | 137 | extern int nr_threads; |
138 | DECLARE_PER_CPU(unsigned long, process_counts); | 138 | DECLARE_PER_CPU(unsigned long, process_counts); |
139 | extern int nr_processes(void); | 139 | extern int nr_processes(void); |
140 | extern unsigned long nr_running(void); | 140 | extern unsigned long nr_running(void); |
141 | extern unsigned long nr_uninterruptible(void); | 141 | extern unsigned long nr_uninterruptible(void); |
142 | extern unsigned long nr_iowait(void); | 142 | extern unsigned long nr_iowait(void); |
143 | extern unsigned long nr_iowait_cpu(int cpu); | 143 | extern unsigned long nr_iowait_cpu(int cpu); |
144 | extern unsigned long this_cpu_load(void); | 144 | extern unsigned long this_cpu_load(void); |
145 | 145 | ||
146 | 146 | ||
147 | extern void calc_global_load(unsigned long ticks); | 147 | extern void calc_global_load(unsigned long ticks); |
148 | extern void update_cpu_load_nohz(void); | 148 | extern void update_cpu_load_nohz(void); |
149 | 149 | ||
150 | extern unsigned long get_parent_ip(unsigned long addr); | 150 | extern unsigned long get_parent_ip(unsigned long addr); |
151 | 151 | ||
152 | struct seq_file; | 152 | struct seq_file; |
153 | struct cfs_rq; | 153 | struct cfs_rq; |
154 | struct task_group; | 154 | struct task_group; |
155 | #ifdef CONFIG_SCHED_DEBUG | 155 | #ifdef CONFIG_SCHED_DEBUG |
156 | extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); | 156 | extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); |
157 | extern void proc_sched_set_task(struct task_struct *p); | 157 | extern void proc_sched_set_task(struct task_struct *p); |
158 | extern void | 158 | extern void |
159 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); | 159 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); |
160 | #else | 160 | #else |
161 | static inline void | 161 | static inline void |
162 | proc_sched_show_task(struct task_struct *p, struct seq_file *m) | 162 | proc_sched_show_task(struct task_struct *p, struct seq_file *m) |
163 | { | 163 | { |
164 | } | 164 | } |
165 | static inline void proc_sched_set_task(struct task_struct *p) | 165 | static inline void proc_sched_set_task(struct task_struct *p) |
166 | { | 166 | { |
167 | } | 167 | } |
168 | static inline void | 168 | static inline void |
169 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 169 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
170 | { | 170 | { |
171 | } | 171 | } |
172 | #endif | 172 | #endif |
173 | 173 | ||
174 | /* | 174 | /* |
175 | * Task state bitmask. NOTE! These bits are also | 175 | * Task state bitmask. NOTE! These bits are also |
176 | * encoded in fs/proc/array.c: get_task_state(). | 176 | * encoded in fs/proc/array.c: get_task_state(). |
177 | * | 177 | * |
178 | * We have two separate sets of flags: task->state | 178 | * We have two separate sets of flags: task->state |
179 | * is about runnability, while task->exit_state are | 179 | * is about runnability, while task->exit_state are |
180 | * about the task exiting. Confusing, but this way | 180 | * about the task exiting. Confusing, but this way |
181 | * modifying one set can't modify the other one by | 181 | * modifying one set can't modify the other one by |
182 | * mistake. | 182 | * mistake. |
183 | */ | 183 | */ |
184 | #define TASK_RUNNING 0 | 184 | #define TASK_RUNNING 0 |
185 | #define TASK_INTERRUPTIBLE 1 | 185 | #define TASK_INTERRUPTIBLE 1 |
186 | #define TASK_UNINTERRUPTIBLE 2 | 186 | #define TASK_UNINTERRUPTIBLE 2 |
187 | #define __TASK_STOPPED 4 | 187 | #define __TASK_STOPPED 4 |
188 | #define __TASK_TRACED 8 | 188 | #define __TASK_TRACED 8 |
189 | /* in tsk->exit_state */ | 189 | /* in tsk->exit_state */ |
190 | #define EXIT_ZOMBIE 16 | 190 | #define EXIT_ZOMBIE 16 |
191 | #define EXIT_DEAD 32 | 191 | #define EXIT_DEAD 32 |
192 | /* in tsk->state again */ | 192 | /* in tsk->state again */ |
193 | #define TASK_DEAD 64 | 193 | #define TASK_DEAD 64 |
194 | #define TASK_WAKEKILL 128 | 194 | #define TASK_WAKEKILL 128 |
195 | #define TASK_WAKING 256 | 195 | #define TASK_WAKING 256 |
196 | #define TASK_STATE_MAX 512 | 196 | #define TASK_STATE_MAX 512 |
197 | 197 | ||
198 | #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" | 198 | #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" |
199 | 199 | ||
200 | extern char ___assert_task_state[1 - 2*!!( | 200 | extern char ___assert_task_state[1 - 2*!!( |
201 | sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; | 201 | sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; |
202 | 202 | ||
203 | /* Convenience macros for the sake of set_task_state */ | 203 | /* Convenience macros for the sake of set_task_state */ |
204 | #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) | 204 | #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) |
205 | #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) | 205 | #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) |
206 | #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) | 206 | #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) |
207 | 207 | ||
208 | /* Convenience macros for the sake of wake_up */ | 208 | /* Convenience macros for the sake of wake_up */ |
209 | #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) | 209 | #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) |
210 | #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) | 210 | #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) |
211 | 211 | ||
212 | /* get_task_state() */ | 212 | /* get_task_state() */ |
213 | #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ | 213 | #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ |
214 | TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ | 214 | TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ |
215 | __TASK_TRACED) | 215 | __TASK_TRACED) |
216 | 216 | ||
217 | #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) | 217 | #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) |
218 | #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) | 218 | #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) |
219 | #define task_is_dead(task) ((task)->exit_state != 0) | 219 | #define task_is_dead(task) ((task)->exit_state != 0) |
220 | #define task_is_stopped_or_traced(task) \ | 220 | #define task_is_stopped_or_traced(task) \ |
221 | ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) | 221 | ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) |
222 | #define task_contributes_to_load(task) \ | 222 | #define task_contributes_to_load(task) \ |
223 | ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ | 223 | ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ |
224 | (task->flags & PF_FROZEN) == 0) | 224 | (task->flags & PF_FROZEN) == 0) |
225 | 225 | ||
226 | #define __set_task_state(tsk, state_value) \ | 226 | #define __set_task_state(tsk, state_value) \ |
227 | do { (tsk)->state = (state_value); } while (0) | 227 | do { (tsk)->state = (state_value); } while (0) |
228 | #define set_task_state(tsk, state_value) \ | 228 | #define set_task_state(tsk, state_value) \ |
229 | set_mb((tsk)->state, (state_value)) | 229 | set_mb((tsk)->state, (state_value)) |
230 | 230 | ||
231 | /* | 231 | /* |
232 | * set_current_state() includes a barrier so that the write of current->state | 232 | * set_current_state() includes a barrier so that the write of current->state |
233 | * is correctly serialised wrt the caller's subsequent test of whether to | 233 | * is correctly serialised wrt the caller's subsequent test of whether to |
234 | * actually sleep: | 234 | * actually sleep: |
235 | * | 235 | * |
236 | * set_current_state(TASK_UNINTERRUPTIBLE); | 236 | * set_current_state(TASK_UNINTERRUPTIBLE); |
237 | * if (do_i_need_to_sleep()) | 237 | * if (do_i_need_to_sleep()) |
238 | * schedule(); | 238 | * schedule(); |
239 | * | 239 | * |
240 | * If the caller does not need such serialisation then use __set_current_state() | 240 | * If the caller does not need such serialisation then use __set_current_state() |
241 | */ | 241 | */ |
242 | #define __set_current_state(state_value) \ | 242 | #define __set_current_state(state_value) \ |
243 | do { current->state = (state_value); } while (0) | 243 | do { current->state = (state_value); } while (0) |
244 | #define set_current_state(state_value) \ | 244 | #define set_current_state(state_value) \ |
245 | set_mb(current->state, (state_value)) | 245 | set_mb(current->state, (state_value)) |
246 | 246 | ||
247 | /* Task command name length */ | 247 | /* Task command name length */ |
248 | #define TASK_COMM_LEN 16 | 248 | #define TASK_COMM_LEN 16 |
249 | 249 | ||
250 | #include <linux/spinlock.h> | 250 | #include <linux/spinlock.h> |
251 | 251 | ||
252 | /* | 252 | /* |
253 | * This serializes "schedule()" and also protects | 253 | * This serializes "schedule()" and also protects |
254 | * the run-queue from deletions/modifications (but | 254 | * the run-queue from deletions/modifications (but |
255 | * _adding_ to the beginning of the run-queue has | 255 | * _adding_ to the beginning of the run-queue has |
256 | * a separate lock). | 256 | * a separate lock). |
257 | */ | 257 | */ |
258 | extern rwlock_t tasklist_lock; | 258 | extern rwlock_t tasklist_lock; |
259 | extern spinlock_t mmlist_lock; | 259 | extern spinlock_t mmlist_lock; |
260 | 260 | ||
261 | struct task_struct; | 261 | struct task_struct; |
262 | 262 | ||
263 | #ifdef CONFIG_PROVE_RCU | 263 | #ifdef CONFIG_PROVE_RCU |
264 | extern int lockdep_tasklist_lock_is_held(void); | 264 | extern int lockdep_tasklist_lock_is_held(void); |
265 | #endif /* #ifdef CONFIG_PROVE_RCU */ | 265 | #endif /* #ifdef CONFIG_PROVE_RCU */ |
266 | 266 | ||
267 | extern void sched_init(void); | 267 | extern void sched_init(void); |
268 | extern void sched_init_smp(void); | 268 | extern void sched_init_smp(void); |
269 | extern asmlinkage void schedule_tail(struct task_struct *prev); | 269 | extern asmlinkage void schedule_tail(struct task_struct *prev); |
270 | extern void init_idle(struct task_struct *idle, int cpu); | 270 | extern void init_idle(struct task_struct *idle, int cpu); |
271 | extern void init_idle_bootup_task(struct task_struct *idle); | 271 | extern void init_idle_bootup_task(struct task_struct *idle); |
272 | 272 | ||
273 | extern int runqueue_is_locked(int cpu); | 273 | extern int runqueue_is_locked(int cpu); |
274 | 274 | ||
275 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) | 275 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) |
276 | extern void select_nohz_load_balancer(int stop_tick); | 276 | extern void select_nohz_load_balancer(int stop_tick); |
277 | extern void set_cpu_sd_state_idle(void); | 277 | extern void set_cpu_sd_state_idle(void); |
278 | extern int get_nohz_timer_target(void); | 278 | extern int get_nohz_timer_target(void); |
279 | #else | 279 | #else |
280 | static inline void select_nohz_load_balancer(int stop_tick) { } | 280 | static inline void select_nohz_load_balancer(int stop_tick) { } |
281 | static inline void set_cpu_sd_state_idle(void) { } | 281 | static inline void set_cpu_sd_state_idle(void) { } |
282 | #endif | 282 | #endif |
283 | 283 | ||
284 | /* | 284 | /* |
285 | * Only dump TASK_* tasks. (0 for all tasks) | 285 | * Only dump TASK_* tasks. (0 for all tasks) |
286 | */ | 286 | */ |
287 | extern void show_state_filter(unsigned long state_filter); | 287 | extern void show_state_filter(unsigned long state_filter); |
288 | 288 | ||
289 | static inline void show_state(void) | 289 | static inline void show_state(void) |
290 | { | 290 | { |
291 | show_state_filter(0); | 291 | show_state_filter(0); |
292 | } | 292 | } |
293 | 293 | ||
294 | extern void show_regs(struct pt_regs *); | 294 | extern void show_regs(struct pt_regs *); |
295 | 295 | ||
296 | /* | 296 | /* |
297 | * TASK is a pointer to the task whose backtrace we want to see (or NULL for current | 297 | * TASK is a pointer to the task whose backtrace we want to see (or NULL for current |
298 | * task), SP is the stack pointer of the first frame that should be shown in the back | 298 | * task), SP is the stack pointer of the first frame that should be shown in the back |
299 | * trace (or NULL if the entire call-chain of the task should be shown). | 299 | * trace (or NULL if the entire call-chain of the task should be shown). |
300 | */ | 300 | */ |
301 | extern void show_stack(struct task_struct *task, unsigned long *sp); | 301 | extern void show_stack(struct task_struct *task, unsigned long *sp); |
302 | 302 | ||
303 | void io_schedule(void); | 303 | void io_schedule(void); |
304 | long io_schedule_timeout(long timeout); | 304 | long io_schedule_timeout(long timeout); |
305 | 305 | ||
306 | extern void cpu_init (void); | 306 | extern void cpu_init (void); |
307 | extern void trap_init(void); | 307 | extern void trap_init(void); |
308 | extern void update_process_times(int user); | 308 | extern void update_process_times(int user); |
309 | extern void scheduler_tick(void); | 309 | extern void scheduler_tick(void); |
310 | 310 | ||
311 | extern void sched_show_task(struct task_struct *p); | 311 | extern void sched_show_task(struct task_struct *p); |
312 | 312 | ||
313 | #ifdef CONFIG_LOCKUP_DETECTOR | 313 | #ifdef CONFIG_LOCKUP_DETECTOR |
314 | extern void touch_softlockup_watchdog(void); | 314 | extern void touch_softlockup_watchdog(void); |
315 | extern void touch_softlockup_watchdog_sync(void); | 315 | extern void touch_softlockup_watchdog_sync(void); |
316 | extern void touch_all_softlockup_watchdogs(void); | 316 | extern void touch_all_softlockup_watchdogs(void); |
317 | extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, | 317 | extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, |
318 | void __user *buffer, | 318 | void __user *buffer, |
319 | size_t *lenp, loff_t *ppos); | 319 | size_t *lenp, loff_t *ppos); |
320 | extern unsigned int softlockup_panic; | 320 | extern unsigned int softlockup_panic; |
321 | void lockup_detector_init(void); | 321 | void lockup_detector_init(void); |
322 | #else | 322 | #else |
323 | static inline void touch_softlockup_watchdog(void) | 323 | static inline void touch_softlockup_watchdog(void) |
324 | { | 324 | { |
325 | } | 325 | } |
326 | static inline void touch_softlockup_watchdog_sync(void) | 326 | static inline void touch_softlockup_watchdog_sync(void) |
327 | { | 327 | { |
328 | } | 328 | } |
329 | static inline void touch_all_softlockup_watchdogs(void) | 329 | static inline void touch_all_softlockup_watchdogs(void) |
330 | { | 330 | { |
331 | } | 331 | } |
332 | static inline void lockup_detector_init(void) | 332 | static inline void lockup_detector_init(void) |
333 | { | 333 | { |
334 | } | 334 | } |
335 | #endif | 335 | #endif |
336 | 336 | ||
337 | #if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND) | 337 | #if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND) |
338 | void lockup_detector_bootcpu_resume(void); | 338 | void lockup_detector_bootcpu_resume(void); |
339 | #else | 339 | #else |
340 | static inline void lockup_detector_bootcpu_resume(void) | 340 | static inline void lockup_detector_bootcpu_resume(void) |
341 | { | 341 | { |
342 | } | 342 | } |
343 | #endif | 343 | #endif |
344 | 344 | ||
345 | #ifdef CONFIG_DETECT_HUNG_TASK | 345 | #ifdef CONFIG_DETECT_HUNG_TASK |
346 | extern unsigned int sysctl_hung_task_panic; | 346 | extern unsigned int sysctl_hung_task_panic; |
347 | extern unsigned long sysctl_hung_task_check_count; | 347 | extern unsigned long sysctl_hung_task_check_count; |
348 | extern unsigned long sysctl_hung_task_timeout_secs; | 348 | extern unsigned long sysctl_hung_task_timeout_secs; |
349 | extern unsigned long sysctl_hung_task_warnings; | 349 | extern unsigned long sysctl_hung_task_warnings; |
350 | extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, | 350 | extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, |
351 | void __user *buffer, | 351 | void __user *buffer, |
352 | size_t *lenp, loff_t *ppos); | 352 | size_t *lenp, loff_t *ppos); |
353 | #else | 353 | #else |
354 | /* Avoid need for ifdefs elsewhere in the code */ | 354 | /* Avoid need for ifdefs elsewhere in the code */ |
355 | enum { sysctl_hung_task_timeout_secs = 0 }; | 355 | enum { sysctl_hung_task_timeout_secs = 0 }; |
356 | #endif | 356 | #endif |
357 | 357 | ||
358 | /* Attach to any functions which should be ignored in wchan output. */ | 358 | /* Attach to any functions which should be ignored in wchan output. */ |
359 | #define __sched __attribute__((__section__(".sched.text"))) | 359 | #define __sched __attribute__((__section__(".sched.text"))) |
360 | 360 | ||
361 | /* Linker adds these: start and end of __sched functions */ | 361 | /* Linker adds these: start and end of __sched functions */ |
362 | extern char __sched_text_start[], __sched_text_end[]; | 362 | extern char __sched_text_start[], __sched_text_end[]; |
363 | 363 | ||
364 | /* Is this address in the __sched functions? */ | 364 | /* Is this address in the __sched functions? */ |
365 | extern int in_sched_functions(unsigned long addr); | 365 | extern int in_sched_functions(unsigned long addr); |
366 | 366 | ||
367 | #define MAX_SCHEDULE_TIMEOUT LONG_MAX | 367 | #define MAX_SCHEDULE_TIMEOUT LONG_MAX |
368 | extern signed long schedule_timeout(signed long timeout); | 368 | extern signed long schedule_timeout(signed long timeout); |
369 | extern signed long schedule_timeout_interruptible(signed long timeout); | 369 | extern signed long schedule_timeout_interruptible(signed long timeout); |
370 | extern signed long schedule_timeout_killable(signed long timeout); | 370 | extern signed long schedule_timeout_killable(signed long timeout); |
371 | extern signed long schedule_timeout_uninterruptible(signed long timeout); | 371 | extern signed long schedule_timeout_uninterruptible(signed long timeout); |
372 | asmlinkage void schedule(void); | 372 | asmlinkage void schedule(void); |
373 | extern void schedule_preempt_disabled(void); | 373 | extern void schedule_preempt_disabled(void); |
374 | extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); | 374 | extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); |
375 | 375 | ||
376 | struct nsproxy; | 376 | struct nsproxy; |
377 | struct user_namespace; | 377 | struct user_namespace; |
378 | 378 | ||
379 | /* | 379 | /* |
380 | * Default maximum number of active map areas, this limits the number of vmas | 380 | * Default maximum number of active map areas, this limits the number of vmas |
381 | * per mm struct. Users can overwrite this number by sysctl but there is a | 381 | * per mm struct. Users can overwrite this number by sysctl but there is a |
382 | * problem. | 382 | * problem. |
383 | * | 383 | * |
384 | * When a program's coredump is generated as ELF format, a section is created | 384 | * When a program's coredump is generated as ELF format, a section is created |
385 | * per a vma. In ELF, the number of sections is represented in unsigned short. | 385 | * per a vma. In ELF, the number of sections is represented in unsigned short. |
386 | * This means the number of sections should be smaller than 65535 at coredump. | 386 | * This means the number of sections should be smaller than 65535 at coredump. |
387 | * Because the kernel adds some informative sections to a image of program at | 387 | * Because the kernel adds some informative sections to a image of program at |
388 | * generating coredump, we need some margin. The number of extra sections is | 388 | * generating coredump, we need some margin. The number of extra sections is |
389 | * 1-3 now and depends on arch. We use "5" as safe margin, here. | 389 | * 1-3 now and depends on arch. We use "5" as safe margin, here. |
390 | */ | 390 | */ |
391 | #define MAPCOUNT_ELF_CORE_MARGIN (5) | 391 | #define MAPCOUNT_ELF_CORE_MARGIN (5) |
392 | #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) | 392 | #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) |
393 | 393 | ||
394 | extern int sysctl_max_map_count; | 394 | extern int sysctl_max_map_count; |
395 | 395 | ||
396 | #include <linux/aio.h> | 396 | #include <linux/aio.h> |
397 | 397 | ||
398 | #ifdef CONFIG_MMU | 398 | #ifdef CONFIG_MMU |
399 | extern void arch_pick_mmap_layout(struct mm_struct *mm); | 399 | extern void arch_pick_mmap_layout(struct mm_struct *mm); |
400 | extern unsigned long | 400 | extern unsigned long |
401 | arch_get_unmapped_area(struct file *, unsigned long, unsigned long, | 401 | arch_get_unmapped_area(struct file *, unsigned long, unsigned long, |
402 | unsigned long, unsigned long); | 402 | unsigned long, unsigned long); |
403 | extern unsigned long | 403 | extern unsigned long |
404 | arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, | 404 | arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, |
405 | unsigned long len, unsigned long pgoff, | 405 | unsigned long len, unsigned long pgoff, |
406 | unsigned long flags); | 406 | unsigned long flags); |
407 | extern void arch_unmap_area(struct mm_struct *, unsigned long); | 407 | extern void arch_unmap_area(struct mm_struct *, unsigned long); |
408 | extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); | 408 | extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); |
409 | #else | 409 | #else |
410 | static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} | 410 | static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} |
411 | #endif | 411 | #endif |
412 | 412 | ||
413 | 413 | ||
414 | extern void set_dumpable(struct mm_struct *mm, int value); | 414 | extern void set_dumpable(struct mm_struct *mm, int value); |
415 | extern int get_dumpable(struct mm_struct *mm); | 415 | extern int get_dumpable(struct mm_struct *mm); |
416 | 416 | ||
417 | /* get/set_dumpable() values */ | 417 | /* get/set_dumpable() values */ |
418 | #define SUID_DUMPABLE_DISABLED 0 | 418 | #define SUID_DUMPABLE_DISABLED 0 |
419 | #define SUID_DUMPABLE_ENABLED 1 | 419 | #define SUID_DUMPABLE_ENABLED 1 |
420 | #define SUID_DUMPABLE_SAFE 2 | 420 | #define SUID_DUMPABLE_SAFE 2 |
421 | 421 | ||
422 | /* mm flags */ | 422 | /* mm flags */ |
423 | /* dumpable bits */ | 423 | /* dumpable bits */ |
424 | #define MMF_DUMPABLE 0 /* core dump is permitted */ | 424 | #define MMF_DUMPABLE 0 /* core dump is permitted */ |
425 | #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ | 425 | #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ |
426 | 426 | ||
427 | #define MMF_DUMPABLE_BITS 2 | 427 | #define MMF_DUMPABLE_BITS 2 |
428 | #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) | 428 | #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) |
429 | 429 | ||
430 | /* coredump filter bits */ | 430 | /* coredump filter bits */ |
431 | #define MMF_DUMP_ANON_PRIVATE 2 | 431 | #define MMF_DUMP_ANON_PRIVATE 2 |
432 | #define MMF_DUMP_ANON_SHARED 3 | 432 | #define MMF_DUMP_ANON_SHARED 3 |
433 | #define MMF_DUMP_MAPPED_PRIVATE 4 | 433 | #define MMF_DUMP_MAPPED_PRIVATE 4 |
434 | #define MMF_DUMP_MAPPED_SHARED 5 | 434 | #define MMF_DUMP_MAPPED_SHARED 5 |
435 | #define MMF_DUMP_ELF_HEADERS 6 | 435 | #define MMF_DUMP_ELF_HEADERS 6 |
436 | #define MMF_DUMP_HUGETLB_PRIVATE 7 | 436 | #define MMF_DUMP_HUGETLB_PRIVATE 7 |
437 | #define MMF_DUMP_HUGETLB_SHARED 8 | 437 | #define MMF_DUMP_HUGETLB_SHARED 8 |
438 | 438 | ||
439 | #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS | 439 | #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS |
440 | #define MMF_DUMP_FILTER_BITS 7 | 440 | #define MMF_DUMP_FILTER_BITS 7 |
441 | #define MMF_DUMP_FILTER_MASK \ | 441 | #define MMF_DUMP_FILTER_MASK \ |
442 | (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) | 442 | (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) |
443 | #define MMF_DUMP_FILTER_DEFAULT \ | 443 | #define MMF_DUMP_FILTER_DEFAULT \ |
444 | ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ | 444 | ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ |
445 | (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) | 445 | (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) |
446 | 446 | ||
447 | #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS | 447 | #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS |
448 | # define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) | 448 | # define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) |
449 | #else | 449 | #else |
450 | # define MMF_DUMP_MASK_DEFAULT_ELF 0 | 450 | # define MMF_DUMP_MASK_DEFAULT_ELF 0 |
451 | #endif | 451 | #endif |
452 | /* leave room for more dump flags */ | 452 | /* leave room for more dump flags */ |
453 | #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ | 453 | #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ |
454 | #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ | 454 | #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ |
455 | #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ | 455 | #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ |
456 | 456 | ||
457 | #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) | 457 | #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) |
458 | 458 | ||
459 | struct sighand_struct { | 459 | struct sighand_struct { |
460 | atomic_t count; | 460 | atomic_t count; |
461 | struct k_sigaction action[_NSIG]; | 461 | struct k_sigaction action[_NSIG]; |
462 | spinlock_t siglock; | 462 | spinlock_t siglock; |
463 | wait_queue_head_t signalfd_wqh; | 463 | wait_queue_head_t signalfd_wqh; |
464 | }; | 464 | }; |
465 | 465 | ||
466 | struct pacct_struct { | 466 | struct pacct_struct { |
467 | int ac_flag; | 467 | int ac_flag; |
468 | long ac_exitcode; | 468 | long ac_exitcode; |
469 | unsigned long ac_mem; | 469 | unsigned long ac_mem; |
470 | cputime_t ac_utime, ac_stime; | 470 | cputime_t ac_utime, ac_stime; |
471 | unsigned long ac_minflt, ac_majflt; | 471 | unsigned long ac_minflt, ac_majflt; |
472 | }; | 472 | }; |
473 | 473 | ||
474 | struct cpu_itimer { | 474 | struct cpu_itimer { |
475 | cputime_t expires; | 475 | cputime_t expires; |
476 | cputime_t incr; | 476 | cputime_t incr; |
477 | u32 error; | 477 | u32 error; |
478 | u32 incr_error; | 478 | u32 incr_error; |
479 | }; | 479 | }; |
480 | 480 | ||
481 | /** | 481 | /** |
482 | * struct task_cputime - collected CPU time counts | 482 | * struct task_cputime - collected CPU time counts |
483 | * @utime: time spent in user mode, in &cputime_t units | 483 | * @utime: time spent in user mode, in &cputime_t units |
484 | * @stime: time spent in kernel mode, in &cputime_t units | 484 | * @stime: time spent in kernel mode, in &cputime_t units |
485 | * @sum_exec_runtime: total time spent on the CPU, in nanoseconds | 485 | * @sum_exec_runtime: total time spent on the CPU, in nanoseconds |
486 | * | 486 | * |
487 | * This structure groups together three kinds of CPU time that are | 487 | * This structure groups together three kinds of CPU time that are |
488 | * tracked for threads and thread groups. Most things considering | 488 | * tracked for threads and thread groups. Most things considering |
489 | * CPU time want to group these counts together and treat all three | 489 | * CPU time want to group these counts together and treat all three |
490 | * of them in parallel. | 490 | * of them in parallel. |
491 | */ | 491 | */ |
492 | struct task_cputime { | 492 | struct task_cputime { |
493 | cputime_t utime; | 493 | cputime_t utime; |
494 | cputime_t stime; | 494 | cputime_t stime; |
495 | unsigned long long sum_exec_runtime; | 495 | unsigned long long sum_exec_runtime; |
496 | }; | 496 | }; |
497 | /* Alternate field names when used to cache expirations. */ | 497 | /* Alternate field names when used to cache expirations. */ |
498 | #define prof_exp stime | 498 | #define prof_exp stime |
499 | #define virt_exp utime | 499 | #define virt_exp utime |
500 | #define sched_exp sum_exec_runtime | 500 | #define sched_exp sum_exec_runtime |
501 | 501 | ||
502 | #define INIT_CPUTIME \ | 502 | #define INIT_CPUTIME \ |
503 | (struct task_cputime) { \ | 503 | (struct task_cputime) { \ |
504 | .utime = 0, \ | 504 | .utime = 0, \ |
505 | .stime = 0, \ | 505 | .stime = 0, \ |
506 | .sum_exec_runtime = 0, \ | 506 | .sum_exec_runtime = 0, \ |
507 | } | 507 | } |
508 | 508 | ||
509 | /* | 509 | /* |
510 | * Disable preemption until the scheduler is running. | 510 | * Disable preemption until the scheduler is running. |
511 | * Reset by start_kernel()->sched_init()->init_idle(). | 511 | * Reset by start_kernel()->sched_init()->init_idle(). |
512 | * | 512 | * |
513 | * We include PREEMPT_ACTIVE to avoid cond_resched() from working | 513 | * We include PREEMPT_ACTIVE to avoid cond_resched() from working |
514 | * before the scheduler is active -- see should_resched(). | 514 | * before the scheduler is active -- see should_resched(). |
515 | */ | 515 | */ |
516 | #define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) | 516 | #define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) |
517 | 517 | ||
518 | /** | 518 | /** |
519 | * struct thread_group_cputimer - thread group interval timer counts | 519 | * struct thread_group_cputimer - thread group interval timer counts |
520 | * @cputime: thread group interval timers. | 520 | * @cputime: thread group interval timers. |
521 | * @running: non-zero when there are timers running and | 521 | * @running: non-zero when there are timers running and |
522 | * @cputime receives updates. | 522 | * @cputime receives updates. |
523 | * @lock: lock for fields in this struct. | 523 | * @lock: lock for fields in this struct. |
524 | * | 524 | * |
525 | * This structure contains the version of task_cputime, above, that is | 525 | * This structure contains the version of task_cputime, above, that is |
526 | * used for thread group CPU timer calculations. | 526 | * used for thread group CPU timer calculations. |
527 | */ | 527 | */ |
528 | struct thread_group_cputimer { | 528 | struct thread_group_cputimer { |
529 | struct task_cputime cputime; | 529 | struct task_cputime cputime; |
530 | int running; | 530 | int running; |
531 | raw_spinlock_t lock; | 531 | raw_spinlock_t lock; |
532 | }; | 532 | }; |
533 | 533 | ||
534 | #include <linux/rwsem.h> | 534 | #include <linux/rwsem.h> |
535 | struct autogroup; | 535 | struct autogroup; |
536 | 536 | ||
537 | /* | 537 | /* |
538 | * NOTE! "signal_struct" does not have its own | 538 | * NOTE! "signal_struct" does not have its own |
539 | * locking, because a shared signal_struct always | 539 | * locking, because a shared signal_struct always |
540 | * implies a shared sighand_struct, so locking | 540 | * implies a shared sighand_struct, so locking |
541 | * sighand_struct is always a proper superset of | 541 | * sighand_struct is always a proper superset of |
542 | * the locking of signal_struct. | 542 | * the locking of signal_struct. |
543 | */ | 543 | */ |
544 | struct signal_struct { | 544 | struct signal_struct { |
545 | atomic_t sigcnt; | 545 | atomic_t sigcnt; |
546 | atomic_t live; | 546 | atomic_t live; |
547 | int nr_threads; | 547 | int nr_threads; |
548 | 548 | ||
549 | wait_queue_head_t wait_chldexit; /* for wait4() */ | 549 | wait_queue_head_t wait_chldexit; /* for wait4() */ |
550 | 550 | ||
551 | /* current thread group signal load-balancing target: */ | 551 | /* current thread group signal load-balancing target: */ |
552 | struct task_struct *curr_target; | 552 | struct task_struct *curr_target; |
553 | 553 | ||
554 | /* shared signal handling: */ | 554 | /* shared signal handling: */ |
555 | struct sigpending shared_pending; | 555 | struct sigpending shared_pending; |
556 | 556 | ||
557 | /* thread group exit support */ | 557 | /* thread group exit support */ |
558 | int group_exit_code; | 558 | int group_exit_code; |
559 | /* overloaded: | 559 | /* overloaded: |
560 | * - notify group_exit_task when ->count is equal to notify_count | 560 | * - notify group_exit_task when ->count is equal to notify_count |
561 | * - everyone except group_exit_task is stopped during signal delivery | 561 | * - everyone except group_exit_task is stopped during signal delivery |
562 | * of fatal signals, group_exit_task processes the signal. | 562 | * of fatal signals, group_exit_task processes the signal. |
563 | */ | 563 | */ |
564 | int notify_count; | 564 | int notify_count; |
565 | struct task_struct *group_exit_task; | 565 | struct task_struct *group_exit_task; |
566 | 566 | ||
567 | /* thread group stop support, overloads group_exit_code too */ | 567 | /* thread group stop support, overloads group_exit_code too */ |
568 | int group_stop_count; | 568 | int group_stop_count; |
569 | unsigned int flags; /* see SIGNAL_* flags below */ | 569 | unsigned int flags; /* see SIGNAL_* flags below */ |
570 | 570 | ||
571 | /* | 571 | /* |
572 | * PR_SET_CHILD_SUBREAPER marks a process, like a service | 572 | * PR_SET_CHILD_SUBREAPER marks a process, like a service |
573 | * manager, to re-parent orphan (double-forking) child processes | 573 | * manager, to re-parent orphan (double-forking) child processes |
574 | * to this process instead of 'init'. The service manager is | 574 | * to this process instead of 'init'. The service manager is |
575 | * able to receive SIGCHLD signals and is able to investigate | 575 | * able to receive SIGCHLD signals and is able to investigate |
576 | * the process until it calls wait(). All children of this | 576 | * the process until it calls wait(). All children of this |
577 | * process will inherit a flag if they should look for a | 577 | * process will inherit a flag if they should look for a |
578 | * child_subreaper process at exit. | 578 | * child_subreaper process at exit. |
579 | */ | 579 | */ |
580 | unsigned int is_child_subreaper:1; | 580 | unsigned int is_child_subreaper:1; |
581 | unsigned int has_child_subreaper:1; | 581 | unsigned int has_child_subreaper:1; |
582 | 582 | ||
583 | /* POSIX.1b Interval Timers */ | 583 | /* POSIX.1b Interval Timers */ |
584 | struct list_head posix_timers; | 584 | struct list_head posix_timers; |
585 | 585 | ||
586 | /* ITIMER_REAL timer for the process */ | 586 | /* ITIMER_REAL timer for the process */ |
587 | struct hrtimer real_timer; | 587 | struct hrtimer real_timer; |
588 | struct pid *leader_pid; | 588 | struct pid *leader_pid; |
589 | ktime_t it_real_incr; | 589 | ktime_t it_real_incr; |
590 | 590 | ||
591 | /* | 591 | /* |
592 | * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use | 592 | * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use |
593 | * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these | 593 | * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these |
594 | * values are defined to 0 and 1 respectively | 594 | * values are defined to 0 and 1 respectively |
595 | */ | 595 | */ |
596 | struct cpu_itimer it[2]; | 596 | struct cpu_itimer it[2]; |
597 | 597 | ||
598 | /* | 598 | /* |
599 | * Thread group totals for process CPU timers. | 599 | * Thread group totals for process CPU timers. |
600 | * See thread_group_cputimer(), et al, for details. | 600 | * See thread_group_cputimer(), et al, for details. |
601 | */ | 601 | */ |
602 | struct thread_group_cputimer cputimer; | 602 | struct thread_group_cputimer cputimer; |
603 | 603 | ||
604 | /* Earliest-expiration cache. */ | 604 | /* Earliest-expiration cache. */ |
605 | struct task_cputime cputime_expires; | 605 | struct task_cputime cputime_expires; |
606 | 606 | ||
607 | struct list_head cpu_timers[3]; | 607 | struct list_head cpu_timers[3]; |
608 | 608 | ||
609 | struct pid *tty_old_pgrp; | 609 | struct pid *tty_old_pgrp; |
610 | 610 | ||
611 | /* boolean value for session group leader */ | 611 | /* boolean value for session group leader */ |
612 | int leader; | 612 | int leader; |
613 | 613 | ||
614 | struct tty_struct *tty; /* NULL if no tty */ | 614 | struct tty_struct *tty; /* NULL if no tty */ |
615 | 615 | ||
616 | #ifdef CONFIG_SCHED_AUTOGROUP | 616 | #ifdef CONFIG_SCHED_AUTOGROUP |
617 | struct autogroup *autogroup; | 617 | struct autogroup *autogroup; |
618 | #endif | 618 | #endif |
619 | /* | 619 | /* |
620 | * Cumulative resource counters for dead threads in the group, | 620 | * Cumulative resource counters for dead threads in the group, |
621 | * and for reaped dead child processes forked by this group. | 621 | * and for reaped dead child processes forked by this group. |
622 | * Live threads maintain their own counters and add to these | 622 | * Live threads maintain their own counters and add to these |
623 | * in __exit_signal, except for the group leader. | 623 | * in __exit_signal, except for the group leader. |
624 | */ | 624 | */ |
625 | cputime_t utime, stime, cutime, cstime; | 625 | cputime_t utime, stime, cutime, cstime; |
626 | cputime_t gtime; | 626 | cputime_t gtime; |
627 | cputime_t cgtime; | 627 | cputime_t cgtime; |
628 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 628 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
629 | cputime_t prev_utime, prev_stime; | 629 | cputime_t prev_utime, prev_stime; |
630 | #endif | 630 | #endif |
631 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; | 631 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; |
632 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; | 632 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; |
633 | unsigned long inblock, oublock, cinblock, coublock; | 633 | unsigned long inblock, oublock, cinblock, coublock; |
634 | unsigned long maxrss, cmaxrss; | 634 | unsigned long maxrss, cmaxrss; |
635 | struct task_io_accounting ioac; | 635 | struct task_io_accounting ioac; |
636 | 636 | ||
637 | /* | 637 | /* |
638 | * Cumulative ns of schedule CPU time fo dead threads in the | 638 | * Cumulative ns of schedule CPU time fo dead threads in the |
639 | * group, not including a zombie group leader, (This only differs | 639 | * group, not including a zombie group leader, (This only differs |
640 | * from jiffies_to_ns(utime + stime) if sched_clock uses something | 640 | * from jiffies_to_ns(utime + stime) if sched_clock uses something |
641 | * other than jiffies.) | 641 | * other than jiffies.) |
642 | */ | 642 | */ |
643 | unsigned long long sum_sched_runtime; | 643 | unsigned long long sum_sched_runtime; |
644 | 644 | ||
645 | /* | 645 | /* |
646 | * We don't bother to synchronize most readers of this at all, | 646 | * We don't bother to synchronize most readers of this at all, |
647 | * because there is no reader checking a limit that actually needs | 647 | * because there is no reader checking a limit that actually needs |
648 | * to get both rlim_cur and rlim_max atomically, and either one | 648 | * to get both rlim_cur and rlim_max atomically, and either one |
649 | * alone is a single word that can safely be read normally. | 649 | * alone is a single word that can safely be read normally. |
650 | * getrlimit/setrlimit use task_lock(current->group_leader) to | 650 | * getrlimit/setrlimit use task_lock(current->group_leader) to |
651 | * protect this instead of the siglock, because they really | 651 | * protect this instead of the siglock, because they really |
652 | * have no need to disable irqs. | 652 | * have no need to disable irqs. |
653 | */ | 653 | */ |
654 | struct rlimit rlim[RLIM_NLIMITS]; | 654 | struct rlimit rlim[RLIM_NLIMITS]; |
655 | 655 | ||
656 | #ifdef CONFIG_BSD_PROCESS_ACCT | 656 | #ifdef CONFIG_BSD_PROCESS_ACCT |
657 | struct pacct_struct pacct; /* per-process accounting information */ | 657 | struct pacct_struct pacct; /* per-process accounting information */ |
658 | #endif | 658 | #endif |
659 | #ifdef CONFIG_TASKSTATS | 659 | #ifdef CONFIG_TASKSTATS |
660 | struct taskstats *stats; | 660 | struct taskstats *stats; |
661 | #endif | 661 | #endif |
662 | #ifdef CONFIG_AUDIT | 662 | #ifdef CONFIG_AUDIT |
663 | unsigned audit_tty; | 663 | unsigned audit_tty; |
664 | struct tty_audit_buf *tty_audit_buf; | 664 | struct tty_audit_buf *tty_audit_buf; |
665 | #endif | 665 | #endif |
666 | #ifdef CONFIG_CGROUPS | 666 | #ifdef CONFIG_CGROUPS |
667 | /* | 667 | /* |
668 | * group_rwsem prevents new tasks from entering the threadgroup and | 668 | * group_rwsem prevents new tasks from entering the threadgroup and |
669 | * member tasks from exiting,a more specifically, setting of | 669 | * member tasks from exiting,a more specifically, setting of |
670 | * PF_EXITING. fork and exit paths are protected with this rwsem | 670 | * PF_EXITING. fork and exit paths are protected with this rwsem |
671 | * using threadgroup_change_begin/end(). Users which require | 671 | * using threadgroup_change_begin/end(). Users which require |
672 | * threadgroup to remain stable should use threadgroup_[un]lock() | 672 | * threadgroup to remain stable should use threadgroup_[un]lock() |
673 | * which also takes care of exec path. Currently, cgroup is the | 673 | * which also takes care of exec path. Currently, cgroup is the |
674 | * only user. | 674 | * only user. |
675 | */ | 675 | */ |
676 | struct rw_semaphore group_rwsem; | 676 | struct rw_semaphore group_rwsem; |
677 | #endif | 677 | #endif |
678 | 678 | ||
679 | int oom_adj; /* OOM kill score adjustment (bit shift) */ | 679 | int oom_adj; /* OOM kill score adjustment (bit shift) */ |
680 | int oom_score_adj; /* OOM kill score adjustment */ | 680 | int oom_score_adj; /* OOM kill score adjustment */ |
681 | int oom_score_adj_min; /* OOM kill score adjustment minimum value. | 681 | int oom_score_adj_min; /* OOM kill score adjustment minimum value. |
682 | * Only settable by CAP_SYS_RESOURCE. */ | 682 | * Only settable by CAP_SYS_RESOURCE. */ |
683 | 683 | ||
684 | struct mutex cred_guard_mutex; /* guard against foreign influences on | 684 | struct mutex cred_guard_mutex; /* guard against foreign influences on |
685 | * credential calculations | 685 | * credential calculations |
686 | * (notably. ptrace) */ | 686 | * (notably. ptrace) */ |
687 | }; | 687 | }; |
688 | 688 | ||
689 | /* Context switch must be unlocked if interrupts are to be enabled */ | 689 | /* Context switch must be unlocked if interrupts are to be enabled */ |
690 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 690 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
691 | # define __ARCH_WANT_UNLOCKED_CTXSW | 691 | # define __ARCH_WANT_UNLOCKED_CTXSW |
692 | #endif | 692 | #endif |
693 | 693 | ||
694 | /* | 694 | /* |
695 | * Bits in flags field of signal_struct. | 695 | * Bits in flags field of signal_struct. |
696 | */ | 696 | */ |
697 | #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ | 697 | #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ |
698 | #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ | 698 | #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ |
699 | #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ | 699 | #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ |
700 | /* | 700 | /* |
701 | * Pending notifications to parent. | 701 | * Pending notifications to parent. |
702 | */ | 702 | */ |
703 | #define SIGNAL_CLD_STOPPED 0x00000010 | 703 | #define SIGNAL_CLD_STOPPED 0x00000010 |
704 | #define SIGNAL_CLD_CONTINUED 0x00000020 | 704 | #define SIGNAL_CLD_CONTINUED 0x00000020 |
705 | #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) | 705 | #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) |
706 | 706 | ||
707 | #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ | 707 | #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ |
708 | 708 | ||
709 | /* If true, all threads except ->group_exit_task have pending SIGKILL */ | 709 | /* If true, all threads except ->group_exit_task have pending SIGKILL */ |
710 | static inline int signal_group_exit(const struct signal_struct *sig) | 710 | static inline int signal_group_exit(const struct signal_struct *sig) |
711 | { | 711 | { |
712 | return (sig->flags & SIGNAL_GROUP_EXIT) || | 712 | return (sig->flags & SIGNAL_GROUP_EXIT) || |
713 | (sig->group_exit_task != NULL); | 713 | (sig->group_exit_task != NULL); |
714 | } | 714 | } |
715 | 715 | ||
716 | /* | 716 | /* |
717 | * Some day this will be a full-fledged user tracking system... | 717 | * Some day this will be a full-fledged user tracking system... |
718 | */ | 718 | */ |
719 | struct user_struct { | 719 | struct user_struct { |
720 | atomic_t __count; /* reference count */ | 720 | atomic_t __count; /* reference count */ |
721 | atomic_t processes; /* How many processes does this user have? */ | 721 | atomic_t processes; /* How many processes does this user have? */ |
722 | atomic_t files; /* How many open files does this user have? */ | 722 | atomic_t files; /* How many open files does this user have? */ |
723 | atomic_t sigpending; /* How many pending signals does this user have? */ | 723 | atomic_t sigpending; /* How many pending signals does this user have? */ |
724 | #ifdef CONFIG_INOTIFY_USER | 724 | #ifdef CONFIG_INOTIFY_USER |
725 | atomic_t inotify_watches; /* How many inotify watches does this user have? */ | 725 | atomic_t inotify_watches; /* How many inotify watches does this user have? */ |
726 | atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ | 726 | atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ |
727 | #endif | 727 | #endif |
728 | #ifdef CONFIG_FANOTIFY | 728 | #ifdef CONFIG_FANOTIFY |
729 | atomic_t fanotify_listeners; | 729 | atomic_t fanotify_listeners; |
730 | #endif | 730 | #endif |
731 | #ifdef CONFIG_EPOLL | 731 | #ifdef CONFIG_EPOLL |
732 | atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ | 732 | atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ |
733 | #endif | 733 | #endif |
734 | #ifdef CONFIG_POSIX_MQUEUE | 734 | #ifdef CONFIG_POSIX_MQUEUE |
735 | /* protected by mq_lock */ | 735 | /* protected by mq_lock */ |
736 | unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ | 736 | unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ |
737 | #endif | 737 | #endif |
738 | unsigned long locked_shm; /* How many pages of mlocked shm ? */ | 738 | unsigned long locked_shm; /* How many pages of mlocked shm ? */ |
739 | 739 | ||
740 | #ifdef CONFIG_KEYS | 740 | #ifdef CONFIG_KEYS |
741 | struct key *uid_keyring; /* UID specific keyring */ | 741 | struct key *uid_keyring; /* UID specific keyring */ |
742 | struct key *session_keyring; /* UID's default session keyring */ | 742 | struct key *session_keyring; /* UID's default session keyring */ |
743 | #endif | 743 | #endif |
744 | 744 | ||
745 | /* Hash table maintenance information */ | 745 | /* Hash table maintenance information */ |
746 | struct hlist_node uidhash_node; | 746 | struct hlist_node uidhash_node; |
747 | kuid_t uid; | 747 | kuid_t uid; |
748 | 748 | ||
749 | #ifdef CONFIG_PERF_EVENTS | 749 | #ifdef CONFIG_PERF_EVENTS |
750 | atomic_long_t locked_vm; | 750 | atomic_long_t locked_vm; |
751 | #endif | 751 | #endif |
752 | }; | 752 | }; |
753 | 753 | ||
754 | extern int uids_sysfs_init(void); | 754 | extern int uids_sysfs_init(void); |
755 | 755 | ||
756 | extern struct user_struct *find_user(kuid_t); | 756 | extern struct user_struct *find_user(kuid_t); |
757 | 757 | ||
758 | extern struct user_struct root_user; | 758 | extern struct user_struct root_user; |
759 | #define INIT_USER (&root_user) | 759 | #define INIT_USER (&root_user) |
760 | 760 | ||
761 | 761 | ||
762 | struct backing_dev_info; | 762 | struct backing_dev_info; |
763 | struct reclaim_state; | 763 | struct reclaim_state; |
764 | 764 | ||
765 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 765 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
766 | struct sched_info { | 766 | struct sched_info { |
767 | /* cumulative counters */ | 767 | /* cumulative counters */ |
768 | unsigned long pcount; /* # of times run on this cpu */ | 768 | unsigned long pcount; /* # of times run on this cpu */ |
769 | unsigned long long run_delay; /* time spent waiting on a runqueue */ | 769 | unsigned long long run_delay; /* time spent waiting on a runqueue */ |
770 | 770 | ||
771 | /* timestamps */ | 771 | /* timestamps */ |
772 | unsigned long long last_arrival,/* when we last ran on a cpu */ | 772 | unsigned long long last_arrival,/* when we last ran on a cpu */ |
773 | last_queued; /* when we were last queued to run */ | 773 | last_queued; /* when we were last queued to run */ |
774 | }; | 774 | }; |
775 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ | 775 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ |
776 | 776 | ||
777 | #ifdef CONFIG_TASK_DELAY_ACCT | 777 | #ifdef CONFIG_TASK_DELAY_ACCT |
778 | struct task_delay_info { | 778 | struct task_delay_info { |
779 | spinlock_t lock; | 779 | spinlock_t lock; |
780 | unsigned int flags; /* Private per-task flags */ | 780 | unsigned int flags; /* Private per-task flags */ |
781 | 781 | ||
782 | /* For each stat XXX, add following, aligned appropriately | 782 | /* For each stat XXX, add following, aligned appropriately |
783 | * | 783 | * |
784 | * struct timespec XXX_start, XXX_end; | 784 | * struct timespec XXX_start, XXX_end; |
785 | * u64 XXX_delay; | 785 | * u64 XXX_delay; |
786 | * u32 XXX_count; | 786 | * u32 XXX_count; |
787 | * | 787 | * |
788 | * Atomicity of updates to XXX_delay, XXX_count protected by | 788 | * Atomicity of updates to XXX_delay, XXX_count protected by |
789 | * single lock above (split into XXX_lock if contention is an issue). | 789 | * single lock above (split into XXX_lock if contention is an issue). |
790 | */ | 790 | */ |
791 | 791 | ||
792 | /* | 792 | /* |
793 | * XXX_count is incremented on every XXX operation, the delay | 793 | * XXX_count is incremented on every XXX operation, the delay |
794 | * associated with the operation is added to XXX_delay. | 794 | * associated with the operation is added to XXX_delay. |
795 | * XXX_delay contains the accumulated delay time in nanoseconds. | 795 | * XXX_delay contains the accumulated delay time in nanoseconds. |
796 | */ | 796 | */ |
797 | struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ | 797 | struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ |
798 | u64 blkio_delay; /* wait for sync block io completion */ | 798 | u64 blkio_delay; /* wait for sync block io completion */ |
799 | u64 swapin_delay; /* wait for swapin block io completion */ | 799 | u64 swapin_delay; /* wait for swapin block io completion */ |
800 | u32 blkio_count; /* total count of the number of sync block */ | 800 | u32 blkio_count; /* total count of the number of sync block */ |
801 | /* io operations performed */ | 801 | /* io operations performed */ |
802 | u32 swapin_count; /* total count of the number of swapin block */ | 802 | u32 swapin_count; /* total count of the number of swapin block */ |
803 | /* io operations performed */ | 803 | /* io operations performed */ |
804 | 804 | ||
805 | struct timespec freepages_start, freepages_end; | 805 | struct timespec freepages_start, freepages_end; |
806 | u64 freepages_delay; /* wait for memory reclaim */ | 806 | u64 freepages_delay; /* wait for memory reclaim */ |
807 | u32 freepages_count; /* total count of memory reclaim */ | 807 | u32 freepages_count; /* total count of memory reclaim */ |
808 | }; | 808 | }; |
809 | #endif /* CONFIG_TASK_DELAY_ACCT */ | 809 | #endif /* CONFIG_TASK_DELAY_ACCT */ |
810 | 810 | ||
811 | static inline int sched_info_on(void) | 811 | static inline int sched_info_on(void) |
812 | { | 812 | { |
813 | #ifdef CONFIG_SCHEDSTATS | 813 | #ifdef CONFIG_SCHEDSTATS |
814 | return 1; | 814 | return 1; |
815 | #elif defined(CONFIG_TASK_DELAY_ACCT) | 815 | #elif defined(CONFIG_TASK_DELAY_ACCT) |
816 | extern int delayacct_on; | 816 | extern int delayacct_on; |
817 | return delayacct_on; | 817 | return delayacct_on; |
818 | #else | 818 | #else |
819 | return 0; | 819 | return 0; |
820 | #endif | 820 | #endif |
821 | } | 821 | } |
822 | 822 | ||
823 | enum cpu_idle_type { | 823 | enum cpu_idle_type { |
824 | CPU_IDLE, | 824 | CPU_IDLE, |
825 | CPU_NOT_IDLE, | 825 | CPU_NOT_IDLE, |
826 | CPU_NEWLY_IDLE, | 826 | CPU_NEWLY_IDLE, |
827 | CPU_MAX_IDLE_TYPES | 827 | CPU_MAX_IDLE_TYPES |
828 | }; | 828 | }; |
829 | 829 | ||
830 | /* | 830 | /* |
831 | * Increase resolution of nice-level calculations for 64-bit architectures. | 831 | * Increase resolution of nice-level calculations for 64-bit architectures. |
832 | * The extra resolution improves shares distribution and load balancing of | 832 | * The extra resolution improves shares distribution and load balancing of |
833 | * low-weight task groups (e.g. nice +19 on an autogroup), deeper taskgroup | 833 | * low-weight task groups (e.g. nice +19 on an autogroup), deeper taskgroup |
834 | * hierarchies, especially on larger systems. This is not a user-visible change | 834 | * hierarchies, especially on larger systems. This is not a user-visible change |
835 | * and does not change the user-interface for setting shares/weights. | 835 | * and does not change the user-interface for setting shares/weights. |
836 | * | 836 | * |
837 | * We increase resolution only if we have enough bits to allow this increased | 837 | * We increase resolution only if we have enough bits to allow this increased |
838 | * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution | 838 | * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution |
839 | * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the | 839 | * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the |
840 | * increased costs. | 840 | * increased costs. |
841 | */ | 841 | */ |
842 | #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ | 842 | #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ |
843 | # define SCHED_LOAD_RESOLUTION 10 | 843 | # define SCHED_LOAD_RESOLUTION 10 |
844 | # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) | 844 | # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) |
845 | # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) | 845 | # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) |
846 | #else | 846 | #else |
847 | # define SCHED_LOAD_RESOLUTION 0 | 847 | # define SCHED_LOAD_RESOLUTION 0 |
848 | # define scale_load(w) (w) | 848 | # define scale_load(w) (w) |
849 | # define scale_load_down(w) (w) | 849 | # define scale_load_down(w) (w) |
850 | #endif | 850 | #endif |
851 | 851 | ||
852 | #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) | 852 | #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) |
853 | #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) | 853 | #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) |
854 | 854 | ||
855 | /* | 855 | /* |
856 | * Increase resolution of cpu_power calculations | 856 | * Increase resolution of cpu_power calculations |
857 | */ | 857 | */ |
858 | #define SCHED_POWER_SHIFT 10 | 858 | #define SCHED_POWER_SHIFT 10 |
859 | #define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) | 859 | #define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) |
860 | 860 | ||
861 | /* | 861 | /* |
862 | * sched-domains (multiprocessor balancing) declarations: | 862 | * sched-domains (multiprocessor balancing) declarations: |
863 | */ | 863 | */ |
864 | #ifdef CONFIG_SMP | 864 | #ifdef CONFIG_SMP |
865 | #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ | 865 | #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ |
866 | #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ | 866 | #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ |
867 | #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ | 867 | #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ |
868 | #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ | 868 | #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ |
869 | #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ | 869 | #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ |
870 | #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ | 870 | #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ |
871 | #define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ | 871 | #define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ |
872 | #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ | 872 | #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ |
873 | #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ | 873 | #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ |
874 | #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ | 874 | #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ |
875 | #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ | 875 | #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ |
876 | #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ | 876 | #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ |
877 | #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ | 877 | #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ |
878 | 878 | ||
879 | extern int __weak arch_sd_sibiling_asym_packing(void); | 879 | extern int __weak arch_sd_sibiling_asym_packing(void); |
880 | 880 | ||
881 | struct sched_group_power { | 881 | struct sched_group_power { |
882 | atomic_t ref; | 882 | atomic_t ref; |
883 | /* | 883 | /* |
884 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | 884 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a |
885 | * single CPU. | 885 | * single CPU. |
886 | */ | 886 | */ |
887 | unsigned int power, power_orig; | 887 | unsigned int power, power_orig; |
888 | unsigned long next_update; | 888 | unsigned long next_update; |
889 | /* | 889 | /* |
890 | * Number of busy cpus in this group. | 890 | * Number of busy cpus in this group. |
891 | */ | 891 | */ |
892 | atomic_t nr_busy_cpus; | 892 | atomic_t nr_busy_cpus; |
893 | 893 | ||
894 | unsigned long cpumask[0]; /* iteration mask */ | 894 | unsigned long cpumask[0]; /* iteration mask */ |
895 | }; | 895 | }; |
896 | 896 | ||
897 | struct sched_group { | 897 | struct sched_group { |
898 | struct sched_group *next; /* Must be a circular list */ | 898 | struct sched_group *next; /* Must be a circular list */ |
899 | atomic_t ref; | 899 | atomic_t ref; |
900 | 900 | ||
901 | unsigned int group_weight; | 901 | unsigned int group_weight; |
902 | struct sched_group_power *sgp; | 902 | struct sched_group_power *sgp; |
903 | 903 | ||
904 | /* | 904 | /* |
905 | * The CPUs this group covers. | 905 | * The CPUs this group covers. |
906 | * | 906 | * |
907 | * NOTE: this field is variable length. (Allocated dynamically | 907 | * NOTE: this field is variable length. (Allocated dynamically |
908 | * by attaching extra space to the end of the structure, | 908 | * by attaching extra space to the end of the structure, |
909 | * depending on how many CPUs the kernel has booted up with) | 909 | * depending on how many CPUs the kernel has booted up with) |
910 | */ | 910 | */ |
911 | unsigned long cpumask[0]; | 911 | unsigned long cpumask[0]; |
912 | }; | 912 | }; |
913 | 913 | ||
914 | static inline struct cpumask *sched_group_cpus(struct sched_group *sg) | 914 | static inline struct cpumask *sched_group_cpus(struct sched_group *sg) |
915 | { | 915 | { |
916 | return to_cpumask(sg->cpumask); | 916 | return to_cpumask(sg->cpumask); |
917 | } | 917 | } |
918 | 918 | ||
919 | /* | 919 | /* |
920 | * cpumask masking which cpus in the group are allowed to iterate up the domain | 920 | * cpumask masking which cpus in the group are allowed to iterate up the domain |
921 | * tree. | 921 | * tree. |
922 | */ | 922 | */ |
923 | static inline struct cpumask *sched_group_mask(struct sched_group *sg) | 923 | static inline struct cpumask *sched_group_mask(struct sched_group *sg) |
924 | { | 924 | { |
925 | return to_cpumask(sg->sgp->cpumask); | 925 | return to_cpumask(sg->sgp->cpumask); |
926 | } | 926 | } |
927 | 927 | ||
928 | /** | 928 | /** |
929 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | 929 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. |
930 | * @group: The group whose first cpu is to be returned. | 930 | * @group: The group whose first cpu is to be returned. |
931 | */ | 931 | */ |
932 | static inline unsigned int group_first_cpu(struct sched_group *group) | 932 | static inline unsigned int group_first_cpu(struct sched_group *group) |
933 | { | 933 | { |
934 | return cpumask_first(sched_group_cpus(group)); | 934 | return cpumask_first(sched_group_cpus(group)); |
935 | } | 935 | } |
936 | 936 | ||
937 | struct sched_domain_attr { | 937 | struct sched_domain_attr { |
938 | int relax_domain_level; | 938 | int relax_domain_level; |
939 | }; | 939 | }; |
940 | 940 | ||
941 | #define SD_ATTR_INIT (struct sched_domain_attr) { \ | 941 | #define SD_ATTR_INIT (struct sched_domain_attr) { \ |
942 | .relax_domain_level = -1, \ | 942 | .relax_domain_level = -1, \ |
943 | } | 943 | } |
944 | 944 | ||
945 | extern int sched_domain_level_max; | 945 | extern int sched_domain_level_max; |
946 | 946 | ||
947 | struct sched_domain { | 947 | struct sched_domain { |
948 | /* These fields must be setup */ | 948 | /* These fields must be setup */ |
949 | struct sched_domain *parent; /* top domain must be null terminated */ | 949 | struct sched_domain *parent; /* top domain must be null terminated */ |
950 | struct sched_domain *child; /* bottom domain must be null terminated */ | 950 | struct sched_domain *child; /* bottom domain must be null terminated */ |
951 | struct sched_group *groups; /* the balancing groups of the domain */ | 951 | struct sched_group *groups; /* the balancing groups of the domain */ |
952 | unsigned long min_interval; /* Minimum balance interval ms */ | 952 | unsigned long min_interval; /* Minimum balance interval ms */ |
953 | unsigned long max_interval; /* Maximum balance interval ms */ | 953 | unsigned long max_interval; /* Maximum balance interval ms */ |
954 | unsigned int busy_factor; /* less balancing by factor if busy */ | 954 | unsigned int busy_factor; /* less balancing by factor if busy */ |
955 | unsigned int imbalance_pct; /* No balance until over watermark */ | 955 | unsigned int imbalance_pct; /* No balance until over watermark */ |
956 | unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ | 956 | unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ |
957 | unsigned int busy_idx; | 957 | unsigned int busy_idx; |
958 | unsigned int idle_idx; | 958 | unsigned int idle_idx; |
959 | unsigned int newidle_idx; | 959 | unsigned int newidle_idx; |
960 | unsigned int wake_idx; | 960 | unsigned int wake_idx; |
961 | unsigned int forkexec_idx; | 961 | unsigned int forkexec_idx; |
962 | unsigned int smt_gain; | 962 | unsigned int smt_gain; |
963 | int flags; /* See SD_* */ | 963 | int flags; /* See SD_* */ |
964 | int level; | 964 | int level; |
965 | int idle_buddy; /* cpu assigned to select_idle_sibling() */ | 965 | int idle_buddy; /* cpu assigned to select_idle_sibling() */ |
966 | 966 | ||
967 | /* Runtime fields. */ | 967 | /* Runtime fields. */ |
968 | unsigned long last_balance; /* init to jiffies. units in jiffies */ | 968 | unsigned long last_balance; /* init to jiffies. units in jiffies */ |
969 | unsigned int balance_interval; /* initialise to 1. units in ms. */ | 969 | unsigned int balance_interval; /* initialise to 1. units in ms. */ |
970 | unsigned int nr_balance_failed; /* initialise to 0 */ | 970 | unsigned int nr_balance_failed; /* initialise to 0 */ |
971 | 971 | ||
972 | u64 last_update; | 972 | u64 last_update; |
973 | 973 | ||
974 | #ifdef CONFIG_SCHEDSTATS | 974 | #ifdef CONFIG_SCHEDSTATS |
975 | /* load_balance() stats */ | 975 | /* load_balance() stats */ |
976 | unsigned int lb_count[CPU_MAX_IDLE_TYPES]; | 976 | unsigned int lb_count[CPU_MAX_IDLE_TYPES]; |
977 | unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; | 977 | unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; |
978 | unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; | 978 | unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; |
979 | unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; | 979 | unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; |
980 | unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; | 980 | unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; |
981 | unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; | 981 | unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; |
982 | unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; | 982 | unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; |
983 | unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; | 983 | unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; |
984 | 984 | ||
985 | /* Active load balancing */ | 985 | /* Active load balancing */ |
986 | unsigned int alb_count; | 986 | unsigned int alb_count; |
987 | unsigned int alb_failed; | 987 | unsigned int alb_failed; |
988 | unsigned int alb_pushed; | 988 | unsigned int alb_pushed; |
989 | 989 | ||
990 | /* SD_BALANCE_EXEC stats */ | 990 | /* SD_BALANCE_EXEC stats */ |
991 | unsigned int sbe_count; | 991 | unsigned int sbe_count; |
992 | unsigned int sbe_balanced; | 992 | unsigned int sbe_balanced; |
993 | unsigned int sbe_pushed; | 993 | unsigned int sbe_pushed; |
994 | 994 | ||
995 | /* SD_BALANCE_FORK stats */ | 995 | /* SD_BALANCE_FORK stats */ |
996 | unsigned int sbf_count; | 996 | unsigned int sbf_count; |
997 | unsigned int sbf_balanced; | 997 | unsigned int sbf_balanced; |
998 | unsigned int sbf_pushed; | 998 | unsigned int sbf_pushed; |
999 | 999 | ||
1000 | /* try_to_wake_up() stats */ | 1000 | /* try_to_wake_up() stats */ |
1001 | unsigned int ttwu_wake_remote; | 1001 | unsigned int ttwu_wake_remote; |
1002 | unsigned int ttwu_move_affine; | 1002 | unsigned int ttwu_move_affine; |
1003 | unsigned int ttwu_move_balance; | 1003 | unsigned int ttwu_move_balance; |
1004 | #endif | 1004 | #endif |
1005 | #ifdef CONFIG_SCHED_DEBUG | 1005 | #ifdef CONFIG_SCHED_DEBUG |
1006 | char *name; | 1006 | char *name; |
1007 | #endif | 1007 | #endif |
1008 | union { | 1008 | union { |
1009 | void *private; /* used during construction */ | 1009 | void *private; /* used during construction */ |
1010 | struct rcu_head rcu; /* used during destruction */ | 1010 | struct rcu_head rcu; /* used during destruction */ |
1011 | }; | 1011 | }; |
1012 | 1012 | ||
1013 | unsigned int span_weight; | 1013 | unsigned int span_weight; |
1014 | /* | 1014 | /* |
1015 | * Span of all CPUs in this domain. | 1015 | * Span of all CPUs in this domain. |
1016 | * | 1016 | * |
1017 | * NOTE: this field is variable length. (Allocated dynamically | 1017 | * NOTE: this field is variable length. (Allocated dynamically |
1018 | * by attaching extra space to the end of the structure, | 1018 | * by attaching extra space to the end of the structure, |
1019 | * depending on how many CPUs the kernel has booted up with) | 1019 | * depending on how many CPUs the kernel has booted up with) |
1020 | */ | 1020 | */ |
1021 | unsigned long span[0]; | 1021 | unsigned long span[0]; |
1022 | }; | 1022 | }; |
1023 | 1023 | ||
1024 | static inline struct cpumask *sched_domain_span(struct sched_domain *sd) | 1024 | static inline struct cpumask *sched_domain_span(struct sched_domain *sd) |
1025 | { | 1025 | { |
1026 | return to_cpumask(sd->span); | 1026 | return to_cpumask(sd->span); |
1027 | } | 1027 | } |
1028 | 1028 | ||
1029 | extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | 1029 | extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
1030 | struct sched_domain_attr *dattr_new); | 1030 | struct sched_domain_attr *dattr_new); |
1031 | 1031 | ||
1032 | /* Allocate an array of sched domains, for partition_sched_domains(). */ | 1032 | /* Allocate an array of sched domains, for partition_sched_domains(). */ |
1033 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms); | 1033 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms); |
1034 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); | 1034 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); |
1035 | 1035 | ||
1036 | /* Test a flag in parent sched domain */ | 1036 | /* Test a flag in parent sched domain */ |
1037 | static inline int test_sd_parent(struct sched_domain *sd, int flag) | 1037 | static inline int test_sd_parent(struct sched_domain *sd, int flag) |
1038 | { | 1038 | { |
1039 | if (sd->parent && (sd->parent->flags & flag)) | 1039 | if (sd->parent && (sd->parent->flags & flag)) |
1040 | return 1; | 1040 | return 1; |
1041 | 1041 | ||
1042 | return 0; | 1042 | return 0; |
1043 | } | 1043 | } |
1044 | 1044 | ||
1045 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); | 1045 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); |
1046 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); | 1046 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); |
1047 | 1047 | ||
1048 | bool cpus_share_cache(int this_cpu, int that_cpu); | 1048 | bool cpus_share_cache(int this_cpu, int that_cpu); |
1049 | 1049 | ||
1050 | #else /* CONFIG_SMP */ | 1050 | #else /* CONFIG_SMP */ |
1051 | 1051 | ||
1052 | struct sched_domain_attr; | 1052 | struct sched_domain_attr; |
1053 | 1053 | ||
1054 | static inline void | 1054 | static inline void |
1055 | partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | 1055 | partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
1056 | struct sched_domain_attr *dattr_new) | 1056 | struct sched_domain_attr *dattr_new) |
1057 | { | 1057 | { |
1058 | } | 1058 | } |
1059 | 1059 | ||
1060 | static inline bool cpus_share_cache(int this_cpu, int that_cpu) | 1060 | static inline bool cpus_share_cache(int this_cpu, int that_cpu) |
1061 | { | 1061 | { |
1062 | return true; | 1062 | return true; |
1063 | } | 1063 | } |
1064 | 1064 | ||
1065 | #endif /* !CONFIG_SMP */ | 1065 | #endif /* !CONFIG_SMP */ |
1066 | 1066 | ||
1067 | 1067 | ||
1068 | struct io_context; /* See blkdev.h */ | 1068 | struct io_context; /* See blkdev.h */ |
1069 | 1069 | ||
1070 | 1070 | ||
1071 | #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK | 1071 | #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK |
1072 | extern void prefetch_stack(struct task_struct *t); | 1072 | extern void prefetch_stack(struct task_struct *t); |
1073 | #else | 1073 | #else |
1074 | static inline void prefetch_stack(struct task_struct *t) { } | 1074 | static inline void prefetch_stack(struct task_struct *t) { } |
1075 | #endif | 1075 | #endif |
1076 | 1076 | ||
1077 | struct audit_context; /* See audit.c */ | 1077 | struct audit_context; /* See audit.c */ |
1078 | struct mempolicy; | 1078 | struct mempolicy; |
1079 | struct pipe_inode_info; | 1079 | struct pipe_inode_info; |
1080 | struct uts_namespace; | 1080 | struct uts_namespace; |
1081 | 1081 | ||
1082 | struct rq; | 1082 | struct rq; |
1083 | struct sched_domain; | 1083 | struct sched_domain; |
1084 | 1084 | ||
1085 | /* | 1085 | /* |
1086 | * wake flags | 1086 | * wake flags |
1087 | */ | 1087 | */ |
1088 | #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ | 1088 | #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ |
1089 | #define WF_FORK 0x02 /* child wakeup after fork */ | 1089 | #define WF_FORK 0x02 /* child wakeup after fork */ |
1090 | #define WF_MIGRATED 0x04 /* internal use, task got migrated */ | 1090 | #define WF_MIGRATED 0x04 /* internal use, task got migrated */ |
1091 | 1091 | ||
1092 | #define ENQUEUE_WAKEUP 1 | 1092 | #define ENQUEUE_WAKEUP 1 |
1093 | #define ENQUEUE_HEAD 2 | 1093 | #define ENQUEUE_HEAD 2 |
1094 | #ifdef CONFIG_SMP | 1094 | #ifdef CONFIG_SMP |
1095 | #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ | 1095 | #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ |
1096 | #else | 1096 | #else |
1097 | #define ENQUEUE_WAKING 0 | 1097 | #define ENQUEUE_WAKING 0 |
1098 | #endif | 1098 | #endif |
1099 | 1099 | ||
1100 | #define DEQUEUE_SLEEP 1 | 1100 | #define DEQUEUE_SLEEP 1 |
1101 | 1101 | ||
1102 | struct sched_class { | 1102 | struct sched_class { |
1103 | const struct sched_class *next; | 1103 | const struct sched_class *next; |
1104 | 1104 | ||
1105 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); | 1105 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); |
1106 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); | 1106 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); |
1107 | void (*yield_task) (struct rq *rq); | 1107 | void (*yield_task) (struct rq *rq); |
1108 | bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); | 1108 | bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); |
1109 | 1109 | ||
1110 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); | 1110 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); |
1111 | 1111 | ||
1112 | struct task_struct * (*pick_next_task) (struct rq *rq); | 1112 | struct task_struct * (*pick_next_task) (struct rq *rq); |
1113 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1113 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
1114 | 1114 | ||
1115 | #ifdef CONFIG_SMP | 1115 | #ifdef CONFIG_SMP |
1116 | int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); | 1116 | int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); |
1117 | 1117 | ||
1118 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | 1118 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); |
1119 | void (*post_schedule) (struct rq *this_rq); | 1119 | void (*post_schedule) (struct rq *this_rq); |
1120 | void (*task_waking) (struct task_struct *task); | 1120 | void (*task_waking) (struct task_struct *task); |
1121 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1121 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); |
1122 | 1122 | ||
1123 | void (*set_cpus_allowed)(struct task_struct *p, | 1123 | void (*set_cpus_allowed)(struct task_struct *p, |
1124 | const struct cpumask *newmask); | 1124 | const struct cpumask *newmask); |
1125 | 1125 | ||
1126 | void (*rq_online)(struct rq *rq); | 1126 | void (*rq_online)(struct rq *rq); |
1127 | void (*rq_offline)(struct rq *rq); | 1127 | void (*rq_offline)(struct rq *rq); |
1128 | #endif | 1128 | #endif |
1129 | 1129 | ||
1130 | void (*set_curr_task) (struct rq *rq); | 1130 | void (*set_curr_task) (struct rq *rq); |
1131 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 1131 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); |
1132 | void (*task_fork) (struct task_struct *p); | 1132 | void (*task_fork) (struct task_struct *p); |
1133 | 1133 | ||
1134 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1134 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); |
1135 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1135 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
1136 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | 1136 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, |
1137 | int oldprio); | 1137 | int oldprio); |
1138 | 1138 | ||
1139 | unsigned int (*get_rr_interval) (struct rq *rq, | 1139 | unsigned int (*get_rr_interval) (struct rq *rq, |
1140 | struct task_struct *task); | 1140 | struct task_struct *task); |
1141 | 1141 | ||
1142 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1142 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1143 | void (*task_move_group) (struct task_struct *p, int on_rq); | 1143 | void (*task_move_group) (struct task_struct *p, int on_rq); |
1144 | #endif | 1144 | #endif |
1145 | }; | 1145 | }; |
1146 | 1146 | ||
1147 | struct load_weight { | 1147 | struct load_weight { |
1148 | unsigned long weight, inv_weight; | 1148 | unsigned long weight, inv_weight; |
1149 | }; | 1149 | }; |
1150 | 1150 | ||
1151 | #ifdef CONFIG_SCHEDSTATS | 1151 | #ifdef CONFIG_SCHEDSTATS |
1152 | struct sched_statistics { | 1152 | struct sched_statistics { |
1153 | u64 wait_start; | 1153 | u64 wait_start; |
1154 | u64 wait_max; | 1154 | u64 wait_max; |
1155 | u64 wait_count; | 1155 | u64 wait_count; |
1156 | u64 wait_sum; | 1156 | u64 wait_sum; |
1157 | u64 iowait_count; | 1157 | u64 iowait_count; |
1158 | u64 iowait_sum; | 1158 | u64 iowait_sum; |
1159 | 1159 | ||
1160 | u64 sleep_start; | 1160 | u64 sleep_start; |
1161 | u64 sleep_max; | 1161 | u64 sleep_max; |
1162 | s64 sum_sleep_runtime; | 1162 | s64 sum_sleep_runtime; |
1163 | 1163 | ||
1164 | u64 block_start; | 1164 | u64 block_start; |
1165 | u64 block_max; | 1165 | u64 block_max; |
1166 | u64 exec_max; | 1166 | u64 exec_max; |
1167 | u64 slice_max; | 1167 | u64 slice_max; |
1168 | 1168 | ||
1169 | u64 nr_migrations_cold; | 1169 | u64 nr_migrations_cold; |
1170 | u64 nr_failed_migrations_affine; | 1170 | u64 nr_failed_migrations_affine; |
1171 | u64 nr_failed_migrations_running; | 1171 | u64 nr_failed_migrations_running; |
1172 | u64 nr_failed_migrations_hot; | 1172 | u64 nr_failed_migrations_hot; |
1173 | u64 nr_forced_migrations; | 1173 | u64 nr_forced_migrations; |
1174 | 1174 | ||
1175 | u64 nr_wakeups; | 1175 | u64 nr_wakeups; |
1176 | u64 nr_wakeups_sync; | 1176 | u64 nr_wakeups_sync; |
1177 | u64 nr_wakeups_migrate; | 1177 | u64 nr_wakeups_migrate; |
1178 | u64 nr_wakeups_local; | 1178 | u64 nr_wakeups_local; |
1179 | u64 nr_wakeups_remote; | 1179 | u64 nr_wakeups_remote; |
1180 | u64 nr_wakeups_affine; | 1180 | u64 nr_wakeups_affine; |
1181 | u64 nr_wakeups_affine_attempts; | 1181 | u64 nr_wakeups_affine_attempts; |
1182 | u64 nr_wakeups_passive; | 1182 | u64 nr_wakeups_passive; |
1183 | u64 nr_wakeups_idle; | 1183 | u64 nr_wakeups_idle; |
1184 | }; | 1184 | }; |
1185 | #endif | 1185 | #endif |
1186 | 1186 | ||
1187 | struct sched_entity { | 1187 | struct sched_entity { |
1188 | struct load_weight load; /* for load-balancing */ | 1188 | struct load_weight load; /* for load-balancing */ |
1189 | struct rb_node run_node; | 1189 | struct rb_node run_node; |
1190 | struct list_head group_node; | 1190 | struct list_head group_node; |
1191 | unsigned int on_rq; | 1191 | unsigned int on_rq; |
1192 | 1192 | ||
1193 | u64 exec_start; | 1193 | u64 exec_start; |
1194 | u64 sum_exec_runtime; | 1194 | u64 sum_exec_runtime; |
1195 | u64 vruntime; | 1195 | u64 vruntime; |
1196 | u64 prev_sum_exec_runtime; | 1196 | u64 prev_sum_exec_runtime; |
1197 | 1197 | ||
1198 | u64 nr_migrations; | 1198 | u64 nr_migrations; |
1199 | 1199 | ||
1200 | #ifdef CONFIG_SCHEDSTATS | 1200 | #ifdef CONFIG_SCHEDSTATS |
1201 | struct sched_statistics statistics; | 1201 | struct sched_statistics statistics; |
1202 | #endif | 1202 | #endif |
1203 | 1203 | ||
1204 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1204 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1205 | struct sched_entity *parent; | 1205 | struct sched_entity *parent; |
1206 | /* rq on which this entity is (to be) queued: */ | 1206 | /* rq on which this entity is (to be) queued: */ |
1207 | struct cfs_rq *cfs_rq; | 1207 | struct cfs_rq *cfs_rq; |
1208 | /* rq "owned" by this entity/group: */ | 1208 | /* rq "owned" by this entity/group: */ |
1209 | struct cfs_rq *my_q; | 1209 | struct cfs_rq *my_q; |
1210 | #endif | 1210 | #endif |
1211 | }; | 1211 | }; |
1212 | 1212 | ||
1213 | struct sched_rt_entity { | 1213 | struct sched_rt_entity { |
1214 | struct list_head run_list; | 1214 | struct list_head run_list; |
1215 | unsigned long timeout; | 1215 | unsigned long timeout; |
1216 | unsigned int time_slice; | 1216 | unsigned int time_slice; |
1217 | 1217 | ||
1218 | struct sched_rt_entity *back; | 1218 | struct sched_rt_entity *back; |
1219 | #ifdef CONFIG_RT_GROUP_SCHED | 1219 | #ifdef CONFIG_RT_GROUP_SCHED |
1220 | struct sched_rt_entity *parent; | 1220 | struct sched_rt_entity *parent; |
1221 | /* rq on which this entity is (to be) queued: */ | 1221 | /* rq on which this entity is (to be) queued: */ |
1222 | struct rt_rq *rt_rq; | 1222 | struct rt_rq *rt_rq; |
1223 | /* rq "owned" by this entity/group: */ | 1223 | /* rq "owned" by this entity/group: */ |
1224 | struct rt_rq *my_q; | 1224 | struct rt_rq *my_q; |
1225 | #endif | 1225 | #endif |
1226 | }; | 1226 | }; |
1227 | 1227 | ||
1228 | /* | 1228 | /* |
1229 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | 1229 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). |
1230 | * Timeslices get refilled after they expire. | 1230 | * Timeslices get refilled after they expire. |
1231 | */ | 1231 | */ |
1232 | #define RR_TIMESLICE (100 * HZ / 1000) | 1232 | #define RR_TIMESLICE (100 * HZ / 1000) |
1233 | 1233 | ||
1234 | struct rcu_node; | 1234 | struct rcu_node; |
1235 | 1235 | ||
1236 | enum perf_event_task_context { | 1236 | enum perf_event_task_context { |
1237 | perf_invalid_context = -1, | 1237 | perf_invalid_context = -1, |
1238 | perf_hw_context = 0, | 1238 | perf_hw_context = 0, |
1239 | perf_sw_context, | 1239 | perf_sw_context, |
1240 | perf_nr_task_contexts, | 1240 | perf_nr_task_contexts, |
1241 | }; | 1241 | }; |
1242 | 1242 | ||
1243 | struct task_struct { | 1243 | struct task_struct { |
1244 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ | 1244 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ |
1245 | void *stack; | 1245 | void *stack; |
1246 | atomic_t usage; | 1246 | atomic_t usage; |
1247 | unsigned int flags; /* per process flags, defined below */ | 1247 | unsigned int flags; /* per process flags, defined below */ |
1248 | unsigned int ptrace; | 1248 | unsigned int ptrace; |
1249 | 1249 | ||
1250 | #ifdef CONFIG_SMP | 1250 | #ifdef CONFIG_SMP |
1251 | struct llist_node wake_entry; | 1251 | struct llist_node wake_entry; |
1252 | int on_cpu; | 1252 | int on_cpu; |
1253 | #endif | 1253 | #endif |
1254 | int on_rq; | 1254 | int on_rq; |
1255 | 1255 | ||
1256 | int prio, static_prio, normal_prio; | 1256 | int prio, static_prio, normal_prio; |
1257 | unsigned int rt_priority; | 1257 | unsigned int rt_priority; |
1258 | const struct sched_class *sched_class; | 1258 | const struct sched_class *sched_class; |
1259 | struct sched_entity se; | 1259 | struct sched_entity se; |
1260 | struct sched_rt_entity rt; | 1260 | struct sched_rt_entity rt; |
1261 | #ifdef CONFIG_CGROUP_SCHED | 1261 | #ifdef CONFIG_CGROUP_SCHED |
1262 | struct task_group *sched_task_group; | 1262 | struct task_group *sched_task_group; |
1263 | #endif | 1263 | #endif |
1264 | 1264 | ||
1265 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1265 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1266 | /* list of struct preempt_notifier: */ | 1266 | /* list of struct preempt_notifier: */ |
1267 | struct hlist_head preempt_notifiers; | 1267 | struct hlist_head preempt_notifiers; |
1268 | #endif | 1268 | #endif |
1269 | 1269 | ||
1270 | /* | 1270 | /* |
1271 | * fpu_counter contains the number of consecutive context switches | 1271 | * fpu_counter contains the number of consecutive context switches |
1272 | * that the FPU is used. If this is over a threshold, the lazy fpu | 1272 | * that the FPU is used. If this is over a threshold, the lazy fpu |
1273 | * saving becomes unlazy to save the trap. This is an unsigned char | 1273 | * saving becomes unlazy to save the trap. This is an unsigned char |
1274 | * so that after 256 times the counter wraps and the behavior turns | 1274 | * so that after 256 times the counter wraps and the behavior turns |
1275 | * lazy again; this to deal with bursty apps that only use FPU for | 1275 | * lazy again; this to deal with bursty apps that only use FPU for |
1276 | * a short time | 1276 | * a short time |
1277 | */ | 1277 | */ |
1278 | unsigned char fpu_counter; | 1278 | unsigned char fpu_counter; |
1279 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 1279 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
1280 | unsigned int btrace_seq; | 1280 | unsigned int btrace_seq; |
1281 | #endif | 1281 | #endif |
1282 | 1282 | ||
1283 | unsigned int policy; | 1283 | unsigned int policy; |
1284 | int nr_cpus_allowed; | 1284 | int nr_cpus_allowed; |
1285 | cpumask_t cpus_allowed; | 1285 | cpumask_t cpus_allowed; |
1286 | 1286 | ||
1287 | #ifdef CONFIG_PREEMPT_RCU | 1287 | #ifdef CONFIG_PREEMPT_RCU |
1288 | int rcu_read_lock_nesting; | 1288 | int rcu_read_lock_nesting; |
1289 | char rcu_read_unlock_special; | 1289 | char rcu_read_unlock_special; |
1290 | struct list_head rcu_node_entry; | 1290 | struct list_head rcu_node_entry; |
1291 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | 1291 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ |
1292 | #ifdef CONFIG_TREE_PREEMPT_RCU | 1292 | #ifdef CONFIG_TREE_PREEMPT_RCU |
1293 | struct rcu_node *rcu_blocked_node; | 1293 | struct rcu_node *rcu_blocked_node; |
1294 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1294 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
1295 | #ifdef CONFIG_RCU_BOOST | 1295 | #ifdef CONFIG_RCU_BOOST |
1296 | struct rt_mutex *rcu_boost_mutex; | 1296 | struct rt_mutex *rcu_boost_mutex; |
1297 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 1297 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
1298 | 1298 | ||
1299 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1299 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1300 | struct sched_info sched_info; | 1300 | struct sched_info sched_info; |
1301 | #endif | 1301 | #endif |
1302 | 1302 | ||
1303 | struct list_head tasks; | 1303 | struct list_head tasks; |
1304 | #ifdef CONFIG_SMP | 1304 | #ifdef CONFIG_SMP |
1305 | struct plist_node pushable_tasks; | 1305 | struct plist_node pushable_tasks; |
1306 | #endif | 1306 | #endif |
1307 | 1307 | ||
1308 | struct mm_struct *mm, *active_mm; | 1308 | struct mm_struct *mm, *active_mm; |
1309 | #ifdef CONFIG_COMPAT_BRK | 1309 | #ifdef CONFIG_COMPAT_BRK |
1310 | unsigned brk_randomized:1; | 1310 | unsigned brk_randomized:1; |
1311 | #endif | 1311 | #endif |
1312 | #if defined(SPLIT_RSS_COUNTING) | 1312 | #if defined(SPLIT_RSS_COUNTING) |
1313 | struct task_rss_stat rss_stat; | 1313 | struct task_rss_stat rss_stat; |
1314 | #endif | 1314 | #endif |
1315 | /* task state */ | 1315 | /* task state */ |
1316 | int exit_state; | 1316 | int exit_state; |
1317 | int exit_code, exit_signal; | 1317 | int exit_code, exit_signal; |
1318 | int pdeath_signal; /* The signal sent when the parent dies */ | 1318 | int pdeath_signal; /* The signal sent when the parent dies */ |
1319 | unsigned int jobctl; /* JOBCTL_*, siglock protected */ | 1319 | unsigned int jobctl; /* JOBCTL_*, siglock protected */ |
1320 | /* ??? */ | 1320 | /* ??? */ |
1321 | unsigned int personality; | 1321 | unsigned int personality; |
1322 | unsigned did_exec:1; | 1322 | unsigned did_exec:1; |
1323 | unsigned in_execve:1; /* Tell the LSMs that the process is doing an | 1323 | unsigned in_execve:1; /* Tell the LSMs that the process is doing an |
1324 | * execve */ | 1324 | * execve */ |
1325 | unsigned in_iowait:1; | 1325 | unsigned in_iowait:1; |
1326 | 1326 | ||
1327 | /* task may not gain privileges */ | 1327 | /* task may not gain privileges */ |
1328 | unsigned no_new_privs:1; | 1328 | unsigned no_new_privs:1; |
1329 | 1329 | ||
1330 | /* Revert to default priority/policy when forking */ | 1330 | /* Revert to default priority/policy when forking */ |
1331 | unsigned sched_reset_on_fork:1; | 1331 | unsigned sched_reset_on_fork:1; |
1332 | unsigned sched_contributes_to_load:1; | 1332 | unsigned sched_contributes_to_load:1; |
1333 | 1333 | ||
1334 | pid_t pid; | 1334 | pid_t pid; |
1335 | pid_t tgid; | 1335 | pid_t tgid; |
1336 | 1336 | ||
1337 | #ifdef CONFIG_CC_STACKPROTECTOR | 1337 | #ifdef CONFIG_CC_STACKPROTECTOR |
1338 | /* Canary value for the -fstack-protector gcc feature */ | 1338 | /* Canary value for the -fstack-protector gcc feature */ |
1339 | unsigned long stack_canary; | 1339 | unsigned long stack_canary; |
1340 | #endif | 1340 | #endif |
1341 | /* | 1341 | /* |
1342 | * pointers to (original) parent process, youngest child, younger sibling, | 1342 | * pointers to (original) parent process, youngest child, younger sibling, |
1343 | * older sibling, respectively. (p->father can be replaced with | 1343 | * older sibling, respectively. (p->father can be replaced with |
1344 | * p->real_parent->pid) | 1344 | * p->real_parent->pid) |
1345 | */ | 1345 | */ |
1346 | struct task_struct __rcu *real_parent; /* real parent process */ | 1346 | struct task_struct __rcu *real_parent; /* real parent process */ |
1347 | struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ | 1347 | struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ |
1348 | /* | 1348 | /* |
1349 | * children/sibling forms the list of my natural children | 1349 | * children/sibling forms the list of my natural children |
1350 | */ | 1350 | */ |
1351 | struct list_head children; /* list of my children */ | 1351 | struct list_head children; /* list of my children */ |
1352 | struct list_head sibling; /* linkage in my parent's children list */ | 1352 | struct list_head sibling; /* linkage in my parent's children list */ |
1353 | struct task_struct *group_leader; /* threadgroup leader */ | 1353 | struct task_struct *group_leader; /* threadgroup leader */ |
1354 | 1354 | ||
1355 | /* | 1355 | /* |
1356 | * ptraced is the list of tasks this task is using ptrace on. | 1356 | * ptraced is the list of tasks this task is using ptrace on. |
1357 | * This includes both natural children and PTRACE_ATTACH targets. | 1357 | * This includes both natural children and PTRACE_ATTACH targets. |
1358 | * p->ptrace_entry is p's link on the p->parent->ptraced list. | 1358 | * p->ptrace_entry is p's link on the p->parent->ptraced list. |
1359 | */ | 1359 | */ |
1360 | struct list_head ptraced; | 1360 | struct list_head ptraced; |
1361 | struct list_head ptrace_entry; | 1361 | struct list_head ptrace_entry; |
1362 | 1362 | ||
1363 | /* PID/PID hash table linkage. */ | 1363 | /* PID/PID hash table linkage. */ |
1364 | struct pid_link pids[PIDTYPE_MAX]; | 1364 | struct pid_link pids[PIDTYPE_MAX]; |
1365 | struct list_head thread_group; | 1365 | struct list_head thread_group; |
1366 | 1366 | ||
1367 | struct completion *vfork_done; /* for vfork() */ | 1367 | struct completion *vfork_done; /* for vfork() */ |
1368 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ | 1368 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ |
1369 | int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ | 1369 | int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ |
1370 | 1370 | ||
1371 | cputime_t utime, stime, utimescaled, stimescaled; | 1371 | cputime_t utime, stime, utimescaled, stimescaled; |
1372 | cputime_t gtime; | 1372 | cputime_t gtime; |
1373 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 1373 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
1374 | cputime_t prev_utime, prev_stime; | 1374 | cputime_t prev_utime, prev_stime; |
1375 | #endif | 1375 | #endif |
1376 | unsigned long nvcsw, nivcsw; /* context switch counts */ | 1376 | unsigned long nvcsw, nivcsw; /* context switch counts */ |
1377 | struct timespec start_time; /* monotonic time */ | 1377 | struct timespec start_time; /* monotonic time */ |
1378 | struct timespec real_start_time; /* boot based time */ | 1378 | struct timespec real_start_time; /* boot based time */ |
1379 | /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ | 1379 | /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ |
1380 | unsigned long min_flt, maj_flt; | 1380 | unsigned long min_flt, maj_flt; |
1381 | 1381 | ||
1382 | struct task_cputime cputime_expires; | 1382 | struct task_cputime cputime_expires; |
1383 | struct list_head cpu_timers[3]; | 1383 | struct list_head cpu_timers[3]; |
1384 | 1384 | ||
1385 | /* process credentials */ | 1385 | /* process credentials */ |
1386 | const struct cred __rcu *real_cred; /* objective and real subjective task | 1386 | const struct cred __rcu *real_cred; /* objective and real subjective task |
1387 | * credentials (COW) */ | 1387 | * credentials (COW) */ |
1388 | const struct cred __rcu *cred; /* effective (overridable) subjective task | 1388 | const struct cred __rcu *cred; /* effective (overridable) subjective task |
1389 | * credentials (COW) */ | 1389 | * credentials (COW) */ |
1390 | char comm[TASK_COMM_LEN]; /* executable name excluding path | 1390 | char comm[TASK_COMM_LEN]; /* executable name excluding path |
1391 | - access with [gs]et_task_comm (which lock | 1391 | - access with [gs]et_task_comm (which lock |
1392 | it with task_lock()) | 1392 | it with task_lock()) |
1393 | - initialized normally by setup_new_exec */ | 1393 | - initialized normally by setup_new_exec */ |
1394 | /* file system info */ | 1394 | /* file system info */ |
1395 | int link_count, total_link_count; | 1395 | int link_count, total_link_count; |
1396 | #ifdef CONFIG_SYSVIPC | 1396 | #ifdef CONFIG_SYSVIPC |
1397 | /* ipc stuff */ | 1397 | /* ipc stuff */ |
1398 | struct sysv_sem sysvsem; | 1398 | struct sysv_sem sysvsem; |
1399 | #endif | 1399 | #endif |
1400 | #ifdef CONFIG_DETECT_HUNG_TASK | 1400 | #ifdef CONFIG_DETECT_HUNG_TASK |
1401 | /* hung task detection */ | 1401 | /* hung task detection */ |
1402 | unsigned long last_switch_count; | 1402 | unsigned long last_switch_count; |
1403 | #endif | 1403 | #endif |
1404 | /* CPU-specific state of this task */ | 1404 | /* CPU-specific state of this task */ |
1405 | struct thread_struct thread; | 1405 | struct thread_struct thread; |
1406 | /* filesystem information */ | 1406 | /* filesystem information */ |
1407 | struct fs_struct *fs; | 1407 | struct fs_struct *fs; |
1408 | /* open file information */ | 1408 | /* open file information */ |
1409 | struct files_struct *files; | 1409 | struct files_struct *files; |
1410 | /* namespaces */ | 1410 | /* namespaces */ |
1411 | struct nsproxy *nsproxy; | 1411 | struct nsproxy *nsproxy; |
1412 | /* signal handlers */ | 1412 | /* signal handlers */ |
1413 | struct signal_struct *signal; | 1413 | struct signal_struct *signal; |
1414 | struct sighand_struct *sighand; | 1414 | struct sighand_struct *sighand; |
1415 | 1415 | ||
1416 | sigset_t blocked, real_blocked; | 1416 | sigset_t blocked, real_blocked; |
1417 | sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ | 1417 | sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ |
1418 | struct sigpending pending; | 1418 | struct sigpending pending; |
1419 | 1419 | ||
1420 | unsigned long sas_ss_sp; | 1420 | unsigned long sas_ss_sp; |
1421 | size_t sas_ss_size; | 1421 | size_t sas_ss_size; |
1422 | int (*notifier)(void *priv); | 1422 | int (*notifier)(void *priv); |
1423 | void *notifier_data; | 1423 | void *notifier_data; |
1424 | sigset_t *notifier_mask; | 1424 | sigset_t *notifier_mask; |
1425 | struct callback_head *task_works; | 1425 | struct callback_head *task_works; |
1426 | 1426 | ||
1427 | struct audit_context *audit_context; | 1427 | struct audit_context *audit_context; |
1428 | #ifdef CONFIG_AUDITSYSCALL | 1428 | #ifdef CONFIG_AUDITSYSCALL |
1429 | uid_t loginuid; | 1429 | uid_t loginuid; |
1430 | unsigned int sessionid; | 1430 | unsigned int sessionid; |
1431 | #endif | 1431 | #endif |
1432 | struct seccomp seccomp; | 1432 | struct seccomp seccomp; |
1433 | 1433 | ||
1434 | /* Thread group tracking */ | 1434 | /* Thread group tracking */ |
1435 | u32 parent_exec_id; | 1435 | u32 parent_exec_id; |
1436 | u32 self_exec_id; | 1436 | u32 self_exec_id; |
1437 | /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, | 1437 | /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, |
1438 | * mempolicy */ | 1438 | * mempolicy */ |
1439 | spinlock_t alloc_lock; | 1439 | spinlock_t alloc_lock; |
1440 | 1440 | ||
1441 | /* Protection of the PI data structures: */ | 1441 | /* Protection of the PI data structures: */ |
1442 | raw_spinlock_t pi_lock; | 1442 | raw_spinlock_t pi_lock; |
1443 | 1443 | ||
1444 | #ifdef CONFIG_RT_MUTEXES | 1444 | #ifdef CONFIG_RT_MUTEXES |
1445 | /* PI waiters blocked on a rt_mutex held by this task */ | 1445 | /* PI waiters blocked on a rt_mutex held by this task */ |
1446 | struct plist_head pi_waiters; | 1446 | struct plist_head pi_waiters; |
1447 | /* Deadlock detection and priority inheritance handling */ | 1447 | /* Deadlock detection and priority inheritance handling */ |
1448 | struct rt_mutex_waiter *pi_blocked_on; | 1448 | struct rt_mutex_waiter *pi_blocked_on; |
1449 | #endif | 1449 | #endif |
1450 | 1450 | ||
1451 | #ifdef CONFIG_DEBUG_MUTEXES | 1451 | #ifdef CONFIG_DEBUG_MUTEXES |
1452 | /* mutex deadlock detection */ | 1452 | /* mutex deadlock detection */ |
1453 | struct mutex_waiter *blocked_on; | 1453 | struct mutex_waiter *blocked_on; |
1454 | #endif | 1454 | #endif |
1455 | #ifdef CONFIG_TRACE_IRQFLAGS | 1455 | #ifdef CONFIG_TRACE_IRQFLAGS |
1456 | unsigned int irq_events; | 1456 | unsigned int irq_events; |
1457 | unsigned long hardirq_enable_ip; | 1457 | unsigned long hardirq_enable_ip; |
1458 | unsigned long hardirq_disable_ip; | 1458 | unsigned long hardirq_disable_ip; |
1459 | unsigned int hardirq_enable_event; | 1459 | unsigned int hardirq_enable_event; |
1460 | unsigned int hardirq_disable_event; | 1460 | unsigned int hardirq_disable_event; |
1461 | int hardirqs_enabled; | 1461 | int hardirqs_enabled; |
1462 | int hardirq_context; | 1462 | int hardirq_context; |
1463 | unsigned long softirq_disable_ip; | 1463 | unsigned long softirq_disable_ip; |
1464 | unsigned long softirq_enable_ip; | 1464 | unsigned long softirq_enable_ip; |
1465 | unsigned int softirq_disable_event; | 1465 | unsigned int softirq_disable_event; |
1466 | unsigned int softirq_enable_event; | 1466 | unsigned int softirq_enable_event; |
1467 | int softirqs_enabled; | 1467 | int softirqs_enabled; |
1468 | int softirq_context; | 1468 | int softirq_context; |
1469 | #endif | 1469 | #endif |
1470 | #ifdef CONFIG_LOCKDEP | 1470 | #ifdef CONFIG_LOCKDEP |
1471 | # define MAX_LOCK_DEPTH 48UL | 1471 | # define MAX_LOCK_DEPTH 48UL |
1472 | u64 curr_chain_key; | 1472 | u64 curr_chain_key; |
1473 | int lockdep_depth; | 1473 | int lockdep_depth; |
1474 | unsigned int lockdep_recursion; | 1474 | unsigned int lockdep_recursion; |
1475 | struct held_lock held_locks[MAX_LOCK_DEPTH]; | 1475 | struct held_lock held_locks[MAX_LOCK_DEPTH]; |
1476 | gfp_t lockdep_reclaim_gfp; | 1476 | gfp_t lockdep_reclaim_gfp; |
1477 | #endif | 1477 | #endif |
1478 | 1478 | ||
1479 | /* journalling filesystem info */ | 1479 | /* journalling filesystem info */ |
1480 | void *journal_info; | 1480 | void *journal_info; |
1481 | 1481 | ||
1482 | /* stacked block device info */ | 1482 | /* stacked block device info */ |
1483 | struct bio_list *bio_list; | 1483 | struct bio_list *bio_list; |
1484 | 1484 | ||
1485 | #ifdef CONFIG_BLOCK | 1485 | #ifdef CONFIG_BLOCK |
1486 | /* stack plugging */ | 1486 | /* stack plugging */ |
1487 | struct blk_plug *plug; | 1487 | struct blk_plug *plug; |
1488 | #endif | 1488 | #endif |
1489 | 1489 | ||
1490 | /* VM state */ | 1490 | /* VM state */ |
1491 | struct reclaim_state *reclaim_state; | 1491 | struct reclaim_state *reclaim_state; |
1492 | 1492 | ||
1493 | struct backing_dev_info *backing_dev_info; | 1493 | struct backing_dev_info *backing_dev_info; |
1494 | 1494 | ||
1495 | struct io_context *io_context; | 1495 | struct io_context *io_context; |
1496 | 1496 | ||
1497 | unsigned long ptrace_message; | 1497 | unsigned long ptrace_message; |
1498 | siginfo_t *last_siginfo; /* For ptrace use. */ | 1498 | siginfo_t *last_siginfo; /* For ptrace use. */ |
1499 | struct task_io_accounting ioac; | 1499 | struct task_io_accounting ioac; |
1500 | #if defined(CONFIG_TASK_XACCT) | 1500 | #if defined(CONFIG_TASK_XACCT) |
1501 | u64 acct_rss_mem1; /* accumulated rss usage */ | 1501 | u64 acct_rss_mem1; /* accumulated rss usage */ |
1502 | u64 acct_vm_mem1; /* accumulated virtual memory usage */ | 1502 | u64 acct_vm_mem1; /* accumulated virtual memory usage */ |
1503 | cputime_t acct_timexpd; /* stime + utime since last update */ | 1503 | cputime_t acct_timexpd; /* stime + utime since last update */ |
1504 | #endif | 1504 | #endif |
1505 | #ifdef CONFIG_CPUSETS | 1505 | #ifdef CONFIG_CPUSETS |
1506 | nodemask_t mems_allowed; /* Protected by alloc_lock */ | 1506 | nodemask_t mems_allowed; /* Protected by alloc_lock */ |
1507 | seqcount_t mems_allowed_seq; /* Sequence no to catch updates */ | 1507 | seqcount_t mems_allowed_seq; /* Sequence no to catch updates */ |
1508 | int cpuset_mem_spread_rotor; | 1508 | int cpuset_mem_spread_rotor; |
1509 | int cpuset_slab_spread_rotor; | 1509 | int cpuset_slab_spread_rotor; |
1510 | #endif | 1510 | #endif |
1511 | #ifdef CONFIG_CGROUPS | 1511 | #ifdef CONFIG_CGROUPS |
1512 | /* Control Group info protected by css_set_lock */ | 1512 | /* Control Group info protected by css_set_lock */ |
1513 | struct css_set __rcu *cgroups; | 1513 | struct css_set __rcu *cgroups; |
1514 | /* cg_list protected by css_set_lock and tsk->alloc_lock */ | 1514 | /* cg_list protected by css_set_lock and tsk->alloc_lock */ |
1515 | struct list_head cg_list; | 1515 | struct list_head cg_list; |
1516 | #endif | 1516 | #endif |
1517 | #ifdef CONFIG_FUTEX | 1517 | #ifdef CONFIG_FUTEX |
1518 | struct robust_list_head __user *robust_list; | 1518 | struct robust_list_head __user *robust_list; |
1519 | #ifdef CONFIG_COMPAT | 1519 | #ifdef CONFIG_COMPAT |
1520 | struct compat_robust_list_head __user *compat_robust_list; | 1520 | struct compat_robust_list_head __user *compat_robust_list; |
1521 | #endif | 1521 | #endif |
1522 | struct list_head pi_state_list; | 1522 | struct list_head pi_state_list; |
1523 | struct futex_pi_state *pi_state_cache; | 1523 | struct futex_pi_state *pi_state_cache; |
1524 | #endif | 1524 | #endif |
1525 | #ifdef CONFIG_PERF_EVENTS | 1525 | #ifdef CONFIG_PERF_EVENTS |
1526 | struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; | 1526 | struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; |
1527 | struct mutex perf_event_mutex; | 1527 | struct mutex perf_event_mutex; |
1528 | struct list_head perf_event_list; | 1528 | struct list_head perf_event_list; |
1529 | #endif | 1529 | #endif |
1530 | #ifdef CONFIG_NUMA | 1530 | #ifdef CONFIG_NUMA |
1531 | struct mempolicy *mempolicy; /* Protected by alloc_lock */ | 1531 | struct mempolicy *mempolicy; /* Protected by alloc_lock */ |
1532 | short il_next; | 1532 | short il_next; |
1533 | short pref_node_fork; | 1533 | short pref_node_fork; |
1534 | #endif | 1534 | #endif |
1535 | struct rcu_head rcu; | 1535 | struct rcu_head rcu; |
1536 | 1536 | ||
1537 | /* | 1537 | /* |
1538 | * cache last used pipe for splice | 1538 | * cache last used pipe for splice |
1539 | */ | 1539 | */ |
1540 | struct pipe_inode_info *splice_pipe; | 1540 | struct pipe_inode_info *splice_pipe; |
1541 | #ifdef CONFIG_TASK_DELAY_ACCT | 1541 | #ifdef CONFIG_TASK_DELAY_ACCT |
1542 | struct task_delay_info *delays; | 1542 | struct task_delay_info *delays; |
1543 | #endif | 1543 | #endif |
1544 | #ifdef CONFIG_FAULT_INJECTION | 1544 | #ifdef CONFIG_FAULT_INJECTION |
1545 | int make_it_fail; | 1545 | int make_it_fail; |
1546 | #endif | 1546 | #endif |
1547 | /* | 1547 | /* |
1548 | * when (nr_dirtied >= nr_dirtied_pause), it's time to call | 1548 | * when (nr_dirtied >= nr_dirtied_pause), it's time to call |
1549 | * balance_dirty_pages() for some dirty throttling pause | 1549 | * balance_dirty_pages() for some dirty throttling pause |
1550 | */ | 1550 | */ |
1551 | int nr_dirtied; | 1551 | int nr_dirtied; |
1552 | int nr_dirtied_pause; | 1552 | int nr_dirtied_pause; |
1553 | unsigned long dirty_paused_when; /* start of a write-and-pause period */ | 1553 | unsigned long dirty_paused_when; /* start of a write-and-pause period */ |
1554 | 1554 | ||
1555 | #ifdef CONFIG_LATENCYTOP | 1555 | #ifdef CONFIG_LATENCYTOP |
1556 | int latency_record_count; | 1556 | int latency_record_count; |
1557 | struct latency_record latency_record[LT_SAVECOUNT]; | 1557 | struct latency_record latency_record[LT_SAVECOUNT]; |
1558 | #endif | 1558 | #endif |
1559 | /* | 1559 | /* |
1560 | * time slack values; these are used to round up poll() and | 1560 | * time slack values; these are used to round up poll() and |
1561 | * select() etc timeout values. These are in nanoseconds. | 1561 | * select() etc timeout values. These are in nanoseconds. |
1562 | */ | 1562 | */ |
1563 | unsigned long timer_slack_ns; | 1563 | unsigned long timer_slack_ns; |
1564 | unsigned long default_timer_slack_ns; | 1564 | unsigned long default_timer_slack_ns; |
1565 | 1565 | ||
1566 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 1566 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
1567 | /* Index of current stored address in ret_stack */ | 1567 | /* Index of current stored address in ret_stack */ |
1568 | int curr_ret_stack; | 1568 | int curr_ret_stack; |
1569 | /* Stack of return addresses for return function tracing */ | 1569 | /* Stack of return addresses for return function tracing */ |
1570 | struct ftrace_ret_stack *ret_stack; | 1570 | struct ftrace_ret_stack *ret_stack; |
1571 | /* time stamp for last schedule */ | 1571 | /* time stamp for last schedule */ |
1572 | unsigned long long ftrace_timestamp; | 1572 | unsigned long long ftrace_timestamp; |
1573 | /* | 1573 | /* |
1574 | * Number of functions that haven't been traced | 1574 | * Number of functions that haven't been traced |
1575 | * because of depth overrun. | 1575 | * because of depth overrun. |
1576 | */ | 1576 | */ |
1577 | atomic_t trace_overrun; | 1577 | atomic_t trace_overrun; |
1578 | /* Pause for the tracing */ | 1578 | /* Pause for the tracing */ |
1579 | atomic_t tracing_graph_pause; | 1579 | atomic_t tracing_graph_pause; |
1580 | #endif | 1580 | #endif |
1581 | #ifdef CONFIG_TRACING | 1581 | #ifdef CONFIG_TRACING |
1582 | /* state flags for use by tracers */ | 1582 | /* state flags for use by tracers */ |
1583 | unsigned long trace; | 1583 | unsigned long trace; |
1584 | /* bitmask and counter of trace recursion */ | 1584 | /* bitmask and counter of trace recursion */ |
1585 | unsigned long trace_recursion; | 1585 | unsigned long trace_recursion; |
1586 | #endif /* CONFIG_TRACING */ | 1586 | #endif /* CONFIG_TRACING */ |
1587 | #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ | 1587 | #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ |
1588 | struct memcg_batch_info { | 1588 | struct memcg_batch_info { |
1589 | int do_batch; /* incremented when batch uncharge started */ | 1589 | int do_batch; /* incremented when batch uncharge started */ |
1590 | struct mem_cgroup *memcg; /* target memcg of uncharge */ | 1590 | struct mem_cgroup *memcg; /* target memcg of uncharge */ |
1591 | unsigned long nr_pages; /* uncharged usage */ | 1591 | unsigned long nr_pages; /* uncharged usage */ |
1592 | unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ | 1592 | unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ |
1593 | } memcg_batch; | 1593 | } memcg_batch; |
1594 | #endif | 1594 | #endif |
1595 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 1595 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
1596 | atomic_t ptrace_bp_refcnt; | 1596 | atomic_t ptrace_bp_refcnt; |
1597 | #endif | 1597 | #endif |
1598 | #ifdef CONFIG_UPROBES | 1598 | #ifdef CONFIG_UPROBES |
1599 | struct uprobe_task *utask; | 1599 | struct uprobe_task *utask; |
1600 | #endif | 1600 | #endif |
1601 | }; | 1601 | }; |
1602 | 1602 | ||
1603 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ | 1603 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ |
1604 | #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) | 1604 | #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) |
1605 | 1605 | ||
1606 | /* | 1606 | /* |
1607 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT | 1607 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT |
1608 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH | 1608 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH |
1609 | * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority | 1609 | * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority |
1610 | * values are inverted: lower p->prio value means higher priority. | 1610 | * values are inverted: lower p->prio value means higher priority. |
1611 | * | 1611 | * |
1612 | * The MAX_USER_RT_PRIO value allows the actual maximum | 1612 | * The MAX_USER_RT_PRIO value allows the actual maximum |
1613 | * RT priority to be separate from the value exported to | 1613 | * RT priority to be separate from the value exported to |
1614 | * user-space. This allows kernel threads to set their | 1614 | * user-space. This allows kernel threads to set their |
1615 | * priority to a value higher than any user task. Note: | 1615 | * priority to a value higher than any user task. Note: |
1616 | * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. | 1616 | * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. |
1617 | */ | 1617 | */ |
1618 | 1618 | ||
1619 | #define MAX_USER_RT_PRIO 100 | 1619 | #define MAX_USER_RT_PRIO 100 |
1620 | #define MAX_RT_PRIO MAX_USER_RT_PRIO | 1620 | #define MAX_RT_PRIO MAX_USER_RT_PRIO |
1621 | 1621 | ||
1622 | #define MAX_PRIO (MAX_RT_PRIO + 40) | 1622 | #define MAX_PRIO (MAX_RT_PRIO + 40) |
1623 | #define DEFAULT_PRIO (MAX_RT_PRIO + 20) | 1623 | #define DEFAULT_PRIO (MAX_RT_PRIO + 20) |
1624 | 1624 | ||
1625 | static inline int rt_prio(int prio) | 1625 | static inline int rt_prio(int prio) |
1626 | { | 1626 | { |
1627 | if (unlikely(prio < MAX_RT_PRIO)) | 1627 | if (unlikely(prio < MAX_RT_PRIO)) |
1628 | return 1; | 1628 | return 1; |
1629 | return 0; | 1629 | return 0; |
1630 | } | 1630 | } |
1631 | 1631 | ||
1632 | static inline int rt_task(struct task_struct *p) | 1632 | static inline int rt_task(struct task_struct *p) |
1633 | { | 1633 | { |
1634 | return rt_prio(p->prio); | 1634 | return rt_prio(p->prio); |
1635 | } | 1635 | } |
1636 | 1636 | ||
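For illustration (not part of this hunk): with the constants above, MAX_RT_PRIO evaluates to 100, MAX_PRIO to 140 and DEFAULT_PRIO to 120, and nice values map onto the non-RT range. The conversion sketched below follows the usual NICE_TO_PRIO() definition from the scheduler internals and is shown only as a worked example, not as part of this header.

	#include <linux/sched.h>

	/* Illustrative only: the conventional nice -> prio mapping. */
	static inline int example_nice_to_prio(int nice)
	{
		/* nice 0 -> 120 (DEFAULT_PRIO), -20 -> 100, +19 -> 139 (MAX_PRIO - 1) */
		return MAX_RT_PRIO + nice + 20;
	}

rt_prio() above is therefore non-zero only for prio values 0..MAX_RT_PRIO-1, i.e. real-time tasks.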
1637 | static inline struct pid *task_pid(struct task_struct *task) | 1637 | static inline struct pid *task_pid(struct task_struct *task) |
1638 | { | 1638 | { |
1639 | return task->pids[PIDTYPE_PID].pid; | 1639 | return task->pids[PIDTYPE_PID].pid; |
1640 | } | 1640 | } |
1641 | 1641 | ||
1642 | static inline struct pid *task_tgid(struct task_struct *task) | 1642 | static inline struct pid *task_tgid(struct task_struct *task) |
1643 | { | 1643 | { |
1644 | return task->group_leader->pids[PIDTYPE_PID].pid; | 1644 | return task->group_leader->pids[PIDTYPE_PID].pid; |
1645 | } | 1645 | } |
1646 | 1646 | ||
1647 | /* | 1647 | /* |
1648 | * Without tasklist or rcu lock it is not safe to dereference | 1648 | * Without tasklist or rcu lock it is not safe to dereference |
1649 | * the result of task_pgrp/task_session even if task == current, | 1649 | * the result of task_pgrp/task_session even if task == current, |
1650 | * we can race with another thread doing sys_setsid/sys_setpgid. | 1650 | * we can race with another thread doing sys_setsid/sys_setpgid. |
1651 | */ | 1651 | */ |
1652 | static inline struct pid *task_pgrp(struct task_struct *task) | 1652 | static inline struct pid *task_pgrp(struct task_struct *task) |
1653 | { | 1653 | { |
1654 | return task->group_leader->pids[PIDTYPE_PGID].pid; | 1654 | return task->group_leader->pids[PIDTYPE_PGID].pid; |
1655 | } | 1655 | } |
1656 | 1656 | ||
1657 | static inline struct pid *task_session(struct task_struct *task) | 1657 | static inline struct pid *task_session(struct task_struct *task) |
1658 | { | 1658 | { |
1659 | return task->group_leader->pids[PIDTYPE_SID].pid; | 1659 | return task->group_leader->pids[PIDTYPE_SID].pid; |
1660 | } | 1660 | } |
1661 | 1661 | ||
1662 | struct pid_namespace; | 1662 | struct pid_namespace; |
1663 | 1663 | ||
1664 | /* | 1664 | /* |
1665 | * the helpers to get the task's different pids as they are seen | 1665 | * the helpers to get the task's different pids as they are seen |
1666 | * from various namespaces | 1666 | * from various namespaces |
1667 | * | 1667 | * |
1668 | * task_xid_nr() : global id, i.e. the id seen from the init namespace; | 1668 | * task_xid_nr() : global id, i.e. the id seen from the init namespace; |
1669 | * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of | 1669 | * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of |
1670 | * current. | 1670 | * current. |
1671 | * task_xid_nr_ns() : id seen from the ns specified; | 1671 | * task_xid_nr_ns() : id seen from the ns specified; |
1672 | * | 1672 | * |
1673 | * set_task_vxid() : assigns a virtual id to a task; | 1673 | * set_task_vxid() : assigns a virtual id to a task; |
1674 | * | 1674 | * |
1675 | * see also pid_nr() etc in include/linux/pid.h | 1675 | * see also pid_nr() etc in include/linux/pid.h |
1676 | */ | 1676 | */ |
1677 | pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, | 1677 | pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, |
1678 | struct pid_namespace *ns); | 1678 | struct pid_namespace *ns); |
1679 | 1679 | ||
1680 | static inline pid_t task_pid_nr(struct task_struct *tsk) | 1680 | static inline pid_t task_pid_nr(struct task_struct *tsk) |
1681 | { | 1681 | { |
1682 | return tsk->pid; | 1682 | return tsk->pid; |
1683 | } | 1683 | } |
1684 | 1684 | ||
1685 | static inline pid_t task_pid_nr_ns(struct task_struct *tsk, | 1685 | static inline pid_t task_pid_nr_ns(struct task_struct *tsk, |
1686 | struct pid_namespace *ns) | 1686 | struct pid_namespace *ns) |
1687 | { | 1687 | { |
1688 | return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); | 1688 | return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); |
1689 | } | 1689 | } |
1690 | 1690 | ||
1691 | static inline pid_t task_pid_vnr(struct task_struct *tsk) | 1691 | static inline pid_t task_pid_vnr(struct task_struct *tsk) |
1692 | { | 1692 | { |
1693 | return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); | 1693 | return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); |
1694 | } | 1694 | } |
1695 | 1695 | ||
1696 | 1696 | ||
1697 | static inline pid_t task_tgid_nr(struct task_struct *tsk) | 1697 | static inline pid_t task_tgid_nr(struct task_struct *tsk) |
1698 | { | 1698 | { |
1699 | return tsk->tgid; | 1699 | return tsk->tgid; |
1700 | } | 1700 | } |
1701 | 1701 | ||
1702 | pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); | 1702 | pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); |
1703 | 1703 | ||
1704 | static inline pid_t task_tgid_vnr(struct task_struct *tsk) | 1704 | static inline pid_t task_tgid_vnr(struct task_struct *tsk) |
1705 | { | 1705 | { |
1706 | return pid_vnr(task_tgid(tsk)); | 1706 | return pid_vnr(task_tgid(tsk)); |
1707 | } | 1707 | } |
1708 | 1708 | ||
1709 | 1709 | ||
1710 | static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, | 1710 | static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, |
1711 | struct pid_namespace *ns) | 1711 | struct pid_namespace *ns) |
1712 | { | 1712 | { |
1713 | return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); | 1713 | return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); |
1714 | } | 1714 | } |
1715 | 1715 | ||
1716 | static inline pid_t task_pgrp_vnr(struct task_struct *tsk) | 1716 | static inline pid_t task_pgrp_vnr(struct task_struct *tsk) |
1717 | { | 1717 | { |
1718 | return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); | 1718 | return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); |
1719 | } | 1719 | } |
1720 | 1720 | ||
1721 | 1721 | ||
1722 | static inline pid_t task_session_nr_ns(struct task_struct *tsk, | 1722 | static inline pid_t task_session_nr_ns(struct task_struct *tsk, |
1723 | struct pid_namespace *ns) | 1723 | struct pid_namespace *ns) |
1724 | { | 1724 | { |
1725 | return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); | 1725 | return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); |
1726 | } | 1726 | } |
1727 | 1727 | ||
1728 | static inline pid_t task_session_vnr(struct task_struct *tsk) | 1728 | static inline pid_t task_session_vnr(struct task_struct *tsk) |
1729 | { | 1729 | { |
1730 | return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); | 1730 | return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); |
1731 | } | 1731 | } |
1732 | 1732 | ||
1733 | /* obsolete, do not use */ | 1733 | /* obsolete, do not use */ |
1734 | static inline pid_t task_pgrp_nr(struct task_struct *tsk) | 1734 | static inline pid_t task_pgrp_nr(struct task_struct *tsk) |
1735 | { | 1735 | { |
1736 | return task_pgrp_nr_ns(tsk, &init_pid_ns); | 1736 | return task_pgrp_nr_ns(tsk, &init_pid_ns); |
1737 | } | 1737 | } |
1738 | 1738 | ||
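For illustration (not part of this hunk), a minimal sketch contrasting the three flavours of id lookup described above; show_task_ids() is a hypothetical helper and the pid is printed with %d as is conventional for pid_t:

	#include <linux/pid_namespace.h>
	#include <linux/printk.h>
	#include <linux/sched.h>

	static void show_task_ids(struct task_struct *tsk)
	{
		pr_info("tgid (init ns):    %d\n", task_tgid_nr(tsk));   /* global id */
		pr_info("tgid (current ns): %d\n", task_tgid_vnr(tsk));  /* caller's pid namespace */
		pr_info("tgid (explicit):   %d\n", task_tgid_nr_ns(tsk, &init_pid_ns));
	}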
1739 | /** | 1739 | /** |
1740 | * pid_alive - check that a task structure is not stale | 1740 | * pid_alive - check that a task structure is not stale |
1741 | * @p: Task structure to be checked. | 1741 | * @p: Task structure to be checked. |
1742 | * | 1742 | * |
1743 | * Test if a process is not yet dead (at most zombie state) | 1743 | * Test if a process is not yet dead (at most zombie state) |
1744 | * If pid_alive fails, then pointers within the task structure | 1744 | * If pid_alive fails, then pointers within the task structure |
1745 | * can be stale and must not be dereferenced. | 1745 | * can be stale and must not be dereferenced. |
1746 | */ | 1746 | */ |
1747 | static inline int pid_alive(struct task_struct *p) | 1747 | static inline int pid_alive(struct task_struct *p) |
1748 | { | 1748 | { |
1749 | return p->pids[PIDTYPE_PID].pid != NULL; | 1749 | return p->pids[PIDTYPE_PID].pid != NULL; |
1750 | } | 1750 | } |
1751 | 1751 | ||
1752 | /** | 1752 | /** |
1753 | * is_global_init - check if a task structure is init | 1753 | * is_global_init - check if a task structure is init |
1754 | * @tsk: Task structure to be checked. | 1754 | * @tsk: Task structure to be checked. |
1755 | * | 1755 | * |
1756 | * Check if a task structure is the first user space task the kernel created. | 1756 | * Check if a task structure is the first user space task the kernel created. |
1757 | */ | 1757 | */ |
1758 | static inline int is_global_init(struct task_struct *tsk) | 1758 | static inline int is_global_init(struct task_struct *tsk) |
1759 | { | 1759 | { |
1760 | return tsk->pid == 1; | 1760 | return tsk->pid == 1; |
1761 | } | 1761 | } |
1762 | 1762 | ||
1763 | /* | 1763 | /* |
1764 | * is_container_init: | 1764 | * is_container_init: |
1765 | * check whether in the task is init in its own pid namespace. | 1765 | * check whether in the task is init in its own pid namespace. |
1766 | */ | 1766 | */ |
1767 | extern int is_container_init(struct task_struct *tsk); | 1767 | extern int is_container_init(struct task_struct *tsk); |
1768 | 1768 | ||
1769 | extern struct pid *cad_pid; | 1769 | extern struct pid *cad_pid; |
1770 | 1770 | ||
1771 | extern void free_task(struct task_struct *tsk); | 1771 | extern void free_task(struct task_struct *tsk); |
1772 | #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) | 1772 | #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) |
1773 | 1773 | ||
1774 | extern void __put_task_struct(struct task_struct *t); | 1774 | extern void __put_task_struct(struct task_struct *t); |
1775 | 1775 | ||
1776 | static inline void put_task_struct(struct task_struct *t) | 1776 | static inline void put_task_struct(struct task_struct *t) |
1777 | { | 1777 | { |
1778 | if (atomic_dec_and_test(&t->usage)) | 1778 | if (atomic_dec_and_test(&t->usage)) |
1779 | __put_task_struct(t); | 1779 | __put_task_struct(t); |
1780 | } | 1780 | } |
1781 | 1781 | ||
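For illustration (not part of this hunk), the usual pairing of get_task_struct()/put_task_struct() when a looked-up task must outlive the RCU read-side critical section it was found under; grab_task_by_vpid() is a hypothetical helper built on find_task_by_vpid(), which is declared further down in this header:

	#include <linux/rcupdate.h>
	#include <linux/sched.h>

	static struct task_struct *grab_task_by_vpid(pid_t nr)
	{
		struct task_struct *tsk;

		rcu_read_lock();
		tsk = find_task_by_vpid(nr);
		if (tsk)
			get_task_struct(tsk);	/* take a reference before leaving RCU */
		rcu_read_unlock();

		return tsk;	/* caller drops it with put_task_struct() */
	}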
1782 | extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); | 1782 | extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); |
1783 | extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); | 1783 | extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); |
1784 | 1784 | ||
1785 | /* | 1785 | /* |
1786 | * Per process flags | 1786 | * Per process flags |
1787 | */ | 1787 | */ |
1788 | #define PF_EXITING 0x00000004 /* getting shut down */ | 1788 | #define PF_EXITING 0x00000004 /* getting shut down */ |
1789 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ | 1789 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ |
1790 | #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ | 1790 | #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ |
1791 | #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ | 1791 | #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ |
1792 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ | 1792 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ |
1793 | #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ | 1793 | #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ |
1794 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ | 1794 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ |
1795 | #define PF_DUMPCORE 0x00000200 /* dumped core */ | 1795 | #define PF_DUMPCORE 0x00000200 /* dumped core */ |
1796 | #define PF_SIGNALED 0x00000400 /* killed by a signal */ | 1796 | #define PF_SIGNALED 0x00000400 /* killed by a signal */ |
1797 | #define PF_MEMALLOC 0x00000800 /* Allocating memory */ | 1797 | #define PF_MEMALLOC 0x00000800 /* Allocating memory */ |
1798 | #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ | 1798 | #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ |
1799 | #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ | 1799 | #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ |
1800 | #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ | 1800 | #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ |
1801 | #define PF_FROZEN 0x00010000 /* frozen for system suspend */ | 1801 | #define PF_FROZEN 0x00010000 /* frozen for system suspend */ |
1802 | #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ | 1802 | #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ |
1803 | #define PF_KSWAPD 0x00040000 /* I am kswapd */ | 1803 | #define PF_KSWAPD 0x00040000 /* I am kswapd */ |
1804 | #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ | 1804 | #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ |
1805 | #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ | 1805 | #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ |
1806 | #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ | 1806 | #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ |
1807 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ | 1807 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ |
1808 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ | 1808 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ |
1809 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ | 1809 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ |
1810 | #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ | 1810 | #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ |
1811 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ | 1811 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ |
1812 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ | 1812 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ |
1813 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ | 1813 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ |
1814 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ | 1814 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ |
1815 | 1815 | ||
1816 | /* | 1816 | /* |
1817 | * Only the _current_ task can read/write to tsk->flags, but other | 1817 | * Only the _current_ task can read/write to tsk->flags, but other |
1818 | * tasks can access tsk->flags in readonly mode for example | 1818 | * tasks can access tsk->flags in readonly mode for example |
1819 | * with tsk_used_math (like during threaded core dumping). | 1819 | * with tsk_used_math (like during threaded core dumping). |
1820 | * There is however an exception to this rule during ptrace | 1820 | * There is however an exception to this rule during ptrace |
1821 | * or during fork: the ptracer task is allowed to write to the | 1821 | * or during fork: the ptracer task is allowed to write to the |
1822 | * child->flags of its traced child (same goes for fork, the parent | 1822 | * child->flags of its traced child (same goes for fork, the parent |
1823 | * can write to the child->flags), because we're guaranteed the | 1823 | * can write to the child->flags), because we're guaranteed the |
1824 | * child is not running and in turn not changing child->flags | 1824 | * child is not running and in turn not changing child->flags |
1825 | * at the same time the parent does it. | 1825 | * at the same time the parent does it. |
1826 | */ | 1826 | */ |
1827 | #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) | 1827 | #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) |
1828 | #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) | 1828 | #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) |
1829 | #define clear_used_math() clear_stopped_child_used_math(current) | 1829 | #define clear_used_math() clear_stopped_child_used_math(current) |
1830 | #define set_used_math() set_stopped_child_used_math(current) | 1830 | #define set_used_math() set_stopped_child_used_math(current) |
1831 | #define conditional_stopped_child_used_math(condition, child) \ | 1831 | #define conditional_stopped_child_used_math(condition, child) \ |
1832 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) | 1832 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) |
1833 | #define conditional_used_math(condition) \ | 1833 | #define conditional_used_math(condition) \ |
1834 | conditional_stopped_child_used_math(condition, current) | 1834 | conditional_stopped_child_used_math(condition, current) |
1835 | #define copy_to_stopped_child_used_math(child) \ | 1835 | #define copy_to_stopped_child_used_math(child) \ |
1836 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) | 1836 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) |
1837 | /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ | 1837 | /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ |
1838 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) | 1838 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) |
1839 | #define used_math() tsk_used_math(current) | 1839 | #define used_math() tsk_used_math(current) |
1840 | 1840 | ||
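For illustration (not part of this hunk), the long-standing save/modify/restore pattern for a per-process flag such as PF_MEMALLOC, which respects the rule above that only the current task writes its own flags; do_memalloc_work() is hypothetical:

	#include <linux/sched.h>

	static void do_memalloc_work(void)
	{
		unsigned long pflags = current->flags;

		current->flags |= PF_MEMALLOC;	/* may dip into memory reserves */

		/* ... allocations that must not recurse into direct reclaim ... */

		if (!(pflags & PF_MEMALLOC))	/* restore only if we set it here */
			current->flags &= ~PF_MEMALLOC;
	}

The tsk_restore_flags() helper added later in this diff folds that conditional restore into a single call.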
1841 | /* | 1841 | /* |
1842 | * task->jobctl flags | 1842 | * task->jobctl flags |
1843 | */ | 1843 | */ |
1844 | #define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */ | 1844 | #define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */ |
1845 | 1845 | ||
1846 | #define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */ | 1846 | #define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */ |
1847 | #define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ | 1847 | #define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ |
1848 | #define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ | 1848 | #define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ |
1849 | #define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ | 1849 | #define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ |
1850 | #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ | 1850 | #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ |
1851 | #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ | 1851 | #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ |
1852 | #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ | 1852 | #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ |
1853 | 1853 | ||
1854 | #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) | 1854 | #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) |
1855 | #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) | 1855 | #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) |
1856 | #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) | 1856 | #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) |
1857 | #define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) | 1857 | #define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) |
1858 | #define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) | 1858 | #define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) |
1859 | #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) | 1859 | #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) |
1860 | #define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) | 1860 | #define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) |
1861 | 1861 | ||
1862 | #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) | 1862 | #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) |
1863 | #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) | 1863 | #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) |
1864 | 1864 | ||
1865 | extern bool task_set_jobctl_pending(struct task_struct *task, | 1865 | extern bool task_set_jobctl_pending(struct task_struct *task, |
1866 | unsigned int mask); | 1866 | unsigned int mask); |
1867 | extern void task_clear_jobctl_trapping(struct task_struct *task); | 1867 | extern void task_clear_jobctl_trapping(struct task_struct *task); |
1868 | extern void task_clear_jobctl_pending(struct task_struct *task, | 1868 | extern void task_clear_jobctl_pending(struct task_struct *task, |
1869 | unsigned int mask); | 1869 | unsigned int mask); |
1870 | 1870 | ||
1871 | #ifdef CONFIG_PREEMPT_RCU | 1871 | #ifdef CONFIG_PREEMPT_RCU |
1872 | 1872 | ||
1873 | #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ | 1873 | #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ |
1874 | #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ | 1874 | #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ |
1875 | 1875 | ||
1876 | static inline void rcu_copy_process(struct task_struct *p) | 1876 | static inline void rcu_copy_process(struct task_struct *p) |
1877 | { | 1877 | { |
1878 | p->rcu_read_lock_nesting = 0; | 1878 | p->rcu_read_lock_nesting = 0; |
1879 | p->rcu_read_unlock_special = 0; | 1879 | p->rcu_read_unlock_special = 0; |
1880 | #ifdef CONFIG_TREE_PREEMPT_RCU | 1880 | #ifdef CONFIG_TREE_PREEMPT_RCU |
1881 | p->rcu_blocked_node = NULL; | 1881 | p->rcu_blocked_node = NULL; |
1882 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1882 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
1883 | #ifdef CONFIG_RCU_BOOST | 1883 | #ifdef CONFIG_RCU_BOOST |
1884 | p->rcu_boost_mutex = NULL; | 1884 | p->rcu_boost_mutex = NULL; |
1885 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 1885 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
1886 | INIT_LIST_HEAD(&p->rcu_node_entry); | 1886 | INIT_LIST_HEAD(&p->rcu_node_entry); |
1887 | } | 1887 | } |
1888 | 1888 | ||
1889 | #else | 1889 | #else |
1890 | 1890 | ||
1891 | static inline void rcu_copy_process(struct task_struct *p) | 1891 | static inline void rcu_copy_process(struct task_struct *p) |
1892 | { | 1892 | { |
1893 | } | 1893 | } |
1894 | 1894 | ||
1895 | #endif | 1895 | #endif |
1896 | 1896 | ||
1897 | static inline void tsk_restore_flags(struct task_struct *task, | ||
1898 | unsigned long orig_flags, unsigned long flags) | ||
1899 | { | ||
1900 | task->flags &= ~flags; | ||
1901 | task->flags |= orig_flags & flags; | ||
1902 | } | ||
1903 | |||
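tsk_restore_flags() restores only the bits named in 'flags' from 'orig_flags' and leaves the rest of task->flags untouched. For illustration (not part of this file's hunk), a sketch of the softirq usage described in the commit message, where the borrowed task flags must neither leak PF_MEMALLOC into the handler nor leak it back out; the surrounding __do_softirq() body is paraphrased, not quoted:

	/* Roughly, in __do_softirq(): */
	unsigned long old_flags = current->flags;

	/* Don't inherit PF_MEMALLOC from the task this softirq preempted. */
	current->flags &= ~PF_MEMALLOC;

	/* ... handle pending softirqs; a handler may set PF_MEMALLOC itself ... */

	/* Put the bit back exactly as the preempted task had it. */
	tsk_restore_flags(current, old_flags, PF_MEMALLOC);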
1897 | #ifdef CONFIG_SMP | 1904 | #ifdef CONFIG_SMP |
1898 | extern void do_set_cpus_allowed(struct task_struct *p, | 1905 | extern void do_set_cpus_allowed(struct task_struct *p, |
1899 | const struct cpumask *new_mask); | 1906 | const struct cpumask *new_mask); |
1900 | 1907 | ||
1901 | extern int set_cpus_allowed_ptr(struct task_struct *p, | 1908 | extern int set_cpus_allowed_ptr(struct task_struct *p, |
1902 | const struct cpumask *new_mask); | 1909 | const struct cpumask *new_mask); |
1903 | #else | 1910 | #else |
1904 | static inline void do_set_cpus_allowed(struct task_struct *p, | 1911 | static inline void do_set_cpus_allowed(struct task_struct *p, |
1905 | const struct cpumask *new_mask) | 1912 | const struct cpumask *new_mask) |
1906 | { | 1913 | { |
1907 | } | 1914 | } |
1908 | static inline int set_cpus_allowed_ptr(struct task_struct *p, | 1915 | static inline int set_cpus_allowed_ptr(struct task_struct *p, |
1909 | const struct cpumask *new_mask) | 1916 | const struct cpumask *new_mask) |
1910 | { | 1917 | { |
1911 | if (!cpumask_test_cpu(0, new_mask)) | 1918 | if (!cpumask_test_cpu(0, new_mask)) |
1912 | return -EINVAL; | 1919 | return -EINVAL; |
1913 | return 0; | 1920 | return 0; |
1914 | } | 1921 | } |
1915 | #endif | 1922 | #endif |
1916 | 1923 | ||
1917 | #ifdef CONFIG_NO_HZ | 1924 | #ifdef CONFIG_NO_HZ |
1918 | void calc_load_enter_idle(void); | 1925 | void calc_load_enter_idle(void); |
1919 | void calc_load_exit_idle(void); | 1926 | void calc_load_exit_idle(void); |
1920 | #else | 1927 | #else |
1921 | static inline void calc_load_enter_idle(void) { } | 1928 | static inline void calc_load_enter_idle(void) { } |
1922 | static inline void calc_load_exit_idle(void) { } | 1929 | static inline void calc_load_exit_idle(void) { } |
1923 | #endif /* CONFIG_NO_HZ */ | 1930 | #endif /* CONFIG_NO_HZ */ |
1924 | 1931 | ||
1925 | #ifndef CONFIG_CPUMASK_OFFSTACK | 1932 | #ifndef CONFIG_CPUMASK_OFFSTACK |
1926 | static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 1933 | static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
1927 | { | 1934 | { |
1928 | return set_cpus_allowed_ptr(p, &new_mask); | 1935 | return set_cpus_allowed_ptr(p, &new_mask); |
1929 | } | 1936 | } |
1930 | #endif | 1937 | #endif |
1931 | 1938 | ||
1932 | /* | 1939 | /* |
1933 | * Do not use outside of architecture code which knows its limitations. | 1940 | * Do not use outside of architecture code which knows its limitations. |
1934 | * | 1941 | * |
1935 | * sched_clock() has no promise of monotonicity or bounded drift between | 1942 | * sched_clock() has no promise of monotonicity or bounded drift between |
1936 | * CPUs; using it (which you should not) requires disabling IRQs. | 1943 | * CPUs; using it (which you should not) requires disabling IRQs. |
1937 | * | 1944 | * |
1938 | * Please use one of the three interfaces below. | 1945 | * Please use one of the three interfaces below. |
1939 | */ | 1946 | */ |
1940 | extern unsigned long long notrace sched_clock(void); | 1947 | extern unsigned long long notrace sched_clock(void); |
1941 | /* | 1948 | /* |
1942 | * See the comment in kernel/sched/clock.c | 1949 | * See the comment in kernel/sched/clock.c |
1943 | */ | 1950 | */ |
1944 | extern u64 cpu_clock(int cpu); | 1951 | extern u64 cpu_clock(int cpu); |
1945 | extern u64 local_clock(void); | 1952 | extern u64 local_clock(void); |
1946 | extern u64 sched_clock_cpu(int cpu); | 1953 | extern u64 sched_clock_cpu(int cpu); |
1947 | 1954 | ||
1948 | 1955 | ||
1949 | extern void sched_clock_init(void); | 1956 | extern void sched_clock_init(void); |
1950 | 1957 | ||
1951 | #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 1958 | #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
1952 | static inline void sched_clock_tick(void) | 1959 | static inline void sched_clock_tick(void) |
1953 | { | 1960 | { |
1954 | } | 1961 | } |
1955 | 1962 | ||
1956 | static inline void sched_clock_idle_sleep_event(void) | 1963 | static inline void sched_clock_idle_sleep_event(void) |
1957 | { | 1964 | { |
1958 | } | 1965 | } |
1959 | 1966 | ||
1960 | static inline void sched_clock_idle_wakeup_event(u64 delta_ns) | 1967 | static inline void sched_clock_idle_wakeup_event(u64 delta_ns) |
1961 | { | 1968 | { |
1962 | } | 1969 | } |
1963 | #else | 1970 | #else |
1964 | /* | 1971 | /* |
1965 | * Architectures can set this to 1 if they have specified | 1972 | * Architectures can set this to 1 if they have specified |
1966 | * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, | 1973 | * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, |
1967 | * but then during bootup it turns out that sched_clock() | 1974 | * but then during bootup it turns out that sched_clock() |
1968 | * is reliable after all: | 1975 | * is reliable after all: |
1969 | */ | 1976 | */ |
1970 | extern int sched_clock_stable; | 1977 | extern int sched_clock_stable; |
1971 | 1978 | ||
1972 | extern void sched_clock_tick(void); | 1979 | extern void sched_clock_tick(void); |
1973 | extern void sched_clock_idle_sleep_event(void); | 1980 | extern void sched_clock_idle_sleep_event(void); |
1974 | extern void sched_clock_idle_wakeup_event(u64 delta_ns); | 1981 | extern void sched_clock_idle_wakeup_event(u64 delta_ns); |
1975 | #endif | 1982 | #endif |
1976 | 1983 | ||
1977 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1984 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
1978 | /* | 1985 | /* |
1979 | * An i/f to runtime opt-in for irq time accounting based off of sched_clock. | 1986 | * An i/f to runtime opt-in for irq time accounting based off of sched_clock. |
1980 | * The reason for this explicit opt-in is to avoid a performance penalty with | 1987 | * The reason for this explicit opt-in is to avoid a performance penalty with |
1981 | * slow sched_clocks. | 1988 | * slow sched_clocks. |
1982 | */ | 1989 | */ |
1983 | extern void enable_sched_clock_irqtime(void); | 1990 | extern void enable_sched_clock_irqtime(void); |
1984 | extern void disable_sched_clock_irqtime(void); | 1991 | extern void disable_sched_clock_irqtime(void); |
1985 | #else | 1992 | #else |
1986 | static inline void enable_sched_clock_irqtime(void) {} | 1993 | static inline void enable_sched_clock_irqtime(void) {} |
1987 | static inline void disable_sched_clock_irqtime(void) {} | 1994 | static inline void disable_sched_clock_irqtime(void) {} |
1988 | #endif | 1995 | #endif |
1989 | 1996 | ||
1990 | extern unsigned long long | 1997 | extern unsigned long long |
1991 | task_sched_runtime(struct task_struct *task); | 1998 | task_sched_runtime(struct task_struct *task); |
1992 | 1999 | ||
1993 | /* sched_exec is called by processes performing an exec */ | 2000 | /* sched_exec is called by processes performing an exec */ |
1994 | #ifdef CONFIG_SMP | 2001 | #ifdef CONFIG_SMP |
1995 | extern void sched_exec(void); | 2002 | extern void sched_exec(void); |
1996 | #else | 2003 | #else |
1997 | #define sched_exec() {} | 2004 | #define sched_exec() {} |
1998 | #endif | 2005 | #endif |
1999 | 2006 | ||
2000 | extern void sched_clock_idle_sleep_event(void); | 2007 | extern void sched_clock_idle_sleep_event(void); |
2001 | extern void sched_clock_idle_wakeup_event(u64 delta_ns); | 2008 | extern void sched_clock_idle_wakeup_event(u64 delta_ns); |
2002 | 2009 | ||
2003 | #ifdef CONFIG_HOTPLUG_CPU | 2010 | #ifdef CONFIG_HOTPLUG_CPU |
2004 | extern void idle_task_exit(void); | 2011 | extern void idle_task_exit(void); |
2005 | #else | 2012 | #else |
2006 | static inline void idle_task_exit(void) {} | 2013 | static inline void idle_task_exit(void) {} |
2007 | #endif | 2014 | #endif |
2008 | 2015 | ||
2009 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) | 2016 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) |
2010 | extern void wake_up_idle_cpu(int cpu); | 2017 | extern void wake_up_idle_cpu(int cpu); |
2011 | #else | 2018 | #else |
2012 | static inline void wake_up_idle_cpu(int cpu) { } | 2019 | static inline void wake_up_idle_cpu(int cpu) { } |
2013 | #endif | 2020 | #endif |
2014 | 2021 | ||
2015 | extern unsigned int sysctl_sched_latency; | 2022 | extern unsigned int sysctl_sched_latency; |
2016 | extern unsigned int sysctl_sched_min_granularity; | 2023 | extern unsigned int sysctl_sched_min_granularity; |
2017 | extern unsigned int sysctl_sched_wakeup_granularity; | 2024 | extern unsigned int sysctl_sched_wakeup_granularity; |
2018 | extern unsigned int sysctl_sched_child_runs_first; | 2025 | extern unsigned int sysctl_sched_child_runs_first; |
2019 | 2026 | ||
2020 | enum sched_tunable_scaling { | 2027 | enum sched_tunable_scaling { |
2021 | SCHED_TUNABLESCALING_NONE, | 2028 | SCHED_TUNABLESCALING_NONE, |
2022 | SCHED_TUNABLESCALING_LOG, | 2029 | SCHED_TUNABLESCALING_LOG, |
2023 | SCHED_TUNABLESCALING_LINEAR, | 2030 | SCHED_TUNABLESCALING_LINEAR, |
2024 | SCHED_TUNABLESCALING_END, | 2031 | SCHED_TUNABLESCALING_END, |
2025 | }; | 2032 | }; |
2026 | extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; | 2033 | extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; |
2027 | 2034 | ||
2028 | #ifdef CONFIG_SCHED_DEBUG | 2035 | #ifdef CONFIG_SCHED_DEBUG |
2029 | extern unsigned int sysctl_sched_migration_cost; | 2036 | extern unsigned int sysctl_sched_migration_cost; |
2030 | extern unsigned int sysctl_sched_nr_migrate; | 2037 | extern unsigned int sysctl_sched_nr_migrate; |
2031 | extern unsigned int sysctl_sched_time_avg; | 2038 | extern unsigned int sysctl_sched_time_avg; |
2032 | extern unsigned int sysctl_timer_migration; | 2039 | extern unsigned int sysctl_timer_migration; |
2033 | extern unsigned int sysctl_sched_shares_window; | 2040 | extern unsigned int sysctl_sched_shares_window; |
2034 | 2041 | ||
2035 | int sched_proc_update_handler(struct ctl_table *table, int write, | 2042 | int sched_proc_update_handler(struct ctl_table *table, int write, |
2036 | void __user *buffer, size_t *length, | 2043 | void __user *buffer, size_t *length, |
2037 | loff_t *ppos); | 2044 | loff_t *ppos); |
2038 | #endif | 2045 | #endif |
2039 | #ifdef CONFIG_SCHED_DEBUG | 2046 | #ifdef CONFIG_SCHED_DEBUG |
2040 | static inline unsigned int get_sysctl_timer_migration(void) | 2047 | static inline unsigned int get_sysctl_timer_migration(void) |
2041 | { | 2048 | { |
2042 | return sysctl_timer_migration; | 2049 | return sysctl_timer_migration; |
2043 | } | 2050 | } |
2044 | #else | 2051 | #else |
2045 | static inline unsigned int get_sysctl_timer_migration(void) | 2052 | static inline unsigned int get_sysctl_timer_migration(void) |
2046 | { | 2053 | { |
2047 | return 1; | 2054 | return 1; |
2048 | } | 2055 | } |
2049 | #endif | 2056 | #endif |
2050 | extern unsigned int sysctl_sched_rt_period; | 2057 | extern unsigned int sysctl_sched_rt_period; |
2051 | extern int sysctl_sched_rt_runtime; | 2058 | extern int sysctl_sched_rt_runtime; |
2052 | 2059 | ||
2053 | int sched_rt_handler(struct ctl_table *table, int write, | 2060 | int sched_rt_handler(struct ctl_table *table, int write, |
2054 | void __user *buffer, size_t *lenp, | 2061 | void __user *buffer, size_t *lenp, |
2055 | loff_t *ppos); | 2062 | loff_t *ppos); |
2056 | 2063 | ||
2057 | #ifdef CONFIG_SCHED_AUTOGROUP | 2064 | #ifdef CONFIG_SCHED_AUTOGROUP |
2058 | extern unsigned int sysctl_sched_autogroup_enabled; | 2065 | extern unsigned int sysctl_sched_autogroup_enabled; |
2059 | 2066 | ||
2060 | extern void sched_autogroup_create_attach(struct task_struct *p); | 2067 | extern void sched_autogroup_create_attach(struct task_struct *p); |
2061 | extern void sched_autogroup_detach(struct task_struct *p); | 2068 | extern void sched_autogroup_detach(struct task_struct *p); |
2062 | extern void sched_autogroup_fork(struct signal_struct *sig); | 2069 | extern void sched_autogroup_fork(struct signal_struct *sig); |
2063 | extern void sched_autogroup_exit(struct signal_struct *sig); | 2070 | extern void sched_autogroup_exit(struct signal_struct *sig); |
2064 | #ifdef CONFIG_PROC_FS | 2071 | #ifdef CONFIG_PROC_FS |
2065 | extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); | 2072 | extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); |
2066 | extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); | 2073 | extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); |
2067 | #endif | 2074 | #endif |
2068 | #else | 2075 | #else |
2069 | static inline void sched_autogroup_create_attach(struct task_struct *p) { } | 2076 | static inline void sched_autogroup_create_attach(struct task_struct *p) { } |
2070 | static inline void sched_autogroup_detach(struct task_struct *p) { } | 2077 | static inline void sched_autogroup_detach(struct task_struct *p) { } |
2071 | static inline void sched_autogroup_fork(struct signal_struct *sig) { } | 2078 | static inline void sched_autogroup_fork(struct signal_struct *sig) { } |
2072 | static inline void sched_autogroup_exit(struct signal_struct *sig) { } | 2079 | static inline void sched_autogroup_exit(struct signal_struct *sig) { } |
2073 | #endif | 2080 | #endif |
2074 | 2081 | ||
2075 | #ifdef CONFIG_CFS_BANDWIDTH | 2082 | #ifdef CONFIG_CFS_BANDWIDTH |
2076 | extern unsigned int sysctl_sched_cfs_bandwidth_slice; | 2083 | extern unsigned int sysctl_sched_cfs_bandwidth_slice; |
2077 | #endif | 2084 | #endif |
2078 | 2085 | ||
2079 | #ifdef CONFIG_RT_MUTEXES | 2086 | #ifdef CONFIG_RT_MUTEXES |
2080 | extern int rt_mutex_getprio(struct task_struct *p); | 2087 | extern int rt_mutex_getprio(struct task_struct *p); |
2081 | extern void rt_mutex_setprio(struct task_struct *p, int prio); | 2088 | extern void rt_mutex_setprio(struct task_struct *p, int prio); |
2082 | extern void rt_mutex_adjust_pi(struct task_struct *p); | 2089 | extern void rt_mutex_adjust_pi(struct task_struct *p); |
2083 | static inline bool tsk_is_pi_blocked(struct task_struct *tsk) | 2090 | static inline bool tsk_is_pi_blocked(struct task_struct *tsk) |
2084 | { | 2091 | { |
2085 | return tsk->pi_blocked_on != NULL; | 2092 | return tsk->pi_blocked_on != NULL; |
2086 | } | 2093 | } |
2087 | #else | 2094 | #else |
2088 | static inline int rt_mutex_getprio(struct task_struct *p) | 2095 | static inline int rt_mutex_getprio(struct task_struct *p) |
2089 | { | 2096 | { |
2090 | return p->normal_prio; | 2097 | return p->normal_prio; |
2091 | } | 2098 | } |
2092 | # define rt_mutex_adjust_pi(p) do { } while (0) | 2099 | # define rt_mutex_adjust_pi(p) do { } while (0) |
2093 | static inline bool tsk_is_pi_blocked(struct task_struct *tsk) | 2100 | static inline bool tsk_is_pi_blocked(struct task_struct *tsk) |
2094 | { | 2101 | { |
2095 | return false; | 2102 | return false; |
2096 | } | 2103 | } |
2097 | #endif | 2104 | #endif |
2098 | 2105 | ||
2099 | extern bool yield_to(struct task_struct *p, bool preempt); | 2106 | extern bool yield_to(struct task_struct *p, bool preempt); |
2100 | extern void set_user_nice(struct task_struct *p, long nice); | 2107 | extern void set_user_nice(struct task_struct *p, long nice); |
2101 | extern int task_prio(const struct task_struct *p); | 2108 | extern int task_prio(const struct task_struct *p); |
2102 | extern int task_nice(const struct task_struct *p); | 2109 | extern int task_nice(const struct task_struct *p); |
2103 | extern int can_nice(const struct task_struct *p, const int nice); | 2110 | extern int can_nice(const struct task_struct *p, const int nice); |
2104 | extern int task_curr(const struct task_struct *p); | 2111 | extern int task_curr(const struct task_struct *p); |
2105 | extern int idle_cpu(int cpu); | 2112 | extern int idle_cpu(int cpu); |
2106 | extern int sched_setscheduler(struct task_struct *, int, | 2113 | extern int sched_setscheduler(struct task_struct *, int, |
2107 | const struct sched_param *); | 2114 | const struct sched_param *); |
2108 | extern int sched_setscheduler_nocheck(struct task_struct *, int, | 2115 | extern int sched_setscheduler_nocheck(struct task_struct *, int, |
2109 | const struct sched_param *); | 2116 | const struct sched_param *); |
2110 | extern struct task_struct *idle_task(int cpu); | 2117 | extern struct task_struct *idle_task(int cpu); |
2111 | /** | 2118 | /** |
2112 | * is_idle_task - is the specified task an idle task? | 2119 | * is_idle_task - is the specified task an idle task? |
2113 | * @p: the task in question. | 2120 | * @p: the task in question. |
2114 | */ | 2121 | */ |
2115 | static inline bool is_idle_task(const struct task_struct *p) | 2122 | static inline bool is_idle_task(const struct task_struct *p) |
2116 | { | 2123 | { |
2117 | return p->pid == 0; | 2124 | return p->pid == 0; |
2118 | } | 2125 | } |
2119 | extern struct task_struct *curr_task(int cpu); | 2126 | extern struct task_struct *curr_task(int cpu); |
2120 | extern void set_curr_task(int cpu, struct task_struct *p); | 2127 | extern void set_curr_task(int cpu, struct task_struct *p); |
2121 | 2128 | ||
2122 | void yield(void); | 2129 | void yield(void); |
2123 | 2130 | ||
2124 | /* | 2131 | /* |
2125 | * The default (Linux) execution domain. | 2132 | * The default (Linux) execution domain. |
2126 | */ | 2133 | */ |
2127 | extern struct exec_domain default_exec_domain; | 2134 | extern struct exec_domain default_exec_domain; |
2128 | 2135 | ||
2129 | union thread_union { | 2136 | union thread_union { |
2130 | struct thread_info thread_info; | 2137 | struct thread_info thread_info; |
2131 | unsigned long stack[THREAD_SIZE/sizeof(long)]; | 2138 | unsigned long stack[THREAD_SIZE/sizeof(long)]; |
2132 | }; | 2139 | }; |
2133 | 2140 | ||
2134 | #ifndef __HAVE_ARCH_KSTACK_END | 2141 | #ifndef __HAVE_ARCH_KSTACK_END |
2135 | static inline int kstack_end(void *addr) | 2142 | static inline int kstack_end(void *addr) |
2136 | { | 2143 | { |
2137 | /* Reliable end of stack detection: | 2144 | /* Reliable end of stack detection: |
2138 | * Some APM bios versions misalign the stack | 2145 | * Some APM bios versions misalign the stack |
2139 | */ | 2146 | */ |
2140 | return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); | 2147 | return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); |
2141 | } | 2148 | } |
2142 | #endif | 2149 | #endif |
2143 | 2150 | ||
2144 | extern union thread_union init_thread_union; | 2151 | extern union thread_union init_thread_union; |
2145 | extern struct task_struct init_task; | 2152 | extern struct task_struct init_task; |
2146 | 2153 | ||
2147 | extern struct mm_struct init_mm; | 2154 | extern struct mm_struct init_mm; |
2148 | 2155 | ||
2149 | extern struct pid_namespace init_pid_ns; | 2156 | extern struct pid_namespace init_pid_ns; |
2150 | 2157 | ||
2151 | /* | 2158 | /* |
2152 | * find a task by one of its numerical ids | 2159 | * find a task by one of its numerical ids |
2153 | * | 2160 | * |
2154 | * find_task_by_pid_ns(): | 2161 | * find_task_by_pid_ns(): |
2155 | * finds a task by its pid in the specified namespace | 2162 | * finds a task by its pid in the specified namespace |
2156 | * find_task_by_vpid(): | 2163 | * find_task_by_vpid(): |
2157 | * finds a task by its virtual pid | 2164 | * finds a task by its virtual pid |
2158 | * | 2165 | * |
2159 | * see also find_vpid() etc in include/linux/pid.h | 2166 | * see also find_vpid() etc in include/linux/pid.h |
2160 | */ | 2167 | */ |
2161 | 2168 | ||
2162 | extern struct task_struct *find_task_by_vpid(pid_t nr); | 2169 | extern struct task_struct *find_task_by_vpid(pid_t nr); |
2163 | extern struct task_struct *find_task_by_pid_ns(pid_t nr, | 2170 | extern struct task_struct *find_task_by_pid_ns(pid_t nr, |
2164 | struct pid_namespace *ns); | 2171 | struct pid_namespace *ns); |
2165 | 2172 | ||
2166 | extern void __set_special_pids(struct pid *pid); | 2173 | extern void __set_special_pids(struct pid *pid); |
2167 | 2174 | ||
2168 | /* per-UID process charging. */ | 2175 | /* per-UID process charging. */ |
2169 | extern struct user_struct * alloc_uid(kuid_t); | 2176 | extern struct user_struct * alloc_uid(kuid_t); |
2170 | static inline struct user_struct *get_uid(struct user_struct *u) | 2177 | static inline struct user_struct *get_uid(struct user_struct *u) |
2171 | { | 2178 | { |
2172 | atomic_inc(&u->__count); | 2179 | atomic_inc(&u->__count); |
2173 | return u; | 2180 | return u; |
2174 | } | 2181 | } |
2175 | extern void free_uid(struct user_struct *); | 2182 | extern void free_uid(struct user_struct *); |
2176 | 2183 | ||
2177 | #include <asm/current.h> | 2184 | #include <asm/current.h> |
2178 | 2185 | ||
2179 | extern void xtime_update(unsigned long ticks); | 2186 | extern void xtime_update(unsigned long ticks); |
2180 | 2187 | ||
2181 | extern int wake_up_state(struct task_struct *tsk, unsigned int state); | 2188 | extern int wake_up_state(struct task_struct *tsk, unsigned int state); |
2182 | extern int wake_up_process(struct task_struct *tsk); | 2189 | extern int wake_up_process(struct task_struct *tsk); |
2183 | extern void wake_up_new_task(struct task_struct *tsk); | 2190 | extern void wake_up_new_task(struct task_struct *tsk); |
2184 | #ifdef CONFIG_SMP | 2191 | #ifdef CONFIG_SMP |
2185 | extern void kick_process(struct task_struct *tsk); | 2192 | extern void kick_process(struct task_struct *tsk); |
2186 | #else | 2193 | #else |
2187 | static inline void kick_process(struct task_struct *tsk) { } | 2194 | static inline void kick_process(struct task_struct *tsk) { } |
2188 | #endif | 2195 | #endif |
2189 | extern void sched_fork(struct task_struct *p); | 2196 | extern void sched_fork(struct task_struct *p); |
2190 | extern void sched_dead(struct task_struct *p); | 2197 | extern void sched_dead(struct task_struct *p); |
2191 | 2198 | ||
2192 | extern void proc_caches_init(void); | 2199 | extern void proc_caches_init(void); |
2193 | extern void flush_signals(struct task_struct *); | 2200 | extern void flush_signals(struct task_struct *); |
2194 | extern void __flush_signals(struct task_struct *); | 2201 | extern void __flush_signals(struct task_struct *); |
2195 | extern void ignore_signals(struct task_struct *); | 2202 | extern void ignore_signals(struct task_struct *); |
2196 | extern void flush_signal_handlers(struct task_struct *, int force_default); | 2203 | extern void flush_signal_handlers(struct task_struct *, int force_default); |
2197 | extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); | 2204 | extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); |
2198 | 2205 | ||
2199 | static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | 2206 | static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) |
2200 | { | 2207 | { |
2201 | unsigned long flags; | 2208 | unsigned long flags; |
2202 | int ret; | 2209 | int ret; |
2203 | 2210 | ||
2204 | spin_lock_irqsave(&tsk->sighand->siglock, flags); | 2211 | spin_lock_irqsave(&tsk->sighand->siglock, flags); |
2205 | ret = dequeue_signal(tsk, mask, info); | 2212 | ret = dequeue_signal(tsk, mask, info); |
2206 | spin_unlock_irqrestore(&tsk->sighand->siglock, flags); | 2213 | spin_unlock_irqrestore(&tsk->sighand->siglock, flags); |
2207 | 2214 | ||
2208 | return ret; | 2215 | return ret; |
2209 | } | 2216 | } |
2210 | 2217 | ||
2211 | extern void block_all_signals(int (*notifier)(void *priv), void *priv, | 2218 | extern void block_all_signals(int (*notifier)(void *priv), void *priv, |
2212 | sigset_t *mask); | 2219 | sigset_t *mask); |
2213 | extern void unblock_all_signals(void); | 2220 | extern void unblock_all_signals(void); |
2214 | extern void release_task(struct task_struct * p); | 2221 | extern void release_task(struct task_struct * p); |
2215 | extern int send_sig_info(int, struct siginfo *, struct task_struct *); | 2222 | extern int send_sig_info(int, struct siginfo *, struct task_struct *); |
2216 | extern int force_sigsegv(int, struct task_struct *); | 2223 | extern int force_sigsegv(int, struct task_struct *); |
2217 | extern int force_sig_info(int, struct siginfo *, struct task_struct *); | 2224 | extern int force_sig_info(int, struct siginfo *, struct task_struct *); |
2218 | extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); | 2225 | extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); |
2219 | extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); | 2226 | extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); |
2220 | extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, | 2227 | extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, |
2221 | const struct cred *, u32); | 2228 | const struct cred *, u32); |
2222 | extern int kill_pgrp(struct pid *pid, int sig, int priv); | 2229 | extern int kill_pgrp(struct pid *pid, int sig, int priv); |
2223 | extern int kill_pid(struct pid *pid, int sig, int priv); | 2230 | extern int kill_pid(struct pid *pid, int sig, int priv); |
2224 | extern int kill_proc_info(int, struct siginfo *, pid_t); | 2231 | extern int kill_proc_info(int, struct siginfo *, pid_t); |
2225 | extern __must_check bool do_notify_parent(struct task_struct *, int); | 2232 | extern __must_check bool do_notify_parent(struct task_struct *, int); |
2226 | extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); | 2233 | extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); |
2227 | extern void force_sig(int, struct task_struct *); | 2234 | extern void force_sig(int, struct task_struct *); |
2228 | extern int send_sig(int, struct task_struct *, int); | 2235 | extern int send_sig(int, struct task_struct *, int); |
2229 | extern int zap_other_threads(struct task_struct *p); | 2236 | extern int zap_other_threads(struct task_struct *p); |
2230 | extern struct sigqueue *sigqueue_alloc(void); | 2237 | extern struct sigqueue *sigqueue_alloc(void); |
2231 | extern void sigqueue_free(struct sigqueue *); | 2238 | extern void sigqueue_free(struct sigqueue *); |
2232 | extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); | 2239 | extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); |
2233 | extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); | 2240 | extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); |
2234 | extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); | 2241 | extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); |
2235 | 2242 | ||
2236 | static inline void restore_saved_sigmask(void) | 2243 | static inline void restore_saved_sigmask(void) |
2237 | { | 2244 | { |
2238 | if (test_and_clear_restore_sigmask()) | 2245 | if (test_and_clear_restore_sigmask()) |
2239 | __set_current_blocked(¤t->saved_sigmask); | 2246 | __set_current_blocked(¤t->saved_sigmask); |
2240 | } | 2247 | } |
2241 | 2248 | ||
2242 | static inline sigset_t *sigmask_to_save(void) | 2249 | static inline sigset_t *sigmask_to_save(void) |
2243 | { | 2250 | { |
2244 | sigset_t *res = ¤t->blocked; | 2251 | sigset_t *res = ¤t->blocked; |
2245 | if (unlikely(test_restore_sigmask())) | 2252 | if (unlikely(test_restore_sigmask())) |
2246 | res = ¤t->saved_sigmask; | 2253 | res = ¤t->saved_sigmask; |
2247 | return res; | 2254 | return res; |
2248 | } | 2255 | } |
2249 | 2256 | ||
2250 | static inline int kill_cad_pid(int sig, int priv) | 2257 | static inline int kill_cad_pid(int sig, int priv) |
2251 | { | 2258 | { |
2252 | return kill_pid(cad_pid, sig, priv); | 2259 | return kill_pid(cad_pid, sig, priv); |
2253 | } | 2260 | } |
2254 | 2261 | ||
2255 | /* These can be the second arg to send_sig_info/send_group_sig_info. */ | 2262 | /* These can be the second arg to send_sig_info/send_group_sig_info. */ |
2256 | #define SEND_SIG_NOINFO ((struct siginfo *) 0) | 2263 | #define SEND_SIG_NOINFO ((struct siginfo *) 0) |
2257 | #define SEND_SIG_PRIV ((struct siginfo *) 1) | 2264 | #define SEND_SIG_PRIV ((struct siginfo *) 1) |
2258 | #define SEND_SIG_FORCED ((struct siginfo *) 2) | 2265 | #define SEND_SIG_FORCED ((struct siginfo *) 2) |
2259 | 2266 | ||
2260 | /* | 2267 | /* |
2261 | * True if we are on the alternate signal stack. | 2268 | * True if we are on the alternate signal stack. |
2262 | */ | 2269 | */ |
2263 | static inline int on_sig_stack(unsigned long sp) | 2270 | static inline int on_sig_stack(unsigned long sp) |
2264 | { | 2271 | { |
2265 | #ifdef CONFIG_STACK_GROWSUP | 2272 | #ifdef CONFIG_STACK_GROWSUP |
2266 | return sp >= current->sas_ss_sp && | 2273 | return sp >= current->sas_ss_sp && |
2267 | sp - current->sas_ss_sp < current->sas_ss_size; | 2274 | sp - current->sas_ss_sp < current->sas_ss_size; |
2268 | #else | 2275 | #else |
2269 | return sp > current->sas_ss_sp && | 2276 | return sp > current->sas_ss_sp && |
2270 | sp - current->sas_ss_sp <= current->sas_ss_size; | 2277 | sp - current->sas_ss_sp <= current->sas_ss_size; |
2271 | #endif | 2278 | #endif |
2272 | } | 2279 | } |
2273 | 2280 | ||
2274 | static inline int sas_ss_flags(unsigned long sp) | 2281 | static inline int sas_ss_flags(unsigned long sp) |
2275 | { | 2282 | { |
2276 | return (current->sas_ss_size == 0 ? SS_DISABLE | 2283 | return (current->sas_ss_size == 0 ? SS_DISABLE |
2277 | : on_sig_stack(sp) ? SS_ONSTACK : 0); | 2284 | : on_sig_stack(sp) ? SS_ONSTACK : 0); |
2278 | } | 2285 | } |
2279 | 2286 | ||
2280 | /* | 2287 | /* |
2281 | * Routines for handling mm_structs | 2288 | * Routines for handling mm_structs |
2282 | */ | 2289 | */ |
2283 | extern struct mm_struct * mm_alloc(void); | 2290 | extern struct mm_struct * mm_alloc(void); |
2284 | 2291 | ||
2285 | /* mmdrop drops the mm and the page tables */ | 2292 | /* mmdrop drops the mm and the page tables */ |
2286 | extern void __mmdrop(struct mm_struct *); | 2293 | extern void __mmdrop(struct mm_struct *); |
2287 | static inline void mmdrop(struct mm_struct * mm) | 2294 | static inline void mmdrop(struct mm_struct * mm) |
2288 | { | 2295 | { |
2289 | if (unlikely(atomic_dec_and_test(&mm->mm_count))) | 2296 | if (unlikely(atomic_dec_and_test(&mm->mm_count))) |
2290 | __mmdrop(mm); | 2297 | __mmdrop(mm); |
2291 | } | 2298 | } |
2292 | 2299 | ||
2293 | /* mmput gets rid of the mappings and all user-space */ | 2300 | /* mmput gets rid of the mappings and all user-space */ |
2294 | extern void mmput(struct mm_struct *); | 2301 | extern void mmput(struct mm_struct *); |
2295 | /* Grab a reference to a task's mm, if it is not already going away */ | 2302 | /* Grab a reference to a task's mm, if it is not already going away */ |
2296 | extern struct mm_struct *get_task_mm(struct task_struct *task); | 2303 | extern struct mm_struct *get_task_mm(struct task_struct *task); |
2297 | /* | 2304 | /* |
2298 | * Grab a reference to a task's mm, if it is not already going away | 2305 | * Grab a reference to a task's mm, if it is not already going away |
2299 | * and ptrace_may_access with the mode parameter passed to it | 2306 | * and ptrace_may_access with the mode parameter passed to it |
2300 | * succeeds. | 2307 | * succeeds. |
2301 | */ | 2308 | */ |
2302 | extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); | 2309 | extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); |
2303 | /* Remove the current task's stale references to the old mm_struct */ | 2310 | /* Remove the current task's stale references to the old mm_struct */ |
2304 | extern void mm_release(struct task_struct *, struct mm_struct *); | 2311 | extern void mm_release(struct task_struct *, struct mm_struct *); |
2305 | /* Allocate a new mm structure and copy contents from tsk->mm */ | 2312 | /* Allocate a new mm structure and copy contents from tsk->mm */ |
2306 | extern struct mm_struct *dup_mm(struct task_struct *tsk); | 2313 | extern struct mm_struct *dup_mm(struct task_struct *tsk); |
2307 | 2314 | ||
2308 | extern int copy_thread(unsigned long, unsigned long, unsigned long, | 2315 | extern int copy_thread(unsigned long, unsigned long, unsigned long, |
2309 | struct task_struct *, struct pt_regs *); | 2316 | struct task_struct *, struct pt_regs *); |
2310 | extern void flush_thread(void); | 2317 | extern void flush_thread(void); |
2311 | extern void exit_thread(void); | 2318 | extern void exit_thread(void); |
2312 | 2319 | ||
2313 | extern void exit_files(struct task_struct *); | 2320 | extern void exit_files(struct task_struct *); |
2314 | extern void __cleanup_sighand(struct sighand_struct *); | 2321 | extern void __cleanup_sighand(struct sighand_struct *); |
2315 | 2322 | ||
2316 | extern void exit_itimers(struct signal_struct *); | 2323 | extern void exit_itimers(struct signal_struct *); |
2317 | extern void flush_itimer_signals(void); | 2324 | extern void flush_itimer_signals(void); |
2318 | 2325 | ||
2319 | extern void do_group_exit(int); | 2326 | extern void do_group_exit(int); |
2320 | 2327 | ||
2321 | extern void daemonize(const char *, ...); | 2328 | extern void daemonize(const char *, ...); |
2322 | extern int allow_signal(int); | 2329 | extern int allow_signal(int); |
2323 | extern int disallow_signal(int); | 2330 | extern int disallow_signal(int); |
2324 | 2331 | ||
2325 | extern int do_execve(const char *, | 2332 | extern int do_execve(const char *, |
2326 | const char __user * const __user *, | 2333 | const char __user * const __user *, |
2327 | const char __user * const __user *, struct pt_regs *); | 2334 | const char __user * const __user *, struct pt_regs *); |
2328 | extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); | 2335 | extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); |
2329 | struct task_struct *fork_idle(int); | 2336 | struct task_struct *fork_idle(int); |
2330 | 2337 | ||
2331 | extern void set_task_comm(struct task_struct *tsk, char *from); | 2338 | extern void set_task_comm(struct task_struct *tsk, char *from); |
2332 | extern char *get_task_comm(char *to, struct task_struct *tsk); | 2339 | extern char *get_task_comm(char *to, struct task_struct *tsk); |
2333 | 2340 | ||
2334 | #ifdef CONFIG_SMP | 2341 | #ifdef CONFIG_SMP |
2335 | void scheduler_ipi(void); | 2342 | void scheduler_ipi(void); |
2336 | extern unsigned long wait_task_inactive(struct task_struct *, long match_state); | 2343 | extern unsigned long wait_task_inactive(struct task_struct *, long match_state); |
2337 | #else | 2344 | #else |
2338 | static inline void scheduler_ipi(void) { } | 2345 | static inline void scheduler_ipi(void) { } |
2339 | static inline unsigned long wait_task_inactive(struct task_struct *p, | 2346 | static inline unsigned long wait_task_inactive(struct task_struct *p, |
2340 | long match_state) | 2347 | long match_state) |
2341 | { | 2348 | { |
2342 | return 1; | 2349 | return 1; |
2343 | } | 2350 | } |
2344 | #endif | 2351 | #endif |
2345 | 2352 | ||
2346 | #define next_task(p) \ | 2353 | #define next_task(p) \ |
2347 | list_entry_rcu((p)->tasks.next, struct task_struct, tasks) | 2354 | list_entry_rcu((p)->tasks.next, struct task_struct, tasks) |
2348 | 2355 | ||
2349 | #define for_each_process(p) \ | 2356 | #define for_each_process(p) \ |
2350 | for (p = &init_task ; (p = next_task(p)) != &init_task ; ) | 2357 | for (p = &init_task ; (p = next_task(p)) != &init_task ; ) |
2351 | 2358 | ||
2352 | extern bool current_is_single_threaded(void); | 2359 | extern bool current_is_single_threaded(void); |
2353 | 2360 | ||
2354 | /* | 2361 | /* |
2355 | * Careful: do_each_thread/while_each_thread is a double loop so | 2362 | * Careful: do_each_thread/while_each_thread is a double loop so |
2356 | * 'break' will not work as expected - use goto instead. | 2363 | * 'break' will not work as expected - use goto instead. |
2357 | */ | 2364 | */ |
2358 | #define do_each_thread(g, t) \ | 2365 | #define do_each_thread(g, t) \ |
2359 | for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do | 2366 | for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do |
2360 | 2367 | ||
2361 | #define while_each_thread(g, t) \ | 2368 | #define while_each_thread(g, t) \ |
2362 | while ((t = next_thread(t)) != g) | 2369 | while ((t = next_thread(t)) != g) |
2363 | 2370 | ||
2364 | static inline int get_nr_threads(struct task_struct *tsk) | 2371 | static inline int get_nr_threads(struct task_struct *tsk) |
2365 | { | 2372 | { |
2366 | return tsk->signal->nr_threads; | 2373 | return tsk->signal->nr_threads; |
2367 | } | 2374 | } |
2368 | 2375 | ||
2369 | static inline bool thread_group_leader(struct task_struct *p) | 2376 | static inline bool thread_group_leader(struct task_struct *p) |
2370 | { | 2377 | { |
2371 | return p->exit_signal >= 0; | 2378 | return p->exit_signal >= 0; |
2372 | } | 2379 | } |
2373 | 2380 | ||
2374 | /* Due to the insanities of de_thread it is possible for a process | 2381 | /* Due to the insanities of de_thread it is possible for a process |
2375 | * to have the pid of the thread group leader without actually being | 2382 | * to have the pid of the thread group leader without actually being |
2376 | * the thread group leader. For iteration through the pids in proc | 2383 | * the thread group leader. For iteration through the pids in proc |
2377 | * all we care about is that we have a task with the appropriate | 2384 | * all we care about is that we have a task with the appropriate |
2378 | * pid, we don't actually care if we have the right task. | 2385 | * pid, we don't actually care if we have the right task. |
2379 | */ | 2386 | */ |
2380 | static inline int has_group_leader_pid(struct task_struct *p) | 2387 | static inline int has_group_leader_pid(struct task_struct *p) |
2381 | { | 2388 | { |
2382 | return p->pid == p->tgid; | 2389 | return p->pid == p->tgid; |
2383 | } | 2390 | } |
2384 | 2391 | ||
2385 | static inline | 2392 | static inline |
2386 | int same_thread_group(struct task_struct *p1, struct task_struct *p2) | 2393 | int same_thread_group(struct task_struct *p1, struct task_struct *p2) |
2387 | { | 2394 | { |
2388 | return p1->tgid == p2->tgid; | 2395 | return p1->tgid == p2->tgid; |
2389 | } | 2396 | } |
2390 | 2397 | ||
2391 | static inline struct task_struct *next_thread(const struct task_struct *p) | 2398 | static inline struct task_struct *next_thread(const struct task_struct *p) |
2392 | { | 2399 | { |
2393 | return list_entry_rcu(p->thread_group.next, | 2400 | return list_entry_rcu(p->thread_group.next, |
2394 | struct task_struct, thread_group); | 2401 | struct task_struct, thread_group); |
2395 | } | 2402 | } |
2396 | 2403 | ||
2397 | static inline int thread_group_empty(struct task_struct *p) | 2404 | static inline int thread_group_empty(struct task_struct *p) |
2398 | { | 2405 | { |
2399 | return list_empty(&p->thread_group); | 2406 | return list_empty(&p->thread_group); |
2400 | } | 2407 | } |
2401 | 2408 | ||
2402 | #define delay_group_leader(p) \ | 2409 | #define delay_group_leader(p) \ |
2403 | (thread_group_leader(p) && !thread_group_empty(p)) | 2410 | (thread_group_leader(p) && !thread_group_empty(p)) |
2404 | 2411 | ||
2405 | /* | 2412 | /* |
2406 | * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring | 2413 | * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring |
2407 | * subscriptions and synchronises with wait4(). Also used in procfs. Also | 2414 | * subscriptions and synchronises with wait4(). Also used in procfs. Also |
2408 | * pins the final release of task.io_context. Also protects ->cpuset and | 2415 | * pins the final release of task.io_context. Also protects ->cpuset and |
2409 | * ->cgroup.subsys[]. And ->vfork_done. | 2416 | * ->cgroup.subsys[]. And ->vfork_done. |
2410 | * | 2417 | * |
2411 | * Nests both inside and outside of read_lock(&tasklist_lock). | 2418 | * Nests both inside and outside of read_lock(&tasklist_lock). |
2412 | * It must not be nested with write_lock_irq(&tasklist_lock), | 2419 | * It must not be nested with write_lock_irq(&tasklist_lock), |
2413 | * neither inside nor outside. | 2420 | * neither inside nor outside. |
2414 | */ | 2421 | */ |
2415 | static inline void task_lock(struct task_struct *p) | 2422 | static inline void task_lock(struct task_struct *p) |
2416 | { | 2423 | { |
2417 | spin_lock(&p->alloc_lock); | 2424 | spin_lock(&p->alloc_lock); |
2418 | } | 2425 | } |
2419 | 2426 | ||
2420 | static inline void task_unlock(struct task_struct *p) | 2427 | static inline void task_unlock(struct task_struct *p) |
2421 | { | 2428 | { |
2422 | spin_unlock(&p->alloc_lock); | 2429 | spin_unlock(&p->alloc_lock); |
2423 | } | 2430 | } |
2424 | 2431 | ||
2425 | extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, | 2432 | extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, |
2426 | unsigned long *flags); | 2433 | unsigned long *flags); |
2427 | 2434 | ||
2428 | static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, | 2435 | static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, |
2429 | unsigned long *flags) | 2436 | unsigned long *flags) |
2430 | { | 2437 | { |
2431 | struct sighand_struct *ret; | 2438 | struct sighand_struct *ret; |
2432 | 2439 | ||
2433 | ret = __lock_task_sighand(tsk, flags); | 2440 | ret = __lock_task_sighand(tsk, flags); |
2434 | (void)__cond_lock(&tsk->sighand->siglock, ret); | 2441 | (void)__cond_lock(&tsk->sighand->siglock, ret); |
2435 | return ret; | 2442 | return ret; |
2436 | } | 2443 | } |
2437 | 2444 | ||
2438 | static inline void unlock_task_sighand(struct task_struct *tsk, | 2445 | static inline void unlock_task_sighand(struct task_struct *tsk, |
2439 | unsigned long *flags) | 2446 | unsigned long *flags) |
2440 | { | 2447 | { |
2441 | spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); | 2448 | spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); |
2442 | } | 2449 | } |
2443 | 2450 | ||
2444 | #ifdef CONFIG_CGROUPS | 2451 | #ifdef CONFIG_CGROUPS |
2445 | static inline void threadgroup_change_begin(struct task_struct *tsk) | 2452 | static inline void threadgroup_change_begin(struct task_struct *tsk) |
2446 | { | 2453 | { |
2447 | down_read(&tsk->signal->group_rwsem); | 2454 | down_read(&tsk->signal->group_rwsem); |
2448 | } | 2455 | } |
2449 | static inline void threadgroup_change_end(struct task_struct *tsk) | 2456 | static inline void threadgroup_change_end(struct task_struct *tsk) |
2450 | { | 2457 | { |
2451 | up_read(&tsk->signal->group_rwsem); | 2458 | up_read(&tsk->signal->group_rwsem); |
2452 | } | 2459 | } |
2453 | 2460 | ||
2454 | /** | 2461 | /** |
2455 | * threadgroup_lock - lock threadgroup | 2462 | * threadgroup_lock - lock threadgroup |
2456 | * @tsk: member task of the threadgroup to lock | 2463 | * @tsk: member task of the threadgroup to lock |
2457 | * | 2464 | * |
2458 | * Lock the threadgroup @tsk belongs to. No new task is allowed to enter | 2465 | * Lock the threadgroup @tsk belongs to. No new task is allowed to enter |
2459 | * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or | 2466 | * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or |
2460 | * perform exec. This is useful for cases where the threadgroup needs to | 2467 | * perform exec. This is useful for cases where the threadgroup needs to |
2461 | * stay stable across blockable operations. | 2468 | * stay stable across blockable operations. |
2462 | * | 2469 | * |
2463 | * fork and exit paths explicitly call threadgroup_change_{begin|end}() for | 2470 | * fork and exit paths explicitly call threadgroup_change_{begin|end}() for |
2464 | * synchronization. While held, no new task will be added to threadgroup | 2471 | * synchronization. While held, no new task will be added to threadgroup |
2465 | * and no existing live task will have its PF_EXITING set. | 2472 | * and no existing live task will have its PF_EXITING set. |
2466 | * | 2473 | * |
2467 | * During exec, a task goes and puts its thread group through unusual | 2474 | * During exec, a task goes and puts its thread group through unusual |
2468 | * changes. After de-threading, exclusive access is assumed to resources | 2475 | * changes. After de-threading, exclusive access is assumed to resources |
2469 | * which are usually shared by tasks in the same group - e.g. sighand may | 2476 | * which are usually shared by tasks in the same group - e.g. sighand may |
2470 | * be replaced with a new one. Also, the exec'ing task takes over group | 2477 | * be replaced with a new one. Also, the exec'ing task takes over group |
2471 | * leader role including its pid. Exclude these changes while locked by | 2478 | * leader role including its pid. Exclude these changes while locked by |
2472 | * grabbing cred_guard_mutex which is used to synchronize exec path. | 2479 | * grabbing cred_guard_mutex which is used to synchronize exec path. |
2473 | */ | 2480 | */ |
2474 | static inline void threadgroup_lock(struct task_struct *tsk) | 2481 | static inline void threadgroup_lock(struct task_struct *tsk) |
2475 | { | 2482 | { |
2476 | /* | 2483 | /* |
2477 | * exec uses exit for de-threading nesting group_rwsem inside | 2484 | * exec uses exit for de-threading nesting group_rwsem inside |
2478 | * cred_guard_mutex. Grab cred_guard_mutex first. | 2485 | * cred_guard_mutex. Grab cred_guard_mutex first. |
2479 | */ | 2486 | */ |
2480 | mutex_lock(&tsk->signal->cred_guard_mutex); | 2487 | mutex_lock(&tsk->signal->cred_guard_mutex); |
2481 | down_write(&tsk->signal->group_rwsem); | 2488 | down_write(&tsk->signal->group_rwsem); |
2482 | } | 2489 | } |
2483 | 2490 | ||
2484 | /** | 2491 | /** |
2485 | * threadgroup_unlock - unlock threadgroup | 2492 | * threadgroup_unlock - unlock threadgroup |
2486 | * @tsk: member task of the threadgroup to unlock | 2493 | * @tsk: member task of the threadgroup to unlock |
2487 | * | 2494 | * |
2488 | * Reverse threadgroup_lock(). | 2495 | * Reverse threadgroup_lock(). |
2489 | */ | 2496 | */ |
2490 | static inline void threadgroup_unlock(struct task_struct *tsk) | 2497 | static inline void threadgroup_unlock(struct task_struct *tsk) |
2491 | { | 2498 | { |
2492 | up_write(&tsk->signal->group_rwsem); | 2499 | up_write(&tsk->signal->group_rwsem); |
2493 | mutex_unlock(&tsk->signal->cred_guard_mutex); | 2500 | mutex_unlock(&tsk->signal->cred_guard_mutex); |
2494 | } | 2501 | } |
2495 | #else | 2502 | #else |
2496 | static inline void threadgroup_change_begin(struct task_struct *tsk) {} | 2503 | static inline void threadgroup_change_begin(struct task_struct *tsk) {} |
2497 | static inline void threadgroup_change_end(struct task_struct *tsk) {} | 2504 | static inline void threadgroup_change_end(struct task_struct *tsk) {} |
2498 | static inline void threadgroup_lock(struct task_struct *tsk) {} | 2505 | static inline void threadgroup_lock(struct task_struct *tsk) {} |
2499 | static inline void threadgroup_unlock(struct task_struct *tsk) {} | 2506 | static inline void threadgroup_unlock(struct task_struct *tsk) {} |
2500 | #endif | 2507 | #endif |
2501 | 2508 | ||
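/*
 * A minimal usage sketch for the threadgroup_lock() API documented above
 * (illustration only, not part of this diff; sketch_walk_threadgroup() and
 * examine_one_thread() are hypothetical names):
 */
static void sketch_walk_threadgroup(struct task_struct *leader)
{
	struct task_struct *t;

	threadgroup_lock(leader);	/* no exec or exit in the group from here on */
	rcu_read_lock();
	t = leader;
	do {
		examine_one_thread(t);	/* hypothetical per-thread work */
	} while_each_thread(leader, t);
	rcu_read_unlock();
	threadgroup_unlock(leader);
}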
2502 | #ifndef __HAVE_THREAD_FUNCTIONS | 2509 | #ifndef __HAVE_THREAD_FUNCTIONS |
2503 | 2510 | ||
2504 | #define task_thread_info(task) ((struct thread_info *)(task)->stack) | 2511 | #define task_thread_info(task) ((struct thread_info *)(task)->stack) |
2505 | #define task_stack_page(task) ((task)->stack) | 2512 | #define task_stack_page(task) ((task)->stack) |
2506 | 2513 | ||
2507 | static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) | 2514 | static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) |
2508 | { | 2515 | { |
2509 | *task_thread_info(p) = *task_thread_info(org); | 2516 | *task_thread_info(p) = *task_thread_info(org); |
2510 | task_thread_info(p)->task = p; | 2517 | task_thread_info(p)->task = p; |
2511 | } | 2518 | } |
2512 | 2519 | ||
2513 | static inline unsigned long *end_of_stack(struct task_struct *p) | 2520 | static inline unsigned long *end_of_stack(struct task_struct *p) |
2514 | { | 2521 | { |
2515 | return (unsigned long *)(task_thread_info(p) + 1); | 2522 | return (unsigned long *)(task_thread_info(p) + 1); |
2516 | } | 2523 | } |
2517 | 2524 | ||
2518 | #endif | 2525 | #endif |
2519 | 2526 | ||
2520 | static inline int object_is_on_stack(void *obj) | 2527 | static inline int object_is_on_stack(void *obj) |
2521 | { | 2528 | { |
2522 | void *stack = task_stack_page(current); | 2529 | void *stack = task_stack_page(current); |
2523 | 2530 | ||
2524 | return (obj >= stack) && (obj < (stack + THREAD_SIZE)); | 2531 | return (obj >= stack) && (obj < (stack + THREAD_SIZE)); |
2525 | } | 2532 | } |
2526 | 2533 | ||
2527 | extern void thread_info_cache_init(void); | 2534 | extern void thread_info_cache_init(void); |
2528 | 2535 | ||
2529 | #ifdef CONFIG_DEBUG_STACK_USAGE | 2536 | #ifdef CONFIG_DEBUG_STACK_USAGE |
2530 | static inline unsigned long stack_not_used(struct task_struct *p) | 2537 | static inline unsigned long stack_not_used(struct task_struct *p) |
2531 | { | 2538 | { |
2532 | unsigned long *n = end_of_stack(p); | 2539 | unsigned long *n = end_of_stack(p); |
2533 | 2540 | ||
2534 | do { /* Skip over canary */ | 2541 | do { /* Skip over canary */ |
2535 | n++; | 2542 | n++; |
2536 | } while (!*n); | 2543 | } while (!*n); |
2537 | 2544 | ||
2538 | return (unsigned long)n - (unsigned long)end_of_stack(p); | 2545 | return (unsigned long)n - (unsigned long)end_of_stack(p); |
2539 | } | 2546 | } |
2540 | #endif | 2547 | #endif |
2541 | 2548 | ||
2542 | /* set thread flags in other task's structures | 2549 | /* set thread flags in other task's structures |
2543 | * - see asm/thread_info.h for TIF_xxxx flags available | 2550 | * - see asm/thread_info.h for TIF_xxxx flags available |
2544 | */ | 2551 | */ |
2545 | static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) | 2552 | static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) |
2546 | { | 2553 | { |
2547 | set_ti_thread_flag(task_thread_info(tsk), flag); | 2554 | set_ti_thread_flag(task_thread_info(tsk), flag); |
2548 | } | 2555 | } |
2549 | 2556 | ||
2550 | static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) | 2557 | static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) |
2551 | { | 2558 | { |
2552 | clear_ti_thread_flag(task_thread_info(tsk), flag); | 2559 | clear_ti_thread_flag(task_thread_info(tsk), flag); |
2553 | } | 2560 | } |
2554 | 2561 | ||
2555 | static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) | 2562 | static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) |
2556 | { | 2563 | { |
2557 | return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); | 2564 | return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); |
2558 | } | 2565 | } |
2559 | 2566 | ||
2560 | static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) | 2567 | static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) |
2561 | { | 2568 | { |
2562 | return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); | 2569 | return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); |
2563 | } | 2570 | } |
2564 | 2571 | ||
2565 | static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) | 2572 | static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) |
2566 | { | 2573 | { |
2567 | return test_ti_thread_flag(task_thread_info(tsk), flag); | 2574 | return test_ti_thread_flag(task_thread_info(tsk), flag); |
2568 | } | 2575 | } |
2569 | 2576 | ||
2570 | static inline void set_tsk_need_resched(struct task_struct *tsk) | 2577 | static inline void set_tsk_need_resched(struct task_struct *tsk) |
2571 | { | 2578 | { |
2572 | set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); | 2579 | set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); |
2573 | } | 2580 | } |
2574 | 2581 | ||
2575 | static inline void clear_tsk_need_resched(struct task_struct *tsk) | 2582 | static inline void clear_tsk_need_resched(struct task_struct *tsk) |
2576 | { | 2583 | { |
2577 | clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); | 2584 | clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); |
2578 | } | 2585 | } |
2579 | 2586 | ||
2580 | static inline int test_tsk_need_resched(struct task_struct *tsk) | 2587 | static inline int test_tsk_need_resched(struct task_struct *tsk) |
2581 | { | 2588 | { |
2582 | return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); | 2589 | return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); |
2583 | } | 2590 | } |
2584 | 2591 | ||
2585 | static inline int restart_syscall(void) | 2592 | static inline int restart_syscall(void) |
2586 | { | 2593 | { |
2587 | set_tsk_thread_flag(current, TIF_SIGPENDING); | 2594 | set_tsk_thread_flag(current, TIF_SIGPENDING); |
2588 | return -ERESTARTNOINTR; | 2595 | return -ERESTARTNOINTR; |
2589 | } | 2596 | } |
2590 | 2597 | ||
2591 | static inline int signal_pending(struct task_struct *p) | 2598 | static inline int signal_pending(struct task_struct *p) |
2592 | { | 2599 | { |
2593 | return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); | 2600 | return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); |
2594 | } | 2601 | } |
2595 | 2602 | ||
2596 | static inline int __fatal_signal_pending(struct task_struct *p) | 2603 | static inline int __fatal_signal_pending(struct task_struct *p) |
2597 | { | 2604 | { |
2598 | return unlikely(sigismember(&p->pending.signal, SIGKILL)); | 2605 | return unlikely(sigismember(&p->pending.signal, SIGKILL)); |
2599 | } | 2606 | } |
2600 | 2607 | ||
2601 | static inline int fatal_signal_pending(struct task_struct *p) | 2608 | static inline int fatal_signal_pending(struct task_struct *p) |
2602 | { | 2609 | { |
2603 | return signal_pending(p) && __fatal_signal_pending(p); | 2610 | return signal_pending(p) && __fatal_signal_pending(p); |
2604 | } | 2611 | } |
2605 | 2612 | ||
2606 | static inline int signal_pending_state(long state, struct task_struct *p) | 2613 | static inline int signal_pending_state(long state, struct task_struct *p) |
2607 | { | 2614 | { |
2608 | if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) | 2615 | if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) |
2609 | return 0; | 2616 | return 0; |
2610 | if (!signal_pending(p)) | 2617 | if (!signal_pending(p)) |
2611 | return 0; | 2618 | return 0; |
2612 | 2619 | ||
2613 | return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); | 2620 | return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); |
2614 | } | 2621 | } |
2615 | 2622 | ||
2616 | static inline int need_resched(void) | 2623 | static inline int need_resched(void) |
2617 | { | 2624 | { |
2618 | return unlikely(test_thread_flag(TIF_NEED_RESCHED)); | 2625 | return unlikely(test_thread_flag(TIF_NEED_RESCHED)); |
2619 | } | 2626 | } |
2620 | 2627 | ||
2621 | /* | 2628 | /* |
2622 | * cond_resched() and cond_resched_lock(): latency reduction via | 2629 | * cond_resched() and cond_resched_lock(): latency reduction via |
2623 | * explicit rescheduling in places that are safe. The return | 2630 | * explicit rescheduling in places that are safe. The return |
2624 | * value indicates whether a reschedule was done in fact. | 2631 | * value indicates whether a reschedule was done in fact. |
2625 | * cond_resched_lock() will drop the spinlock before scheduling, | 2632 | * cond_resched_lock() will drop the spinlock before scheduling, |
2626 | * cond_resched_softirq() will enable bhs before scheduling. | 2633 | * cond_resched_softirq() will enable bhs before scheduling. |
2627 | */ | 2634 | */ |
2628 | extern int _cond_resched(void); | 2635 | extern int _cond_resched(void); |
2629 | 2636 | ||
2630 | #define cond_resched() ({ \ | 2637 | #define cond_resched() ({ \ |
2631 | __might_sleep(__FILE__, __LINE__, 0); \ | 2638 | __might_sleep(__FILE__, __LINE__, 0); \ |
2632 | _cond_resched(); \ | 2639 | _cond_resched(); \ |
2633 | }) | 2640 | }) |
2634 | 2641 | ||
2635 | extern int __cond_resched_lock(spinlock_t *lock); | 2642 | extern int __cond_resched_lock(spinlock_t *lock); |
2636 | 2643 | ||
2637 | #ifdef CONFIG_PREEMPT_COUNT | 2644 | #ifdef CONFIG_PREEMPT_COUNT |
2638 | #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET | 2645 | #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET |
2639 | #else | 2646 | #else |
2640 | #define PREEMPT_LOCK_OFFSET 0 | 2647 | #define PREEMPT_LOCK_OFFSET 0 |
2641 | #endif | 2648 | #endif |
2642 | 2649 | ||
2643 | #define cond_resched_lock(lock) ({ \ | 2650 | #define cond_resched_lock(lock) ({ \ |
2644 | __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ | 2651 | __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ |
2645 | __cond_resched_lock(lock); \ | 2652 | __cond_resched_lock(lock); \ |
2646 | }) | 2653 | }) |
2647 | 2654 | ||
2648 | extern int __cond_resched_softirq(void); | 2655 | extern int __cond_resched_softirq(void); |
2649 | 2656 | ||
2650 | #define cond_resched_softirq() ({ \ | 2657 | #define cond_resched_softirq() ({ \ |
2651 | __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ | 2658 | __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ |
2652 | __cond_resched_softirq(); \ | 2659 | __cond_resched_softirq(); \ |
2653 | }) | 2660 | }) |
2654 | 2661 | ||
2655 | /* | 2662 | /* |
2656 | * Does a critical section need to be broken due to another | 2663 | * Does a critical section need to be broken due to another |
2657 | * task waiting?: (technically does not depend on CONFIG_PREEMPT, | 2664 | * task waiting?: (technically does not depend on CONFIG_PREEMPT, |
2658 | * but a general need for low latency) | 2665 | * but a general need for low latency) |
2659 | */ | 2666 | */ |
2660 | static inline int spin_needbreak(spinlock_t *lock) | 2667 | static inline int spin_needbreak(spinlock_t *lock) |
2661 | { | 2668 | { |
2662 | #ifdef CONFIG_PREEMPT | 2669 | #ifdef CONFIG_PREEMPT |
2663 | return spin_is_contended(lock); | 2670 | return spin_is_contended(lock); |
2664 | #else | 2671 | #else |
2665 | return 0; | 2672 | return 0; |
2666 | #endif | 2673 | #endif |
2667 | } | 2674 | } |
2668 | 2675 | ||
2669 | /* | 2676 | /* |
2670 | * Thread group CPU time accounting. | 2677 | * Thread group CPU time accounting. |
2671 | */ | 2678 | */ |
2672 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); | 2679 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); |
2673 | void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); | 2680 | void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); |
2674 | 2681 | ||
2675 | static inline void thread_group_cputime_init(struct signal_struct *sig) | 2682 | static inline void thread_group_cputime_init(struct signal_struct *sig) |
2676 | { | 2683 | { |
2677 | raw_spin_lock_init(&sig->cputimer.lock); | 2684 | raw_spin_lock_init(&sig->cputimer.lock); |
2678 | } | 2685 | } |
2679 | 2686 | ||
2680 | /* | 2687 | /* |
2681 | * Reevaluate whether the task has signals pending delivery. | 2688 | * Reevaluate whether the task has signals pending delivery. |
2682 | * Wake the task if so. | 2689 | * Wake the task if so. |
2683 | * This is required every time the blocked sigset_t changes. | 2690 | * This is required every time the blocked sigset_t changes. |
2684 | * callers must hold sighand->siglock. | 2691 | * callers must hold sighand->siglock. |
2685 | */ | 2692 | */ |
2686 | extern void recalc_sigpending_and_wake(struct task_struct *t); | 2693 | extern void recalc_sigpending_and_wake(struct task_struct *t); |
2687 | extern void recalc_sigpending(void); | 2694 | extern void recalc_sigpending(void); |
2688 | 2695 | ||
2689 | extern void signal_wake_up(struct task_struct *t, int resume_stopped); | 2696 | extern void signal_wake_up(struct task_struct *t, int resume_stopped); |
2690 | 2697 | ||
2691 | /* | 2698 | /* |
2692 | * Wrappers for p->thread_info->cpu access. No-op on UP. | 2699 | * Wrappers for p->thread_info->cpu access. No-op on UP. |
2693 | */ | 2700 | */ |
2694 | #ifdef CONFIG_SMP | 2701 | #ifdef CONFIG_SMP |
2695 | 2702 | ||
2696 | static inline unsigned int task_cpu(const struct task_struct *p) | 2703 | static inline unsigned int task_cpu(const struct task_struct *p) |
2697 | { | 2704 | { |
2698 | return task_thread_info(p)->cpu; | 2705 | return task_thread_info(p)->cpu; |
2699 | } | 2706 | } |
2700 | 2707 | ||
2701 | extern void set_task_cpu(struct task_struct *p, unsigned int cpu); | 2708 | extern void set_task_cpu(struct task_struct *p, unsigned int cpu); |
2702 | 2709 | ||
2703 | #else | 2710 | #else |
2704 | 2711 | ||
2705 | static inline unsigned int task_cpu(const struct task_struct *p) | 2712 | static inline unsigned int task_cpu(const struct task_struct *p) |
2706 | { | 2713 | { |
2707 | return 0; | 2714 | return 0; |
2708 | } | 2715 | } |
2709 | 2716 | ||
2710 | static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) | 2717 | static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) |
2711 | { | 2718 | { |
2712 | } | 2719 | } |
2713 | 2720 | ||
2714 | #endif /* CONFIG_SMP */ | 2721 | #endif /* CONFIG_SMP */ |
2715 | 2722 | ||
2716 | extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); | 2723 | extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); |
2717 | extern long sched_getaffinity(pid_t pid, struct cpumask *mask); | 2724 | extern long sched_getaffinity(pid_t pid, struct cpumask *mask); |
2718 | 2725 | ||
2719 | extern void normalize_rt_tasks(void); | 2726 | extern void normalize_rt_tasks(void); |
2720 | 2727 | ||
2721 | #ifdef CONFIG_CGROUP_SCHED | 2728 | #ifdef CONFIG_CGROUP_SCHED |
2722 | 2729 | ||
2723 | extern struct task_group root_task_group; | 2730 | extern struct task_group root_task_group; |
2724 | 2731 | ||
2725 | extern struct task_group *sched_create_group(struct task_group *parent); | 2732 | extern struct task_group *sched_create_group(struct task_group *parent); |
2726 | extern void sched_destroy_group(struct task_group *tg); | 2733 | extern void sched_destroy_group(struct task_group *tg); |
2727 | extern void sched_move_task(struct task_struct *tsk); | 2734 | extern void sched_move_task(struct task_struct *tsk); |
2728 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2735 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2729 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | 2736 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); |
2730 | extern unsigned long sched_group_shares(struct task_group *tg); | 2737 | extern unsigned long sched_group_shares(struct task_group *tg); |
2731 | #endif | 2738 | #endif |
2732 | #ifdef CONFIG_RT_GROUP_SCHED | 2739 | #ifdef CONFIG_RT_GROUP_SCHED |
2733 | extern int sched_group_set_rt_runtime(struct task_group *tg, | 2740 | extern int sched_group_set_rt_runtime(struct task_group *tg, |
2734 | long rt_runtime_us); | 2741 | long rt_runtime_us); |
2735 | extern long sched_group_rt_runtime(struct task_group *tg); | 2742 | extern long sched_group_rt_runtime(struct task_group *tg); |
2736 | extern int sched_group_set_rt_period(struct task_group *tg, | 2743 | extern int sched_group_set_rt_period(struct task_group *tg, |
2737 | long rt_period_us); | 2744 | long rt_period_us); |
2738 | extern long sched_group_rt_period(struct task_group *tg); | 2745 | extern long sched_group_rt_period(struct task_group *tg); |
2739 | extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); | 2746 | extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); |
2740 | #endif | 2747 | #endif |
2741 | #endif /* CONFIG_CGROUP_SCHED */ | 2748 | #endif /* CONFIG_CGROUP_SCHED */ |
2742 | 2749 | ||
2743 | extern int task_can_switch_user(struct user_struct *up, | 2750 | extern int task_can_switch_user(struct user_struct *up, |
2744 | struct task_struct *tsk); | 2751 | struct task_struct *tsk); |
2745 | 2752 | ||
2746 | #ifdef CONFIG_TASK_XACCT | 2753 | #ifdef CONFIG_TASK_XACCT |
2747 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) | 2754 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) |
2748 | { | 2755 | { |
2749 | tsk->ioac.rchar += amt; | 2756 | tsk->ioac.rchar += amt; |
2750 | } | 2757 | } |
2751 | 2758 | ||
2752 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) | 2759 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) |
2753 | { | 2760 | { |
2754 | tsk->ioac.wchar += amt; | 2761 | tsk->ioac.wchar += amt; |
2755 | } | 2762 | } |
2756 | 2763 | ||
2757 | static inline void inc_syscr(struct task_struct *tsk) | 2764 | static inline void inc_syscr(struct task_struct *tsk) |
2758 | { | 2765 | { |
2759 | tsk->ioac.syscr++; | 2766 | tsk->ioac.syscr++; |
2760 | } | 2767 | } |
2761 | 2768 | ||
2762 | static inline void inc_syscw(struct task_struct *tsk) | 2769 | static inline void inc_syscw(struct task_struct *tsk) |
2763 | { | 2770 | { |
2764 | tsk->ioac.syscw++; | 2771 | tsk->ioac.syscw++; |
2765 | } | 2772 | } |
2766 | #else | 2773 | #else |
2767 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) | 2774 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) |
2768 | { | 2775 | { |
2769 | } | 2776 | } |
2770 | 2777 | ||
2771 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) | 2778 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) |
2772 | { | 2779 | { |
2773 | } | 2780 | } |
2774 | 2781 | ||
2775 | static inline void inc_syscr(struct task_struct *tsk) | 2782 | static inline void inc_syscr(struct task_struct *tsk) |
2776 | { | 2783 | { |
2777 | } | 2784 | } |
2778 | 2785 | ||
2779 | static inline void inc_syscw(struct task_struct *tsk) | 2786 | static inline void inc_syscw(struct task_struct *tsk) |
2780 | { | 2787 | { |
2781 | } | 2788 | } |
2782 | #endif | 2789 | #endif |
2783 | 2790 | ||
2784 | #ifndef TASK_SIZE_OF | 2791 | #ifndef TASK_SIZE_OF |
2785 | #define TASK_SIZE_OF(tsk) TASK_SIZE | 2792 | #define TASK_SIZE_OF(tsk) TASK_SIZE |
2786 | #endif | 2793 | #endif |
2787 | 2794 | ||
2788 | #ifdef CONFIG_MM_OWNER | 2795 | #ifdef CONFIG_MM_OWNER |
2789 | extern void mm_update_next_owner(struct mm_struct *mm); | 2796 | extern void mm_update_next_owner(struct mm_struct *mm); |
2790 | extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); | 2797 | extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); |
2791 | #else | 2798 | #else |
2792 | static inline void mm_update_next_owner(struct mm_struct *mm) | 2799 | static inline void mm_update_next_owner(struct mm_struct *mm) |
2793 | { | 2800 | { |
2794 | } | 2801 | } |
2795 | 2802 | ||
2796 | static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | 2803 | static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) |
2797 | { | 2804 | { |
2798 | } | 2805 | } |
2799 | #endif /* CONFIG_MM_OWNER */ | 2806 | #endif /* CONFIG_MM_OWNER */ |
2800 | 2807 | ||
2801 | static inline unsigned long task_rlimit(const struct task_struct *tsk, | 2808 | static inline unsigned long task_rlimit(const struct task_struct *tsk, |
2802 | unsigned int limit) | 2809 | unsigned int limit) |
2803 | { | 2810 | { |
2804 | return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); | 2811 | return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); |
2805 | } | 2812 | } |
2806 | 2813 | ||
2807 | static inline unsigned long task_rlimit_max(const struct task_struct *tsk, | 2814 | static inline unsigned long task_rlimit_max(const struct task_struct *tsk, |
2808 | unsigned int limit) | 2815 | unsigned int limit) |
2809 | { | 2816 | { |
2810 | return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); | 2817 | return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); |
2811 | } | 2818 | } |
2812 | 2819 | ||
2813 | static inline unsigned long rlimit(unsigned int limit) | 2820 | static inline unsigned long rlimit(unsigned int limit) |
2814 | { | 2821 | { |
2815 | return task_rlimit(current, limit); | 2822 | return task_rlimit(current, limit); |
2816 | } | 2823 | } |
2817 | 2824 | ||
2818 | static inline unsigned long rlimit_max(unsigned int limit) | 2825 | static inline unsigned long rlimit_max(unsigned int limit) |
2819 | { | 2826 | { |
2820 | return task_rlimit_max(current, limit); | 2827 | return task_rlimit_max(current, limit); |
2821 | } | 2828 | } |
2822 | 2829 | ||
2823 | #endif /* __KERNEL__ */ | 2830 | #endif /* __KERNEL__ */ |
2824 | 2831 | ||
2825 | #endif | 2832 | #endif |
2826 | 2833 |
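Unlike task_lock(), lock_task_sighand() declared above can fail: it returns NULL when the target task has already been released and has no sighand left, so callers check the return value before touching signal state. A minimal sketch of the usual calling pattern (illustration only, not part of this diff; the function name is hypothetical):

	static void sketch_poke_signals(struct task_struct *tsk)
	{
		struct sighand_struct *sighand;
		unsigned long flags;

		sighand = lock_task_sighand(tsk, &flags);
		if (!sighand)
			return;	/* tsk already released, nothing to lock */

		/* ... inspect or modify signal state under tsk->sighand->siglock ... */

		unlock_task_sighand(tsk, &flags);
	}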
kernel/softirq.c
1 | /* | 1 | /* |
2 | * linux/kernel/softirq.c | 2 | * linux/kernel/softirq.c |
3 | * | 3 | * |
4 | * Copyright (C) 1992 Linus Torvalds | 4 | * Copyright (C) 1992 Linus Torvalds |
5 | * | 5 | * |
6 | * Distribute under GPLv2. | 6 | * Distribute under GPLv2. |
7 | * | 7 | * |
8 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | 8 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) |
9 | * | 9 | * |
10 | * Remote softirq infrastructure is by Jens Axboe. | 10 | * Remote softirq infrastructure is by Jens Axboe. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/export.h> | 13 | #include <linux/export.h> |
14 | #include <linux/kernel_stat.h> | 14 | #include <linux/kernel_stat.h> |
15 | #include <linux/interrupt.h> | 15 | #include <linux/interrupt.h> |
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
18 | #include <linux/notifier.h> | 18 | #include <linux/notifier.h> |
19 | #include <linux/percpu.h> | 19 | #include <linux/percpu.h> |
20 | #include <linux/cpu.h> | 20 | #include <linux/cpu.h> |
21 | #include <linux/freezer.h> | 21 | #include <linux/freezer.h> |
22 | #include <linux/kthread.h> | 22 | #include <linux/kthread.h> |
23 | #include <linux/rcupdate.h> | 23 | #include <linux/rcupdate.h> |
24 | #include <linux/ftrace.h> | 24 | #include <linux/ftrace.h> |
25 | #include <linux/smp.h> | 25 | #include <linux/smp.h> |
26 | #include <linux/tick.h> | 26 | #include <linux/tick.h> |
27 | 27 | ||
28 | #define CREATE_TRACE_POINTS | 28 | #define CREATE_TRACE_POINTS |
29 | #include <trace/events/irq.h> | 29 | #include <trace/events/irq.h> |
30 | 30 | ||
31 | #include <asm/irq.h> | 31 | #include <asm/irq.h> |
32 | /* | 32 | /* |
33 | - No shared variables, all the data are CPU local. | 33 | - No shared variables, all the data are CPU local. |
34 | - If a softirq needs serialization, let it serialize itself | 34 | - If a softirq needs serialization, let it serialize itself |
35 | by its own spinlocks. | 35 | by its own spinlocks. |
36 | - Even if softirq is serialized, only local cpu is marked for | 36 | - Even if softirq is serialized, only local cpu is marked for |
37 | execution. Hence, we get something sort of weak cpu binding. | 37 | execution. Hence, we get something sort of weak cpu binding. |
38 | Though it is still not clear, will it result in better locality | 38 | Though it is still not clear, will it result in better locality |
39 | or will not. | 39 | or will not. |
40 | 40 | ||
41 | Examples: | 41 | Examples: |
42 | - NET RX softirq. It is multithreaded and does not require | 42 | - NET RX softirq. It is multithreaded and does not require |
43 | any global serialization. | 43 | any global serialization. |
44 | - NET TX softirq. It kicks software netdevice queues, hence | 44 | - NET TX softirq. It kicks software netdevice queues, hence |
45 | it is logically serialized per device, but this serialization | 45 | it is logically serialized per device, but this serialization |
46 | is invisible to common code. | 46 | is invisible to common code. |
47 | - Tasklets: serialized wrt itself. | 47 | - Tasklets: serialized wrt itself. |
48 | */ | 48 | */ |
49 | 49 | ||
50 | #ifndef __ARCH_IRQ_STAT | 50 | #ifndef __ARCH_IRQ_STAT |
51 | irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; | 51 | irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; |
52 | EXPORT_SYMBOL(irq_stat); | 52 | EXPORT_SYMBOL(irq_stat); |
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; | 55 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; |
56 | 56 | ||
57 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | 57 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); |
58 | 58 | ||
59 | char *softirq_to_name[NR_SOFTIRQS] = { | 59 | char *softirq_to_name[NR_SOFTIRQS] = { |
60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", | 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", |
61 | "TASKLET", "SCHED", "HRTIMER", "RCU" | 61 | "TASKLET", "SCHED", "HRTIMER", "RCU" |
62 | }; | 62 | }; |
63 | 63 | ||
64 | /* | 64 | /* |
65 | * we cannot loop indefinitely here to avoid userspace starvation, | 65 | * we cannot loop indefinitely here to avoid userspace starvation, |
66 | * but we also don't want to introduce a worst case 1/HZ latency | 66 | * but we also don't want to introduce a worst case 1/HZ latency |
67 | * to the pending events, so let the scheduler balance | 67 | * to the pending events, so let the scheduler balance |
68 | * the softirq load for us. | 68 | * the softirq load for us. |
69 | */ | 69 | */ |
70 | static void wakeup_softirqd(void) | 70 | static void wakeup_softirqd(void) |
71 | { | 71 | { |
72 | /* Interrupts are disabled: no need to stop preemption */ | 72 | /* Interrupts are disabled: no need to stop preemption */ |
73 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); | 73 | struct task_struct *tsk = __this_cpu_read(ksoftirqd); |
74 | 74 | ||
75 | if (tsk && tsk->state != TASK_RUNNING) | 75 | if (tsk && tsk->state != TASK_RUNNING) |
76 | wake_up_process(tsk); | 76 | wake_up_process(tsk); |
77 | } | 77 | } |
78 | 78 | ||
79 | /* | 79 | /* |
80 | * preempt_count and SOFTIRQ_OFFSET usage: | 80 | * preempt_count and SOFTIRQ_OFFSET usage: |
81 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving | 81 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving |
82 | * softirq processing. | 82 | * softirq processing. |
83 | * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) | 83 | * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) |
84 | * on local_bh_disable or local_bh_enable. | 84 | * on local_bh_disable or local_bh_enable. |
85 | * This lets us distinguish between whether we are currently processing | 85 | * This lets us distinguish between whether we are currently processing |
86 | * softirq and whether we just have bh disabled. | 86 | * softirq and whether we just have bh disabled. |
87 | */ | 87 | */ |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * This one is for softirq.c-internal use, | 90 | * This one is for softirq.c-internal use, |
91 | * where hardirqs are disabled legitimately: | 91 | * where hardirqs are disabled legitimately: |
92 | */ | 92 | */ |
93 | #ifdef CONFIG_TRACE_IRQFLAGS | 93 | #ifdef CONFIG_TRACE_IRQFLAGS |
94 | static void __local_bh_disable(unsigned long ip, unsigned int cnt) | 94 | static void __local_bh_disable(unsigned long ip, unsigned int cnt) |
95 | { | 95 | { |
96 | unsigned long flags; | 96 | unsigned long flags; |
97 | 97 | ||
98 | WARN_ON_ONCE(in_irq()); | 98 | WARN_ON_ONCE(in_irq()); |
99 | 99 | ||
100 | raw_local_irq_save(flags); | 100 | raw_local_irq_save(flags); |
101 | /* | 101 | /* |
102 | * The preempt tracer hooks into add_preempt_count and will break | 102 | * The preempt tracer hooks into add_preempt_count and will break |
103 | * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET | 103 | * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET |
104 | * is set and before current->softirq_enabled is cleared. | 104 | * is set and before current->softirq_enabled is cleared. |
105 | * We must manually increment preempt_count here and manually | 105 | * We must manually increment preempt_count here and manually |
106 | * call the trace_preempt_off later. | 106 | * call the trace_preempt_off later. |
107 | */ | 107 | */ |
108 | preempt_count() += cnt; | 108 | preempt_count() += cnt; |
109 | /* | 109 | /* |
110 | * Were softirqs turned off above: | 110 | * Were softirqs turned off above: |
111 | */ | 111 | */ |
112 | if (softirq_count() == cnt) | 112 | if (softirq_count() == cnt) |
113 | trace_softirqs_off(ip); | 113 | trace_softirqs_off(ip); |
114 | raw_local_irq_restore(flags); | 114 | raw_local_irq_restore(flags); |
115 | 115 | ||
116 | if (preempt_count() == cnt) | 116 | if (preempt_count() == cnt) |
117 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 117 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
118 | } | 118 | } |
119 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 119 | #else /* !CONFIG_TRACE_IRQFLAGS */ |
120 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) | 120 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) |
121 | { | 121 | { |
122 | add_preempt_count(cnt); | 122 | add_preempt_count(cnt); |
123 | barrier(); | 123 | barrier(); |
124 | } | 124 | } |
125 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 125 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
126 | 126 | ||
127 | void local_bh_disable(void) | 127 | void local_bh_disable(void) |
128 | { | 128 | { |
129 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 129 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
130 | SOFTIRQ_DISABLE_OFFSET); | 130 | SOFTIRQ_DISABLE_OFFSET); |
131 | } | 131 | } |
132 | 132 | ||
133 | EXPORT_SYMBOL(local_bh_disable); | 133 | EXPORT_SYMBOL(local_bh_disable); |
134 | 134 | ||
135 | static void __local_bh_enable(unsigned int cnt) | 135 | static void __local_bh_enable(unsigned int cnt) |
136 | { | 136 | { |
137 | WARN_ON_ONCE(in_irq()); | 137 | WARN_ON_ONCE(in_irq()); |
138 | WARN_ON_ONCE(!irqs_disabled()); | 138 | WARN_ON_ONCE(!irqs_disabled()); |
139 | 139 | ||
140 | if (softirq_count() == cnt) | 140 | if (softirq_count() == cnt) |
141 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | 141 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); |
142 | sub_preempt_count(cnt); | 142 | sub_preempt_count(cnt); |
143 | } | 143 | } |
144 | 144 | ||
145 | /* | 145 | /* |
146 | * Special-case - softirqs can safely be enabled in | 146 | * Special-case - softirqs can safely be enabled in |
147 | * cond_resched_softirq(), or by __do_softirq(), | 147 | * cond_resched_softirq(), or by __do_softirq(), |
148 | * without processing still-pending softirqs: | 148 | * without processing still-pending softirqs: |
149 | */ | 149 | */ |
150 | void _local_bh_enable(void) | 150 | void _local_bh_enable(void) |
151 | { | 151 | { |
152 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); | 152 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); |
153 | } | 153 | } |
154 | 154 | ||
155 | EXPORT_SYMBOL(_local_bh_enable); | 155 | EXPORT_SYMBOL(_local_bh_enable); |
156 | 156 | ||
157 | static inline void _local_bh_enable_ip(unsigned long ip) | 157 | static inline void _local_bh_enable_ip(unsigned long ip) |
158 | { | 158 | { |
159 | WARN_ON_ONCE(in_irq() || irqs_disabled()); | 159 | WARN_ON_ONCE(in_irq() || irqs_disabled()); |
160 | #ifdef CONFIG_TRACE_IRQFLAGS | 160 | #ifdef CONFIG_TRACE_IRQFLAGS |
161 | local_irq_disable(); | 161 | local_irq_disable(); |
162 | #endif | 162 | #endif |
163 | /* | 163 | /* |
164 | * Are softirqs going to be turned on now: | 164 | * Are softirqs going to be turned on now: |
165 | */ | 165 | */ |
166 | if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) | 166 | if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) |
167 | trace_softirqs_on(ip); | 167 | trace_softirqs_on(ip); |
168 | /* | 168 | /* |
169 | * Keep preemption disabled until we are done with | 169 | * Keep preemption disabled until we are done with |
170 | * softirq processing: | 170 | * softirq processing: |
171 | */ | 171 | */ |
172 | sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); | 172 | sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); |
173 | 173 | ||
174 | if (unlikely(!in_interrupt() && local_softirq_pending())) | 174 | if (unlikely(!in_interrupt() && local_softirq_pending())) |
175 | do_softirq(); | 175 | do_softirq(); |
176 | 176 | ||
177 | dec_preempt_count(); | 177 | dec_preempt_count(); |
178 | #ifdef CONFIG_TRACE_IRQFLAGS | 178 | #ifdef CONFIG_TRACE_IRQFLAGS |
179 | local_irq_enable(); | 179 | local_irq_enable(); |
180 | #endif | 180 | #endif |
181 | preempt_check_resched(); | 181 | preempt_check_resched(); |
182 | } | 182 | } |
183 | 183 | ||
184 | void local_bh_enable(void) | 184 | void local_bh_enable(void) |
185 | { | 185 | { |
186 | _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); | 186 | _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
187 | } | 187 | } |
188 | EXPORT_SYMBOL(local_bh_enable); | 188 | EXPORT_SYMBOL(local_bh_enable); |
189 | 189 | ||
190 | void local_bh_enable_ip(unsigned long ip) | 190 | void local_bh_enable_ip(unsigned long ip) |
191 | { | 191 | { |
192 | _local_bh_enable_ip(ip); | 192 | _local_bh_enable_ip(ip); |
193 | } | 193 | } |
194 | EXPORT_SYMBOL(local_bh_enable_ip); | 194 | EXPORT_SYMBOL(local_bh_enable_ip); |
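The local_bh_disable()/local_bh_enable() pair exported above is the usual way for process context to keep softirqs (and therefore tasklets and timer callbacks) off the local CPU while touching data shared with them. A minimal, hypothetical sketch of that pattern follows; it is not part of this diff, and the demo_* names are invented for illustration:

#include <linux/spinlock.h>
#include <linux/interrupt.h>

static DEFINE_SPINLOCK(demo_lock);
static unsigned long demo_events;

static void demo_update(void)
{
	local_bh_disable();		/* keep softirqs off this CPU */
	spin_lock(&demo_lock);		/* serialize against other CPUs */
	demo_events++;
	spin_unlock(&demo_lock);
	local_bh_enable();		/* pending softirqs may run here */
}

In practice most callers use spin_lock_bh()/spin_unlock_bh(), which combine the two steps.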
195 | 195 | ||
196 | /* | 196 | /* |
197 | * We restart softirq processing MAX_SOFTIRQ_RESTART times, | 197 | * We restart softirq processing MAX_SOFTIRQ_RESTART times, |
198 | * and we fall back to softirqd after that. | 198 | * and we fall back to softirqd after that. |
199 | * | 199 | * |
200 | * This number has been established via experimentation. | 200 | * This number has been established via experimentation. |
201 | * The two things to balance is latency against fairness - | 201 | * The two things to balance is latency against fairness - |
202 | * we want to handle softirqs as soon as possible, but they | 202 | * we want to handle softirqs as soon as possible, but they |
203 | * should not be able to lock up the box. | 203 | * should not be able to lock up the box. |
204 | */ | 204 | */ |
205 | #define MAX_SOFTIRQ_RESTART 10 | 205 | #define MAX_SOFTIRQ_RESTART 10 |
206 | 206 | ||
207 | asmlinkage void __do_softirq(void) | 207 | asmlinkage void __do_softirq(void) |
208 | { | 208 | { |
209 | struct softirq_action *h; | 209 | struct softirq_action *h; |
210 | __u32 pending; | 210 | __u32 pending; |
211 | int max_restart = MAX_SOFTIRQ_RESTART; | 211 | int max_restart = MAX_SOFTIRQ_RESTART; |
212 | int cpu; | 212 | int cpu; |
213 | unsigned long old_flags = current->flags; | ||
213 | 214 | ||
215 | /* | ||
216 | * Mask out PF_MEMALLOC as current task context is borrowed for the | ||
217 | * softirq. A softirq handler such as network RX might set PF_MEMALLOC | ||
218 | * again if the socket is related to swap. | ||
219 | */ | ||
220 | current->flags &= ~PF_MEMALLOC; | ||
221 | |||
214 | pending = local_softirq_pending(); | 222 | pending = local_softirq_pending(); |
215 | account_system_vtime(current); | 223 | account_system_vtime(current); |
216 | 224 | ||
217 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 225 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
218 | SOFTIRQ_OFFSET); | 226 | SOFTIRQ_OFFSET); |
219 | lockdep_softirq_enter(); | 227 | lockdep_softirq_enter(); |
220 | 228 | ||
221 | cpu = smp_processor_id(); | 229 | cpu = smp_processor_id(); |
222 | restart: | 230 | restart: |
223 | /* Reset the pending bitmask before enabling irqs */ | 231 | /* Reset the pending bitmask before enabling irqs */ |
224 | set_softirq_pending(0); | 232 | set_softirq_pending(0); |
225 | 233 | ||
226 | local_irq_enable(); | 234 | local_irq_enable(); |
227 | 235 | ||
228 | h = softirq_vec; | 236 | h = softirq_vec; |
229 | 237 | ||
230 | do { | 238 | do { |
231 | if (pending & 1) { | 239 | if (pending & 1) { |
232 | unsigned int vec_nr = h - softirq_vec; | 240 | unsigned int vec_nr = h - softirq_vec; |
233 | int prev_count = preempt_count(); | 241 | int prev_count = preempt_count(); |
234 | 242 | ||
235 | kstat_incr_softirqs_this_cpu(vec_nr); | 243 | kstat_incr_softirqs_this_cpu(vec_nr); |
236 | 244 | ||
237 | trace_softirq_entry(vec_nr); | 245 | trace_softirq_entry(vec_nr); |
238 | h->action(h); | 246 | h->action(h); |
239 | trace_softirq_exit(vec_nr); | 247 | trace_softirq_exit(vec_nr); |
240 | if (unlikely(prev_count != preempt_count())) { | 248 | if (unlikely(prev_count != preempt_count())) { |
241 | printk(KERN_ERR "huh, entered softirq %u %s %p" | 249 | printk(KERN_ERR "huh, entered softirq %u %s %p" |
242 | "with preempt_count %08x," | 250 | "with preempt_count %08x," |
243 | " exited with %08x?\n", vec_nr, | 251 | " exited with %08x?\n", vec_nr, |
244 | softirq_to_name[vec_nr], h->action, | 252 | softirq_to_name[vec_nr], h->action, |
245 | prev_count, preempt_count()); | 253 | prev_count, preempt_count()); |
246 | preempt_count() = prev_count; | 254 | preempt_count() = prev_count; |
247 | } | 255 | } |
248 | 256 | ||
249 | rcu_bh_qs(cpu); | 257 | rcu_bh_qs(cpu); |
250 | } | 258 | } |
251 | h++; | 259 | h++; |
252 | pending >>= 1; | 260 | pending >>= 1; |
253 | } while (pending); | 261 | } while (pending); |
254 | 262 | ||
255 | local_irq_disable(); | 263 | local_irq_disable(); |
256 | 264 | ||
257 | pending = local_softirq_pending(); | 265 | pending = local_softirq_pending(); |
258 | if (pending && --max_restart) | 266 | if (pending && --max_restart) |
259 | goto restart; | 267 | goto restart; |
260 | 268 | ||
261 | if (pending) | 269 | if (pending) |
262 | wakeup_softirqd(); | 270 | wakeup_softirqd(); |
263 | 271 | ||
264 | lockdep_softirq_exit(); | 272 | lockdep_softirq_exit(); |
265 | 273 | ||
266 | account_system_vtime(current); | 274 | account_system_vtime(current); |
267 | __local_bh_enable(SOFTIRQ_OFFSET); | 275 | __local_bh_enable(SOFTIRQ_OFFSET); |
276 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | ||
268 | } | 277 | } |
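The save/clear/restore of PF_MEMALLOC added to __do_softirq() above relies on tsk_restore_flags(), presumably introduced by the include/linux/sched.h hunk of this commit. As an illustration of the intended semantics (restore only the named bit from the snapshot, leave every other task flag alone), here is a sketch of what such a helper amounts to; treat it as an assumption about the helper's behaviour, with demo_restore_flags() an invented name, not a quote of the real definition:

#include <linux/sched.h>

static inline void demo_restore_flags(struct task_struct *task,
				      unsigned long orig_flags,
				      unsigned long flags)
{
	task->flags &= ~flags;			/* drop the bits being restored */
	task->flags |= orig_flags & flags;	/* copy them back from the snapshot */
}

Called as demo_restore_flags(current, old_flags, PF_MEMALLOC), this puts the preempted task's PF_MEMALLOC bit back exactly as it was before the softirq borrowed its task context.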
269 | 278 | ||
270 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 279 | #ifndef __ARCH_HAS_DO_SOFTIRQ |
271 | 280 | ||
272 | asmlinkage void do_softirq(void) | 281 | asmlinkage void do_softirq(void) |
273 | { | 282 | { |
274 | __u32 pending; | 283 | __u32 pending; |
275 | unsigned long flags; | 284 | unsigned long flags; |
276 | 285 | ||
277 | if (in_interrupt()) | 286 | if (in_interrupt()) |
278 | return; | 287 | return; |
279 | 288 | ||
280 | local_irq_save(flags); | 289 | local_irq_save(flags); |
281 | 290 | ||
282 | pending = local_softirq_pending(); | 291 | pending = local_softirq_pending(); |
283 | 292 | ||
284 | if (pending) | 293 | if (pending) |
285 | __do_softirq(); | 294 | __do_softirq(); |
286 | 295 | ||
287 | local_irq_restore(flags); | 296 | local_irq_restore(flags); |
288 | } | 297 | } |
289 | 298 | ||
290 | #endif | 299 | #endif |
291 | 300 | ||
292 | /* | 301 | /* |
293 | * Enter an interrupt context. | 302 | * Enter an interrupt context. |
294 | */ | 303 | */ |
295 | void irq_enter(void) | 304 | void irq_enter(void) |
296 | { | 305 | { |
297 | int cpu = smp_processor_id(); | 306 | int cpu = smp_processor_id(); |
298 | 307 | ||
299 | rcu_irq_enter(); | 308 | rcu_irq_enter(); |
300 | if (is_idle_task(current) && !in_interrupt()) { | 309 | if (is_idle_task(current) && !in_interrupt()) { |
301 | /* | 310 | /* |
302 | * Prevent raise_softirq from needlessly waking up ksoftirqd | 311 | * Prevent raise_softirq from needlessly waking up ksoftirqd |
303 | * here, as softirq will be serviced on return from interrupt. | 312 | * here, as softirq will be serviced on return from interrupt. |
304 | */ | 313 | */ |
305 | local_bh_disable(); | 314 | local_bh_disable(); |
306 | tick_check_idle(cpu); | 315 | tick_check_idle(cpu); |
307 | _local_bh_enable(); | 316 | _local_bh_enable(); |
308 | } | 317 | } |
309 | 318 | ||
310 | __irq_enter(); | 319 | __irq_enter(); |
311 | } | 320 | } |
312 | 321 | ||
313 | static inline void invoke_softirq(void) | 322 | static inline void invoke_softirq(void) |
314 | { | 323 | { |
315 | if (!force_irqthreads) { | 324 | if (!force_irqthreads) { |
316 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | 325 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED |
317 | __do_softirq(); | 326 | __do_softirq(); |
318 | #else | 327 | #else |
319 | do_softirq(); | 328 | do_softirq(); |
320 | #endif | 329 | #endif |
321 | } else { | 330 | } else { |
322 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 331 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
323 | SOFTIRQ_OFFSET); | 332 | SOFTIRQ_OFFSET); |
324 | wakeup_softirqd(); | 333 | wakeup_softirqd(); |
325 | __local_bh_enable(SOFTIRQ_OFFSET); | 334 | __local_bh_enable(SOFTIRQ_OFFSET); |
326 | } | 335 | } |
327 | } | 336 | } |
328 | 337 | ||
329 | /* | 338 | /* |
330 | * Exit an interrupt context. Process softirqs if needed and possible: | 339 | * Exit an interrupt context. Process softirqs if needed and possible: |
331 | */ | 340 | */ |
332 | void irq_exit(void) | 341 | void irq_exit(void) |
333 | { | 342 | { |
334 | account_system_vtime(current); | 343 | account_system_vtime(current); |
335 | trace_hardirq_exit(); | 344 | trace_hardirq_exit(); |
336 | sub_preempt_count(IRQ_EXIT_OFFSET); | 345 | sub_preempt_count(IRQ_EXIT_OFFSET); |
337 | if (!in_interrupt() && local_softirq_pending()) | 346 | if (!in_interrupt() && local_softirq_pending()) |
338 | invoke_softirq(); | 347 | invoke_softirq(); |
339 | 348 | ||
340 | #ifdef CONFIG_NO_HZ | 349 | #ifdef CONFIG_NO_HZ |
341 | /* Make sure that timer wheel updates are propagated */ | 350 | /* Make sure that timer wheel updates are propagated */ |
342 | if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) | 351 | if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) |
343 | tick_nohz_irq_exit(); | 352 | tick_nohz_irq_exit(); |
344 | #endif | 353 | #endif |
345 | rcu_irq_exit(); | 354 | rcu_irq_exit(); |
346 | sched_preempt_enable_no_resched(); | 355 | sched_preempt_enable_no_resched(); |
347 | } | 356 | } |
348 | 357 | ||
349 | /* | 358 | /* |
350 | * This function must run with irqs disabled! | 359 | * This function must run with irqs disabled! |
351 | */ | 360 | */ |
352 | inline void raise_softirq_irqoff(unsigned int nr) | 361 | inline void raise_softirq_irqoff(unsigned int nr) |
353 | { | 362 | { |
354 | __raise_softirq_irqoff(nr); | 363 | __raise_softirq_irqoff(nr); |
355 | 364 | ||
356 | /* | 365 | /* |
357 | * If we're in an interrupt or softirq, we're done | 366 | * If we're in an interrupt or softirq, we're done |
358 | * (this also catches softirq-disabled code). We will | 367 | * (this also catches softirq-disabled code). We will |
359 | * actually run the softirq once we return from | 368 | * actually run the softirq once we return from |
360 | * the irq or softirq. | 369 | * the irq or softirq. |
361 | * | 370 | * |
362 | * Otherwise we wake up ksoftirqd to make sure we | 371 | * Otherwise we wake up ksoftirqd to make sure we |
363 | * schedule the softirq soon. | 372 | * schedule the softirq soon. |
364 | */ | 373 | */ |
365 | if (!in_interrupt()) | 374 | if (!in_interrupt()) |
366 | wakeup_softirqd(); | 375 | wakeup_softirqd(); |
367 | } | 376 | } |
368 | 377 | ||
369 | void raise_softirq(unsigned int nr) | 378 | void raise_softirq(unsigned int nr) |
370 | { | 379 | { |
371 | unsigned long flags; | 380 | unsigned long flags; |
372 | 381 | ||
373 | local_irq_save(flags); | 382 | local_irq_save(flags); |
374 | raise_softirq_irqoff(nr); | 383 | raise_softirq_irqoff(nr); |
375 | local_irq_restore(flags); | 384 | local_irq_restore(flags); |
376 | } | 385 | } |
377 | 386 | ||
378 | void __raise_softirq_irqoff(unsigned int nr) | 387 | void __raise_softirq_irqoff(unsigned int nr) |
379 | { | 388 | { |
380 | trace_softirq_raise(nr); | 389 | trace_softirq_raise(nr); |
381 | or_softirq_pending(1UL << nr); | 390 | or_softirq_pending(1UL << nr); |
382 | } | 391 | } |
383 | 392 | ||
384 | void open_softirq(int nr, void (*action)(struct softirq_action *)) | 393 | void open_softirq(int nr, void (*action)(struct softirq_action *)) |
385 | { | 394 | { |
386 | softirq_vec[nr].action = action; | 395 | softirq_vec[nr].action = action; |
387 | } | 396 | } |
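open_softirq() only fills in a slot of softirq_vec; raising the softirq is a separate, later step. A hypothetical sketch of that register-then-raise pattern follows (all demo_* names are invented, and HI_SOFTIRQ is used purely as a placeholder - real users such as the networking core register their own fixed entry from the softirq enum once at boot):

#include <linux/init.h>
#include <linux/interrupt.h>

static void demo_action(struct softirq_action *h)
{
	/* runs in softirq context on the CPU that raised it, hardirqs enabled */
}

static int __init demo_softirq_setup(void)
{
	/* illustration only: HI_SOFTIRQ really belongs to tasklet_hi_action,
	 * see softirq_init() below */
	open_softirq(HI_SOFTIRQ, demo_action);
	return 0;
}

static irqreturn_t demo_irq(int irq, void *dev_id)
{
	raise_softirq(HI_SOFTIRQ);	/* ask for demo_action to run soon on this CPU */
	return IRQ_HANDLED;
}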
388 | 397 | ||
389 | /* | 398 | /* |
390 | * Tasklets | 399 | * Tasklets |
391 | */ | 400 | */ |
392 | struct tasklet_head | 401 | struct tasklet_head |
393 | { | 402 | { |
394 | struct tasklet_struct *head; | 403 | struct tasklet_struct *head; |
395 | struct tasklet_struct **tail; | 404 | struct tasklet_struct **tail; |
396 | }; | 405 | }; |
397 | 406 | ||
398 | static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); | 407 | static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); |
399 | static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); | 408 | static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); |
400 | 409 | ||
401 | void __tasklet_schedule(struct tasklet_struct *t) | 410 | void __tasklet_schedule(struct tasklet_struct *t) |
402 | { | 411 | { |
403 | unsigned long flags; | 412 | unsigned long flags; |
404 | 413 | ||
405 | local_irq_save(flags); | 414 | local_irq_save(flags); |
406 | t->next = NULL; | 415 | t->next = NULL; |
407 | *__this_cpu_read(tasklet_vec.tail) = t; | 416 | *__this_cpu_read(tasklet_vec.tail) = t; |
408 | __this_cpu_write(tasklet_vec.tail, &(t->next)); | 417 | __this_cpu_write(tasklet_vec.tail, &(t->next)); |
409 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 418 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
410 | local_irq_restore(flags); | 419 | local_irq_restore(flags); |
411 | } | 420 | } |
412 | 421 | ||
413 | EXPORT_SYMBOL(__tasklet_schedule); | 422 | EXPORT_SYMBOL(__tasklet_schedule); |
414 | 423 | ||
415 | void __tasklet_hi_schedule(struct tasklet_struct *t) | 424 | void __tasklet_hi_schedule(struct tasklet_struct *t) |
416 | { | 425 | { |
417 | unsigned long flags; | 426 | unsigned long flags; |
418 | 427 | ||
419 | local_irq_save(flags); | 428 | local_irq_save(flags); |
420 | t->next = NULL; | 429 | t->next = NULL; |
421 | *__this_cpu_read(tasklet_hi_vec.tail) = t; | 430 | *__this_cpu_read(tasklet_hi_vec.tail) = t; |
422 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); | 431 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); |
423 | raise_softirq_irqoff(HI_SOFTIRQ); | 432 | raise_softirq_irqoff(HI_SOFTIRQ); |
424 | local_irq_restore(flags); | 433 | local_irq_restore(flags); |
425 | } | 434 | } |
426 | 435 | ||
427 | EXPORT_SYMBOL(__tasklet_hi_schedule); | 436 | EXPORT_SYMBOL(__tasklet_hi_schedule); |
428 | 437 | ||
429 | void __tasklet_hi_schedule_first(struct tasklet_struct *t) | 438 | void __tasklet_hi_schedule_first(struct tasklet_struct *t) |
430 | { | 439 | { |
431 | BUG_ON(!irqs_disabled()); | 440 | BUG_ON(!irqs_disabled()); |
432 | 441 | ||
433 | t->next = __this_cpu_read(tasklet_hi_vec.head); | 442 | t->next = __this_cpu_read(tasklet_hi_vec.head); |
434 | __this_cpu_write(tasklet_hi_vec.head, t); | 443 | __this_cpu_write(tasklet_hi_vec.head, t); |
435 | __raise_softirq_irqoff(HI_SOFTIRQ); | 444 | __raise_softirq_irqoff(HI_SOFTIRQ); |
436 | } | 445 | } |
437 | 446 | ||
438 | EXPORT_SYMBOL(__tasklet_hi_schedule_first); | 447 | EXPORT_SYMBOL(__tasklet_hi_schedule_first); |
439 | 448 | ||
440 | static void tasklet_action(struct softirq_action *a) | 449 | static void tasklet_action(struct softirq_action *a) |
441 | { | 450 | { |
442 | struct tasklet_struct *list; | 451 | struct tasklet_struct *list; |
443 | 452 | ||
444 | local_irq_disable(); | 453 | local_irq_disable(); |
445 | list = __this_cpu_read(tasklet_vec.head); | 454 | list = __this_cpu_read(tasklet_vec.head); |
446 | __this_cpu_write(tasklet_vec.head, NULL); | 455 | __this_cpu_write(tasklet_vec.head, NULL); |
447 | __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); | 456 | __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); |
448 | local_irq_enable(); | 457 | local_irq_enable(); |
449 | 458 | ||
450 | while (list) { | 459 | while (list) { |
451 | struct tasklet_struct *t = list; | 460 | struct tasklet_struct *t = list; |
452 | 461 | ||
453 | list = list->next; | 462 | list = list->next; |
454 | 463 | ||
455 | if (tasklet_trylock(t)) { | 464 | if (tasklet_trylock(t)) { |
456 | if (!atomic_read(&t->count)) { | 465 | if (!atomic_read(&t->count)) { |
457 | if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) | 466 | if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) |
458 | BUG(); | 467 | BUG(); |
459 | t->func(t->data); | 468 | t->func(t->data); |
460 | tasklet_unlock(t); | 469 | tasklet_unlock(t); |
461 | continue; | 470 | continue; |
462 | } | 471 | } |
463 | tasklet_unlock(t); | 472 | tasklet_unlock(t); |
464 | } | 473 | } |
465 | 474 | ||
466 | local_irq_disable(); | 475 | local_irq_disable(); |
467 | t->next = NULL; | 476 | t->next = NULL; |
468 | *__this_cpu_read(tasklet_vec.tail) = t; | 477 | *__this_cpu_read(tasklet_vec.tail) = t; |
469 | __this_cpu_write(tasklet_vec.tail, &(t->next)); | 478 | __this_cpu_write(tasklet_vec.tail, &(t->next)); |
470 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); | 479 | __raise_softirq_irqoff(TASKLET_SOFTIRQ); |
471 | local_irq_enable(); | 480 | local_irq_enable(); |
472 | } | 481 | } |
473 | } | 482 | } |
474 | 483 | ||
475 | static void tasklet_hi_action(struct softirq_action *a) | 484 | static void tasklet_hi_action(struct softirq_action *a) |
476 | { | 485 | { |
477 | struct tasklet_struct *list; | 486 | struct tasklet_struct *list; |
478 | 487 | ||
479 | local_irq_disable(); | 488 | local_irq_disable(); |
480 | list = __this_cpu_read(tasklet_hi_vec.head); | 489 | list = __this_cpu_read(tasklet_hi_vec.head); |
481 | __this_cpu_write(tasklet_hi_vec.head, NULL); | 490 | __this_cpu_write(tasklet_hi_vec.head, NULL); |
482 | __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); | 491 | __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); |
483 | local_irq_enable(); | 492 | local_irq_enable(); |
484 | 493 | ||
485 | while (list) { | 494 | while (list) { |
486 | struct tasklet_struct *t = list; | 495 | struct tasklet_struct *t = list; |
487 | 496 | ||
488 | list = list->next; | 497 | list = list->next; |
489 | 498 | ||
490 | if (tasklet_trylock(t)) { | 499 | if (tasklet_trylock(t)) { |
491 | if (!atomic_read(&t->count)) { | 500 | if (!atomic_read(&t->count)) { |
492 | if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) | 501 | if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) |
493 | BUG(); | 502 | BUG(); |
494 | t->func(t->data); | 503 | t->func(t->data); |
495 | tasklet_unlock(t); | 504 | tasklet_unlock(t); |
496 | continue; | 505 | continue; |
497 | } | 506 | } |
498 | tasklet_unlock(t); | 507 | tasklet_unlock(t); |
499 | } | 508 | } |
500 | 509 | ||
501 | local_irq_disable(); | 510 | local_irq_disable(); |
502 | t->next = NULL; | 511 | t->next = NULL; |
503 | *__this_cpu_read(tasklet_hi_vec.tail) = t; | 512 | *__this_cpu_read(tasklet_hi_vec.tail) = t; |
504 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); | 513 | __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); |
505 | __raise_softirq_irqoff(HI_SOFTIRQ); | 514 | __raise_softirq_irqoff(HI_SOFTIRQ); |
506 | local_irq_enable(); | 515 | local_irq_enable(); |
507 | } | 516 | } |
508 | } | 517 | } |
509 | 518 | ||
510 | 519 | ||
511 | void tasklet_init(struct tasklet_struct *t, | 520 | void tasklet_init(struct tasklet_struct *t, |
512 | void (*func)(unsigned long), unsigned long data) | 521 | void (*func)(unsigned long), unsigned long data) |
513 | { | 522 | { |
514 | t->next = NULL; | 523 | t->next = NULL; |
515 | t->state = 0; | 524 | t->state = 0; |
516 | atomic_set(&t->count, 0); | 525 | atomic_set(&t->count, 0); |
517 | t->func = func; | 526 | t->func = func; |
518 | t->data = data; | 527 | t->data = data; |
519 | } | 528 | } |
520 | 529 | ||
521 | EXPORT_SYMBOL(tasklet_init); | 530 | EXPORT_SYMBOL(tasklet_init); |
522 | 531 | ||
523 | void tasklet_kill(struct tasklet_struct *t) | 532 | void tasklet_kill(struct tasklet_struct *t) |
524 | { | 533 | { |
525 | if (in_interrupt()) | 534 | if (in_interrupt()) |
526 | printk("Attempt to kill tasklet from interrupt\n"); | 535 | printk("Attempt to kill tasklet from interrupt\n"); |
527 | 536 | ||
528 | while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { | 537 | while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { |
529 | do { | 538 | do { |
530 | yield(); | 539 | yield(); |
531 | } while (test_bit(TASKLET_STATE_SCHED, &t->state)); | 540 | } while (test_bit(TASKLET_STATE_SCHED, &t->state)); |
532 | } | 541 | } |
533 | tasklet_unlock_wait(t); | 542 | tasklet_unlock_wait(t); |
534 | clear_bit(TASKLET_STATE_SCHED, &t->state); | 543 | clear_bit(TASKLET_STATE_SCHED, &t->state); |
535 | } | 544 | } |
536 | 545 | ||
537 | EXPORT_SYMBOL(tasklet_kill); | 546 | EXPORT_SYMBOL(tasklet_kill); |
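For context, the exported tasklet API above is normally used as a three-step lifecycle: initialise once, schedule from the interrupt handler, kill on teardown. A hypothetical driver-style sketch, with all demo_* names invented:

#include <linux/interrupt.h>

static void demo_tasklet_fn(unsigned long data)
{
	/* bottom half: runs in softirq context, must not sleep, keep it short */
}

static struct tasklet_struct demo_tasklet;

static irqreturn_t demo_irq_handler(int irq, void *dev_id)
{
	tasklet_schedule(&demo_tasklet);	/* defer the heavy lifting */
	return IRQ_HANDLED;
}

static void demo_setup(void)
{
	tasklet_init(&demo_tasklet, demo_tasklet_fn, 0);
}

static void demo_teardown(void)
{
	tasklet_kill(&demo_tasklet);	/* wait out any running instance first */
}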
538 | 547 | ||
539 | /* | 548 | /* |
540 | * tasklet_hrtimer | 549 | * tasklet_hrtimer |
541 | */ | 550 | */ |
542 | 551 | ||
543 | /* | 552 | /* |
544 | * The trampoline is called when the hrtimer expires. It schedules a tasklet | 553 | * The trampoline is called when the hrtimer expires. It schedules a tasklet |
545 | * to run __tasklet_hrtimer_trampoline() which in turn will call the intended | 554 | * to run __tasklet_hrtimer_trampoline() which in turn will call the intended |
546 | * hrtimer callback, but from softirq context. | 555 | * hrtimer callback, but from softirq context. |
547 | */ | 556 | */ |
548 | static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) | 557 | static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) |
549 | { | 558 | { |
550 | struct tasklet_hrtimer *ttimer = | 559 | struct tasklet_hrtimer *ttimer = |
551 | container_of(timer, struct tasklet_hrtimer, timer); | 560 | container_of(timer, struct tasklet_hrtimer, timer); |
552 | 561 | ||
553 | tasklet_hi_schedule(&ttimer->tasklet); | 562 | tasklet_hi_schedule(&ttimer->tasklet); |
554 | return HRTIMER_NORESTART; | 563 | return HRTIMER_NORESTART; |
555 | } | 564 | } |
556 | 565 | ||
557 | /* | 566 | /* |
558 | * Helper function which calls the hrtimer callback from | 567 | * Helper function which calls the hrtimer callback from |
559 | * tasklet/softirq context | 568 | * tasklet/softirq context |
560 | */ | 569 | */ |
561 | static void __tasklet_hrtimer_trampoline(unsigned long data) | 570 | static void __tasklet_hrtimer_trampoline(unsigned long data) |
562 | { | 571 | { |
563 | struct tasklet_hrtimer *ttimer = (void *)data; | 572 | struct tasklet_hrtimer *ttimer = (void *)data; |
564 | enum hrtimer_restart restart; | 573 | enum hrtimer_restart restart; |
565 | 574 | ||
566 | restart = ttimer->function(&ttimer->timer); | 575 | restart = ttimer->function(&ttimer->timer); |
567 | if (restart != HRTIMER_NORESTART) | 576 | if (restart != HRTIMER_NORESTART) |
568 | hrtimer_restart(&ttimer->timer); | 577 | hrtimer_restart(&ttimer->timer); |
569 | } | 578 | } |
570 | 579 | ||
571 | /** | 580 | /** |
572 | * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks | 581 | * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks |
573 | * @ttimer: tasklet_hrtimer which is initialized | 582 | * @ttimer: tasklet_hrtimer which is initialized |
574 | * @function: hrtimer callback function which gets called from softirq context | 583 | * @function: hrtimer callback function which gets called from softirq context |
575 | * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) | 584 | * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) |
576 | * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) | 585 | * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) |
577 | */ | 586 | */ |
578 | void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, | 587 | void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, |
579 | enum hrtimer_restart (*function)(struct hrtimer *), | 588 | enum hrtimer_restart (*function)(struct hrtimer *), |
580 | clockid_t which_clock, enum hrtimer_mode mode) | 589 | clockid_t which_clock, enum hrtimer_mode mode) |
581 | { | 590 | { |
582 | hrtimer_init(&ttimer->timer, which_clock, mode); | 591 | hrtimer_init(&ttimer->timer, which_clock, mode); |
583 | ttimer->timer.function = __hrtimer_tasklet_trampoline; | 592 | ttimer->timer.function = __hrtimer_tasklet_trampoline; |
584 | tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, | 593 | tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, |
585 | (unsigned long)ttimer); | 594 | (unsigned long)ttimer); |
586 | ttimer->function = function; | 595 | ttimer->function = function; |
587 | } | 596 | } |
588 | EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); | 597 | EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); |
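A caller of tasklet_hrtimer_init() arms the combined timer with tasklet_hrtimer_start(), the companion helper in <linux/interrupt.h>. A hypothetical sketch (demo_* names invented, 10 ms period chosen arbitrarily):

#include <linux/interrupt.h>
#include <linux/ktime.h>

static struct tasklet_hrtimer demo_thr;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	/* invoked from HI_SOFTIRQ context via the trampolines above */
	return HRTIMER_NORESTART;
}

static void demo_arm(void)
{
	tasklet_hrtimer_init(&demo_thr, demo_timer_fn,
			     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	tasklet_hrtimer_start(&demo_thr, ktime_set(0, 10 * NSEC_PER_MSEC),
			      HRTIMER_MODE_REL);
}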
589 | 598 | ||
590 | /* | 599 | /* |
591 | * Remote softirq bits | 600 | * Remote softirq bits |
592 | */ | 601 | */ |
593 | 602 | ||
594 | DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); | 603 | DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); |
595 | EXPORT_PER_CPU_SYMBOL(softirq_work_list); | 604 | EXPORT_PER_CPU_SYMBOL(softirq_work_list); |
596 | 605 | ||
597 | static void __local_trigger(struct call_single_data *cp, int softirq) | 606 | static void __local_trigger(struct call_single_data *cp, int softirq) |
598 | { | 607 | { |
599 | struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); | 608 | struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); |
600 | 609 | ||
601 | list_add_tail(&cp->list, head); | 610 | list_add_tail(&cp->list, head); |
602 | 611 | ||
603 | /* Trigger the softirq only if the list was previously empty. */ | 612 | /* Trigger the softirq only if the list was previously empty. */ |
604 | if (head->next == &cp->list) | 613 | if (head->next == &cp->list) |
605 | raise_softirq_irqoff(softirq); | 614 | raise_softirq_irqoff(softirq); |
606 | } | 615 | } |
607 | 616 | ||
608 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | 617 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS |
609 | static void remote_softirq_receive(void *data) | 618 | static void remote_softirq_receive(void *data) |
610 | { | 619 | { |
611 | struct call_single_data *cp = data; | 620 | struct call_single_data *cp = data; |
612 | unsigned long flags; | 621 | unsigned long flags; |
613 | int softirq; | 622 | int softirq; |
614 | 623 | ||
615 | softirq = cp->priv; | 624 | softirq = cp->priv; |
616 | 625 | ||
617 | local_irq_save(flags); | 626 | local_irq_save(flags); |
618 | __local_trigger(cp, softirq); | 627 | __local_trigger(cp, softirq); |
619 | local_irq_restore(flags); | 628 | local_irq_restore(flags); |
620 | } | 629 | } |
621 | 630 | ||
622 | static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) | 631 | static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) |
623 | { | 632 | { |
624 | if (cpu_online(cpu)) { | 633 | if (cpu_online(cpu)) { |
625 | cp->func = remote_softirq_receive; | 634 | cp->func = remote_softirq_receive; |
626 | cp->info = cp; | 635 | cp->info = cp; |
627 | cp->flags = 0; | 636 | cp->flags = 0; |
628 | cp->priv = softirq; | 637 | cp->priv = softirq; |
629 | 638 | ||
630 | __smp_call_function_single(cpu, cp, 0); | 639 | __smp_call_function_single(cpu, cp, 0); |
631 | return 0; | 640 | return 0; |
632 | } | 641 | } |
633 | return 1; | 642 | return 1; |
634 | } | 643 | } |
635 | #else /* CONFIG_USE_GENERIC_SMP_HELPERS */ | 644 | #else /* CONFIG_USE_GENERIC_SMP_HELPERS */ |
636 | static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) | 645 | static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) |
637 | { | 646 | { |
638 | return 1; | 647 | return 1; |
639 | } | 648 | } |
640 | #endif | 649 | #endif |
641 | 650 | ||
642 | /** | 651 | /** |
643 | * __send_remote_softirq - try to schedule softirq work on a remote cpu | 652 | * __send_remote_softirq - try to schedule softirq work on a remote cpu |
644 | * @cp: private SMP call function data area | 653 | * @cp: private SMP call function data area |
645 | * @cpu: the remote cpu | 654 | * @cpu: the remote cpu |
646 | * @this_cpu: the currently executing cpu | 655 | * @this_cpu: the currently executing cpu |
647 | * @softirq: the softirq for the work | 656 | * @softirq: the softirq for the work |
648 | * | 657 | * |
649 | * Attempt to schedule softirq work on a remote cpu. If this cannot be | 658 | * Attempt to schedule softirq work on a remote cpu. If this cannot be |
650 | * done, the work is instead queued up on the local cpu. | 659 | * done, the work is instead queued up on the local cpu. |
651 | * | 660 | * |
652 | * Interrupts must be disabled. | 661 | * Interrupts must be disabled. |
653 | */ | 662 | */ |
654 | void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) | 663 | void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) |
655 | { | 664 | { |
656 | if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) | 665 | if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) |
657 | __local_trigger(cp, softirq); | 666 | __local_trigger(cp, softirq); |
658 | } | 667 | } |
659 | EXPORT_SYMBOL(__send_remote_softirq); | 668 | EXPORT_SYMBOL(__send_remote_softirq); |
660 | 669 | ||
661 | /** | 670 | /** |
662 | * send_remote_softirq - try to schedule softirq work on a remote cpu | 671 | * send_remote_softirq - try to schedule softirq work on a remote cpu |
663 | * @cp: private SMP call function data area | 672 | * @cp: private SMP call function data area |
664 | * @cpu: the remote cpu | 673 | * @cpu: the remote cpu |
665 | * @softirq: the softirq for the work | 674 | * @softirq: the softirq for the work |
666 | * | 675 | * |
667 | * Like __send_remote_softirq except that disabling interrupts and | 676 | * Like __send_remote_softirq except that disabling interrupts and |
668 | * computing the current cpu is done for the caller. | 677 | * computing the current cpu is done for the caller. |
669 | */ | 678 | */ |
670 | void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) | 679 | void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) |
671 | { | 680 | { |
672 | unsigned long flags; | 681 | unsigned long flags; |
673 | int this_cpu; | 682 | int this_cpu; |
674 | 683 | ||
675 | local_irq_save(flags); | 684 | local_irq_save(flags); |
676 | this_cpu = smp_processor_id(); | 685 | this_cpu = smp_processor_id(); |
677 | __send_remote_softirq(cp, cpu, this_cpu, softirq); | 686 | __send_remote_softirq(cp, cpu, this_cpu, softirq); |
678 | local_irq_restore(flags); | 687 | local_irq_restore(flags); |
679 | } | 688 | } |
680 | EXPORT_SYMBOL(send_remote_softirq); | 689 | EXPORT_SYMBOL(send_remote_softirq); |
681 | 690 | ||
682 | static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, | 691 | static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, |
683 | unsigned long action, void *hcpu) | 692 | unsigned long action, void *hcpu) |
684 | { | 693 | { |
685 | /* | 694 | /* |
686 | * If a CPU goes away, splice its entries to the current CPU | 695 | * If a CPU goes away, splice its entries to the current CPU |
687 | * and trigger a run of the softirq | 696 | * and trigger a run of the softirq |
688 | */ | 697 | */ |
689 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 698 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
690 | int cpu = (unsigned long) hcpu; | 699 | int cpu = (unsigned long) hcpu; |
691 | int i; | 700 | int i; |
692 | 701 | ||
693 | local_irq_disable(); | 702 | local_irq_disable(); |
694 | for (i = 0; i < NR_SOFTIRQS; i++) { | 703 | for (i = 0; i < NR_SOFTIRQS; i++) { |
695 | struct list_head *head = &per_cpu(softirq_work_list[i], cpu); | 704 | struct list_head *head = &per_cpu(softirq_work_list[i], cpu); |
696 | struct list_head *local_head; | 705 | struct list_head *local_head; |
697 | 706 | ||
698 | if (list_empty(head)) | 707 | if (list_empty(head)) |
699 | continue; | 708 | continue; |
700 | 709 | ||
701 | local_head = &__get_cpu_var(softirq_work_list[i]); | 710 | local_head = &__get_cpu_var(softirq_work_list[i]); |
702 | list_splice_init(head, local_head); | 711 | list_splice_init(head, local_head); |
703 | raise_softirq_irqoff(i); | 712 | raise_softirq_irqoff(i); |
704 | } | 713 | } |
705 | local_irq_enable(); | 714 | local_irq_enable(); |
706 | } | 715 | } |
707 | 716 | ||
708 | return NOTIFY_OK; | 717 | return NOTIFY_OK; |
709 | } | 718 | } |
710 | 719 | ||
711 | static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { | 720 | static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { |
712 | .notifier_call = remote_softirq_cpu_notify, | 721 | .notifier_call = remote_softirq_cpu_notify, |
713 | }; | 722 | }; |
714 | 723 | ||
715 | void __init softirq_init(void) | 724 | void __init softirq_init(void) |
716 | { | 725 | { |
717 | int cpu; | 726 | int cpu; |
718 | 727 | ||
719 | for_each_possible_cpu(cpu) { | 728 | for_each_possible_cpu(cpu) { |
720 | int i; | 729 | int i; |
721 | 730 | ||
722 | per_cpu(tasklet_vec, cpu).tail = | 731 | per_cpu(tasklet_vec, cpu).tail = |
723 | &per_cpu(tasklet_vec, cpu).head; | 732 | &per_cpu(tasklet_vec, cpu).head; |
724 | per_cpu(tasklet_hi_vec, cpu).tail = | 733 | per_cpu(tasklet_hi_vec, cpu).tail = |
725 | &per_cpu(tasklet_hi_vec, cpu).head; | 734 | &per_cpu(tasklet_hi_vec, cpu).head; |
726 | for (i = 0; i < NR_SOFTIRQS; i++) | 735 | for (i = 0; i < NR_SOFTIRQS; i++) |
727 | INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); | 736 | INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); |
728 | } | 737 | } |
729 | 738 | ||
730 | register_hotcpu_notifier(&remote_softirq_cpu_notifier); | 739 | register_hotcpu_notifier(&remote_softirq_cpu_notifier); |
731 | 740 | ||
732 | open_softirq(TASKLET_SOFTIRQ, tasklet_action); | 741 | open_softirq(TASKLET_SOFTIRQ, tasklet_action); |
733 | open_softirq(HI_SOFTIRQ, tasklet_hi_action); | 742 | open_softirq(HI_SOFTIRQ, tasklet_hi_action); |
734 | } | 743 | } |
735 | 744 | ||
736 | static int run_ksoftirqd(void * __bind_cpu) | 745 | static int run_ksoftirqd(void * __bind_cpu) |
737 | { | 746 | { |
738 | set_current_state(TASK_INTERRUPTIBLE); | 747 | set_current_state(TASK_INTERRUPTIBLE); |
739 | 748 | ||
740 | while (!kthread_should_stop()) { | 749 | while (!kthread_should_stop()) { |
741 | preempt_disable(); | 750 | preempt_disable(); |
742 | if (!local_softirq_pending()) { | 751 | if (!local_softirq_pending()) { |
743 | schedule_preempt_disabled(); | 752 | schedule_preempt_disabled(); |
744 | } | 753 | } |
745 | 754 | ||
746 | __set_current_state(TASK_RUNNING); | 755 | __set_current_state(TASK_RUNNING); |
747 | 756 | ||
748 | while (local_softirq_pending()) { | 757 | while (local_softirq_pending()) { |
749 | /* Preempt disable stops cpu going offline. | 758 | /* Preempt disable stops cpu going offline. |
750 | If already offline, we'll be on wrong CPU: | 759 | If already offline, we'll be on wrong CPU: |
751 | don't process */ | 760 | don't process */ |
752 | if (cpu_is_offline((long)__bind_cpu)) | 761 | if (cpu_is_offline((long)__bind_cpu)) |
753 | goto wait_to_die; | 762 | goto wait_to_die; |
754 | local_irq_disable(); | 763 | local_irq_disable(); |
755 | if (local_softirq_pending()) | 764 | if (local_softirq_pending()) |
756 | __do_softirq(); | 765 | __do_softirq(); |
757 | local_irq_enable(); | 766 | local_irq_enable(); |
758 | sched_preempt_enable_no_resched(); | 767 | sched_preempt_enable_no_resched(); |
759 | cond_resched(); | 768 | cond_resched(); |
760 | preempt_disable(); | 769 | preempt_disable(); |
761 | rcu_note_context_switch((long)__bind_cpu); | 770 | rcu_note_context_switch((long)__bind_cpu); |
762 | } | 771 | } |
763 | preempt_enable(); | 772 | preempt_enable(); |
764 | set_current_state(TASK_INTERRUPTIBLE); | 773 | set_current_state(TASK_INTERRUPTIBLE); |
765 | } | 774 | } |
766 | __set_current_state(TASK_RUNNING); | 775 | __set_current_state(TASK_RUNNING); |
767 | return 0; | 776 | return 0; |
768 | 777 | ||
769 | wait_to_die: | 778 | wait_to_die: |
770 | preempt_enable(); | 779 | preempt_enable(); |
771 | /* Wait for kthread_stop */ | 780 | /* Wait for kthread_stop */ |
772 | set_current_state(TASK_INTERRUPTIBLE); | 781 | set_current_state(TASK_INTERRUPTIBLE); |
773 | while (!kthread_should_stop()) { | 782 | while (!kthread_should_stop()) { |
774 | schedule(); | 783 | schedule(); |
775 | set_current_state(TASK_INTERRUPTIBLE); | 784 | set_current_state(TASK_INTERRUPTIBLE); |
776 | } | 785 | } |
777 | __set_current_state(TASK_RUNNING); | 786 | __set_current_state(TASK_RUNNING); |
778 | return 0; | 787 | return 0; |
779 | } | 788 | } |
780 | 789 | ||
781 | #ifdef CONFIG_HOTPLUG_CPU | 790 | #ifdef CONFIG_HOTPLUG_CPU |
782 | /* | 791 | /* |
783 | * tasklet_kill_immediate is called to remove a tasklet which can already be | 792 | * tasklet_kill_immediate is called to remove a tasklet which can already be |
784 | * scheduled for execution on @cpu. | 793 | * scheduled for execution on @cpu. |
785 | * | 794 | * |
786 | * Unlike tasklet_kill, this function removes the tasklet | 795 | * Unlike tasklet_kill, this function removes the tasklet |
787 | * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. | 796 | * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. |
788 | * | 797 | * |
789 | * When this function is called, @cpu must be in the CPU_DEAD state. | 798 | * When this function is called, @cpu must be in the CPU_DEAD state. |
790 | */ | 799 | */ |
791 | void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) | 800 | void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) |
792 | { | 801 | { |
793 | struct tasklet_struct **i; | 802 | struct tasklet_struct **i; |
794 | 803 | ||
795 | BUG_ON(cpu_online(cpu)); | 804 | BUG_ON(cpu_online(cpu)); |
796 | BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); | 805 | BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); |
797 | 806 | ||
798 | if (!test_bit(TASKLET_STATE_SCHED, &t->state)) | 807 | if (!test_bit(TASKLET_STATE_SCHED, &t->state)) |
799 | return; | 808 | return; |
800 | 809 | ||
801 | /* CPU is dead, so no lock needed. */ | 810 | /* CPU is dead, so no lock needed. */ |
802 | for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { | 811 | for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { |
803 | if (*i == t) { | 812 | if (*i == t) { |
804 | *i = t->next; | 813 | *i = t->next; |
805 | /* If this was the tail element, move the tail ptr */ | 814 | /* If this was the tail element, move the tail ptr */ |
806 | if (*i == NULL) | 815 | if (*i == NULL) |
807 | per_cpu(tasklet_vec, cpu).tail = i; | 816 | per_cpu(tasklet_vec, cpu).tail = i; |
808 | return; | 817 | return; |
809 | } | 818 | } |
810 | } | 819 | } |
811 | BUG(); | 820 | BUG(); |
812 | } | 821 | } |
813 | 822 | ||
814 | static void takeover_tasklets(unsigned int cpu) | 823 | static void takeover_tasklets(unsigned int cpu) |
815 | { | 824 | { |
816 | /* CPU is dead, so no lock needed. */ | 825 | /* CPU is dead, so no lock needed. */ |
817 | local_irq_disable(); | 826 | local_irq_disable(); |
818 | 827 | ||
819 | /* Find end, append list for that CPU. */ | 828 | /* Find end, append list for that CPU. */ |
820 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { | 829 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { |
821 | *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; | 830 | *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; |
822 | this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); | 831 | this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); |
823 | per_cpu(tasklet_vec, cpu).head = NULL; | 832 | per_cpu(tasklet_vec, cpu).head = NULL; |
824 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; | 833 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; |
825 | } | 834 | } |
826 | raise_softirq_irqoff(TASKLET_SOFTIRQ); | 835 | raise_softirq_irqoff(TASKLET_SOFTIRQ); |
827 | 836 | ||
828 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { | 837 | if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { |
829 | *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; | 838 | *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; |
830 | __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); | 839 | __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); |
831 | per_cpu(tasklet_hi_vec, cpu).head = NULL; | 840 | per_cpu(tasklet_hi_vec, cpu).head = NULL; |
832 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; | 841 | per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; |
833 | } | 842 | } |
834 | raise_softirq_irqoff(HI_SOFTIRQ); | 843 | raise_softirq_irqoff(HI_SOFTIRQ); |
835 | 844 | ||
836 | local_irq_enable(); | 845 | local_irq_enable(); |
837 | } | 846 | } |
838 | #endif /* CONFIG_HOTPLUG_CPU */ | 847 | #endif /* CONFIG_HOTPLUG_CPU */ |
839 | 848 | ||
840 | static int __cpuinit cpu_callback(struct notifier_block *nfb, | 849 | static int __cpuinit cpu_callback(struct notifier_block *nfb, |
841 | unsigned long action, | 850 | unsigned long action, |
842 | void *hcpu) | 851 | void *hcpu) |
843 | { | 852 | { |
844 | int hotcpu = (unsigned long)hcpu; | 853 | int hotcpu = (unsigned long)hcpu; |
845 | struct task_struct *p; | 854 | struct task_struct *p; |
846 | 855 | ||
847 | switch (action) { | 856 | switch (action) { |
848 | case CPU_UP_PREPARE: | 857 | case CPU_UP_PREPARE: |
849 | case CPU_UP_PREPARE_FROZEN: | 858 | case CPU_UP_PREPARE_FROZEN: |
850 | p = kthread_create_on_node(run_ksoftirqd, | 859 | p = kthread_create_on_node(run_ksoftirqd, |
851 | hcpu, | 860 | hcpu, |
852 | cpu_to_node(hotcpu), | 861 | cpu_to_node(hotcpu), |
853 | "ksoftirqd/%d", hotcpu); | 862 | "ksoftirqd/%d", hotcpu); |
854 | if (IS_ERR(p)) { | 863 | if (IS_ERR(p)) { |
855 | printk("ksoftirqd for %i failed\n", hotcpu); | 864 | printk("ksoftirqd for %i failed\n", hotcpu); |
856 | return notifier_from_errno(PTR_ERR(p)); | 865 | return notifier_from_errno(PTR_ERR(p)); |
857 | } | 866 | } |
858 | kthread_bind(p, hotcpu); | 867 | kthread_bind(p, hotcpu); |
859 | per_cpu(ksoftirqd, hotcpu) = p; | 868 | per_cpu(ksoftirqd, hotcpu) = p; |
860 | break; | 869 | break; |
861 | case CPU_ONLINE: | 870 | case CPU_ONLINE: |
862 | case CPU_ONLINE_FROZEN: | 871 | case CPU_ONLINE_FROZEN: |
863 | wake_up_process(per_cpu(ksoftirqd, hotcpu)); | 872 | wake_up_process(per_cpu(ksoftirqd, hotcpu)); |
864 | break; | 873 | break; |
865 | #ifdef CONFIG_HOTPLUG_CPU | 874 | #ifdef CONFIG_HOTPLUG_CPU |
866 | case CPU_UP_CANCELED: | 875 | case CPU_UP_CANCELED: |
867 | case CPU_UP_CANCELED_FROZEN: | 876 | case CPU_UP_CANCELED_FROZEN: |
868 | if (!per_cpu(ksoftirqd, hotcpu)) | 877 | if (!per_cpu(ksoftirqd, hotcpu)) |
869 | break; | 878 | break; |
870 | /* Unbind so it can run. Fall thru. */ | 879 | /* Unbind so it can run. Fall thru. */ |
871 | kthread_bind(per_cpu(ksoftirqd, hotcpu), | 880 | kthread_bind(per_cpu(ksoftirqd, hotcpu), |
872 | cpumask_any(cpu_online_mask)); | 881 | cpumask_any(cpu_online_mask)); |
873 | case CPU_DEAD: | 882 | case CPU_DEAD: |
874 | case CPU_DEAD_FROZEN: { | 883 | case CPU_DEAD_FROZEN: { |
875 | static const struct sched_param param = { | 884 | static const struct sched_param param = { |
876 | .sched_priority = MAX_RT_PRIO-1 | 885 | .sched_priority = MAX_RT_PRIO-1 |
877 | }; | 886 | }; |
878 | 887 | ||
879 | p = per_cpu(ksoftirqd, hotcpu); | 888 | p = per_cpu(ksoftirqd, hotcpu); |
880 | per_cpu(ksoftirqd, hotcpu) = NULL; | 889 | per_cpu(ksoftirqd, hotcpu) = NULL; |
881 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | 890 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); |
882 | kthread_stop(p); | 891 | kthread_stop(p); |
883 | takeover_tasklets(hotcpu); | 892 | takeover_tasklets(hotcpu); |
884 | break; | 893 | break; |
885 | } | 894 | } |
886 | #endif /* CONFIG_HOTPLUG_CPU */ | 895 | #endif /* CONFIG_HOTPLUG_CPU */ |
887 | } | 896 | } |
888 | return NOTIFY_OK; | 897 | return NOTIFY_OK; |
889 | } | 898 | } |
890 | 899 | ||
891 | static struct notifier_block __cpuinitdata cpu_nfb = { | 900 | static struct notifier_block __cpuinitdata cpu_nfb = { |
892 | .notifier_call = cpu_callback | 901 | .notifier_call = cpu_callback |
893 | }; | 902 | }; |
894 | 903 | ||
895 | static __init int spawn_ksoftirqd(void) | 904 | static __init int spawn_ksoftirqd(void) |
896 | { | 905 | { |
897 | void *cpu = (void *)(long)smp_processor_id(); | 906 | void *cpu = (void *)(long)smp_processor_id(); |
898 | int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 907 | int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
899 | 908 | ||
900 | BUG_ON(err != NOTIFY_OK); | 909 | BUG_ON(err != NOTIFY_OK); |
901 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 910 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
902 | register_cpu_notifier(&cpu_nfb); | 911 | register_cpu_notifier(&cpu_nfb); |
903 | return 0; | 912 | return 0; |
904 | } | 913 | } |
905 | early_initcall(spawn_ksoftirqd); | 914 | early_initcall(spawn_ksoftirqd); |
906 | 915 | ||
907 | /* | 916 | /* |
908 | * [ These __weak aliases are kept in a separate compilation unit, so that | 917 | * [ These __weak aliases are kept in a separate compilation unit, so that |
909 | * GCC does not inline them incorrectly. ] | 918 | * GCC does not inline them incorrectly. ] |
910 | */ | 919 | */ |
911 | 920 | ||
912 | int __init __weak early_irq_init(void) | 921 | int __init __weak early_irq_init(void) |
913 | { | 922 | { |
914 | return 0; | 923 | return 0; |
915 | } | 924 | } |
916 | 925 | ||
917 | #ifdef CONFIG_GENERIC_HARDIRQS | 926 | #ifdef CONFIG_GENERIC_HARDIRQS |
918 | int __init __weak arch_probe_nr_irqs(void) | 927 | int __init __weak arch_probe_nr_irqs(void) |
919 | { | 928 | { |
920 | return NR_IRQS_LEGACY; | 929 | return NR_IRQS_LEGACY; |
921 | } | 930 | } |
922 | 931 | ||
923 | int __init __weak arch_early_irq_init(void) | 932 | int __init __weak arch_early_irq_init(void) |
924 | { | 933 | { |
925 | return 0; | 934 | return 0; |
926 | } | 935 | } |
927 | #endif | 936 | #endif |
928 | 937 |
mm/page_alloc.c
1 | /* | 1 | /* |
2 | * linux/mm/page_alloc.c | 2 | * linux/mm/page_alloc.c |
3 | * | 3 | * |
4 | * Manages the free list, the system allocates free pages here. | 4 | * Manages the free list, the system allocates free pages here. |
5 | * Note that kmalloc() lives in slab.c | 5 | * Note that kmalloc() lives in slab.c |
6 | * | 6 | * |
7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
8 | * Swap reorganised 29.12.95, Stephen Tweedie | 8 | * Swap reorganised 29.12.95, Stephen Tweedie |
9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | 9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 | 10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 |
11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 | 11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 |
12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 | 12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 |
13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 | 13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 |
14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) | 14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/stddef.h> | 17 | #include <linux/stddef.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
20 | #include <linux/interrupt.h> | 20 | #include <linux/interrupt.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/jiffies.h> | 22 | #include <linux/jiffies.h> |
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/memblock.h> | 24 | #include <linux/memblock.h> |
25 | #include <linux/compiler.h> | 25 | #include <linux/compiler.h> |
26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
27 | #include <linux/kmemcheck.h> | 27 | #include <linux/kmemcheck.h> |
28 | #include <linux/module.h> | 28 | #include <linux/module.h> |
29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
30 | #include <linux/pagevec.h> | 30 | #include <linux/pagevec.h> |
31 | #include <linux/blkdev.h> | 31 | #include <linux/blkdev.h> |
32 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
33 | #include <linux/ratelimit.h> | 33 | #include <linux/ratelimit.h> |
34 | #include <linux/oom.h> | 34 | #include <linux/oom.h> |
35 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
36 | #include <linux/topology.h> | 36 | #include <linux/topology.h> |
37 | #include <linux/sysctl.h> | 37 | #include <linux/sysctl.h> |
38 | #include <linux/cpu.h> | 38 | #include <linux/cpu.h> |
39 | #include <linux/cpuset.h> | 39 | #include <linux/cpuset.h> |
40 | #include <linux/memory_hotplug.h> | 40 | #include <linux/memory_hotplug.h> |
41 | #include <linux/nodemask.h> | 41 | #include <linux/nodemask.h> |
42 | #include <linux/vmalloc.h> | 42 | #include <linux/vmalloc.h> |
43 | #include <linux/vmstat.h> | 43 | #include <linux/vmstat.h> |
44 | #include <linux/mempolicy.h> | 44 | #include <linux/mempolicy.h> |
45 | #include <linux/stop_machine.h> | 45 | #include <linux/stop_machine.h> |
46 | #include <linux/sort.h> | 46 | #include <linux/sort.h> |
47 | #include <linux/pfn.h> | 47 | #include <linux/pfn.h> |
48 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
49 | #include <linux/fault-inject.h> | 49 | #include <linux/fault-inject.h> |
50 | #include <linux/page-isolation.h> | 50 | #include <linux/page-isolation.h> |
51 | #include <linux/page_cgroup.h> | 51 | #include <linux/page_cgroup.h> |
52 | #include <linux/debugobjects.h> | 52 | #include <linux/debugobjects.h> |
53 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
54 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
55 | #include <trace/events/kmem.h> | 55 | #include <trace/events/kmem.h> |
56 | #include <linux/ftrace_event.h> | 56 | #include <linux/ftrace_event.h> |
57 | #include <linux/memcontrol.h> | 57 | #include <linux/memcontrol.h> |
58 | #include <linux/prefetch.h> | 58 | #include <linux/prefetch.h> |
59 | #include <linux/migrate.h> | 59 | #include <linux/migrate.h> |
60 | #include <linux/page-debug-flags.h> | 60 | #include <linux/page-debug-flags.h> |
61 | 61 | ||
62 | #include <asm/tlbflush.h> | 62 | #include <asm/tlbflush.h> |
63 | #include <asm/div64.h> | 63 | #include <asm/div64.h> |
64 | #include "internal.h" | 64 | #include "internal.h" |
65 | 65 | ||
66 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID | 66 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID |
67 | DEFINE_PER_CPU(int, numa_node); | 67 | DEFINE_PER_CPU(int, numa_node); |
68 | EXPORT_PER_CPU_SYMBOL(numa_node); | 68 | EXPORT_PER_CPU_SYMBOL(numa_node); |
69 | #endif | 69 | #endif |
70 | 70 | ||
71 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | 71 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
72 | /* | 72 | /* |
73 | * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. | 73 | * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. |
74 | * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. | 74 | * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. |
75 | * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() | 75 | * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() |
76 | * defined in <linux/topology.h>. | 76 | * defined in <linux/topology.h>. |
77 | */ | 77 | */ |
78 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ | 78 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ |
79 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); | 79 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); |
80 | #endif | 80 | #endif |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * Array of node states. | 83 | * Array of node states. |
84 | */ | 84 | */ |
85 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | 85 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
86 | [N_POSSIBLE] = NODE_MASK_ALL, | 86 | [N_POSSIBLE] = NODE_MASK_ALL, |
87 | [N_ONLINE] = { { [0] = 1UL } }, | 87 | [N_ONLINE] = { { [0] = 1UL } }, |
88 | #ifndef CONFIG_NUMA | 88 | #ifndef CONFIG_NUMA |
89 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, | 89 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, |
90 | #ifdef CONFIG_HIGHMEM | 90 | #ifdef CONFIG_HIGHMEM |
91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | 91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, |
92 | #endif | 92 | #endif |
93 | [N_CPU] = { { [0] = 1UL } }, | 93 | [N_CPU] = { { [0] = 1UL } }, |
94 | #endif /* NUMA */ | 94 | #endif /* NUMA */ |
95 | }; | 95 | }; |
96 | EXPORT_SYMBOL(node_states); | 96 | EXPORT_SYMBOL(node_states); |
97 | 97 | ||
98 | unsigned long totalram_pages __read_mostly; | 98 | unsigned long totalram_pages __read_mostly; |
99 | unsigned long totalreserve_pages __read_mostly; | 99 | unsigned long totalreserve_pages __read_mostly; |
100 | /* | 100 | /* |
101 | * When calculating the number of globally allowed dirty pages, there | 101 | * When calculating the number of globally allowed dirty pages, there |
102 | * is a certain number of per-zone reserves that should not be | 102 | * is a certain number of per-zone reserves that should not be |
103 | * considered dirtyable memory. This is the sum of those reserves | 103 | * considered dirtyable memory. This is the sum of those reserves |
104 | * over all existing zones that contribute dirtyable memory. | 104 | * over all existing zones that contribute dirtyable memory. |
105 | */ | 105 | */ |
106 | unsigned long dirty_balance_reserve __read_mostly; | 106 | unsigned long dirty_balance_reserve __read_mostly; |
107 | 107 | ||
108 | int percpu_pagelist_fraction; | 108 | int percpu_pagelist_fraction; |
109 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | 109 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
110 | 110 | ||
111 | #ifdef CONFIG_PM_SLEEP | 111 | #ifdef CONFIG_PM_SLEEP |
112 | /* | 112 | /* |
113 | * The following functions are used by the suspend/hibernate code to temporarily | 113 | * The following functions are used by the suspend/hibernate code to temporarily |
114 | * change gfp_allowed_mask in order to avoid using I/O during memory allocations | 114 | * change gfp_allowed_mask in order to avoid using I/O during memory allocations |
115 | * while devices are suspended. To avoid races with the suspend/hibernate code, | 115 | * while devices are suspended. To avoid races with the suspend/hibernate code, |
116 | * they should always be called with pm_mutex held (gfp_allowed_mask also should | 116 | * they should always be called with pm_mutex held (gfp_allowed_mask also should |
117 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | 117 | * only be modified with pm_mutex held, unless the suspend/hibernate code is |
118 | * guaranteed not to run in parallel with that modification). | 118 | * guaranteed not to run in parallel with that modification). |
119 | */ | 119 | */ |
120 | 120 | ||
121 | static gfp_t saved_gfp_mask; | 121 | static gfp_t saved_gfp_mask; |
122 | 122 | ||
123 | void pm_restore_gfp_mask(void) | 123 | void pm_restore_gfp_mask(void) |
124 | { | 124 | { |
125 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 125 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
126 | if (saved_gfp_mask) { | 126 | if (saved_gfp_mask) { |
127 | gfp_allowed_mask = saved_gfp_mask; | 127 | gfp_allowed_mask = saved_gfp_mask; |
128 | saved_gfp_mask = 0; | 128 | saved_gfp_mask = 0; |
129 | } | 129 | } |
130 | } | 130 | } |
131 | 131 | ||
132 | void pm_restrict_gfp_mask(void) | 132 | void pm_restrict_gfp_mask(void) |
133 | { | 133 | { |
134 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 134 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
135 | WARN_ON(saved_gfp_mask); | 135 | WARN_ON(saved_gfp_mask); |
136 | saved_gfp_mask = gfp_allowed_mask; | 136 | saved_gfp_mask = gfp_allowed_mask; |
137 | gfp_allowed_mask &= ~GFP_IOFS; | 137 | gfp_allowed_mask &= ~GFP_IOFS; |
138 | } | 138 | } |
139 | 139 | ||
140 | bool pm_suspended_storage(void) | 140 | bool pm_suspended_storage(void) |
141 | { | 141 | { |
142 | if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) | 142 | if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) |
143 | return false; | 143 | return false; |
144 | return true; | 144 | return true; |
145 | } | 145 | } |
146 | #endif /* CONFIG_PM_SLEEP */ | 146 | #endif /* CONFIG_PM_SLEEP */ |
147 | 147 | ||
148 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 148 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
149 | int pageblock_order __read_mostly; | 149 | int pageblock_order __read_mostly; |
150 | #endif | 150 | #endif |
151 | 151 | ||
152 | static void __free_pages_ok(struct page *page, unsigned int order); | 152 | static void __free_pages_ok(struct page *page, unsigned int order); |
153 | 153 | ||
154 | /* | 154 | /* |
155 | * results with 256, 32 in the lowmem_reserve sysctl: | 155 | * results with 256, 32 in the lowmem_reserve sysctl: |
156 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) | 156 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) |
157 | * 1G machine -> (16M dma, 784M normal, 224M high) | 157 | * 1G machine -> (16M dma, 784M normal, 224M high) |
158 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 158 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
159 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 159 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
160 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA | 160 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
161 | * | 161 | * |
162 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 162 | * TBD: should special case ZONE_DMA32 machines here - in those we normally |
163 | * don't need any ZONE_NORMAL reservation | 163 | * don't need any ZONE_NORMAL reservation |
164 | */ | 164 | */ |
165 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { | 165 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { |
166 | #ifdef CONFIG_ZONE_DMA | 166 | #ifdef CONFIG_ZONE_DMA |
167 | 256, | 167 | 256, |
168 | #endif | 168 | #endif |
169 | #ifdef CONFIG_ZONE_DMA32 | 169 | #ifdef CONFIG_ZONE_DMA32 |
170 | 256, | 170 | 256, |
171 | #endif | 171 | #endif |
172 | #ifdef CONFIG_HIGHMEM | 172 | #ifdef CONFIG_HIGHMEM |
173 | 32, | 173 | 32, |
174 | #endif | 174 | #endif |
175 | 32, | 175 | 32, |
176 | }; | 176 | }; |
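To make the ratios above concrete, here is a small standalone program (numbers taken from the 1G example in the comment, sizes in MB) that evaluates the reserves:

#include <stdio.h>

/* Reserve = (memory in higher zones that could spill down) / ratio.
 * Numbers follow the 1G example in the comment above; sizes in MB. */
int main(void)
{
        unsigned long normal_mb = 784, high_mb = 224;
        unsigned long dma_ratio = 256, normal_ratio = 32;

        printf("NORMAL alloc reserves %lu MB in ZONE_DMA\n",
               normal_mb / dma_ratio);                    /* 3 */
        printf("HIGHMEM alloc reserves %lu MB in ZONE_NORMAL\n",
               high_mb / normal_ratio);                   /* 7 */
        printf("HIGHMEM alloc reserves %lu MB in ZONE_DMA\n",
               (high_mb + normal_mb) / dma_ratio);        /* 3 */
        return 0;
}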
177 | 177 | ||
178 | EXPORT_SYMBOL(totalram_pages); | 178 | EXPORT_SYMBOL(totalram_pages); |
179 | 179 | ||
180 | static char * const zone_names[MAX_NR_ZONES] = { | 180 | static char * const zone_names[MAX_NR_ZONES] = { |
181 | #ifdef CONFIG_ZONE_DMA | 181 | #ifdef CONFIG_ZONE_DMA |
182 | "DMA", | 182 | "DMA", |
183 | #endif | 183 | #endif |
184 | #ifdef CONFIG_ZONE_DMA32 | 184 | #ifdef CONFIG_ZONE_DMA32 |
185 | "DMA32", | 185 | "DMA32", |
186 | #endif | 186 | #endif |
187 | "Normal", | 187 | "Normal", |
188 | #ifdef CONFIG_HIGHMEM | 188 | #ifdef CONFIG_HIGHMEM |
189 | "HighMem", | 189 | "HighMem", |
190 | #endif | 190 | #endif |
191 | "Movable", | 191 | "Movable", |
192 | }; | 192 | }; |
193 | 193 | ||
194 | int min_free_kbytes = 1024; | 194 | int min_free_kbytes = 1024; |
195 | 195 | ||
196 | static unsigned long __meminitdata nr_kernel_pages; | 196 | static unsigned long __meminitdata nr_kernel_pages; |
197 | static unsigned long __meminitdata nr_all_pages; | 197 | static unsigned long __meminitdata nr_all_pages; |
198 | static unsigned long __meminitdata dma_reserve; | 198 | static unsigned long __meminitdata dma_reserve; |
199 | 199 | ||
200 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 200 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
201 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 201 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
202 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 202 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
203 | static unsigned long __initdata required_kernelcore; | 203 | static unsigned long __initdata required_kernelcore; |
204 | static unsigned long __initdata required_movablecore; | 204 | static unsigned long __initdata required_movablecore; |
205 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 205 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
206 | 206 | ||
207 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 207 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
208 | int movable_zone; | 208 | int movable_zone; |
209 | EXPORT_SYMBOL(movable_zone); | 209 | EXPORT_SYMBOL(movable_zone); |
210 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 210 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
211 | 211 | ||
212 | #if MAX_NUMNODES > 1 | 212 | #if MAX_NUMNODES > 1 |
213 | int nr_node_ids __read_mostly = MAX_NUMNODES; | 213 | int nr_node_ids __read_mostly = MAX_NUMNODES; |
214 | int nr_online_nodes __read_mostly = 1; | 214 | int nr_online_nodes __read_mostly = 1; |
215 | EXPORT_SYMBOL(nr_node_ids); | 215 | EXPORT_SYMBOL(nr_node_ids); |
216 | EXPORT_SYMBOL(nr_online_nodes); | 216 | EXPORT_SYMBOL(nr_online_nodes); |
217 | #endif | 217 | #endif |
218 | 218 | ||
219 | int page_group_by_mobility_disabled __read_mostly; | 219 | int page_group_by_mobility_disabled __read_mostly; |
220 | 220 | ||
221 | /* | 221 | /* |
222 | * NOTE: | 222 | * NOTE: |
223 | * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. | 223 | * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. |
224 | * Instead, use {un}set_pageblock_isolate. | 224 | * Instead, use {un}set_pageblock_isolate. |
225 | */ | 225 | */ |
226 | void set_pageblock_migratetype(struct page *page, int migratetype) | 226 | void set_pageblock_migratetype(struct page *page, int migratetype) |
227 | { | 227 | { |
228 | 228 | ||
229 | if (unlikely(page_group_by_mobility_disabled)) | 229 | if (unlikely(page_group_by_mobility_disabled)) |
230 | migratetype = MIGRATE_UNMOVABLE; | 230 | migratetype = MIGRATE_UNMOVABLE; |
231 | 231 | ||
232 | set_pageblock_flags_group(page, (unsigned long)migratetype, | 232 | set_pageblock_flags_group(page, (unsigned long)migratetype, |
233 | PB_migrate, PB_migrate_end); | 233 | PB_migrate, PB_migrate_end); |
234 | } | 234 | } |
235 | 235 | ||
236 | bool oom_killer_disabled __read_mostly; | 236 | bool oom_killer_disabled __read_mostly; |
237 | 237 | ||
238 | #ifdef CONFIG_DEBUG_VM | 238 | #ifdef CONFIG_DEBUG_VM |
239 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 239 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
240 | { | 240 | { |
241 | int ret = 0; | 241 | int ret = 0; |
242 | unsigned seq; | 242 | unsigned seq; |
243 | unsigned long pfn = page_to_pfn(page); | 243 | unsigned long pfn = page_to_pfn(page); |
244 | 244 | ||
245 | do { | 245 | do { |
246 | seq = zone_span_seqbegin(zone); | 246 | seq = zone_span_seqbegin(zone); |
247 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) | 247 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) |
248 | ret = 1; | 248 | ret = 1; |
249 | else if (pfn < zone->zone_start_pfn) | 249 | else if (pfn < zone->zone_start_pfn) |
250 | ret = 1; | 250 | ret = 1; |
251 | } while (zone_span_seqretry(zone, seq)); | 251 | } while (zone_span_seqretry(zone, seq)); |
252 | 252 | ||
253 | return ret; | 253 | return ret; |
254 | } | 254 | } |
255 | 255 | ||
256 | static int page_is_consistent(struct zone *zone, struct page *page) | 256 | static int page_is_consistent(struct zone *zone, struct page *page) |
257 | { | 257 | { |
258 | if (!pfn_valid_within(page_to_pfn(page))) | 258 | if (!pfn_valid_within(page_to_pfn(page))) |
259 | return 0; | 259 | return 0; |
260 | if (zone != page_zone(page)) | 260 | if (zone != page_zone(page)) |
261 | return 0; | 261 | return 0; |
262 | 262 | ||
263 | return 1; | 263 | return 1; |
264 | } | 264 | } |
265 | /* | 265 | /* |
266 | * Temporary debugging check for pages not lying within a given zone. | 266 | * Temporary debugging check for pages not lying within a given zone. |
267 | */ | 267 | */ |
268 | static int bad_range(struct zone *zone, struct page *page) | 268 | static int bad_range(struct zone *zone, struct page *page) |
269 | { | 269 | { |
270 | if (page_outside_zone_boundaries(zone, page)) | 270 | if (page_outside_zone_boundaries(zone, page)) |
271 | return 1; | 271 | return 1; |
272 | if (!page_is_consistent(zone, page)) | 272 | if (!page_is_consistent(zone, page)) |
273 | return 1; | 273 | return 1; |
274 | 274 | ||
275 | return 0; | 275 | return 0; |
276 | } | 276 | } |
277 | #else | 277 | #else |
278 | static inline int bad_range(struct zone *zone, struct page *page) | 278 | static inline int bad_range(struct zone *zone, struct page *page) |
279 | { | 279 | { |
280 | return 0; | 280 | return 0; |
281 | } | 281 | } |
282 | #endif | 282 | #endif |
283 | 283 | ||
284 | static void bad_page(struct page *page) | 284 | static void bad_page(struct page *page) |
285 | { | 285 | { |
286 | static unsigned long resume; | 286 | static unsigned long resume; |
287 | static unsigned long nr_shown; | 287 | static unsigned long nr_shown; |
288 | static unsigned long nr_unshown; | 288 | static unsigned long nr_unshown; |
289 | 289 | ||
290 | /* Don't complain about poisoned pages */ | 290 | /* Don't complain about poisoned pages */ |
291 | if (PageHWPoison(page)) { | 291 | if (PageHWPoison(page)) { |
292 | reset_page_mapcount(page); /* remove PageBuddy */ | 292 | reset_page_mapcount(page); /* remove PageBuddy */ |
293 | return; | 293 | return; |
294 | } | 294 | } |
295 | 295 | ||
296 | /* | 296 | /* |
297 | * Allow a burst of 60 reports, then keep quiet for that minute; | 297 | * Allow a burst of 60 reports, then keep quiet for that minute; |
298 | * or allow a steady drip of one report per second. | 298 | * or allow a steady drip of one report per second. |
299 | */ | 299 | */ |
300 | if (nr_shown == 60) { | 300 | if (nr_shown == 60) { |
301 | if (time_before(jiffies, resume)) { | 301 | if (time_before(jiffies, resume)) { |
302 | nr_unshown++; | 302 | nr_unshown++; |
303 | goto out; | 303 | goto out; |
304 | } | 304 | } |
305 | if (nr_unshown) { | 305 | if (nr_unshown) { |
306 | printk(KERN_ALERT | 306 | printk(KERN_ALERT |
307 | "BUG: Bad page state: %lu messages suppressed\n", | 307 | "BUG: Bad page state: %lu messages suppressed\n", |
308 | nr_unshown); | 308 | nr_unshown); |
309 | nr_unshown = 0; | 309 | nr_unshown = 0; |
310 | } | 310 | } |
311 | nr_shown = 0; | 311 | nr_shown = 0; |
312 | } | 312 | } |
313 | if (nr_shown++ == 0) | 313 | if (nr_shown++ == 0) |
314 | resume = jiffies + 60 * HZ; | 314 | resume = jiffies + 60 * HZ; |
315 | 315 | ||
316 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 316 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", |
317 | current->comm, page_to_pfn(page)); | 317 | current->comm, page_to_pfn(page)); |
318 | dump_page(page); | 318 | dump_page(page); |
319 | 319 | ||
320 | print_modules(); | 320 | print_modules(); |
321 | dump_stack(); | 321 | dump_stack(); |
322 | out: | 322 | out: |
323 | /* Leave bad fields for debug, except PageBuddy could make trouble */ | 323 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
324 | reset_page_mapcount(page); /* remove PageBuddy */ | 324 | reset_page_mapcount(page); /* remove PageBuddy */ |
325 | add_taint(TAINT_BAD_PAGE); | 325 | add_taint(TAINT_BAD_PAGE); |
326 | } | 326 | } |
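The throttling in bad_page() can be illustrated with a simplified userspace analogue that uses seconds instead of jiffies; the threshold of 60 and the one-minute window mirror the comment above, but the rest is only a sketch:

#include <stdio.h>
#include <time.h>

/* Simplified analogue of the report throttling in bad_page():
 * allow a burst of 60 messages, then stay quiet until `resume`,
 * counting how many were suppressed in the meantime. */
static time_t resume;
static unsigned long nr_shown, nr_unshown;

static void report(const char *msg)
{
        time_t now = time(NULL);

        if (nr_shown == 60) {
                if (now < resume) {
                        nr_unshown++;
                        return;
                }
                if (nr_unshown) {
                        printf("%lu messages suppressed\n", nr_unshown);
                        nr_unshown = 0;
                }
                nr_shown = 0;
        }
        if (nr_shown++ == 0)
                resume = now + 60;
        printf("%s\n", msg);
}

int main(void)
{
        for (int i = 0; i < 100; i++)
                report("bad page");   /* only the first 60 print immediately */
        return 0;
}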
327 | 327 | ||
328 | /* | 328 | /* |
329 | * Higher-order pages are called "compound pages". They are structured thusly: | 329 | * Higher-order pages are called "compound pages". They are structured thusly: |
330 | * | 330 | * |
331 | * The first PAGE_SIZE page is called the "head page". | 331 | * The first PAGE_SIZE page is called the "head page". |
332 | * | 332 | * |
333 | * The remaining PAGE_SIZE pages are called "tail pages". | 333 | * The remaining PAGE_SIZE pages are called "tail pages". |
334 | * | 334 | * |
335 | * All pages have PG_compound set. All tail pages have their ->first_page | 335 | * All pages have PG_compound set. All tail pages have their ->first_page |
336 | * pointing at the head page. | 336 | * pointing at the head page. |
337 | * | 337 | * |
338 | * The first tail page's ->lru.next holds the address of the compound page's | 338 | * The first tail page's ->lru.next holds the address of the compound page's |
339 | * put_page() function. Its ->lru.prev holds the order of allocation. | 339 | * put_page() function. Its ->lru.prev holds the order of allocation. |
340 | * This usage means that zero-order pages may not be compound. | 340 | * This usage means that zero-order pages may not be compound. |
341 | */ | 341 | */ |
342 | 342 | ||
343 | static void free_compound_page(struct page *page) | 343 | static void free_compound_page(struct page *page) |
344 | { | 344 | { |
345 | __free_pages_ok(page, compound_order(page)); | 345 | __free_pages_ok(page, compound_order(page)); |
346 | } | 346 | } |
347 | 347 | ||
348 | void prep_compound_page(struct page *page, unsigned long order) | 348 | void prep_compound_page(struct page *page, unsigned long order) |
349 | { | 349 | { |
350 | int i; | 350 | int i; |
351 | int nr_pages = 1 << order; | 351 | int nr_pages = 1 << order; |
352 | 352 | ||
353 | set_compound_page_dtor(page, free_compound_page); | 353 | set_compound_page_dtor(page, free_compound_page); |
354 | set_compound_order(page, order); | 354 | set_compound_order(page, order); |
355 | __SetPageHead(page); | 355 | __SetPageHead(page); |
356 | for (i = 1; i < nr_pages; i++) { | 356 | for (i = 1; i < nr_pages; i++) { |
357 | struct page *p = page + i; | 357 | struct page *p = page + i; |
358 | __SetPageTail(p); | 358 | __SetPageTail(p); |
359 | set_page_count(p, 0); | 359 | set_page_count(p, 0); |
360 | p->first_page = page; | 360 | p->first_page = page; |
361 | } | 361 | } |
362 | } | 362 | } |
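A toy model of the head/tail linkage set up by prep_compound_page(); the struct below is an invented stand-in for struct page, kept only to show that every tail page points back at the head:

#include <stdio.h>

/* Toy model (not the real struct page) of how prep_compound_page()
 * links tail pages back to the head page. */
struct toy_page {
        int head;                 /* stands in for PG_head / PG_tail */
        struct toy_page *first;   /* stands in for ->first_page      */
};

static void prep_compound(struct toy_page *page, unsigned order)
{
        unsigned nr = 1u << order;

        page->head = 1;
        for (unsigned i = 1; i < nr; i++) {
                page[i].head = 0;
                page[i].first = page;     /* every tail points at the head */
        }
}

int main(void)
{
        struct toy_page pages[8] = { { 0, NULL } };

        prep_compound(pages, 3);          /* order-3 block: 1 head + 7 tails */
        printf("tail 5 -> head? %d\n", pages[5].first == pages);   /* 1 */
        return 0;
}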
363 | 363 | ||
364 | /* update __split_huge_page_refcount if you change this function */ | 364 | /* update __split_huge_page_refcount if you change this function */ |
365 | static int destroy_compound_page(struct page *page, unsigned long order) | 365 | static int destroy_compound_page(struct page *page, unsigned long order) |
366 | { | 366 | { |
367 | int i; | 367 | int i; |
368 | int nr_pages = 1 << order; | 368 | int nr_pages = 1 << order; |
369 | int bad = 0; | 369 | int bad = 0; |
370 | 370 | ||
371 | if (unlikely(compound_order(page) != order) || | 371 | if (unlikely(compound_order(page) != order) || |
372 | unlikely(!PageHead(page))) { | 372 | unlikely(!PageHead(page))) { |
373 | bad_page(page); | 373 | bad_page(page); |
374 | bad++; | 374 | bad++; |
375 | } | 375 | } |
376 | 376 | ||
377 | __ClearPageHead(page); | 377 | __ClearPageHead(page); |
378 | 378 | ||
379 | for (i = 1; i < nr_pages; i++) { | 379 | for (i = 1; i < nr_pages; i++) { |
380 | struct page *p = page + i; | 380 | struct page *p = page + i; |
381 | 381 | ||
382 | if (unlikely(!PageTail(p) || (p->first_page != page))) { | 382 | if (unlikely(!PageTail(p) || (p->first_page != page))) { |
383 | bad_page(page); | 383 | bad_page(page); |
384 | bad++; | 384 | bad++; |
385 | } | 385 | } |
386 | __ClearPageTail(p); | 386 | __ClearPageTail(p); |
387 | } | 387 | } |
388 | 388 | ||
389 | return bad; | 389 | return bad; |
390 | } | 390 | } |
391 | 391 | ||
392 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 392 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) |
393 | { | 393 | { |
394 | int i; | 394 | int i; |
395 | 395 | ||
396 | /* | 396 | /* |
397 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | 397 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO |
398 | * and __GFP_HIGHMEM from hard or soft interrupt context. | 398 | * and __GFP_HIGHMEM from hard or soft interrupt context. |
399 | */ | 399 | */ |
400 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | 400 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); |
401 | for (i = 0; i < (1 << order); i++) | 401 | for (i = 0; i < (1 << order); i++) |
402 | clear_highpage(page + i); | 402 | clear_highpage(page + i); |
403 | } | 403 | } |
404 | 404 | ||
405 | #ifdef CONFIG_DEBUG_PAGEALLOC | 405 | #ifdef CONFIG_DEBUG_PAGEALLOC |
406 | unsigned int _debug_guardpage_minorder; | 406 | unsigned int _debug_guardpage_minorder; |
407 | 407 | ||
408 | static int __init debug_guardpage_minorder_setup(char *buf) | 408 | static int __init debug_guardpage_minorder_setup(char *buf) |
409 | { | 409 | { |
410 | unsigned long res; | 410 | unsigned long res; |
411 | 411 | ||
412 | if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { | 412 | if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { |
413 | printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); | 413 | printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); |
414 | return 0; | 414 | return 0; |
415 | } | 415 | } |
416 | _debug_guardpage_minorder = res; | 416 | _debug_guardpage_minorder = res; |
417 | printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); | 417 | printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); |
418 | return 0; | 418 | return 0; |
419 | } | 419 | } |
420 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); | 420 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); |
421 | 421 | ||
422 | static inline void set_page_guard_flag(struct page *page) | 422 | static inline void set_page_guard_flag(struct page *page) |
423 | { | 423 | { |
424 | __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 424 | __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); |
425 | } | 425 | } |
426 | 426 | ||
427 | static inline void clear_page_guard_flag(struct page *page) | 427 | static inline void clear_page_guard_flag(struct page *page) |
428 | { | 428 | { |
429 | __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 429 | __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); |
430 | } | 430 | } |
431 | #else | 431 | #else |
432 | static inline void set_page_guard_flag(struct page *page) { } | 432 | static inline void set_page_guard_flag(struct page *page) { } |
433 | static inline void clear_page_guard_flag(struct page *page) { } | 433 | static inline void clear_page_guard_flag(struct page *page) { } |
434 | #endif | 434 | #endif |
435 | 435 | ||
436 | static inline void set_page_order(struct page *page, int order) | 436 | static inline void set_page_order(struct page *page, int order) |
437 | { | 437 | { |
438 | set_page_private(page, order); | 438 | set_page_private(page, order); |
439 | __SetPageBuddy(page); | 439 | __SetPageBuddy(page); |
440 | } | 440 | } |
441 | 441 | ||
442 | static inline void rmv_page_order(struct page *page) | 442 | static inline void rmv_page_order(struct page *page) |
443 | { | 443 | { |
444 | __ClearPageBuddy(page); | 444 | __ClearPageBuddy(page); |
445 | set_page_private(page, 0); | 445 | set_page_private(page, 0); |
446 | } | 446 | } |
447 | 447 | ||
448 | /* | 448 | /* |
449 | * Locate the struct page for both the matching buddy in our | 449 | * Locate the struct page for both the matching buddy in our |
450 | * pair (buddy1) and the combined order O+1 page they form (page). | 450 | * pair (buddy1) and the combined order O+1 page they form (page). |
451 | * | 451 | * |
452 | * 1) Any buddy B1 will have an order O twin B2 which satisfies | 452 | * 1) Any buddy B1 will have an order O twin B2 which satisfies |
453 | * the following equation: | 453 | * the following equation: |
454 | * B2 = B1 ^ (1 << O) | 454 | * B2 = B1 ^ (1 << O) |
455 | * For example, if the starting buddy (buddy2) is #8 its order | 455 | * For example, if the starting buddy (buddy2) is #8 its order |
456 | * 1 buddy is #10: | 456 | * 1 buddy is #10: |
457 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 | 457 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 |
458 | * | 458 | * |
459 | * 2) Any buddy B will have an order O+1 parent P which | 459 | * 2) Any buddy B will have an order O+1 parent P which |
460 | * satisfies the following equation: | 460 | * satisfies the following equation: |
461 | * P = B & ~(1 << O) | 461 | * P = B & ~(1 << O) |
462 | * | 462 | * |
463 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | 463 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
464 | */ | 464 | */ |
465 | static inline unsigned long | 465 | static inline unsigned long |
466 | __find_buddy_index(unsigned long page_idx, unsigned int order) | 466 | __find_buddy_index(unsigned long page_idx, unsigned int order) |
467 | { | 467 | { |
468 | return page_idx ^ (1 << order); | 468 | return page_idx ^ (1 << order); |
469 | } | 469 | } |
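The XOR relation in the comment is easy to check in isolation; this small program reproduces the #8/#10 example with the same formula used by __find_buddy_index():

#include <stdio.h>

/* The buddy of a block at index idx and order O is idx ^ (1 << O),
 * exactly as __find_buddy_index() computes above. */
static unsigned long buddy_index(unsigned long idx, unsigned int order)
{
        return idx ^ (1UL << order);
}

int main(void)
{
        printf("%lu\n", buddy_index(8, 1));   /* 10, as in the comment */
        printf("%lu\n", buddy_index(8, 0));   /* 9                     */
        printf("%lu\n", buddy_index(12, 2));  /* 8                     */
        return 0;
}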
470 | 470 | ||
471 | /* | 471 | /* |
472 | * This function checks whether a page is free && is the buddy. | 472 | * This function checks whether a page is free && is the buddy. |
473 | * We can coalesce a page and its buddy if | 473 | * We can coalesce a page and its buddy if |
474 | * (a) the buddy is not in a hole && | 474 | * (a) the buddy is not in a hole && |
475 | * (b) the buddy is in the buddy system && | 475 | * (b) the buddy is in the buddy system && |
476 | * (c) a page and its buddy have the same order && | 476 | * (c) a page and its buddy have the same order && |
477 | * (d) a page and its buddy are in the same zone. | 477 | * (d) a page and its buddy are in the same zone. |
478 | * | 478 | * |
479 | * For recording whether a page is in the buddy system, we set ->_mapcount to -2. | 479 | * For recording whether a page is in the buddy system, we set ->_mapcount to -2. |
480 | * Setting, clearing, and testing _mapcount == -2 is serialized by zone->lock. | 480 | * Setting, clearing, and testing _mapcount == -2 is serialized by zone->lock. |
481 | * | 481 | * |
482 | * For recording page's order, we use page_private(page). | 482 | * For recording page's order, we use page_private(page). |
483 | */ | 483 | */ |
484 | static inline int page_is_buddy(struct page *page, struct page *buddy, | 484 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
485 | int order) | 485 | int order) |
486 | { | 486 | { |
487 | if (!pfn_valid_within(page_to_pfn(buddy))) | 487 | if (!pfn_valid_within(page_to_pfn(buddy))) |
488 | return 0; | 488 | return 0; |
489 | 489 | ||
490 | if (page_zone_id(page) != page_zone_id(buddy)) | 490 | if (page_zone_id(page) != page_zone_id(buddy)) |
491 | return 0; | 491 | return 0; |
492 | 492 | ||
493 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 493 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
494 | VM_BUG_ON(page_count(buddy) != 0); | 494 | VM_BUG_ON(page_count(buddy) != 0); |
495 | return 1; | 495 | return 1; |
496 | } | 496 | } |
497 | 497 | ||
498 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 498 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
499 | VM_BUG_ON(page_count(buddy) != 0); | 499 | VM_BUG_ON(page_count(buddy) != 0); |
500 | return 1; | 500 | return 1; |
501 | } | 501 | } |
502 | return 0; | 502 | return 0; |
503 | } | 503 | } |
504 | 504 | ||
505 | /* | 505 | /* |
506 | * Freeing function for a buddy system allocator. | 506 | * Freeing function for a buddy system allocator. |
507 | * | 507 | * |
508 | * The concept of a buddy system is to maintain direct-mapped table | 508 | * The concept of a buddy system is to maintain direct-mapped table |
509 | * (containing bit values) for memory blocks of various "orders". | 509 | * (containing bit values) for memory blocks of various "orders". |
510 | * The bottom level table contains the map for the smallest allocatable | 510 | * The bottom level table contains the map for the smallest allocatable |
511 | * units of memory (here, pages), and each level above it describes | 511 | * units of memory (here, pages), and each level above it describes |
512 | * pairs of units from the levels below, hence, "buddies". | 512 | * pairs of units from the levels below, hence, "buddies". |
513 | * At a high level, all that happens here is marking the table entry | 513 | * At a high level, all that happens here is marking the table entry |
514 | * at the bottom level available, and propagating the changes upward | 514 | * at the bottom level available, and propagating the changes upward |
515 | * as necessary, plus some accounting needed to play nicely with other | 515 | * as necessary, plus some accounting needed to play nicely with other |
516 | * parts of the VM system. | 516 | * parts of the VM system. |
517 | * At each level, we keep a list of pages, which are heads of contiguous | 517 | * At each level, we keep a list of pages, which are heads of contiguous |
518 | * free pages of length (1 << order) and marked with _mapcount == -2. The page's | 518 | * free pages of length (1 << order) and marked with _mapcount == -2. The page's |
519 | * order is recorded in page_private(page) field. | 519 | * order is recorded in page_private(page) field. |
520 | * So when we are allocating or freeing one, we can derive the state of the | 520 | * So when we are allocating or freeing one, we can derive the state of the |
521 | * other. That is, if we allocate a small block, and both were | 521 | * other. That is, if we allocate a small block, and both were |
522 | * free, the remainder of the region must be split into blocks. | 522 | * free, the remainder of the region must be split into blocks. |
523 | * If a block is freed, and its buddy is also free, then this | 523 | * If a block is freed, and its buddy is also free, then this |
524 | * triggers coalescing into a block of larger size. | 524 | * triggers coalescing into a block of larger size. |
525 | * | 525 | * |
526 | * -- wli | 526 | * -- wli |
527 | */ | 527 | */ |
528 | 528 | ||
529 | static inline void __free_one_page(struct page *page, | 529 | static inline void __free_one_page(struct page *page, |
530 | struct zone *zone, unsigned int order, | 530 | struct zone *zone, unsigned int order, |
531 | int migratetype) | 531 | int migratetype) |
532 | { | 532 | { |
533 | unsigned long page_idx; | 533 | unsigned long page_idx; |
534 | unsigned long combined_idx; | 534 | unsigned long combined_idx; |
535 | unsigned long uninitialized_var(buddy_idx); | 535 | unsigned long uninitialized_var(buddy_idx); |
536 | struct page *buddy; | 536 | struct page *buddy; |
537 | 537 | ||
538 | if (unlikely(PageCompound(page))) | 538 | if (unlikely(PageCompound(page))) |
539 | if (unlikely(destroy_compound_page(page, order))) | 539 | if (unlikely(destroy_compound_page(page, order))) |
540 | return; | 540 | return; |
541 | 541 | ||
542 | VM_BUG_ON(migratetype == -1); | 542 | VM_BUG_ON(migratetype == -1); |
543 | 543 | ||
544 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 544 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
545 | 545 | ||
546 | VM_BUG_ON(page_idx & ((1 << order) - 1)); | 546 | VM_BUG_ON(page_idx & ((1 << order) - 1)); |
547 | VM_BUG_ON(bad_range(zone, page)); | 547 | VM_BUG_ON(bad_range(zone, page)); |
548 | 548 | ||
549 | while (order < MAX_ORDER-1) { | 549 | while (order < MAX_ORDER-1) { |
550 | buddy_idx = __find_buddy_index(page_idx, order); | 550 | buddy_idx = __find_buddy_index(page_idx, order); |
551 | buddy = page + (buddy_idx - page_idx); | 551 | buddy = page + (buddy_idx - page_idx); |
552 | if (!page_is_buddy(page, buddy, order)) | 552 | if (!page_is_buddy(page, buddy, order)) |
553 | break; | 553 | break; |
554 | /* | 554 | /* |
555 | * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard page, | 555 | * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard page, |
556 | * merge with it and move up one order. | 556 | * merge with it and move up one order. |
557 | */ | 557 | */ |
558 | if (page_is_guard(buddy)) { | 558 | if (page_is_guard(buddy)) { |
559 | clear_page_guard_flag(buddy); | 559 | clear_page_guard_flag(buddy); |
560 | set_page_private(page, 0); | 560 | set_page_private(page, 0); |
561 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 561 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); |
562 | } else { | 562 | } else { |
563 | list_del(&buddy->lru); | 563 | list_del(&buddy->lru); |
564 | zone->free_area[order].nr_free--; | 564 | zone->free_area[order].nr_free--; |
565 | rmv_page_order(buddy); | 565 | rmv_page_order(buddy); |
566 | } | 566 | } |
567 | combined_idx = buddy_idx & page_idx; | 567 | combined_idx = buddy_idx & page_idx; |
568 | page = page + (combined_idx - page_idx); | 568 | page = page + (combined_idx - page_idx); |
569 | page_idx = combined_idx; | 569 | page_idx = combined_idx; |
570 | order++; | 570 | order++; |
571 | } | 571 | } |
572 | set_page_order(page, order); | 572 | set_page_order(page, order); |
573 | 573 | ||
574 | /* | 574 | /* |
575 | * If this is not the largest possible page, check if the buddy | 575 | * If this is not the largest possible page, check if the buddy |
576 | * of the next-highest order is free. If it is, it's possible | 576 | * of the next-highest order is free. If it is, it's possible |
577 | * that pages are being freed that will coalesce soon. In case | 577 | * that pages are being freed that will coalesce soon. In case |
578 | * that is happening, add the free page to the tail of the list | 578 | * that is happening, add the free page to the tail of the list |
579 | * so it's less likely to be used soon and more likely to be merged | 579 | * so it's less likely to be used soon and more likely to be merged |
580 | * as a higher order page | 580 | * as a higher order page |
581 | */ | 581 | */ |
582 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { | 582 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { |
583 | struct page *higher_page, *higher_buddy; | 583 | struct page *higher_page, *higher_buddy; |
584 | combined_idx = buddy_idx & page_idx; | 584 | combined_idx = buddy_idx & page_idx; |
585 | higher_page = page + (combined_idx - page_idx); | 585 | higher_page = page + (combined_idx - page_idx); |
586 | buddy_idx = __find_buddy_index(combined_idx, order + 1); | 586 | buddy_idx = __find_buddy_index(combined_idx, order + 1); |
587 | higher_buddy = page + (buddy_idx - combined_idx); | 587 | higher_buddy = page + (buddy_idx - combined_idx); |
588 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { | 588 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { |
589 | list_add_tail(&page->lru, | 589 | list_add_tail(&page->lru, |
590 | &zone->free_area[order].free_list[migratetype]); | 590 | &zone->free_area[order].free_list[migratetype]); |
591 | goto out; | 591 | goto out; |
592 | } | 592 | } |
593 | } | 593 | } |
594 | 594 | ||
595 | list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); | 595 | list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); |
596 | out: | 596 | out: |
597 | zone->free_area[order].nr_free++; | 597 | zone->free_area[order].nr_free++; |
598 | } | 598 | } |
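The index arithmetic that drives the merge loop in __free_one_page() can be followed on its own. The sketch below assumes every buddy is free (the real code checks page_is_buddy() at each step) and only prints the indices involved:

#include <stdio.h>

/* Index arithmetic only: how __free_one_page() walks up the orders.
 * In the real code each step also checks page_is_buddy(); here we
 * just assume every buddy is free so the merge goes all the way up. */
int main(void)
{
        unsigned long page_idx = 5;       /* freeing page #5 at order 0 */
        unsigned int order = 0, max_order = 4;

        while (order < max_order - 1) {
                unsigned long buddy_idx = page_idx ^ (1UL << order);
                unsigned long combined  = buddy_idx & page_idx;

                printf("order %u: merge %lu with buddy %lu -> block %lu\n",
                       order, page_idx, buddy_idx, combined);
                page_idx = combined;
                order++;
        }
        return 0;
}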
599 | 599 | ||
600 | /* | 600 | /* |
601 | * free_page_mlock() -- clean up attempts to free an mlocked() page. | 601 | * free_page_mlock() -- clean up attempts to free an mlocked() page. |
602 | * Page should not be on lru, so no need to fix that up. | 602 | * Page should not be on lru, so no need to fix that up. |
603 | * free_pages_check() will verify... | 603 | * free_pages_check() will verify... |
604 | */ | 604 | */ |
605 | static inline void free_page_mlock(struct page *page) | 605 | static inline void free_page_mlock(struct page *page) |
606 | { | 606 | { |
607 | __dec_zone_page_state(page, NR_MLOCK); | 607 | __dec_zone_page_state(page, NR_MLOCK); |
608 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | 608 | __count_vm_event(UNEVICTABLE_MLOCKFREED); |
609 | } | 609 | } |
610 | 610 | ||
611 | static inline int free_pages_check(struct page *page) | 611 | static inline int free_pages_check(struct page *page) |
612 | { | 612 | { |
613 | if (unlikely(page_mapcount(page) | | 613 | if (unlikely(page_mapcount(page) | |
614 | (page->mapping != NULL) | | 614 | (page->mapping != NULL) | |
615 | (atomic_read(&page->_count) != 0) | | 615 | (atomic_read(&page->_count) != 0) | |
616 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | | 616 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | |
617 | (mem_cgroup_bad_page_check(page)))) { | 617 | (mem_cgroup_bad_page_check(page)))) { |
618 | bad_page(page); | 618 | bad_page(page); |
619 | return 1; | 619 | return 1; |
620 | } | 620 | } |
621 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 621 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
622 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 622 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
623 | return 0; | 623 | return 0; |
624 | } | 624 | } |
625 | 625 | ||
626 | /* | 626 | /* |
627 | * Frees a number of pages from the PCP lists. | 627 | * Frees a number of pages from the PCP lists. |
628 | * Assumes all pages on the list are in the same zone, and of the same order. | 628 | * Assumes all pages on the list are in the same zone, and of the same order. |
629 | * count is the number of pages to free. | 629 | * count is the number of pages to free. |
630 | * | 630 | * |
631 | * If the zone was previously in an "all pages pinned" state then look to | 631 | * If the zone was previously in an "all pages pinned" state then look to |
632 | * see if this freeing clears that state. | 632 | * see if this freeing clears that state. |
633 | * | 633 | * |
634 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 634 | * And clear the zone's pages_scanned counter, to hold off the "all pages are |
635 | * pinned" detection logic. | 635 | * pinned" detection logic. |
636 | */ | 636 | */ |
637 | static void free_pcppages_bulk(struct zone *zone, int count, | 637 | static void free_pcppages_bulk(struct zone *zone, int count, |
638 | struct per_cpu_pages *pcp) | 638 | struct per_cpu_pages *pcp) |
639 | { | 639 | { |
640 | int migratetype = 0; | 640 | int migratetype = 0; |
641 | int batch_free = 0; | 641 | int batch_free = 0; |
642 | int to_free = count; | 642 | int to_free = count; |
643 | 643 | ||
644 | spin_lock(&zone->lock); | 644 | spin_lock(&zone->lock); |
645 | zone->all_unreclaimable = 0; | 645 | zone->all_unreclaimable = 0; |
646 | zone->pages_scanned = 0; | 646 | zone->pages_scanned = 0; |
647 | 647 | ||
648 | while (to_free) { | 648 | while (to_free) { |
649 | struct page *page; | 649 | struct page *page; |
650 | struct list_head *list; | 650 | struct list_head *list; |
651 | 651 | ||
652 | /* | 652 | /* |
653 | * Remove pages from lists in a round-robin fashion. A | 653 | * Remove pages from lists in a round-robin fashion. A |
654 | * batch_free count is maintained that is incremented when an | 654 | * batch_free count is maintained that is incremented when an |
655 | * empty list is encountered. This is so more pages are freed | 655 | * empty list is encountered. This is so more pages are freed |
656 | * off fuller lists instead of spinning excessively around empty | 656 | * off fuller lists instead of spinning excessively around empty |
657 | * lists | 657 | * lists |
658 | */ | 658 | */ |
659 | do { | 659 | do { |
660 | batch_free++; | 660 | batch_free++; |
661 | if (++migratetype == MIGRATE_PCPTYPES) | 661 | if (++migratetype == MIGRATE_PCPTYPES) |
662 | migratetype = 0; | 662 | migratetype = 0; |
663 | list = &pcp->lists[migratetype]; | 663 | list = &pcp->lists[migratetype]; |
664 | } while (list_empty(list)); | 664 | } while (list_empty(list)); |
665 | 665 | ||
666 | /* This is the only non-empty list. Free them all. */ | 666 | /* This is the only non-empty list. Free them all. */ |
667 | if (batch_free == MIGRATE_PCPTYPES) | 667 | if (batch_free == MIGRATE_PCPTYPES) |
668 | batch_free = to_free; | 668 | batch_free = to_free; |
669 | 669 | ||
670 | do { | 670 | do { |
671 | page = list_entry(list->prev, struct page, lru); | 671 | page = list_entry(list->prev, struct page, lru); |
672 | /* must delete as __free_one_page list manipulates */ | 672 | /* must delete as __free_one_page list manipulates */ |
673 | list_del(&page->lru); | 673 | list_del(&page->lru); |
674 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 674 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
675 | __free_one_page(page, zone, 0, page_private(page)); | 675 | __free_one_page(page, zone, 0, page_private(page)); |
676 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); | 676 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); |
677 | } while (--to_free && --batch_free && !list_empty(list)); | 677 | } while (--to_free && --batch_free && !list_empty(list)); |
678 | } | 678 | } |
679 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 679 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
680 | spin_unlock(&zone->lock); | 680 | spin_unlock(&zone->lock); |
681 | } | 681 | } |
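The round-robin draining described in the comment can be modelled with plain counters; the sketch below is not the kernel code (lists are reduced to lengths and the all-but-one-empty shortcut is omitted), but it shows how batch_free grows as empty lists are skipped:

#include <stdio.h>

/* Sketch of the round-robin draining in free_pcppages_bulk(): step
 * through the per-migratetype lists, and whenever empty lists are
 * skipped, free a correspondingly bigger batch from the next
 * non-empty one.  Lists are modelled as simple counters. */
#define NTYPES 3

int main(void)
{
        int list_len[NTYPES] = { 5, 0, 2 };   /* pages queued per type */
        int to_free = 6, mt = 0;

        while (to_free) {
                int batch_free = 0;

                /* find the next non-empty list, counting the skips */
                do {
                        batch_free++;
                        mt = (mt + 1) % NTYPES;
                } while (list_len[mt] == 0);

                while (to_free && batch_free && list_len[mt]) {
                        list_len[mt]--;
                        to_free--;
                        batch_free--;
                        printf("freed one page from type %d\n", mt);
                }
        }
        return 0;
}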
682 | 682 | ||
683 | static void free_one_page(struct zone *zone, struct page *page, int order, | 683 | static void free_one_page(struct zone *zone, struct page *page, int order, |
684 | int migratetype) | 684 | int migratetype) |
685 | { | 685 | { |
686 | spin_lock(&zone->lock); | 686 | spin_lock(&zone->lock); |
687 | zone->all_unreclaimable = 0; | 687 | zone->all_unreclaimable = 0; |
688 | zone->pages_scanned = 0; | 688 | zone->pages_scanned = 0; |
689 | 689 | ||
690 | __free_one_page(page, zone, order, migratetype); | 690 | __free_one_page(page, zone, order, migratetype); |
691 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 691 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); |
692 | spin_unlock(&zone->lock); | 692 | spin_unlock(&zone->lock); |
693 | } | 693 | } |
694 | 694 | ||
695 | static bool free_pages_prepare(struct page *page, unsigned int order) | 695 | static bool free_pages_prepare(struct page *page, unsigned int order) |
696 | { | 696 | { |
697 | int i; | 697 | int i; |
698 | int bad = 0; | 698 | int bad = 0; |
699 | 699 | ||
700 | trace_mm_page_free(page, order); | 700 | trace_mm_page_free(page, order); |
701 | kmemcheck_free_shadow(page, order); | 701 | kmemcheck_free_shadow(page, order); |
702 | 702 | ||
703 | if (PageAnon(page)) | 703 | if (PageAnon(page)) |
704 | page->mapping = NULL; | 704 | page->mapping = NULL; |
705 | for (i = 0; i < (1 << order); i++) | 705 | for (i = 0; i < (1 << order); i++) |
706 | bad += free_pages_check(page + i); | 706 | bad += free_pages_check(page + i); |
707 | if (bad) | 707 | if (bad) |
708 | return false; | 708 | return false; |
709 | 709 | ||
710 | if (!PageHighMem(page)) { | 710 | if (!PageHighMem(page)) { |
711 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); | 711 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); |
712 | debug_check_no_obj_freed(page_address(page), | 712 | debug_check_no_obj_freed(page_address(page), |
713 | PAGE_SIZE << order); | 713 | PAGE_SIZE << order); |
714 | } | 714 | } |
715 | arch_free_page(page, order); | 715 | arch_free_page(page, order); |
716 | kernel_map_pages(page, 1 << order, 0); | 716 | kernel_map_pages(page, 1 << order, 0); |
717 | 717 | ||
718 | return true; | 718 | return true; |
719 | } | 719 | } |
720 | 720 | ||
721 | static void __free_pages_ok(struct page *page, unsigned int order) | 721 | static void __free_pages_ok(struct page *page, unsigned int order) |
722 | { | 722 | { |
723 | unsigned long flags; | 723 | unsigned long flags; |
724 | int wasMlocked = __TestClearPageMlocked(page); | 724 | int wasMlocked = __TestClearPageMlocked(page); |
725 | 725 | ||
726 | if (!free_pages_prepare(page, order)) | 726 | if (!free_pages_prepare(page, order)) |
727 | return; | 727 | return; |
728 | 728 | ||
729 | local_irq_save(flags); | 729 | local_irq_save(flags); |
730 | if (unlikely(wasMlocked)) | 730 | if (unlikely(wasMlocked)) |
731 | free_page_mlock(page); | 731 | free_page_mlock(page); |
732 | __count_vm_events(PGFREE, 1 << order); | 732 | __count_vm_events(PGFREE, 1 << order); |
733 | free_one_page(page_zone(page), page, order, | 733 | free_one_page(page_zone(page), page, order, |
734 | get_pageblock_migratetype(page)); | 734 | get_pageblock_migratetype(page)); |
735 | local_irq_restore(flags); | 735 | local_irq_restore(flags); |
736 | } | 736 | } |
737 | 737 | ||
738 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | 738 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
739 | { | 739 | { |
740 | unsigned int nr_pages = 1 << order; | 740 | unsigned int nr_pages = 1 << order; |
741 | unsigned int loop; | 741 | unsigned int loop; |
742 | 742 | ||
743 | prefetchw(page); | 743 | prefetchw(page); |
744 | for (loop = 0; loop < nr_pages; loop++) { | 744 | for (loop = 0; loop < nr_pages; loop++) { |
745 | struct page *p = &page[loop]; | 745 | struct page *p = &page[loop]; |
746 | 746 | ||
747 | if (loop + 1 < nr_pages) | 747 | if (loop + 1 < nr_pages) |
748 | prefetchw(p + 1); | 748 | prefetchw(p + 1); |
749 | __ClearPageReserved(p); | 749 | __ClearPageReserved(p); |
750 | set_page_count(p, 0); | 750 | set_page_count(p, 0); |
751 | } | 751 | } |
752 | 752 | ||
753 | set_page_refcounted(page); | 753 | set_page_refcounted(page); |
754 | __free_pages(page, order); | 754 | __free_pages(page, order); |
755 | } | 755 | } |
756 | 756 | ||
757 | #ifdef CONFIG_CMA | 757 | #ifdef CONFIG_CMA |
758 | /* Free the whole pageblock and set its migration type to MIGRATE_CMA. */ | 758 | /* Free the whole pageblock and set its migration type to MIGRATE_CMA. */ |
759 | void __init init_cma_reserved_pageblock(struct page *page) | 759 | void __init init_cma_reserved_pageblock(struct page *page) |
760 | { | 760 | { |
761 | unsigned i = pageblock_nr_pages; | 761 | unsigned i = pageblock_nr_pages; |
762 | struct page *p = page; | 762 | struct page *p = page; |
763 | 763 | ||
764 | do { | 764 | do { |
765 | __ClearPageReserved(p); | 765 | __ClearPageReserved(p); |
766 | set_page_count(p, 0); | 766 | set_page_count(p, 0); |
767 | } while (++p, --i); | 767 | } while (++p, --i); |
768 | 768 | ||
769 | set_page_refcounted(page); | 769 | set_page_refcounted(page); |
770 | set_pageblock_migratetype(page, MIGRATE_CMA); | 770 | set_pageblock_migratetype(page, MIGRATE_CMA); |
771 | __free_pages(page, pageblock_order); | 771 | __free_pages(page, pageblock_order); |
772 | totalram_pages += pageblock_nr_pages; | 772 | totalram_pages += pageblock_nr_pages; |
773 | } | 773 | } |
774 | #endif | 774 | #endif |
775 | 775 | ||
776 | /* | 776 | /* |
777 | * The order of subdivision here is critical for the IO subsystem. | 777 | * The order of subdivision here is critical for the IO subsystem. |
778 | * Please do not alter this order without good reasons and regression | 778 | * Please do not alter this order without good reasons and regression |
779 | * testing. Specifically, as large blocks of memory are subdivided, | 779 | * testing. Specifically, as large blocks of memory are subdivided, |
780 | * the order in which smaller blocks are delivered depends on the order | 780 | * the order in which smaller blocks are delivered depends on the order |
781 | * they're subdivided in this function. This is the primary factor | 781 | * they're subdivided in this function. This is the primary factor |
782 | * influencing the order in which pages are delivered to the IO | 782 | * influencing the order in which pages are delivered to the IO |
783 | * subsystem according to empirical testing, and this is also justified | 783 | * subsystem according to empirical testing, and this is also justified |
784 | * by considering the behavior of a buddy system containing a single | 784 | * by considering the behavior of a buddy system containing a single |
785 | * large block of memory acted on by a series of small allocations. | 785 | * large block of memory acted on by a series of small allocations. |
786 | * This behavior is a critical factor in sglist merging's success. | 786 | * This behavior is a critical factor in sglist merging's success. |
787 | * | 787 | * |
788 | * -- wli | 788 | * -- wli |
789 | */ | 789 | */ |
790 | static inline void expand(struct zone *zone, struct page *page, | 790 | static inline void expand(struct zone *zone, struct page *page, |
791 | int low, int high, struct free_area *area, | 791 | int low, int high, struct free_area *area, |
792 | int migratetype) | 792 | int migratetype) |
793 | { | 793 | { |
794 | unsigned long size = 1 << high; | 794 | unsigned long size = 1 << high; |
795 | 795 | ||
796 | while (high > low) { | 796 | while (high > low) { |
797 | area--; | 797 | area--; |
798 | high--; | 798 | high--; |
799 | size >>= 1; | 799 | size >>= 1; |
800 | VM_BUG_ON(bad_range(zone, &page[size])); | 800 | VM_BUG_ON(bad_range(zone, &page[size])); |
801 | 801 | ||
802 | #ifdef CONFIG_DEBUG_PAGEALLOC | 802 | #ifdef CONFIG_DEBUG_PAGEALLOC |
803 | if (high < debug_guardpage_minorder()) { | 803 | if (high < debug_guardpage_minorder()) { |
804 | /* | 804 | /* |
805 | * Mark as guard pages (or page), which allows them to be | 805 | * Mark as guard pages (or page), which allows them to be |
806 | * merged back into the allocator when the buddy is freed. | 806 | * merged back into the allocator when the buddy is freed. |
807 | * Corresponding page table entries will not be touched; | 807 | * Corresponding page table entries will not be touched; |
808 | * the pages stay not present in the virtual address space. | 808 | * the pages stay not present in the virtual address space. |
809 | */ | 809 | */ |
810 | INIT_LIST_HEAD(&page[size].lru); | 810 | INIT_LIST_HEAD(&page[size].lru); |
811 | set_page_guard_flag(&page[size]); | 811 | set_page_guard_flag(&page[size]); |
812 | set_page_private(&page[size], high); | 812 | set_page_private(&page[size], high); |
813 | /* Guard pages are not available for any usage */ | 813 | /* Guard pages are not available for any usage */ |
814 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); | 814 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); |
815 | continue; | 815 | continue; |
816 | } | 816 | } |
817 | #endif | 817 | #endif |
818 | list_add(&page[size].lru, &area->free_list[migratetype]); | 818 | list_add(&page[size].lru, &area->free_list[migratetype]); |
819 | area->nr_free++; | 819 | area->nr_free++; |
820 | set_page_order(&page[size], high); | 820 | set_page_order(&page[size], high); |
821 | } | 821 | } |
822 | } | 822 | } |
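A worked example of the splitting done by expand(): satisfying an order-0 request from an order-3 block peels off the upper half at each step and returns it to the next lower order's free list. The guard-page branch is ignored here; this only traces the offsets:

#include <stdio.h>

/* Worked example of expand(): an order-3 block (8 pages) satisfies an
 * order-0 request by peeling off the upper half at each step and
 * putting it back on the free list of the next lower order. */
int main(void)
{
        unsigned int low = 0, high = 3;
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                printf("give back %lu page(s) at offset %lu as an order-%u block\n",
                       size, size, high);
        }
        printf("order-%u page at offset 0 is returned to the caller\n", low);
        return 0;
}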
823 | 823 | ||
824 | /* | 824 | /* |
825 | * This page is about to be returned from the page allocator | 825 | * This page is about to be returned from the page allocator |
826 | */ | 826 | */ |
827 | static inline int check_new_page(struct page *page) | 827 | static inline int check_new_page(struct page *page) |
828 | { | 828 | { |
829 | if (unlikely(page_mapcount(page) | | 829 | if (unlikely(page_mapcount(page) | |
830 | (page->mapping != NULL) | | 830 | (page->mapping != NULL) | |
831 | (atomic_read(&page->_count) != 0) | | 831 | (atomic_read(&page->_count) != 0) | |
832 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | | 832 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | |
833 | (mem_cgroup_bad_page_check(page)))) { | 833 | (mem_cgroup_bad_page_check(page)))) { |
834 | bad_page(page); | 834 | bad_page(page); |
835 | return 1; | 835 | return 1; |
836 | } | 836 | } |
837 | return 0; | 837 | return 0; |
838 | } | 838 | } |
839 | 839 | ||
840 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | 840 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) |
841 | { | 841 | { |
842 | int i; | 842 | int i; |
843 | 843 | ||
844 | for (i = 0; i < (1 << order); i++) { | 844 | for (i = 0; i < (1 << order); i++) { |
845 | struct page *p = page + i; | 845 | struct page *p = page + i; |
846 | if (unlikely(check_new_page(p))) | 846 | if (unlikely(check_new_page(p))) |
847 | return 1; | 847 | return 1; |
848 | } | 848 | } |
849 | 849 | ||
850 | set_page_private(page, 0); | 850 | set_page_private(page, 0); |
851 | set_page_refcounted(page); | 851 | set_page_refcounted(page); |
852 | 852 | ||
853 | arch_alloc_page(page, order); | 853 | arch_alloc_page(page, order); |
854 | kernel_map_pages(page, 1 << order, 1); | 854 | kernel_map_pages(page, 1 << order, 1); |
855 | 855 | ||
856 | if (gfp_flags & __GFP_ZERO) | 856 | if (gfp_flags & __GFP_ZERO) |
857 | prep_zero_page(page, order, gfp_flags); | 857 | prep_zero_page(page, order, gfp_flags); |
858 | 858 | ||
859 | if (order && (gfp_flags & __GFP_COMP)) | 859 | if (order && (gfp_flags & __GFP_COMP)) |
860 | prep_compound_page(page, order); | 860 | prep_compound_page(page, order); |
861 | 861 | ||
862 | return 0; | 862 | return 0; |
863 | } | 863 | } |
864 | 864 | ||
865 | /* | 865 | /* |
866 | * Go through the free lists for the given migratetype and remove | 866 | * Go through the free lists for the given migratetype and remove |
867 | * the smallest available page from the freelists | 867 | * the smallest available page from the freelists |
868 | */ | 868 | */ |
869 | static inline | 869 | static inline |
870 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | 870 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
871 | int migratetype) | 871 | int migratetype) |
872 | { | 872 | { |
873 | unsigned int current_order; | 873 | unsigned int current_order; |
874 | struct free_area * area; | 874 | struct free_area * area; |
875 | struct page *page; | 875 | struct page *page; |
876 | 876 | ||
877 | /* Find a page of the appropriate size in the preferred list */ | 877 | /* Find a page of the appropriate size in the preferred list */ |
878 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | 878 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
879 | area = &(zone->free_area[current_order]); | 879 | area = &(zone->free_area[current_order]); |
880 | if (list_empty(&area->free_list[migratetype])) | 880 | if (list_empty(&area->free_list[migratetype])) |
881 | continue; | 881 | continue; |
882 | 882 | ||
883 | page = list_entry(area->free_list[migratetype].next, | 883 | page = list_entry(area->free_list[migratetype].next, |
884 | struct page, lru); | 884 | struct page, lru); |
885 | list_del(&page->lru); | 885 | list_del(&page->lru); |
886 | rmv_page_order(page); | 886 | rmv_page_order(page); |
887 | area->nr_free--; | 887 | area->nr_free--; |
888 | expand(zone, page, order, current_order, area, migratetype); | 888 | expand(zone, page, order, current_order, area, migratetype); |
889 | return page; | 889 | return page; |
890 | } | 890 | } |
891 | 891 | ||
892 | return NULL; | 892 | return NULL; |
893 | } | 893 | } |
894 | 894 | ||
895 | 895 | ||
896 | /* | 896 | /* |
897 | * This array describes the order in which lists are fallen back to when | 897 | * This array describes the order in which lists are fallen back to when |
898 | * the free lists for the desired migrate type are depleted | 898 | * the free lists for the desired migrate type are depleted |
899 | */ | 899 | */ |
900 | static int fallbacks[MIGRATE_TYPES][4] = { | 900 | static int fallbacks[MIGRATE_TYPES][4] = { |
901 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 901 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
902 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 902 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
903 | #ifdef CONFIG_CMA | 903 | #ifdef CONFIG_CMA |
904 | [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 904 | [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
905 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ | 905 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ |
906 | #else | 906 | #else |
907 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 907 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
908 | #endif | 908 | #endif |
909 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | 909 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ |
910 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ | 910 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ |
911 | }; | 911 | }; |
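A simplified walk of the non-CMA fallback table, using local stand-in constants rather than the kernel's enum migratetype values:

#include <stdio.h>

/* Simplified walk of the fallback table above (non-CMA layout),
 * using local stand-in constants rather than the kernel's enum. */
enum { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, NTYPES };

static const int fb[NTYPES][3] = {
        [UNMOVABLE]   = { RECLAIMABLE, MOVABLE,   RESERVE },
        [RECLAIMABLE] = { UNMOVABLE,   MOVABLE,   RESERVE },
        [MOVABLE]     = { RECLAIMABLE, UNMOVABLE, RESERVE },
        [RESERVE]     = { RESERVE },
};

int main(void)
{
        const char *name[] = { "UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE" };
        int start = MOVABLE;

        /* Try each fallback type in order until MIGRATE_RESERVE is hit. */
        for (int i = 0; fb[start][i] != RESERVE; i++)
                printf("fall back from %s to %s\n", name[start], name[fb[start][i]]);
        return 0;
}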
912 | 912 | ||
913 | /* | 913 | /* |
914 | * Move the free pages in a range to the free lists of the requested type. | 914 | * Move the free pages in a range to the free lists of the requested type. |
915 | * Note that start_page and end_page are not aligned on a pageblock | 915 | * Note that start_page and end_page are not aligned on a pageblock |
916 | * boundary. If alignment is required, use move_freepages_block() | 916 | * boundary. If alignment is required, use move_freepages_block() |
917 | */ | 917 | */ |
918 | static int move_freepages(struct zone *zone, | 918 | static int move_freepages(struct zone *zone, |
919 | struct page *start_page, struct page *end_page, | 919 | struct page *start_page, struct page *end_page, |
920 | int migratetype) | 920 | int migratetype) |
921 | { | 921 | { |
922 | struct page *page; | 922 | struct page *page; |
923 | unsigned long order; | 923 | unsigned long order; |
924 | int pages_moved = 0; | 924 | int pages_moved = 0; |
925 | 925 | ||
926 | #ifndef CONFIG_HOLES_IN_ZONE | 926 | #ifndef CONFIG_HOLES_IN_ZONE |
927 | /* | 927 | /* |
928 | * page_zone is not safe to call in this context when | 928 | * page_zone is not safe to call in this context when |
929 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant | 929 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant |
930 | * anyway as we check zone boundaries in move_freepages_block(). | 930 | * anyway as we check zone boundaries in move_freepages_block(). |
931 | * Remove at a later date when no bug reports exist related to | 931 | * Remove at a later date when no bug reports exist related to |
932 | * grouping pages by mobility | 932 | * grouping pages by mobility |
933 | */ | 933 | */ |
934 | BUG_ON(page_zone(start_page) != page_zone(end_page)); | 934 | BUG_ON(page_zone(start_page) != page_zone(end_page)); |
935 | #endif | 935 | #endif |
936 | 936 | ||
937 | for (page = start_page; page <= end_page;) { | 937 | for (page = start_page; page <= end_page;) { |
938 | /* Make sure we are not inadvertently changing nodes */ | 938 | /* Make sure we are not inadvertently changing nodes */ |
939 | VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); | 939 | VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); |
940 | 940 | ||
941 | if (!pfn_valid_within(page_to_pfn(page))) { | 941 | if (!pfn_valid_within(page_to_pfn(page))) { |
942 | page++; | 942 | page++; |
943 | continue; | 943 | continue; |
944 | } | 944 | } |
945 | 945 | ||
946 | if (!PageBuddy(page)) { | 946 | if (!PageBuddy(page)) { |
947 | page++; | 947 | page++; |
948 | continue; | 948 | continue; |
949 | } | 949 | } |
950 | 950 | ||
951 | order = page_order(page); | 951 | order = page_order(page); |
952 | list_move(&page->lru, | 952 | list_move(&page->lru, |
953 | &zone->free_area[order].free_list[migratetype]); | 953 | &zone->free_area[order].free_list[migratetype]); |
954 | page += 1 << order; | 954 | page += 1 << order; |
955 | pages_moved += 1 << order; | 955 | pages_moved += 1 << order; |
956 | } | 956 | } |
957 | 957 | ||
958 | return pages_moved; | 958 | return pages_moved; |
959 | } | 959 | } |
960 | 960 | ||
961 | int move_freepages_block(struct zone *zone, struct page *page, | 961 | int move_freepages_block(struct zone *zone, struct page *page, |
962 | int migratetype) | 962 | int migratetype) |
963 | { | 963 | { |
964 | unsigned long start_pfn, end_pfn; | 964 | unsigned long start_pfn, end_pfn; |
965 | struct page *start_page, *end_page; | 965 | struct page *start_page, *end_page; |
966 | 966 | ||
967 | start_pfn = page_to_pfn(page); | 967 | start_pfn = page_to_pfn(page); |
968 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); | 968 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); |
969 | start_page = pfn_to_page(start_pfn); | 969 | start_page = pfn_to_page(start_pfn); |
970 | end_page = start_page + pageblock_nr_pages - 1; | 970 | end_page = start_page + pageblock_nr_pages - 1; |
971 | end_pfn = start_pfn + pageblock_nr_pages - 1; | 971 | end_pfn = start_pfn + pageblock_nr_pages - 1; |
972 | 972 | ||
973 | /* Do not cross zone boundaries */ | 973 | /* Do not cross zone boundaries */ |
974 | if (start_pfn < zone->zone_start_pfn) | 974 | if (start_pfn < zone->zone_start_pfn) |
975 | start_page = page; | 975 | start_page = page; |
976 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | 976 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) |
977 | return 0; | 977 | return 0; |
978 | 978 | ||
979 | return move_freepages(zone, start_page, end_page, migratetype); | 979 | return move_freepages(zone, start_page, end_page, migratetype); |
980 | } | 980 | } |
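The pageblock rounding used above is a standard mask trick; a tiny example with an assumed pageblock size of 4096 pages:

#include <stdio.h>

/* The alignment trick from move_freepages_block(): round a pfn down to
 * the start of its pageblock by masking off the low bits.  4096 pages
 * per pageblock is just an example value (it must be a power of two). */
int main(void)
{
        unsigned long pageblock_nr_pages = 4096;
        unsigned long pfn = 1000000;

        unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
        unsigned long end_pfn   = start_pfn + pageblock_nr_pages - 1;

        printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
        return 0;
}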
981 | 981 | ||
982 | static void change_pageblock_range(struct page *pageblock_page, | 982 | static void change_pageblock_range(struct page *pageblock_page, |
983 | int start_order, int migratetype) | 983 | int start_order, int migratetype) |
984 | { | 984 | { |
985 | int nr_pageblocks = 1 << (start_order - pageblock_order); | 985 | int nr_pageblocks = 1 << (start_order - pageblock_order); |
986 | 986 | ||
987 | while (nr_pageblocks--) { | 987 | while (nr_pageblocks--) { |
988 | set_pageblock_migratetype(pageblock_page, migratetype); | 988 | set_pageblock_migratetype(pageblock_page, migratetype); |
989 | pageblock_page += pageblock_nr_pages; | 989 | pageblock_page += pageblock_nr_pages; |
990 | } | 990 | } |
991 | } | 991 | } |
992 | 992 | ||
993 | /* Remove an element from the buddy allocator from the fallback list */ | 993 | /* Remove an element from the buddy allocator from the fallback list */ |
994 | static inline struct page * | 994 | static inline struct page * |
995 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | 995 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
996 | { | 996 | { |
997 | struct free_area * area; | 997 | struct free_area * area; |
998 | int current_order; | 998 | int current_order; |
999 | struct page *page; | 999 | struct page *page; |
1000 | int migratetype, i; | 1000 | int migratetype, i; |
1001 | 1001 | ||
1002 | /* Find the largest possible block of pages in the other list */ | 1002 | /* Find the largest possible block of pages in the other list */ |
1003 | for (current_order = MAX_ORDER-1; current_order >= order; | 1003 | for (current_order = MAX_ORDER-1; current_order >= order; |
1004 | --current_order) { | 1004 | --current_order) { |
1005 | for (i = 0;; i++) { | 1005 | for (i = 0;; i++) { |
1006 | migratetype = fallbacks[start_migratetype][i]; | 1006 | migratetype = fallbacks[start_migratetype][i]; |
1007 | 1007 | ||
1008 | /* MIGRATE_RESERVE handled later if necessary */ | 1008 | /* MIGRATE_RESERVE handled later if necessary */ |
1009 | if (migratetype == MIGRATE_RESERVE) | 1009 | if (migratetype == MIGRATE_RESERVE) |
1010 | break; | 1010 | break; |
1011 | 1011 | ||
1012 | area = &(zone->free_area[current_order]); | 1012 | area = &(zone->free_area[current_order]); |
1013 | if (list_empty(&area->free_list[migratetype])) | 1013 | if (list_empty(&area->free_list[migratetype])) |
1014 | continue; | 1014 | continue; |
1015 | 1015 | ||
1016 | page = list_entry(area->free_list[migratetype].next, | 1016 | page = list_entry(area->free_list[migratetype].next, |
1017 | struct page, lru); | 1017 | struct page, lru); |
1018 | area->nr_free--; | 1018 | area->nr_free--; |
1019 | 1019 | ||
1020 | /* | 1020 | /* |
1021 | * If breaking a large block of pages, move all free | 1021 | * If breaking a large block of pages, move all free |
1022 | * pages to the preferred allocation list. If falling | 1022 | * pages to the preferred allocation list. If falling |
1023 | * back for a reclaimable kernel allocation, be more | 1023 | * back for a reclaimable kernel allocation, be more |
1024 | * aggressive about taking ownership of free pages | 1024 | * aggressive about taking ownership of free pages |
1025 | * | 1025 | * |
1026 | * On the other hand, never change migration | 1026 | * On the other hand, never change migration |
1027 | * type of MIGRATE_CMA pageblocks nor move CMA | 1027 | * type of MIGRATE_CMA pageblocks nor move CMA |
1028 | * pages on different free lists. We don't | 1028 | * pages on different free lists. We don't |
1029 | * want unmovable pages to be allocated from | 1029 | * want unmovable pages to be allocated from |
1030 | * MIGRATE_CMA areas. | 1030 | * MIGRATE_CMA areas. |
1031 | */ | 1031 | */ |
1032 | if (!is_migrate_cma(migratetype) && | 1032 | if (!is_migrate_cma(migratetype) && |
1033 | (unlikely(current_order >= pageblock_order / 2) || | 1033 | (unlikely(current_order >= pageblock_order / 2) || |
1034 | start_migratetype == MIGRATE_RECLAIMABLE || | 1034 | start_migratetype == MIGRATE_RECLAIMABLE || |
1035 | page_group_by_mobility_disabled)) { | 1035 | page_group_by_mobility_disabled)) { |
1036 | int pages; | 1036 | int pages; |
1037 | pages = move_freepages_block(zone, page, | 1037 | pages = move_freepages_block(zone, page, |
1038 | start_migratetype); | 1038 | start_migratetype); |
1039 | 1039 | ||
1040 | /* Claim the whole block if over half of it is free */ | 1040 | /* Claim the whole block if over half of it is free */ |
1041 | if (pages >= (1 << (pageblock_order-1)) || | 1041 | if (pages >= (1 << (pageblock_order-1)) || |
1042 | page_group_by_mobility_disabled) | 1042 | page_group_by_mobility_disabled) |
1043 | set_pageblock_migratetype(page, | 1043 | set_pageblock_migratetype(page, |
1044 | start_migratetype); | 1044 | start_migratetype); |
1045 | 1045 | ||
1046 | migratetype = start_migratetype; | 1046 | migratetype = start_migratetype; |
1047 | } | 1047 | } |
1048 | 1048 | ||
1049 | /* Remove the page from the freelists */ | 1049 | /* Remove the page from the freelists */ |
1050 | list_del(&page->lru); | 1050 | list_del(&page->lru); |
1051 | rmv_page_order(page); | 1051 | rmv_page_order(page); |
1052 | 1052 | ||
1053 | /* Take ownership for orders >= pageblock_order */ | 1053 | /* Take ownership for orders >= pageblock_order */ |
1054 | if (current_order >= pageblock_order && | 1054 | if (current_order >= pageblock_order && |
1055 | !is_migrate_cma(migratetype)) | 1055 | !is_migrate_cma(migratetype)) |
1056 | change_pageblock_range(page, current_order, | 1056 | change_pageblock_range(page, current_order, |
1057 | start_migratetype); | 1057 | start_migratetype); |
1058 | 1058 | ||
1059 | expand(zone, page, order, current_order, area, | 1059 | expand(zone, page, order, current_order, area, |
1060 | is_migrate_cma(migratetype) | 1060 | is_migrate_cma(migratetype) |
1061 | ? migratetype : start_migratetype); | 1061 | ? migratetype : start_migratetype); |
1062 | 1062 | ||
1063 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1063 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1064 | start_migratetype, migratetype); | 1064 | start_migratetype, migratetype); |
1065 | 1065 | ||
1066 | return page; | 1066 | return page; |
1067 | } | 1067 | } |
1068 | } | 1068 | } |
1069 | 1069 | ||
1070 | return NULL; | 1070 | return NULL; |
1071 | } | 1071 | } |
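The "claim the whole block if over half of it is free" step above compares the number of free pages actually moved against half a pageblock, 1 << (pageblock_order - 1). A stand-alone sketch of that threshold, assuming pageblock_order is 9 (512-page pageblocks), which is common but configuration-dependent:

#include <stdio.h>

#define PAGEBLOCK_ORDER 9       /* assumption: 512-page (2MB) pageblocks */

int main(void)
{
        /* Pretend move_freepages_block() just moved this many free pages. */
        int pages_moved = 300;
        /* Half a pageblock: the claim threshold used in the fallback path. */
        int threshold = 1 << (PAGEBLOCK_ORDER - 1);

        printf("moved %d pages, threshold %d: %s the whole pageblock\n",
               pages_moved, threshold,
               pages_moved >= threshold ? "claim" : "do not claim");
        return 0;
}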
1072 | 1072 | ||
1073 | /* | 1073 | /* |
1074 | * Do the hard work of removing an element from the buddy allocator. | 1074 | * Do the hard work of removing an element from the buddy allocator. |
1075 | * Call me with the zone->lock already held. | 1075 | * Call me with the zone->lock already held. |
1076 | */ | 1076 | */ |
1077 | static struct page *__rmqueue(struct zone *zone, unsigned int order, | 1077 | static struct page *__rmqueue(struct zone *zone, unsigned int order, |
1078 | int migratetype) | 1078 | int migratetype) |
1079 | { | 1079 | { |
1080 | struct page *page; | 1080 | struct page *page; |
1081 | 1081 | ||
1082 | retry_reserve: | 1082 | retry_reserve: |
1083 | page = __rmqueue_smallest(zone, order, migratetype); | 1083 | page = __rmqueue_smallest(zone, order, migratetype); |
1084 | 1084 | ||
1085 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { | 1085 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { |
1086 | page = __rmqueue_fallback(zone, order, migratetype); | 1086 | page = __rmqueue_fallback(zone, order, migratetype); |
1087 | 1087 | ||
1088 | /* | 1088 | /* |
1089 | * Use MIGRATE_RESERVE rather than fail an allocation. goto | 1089 | * Use MIGRATE_RESERVE rather than fail an allocation. goto |
1090 | * is used because __rmqueue_smallest is an inline function | 1090 | * is used because __rmqueue_smallest is an inline function |
1091 | * and we want just one call site | 1091 | * and we want just one call site |
1092 | */ | 1092 | */ |
1093 | if (!page) { | 1093 | if (!page) { |
1094 | migratetype = MIGRATE_RESERVE; | 1094 | migratetype = MIGRATE_RESERVE; |
1095 | goto retry_reserve; | 1095 | goto retry_reserve; |
1096 | } | 1096 | } |
1097 | } | 1097 | } |
1098 | 1098 | ||
1099 | trace_mm_page_alloc_zone_locked(page, order, migratetype); | 1099 | trace_mm_page_alloc_zone_locked(page, order, migratetype); |
1100 | return page; | 1100 | return page; |
1101 | } | 1101 | } |
1102 | 1102 | ||
1103 | /* | 1103 | /* |
1104 | * Obtain a specified number of elements from the buddy allocator, all under | 1104 | * Obtain a specified number of elements from the buddy allocator, all under |
1105 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 1105 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
1106 | * Returns the number of new pages which were placed at *list. | 1106 | * Returns the number of new pages which were placed at *list. |
1107 | */ | 1107 | */ |
1108 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 1108 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
1109 | unsigned long count, struct list_head *list, | 1109 | unsigned long count, struct list_head *list, |
1110 | int migratetype, int cold) | 1110 | int migratetype, int cold) |
1111 | { | 1111 | { |
1112 | int mt = migratetype, i; | 1112 | int mt = migratetype, i; |
1113 | 1113 | ||
1114 | spin_lock(&zone->lock); | 1114 | spin_lock(&zone->lock); |
1115 | for (i = 0; i < count; ++i) { | 1115 | for (i = 0; i < count; ++i) { |
1116 | struct page *page = __rmqueue(zone, order, migratetype); | 1116 | struct page *page = __rmqueue(zone, order, migratetype); |
1117 | if (unlikely(page == NULL)) | 1117 | if (unlikely(page == NULL)) |
1118 | break; | 1118 | break; |
1119 | 1119 | ||
1120 | /* | 1120 | /* |
1121 | * Split buddy pages returned by expand() are received here | 1121 | * Split buddy pages returned by expand() are received here |
1122 | * in physical page order. The page is added to the caller's list | 1122 | * in physical page order. The page is added to the caller's list |
1123 | * and the list head then moves forward. From the caller's | 1123 | * and the list head then moves forward. From the caller's |
1124 | * perspective, the linked list is ordered by page number in | 1124 | * perspective, the linked list is ordered by page number in |
1125 | * some conditions. This is useful for IO devices that can | 1125 | * some conditions. This is useful for IO devices that can |
1126 | * merge IO requests if the physical pages are ordered | 1126 | * merge IO requests if the physical pages are ordered |
1127 | * properly. | 1127 | * properly. |
1128 | */ | 1128 | */ |
1129 | if (likely(cold == 0)) | 1129 | if (likely(cold == 0)) |
1130 | list_add(&page->lru, list); | 1130 | list_add(&page->lru, list); |
1131 | else | 1131 | else |
1132 | list_add_tail(&page->lru, list); | 1132 | list_add_tail(&page->lru, list); |
1133 | if (IS_ENABLED(CONFIG_CMA)) { | 1133 | if (IS_ENABLED(CONFIG_CMA)) { |
1134 | mt = get_pageblock_migratetype(page); | 1134 | mt = get_pageblock_migratetype(page); |
1135 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | 1135 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) |
1136 | mt = migratetype; | 1136 | mt = migratetype; |
1137 | } | 1137 | } |
1138 | set_page_private(page, mt); | 1138 | set_page_private(page, mt); |
1139 | list = &page->lru; | 1139 | list = &page->lru; |
1140 | } | 1140 | } |
1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | 1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
1142 | spin_unlock(&zone->lock); | 1142 | spin_unlock(&zone->lock); |
1143 | return i; | 1143 | return i; |
1144 | } | 1144 | } |
1145 | 1145 | ||
1146 | #ifdef CONFIG_NUMA | 1146 | #ifdef CONFIG_NUMA |
1147 | /* | 1147 | /* |
1148 | * Called from the vmstat counter updater to drain pagesets of this | 1148 | * Called from the vmstat counter updater to drain pagesets of this |
1149 | * currently executing processor on remote nodes after they have | 1149 | * currently executing processor on remote nodes after they have |
1150 | * expired. | 1150 | * expired. |
1151 | * | 1151 | * |
1152 | * Note that this function must be called with the thread pinned to | 1152 | * Note that this function must be called with the thread pinned to |
1153 | * a single processor. | 1153 | * a single processor. |
1154 | */ | 1154 | */ |
1155 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | 1155 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
1156 | { | 1156 | { |
1157 | unsigned long flags; | 1157 | unsigned long flags; |
1158 | int to_drain; | 1158 | int to_drain; |
1159 | 1159 | ||
1160 | local_irq_save(flags); | 1160 | local_irq_save(flags); |
1161 | if (pcp->count >= pcp->batch) | 1161 | if (pcp->count >= pcp->batch) |
1162 | to_drain = pcp->batch; | 1162 | to_drain = pcp->batch; |
1163 | else | 1163 | else |
1164 | to_drain = pcp->count; | 1164 | to_drain = pcp->count; |
1165 | if (to_drain > 0) { | 1165 | if (to_drain > 0) { |
1166 | free_pcppages_bulk(zone, to_drain, pcp); | 1166 | free_pcppages_bulk(zone, to_drain, pcp); |
1167 | pcp->count -= to_drain; | 1167 | pcp->count -= to_drain; |
1168 | } | 1168 | } |
1169 | local_irq_restore(flags); | 1169 | local_irq_restore(flags); |
1170 | } | 1170 | } |
1171 | #endif | 1171 | #endif |
1172 | 1172 | ||
1173 | /* | 1173 | /* |
1174 | * Drain pages of the indicated processor. | 1174 | * Drain pages of the indicated processor. |
1175 | * | 1175 | * |
1176 | * The processor must either be the current processor and the | 1176 | * The processor must either be the current processor and the |
1177 | * thread pinned to the current processor or a processor that | 1177 | * thread pinned to the current processor or a processor that |
1178 | * is not online. | 1178 | * is not online. |
1179 | */ | 1179 | */ |
1180 | static void drain_pages(unsigned int cpu) | 1180 | static void drain_pages(unsigned int cpu) |
1181 | { | 1181 | { |
1182 | unsigned long flags; | 1182 | unsigned long flags; |
1183 | struct zone *zone; | 1183 | struct zone *zone; |
1184 | 1184 | ||
1185 | for_each_populated_zone(zone) { | 1185 | for_each_populated_zone(zone) { |
1186 | struct per_cpu_pageset *pset; | 1186 | struct per_cpu_pageset *pset; |
1187 | struct per_cpu_pages *pcp; | 1187 | struct per_cpu_pages *pcp; |
1188 | 1188 | ||
1189 | local_irq_save(flags); | 1189 | local_irq_save(flags); |
1190 | pset = per_cpu_ptr(zone->pageset, cpu); | 1190 | pset = per_cpu_ptr(zone->pageset, cpu); |
1191 | 1191 | ||
1192 | pcp = &pset->pcp; | 1192 | pcp = &pset->pcp; |
1193 | if (pcp->count) { | 1193 | if (pcp->count) { |
1194 | free_pcppages_bulk(zone, pcp->count, pcp); | 1194 | free_pcppages_bulk(zone, pcp->count, pcp); |
1195 | pcp->count = 0; | 1195 | pcp->count = 0; |
1196 | } | 1196 | } |
1197 | local_irq_restore(flags); | 1197 | local_irq_restore(flags); |
1198 | } | 1198 | } |
1199 | } | 1199 | } |
1200 | 1200 | ||
1201 | /* | 1201 | /* |
1202 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 1202 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
1203 | */ | 1203 | */ |
1204 | void drain_local_pages(void *arg) | 1204 | void drain_local_pages(void *arg) |
1205 | { | 1205 | { |
1206 | drain_pages(smp_processor_id()); | 1206 | drain_pages(smp_processor_id()); |
1207 | } | 1207 | } |
1208 | 1208 | ||
1209 | /* | 1209 | /* |
1210 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. | 1210 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. |
1211 | * | 1211 | * |
1212 | * Note that this code is protected against sending an IPI to an offline | 1212 | * Note that this code is protected against sending an IPI to an offline |
1213 | * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: | 1213 | * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: |
1214 | * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but | 1214 | * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but |
1215 | * nothing keeps CPUs from showing up after we populated the cpumask and | 1215 | * nothing keeps CPUs from showing up after we populated the cpumask and |
1216 | * before the call to on_each_cpu_mask(). | 1216 | * before the call to on_each_cpu_mask(). |
1217 | */ | 1217 | */ |
1218 | void drain_all_pages(void) | 1218 | void drain_all_pages(void) |
1219 | { | 1219 | { |
1220 | int cpu; | 1220 | int cpu; |
1221 | struct per_cpu_pageset *pcp; | 1221 | struct per_cpu_pageset *pcp; |
1222 | struct zone *zone; | 1222 | struct zone *zone; |
1223 | 1223 | ||
1224 | /* | 1224 | /* |
1225 | * Allocate in the BSS so we won't require allocation in | 1225 | * Allocate in the BSS so we won't require allocation in |
1226 | * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y | 1226 | * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y |
1227 | */ | 1227 | */ |
1228 | static cpumask_t cpus_with_pcps; | 1228 | static cpumask_t cpus_with_pcps; |
1229 | 1229 | ||
1230 | /* | 1230 | /* |
1231 | * We don't care about racing with CPU hotplug events | 1231 | * We don't care about racing with CPU hotplug events |
1232 | * as the offline notification will cause the notified | 1232 | * as the offline notification will cause the notified |
1233 | * cpu to drain that CPU's pcps, and on_each_cpu_mask() | 1233 | * cpu to drain that CPU's pcps, and on_each_cpu_mask() |
1234 | * disables preemption as part of its processing. | 1234 | * disables preemption as part of its processing. |
1235 | */ | 1235 | */ |
1236 | for_each_online_cpu(cpu) { | 1236 | for_each_online_cpu(cpu) { |
1237 | bool has_pcps = false; | 1237 | bool has_pcps = false; |
1238 | for_each_populated_zone(zone) { | 1238 | for_each_populated_zone(zone) { |
1239 | pcp = per_cpu_ptr(zone->pageset, cpu); | 1239 | pcp = per_cpu_ptr(zone->pageset, cpu); |
1240 | if (pcp->pcp.count) { | 1240 | if (pcp->pcp.count) { |
1241 | has_pcps = true; | 1241 | has_pcps = true; |
1242 | break; | 1242 | break; |
1243 | } | 1243 | } |
1244 | } | 1244 | } |
1245 | if (has_pcps) | 1245 | if (has_pcps) |
1246 | cpumask_set_cpu(cpu, &cpus_with_pcps); | 1246 | cpumask_set_cpu(cpu, &cpus_with_pcps); |
1247 | else | 1247 | else |
1248 | cpumask_clear_cpu(cpu, &cpus_with_pcps); | 1248 | cpumask_clear_cpu(cpu, &cpus_with_pcps); |
1249 | } | 1249 | } |
1250 | on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); | 1250 | on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); |
1251 | } | 1251 | } |
1252 | 1252 | ||
1253 | #ifdef CONFIG_HIBERNATION | 1253 | #ifdef CONFIG_HIBERNATION |
1254 | 1254 | ||
1255 | void mark_free_pages(struct zone *zone) | 1255 | void mark_free_pages(struct zone *zone) |
1256 | { | 1256 | { |
1257 | unsigned long pfn, max_zone_pfn; | 1257 | unsigned long pfn, max_zone_pfn; |
1258 | unsigned long flags; | 1258 | unsigned long flags; |
1259 | int order, t; | 1259 | int order, t; |
1260 | struct list_head *curr; | 1260 | struct list_head *curr; |
1261 | 1261 | ||
1262 | if (!zone->spanned_pages) | 1262 | if (!zone->spanned_pages) |
1263 | return; | 1263 | return; |
1264 | 1264 | ||
1265 | spin_lock_irqsave(&zone->lock, flags); | 1265 | spin_lock_irqsave(&zone->lock, flags); |
1266 | 1266 | ||
1267 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1267 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
1268 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1268 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
1269 | if (pfn_valid(pfn)) { | 1269 | if (pfn_valid(pfn)) { |
1270 | struct page *page = pfn_to_page(pfn); | 1270 | struct page *page = pfn_to_page(pfn); |
1271 | 1271 | ||
1272 | if (!swsusp_page_is_forbidden(page)) | 1272 | if (!swsusp_page_is_forbidden(page)) |
1273 | swsusp_unset_page_free(page); | 1273 | swsusp_unset_page_free(page); |
1274 | } | 1274 | } |
1275 | 1275 | ||
1276 | for_each_migratetype_order(order, t) { | 1276 | for_each_migratetype_order(order, t) { |
1277 | list_for_each(curr, &zone->free_area[order].free_list[t]) { | 1277 | list_for_each(curr, &zone->free_area[order].free_list[t]) { |
1278 | unsigned long i; | 1278 | unsigned long i; |
1279 | 1279 | ||
1280 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 1280 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
1281 | for (i = 0; i < (1UL << order); i++) | 1281 | for (i = 0; i < (1UL << order); i++) |
1282 | swsusp_set_page_free(pfn_to_page(pfn + i)); | 1282 | swsusp_set_page_free(pfn_to_page(pfn + i)); |
1283 | } | 1283 | } |
1284 | } | 1284 | } |
1285 | spin_unlock_irqrestore(&zone->lock, flags); | 1285 | spin_unlock_irqrestore(&zone->lock, flags); |
1286 | } | 1286 | } |
1287 | #endif /* CONFIG_PM */ | 1287 | #endif /* CONFIG_PM */ |
1288 | 1288 | ||
1289 | /* | 1289 | /* |
1290 | * Free a 0-order page | 1290 | * Free a 0-order page |
1291 | * cold == 1 ? free a cold page : free a hot page | 1291 | * cold == 1 ? free a cold page : free a hot page |
1292 | */ | 1292 | */ |
1293 | void free_hot_cold_page(struct page *page, int cold) | 1293 | void free_hot_cold_page(struct page *page, int cold) |
1294 | { | 1294 | { |
1295 | struct zone *zone = page_zone(page); | 1295 | struct zone *zone = page_zone(page); |
1296 | struct per_cpu_pages *pcp; | 1296 | struct per_cpu_pages *pcp; |
1297 | unsigned long flags; | 1297 | unsigned long flags; |
1298 | int migratetype; | 1298 | int migratetype; |
1299 | int wasMlocked = __TestClearPageMlocked(page); | 1299 | int wasMlocked = __TestClearPageMlocked(page); |
1300 | 1300 | ||
1301 | if (!free_pages_prepare(page, 0)) | 1301 | if (!free_pages_prepare(page, 0)) |
1302 | return; | 1302 | return; |
1303 | 1303 | ||
1304 | migratetype = get_pageblock_migratetype(page); | 1304 | migratetype = get_pageblock_migratetype(page); |
1305 | set_page_private(page, migratetype); | 1305 | set_page_private(page, migratetype); |
1306 | local_irq_save(flags); | 1306 | local_irq_save(flags); |
1307 | if (unlikely(wasMlocked)) | 1307 | if (unlikely(wasMlocked)) |
1308 | free_page_mlock(page); | 1308 | free_page_mlock(page); |
1309 | __count_vm_event(PGFREE); | 1309 | __count_vm_event(PGFREE); |
1310 | 1310 | ||
1311 | /* | 1311 | /* |
1312 | * We only track unmovable, reclaimable and movable on pcp lists. | 1312 | * We only track unmovable, reclaimable and movable on pcp lists. |
1313 | * Free ISOLATE pages back to the allocator because they are being | 1313 | * Free ISOLATE pages back to the allocator because they are being |
1314 | * offlined but treat RESERVE as movable pages so we can get those | 1314 | * offlined but treat RESERVE as movable pages so we can get those |
1315 | * areas back if necessary. Otherwise, we may have to free | 1315 | * areas back if necessary. Otherwise, we may have to free |
1316 | * excessively into the page allocator | 1316 | * excessively into the page allocator |
1317 | */ | 1317 | */ |
1318 | if (migratetype >= MIGRATE_PCPTYPES) { | 1318 | if (migratetype >= MIGRATE_PCPTYPES) { |
1319 | if (unlikely(migratetype == MIGRATE_ISOLATE)) { | 1319 | if (unlikely(migratetype == MIGRATE_ISOLATE)) { |
1320 | free_one_page(zone, page, 0, migratetype); | 1320 | free_one_page(zone, page, 0, migratetype); |
1321 | goto out; | 1321 | goto out; |
1322 | } | 1322 | } |
1323 | migratetype = MIGRATE_MOVABLE; | 1323 | migratetype = MIGRATE_MOVABLE; |
1324 | } | 1324 | } |
1325 | 1325 | ||
1326 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 1326 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
1327 | if (cold) | 1327 | if (cold) |
1328 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1328 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
1329 | else | 1329 | else |
1330 | list_add(&page->lru, &pcp->lists[migratetype]); | 1330 | list_add(&page->lru, &pcp->lists[migratetype]); |
1331 | pcp->count++; | 1331 | pcp->count++; |
1332 | if (pcp->count >= pcp->high) { | 1332 | if (pcp->count >= pcp->high) { |
1333 | free_pcppages_bulk(zone, pcp->batch, pcp); | 1333 | free_pcppages_bulk(zone, pcp->batch, pcp); |
1334 | pcp->count -= pcp->batch; | 1334 | pcp->count -= pcp->batch; |
1335 | } | 1335 | } |
1336 | 1336 | ||
1337 | out: | 1337 | out: |
1338 | local_irq_restore(flags); | 1338 | local_irq_restore(flags); |
1339 | } | 1339 | } |
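The tail of free_hot_cold_page() keeps the per-cpu list bounded: once pcp->count reaches pcp->high, a batch of pcp->batch pages is handed back to the buddy lists in one bulk free. A toy user-space model of that high/batch behaviour, with invented numbers rather than the kernel's tuned defaults:

#include <stdio.h>

/* Toy per-cpu page cache: once "count" reaches "high", a "batch" worth of
 * pages is handed back to the (imaginary) buddy allocator in one go. */
struct toy_pcp {
        int count;
        int high;
        int batch;
};

static void toy_free_page(struct toy_pcp *pcp)
{
        pcp->count++;                           /* page added to the pcp list */
        if (pcp->count >= pcp->high) {
                printf("bulk-freeing %d of %d cached pages\n",
                       pcp->batch, pcp->count);
                pcp->count -= pcp->batch;       /* one batch back to the buddy lists */
        }
}

int main(void)
{
        struct toy_pcp pcp = { .count = 0, .high = 6, .batch = 3 };
        int i;

        for (i = 0; i < 10; i++)
                toy_free_page(&pcp);
        printf("pages left on the pcp list: %d\n", pcp.count);
        return 0;
}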
1340 | 1340 | ||
1341 | /* | 1341 | /* |
1342 | * Free a list of 0-order pages | 1342 | * Free a list of 0-order pages |
1343 | */ | 1343 | */ |
1344 | void free_hot_cold_page_list(struct list_head *list, int cold) | 1344 | void free_hot_cold_page_list(struct list_head *list, int cold) |
1345 | { | 1345 | { |
1346 | struct page *page, *next; | 1346 | struct page *page, *next; |
1347 | 1347 | ||
1348 | list_for_each_entry_safe(page, next, list, lru) { | 1348 | list_for_each_entry_safe(page, next, list, lru) { |
1349 | trace_mm_page_free_batched(page, cold); | 1349 | trace_mm_page_free_batched(page, cold); |
1350 | free_hot_cold_page(page, cold); | 1350 | free_hot_cold_page(page, cold); |
1351 | } | 1351 | } |
1352 | } | 1352 | } |
1353 | 1353 | ||
1354 | /* | 1354 | /* |
1355 | * split_page takes a non-compound higher-order page, and splits it into | 1355 | * split_page takes a non-compound higher-order page, and splits it into |
1356 | * n (1<<order) sub-pages: page[0..n] | 1356 | * n (1<<order) sub-pages: page[0..n] |
1357 | * Each sub-page must be freed individually. | 1357 | * Each sub-page must be freed individually. |
1358 | * | 1358 | * |
1359 | * Note: this is probably too low level an operation for use in drivers. | 1359 | * Note: this is probably too low level an operation for use in drivers. |
1360 | * Please consult with lkml before using this in your driver. | 1360 | * Please consult with lkml before using this in your driver. |
1361 | */ | 1361 | */ |
1362 | void split_page(struct page *page, unsigned int order) | 1362 | void split_page(struct page *page, unsigned int order) |
1363 | { | 1363 | { |
1364 | int i; | 1364 | int i; |
1365 | 1365 | ||
1366 | VM_BUG_ON(PageCompound(page)); | 1366 | VM_BUG_ON(PageCompound(page)); |
1367 | VM_BUG_ON(!page_count(page)); | 1367 | VM_BUG_ON(!page_count(page)); |
1368 | 1368 | ||
1369 | #ifdef CONFIG_KMEMCHECK | 1369 | #ifdef CONFIG_KMEMCHECK |
1370 | /* | 1370 | /* |
1371 | * Split shadow pages too, because free(page[0]) would | 1371 | * Split shadow pages too, because free(page[0]) would |
1372 | * otherwise free the whole shadow. | 1372 | * otherwise free the whole shadow. |
1373 | */ | 1373 | */ |
1374 | if (kmemcheck_page_is_tracked(page)) | 1374 | if (kmemcheck_page_is_tracked(page)) |
1375 | split_page(virt_to_page(page[0].shadow), order); | 1375 | split_page(virt_to_page(page[0].shadow), order); |
1376 | #endif | 1376 | #endif |
1377 | 1377 | ||
1378 | for (i = 1; i < (1 << order); i++) | 1378 | for (i = 1; i < (1 << order); i++) |
1379 | set_page_refcounted(page + i); | 1379 | set_page_refcounted(page + i); |
1380 | } | 1380 | } |
1381 | 1381 | ||
1382 | /* | 1382 | /* |
1383 | * Similar to split_page except the page is already free. As this is only | 1383 | * Similar to split_page except the page is already free. As this is only |
1384 | * being used for migration, the migratetype of the block also changes. | 1384 | * being used for migration, the migratetype of the block also changes. |
1385 | * As this is called with interrupts disabled, the caller is responsible | 1385 | * As this is called with interrupts disabled, the caller is responsible |
1386 | * for calling arch_alloc_page() and kernel_map_pages() after interrupts | 1386 | * for calling arch_alloc_page() and kernel_map_pages() after interrupts |
1387 | * are enabled. | 1387 | * are enabled. |
1388 | * | 1388 | * |
1389 | * Note: this is probably too low level an operation for use in drivers. | 1389 | * Note: this is probably too low level an operation for use in drivers. |
1390 | * Please consult with lkml before using this in your driver. | 1390 | * Please consult with lkml before using this in your driver. |
1391 | */ | 1391 | */ |
1392 | int split_free_page(struct page *page) | 1392 | int split_free_page(struct page *page) |
1393 | { | 1393 | { |
1394 | unsigned int order; | 1394 | unsigned int order; |
1395 | unsigned long watermark; | 1395 | unsigned long watermark; |
1396 | struct zone *zone; | 1396 | struct zone *zone; |
1397 | 1397 | ||
1398 | BUG_ON(!PageBuddy(page)); | 1398 | BUG_ON(!PageBuddy(page)); |
1399 | 1399 | ||
1400 | zone = page_zone(page); | 1400 | zone = page_zone(page); |
1401 | order = page_order(page); | 1401 | order = page_order(page); |
1402 | 1402 | ||
1403 | /* Obey watermarks as if the page was being allocated */ | 1403 | /* Obey watermarks as if the page was being allocated */ |
1404 | watermark = low_wmark_pages(zone) + (1 << order); | 1404 | watermark = low_wmark_pages(zone) + (1 << order); |
1405 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1405 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) |
1406 | return 0; | 1406 | return 0; |
1407 | 1407 | ||
1408 | /* Remove page from free list */ | 1408 | /* Remove page from free list */ |
1409 | list_del(&page->lru); | 1409 | list_del(&page->lru); |
1410 | zone->free_area[order].nr_free--; | 1410 | zone->free_area[order].nr_free--; |
1411 | rmv_page_order(page); | 1411 | rmv_page_order(page); |
1412 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); | 1412 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); |
1413 | 1413 | ||
1414 | /* Split into individual pages */ | 1414 | /* Split into individual pages */ |
1415 | set_page_refcounted(page); | 1415 | set_page_refcounted(page); |
1416 | split_page(page, order); | 1416 | split_page(page, order); |
1417 | 1417 | ||
1418 | if (order >= pageblock_order - 1) { | 1418 | if (order >= pageblock_order - 1) { |
1419 | struct page *endpage = page + (1 << order) - 1; | 1419 | struct page *endpage = page + (1 << order) - 1; |
1420 | for (; page < endpage; page += pageblock_nr_pages) { | 1420 | for (; page < endpage; page += pageblock_nr_pages) { |
1421 | int mt = get_pageblock_migratetype(page); | 1421 | int mt = get_pageblock_migratetype(page); |
1422 | if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) | 1422 | if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) |
1423 | set_pageblock_migratetype(page, | 1423 | set_pageblock_migratetype(page, |
1424 | MIGRATE_MOVABLE); | 1424 | MIGRATE_MOVABLE); |
1425 | } | 1425 | } |
1426 | } | 1426 | } |
1427 | 1427 | ||
1428 | return 1 << order; | 1428 | return 1 << order; |
1429 | } | 1429 | } |
1430 | 1430 | ||
1431 | /* | 1431 | /* |
1432 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1432 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But |
1433 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1433 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
1434 | * or two. | 1434 | * or two. |
1435 | */ | 1435 | */ |
1436 | static inline | 1436 | static inline |
1437 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1437 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
1438 | struct zone *zone, int order, gfp_t gfp_flags, | 1438 | struct zone *zone, int order, gfp_t gfp_flags, |
1439 | int migratetype) | 1439 | int migratetype) |
1440 | { | 1440 | { |
1441 | unsigned long flags; | 1441 | unsigned long flags; |
1442 | struct page *page; | 1442 | struct page *page; |
1443 | int cold = !!(gfp_flags & __GFP_COLD); | 1443 | int cold = !!(gfp_flags & __GFP_COLD); |
1444 | 1444 | ||
1445 | again: | 1445 | again: |
1446 | if (likely(order == 0)) { | 1446 | if (likely(order == 0)) { |
1447 | struct per_cpu_pages *pcp; | 1447 | struct per_cpu_pages *pcp; |
1448 | struct list_head *list; | 1448 | struct list_head *list; |
1449 | 1449 | ||
1450 | local_irq_save(flags); | 1450 | local_irq_save(flags); |
1451 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 1451 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
1452 | list = &pcp->lists[migratetype]; | 1452 | list = &pcp->lists[migratetype]; |
1453 | if (list_empty(list)) { | 1453 | if (list_empty(list)) { |
1454 | pcp->count += rmqueue_bulk(zone, 0, | 1454 | pcp->count += rmqueue_bulk(zone, 0, |
1455 | pcp->batch, list, | 1455 | pcp->batch, list, |
1456 | migratetype, cold); | 1456 | migratetype, cold); |
1457 | if (unlikely(list_empty(list))) | 1457 | if (unlikely(list_empty(list))) |
1458 | goto failed; | 1458 | goto failed; |
1459 | } | 1459 | } |
1460 | 1460 | ||
1461 | if (cold) | 1461 | if (cold) |
1462 | page = list_entry(list->prev, struct page, lru); | 1462 | page = list_entry(list->prev, struct page, lru); |
1463 | else | 1463 | else |
1464 | page = list_entry(list->next, struct page, lru); | 1464 | page = list_entry(list->next, struct page, lru); |
1465 | 1465 | ||
1466 | list_del(&page->lru); | 1466 | list_del(&page->lru); |
1467 | pcp->count--; | 1467 | pcp->count--; |
1468 | } else { | 1468 | } else { |
1469 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { | 1469 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { |
1470 | /* | 1470 | /* |
1471 | * __GFP_NOFAIL is not to be used in new code. | 1471 | * __GFP_NOFAIL is not to be used in new code. |
1472 | * | 1472 | * |
1473 | * All __GFP_NOFAIL callers should be fixed so that they | 1473 | * All __GFP_NOFAIL callers should be fixed so that they |
1474 | * properly detect and handle allocation failures. | 1474 | * properly detect and handle allocation failures. |
1475 | * | 1475 | * |
1476 | * We most definitely don't want callers attempting to | 1476 | * We most definitely don't want callers attempting to |
1477 | * allocate greater than order-1 page units with | 1477 | * allocate greater than order-1 page units with |
1478 | * __GFP_NOFAIL. | 1478 | * __GFP_NOFAIL. |
1479 | */ | 1479 | */ |
1480 | WARN_ON_ONCE(order > 1); | 1480 | WARN_ON_ONCE(order > 1); |
1481 | } | 1481 | } |
1482 | spin_lock_irqsave(&zone->lock, flags); | 1482 | spin_lock_irqsave(&zone->lock, flags); |
1483 | page = __rmqueue(zone, order, migratetype); | 1483 | page = __rmqueue(zone, order, migratetype); |
1484 | spin_unlock(&zone->lock); | 1484 | spin_unlock(&zone->lock); |
1485 | if (!page) | 1485 | if (!page) |
1486 | goto failed; | 1486 | goto failed; |
1487 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | 1487 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); |
1488 | } | 1488 | } |
1489 | 1489 | ||
1490 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1490 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1491 | zone_statistics(preferred_zone, zone, gfp_flags); | 1491 | zone_statistics(preferred_zone, zone, gfp_flags); |
1492 | local_irq_restore(flags); | 1492 | local_irq_restore(flags); |
1493 | 1493 | ||
1494 | VM_BUG_ON(bad_range(zone, page)); | 1494 | VM_BUG_ON(bad_range(zone, page)); |
1495 | if (prep_new_page(page, order, gfp_flags)) | 1495 | if (prep_new_page(page, order, gfp_flags)) |
1496 | goto again; | 1496 | goto again; |
1497 | return page; | 1497 | return page; |
1498 | 1498 | ||
1499 | failed: | 1499 | failed: |
1500 | local_irq_restore(flags); | 1500 | local_irq_restore(flags); |
1501 | return NULL; | 1501 | return NULL; |
1502 | } | 1502 | } |
1503 | 1503 | ||
1504 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ | 1504 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ |
1505 | #define ALLOC_WMARK_MIN WMARK_MIN | 1505 | #define ALLOC_WMARK_MIN WMARK_MIN |
1506 | #define ALLOC_WMARK_LOW WMARK_LOW | 1506 | #define ALLOC_WMARK_LOW WMARK_LOW |
1507 | #define ALLOC_WMARK_HIGH WMARK_HIGH | 1507 | #define ALLOC_WMARK_HIGH WMARK_HIGH |
1508 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | 1508 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ |
1509 | 1509 | ||
1510 | /* Mask to get the watermark bits */ | 1510 | /* Mask to get the watermark bits */ |
1511 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | 1511 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) |
1512 | 1512 | ||
1513 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 1513 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
1514 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 1514 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
1515 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 1515 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
1516 | 1516 | ||
1517 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 1517 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
1518 | 1518 | ||
1519 | static struct { | 1519 | static struct { |
1520 | struct fault_attr attr; | 1520 | struct fault_attr attr; |
1521 | 1521 | ||
1522 | u32 ignore_gfp_highmem; | 1522 | u32 ignore_gfp_highmem; |
1523 | u32 ignore_gfp_wait; | 1523 | u32 ignore_gfp_wait; |
1524 | u32 min_order; | 1524 | u32 min_order; |
1525 | } fail_page_alloc = { | 1525 | } fail_page_alloc = { |
1526 | .attr = FAULT_ATTR_INITIALIZER, | 1526 | .attr = FAULT_ATTR_INITIALIZER, |
1527 | .ignore_gfp_wait = 1, | 1527 | .ignore_gfp_wait = 1, |
1528 | .ignore_gfp_highmem = 1, | 1528 | .ignore_gfp_highmem = 1, |
1529 | .min_order = 1, | 1529 | .min_order = 1, |
1530 | }; | 1530 | }; |
1531 | 1531 | ||
1532 | static int __init setup_fail_page_alloc(char *str) | 1532 | static int __init setup_fail_page_alloc(char *str) |
1533 | { | 1533 | { |
1534 | return setup_fault_attr(&fail_page_alloc.attr, str); | 1534 | return setup_fault_attr(&fail_page_alloc.attr, str); |
1535 | } | 1535 | } |
1536 | __setup("fail_page_alloc=", setup_fail_page_alloc); | 1536 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
1537 | 1537 | ||
1538 | static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1538 | static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1539 | { | 1539 | { |
1540 | if (order < fail_page_alloc.min_order) | 1540 | if (order < fail_page_alloc.min_order) |
1541 | return false; | 1541 | return false; |
1542 | if (gfp_mask & __GFP_NOFAIL) | 1542 | if (gfp_mask & __GFP_NOFAIL) |
1543 | return false; | 1543 | return false; |
1544 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | 1544 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) |
1545 | return false; | 1545 | return false; |
1546 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | 1546 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) |
1547 | return false; | 1547 | return false; |
1548 | 1548 | ||
1549 | return should_fail(&fail_page_alloc.attr, 1 << order); | 1549 | return should_fail(&fail_page_alloc.attr, 1 << order); |
1550 | } | 1550 | } |
1551 | 1551 | ||
1552 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 1552 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
1553 | 1553 | ||
1554 | static int __init fail_page_alloc_debugfs(void) | 1554 | static int __init fail_page_alloc_debugfs(void) |
1555 | { | 1555 | { |
1556 | umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 1556 | umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
1557 | struct dentry *dir; | 1557 | struct dentry *dir; |
1558 | 1558 | ||
1559 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, | 1559 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, |
1560 | &fail_page_alloc.attr); | 1560 | &fail_page_alloc.attr); |
1561 | if (IS_ERR(dir)) | 1561 | if (IS_ERR(dir)) |
1562 | return PTR_ERR(dir); | 1562 | return PTR_ERR(dir); |
1563 | 1563 | ||
1564 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, | 1564 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, |
1565 | &fail_page_alloc.ignore_gfp_wait)) | 1565 | &fail_page_alloc.ignore_gfp_wait)) |
1566 | goto fail; | 1566 | goto fail; |
1567 | if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, | 1567 | if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, |
1568 | &fail_page_alloc.ignore_gfp_highmem)) | 1568 | &fail_page_alloc.ignore_gfp_highmem)) |
1569 | goto fail; | 1569 | goto fail; |
1570 | if (!debugfs_create_u32("min-order", mode, dir, | 1570 | if (!debugfs_create_u32("min-order", mode, dir, |
1571 | &fail_page_alloc.min_order)) | 1571 | &fail_page_alloc.min_order)) |
1572 | goto fail; | 1572 | goto fail; |
1573 | 1573 | ||
1574 | return 0; | 1574 | return 0; |
1575 | fail: | 1575 | fail: |
1576 | debugfs_remove_recursive(dir); | 1576 | debugfs_remove_recursive(dir); |
1577 | 1577 | ||
1578 | return -ENOMEM; | 1578 | return -ENOMEM; |
1579 | } | 1579 | } |
1580 | 1580 | ||
1581 | late_initcall(fail_page_alloc_debugfs); | 1581 | late_initcall(fail_page_alloc_debugfs); |
1582 | 1582 | ||
1583 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | 1583 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
1584 | 1584 | ||
1585 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | 1585 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
1586 | 1586 | ||
1587 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1587 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1588 | { | 1588 | { |
1589 | return false; | 1589 | return false; |
1590 | } | 1590 | } |
1591 | 1591 | ||
1592 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1592 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
1593 | 1593 | ||
1594 | /* | 1594 | /* |
1595 | * Return true if free pages are above 'mark'. This takes into account the order | 1595 | * Return true if free pages are above 'mark'. This takes into account the order |
1596 | * of the allocation. | 1596 | * of the allocation. |
1597 | */ | 1597 | */ |
1598 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1598 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1599 | int classzone_idx, int alloc_flags, long free_pages) | 1599 | int classzone_idx, int alloc_flags, long free_pages) |
1600 | { | 1600 | { |
1601 | /* free_pages may go negative - that's OK */ | 1601 | /* free_pages may go negative - that's OK */ |
1602 | long min = mark; | 1602 | long min = mark; |
1603 | long lowmem_reserve = z->lowmem_reserve[classzone_idx]; | 1603 | long lowmem_reserve = z->lowmem_reserve[classzone_idx]; |
1604 | int o; | 1604 | int o; |
1605 | 1605 | ||
1606 | free_pages -= (1 << order) - 1; | 1606 | free_pages -= (1 << order) - 1; |
1607 | if (alloc_flags & ALLOC_HIGH) | 1607 | if (alloc_flags & ALLOC_HIGH) |
1608 | min -= min / 2; | 1608 | min -= min / 2; |
1609 | if (alloc_flags & ALLOC_HARDER) | 1609 | if (alloc_flags & ALLOC_HARDER) |
1610 | min -= min / 4; | 1610 | min -= min / 4; |
1611 | 1611 | ||
1612 | if (free_pages <= min + lowmem_reserve) | 1612 | if (free_pages <= min + lowmem_reserve) |
1613 | return false; | 1613 | return false; |
1614 | for (o = 0; o < order; o++) { | 1614 | for (o = 0; o < order; o++) { |
1615 | /* At the next order, this order's pages become unavailable */ | 1615 | /* At the next order, this order's pages become unavailable */ |
1616 | free_pages -= z->free_area[o].nr_free << o; | 1616 | free_pages -= z->free_area[o].nr_free << o; |
1617 | 1617 | ||
1618 | /* Require fewer higher order pages to be free */ | 1618 | /* Require fewer higher order pages to be free */ |
1619 | min >>= 1; | 1619 | min >>= 1; |
1620 | 1620 | ||
1621 | if (free_pages <= min) | 1621 | if (free_pages <= min) |
1622 | return false; | 1622 | return false; |
1623 | } | 1623 | } |
1624 | return true; | 1624 | return true; |
1625 | } | 1625 | } |
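The loop in __zone_watermark_ok() walks the orders below the request: pages of each lower order are treated as unusable for this allocation, while the required margin is halved at every step. A stand-alone sketch of that loop with invented per-order free counts, ignoring lowmem_reserve and the ALLOC_HIGH/ALLOC_HARDER reductions:

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

/* Invented per-order free counts for a hypothetical zone. */
static unsigned long nr_free[MAX_ORDER] = { 300, 80, 20, 6, 2, 1, 0, 0, 0, 0, 0 };

/* Follows the shape of __zone_watermark_ok() for a single zone, ignoring
 * lowmem_reserve and the ALLOC_HIGH/ALLOC_HARDER reductions of "min". */
static bool watermark_ok(int order, long mark, long free_pages)
{
        long min = mark;
        int o;

        free_pages -= (1 << order) - 1;
        if (free_pages <= min)
                return false;

        for (o = 0; o < order; o++) {
                /* Pages of this order cannot satisfy a larger request. */
                free_pages -= (long)(nr_free[o] << o);
                /* Require fewer higher-order pages to be free. */
                min >>= 1;
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        long free_pages = 0;
        int o;

        for (o = 0; o < MAX_ORDER; o++)
                free_pages += (long)(nr_free[o] << o);

        printf("order-0 ok: %d, order-3 ok: %d\n",
               watermark_ok(0, 128, free_pages),
               watermark_ok(3, 128, free_pages));
        return 0;
}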
1626 | 1626 | ||
1627 | #ifdef CONFIG_MEMORY_ISOLATION | 1627 | #ifdef CONFIG_MEMORY_ISOLATION |
1628 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | 1628 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) |
1629 | { | 1629 | { |
1630 | if (unlikely(zone->nr_pageblock_isolate)) | 1630 | if (unlikely(zone->nr_pageblock_isolate)) |
1631 | return zone->nr_pageblock_isolate * pageblock_nr_pages; | 1631 | return zone->nr_pageblock_isolate * pageblock_nr_pages; |
1632 | return 0; | 1632 | return 0; |
1633 | } | 1633 | } |
1634 | #else | 1634 | #else |
1635 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | 1635 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) |
1636 | { | 1636 | { |
1637 | return 0; | 1637 | return 0; |
1638 | } | 1638 | } |
1639 | #endif | 1639 | #endif |
1640 | 1640 | ||
1641 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1641 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1642 | int classzone_idx, int alloc_flags) | 1642 | int classzone_idx, int alloc_flags) |
1643 | { | 1643 | { |
1644 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1644 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1645 | zone_page_state(z, NR_FREE_PAGES)); | 1645 | zone_page_state(z, NR_FREE_PAGES)); |
1646 | } | 1646 | } |
1647 | 1647 | ||
1648 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | 1648 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, |
1649 | int classzone_idx, int alloc_flags) | 1649 | int classzone_idx, int alloc_flags) |
1650 | { | 1650 | { |
1651 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | 1651 | long free_pages = zone_page_state(z, NR_FREE_PAGES); |
1652 | 1652 | ||
1653 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | 1653 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
1654 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | 1654 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
1655 | 1655 | ||
1656 | /* | 1656 | /* |
1657 | * If the zone has MIGRATE_ISOLATE type free pages, we should consider | 1657 | * If the zone has MIGRATE_ISOLATE type free pages, we should consider |
1658 | * it. nr_zone_isolate_freepages is never accurate so kswapd might not | 1658 | * it. nr_zone_isolate_freepages is never accurate so kswapd might not |
1659 | * sleep although it could do so. But this is more desirable for memory | 1659 | * sleep although it could do so. But this is more desirable for memory |
1660 | * hotplug than sleeping which can cause a livelock in the direct | 1660 | * hotplug than sleeping which can cause a livelock in the direct |
1661 | * reclaim path. | 1661 | * reclaim path. |
1662 | */ | 1662 | */ |
1663 | free_pages -= nr_zone_isolate_freepages(z); | 1663 | free_pages -= nr_zone_isolate_freepages(z); |
1664 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1664 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1665 | free_pages); | 1665 | free_pages); |
1666 | } | 1666 | } |
1667 | 1667 | ||
1668 | #ifdef CONFIG_NUMA | 1668 | #ifdef CONFIG_NUMA |
1669 | /* | 1669 | /* |
1670 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to | 1670 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to |
1671 | * skip over zones that are not allowed by the cpuset, or that have | 1671 | * skip over zones that are not allowed by the cpuset, or that have |
1672 | * been recently (in last second) found to be nearly full. See further | 1672 | * been recently (in last second) found to be nearly full. See further |
1673 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | 1673 | * comments in mmzone.h. Reduces cache footprint of zonelist scans |
1674 | * that have to skip over a lot of full or unallowed zones. | 1674 | * that have to skip over a lot of full or unallowed zones. |
1675 | * | 1675 | * |
1676 | * If the zonelist cache is present in the passed in zonelist, then | 1676 | * If the zonelist cache is present in the passed in zonelist, then |
1677 | * returns a pointer to the allowed node mask (either the current | 1677 | * returns a pointer to the allowed node mask (either the current |
1678 | * task's mems_allowed, or node_states[N_HIGH_MEMORY].) | 1678 | * task's mems_allowed, or node_states[N_HIGH_MEMORY].) |
1679 | * | 1679 | * |
1680 | * If the zonelist cache is not available for this zonelist, does | 1680 | * If the zonelist cache is not available for this zonelist, does |
1681 | * nothing and returns NULL. | 1681 | * nothing and returns NULL. |
1682 | * | 1682 | * |
1683 | * If the fullzones BITMAP in the zonelist cache is stale (more than | 1683 | * If the fullzones BITMAP in the zonelist cache is stale (more than |
1684 | * a second since last zap'd) then we zap it out (clear its bits.) | 1684 | * a second since last zap'd) then we zap it out (clear its bits.) |
1685 | * | 1685 | * |
1686 | * We hold off even calling zlc_setup, until after we've checked the | 1686 | * We hold off even calling zlc_setup, until after we've checked the |
1687 | * first zone in the zonelist, on the theory that most allocations will | 1687 | * first zone in the zonelist, on the theory that most allocations will |
1688 | * be satisfied from that first zone, so best to examine that zone as | 1688 | * be satisfied from that first zone, so best to examine that zone as |
1689 | * quickly as we can. | 1689 | * quickly as we can. |
1690 | */ | 1690 | */ |
1691 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1691 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
1692 | { | 1692 | { |
1693 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1693 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1694 | nodemask_t *allowednodes; /* zonelist_cache approximation */ | 1694 | nodemask_t *allowednodes; /* zonelist_cache approximation */ |
1695 | 1695 | ||
1696 | zlc = zonelist->zlcache_ptr; | 1696 | zlc = zonelist->zlcache_ptr; |
1697 | if (!zlc) | 1697 | if (!zlc) |
1698 | return NULL; | 1698 | return NULL; |
1699 | 1699 | ||
1700 | if (time_after(jiffies, zlc->last_full_zap + HZ)) { | 1700 | if (time_after(jiffies, zlc->last_full_zap + HZ)) { |
1701 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1701 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1702 | zlc->last_full_zap = jiffies; | 1702 | zlc->last_full_zap = jiffies; |
1703 | } | 1703 | } |
1704 | 1704 | ||
1705 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1705 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1706 | &cpuset_current_mems_allowed : | 1706 | &cpuset_current_mems_allowed : |
1707 | &node_states[N_HIGH_MEMORY]; | 1707 | &node_states[N_HIGH_MEMORY]; |
1708 | return allowednodes; | 1708 | return allowednodes; |
1709 | } | 1709 | } |
1710 | 1710 | ||
1711 | /* | 1711 | /* |
1712 | * Given 'z' scanning a zonelist, run a couple of quick checks to see | 1712 | * Given 'z' scanning a zonelist, run a couple of quick checks to see |
1713 | * if it is worth looking at further for free memory: | 1713 | * if it is worth looking at further for free memory: |
1714 | * 1) Check that the zone isn't thought to be full (doesn't have its | 1714 | * 1) Check that the zone isn't thought to be full (doesn't have its |
1715 | * bit set in the zonelist_cache fullzones BITMAP). | 1715 | * bit set in the zonelist_cache fullzones BITMAP). |
1716 | * 2) Check that the zones node (obtained from the zonelist_cache | 1716 | * 2) Check that the zones node (obtained from the zonelist_cache |
1717 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. | 1717 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. |
1718 | * Return true (non-zero) if zone is worth looking at further, or | 1718 | * Return true (non-zero) if zone is worth looking at further, or |
1719 | * else return false (zero) if it is not. | 1719 | * else return false (zero) if it is not. |
1720 | * | 1720 | * |
1721 | * This check -ignores- the distinction between various watermarks, | 1721 | * This check -ignores- the distinction between various watermarks, |
1722 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is | 1722 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is |
1723 | * found to be full for any variation of these watermarks, it will | 1723 | * found to be full for any variation of these watermarks, it will |
1724 | * be considered full for up to one second by all requests, unless | 1724 | * be considered full for up to one second by all requests, unless |
1725 | * we are so low on memory on all allowed nodes that we are forced | 1725 | * we are so low on memory on all allowed nodes that we are forced |
1726 | * into the second scan of the zonelist. | 1726 | * into the second scan of the zonelist. |
1727 | * | 1727 | * |
1728 | * In the second scan we ignore this zonelist cache and exactly | 1728 | * In the second scan we ignore this zonelist cache and exactly |
1729 | * apply the watermarks to all zones, even if it is slower to do so. | 1729 | * apply the watermarks to all zones, even if it is slower to do so. |
1730 | * We are low on memory in the second scan, and should leave no stone | 1730 | * We are low on memory in the second scan, and should leave no stone |
1731 | * unturned looking for a free page. | 1731 | * unturned looking for a free page. |
1732 | */ | 1732 | */ |
1733 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, | 1733 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, |
1734 | nodemask_t *allowednodes) | 1734 | nodemask_t *allowednodes) |
1735 | { | 1735 | { |
1736 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1736 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1737 | int i; /* index of *z in zonelist zones */ | 1737 | int i; /* index of *z in zonelist zones */ |
1738 | int n; /* node that zone *z is on */ | 1738 | int n; /* node that zone *z is on */ |
1739 | 1739 | ||
1740 | zlc = zonelist->zlcache_ptr; | 1740 | zlc = zonelist->zlcache_ptr; |
1741 | if (!zlc) | 1741 | if (!zlc) |
1742 | return 1; | 1742 | return 1; |
1743 | 1743 | ||
1744 | i = z - zonelist->_zonerefs; | 1744 | i = z - zonelist->_zonerefs; |
1745 | n = zlc->z_to_n[i]; | 1745 | n = zlc->z_to_n[i]; |
1746 | 1746 | ||
1747 | /* This zone is worth trying if it is allowed but not full */ | 1747 | /* This zone is worth trying if it is allowed but not full */ |
1748 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); | 1748 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); |
1749 | } | 1749 | } |
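Together, zlc_setup() and zlc_zone_worth_trying() amount to a per-zonelist bitmap of recently-full zones plus a timestamp that throws the whole bitmap away after about a second. A stand-alone sketch of that caching pattern, using time() in place of jiffies and an arbitrary zone count:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define NR_ZONES 8

/* Toy analogue of struct zonelist_cache: one "recently full" flag per zone
 * plus the time of the last zap; the real code uses a bitmap and jiffies. */
struct zone_cache {
        bool full[NR_ZONES];
        time_t last_full_zap;
};

/* Forget all "full" marks once they are about a second old. */
static void cache_setup(struct zone_cache *zc)
{
        time_t now = time(NULL);

        if (difftime(now, zc->last_full_zap) >= 1.0) {
                memset(zc->full, 0, sizeof(zc->full));
                zc->last_full_zap = now;
        }
}

static bool zone_worth_trying(const struct zone_cache *zc, int zone)
{
        return !zc->full[zone];
}

int main(void)
{
        struct zone_cache zc = { .last_full_zap = time(NULL) };

        zc.full[2] = true;              /* zone 2 looked full a moment ago */
        cache_setup(&zc);               /* too fresh to be zapped yet */
        printf("zone 2 worth trying: %d\n", zone_worth_trying(&zc, 2));
        return 0;
}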
1750 | 1750 | ||
1751 | /* | 1751 | /* |
1752 | * Given 'z' scanning a zonelist, set the corresponding bit in | 1752 | * Given 'z' scanning a zonelist, set the corresponding bit in |
1753 | * zlc->fullzones, so that subsequent attempts to allocate a page | 1753 | * zlc->fullzones, so that subsequent attempts to allocate a page |
1754 | * from that zone don't waste time re-examining it. | 1754 | * from that zone don't waste time re-examining it. |
1755 | */ | 1755 | */ |
1756 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | 1756 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1757 | { | 1757 | { |
1758 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1758 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1759 | int i; /* index of *z in zonelist zones */ | 1759 | int i; /* index of *z in zonelist zones */ |
1760 | 1760 | ||
1761 | zlc = zonelist->zlcache_ptr; | 1761 | zlc = zonelist->zlcache_ptr; |
1762 | if (!zlc) | 1762 | if (!zlc) |
1763 | return; | 1763 | return; |
1764 | 1764 | ||
1765 | i = z - zonelist->_zonerefs; | 1765 | i = z - zonelist->_zonerefs; |
1766 | 1766 | ||
1767 | set_bit(i, zlc->fullzones); | 1767 | set_bit(i, zlc->fullzones); |
1768 | } | 1768 | } |
1769 | 1769 | ||
1770 | /* | 1770 | /* |
1771 | * clear all zones full, called after direct reclaim makes progress so that | 1771 | * clear all zones full, called after direct reclaim makes progress so that |
1772 | * a zone that was recently full is not skipped over for up to a second | 1772 | * a zone that was recently full is not skipped over for up to a second |
1773 | */ | 1773 | */ |
1774 | static void zlc_clear_zones_full(struct zonelist *zonelist) | 1774 | static void zlc_clear_zones_full(struct zonelist *zonelist) |
1775 | { | 1775 | { |
1776 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1776 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1777 | 1777 | ||
1778 | zlc = zonelist->zlcache_ptr; | 1778 | zlc = zonelist->zlcache_ptr; |
1779 | if (!zlc) | 1779 | if (!zlc) |
1780 | return; | 1780 | return; |
1781 | 1781 | ||
1782 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1782 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1783 | } | 1783 | } |
1784 | 1784 | ||
1785 | #else /* CONFIG_NUMA */ | 1785 | #else /* CONFIG_NUMA */ |
1786 | 1786 | ||
1787 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1787 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
1788 | { | 1788 | { |
1789 | return NULL; | 1789 | return NULL; |
1790 | } | 1790 | } |
1791 | 1791 | ||
1792 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, | 1792 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, |
1793 | nodemask_t *allowednodes) | 1793 | nodemask_t *allowednodes) |
1794 | { | 1794 | { |
1795 | return 1; | 1795 | return 1; |
1796 | } | 1796 | } |
1797 | 1797 | ||
1798 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | 1798 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1799 | { | 1799 | { |
1800 | } | 1800 | } |
1801 | 1801 | ||
1802 | static void zlc_clear_zones_full(struct zonelist *zonelist) | 1802 | static void zlc_clear_zones_full(struct zonelist *zonelist) |
1803 | { | 1803 | { |
1804 | } | 1804 | } |
1805 | #endif /* CONFIG_NUMA */ | 1805 | #endif /* CONFIG_NUMA */ |
1806 | 1806 | ||
1807 | /* | 1807 | /* |
1808 | * get_page_from_freelist goes through the zonelist trying to allocate | 1808 | * get_page_from_freelist goes through the zonelist trying to allocate |
1809 | * a page. | 1809 | * a page. |
1810 | */ | 1810 | */ |
1811 | static struct page * | 1811 | static struct page * |
1812 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1812 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1813 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 1813 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
1814 | struct zone *preferred_zone, int migratetype) | 1814 | struct zone *preferred_zone, int migratetype) |
1815 | { | 1815 | { |
1816 | struct zoneref *z; | 1816 | struct zoneref *z; |
1817 | struct page *page = NULL; | 1817 | struct page *page = NULL; |
1818 | int classzone_idx; | 1818 | int classzone_idx; |
1819 | struct zone *zone; | 1819 | struct zone *zone; |
1820 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1820 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1821 | int zlc_active = 0; /* set if using zonelist_cache */ | 1821 | int zlc_active = 0; /* set if using zonelist_cache */ |
1822 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1822 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1823 | 1823 | ||
1824 | classzone_idx = zone_idx(preferred_zone); | 1824 | classzone_idx = zone_idx(preferred_zone); |
1825 | zonelist_scan: | 1825 | zonelist_scan: |
1826 | /* | 1826 | /* |
1827 | * Scan zonelist, looking for a zone with enough free pages. | 1827 | * Scan zonelist, looking for a zone with enough free pages. |
1828 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1828 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1829 | */ | 1829 | */ |
1830 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 1830 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1831 | high_zoneidx, nodemask) { | 1831 | high_zoneidx, nodemask) { |
1832 | if (NUMA_BUILD && zlc_active && | 1832 | if (NUMA_BUILD && zlc_active && |
1833 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1833 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1834 | continue; | 1834 | continue; |
1835 | if ((alloc_flags & ALLOC_CPUSET) && | 1835 | if ((alloc_flags & ALLOC_CPUSET) && |
1836 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1836 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1837 | continue; | 1837 | continue; |
1838 | /* | 1838 | /* |
1839 | * When allocating a page cache page for writing, we | 1839 | * When allocating a page cache page for writing, we |
1840 | * want to get it from a zone that is within its dirty | 1840 | * want to get it from a zone that is within its dirty |
1841 | * limit, such that no single zone holds more than its | 1841 | * limit, such that no single zone holds more than its |
1842 | * proportional share of globally allowed dirty pages. | 1842 | * proportional share of globally allowed dirty pages. |
1843 | * The dirty limits take into account the zone's | 1843 | * The dirty limits take into account the zone's |
1844 | * lowmem reserves and high watermark so that kswapd | 1844 | * lowmem reserves and high watermark so that kswapd |
1845 | * should be able to balance it without having to | 1845 | * should be able to balance it without having to |
1846 | * write pages from its LRU list. | 1846 | * write pages from its LRU list. |
1847 | * | 1847 | * |
1848 | * This may look like it could increase pressure on | 1848 | * This may look like it could increase pressure on |
1849 | * lower zones by failing allocations in higher zones | 1849 | * lower zones by failing allocations in higher zones |
1850 | * before they are full. But the pages that do spill | 1850 | * before they are full. But the pages that do spill |
1851 | * over are limited as the lower zones are protected | 1851 | * over are limited as the lower zones are protected |
1852 | * by this very same mechanism. It should not become | 1852 | * by this very same mechanism. It should not become |
1853 | * a practical burden to them. | 1853 | * a practical burden to them. |
1854 | * | 1854 | * |
1855 | * XXX: For now, allow allocations to potentially | 1855 | * XXX: For now, allow allocations to potentially |
1856 | * exceed the per-zone dirty limit in the slowpath | 1856 | * exceed the per-zone dirty limit in the slowpath |
1857 | * (ALLOC_WMARK_LOW unset) before going into reclaim, | 1857 | * (ALLOC_WMARK_LOW unset) before going into reclaim, |
1858 | * which is important when on a NUMA setup the allowed | 1858 | * which is important when on a NUMA setup the allowed |
1859 | * zones are together not big enough to reach the | 1859 | * zones are together not big enough to reach the |
1860 | * global limit. The proper fix for these situations | 1860 | * global limit. The proper fix for these situations |
1861 | * will require awareness of zones in the | 1861 | * will require awareness of zones in the |
1862 | * dirty-throttling and the flusher threads. | 1862 | * dirty-throttling and the flusher threads. |
1863 | */ | 1863 | */ |
1864 | if ((alloc_flags & ALLOC_WMARK_LOW) && | 1864 | if ((alloc_flags & ALLOC_WMARK_LOW) && |
1865 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) | 1865 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) |
1866 | goto this_zone_full; | 1866 | goto this_zone_full; |
1867 | 1867 | ||
1868 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | 1868 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); |
1869 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1869 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
1870 | unsigned long mark; | 1870 | unsigned long mark; |
1871 | int ret; | 1871 | int ret; |
1872 | 1872 | ||
1873 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 1873 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
1874 | if (zone_watermark_ok(zone, order, mark, | 1874 | if (zone_watermark_ok(zone, order, mark, |
1875 | classzone_idx, alloc_flags)) | 1875 | classzone_idx, alloc_flags)) |
1876 | goto try_this_zone; | 1876 | goto try_this_zone; |
1877 | 1877 | ||
1878 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | 1878 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { |
1879 | /* | 1879 | /* |
1880 | * we do zlc_setup if there are multiple nodes | 1880 | * we do zlc_setup if there are multiple nodes |
1881 | * and before considering the first zone allowed | 1881 | * and before considering the first zone allowed |
1882 | * by the cpuset. | 1882 | * by the cpuset. |
1883 | */ | 1883 | */ |
1884 | allowednodes = zlc_setup(zonelist, alloc_flags); | 1884 | allowednodes = zlc_setup(zonelist, alloc_flags); |
1885 | zlc_active = 1; | 1885 | zlc_active = 1; |
1886 | did_zlc_setup = 1; | 1886 | did_zlc_setup = 1; |
1887 | } | 1887 | } |
1888 | 1888 | ||
1889 | if (zone_reclaim_mode == 0) | 1889 | if (zone_reclaim_mode == 0) |
1890 | goto this_zone_full; | 1890 | goto this_zone_full; |
1891 | 1891 | ||
1892 | /* | 1892 | /* |
1893 | * As we may have just activated ZLC, check if the first | 1893 | * As we may have just activated ZLC, check if the first |
1894 | * eligible zone has failed zone_reclaim recently. | 1894 | * eligible zone has failed zone_reclaim recently. |
1895 | */ | 1895 | */ |
1896 | if (NUMA_BUILD && zlc_active && | 1896 | if (NUMA_BUILD && zlc_active && |
1897 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1897 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1898 | continue; | 1898 | continue; |
1899 | 1899 | ||
1900 | ret = zone_reclaim(zone, gfp_mask, order); | 1900 | ret = zone_reclaim(zone, gfp_mask, order); |
1901 | switch (ret) { | 1901 | switch (ret) { |
1902 | case ZONE_RECLAIM_NOSCAN: | 1902 | case ZONE_RECLAIM_NOSCAN: |
1903 | /* did not scan */ | 1903 | /* did not scan */ |
1904 | continue; | 1904 | continue; |
1905 | case ZONE_RECLAIM_FULL: | 1905 | case ZONE_RECLAIM_FULL: |
1906 | /* scanned but unreclaimable */ | 1906 | /* scanned but unreclaimable */ |
1907 | continue; | 1907 | continue; |
1908 | default: | 1908 | default: |
1909 | /* did we reclaim enough */ | 1909 | /* did we reclaim enough */ |
1910 | if (!zone_watermark_ok(zone, order, mark, | 1910 | if (!zone_watermark_ok(zone, order, mark, |
1911 | classzone_idx, alloc_flags)) | 1911 | classzone_idx, alloc_flags)) |
1912 | goto this_zone_full; | 1912 | goto this_zone_full; |
1913 | } | 1913 | } |
1914 | } | 1914 | } |
1915 | 1915 | ||
1916 | try_this_zone: | 1916 | try_this_zone: |
1917 | page = buffered_rmqueue(preferred_zone, zone, order, | 1917 | page = buffered_rmqueue(preferred_zone, zone, order, |
1918 | gfp_mask, migratetype); | 1918 | gfp_mask, migratetype); |
1919 | if (page) | 1919 | if (page) |
1920 | break; | 1920 | break; |
1921 | this_zone_full: | 1921 | this_zone_full: |
1922 | if (NUMA_BUILD) | 1922 | if (NUMA_BUILD) |
1923 | zlc_mark_zone_full(zonelist, z); | 1923 | zlc_mark_zone_full(zonelist, z); |
1924 | } | 1924 | } |
1925 | 1925 | ||
1926 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1926 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { |
1927 | /* Disable zlc cache for second zonelist scan */ | 1927 | /* Disable zlc cache for second zonelist scan */ |
1928 | zlc_active = 0; | 1928 | zlc_active = 0; |
1929 | goto zonelist_scan; | 1929 | goto zonelist_scan; |
1930 | } | 1930 | } |
1931 | return page; | 1931 | return page; |
1932 | } | 1932 | } |
1933 | 1933 | ||
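
get_page_from_freelist() applies the same sequence of filters to every zone: skip zones the zonelist cache already marked full, skip zones the cpuset forbids, spill __GFP_WRITE allocations past zones over their dirty limit, then check the watermark and, on NUMA, give zone_reclaim() one chance before marking the zone full. A condensed sketch of that ordering with stand-in predicates (illustrative only; the NOSCAN/FULL reclaim outcomes are collapsed into a single helper, and the second pass with the cache disabled is left to the caller).

	#include <stdbool.h>
	#include <stddef.h>

	struct zone;	/* opaque; never dereferenced in this sketch */

	/* Stand-ins for the checks made inside the real scan. */
	static bool zlc_says_full(int idx)            { (void)idx; return false; }
	static bool cpuset_allows(struct zone *z)     { (void)z; return true; }
	static bool over_dirty_limit(struct zone *z)  { (void)z; return false; }
	static bool watermark_ok(struct zone *z)      { (void)z; return true; }
	static bool reclaim_helped(struct zone *z)    { (void)z; return false; }
	static void *try_allocate(struct zone *z)     { (void)z; return NULL; }
	static void mark_full(int idx)                { (void)idx; }

	void *scan_zonelist(struct zone **zones, int nr)
	{
		for (int i = 0; i < nr; i++) {
			struct zone *z = zones[i];
			void *page;

			if (zlc_says_full(i))
				continue;	/* cached "full" verdict, skip cheaply */
			if (!cpuset_allows(z))
				continue;	/* policy, not fullness: do not mark */
			if (over_dirty_limit(z))
				goto full;	/* spill page-cache writes to the next zone */
			if (!watermark_ok(z) && !reclaim_helped(z))
				goto full;	/* below watermark and zone_reclaim did not help */

			page = try_allocate(z);
			if (page)
				return page;	/* success ends the scan */
	full:
			mark_full(i);		/* remember for the next pass */
		}
		return NULL;	/* caller may rescan with the cache disabled */
	}
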
1934 | /* | 1934 | /* |
1935 | * Large machines with many possible nodes should not always dump per-node | 1935 | * Large machines with many possible nodes should not always dump per-node |
1936 | * meminfo in irq context. | 1936 | * meminfo in irq context. |
1937 | */ | 1937 | */ |
1938 | static inline bool should_suppress_show_mem(void) | 1938 | static inline bool should_suppress_show_mem(void) |
1939 | { | 1939 | { |
1940 | bool ret = false; | 1940 | bool ret = false; |
1941 | 1941 | ||
1942 | #if NODES_SHIFT > 8 | 1942 | #if NODES_SHIFT > 8 |
1943 | ret = in_interrupt(); | 1943 | ret = in_interrupt(); |
1944 | #endif | 1944 | #endif |
1945 | return ret; | 1945 | return ret; |
1946 | } | 1946 | } |
1947 | 1947 | ||
1948 | static DEFINE_RATELIMIT_STATE(nopage_rs, | 1948 | static DEFINE_RATELIMIT_STATE(nopage_rs, |
1949 | DEFAULT_RATELIMIT_INTERVAL, | 1949 | DEFAULT_RATELIMIT_INTERVAL, |
1950 | DEFAULT_RATELIMIT_BURST); | 1950 | DEFAULT_RATELIMIT_BURST); |
1951 | 1951 | ||
1952 | void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | 1952 | void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) |
1953 | { | 1953 | { |
1954 | unsigned int filter = SHOW_MEM_FILTER_NODES; | 1954 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
1955 | 1955 | ||
1956 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || | 1956 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || |
1957 | debug_guardpage_minorder() > 0) | 1957 | debug_guardpage_minorder() > 0) |
1958 | return; | 1958 | return; |
1959 | 1959 | ||
1960 | /* | 1960 | /* |
1961 | * This documents exceptions given to allocations in certain | 1961 | * This documents exceptions given to allocations in certain |
1962 | * contexts that are allowed to allocate outside current's set | 1962 | * contexts that are allowed to allocate outside current's set |
1963 | * of allowed nodes. | 1963 | * of allowed nodes. |
1964 | */ | 1964 | */ |
1965 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | 1965 | if (!(gfp_mask & __GFP_NOMEMALLOC)) |
1966 | if (test_thread_flag(TIF_MEMDIE) || | 1966 | if (test_thread_flag(TIF_MEMDIE) || |
1967 | (current->flags & (PF_MEMALLOC | PF_EXITING))) | 1967 | (current->flags & (PF_MEMALLOC | PF_EXITING))) |
1968 | filter &= ~SHOW_MEM_FILTER_NODES; | 1968 | filter &= ~SHOW_MEM_FILTER_NODES; |
1969 | if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) | 1969 | if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) |
1970 | filter &= ~SHOW_MEM_FILTER_NODES; | 1970 | filter &= ~SHOW_MEM_FILTER_NODES; |
1971 | 1971 | ||
1972 | if (fmt) { | 1972 | if (fmt) { |
1973 | struct va_format vaf; | 1973 | struct va_format vaf; |
1974 | va_list args; | 1974 | va_list args; |
1975 | 1975 | ||
1976 | va_start(args, fmt); | 1976 | va_start(args, fmt); |
1977 | 1977 | ||
1978 | vaf.fmt = fmt; | 1978 | vaf.fmt = fmt; |
1979 | vaf.va = &args; | 1979 | vaf.va = &args; |
1980 | 1980 | ||
1981 | pr_warn("%pV", &vaf); | 1981 | pr_warn("%pV", &vaf); |
1982 | 1982 | ||
1983 | va_end(args); | 1983 | va_end(args); |
1984 | } | 1984 | } |
1985 | 1985 | ||
1986 | pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", | 1986 | pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", |
1987 | current->comm, order, gfp_mask); | 1987 | current->comm, order, gfp_mask); |
1988 | 1988 | ||
1989 | dump_stack(); | 1989 | dump_stack(); |
1990 | if (!should_suppress_show_mem()) | 1990 | if (!should_suppress_show_mem()) |
1991 | show_mem(filter); | 1991 | show_mem(filter); |
1992 | } | 1992 | } |
1993 | 1993 | ||
1994 | static inline int | 1994 | static inline int |
1995 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | 1995 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, |
1996 | unsigned long did_some_progress, | 1996 | unsigned long did_some_progress, |
1997 | unsigned long pages_reclaimed) | 1997 | unsigned long pages_reclaimed) |
1998 | { | 1998 | { |
1999 | /* Do not loop if specifically requested */ | 1999 | /* Do not loop if specifically requested */ |
2000 | if (gfp_mask & __GFP_NORETRY) | 2000 | if (gfp_mask & __GFP_NORETRY) |
2001 | return 0; | 2001 | return 0; |
2002 | 2002 | ||
2003 | /* Always retry if specifically requested */ | 2003 | /* Always retry if specifically requested */ |
2004 | if (gfp_mask & __GFP_NOFAIL) | 2004 | if (gfp_mask & __GFP_NOFAIL) |
2005 | return 1; | 2005 | return 1; |
2006 | 2006 | ||
2007 | /* | 2007 | /* |
2008 | * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim | 2008 | * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim |
2009 | * making forward progress without invoking OOM. Suspend also disables | 2009 | * making forward progress without invoking OOM. Suspend also disables |
2010 | * storage devices so kswapd will not help. Bail if we are suspending. | 2010 | * storage devices so kswapd will not help. Bail if we are suspending. |
2011 | */ | 2011 | */ |
2012 | if (!did_some_progress && pm_suspended_storage()) | 2012 | if (!did_some_progress && pm_suspended_storage()) |
2013 | return 0; | 2013 | return 0; |
2014 | 2014 | ||
2015 | /* | 2015 | /* |
2016 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | 2016 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER |
2017 | * means __GFP_NOFAIL, but that may not be true in other | 2017 | * means __GFP_NOFAIL, but that may not be true in other |
2018 | * implementations. | 2018 | * implementations. |
2019 | */ | 2019 | */ |
2020 | if (order <= PAGE_ALLOC_COSTLY_ORDER) | 2020 | if (order <= PAGE_ALLOC_COSTLY_ORDER) |
2021 | return 1; | 2021 | return 1; |
2022 | 2022 | ||
2023 | /* | 2023 | /* |
2024 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | 2024 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is |
2025 | * specified, then we retry until we no longer reclaim any pages | 2025 | * specified, then we retry until we no longer reclaim any pages |
2026 | * (above), or we've reclaimed an order of pages at least as | 2026 | * (above), or we've reclaimed an order of pages at least as |
2027 | * large as the allocation's order. In both cases, if the | 2027 | * large as the allocation's order. In both cases, if the |
2028 | * allocation still fails, we stop retrying. | 2028 | * allocation still fails, we stop retrying. |
2029 | */ | 2029 | */ |
2030 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | 2030 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) |
2031 | return 1; | 2031 | return 1; |
2032 | 2032 | ||
2033 | return 0; | 2033 | return 0; |
2034 | } | 2034 | } |
2035 | 2035 | ||
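
should_alloc_retry() reads as a small decision table: __GFP_NORETRY never loops, __GFP_NOFAIL always loops, orders up to PAGE_ALLOC_COSTLY_ORDER keep retrying, and larger orders retry only under __GFP_REPEAT until roughly 1 << order pages have been reclaimed. A worked restatement, assuming PAGE_ALLOC_COSTLY_ORDER is 3 and leaving out the pm_suspended_storage() bail-out.

	#include <stdbool.h>
	#include <stdio.h>

	#define COSTLY_ORDER 3	/* assumed value of PAGE_ALLOC_COSTLY_ORDER */

	/* Condensed restatement of the retry rules above, not the kernel function. */
	static bool retry(bool noretry, bool nofail, bool repeat,
			  unsigned int order, unsigned long pages_reclaimed)
	{
		if (noretry)
			return false;		/* __GFP_NORETRY: never loop */
		if (nofail)
			return true;		/* __GFP_NOFAIL: always loop */
		if (order <= COSTLY_ORDER)
			return true;		/* small orders are always retried */
		return repeat && pages_reclaimed < (1UL << order);
	}

	int main(void)
	{
		/* order-2 request: always worth another pass */
		printf("%d\n", retry(false, false, false, 2, 0));	/* 1 */
		/* order-4 request with __GFP_REPEAT: retry until ~16 pages reclaimed */
		printf("%d\n", retry(false, false, true, 4, 10));	/* 1 */
		printf("%d\n", retry(false, false, true, 4, 16));	/* 0 */
		/* order-4 request without __GFP_REPEAT: give up at once */
		printf("%d\n", retry(false, false, false, 4, 100));	/* 0 */
		return 0;
	}
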
2036 | static inline struct page * | 2036 | static inline struct page * |
2037 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2037 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2038 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2038 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2039 | nodemask_t *nodemask, struct zone *preferred_zone, | 2039 | nodemask_t *nodemask, struct zone *preferred_zone, |
2040 | int migratetype) | 2040 | int migratetype) |
2041 | { | 2041 | { |
2042 | struct page *page; | 2042 | struct page *page; |
2043 | 2043 | ||
2044 | /* Acquire the OOM killer lock for the zones in zonelist */ | 2044 | /* Acquire the OOM killer lock for the zones in zonelist */ |
2045 | if (!try_set_zonelist_oom(zonelist, gfp_mask)) { | 2045 | if (!try_set_zonelist_oom(zonelist, gfp_mask)) { |
2046 | schedule_timeout_uninterruptible(1); | 2046 | schedule_timeout_uninterruptible(1); |
2047 | return NULL; | 2047 | return NULL; |
2048 | } | 2048 | } |
2049 | 2049 | ||
2050 | /* | 2050 | /* |
2051 | * Go through the zonelist yet one more time, keep very high watermark | 2051 | * Go through the zonelist yet one more time, keep very high watermark |
2052 | * here, this is only to catch a parallel oom killing, we must fail if | 2052 | * here, this is only to catch a parallel oom killing, we must fail if |
2053 | * we're still under heavy pressure. | 2053 | * we're still under heavy pressure. |
2054 | */ | 2054 | */ |
2055 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2055 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, |
2056 | order, zonelist, high_zoneidx, | 2056 | order, zonelist, high_zoneidx, |
2057 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | 2057 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, |
2058 | preferred_zone, migratetype); | 2058 | preferred_zone, migratetype); |
2059 | if (page) | 2059 | if (page) |
2060 | goto out; | 2060 | goto out; |
2061 | 2061 | ||
2062 | if (!(gfp_mask & __GFP_NOFAIL)) { | 2062 | if (!(gfp_mask & __GFP_NOFAIL)) { |
2063 | /* The OOM killer will not help higher order allocs */ | 2063 | /* The OOM killer will not help higher order allocs */ |
2064 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 2064 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
2065 | goto out; | 2065 | goto out; |
2066 | /* The OOM killer does not needlessly kill tasks for lowmem */ | 2066 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
2067 | if (high_zoneidx < ZONE_NORMAL) | 2067 | if (high_zoneidx < ZONE_NORMAL) |
2068 | goto out; | 2068 | goto out; |
2069 | /* | 2069 | /* |
2070 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | 2070 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. |
2071 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | 2071 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. |
2072 | * The caller should handle page allocation failure by itself if | 2072 | * The caller should handle page allocation failure by itself if |
2073 | * it specifies __GFP_THISNODE. | 2073 | * it specifies __GFP_THISNODE. |
2074 | * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. | 2074 | * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. |
2075 | */ | 2075 | */ |
2076 | if (gfp_mask & __GFP_THISNODE) | 2076 | if (gfp_mask & __GFP_THISNODE) |
2077 | goto out; | 2077 | goto out; |
2078 | } | 2078 | } |
2079 | /* Exhausted what can be done so it's blamo time */ | 2079 | /* Exhausted what can be done so it's blamo time */ |
2080 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); | 2080 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); |
2081 | 2081 | ||
2082 | out: | 2082 | out: |
2083 | clear_zonelist_oom(zonelist, gfp_mask); | 2083 | clear_zonelist_oom(zonelist, gfp_mask); |
2084 | return page; | 2084 | return page; |
2085 | } | 2085 | } |
2086 | 2086 | ||
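
__alloc_pages_may_oom() has a fixed shape: take the per-zonelist OOM lock (or back off if another kill is in flight), re-check the freelists with a very high watermark in case a parallel kill already freed memory, skip the kill where the OOM killer cannot help, and only then invoke it. A hedged sketch of that shape; every helper below is a stand-in, not a kernel API.

	#include <stdbool.h>
	#include <stddef.h>

	static bool try_lock_oom(void)			{ return true; }
	static void unlock_oom(void)			{ }
	static void *retry_with_high_watermark(void)	{ return NULL; }
	static bool oom_cannot_help(void)		{ return false; }	/* costly order, lowmem, __GFP_THISNODE */
	static void kill_something(void)		{ }

	void *may_oom(void)
	{
		void *page = NULL;

		if (!try_lock_oom())
			return NULL;	/* another OOM kill is already in flight: back off */

		page = retry_with_high_watermark();	/* a parallel kill may have freed memory */
		if (!page && !oom_cannot_help())
			kill_something();		/* the last resort */

		unlock_oom();
		return page;	/* NULL tells the caller to go back and retry */
	}
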
2087 | #ifdef CONFIG_COMPACTION | 2087 | #ifdef CONFIG_COMPACTION |
2088 | /* Try memory compaction for high-order allocations before reclaim */ | 2088 | /* Try memory compaction for high-order allocations before reclaim */ |
2089 | static struct page * | 2089 | static struct page * |
2090 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2090 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2091 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2091 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2092 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2092 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2093 | int migratetype, bool sync_migration, | 2093 | int migratetype, bool sync_migration, |
2094 | bool *deferred_compaction, | 2094 | bool *deferred_compaction, |
2095 | unsigned long *did_some_progress) | 2095 | unsigned long *did_some_progress) |
2096 | { | 2096 | { |
2097 | struct page *page; | 2097 | struct page *page; |
2098 | 2098 | ||
2099 | if (!order) | 2099 | if (!order) |
2100 | return NULL; | 2100 | return NULL; |
2101 | 2101 | ||
2102 | if (compaction_deferred(preferred_zone, order)) { | 2102 | if (compaction_deferred(preferred_zone, order)) { |
2103 | *deferred_compaction = true; | 2103 | *deferred_compaction = true; |
2104 | return NULL; | 2104 | return NULL; |
2105 | } | 2105 | } |
2106 | 2106 | ||
2107 | current->flags |= PF_MEMALLOC; | 2107 | current->flags |= PF_MEMALLOC; |
2108 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2108 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2109 | nodemask, sync_migration); | 2109 | nodemask, sync_migration); |
2110 | current->flags &= ~PF_MEMALLOC; | 2110 | current->flags &= ~PF_MEMALLOC; |
2111 | if (*did_some_progress != COMPACT_SKIPPED) { | 2111 | if (*did_some_progress != COMPACT_SKIPPED) { |
2112 | 2112 | ||
2113 | /* Page migration frees to the PCP lists but we want merging */ | 2113 | /* Page migration frees to the PCP lists but we want merging */ |
2114 | drain_pages(get_cpu()); | 2114 | drain_pages(get_cpu()); |
2115 | put_cpu(); | 2115 | put_cpu(); |
2116 | 2116 | ||
2117 | page = get_page_from_freelist(gfp_mask, nodemask, | 2117 | page = get_page_from_freelist(gfp_mask, nodemask, |
2118 | order, zonelist, high_zoneidx, | 2118 | order, zonelist, high_zoneidx, |
2119 | alloc_flags, preferred_zone, | 2119 | alloc_flags, preferred_zone, |
2120 | migratetype); | 2120 | migratetype); |
2121 | if (page) { | 2121 | if (page) { |
2122 | preferred_zone->compact_considered = 0; | 2122 | preferred_zone->compact_considered = 0; |
2123 | preferred_zone->compact_defer_shift = 0; | 2123 | preferred_zone->compact_defer_shift = 0; |
2124 | if (order >= preferred_zone->compact_order_failed) | 2124 | if (order >= preferred_zone->compact_order_failed) |
2125 | preferred_zone->compact_order_failed = order + 1; | 2125 | preferred_zone->compact_order_failed = order + 1; |
2126 | count_vm_event(COMPACTSUCCESS); | 2126 | count_vm_event(COMPACTSUCCESS); |
2127 | return page; | 2127 | return page; |
2128 | } | 2128 | } |
2129 | 2129 | ||
2130 | /* | 2130 | /* |
2131 | * It's bad if a compaction run occurs and fails. | 2131 | * It's bad if a compaction run occurs and fails. |
2132 | * The most likely reason is that pages exist, | 2132 | * The most likely reason is that pages exist, |
2133 | * but not enough to satisfy watermarks. | 2133 | * but not enough to satisfy watermarks. |
2134 | */ | 2134 | */ |
2135 | count_vm_event(COMPACTFAIL); | 2135 | count_vm_event(COMPACTFAIL); |
2136 | 2136 | ||
2137 | /* | 2137 | /* |
2138 | * As async compaction considers a subset of pageblocks, only | 2138 | * As async compaction considers a subset of pageblocks, only |
2139 | * defer if the failure was a sync compaction failure. | 2139 | * defer if the failure was a sync compaction failure. |
2140 | */ | 2140 | */ |
2141 | if (sync_migration) | 2141 | if (sync_migration) |
2142 | defer_compaction(preferred_zone, order); | 2142 | defer_compaction(preferred_zone, order); |
2143 | 2143 | ||
2144 | cond_resched(); | 2144 | cond_resched(); |
2145 | } | 2145 | } |
2146 | 2146 | ||
2147 | return NULL; | 2147 | return NULL; |
2148 | } | 2148 | } |
2149 | #else | 2149 | #else |
2150 | static inline struct page * | 2150 | static inline struct page * |
2151 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2151 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2152 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2152 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2153 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2153 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2154 | int migratetype, bool sync_migration, | 2154 | int migratetype, bool sync_migration, |
2155 | bool *deferred_compaction, | 2155 | bool *deferred_compaction, |
2156 | unsigned long *did_some_progress) | 2156 | unsigned long *did_some_progress) |
2157 | { | 2157 | { |
2158 | return NULL; | 2158 | return NULL; |
2159 | } | 2159 | } |
2160 | #endif /* CONFIG_COMPACTION */ | 2160 | #endif /* CONFIG_COMPACTION */ |
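
Much of the compaction wrapper above is bookkeeping for the deferral heuristic: success resets the zone's defer counters (and raises compact_order_failed past this order), while a failed synchronous run calls defer_compaction() so the next attempts are skipped for a while. A small sketch of an exponential-backoff defer counter in that spirit; the fields and cap are illustrative, and the real logic in compaction_deferred()/defer_compaction() differs in detail.

	#include <stdbool.h>
	#include <stdio.h>

	#define DEFER_SHIFT_MAX 6	/* cap the backoff (the kernel caps it similarly) */

	struct zone_defer {
		unsigned int considered;	/* attempts skipped since the last failure     */
		unsigned int defer_shift;	/* backoff exponent: skip about 1<<shift tries */
	};

	static void record_failure(struct zone_defer *z)
	{
		z->considered = 0;
		if (z->defer_shift < DEFER_SHIFT_MAX)
			z->defer_shift++;	/* fail again -> wait twice as long */
	}

	static void record_success(struct zone_defer *z)
	{
		z->considered = 0;
		z->defer_shift = 0;		/* compaction worked: stop deferring */
	}

	static bool deferred(struct zone_defer *z)
	{
		if (++z->considered >= (1u << z->defer_shift))
			return false;		/* waited long enough, try compacting again */
		return true;			/* still inside the backoff window */
	}

	int main(void)
	{
		struct zone_defer z = { 0, 0 };

		record_failure(&z);		/* a synchronous run failed */
		printf("%d\n", deferred(&z));	/* 1: this attempt is skipped */
		printf("%d\n", deferred(&z));	/* 0: backoff elapsed, try again */
		record_success(&z);
		printf("%d\n", deferred(&z));	/* 0: no deferral after a success */
		return 0;
	}
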
2161 | 2161 | ||
2162 | /* Perform direct synchronous page reclaim */ | 2162 | /* Perform direct synchronous page reclaim */ |
2163 | static int | 2163 | static int |
2164 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | 2164 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, |
2165 | nodemask_t *nodemask) | 2165 | nodemask_t *nodemask) |
2166 | { | 2166 | { |
2167 | struct reclaim_state reclaim_state; | 2167 | struct reclaim_state reclaim_state; |
2168 | int progress; | 2168 | int progress; |
2169 | 2169 | ||
2170 | cond_resched(); | 2170 | cond_resched(); |
2171 | 2171 | ||
2172 | /* We now go into synchronous reclaim */ | 2172 | /* We now go into synchronous reclaim */ |
2173 | cpuset_memory_pressure_bump(); | 2173 | cpuset_memory_pressure_bump(); |
2174 | current->flags |= PF_MEMALLOC; | 2174 | current->flags |= PF_MEMALLOC; |
2175 | lockdep_set_current_reclaim_state(gfp_mask); | 2175 | lockdep_set_current_reclaim_state(gfp_mask); |
2176 | reclaim_state.reclaimed_slab = 0; | 2176 | reclaim_state.reclaimed_slab = 0; |
2177 | current->reclaim_state = &reclaim_state; | 2177 | current->reclaim_state = &reclaim_state; |
2178 | 2178 | ||
2179 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2179 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); |
2180 | 2180 | ||
2181 | current->reclaim_state = NULL; | 2181 | current->reclaim_state = NULL; |
2182 | lockdep_clear_current_reclaim_state(); | 2182 | lockdep_clear_current_reclaim_state(); |
2183 | current->flags &= ~PF_MEMALLOC; | 2183 | current->flags &= ~PF_MEMALLOC; |
2184 | 2184 | ||
2185 | cond_resched(); | 2185 | cond_resched(); |
2186 | 2186 | ||
2187 | return progress; | 2187 | return progress; |
2188 | } | 2188 | } |
2189 | 2189 | ||
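
The PF_MEMALLOC bracket in __perform_reclaim() is the recursion guard: while the flag is set, any allocation the reclaimer itself makes is granted reserve access by gfp_to_alloc_flags() and, further down in __alloc_pages_slowpath(), is refused another round of direct reclaim. A minimal user-space sketch of that interplay; the flag value and helpers are stand-ins.

	#include <stdbool.h>
	#include <stdio.h>

	#define PF_MEMALLOC 0x00000800u	/* illustrative bit value */

	static unsigned int task_flags;	/* stand-in for current->flags */

	/* Stand-in for the slowpath's "Avoid recursion of direct reclaim" check. */
	static bool may_enter_direct_reclaim(void)
	{
		return !(task_flags & PF_MEMALLOC);
	}

	static void nested_allocation(void)
	{
		/* An allocation made while reclaiming must not reclaim again. */
		printf("nested allocation may reclaim: %d\n", may_enter_direct_reclaim());	/* 0 */
	}

	static void perform_reclaim(void)
	{
		task_flags |= PF_MEMALLOC;	/* this task is now the reclaimer */
		nested_allocation();
		task_flags &= ~PF_MEMALLOC;
	}

	int main(void)
	{
		perform_reclaim();
		printf("ordinary allocation may reclaim: %d\n", may_enter_direct_reclaim());	/* 1 */
		return 0;
	}
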
2190 | /* The really slow allocator path where we enter direct reclaim */ | 2190 | /* The really slow allocator path where we enter direct reclaim */ |
2191 | static inline struct page * | 2191 | static inline struct page * |
2192 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2192 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
2193 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2193 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2194 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2194 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2195 | int migratetype, unsigned long *did_some_progress) | 2195 | int migratetype, unsigned long *did_some_progress) |
2196 | { | 2196 | { |
2197 | struct page *page = NULL; | 2197 | struct page *page = NULL; |
2198 | bool drained = false; | 2198 | bool drained = false; |
2199 | 2199 | ||
2200 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | 2200 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, |
2201 | nodemask); | 2201 | nodemask); |
2202 | if (unlikely(!(*did_some_progress))) | 2202 | if (unlikely(!(*did_some_progress))) |
2203 | return NULL; | 2203 | return NULL; |
2204 | 2204 | ||
2205 | /* After successful reclaim, reconsider all zones for allocation */ | 2205 | /* After successful reclaim, reconsider all zones for allocation */ |
2206 | if (NUMA_BUILD) | 2206 | if (NUMA_BUILD) |
2207 | zlc_clear_zones_full(zonelist); | 2207 | zlc_clear_zones_full(zonelist); |
2208 | 2208 | ||
2209 | retry: | 2209 | retry: |
2210 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2210 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2211 | zonelist, high_zoneidx, | 2211 | zonelist, high_zoneidx, |
2212 | alloc_flags, preferred_zone, | 2212 | alloc_flags, preferred_zone, |
2213 | migratetype); | 2213 | migratetype); |
2214 | 2214 | ||
2215 | /* | 2215 | /* |
2216 | * If an allocation failed after direct reclaim, it could be because | 2216 | * If an allocation failed after direct reclaim, it could be because |
2217 | * pages are pinned on the per-cpu lists. Drain them and try again | 2217 | * pages are pinned on the per-cpu lists. Drain them and try again |
2218 | */ | 2218 | */ |
2219 | if (!page && !drained) { | 2219 | if (!page && !drained) { |
2220 | drain_all_pages(); | 2220 | drain_all_pages(); |
2221 | drained = true; | 2221 | drained = true; |
2222 | goto retry; | 2222 | goto retry; |
2223 | } | 2223 | } |
2224 | 2224 | ||
2225 | return page; | 2225 | return page; |
2226 | } | 2226 | } |
2227 | 2227 | ||
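
Stripped of the allocator parameters, the retry label in __alloc_pages_direct_reclaim() above is a "try, drain, try exactly once more" pattern: pages freed by reclaim may still sit on other CPUs' per-cpu lists, so one failure is answered with drain_all_pages() and a single extra scan. The bare control flow, with stand-ins for the real calls:

	#include <stdbool.h>
	#include <stddef.h>

	static void *try_alloc(void) { return NULL; }	/* stand-in for get_page_from_freelist() */
	static void drain_all(void)  { }		/* stand-in for drain_all_pages() */

	void *alloc_after_reclaim(void)
	{
		bool drained = false;
		void *page;

	retry:
		page = try_alloc();
		if (!page && !drained) {
			drain_all();	/* pull stranded pages off the remote per-cpu lists */
			drained = true;
			goto retry;	/* exactly one extra attempt */
		}
		return page;
	}
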
2228 | /* | 2228 | /* |
2229 | * This is called in the allocator slow-path if the allocation request is of | 2229 | * This is called in the allocator slow-path if the allocation request is of |
2230 | * sufficient urgency to ignore watermarks and take other desperate measures | 2230 | * sufficient urgency to ignore watermarks and take other desperate measures |
2231 | */ | 2231 | */ |
2232 | static inline struct page * | 2232 | static inline struct page * |
2233 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2233 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
2234 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2234 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2235 | nodemask_t *nodemask, struct zone *preferred_zone, | 2235 | nodemask_t *nodemask, struct zone *preferred_zone, |
2236 | int migratetype) | 2236 | int migratetype) |
2237 | { | 2237 | { |
2238 | struct page *page; | 2238 | struct page *page; |
2239 | 2239 | ||
2240 | do { | 2240 | do { |
2241 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2241 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2242 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2242 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, |
2243 | preferred_zone, migratetype); | 2243 | preferred_zone, migratetype); |
2244 | 2244 | ||
2245 | if (!page && gfp_mask & __GFP_NOFAIL) | 2245 | if (!page && gfp_mask & __GFP_NOFAIL) |
2246 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2246 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
2247 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 2247 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
2248 | 2248 | ||
2249 | return page; | 2249 | return page; |
2250 | } | 2250 | } |
2251 | 2251 | ||
2252 | static inline | 2252 | static inline |
2253 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | 2253 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, |
2254 | enum zone_type high_zoneidx, | 2254 | enum zone_type high_zoneidx, |
2255 | enum zone_type classzone_idx) | 2255 | enum zone_type classzone_idx) |
2256 | { | 2256 | { |
2257 | struct zoneref *z; | 2257 | struct zoneref *z; |
2258 | struct zone *zone; | 2258 | struct zone *zone; |
2259 | 2259 | ||
2260 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 2260 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
2261 | wakeup_kswapd(zone, order, classzone_idx); | 2261 | wakeup_kswapd(zone, order, classzone_idx); |
2262 | } | 2262 | } |
2263 | 2263 | ||
2264 | static inline int | 2264 | static inline int |
2265 | gfp_to_alloc_flags(gfp_t gfp_mask) | 2265 | gfp_to_alloc_flags(gfp_t gfp_mask) |
2266 | { | 2266 | { |
2267 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | 2267 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; |
2268 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2268 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2269 | 2269 | ||
2270 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ | 2270 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
2271 | BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); | 2271 | BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); |
2272 | 2272 | ||
2273 | /* | 2273 | /* |
2274 | * The caller may dip into page reserves a bit more if the caller | 2274 | * The caller may dip into page reserves a bit more if the caller |
2275 | * cannot run direct reclaim, or if the caller has realtime scheduling | 2275 | * cannot run direct reclaim, or if the caller has realtime scheduling |
2276 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | 2276 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will |
2277 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | 2277 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). |
2278 | */ | 2278 | */ |
2279 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); | 2279 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); |
2280 | 2280 | ||
2281 | if (!wait) { | 2281 | if (!wait) { |
2282 | /* | 2282 | /* |
2283 | * Not worth trying to allocate harder for | 2283 | * Not worth trying to allocate harder for |
2284 | * __GFP_NOMEMALLOC even if it can't schedule. | 2284 | * __GFP_NOMEMALLOC even if it can't schedule. |
2285 | */ | 2285 | */ |
2286 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | 2286 | if (!(gfp_mask & __GFP_NOMEMALLOC)) |
2287 | alloc_flags |= ALLOC_HARDER; | 2287 | alloc_flags |= ALLOC_HARDER; |
2288 | /* | 2288 | /* |
2289 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 2289 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
2290 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 2290 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
2291 | */ | 2291 | */ |
2292 | alloc_flags &= ~ALLOC_CPUSET; | 2292 | alloc_flags &= ~ALLOC_CPUSET; |
2293 | } else if (unlikely(rt_task(current)) && !in_interrupt()) | 2293 | } else if (unlikely(rt_task(current)) && !in_interrupt()) |
2294 | alloc_flags |= ALLOC_HARDER; | 2294 | alloc_flags |= ALLOC_HARDER; |
2295 | 2295 | ||
2296 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 2296 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
2297 | if (gfp_mask & __GFP_MEMALLOC) | 2297 | if (gfp_mask & __GFP_MEMALLOC) |
2298 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2298 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2299 | else if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt()) | 2299 | else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) |
2300 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
2301 | else if (!in_interrupt() && | ||
2302 | ((current->flags & PF_MEMALLOC) || | ||
2303 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
2300 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2304 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2301 | } | 2305 | } |
2302 | 2306 | ||
2303 | return alloc_flags; | 2307 | return alloc_flags; |
2304 | } | 2308 | } |
2305 | 2309 | ||
2306 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | 2310 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) |
2307 | { | 2311 | { |
2308 | return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); | 2312 | return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); |
2309 | } | 2313 | } |
2310 | 2314 | ||
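
The rewritten branch in gfp_to_alloc_flags() is the one place where interrupt context can now reach ALLOC_NO_WATERMARKS: if a softirq is being served and PF_MEMALLOC is set in current->flags (in softirq context those flags belong to whichever task was interrupted), the exemption applies even though in_interrupt() is true, while the task-flag path below it stays gated on !in_interrupt(), so a hardirq never gets it. gfp_pfmemalloc_allowed() then reduces to "would this mask get ALLOC_NO_WATERMARKS here?". A user-space restatement of the mapping with the softirq case included; the bit values, struct and helper are illustrative, and the rt_task, cpuset and watermark handling is omitted.

	#include <stdbool.h>
	#include <stdio.h>

	/* Illustrative flag bits, not the kernel's values. */
	#define GFP_WAIT	0x1u
	#define GFP_HIGH	0x2u
	#define GFP_MEMALLOC	0x4u
	#define GFP_NOMEMALLOC	0x8u

	#define ALLOC_HARDER		0x10u
	#define ALLOC_HIGH		0x20u
	#define ALLOC_NO_WATERMARKS	0x40u

	struct ctx {
		bool in_irq;		/* any interrupt context               */
		bool serving_softirq;	/* currently running a softirq handler */
		bool task_memalloc;	/* PF_MEMALLOC set in current->flags   */
		bool task_memdie;	/* TIF_MEMDIE set                      */
	};

	/* Condensed restatement of gfp_to_alloc_flags(); not the kernel function. */
	static unsigned int to_alloc_flags(unsigned int gfp, const struct ctx *c)
	{
		unsigned int flags = 0;

		if (gfp & GFP_HIGH)
			flags |= ALLOC_HIGH;
		if (!(gfp & GFP_WAIT) && !(gfp & GFP_NOMEMALLOC))
			flags |= ALLOC_HARDER;

		if (!(gfp & GFP_NOMEMALLOC)) {
			if (gfp & GFP_MEMALLOC)
				flags |= ALLOC_NO_WATERMARKS;
			else if (c->serving_softirq && c->task_memalloc)
				flags |= ALLOC_NO_WATERMARKS;	/* the branch this diff adds */
			else if (!c->in_irq && (c->task_memalloc || c->task_memdie))
				flags |= ALLOC_NO_WATERMARKS;
		}
		return flags;
	}

	int main(void)
	{
		struct ctx softirq = { .in_irq = true, .serving_softirq = true, .task_memalloc = true };
		struct ctx hardirq = { .in_irq = true, .serving_softirq = false, .task_memalloc = true };

		printf("softirq + PF_MEMALLOC ignores watermarks: %d\n",
		       !!(to_alloc_flags(0, &softirq) & ALLOC_NO_WATERMARKS));	/* 1 */
		printf("hardirq + PF_MEMALLOC ignores watermarks: %d\n",
		       !!(to_alloc_flags(0, &hardirq) & ALLOC_NO_WATERMARKS));	/* 0 */
		return 0;
	}
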
2311 | static inline struct page * | 2315 | static inline struct page * |
2312 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2316 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2313 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2317 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2314 | nodemask_t *nodemask, struct zone *preferred_zone, | 2318 | nodemask_t *nodemask, struct zone *preferred_zone, |
2315 | int migratetype) | 2319 | int migratetype) |
2316 | { | 2320 | { |
2317 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2321 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2318 | struct page *page = NULL; | 2322 | struct page *page = NULL; |
2319 | int alloc_flags; | 2323 | int alloc_flags; |
2320 | unsigned long pages_reclaimed = 0; | 2324 | unsigned long pages_reclaimed = 0; |
2321 | unsigned long did_some_progress; | 2325 | unsigned long did_some_progress; |
2322 | bool sync_migration = false; | 2326 | bool sync_migration = false; |
2323 | bool deferred_compaction = false; | 2327 | bool deferred_compaction = false; |
2324 | 2328 | ||
2325 | /* | 2329 | /* |
2326 | * In the slowpath, we sanity check order to avoid ever trying to | 2330 | * In the slowpath, we sanity check order to avoid ever trying to |
2327 | * reclaim >= MAX_ORDER areas which will never succeed. Callers may | 2331 | * reclaim >= MAX_ORDER areas which will never succeed. Callers may |
2328 | * be using allocators in order of preference for an area that is | 2332 | * be using allocators in order of preference for an area that is |
2329 | * too large. | 2333 | * too large. |
2330 | */ | 2334 | */ |
2331 | if (order >= MAX_ORDER) { | 2335 | if (order >= MAX_ORDER) { |
2332 | WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); | 2336 | WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); |
2333 | return NULL; | 2337 | return NULL; |
2334 | } | 2338 | } |
2335 | 2339 | ||
2336 | /* | 2340 | /* |
2337 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 2341 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
2338 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | 2342 | * __GFP_NOWARN set) should not cause reclaim since the subsystem |
2339 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim | 2343 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim |
2340 | * using a larger set of nodes after it has established that the | 2344 | * using a larger set of nodes after it has established that the |
2341 | * allowed per node queues are empty and that nodes are | 2345 | * allowed per node queues are empty and that nodes are |
2342 | * over allocated. | 2346 | * over allocated. |
2343 | */ | 2347 | */ |
2344 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 2348 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
2345 | goto nopage; | 2349 | goto nopage; |
2346 | 2350 | ||
2347 | restart: | 2351 | restart: |
2348 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2352 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2349 | wake_all_kswapd(order, zonelist, high_zoneidx, | 2353 | wake_all_kswapd(order, zonelist, high_zoneidx, |
2350 | zone_idx(preferred_zone)); | 2354 | zone_idx(preferred_zone)); |
2351 | 2355 | ||
2352 | /* | 2356 | /* |
2353 | * OK, we're below the kswapd watermark and have kicked background | 2357 | * OK, we're below the kswapd watermark and have kicked background |
2354 | * reclaim. Now things get more complex, so set up alloc_flags according | 2358 | * reclaim. Now things get more complex, so set up alloc_flags according |
2355 | * to how we want to proceed. | 2359 | * to how we want to proceed. |
2356 | */ | 2360 | */ |
2357 | alloc_flags = gfp_to_alloc_flags(gfp_mask); | 2361 | alloc_flags = gfp_to_alloc_flags(gfp_mask); |
2358 | 2362 | ||
2359 | /* | 2363 | /* |
2360 | * Find the true preferred zone if the allocation is unconstrained by | 2364 | * Find the true preferred zone if the allocation is unconstrained by |
2361 | * cpusets. | 2365 | * cpusets. |
2362 | */ | 2366 | */ |
2363 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) | 2367 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) |
2364 | first_zones_zonelist(zonelist, high_zoneidx, NULL, | 2368 | first_zones_zonelist(zonelist, high_zoneidx, NULL, |
2365 | &preferred_zone); | 2369 | &preferred_zone); |
2366 | 2370 | ||
2367 | rebalance: | 2371 | rebalance: |
2368 | /* This is the last chance, in general, before the goto nopage. */ | 2372 | /* This is the last chance, in general, before the goto nopage. */ |
2369 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2373 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
2370 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2374 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
2371 | preferred_zone, migratetype); | 2375 | preferred_zone, migratetype); |
2372 | if (page) | 2376 | if (page) |
2373 | goto got_pg; | 2377 | goto got_pg; |
2374 | 2378 | ||
2375 | /* Allocate without watermarks if the context allows */ | 2379 | /* Allocate without watermarks if the context allows */ |
2376 | if (alloc_flags & ALLOC_NO_WATERMARKS) { | 2380 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
2377 | page = __alloc_pages_high_priority(gfp_mask, order, | 2381 | page = __alloc_pages_high_priority(gfp_mask, order, |
2378 | zonelist, high_zoneidx, nodemask, | 2382 | zonelist, high_zoneidx, nodemask, |
2379 | preferred_zone, migratetype); | 2383 | preferred_zone, migratetype); |
2380 | if (page) | 2384 | if (page) |
2381 | goto got_pg; | 2385 | goto got_pg; |
2382 | } | 2386 | } |
2383 | 2387 | ||
2384 | /* Atomic allocations - we can't balance anything */ | 2388 | /* Atomic allocations - we can't balance anything */ |
2385 | if (!wait) | 2389 | if (!wait) |
2386 | goto nopage; | 2390 | goto nopage; |
2387 | 2391 | ||
2388 | /* Avoid recursion of direct reclaim */ | 2392 | /* Avoid recursion of direct reclaim */ |
2389 | if (current->flags & PF_MEMALLOC) | 2393 | if (current->flags & PF_MEMALLOC) |
2390 | goto nopage; | 2394 | goto nopage; |
2391 | 2395 | ||
2392 | /* Avoid allocations with no watermarks from looping endlessly */ | 2396 | /* Avoid allocations with no watermarks from looping endlessly */ |
2393 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | 2397 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) |
2394 | goto nopage; | 2398 | goto nopage; |
2395 | 2399 | ||
2396 | /* | 2400 | /* |
2397 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2401 | * Try direct compaction. The first pass is asynchronous. Subsequent |
2398 | * attempts after direct reclaim are synchronous | 2402 | * attempts after direct reclaim are synchronous |
2399 | */ | 2403 | */ |
2400 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2404 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2401 | zonelist, high_zoneidx, | 2405 | zonelist, high_zoneidx, |
2402 | nodemask, | 2406 | nodemask, |
2403 | alloc_flags, preferred_zone, | 2407 | alloc_flags, preferred_zone, |
2404 | migratetype, sync_migration, | 2408 | migratetype, sync_migration, |
2405 | &deferred_compaction, | 2409 | &deferred_compaction, |
2406 | &did_some_progress); | 2410 | &did_some_progress); |
2407 | if (page) | 2411 | if (page) |
2408 | goto got_pg; | 2412 | goto got_pg; |
2409 | sync_migration = true; | 2413 | sync_migration = true; |
2410 | 2414 | ||
2411 | /* | 2415 | /* |
2412 | * If compaction is deferred for high-order allocations, it is because | 2416 | * If compaction is deferred for high-order allocations, it is because |
2413 | * sync compaction recently failed. If this is the case and the caller | 2417 | * sync compaction recently failed. If this is the case and the caller |
2414 | * has requested the system not be heavily disrupted, fail the | 2418 | * has requested the system not be heavily disrupted, fail the |
2415 | * allocation now instead of entering direct reclaim | 2419 | * allocation now instead of entering direct reclaim |
2416 | */ | 2420 | */ |
2417 | if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) | 2421 | if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) |
2418 | goto nopage; | 2422 | goto nopage; |
2419 | 2423 | ||
2420 | /* Try direct reclaim and then allocating */ | 2424 | /* Try direct reclaim and then allocating */ |
2421 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2425 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
2422 | zonelist, high_zoneidx, | 2426 | zonelist, high_zoneidx, |
2423 | nodemask, | 2427 | nodemask, |
2424 | alloc_flags, preferred_zone, | 2428 | alloc_flags, preferred_zone, |
2425 | migratetype, &did_some_progress); | 2429 | migratetype, &did_some_progress); |
2426 | if (page) | 2430 | if (page) |
2427 | goto got_pg; | 2431 | goto got_pg; |
2428 | 2432 | ||
2429 | /* | 2433 | /* |
2430 | * If we failed to make any progress reclaiming, then we are | 2434 | * If we failed to make any progress reclaiming, then we are |
2431 | * running out of options and have to consider going OOM | 2435 | * running out of options and have to consider going OOM |
2432 | */ | 2436 | */ |
2433 | if (!did_some_progress) { | 2437 | if (!did_some_progress) { |
2434 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 2438 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
2435 | if (oom_killer_disabled) | 2439 | if (oom_killer_disabled) |
2436 | goto nopage; | 2440 | goto nopage; |
2437 | /* Coredumps can quickly deplete all memory reserves */ | 2441 | /* Coredumps can quickly deplete all memory reserves */ |
2438 | if ((current->flags & PF_DUMPCORE) && | 2442 | if ((current->flags & PF_DUMPCORE) && |
2439 | !(gfp_mask & __GFP_NOFAIL)) | 2443 | !(gfp_mask & __GFP_NOFAIL)) |
2440 | goto nopage; | 2444 | goto nopage; |
2441 | page = __alloc_pages_may_oom(gfp_mask, order, | 2445 | page = __alloc_pages_may_oom(gfp_mask, order, |
2442 | zonelist, high_zoneidx, | 2446 | zonelist, high_zoneidx, |
2443 | nodemask, preferred_zone, | 2447 | nodemask, preferred_zone, |
2444 | migratetype); | 2448 | migratetype); |
2445 | if (page) | 2449 | if (page) |
2446 | goto got_pg; | 2450 | goto got_pg; |
2447 | 2451 | ||
2448 | if (!(gfp_mask & __GFP_NOFAIL)) { | 2452 | if (!(gfp_mask & __GFP_NOFAIL)) { |
2449 | /* | 2453 | /* |
2450 | * The oom killer is not called for high-order | 2454 | * The oom killer is not called for high-order |
2451 | * allocations that may fail, so if no progress | 2455 | * allocations that may fail, so if no progress |
2452 | * is being made, there are no other options and | 2456 | * is being made, there are no other options and |
2453 | * retrying is unlikely to help. | 2457 | * retrying is unlikely to help. |
2454 | */ | 2458 | */ |
2455 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 2459 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
2456 | goto nopage; | 2460 | goto nopage; |
2457 | /* | 2461 | /* |
2458 | * The oom killer is not called for lowmem | 2462 | * The oom killer is not called for lowmem |
2459 | * allocations to prevent needlessly killing | 2463 | * allocations to prevent needlessly killing |
2460 | * innocent tasks. | 2464 | * innocent tasks. |
2461 | */ | 2465 | */ |
2462 | if (high_zoneidx < ZONE_NORMAL) | 2466 | if (high_zoneidx < ZONE_NORMAL) |
2463 | goto nopage; | 2467 | goto nopage; |
2464 | } | 2468 | } |
2465 | 2469 | ||
2466 | goto restart; | 2470 | goto restart; |
2467 | } | 2471 | } |
2468 | } | 2472 | } |
2469 | 2473 | ||
2470 | /* Check if we should retry the allocation */ | 2474 | /* Check if we should retry the allocation */ |
2471 | pages_reclaimed += did_some_progress; | 2475 | pages_reclaimed += did_some_progress; |
2472 | if (should_alloc_retry(gfp_mask, order, did_some_progress, | 2476 | if (should_alloc_retry(gfp_mask, order, did_some_progress, |
2473 | pages_reclaimed)) { | 2477 | pages_reclaimed)) { |
2474 | /* Wait for some write requests to complete then retry */ | 2478 | /* Wait for some write requests to complete then retry */ |
2475 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2479 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
2476 | goto rebalance; | 2480 | goto rebalance; |
2477 | } else { | 2481 | } else { |
2478 | /* | 2482 | /* |
2479 | * High-order allocations do not necessarily loop after | 2483 | * High-order allocations do not necessarily loop after |
2480 | * direct reclaim and reclaim/compaction depends on compaction | 2484 | * direct reclaim and reclaim/compaction depends on compaction |
2481 | * being called after reclaim so call directly if necessary | 2485 | * being called after reclaim so call directly if necessary |
2482 | */ | 2486 | */ |
2483 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2487 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2484 | zonelist, high_zoneidx, | 2488 | zonelist, high_zoneidx, |
2485 | nodemask, | 2489 | nodemask, |
2486 | alloc_flags, preferred_zone, | 2490 | alloc_flags, preferred_zone, |
2487 | migratetype, sync_migration, | 2491 | migratetype, sync_migration, |
2488 | &deferred_compaction, | 2492 | &deferred_compaction, |
2489 | &did_some_progress); | 2493 | &did_some_progress); |
2490 | if (page) | 2494 | if (page) |
2491 | goto got_pg; | 2495 | goto got_pg; |
2492 | } | 2496 | } |
2493 | 2497 | ||
2494 | nopage: | 2498 | nopage: |
2495 | warn_alloc_failed(gfp_mask, order, NULL); | 2499 | warn_alloc_failed(gfp_mask, order, NULL); |
2496 | return page; | 2500 | return page; |
2497 | got_pg: | 2501 | got_pg: |
2498 | /* | 2502 | /* |
2499 | * page->pfmemalloc is set when the caller had PFMEMALLOC set, has | 2503 | * page->pfmemalloc is set when the caller had PFMEMALLOC set, has |
2500 | * been OOM killed or specified __GFP_MEMALLOC. The expectation is | 2504 | * been OOM killed or specified __GFP_MEMALLOC. The expectation is |
2501 | * that the caller is taking steps that will free more memory. The | 2505 | * that the caller is taking steps that will free more memory. The |
2502 | * caller should avoid the page being used for !PFMEMALLOC purposes. | 2506 | * caller should avoid the page being used for !PFMEMALLOC purposes. |
2503 | */ | 2507 | */ |
2504 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | 2508 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); |
2505 | 2509 | ||
2506 | if (kmemcheck_enabled) | 2510 | if (kmemcheck_enabled) |
2507 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | 2511 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); |
2508 | 2512 | ||
2509 | return page; | 2513 | return page; |
2510 | } | 2514 | } |
2511 | 2515 | ||
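
The page->pfmemalloc bit set at got_pg is advisory: it records that the page was allocated without watermarks, and the consumer is expected to keep such pages for memory-freeing work rather than ordinary use. An illustration of the consumer-side check; the struct and predicate are made up for the example and are not a kernel API.

	#include <stdbool.h>
	#include <stdio.h>

	struct fake_page {
		bool pfmemalloc;	/* set when the allocation dipped into the reserves */
	};

	/* A consumer must not hoard reserve pages for ordinary work. */
	static bool may_use(const struct fake_page *page, bool caller_is_freeing_memory)
	{
		if (!page->pfmemalloc)
			return true;			/* normal page: use it freely */
		return caller_is_freeing_memory;	/* reserve page: memory-freeing paths only */
	}

	int main(void)
	{
		struct fake_page reserve_page = { .pfmemalloc = true };

		printf("%d\n", may_use(&reserve_page, false));	/* 0: refuse or drop */
		printf("%d\n", may_use(&reserve_page, true));	/* 1: it is helping to free memory */
		return 0;
	}
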
2512 | /* | 2516 | /* |
2513 | * This is the 'heart' of the zoned buddy allocator. | 2517 | * This is the 'heart' of the zoned buddy allocator. |
2514 | */ | 2518 | */ |
2515 | struct page * | 2519 | struct page * |
2516 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | 2520 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, |
2517 | struct zonelist *zonelist, nodemask_t *nodemask) | 2521 | struct zonelist *zonelist, nodemask_t *nodemask) |
2518 | { | 2522 | { |
2519 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2523 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2520 | struct zone *preferred_zone; | 2524 | struct zone *preferred_zone; |
2521 | struct page *page = NULL; | 2525 | struct page *page = NULL; |
2522 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2526 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2523 | unsigned int cpuset_mems_cookie; | 2527 | unsigned int cpuset_mems_cookie; |
2524 | 2528 | ||
2525 | gfp_mask &= gfp_allowed_mask; | 2529 | gfp_mask &= gfp_allowed_mask; |
2526 | 2530 | ||
2527 | lockdep_trace_alloc(gfp_mask); | 2531 | lockdep_trace_alloc(gfp_mask); |
2528 | 2532 | ||
2529 | might_sleep_if(gfp_mask & __GFP_WAIT); | 2533 | might_sleep_if(gfp_mask & __GFP_WAIT); |
2530 | 2534 | ||
2531 | if (should_fail_alloc_page(gfp_mask, order)) | 2535 | if (should_fail_alloc_page(gfp_mask, order)) |
2532 | return NULL; | 2536 | return NULL; |
2533 | 2537 | ||
2534 | /* | 2538 | /* |
2538 | * Check that the zones suitable for the gfp_mask contain at least one | 2542 | * Check that the zones suitable for the gfp_mask contain at least one |
2536 | * valid zone. It's possible to have an empty zonelist as a result | 2540 | * valid zone. It's possible to have an empty zonelist as a result |
2537 | * of GFP_THISNODE and a memoryless node | 2541 | * of GFP_THISNODE and a memoryless node |
2538 | */ | 2542 | */ |
2539 | if (unlikely(!zonelist->_zonerefs->zone)) | 2543 | if (unlikely(!zonelist->_zonerefs->zone)) |
2540 | return NULL; | 2544 | return NULL; |
2541 | 2545 | ||
2542 | retry_cpuset: | 2546 | retry_cpuset: |
2543 | cpuset_mems_cookie = get_mems_allowed(); | 2547 | cpuset_mems_cookie = get_mems_allowed(); |
2544 | 2548 | ||
2545 | /* The preferred zone is used for statistics later */ | 2549 | /* The preferred zone is used for statistics later */ |
2546 | first_zones_zonelist(zonelist, high_zoneidx, | 2550 | first_zones_zonelist(zonelist, high_zoneidx, |
2547 | nodemask ? : &cpuset_current_mems_allowed, | 2551 | nodemask ? : &cpuset_current_mems_allowed, |
2548 | &preferred_zone); | 2552 | &preferred_zone); |
2549 | if (!preferred_zone) | 2553 | if (!preferred_zone) |
2550 | goto out; | 2554 | goto out; |
2551 | 2555 | ||
2552 | /* First allocation attempt */ | 2556 | /* First allocation attempt */ |
2553 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2557 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2554 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | 2558 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, |
2555 | preferred_zone, migratetype); | 2559 | preferred_zone, migratetype); |
2556 | if (unlikely(!page)) | 2560 | if (unlikely(!page)) |
2557 | page = __alloc_pages_slowpath(gfp_mask, order, | 2561 | page = __alloc_pages_slowpath(gfp_mask, order, |
2558 | zonelist, high_zoneidx, nodemask, | 2562 | zonelist, high_zoneidx, nodemask, |
2559 | preferred_zone, migratetype); | 2563 | preferred_zone, migratetype); |
2560 | else | 2564 | else |
2561 | page->pfmemalloc = false; | 2565 | page->pfmemalloc = false; |
2562 | 2566 | ||
2563 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2567 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2564 | 2568 | ||
2565 | out: | 2569 | out: |
2566 | /* | 2570 | /* |
2567 | * When updating a task's mems_allowed, it is possible to race with | 2571 | * When updating a task's mems_allowed, it is possible to race with |
2568 | * parallel threads in such a way that an allocation can fail while | 2572 | * parallel threads in such a way that an allocation can fail while |
2569 | * the mask is being updated. If a page allocation is about to fail, | 2573 | * the mask is being updated. If a page allocation is about to fail, |
2570 | * check if the cpuset changed during allocation and if so, retry. | 2574 | * check if the cpuset changed during allocation and if so, retry. |
2571 | */ | 2575 | */ |
2572 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2576 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
2573 | goto retry_cpuset; | 2577 | goto retry_cpuset; |
2574 | 2578 | ||
2575 | return page; | 2579 | return page; |
2576 | } | 2580 | } |
2577 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2581 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
2578 | 2582 | ||
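
The retry_cpuset loop in __alloc_pages_nodemask() is a sequence-counter read side: sample a cookie before consulting cpuset_current_mems_allowed, and if the allocation fails and the cookie no longer validates, assume a concurrent mems_allowed update may be to blame and run the whole attempt again. A generic sketch of that read-retry discipline with illustrative names:

	#include <stdbool.h>
	#include <stddef.h>

	static unsigned int mems_seq;	/* bumped by an updater around mems_allowed changes */

	static unsigned int read_begin(void)		{ return mems_seq; }
	static bool read_retry(unsigned int cookie)	{ return cookie != mems_seq; }
	static void *attempt_alloc(void)		{ return NULL; }	/* fast path + slowpath stand-in */

	void *alloc_with_cpuset_retry(void)
	{
		unsigned int cookie;
		void *page;

		do {
			cookie = read_begin();	/* snapshot the allowed-nodes generation */
			page = attempt_alloc();
			/* Only a failure is worth re-checking; success is always accepted. */
		} while (!page && read_retry(cookie));

		return page;
	}
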
2579 | /* | 2583 | /* |
2580 | * Common helper functions. | 2584 | * Common helper functions. |
2581 | */ | 2585 | */ |
2582 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) | 2586 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) |
2583 | { | 2587 | { |
2584 | struct page *page; | 2588 | struct page *page; |
2585 | 2589 | ||
2586 | /* | 2590 | /* |
2587 | * __get_free_pages() returns a 32-bit address, which cannot represent | 2591 | * __get_free_pages() returns a 32-bit address, which cannot represent |
2588 | * a highmem page | 2592 | * a highmem page |
2589 | */ | 2593 | */ |
2590 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | 2594 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); |
2591 | 2595 | ||
2592 | page = alloc_pages(gfp_mask, order); | 2596 | page = alloc_pages(gfp_mask, order); |
2593 | if (!page) | 2597 | if (!page) |
2594 | return 0; | 2598 | return 0; |
2595 | return (unsigned long) page_address(page); | 2599 | return (unsigned long) page_address(page); |
2596 | } | 2600 | } |
2597 | EXPORT_SYMBOL(__get_free_pages); | 2601 | EXPORT_SYMBOL(__get_free_pages); |
2598 | 2602 | ||
2599 | unsigned long get_zeroed_page(gfp_t gfp_mask) | 2603 | unsigned long get_zeroed_page(gfp_t gfp_mask) |
2600 | { | 2604 | { |
2601 | return __get_free_pages(gfp_mask | __GFP_ZERO, 0); | 2605 | return __get_free_pages(gfp_mask | __GFP_ZERO, 0); |
2602 | } | 2606 | } |
2603 | EXPORT_SYMBOL(get_zeroed_page); | 2607 | EXPORT_SYMBOL(get_zeroed_page); |
2604 | 2608 | ||
2605 | void __free_pages(struct page *page, unsigned int order) | 2609 | void __free_pages(struct page *page, unsigned int order) |
2606 | { | 2610 | { |
2607 | if (put_page_testzero(page)) { | 2611 | if (put_page_testzero(page)) { |
2608 | if (order == 0) | 2612 | if (order == 0) |
2609 | free_hot_cold_page(page, 0); | 2613 | free_hot_cold_page(page, 0); |
2610 | else | 2614 | else |
2611 | __free_pages_ok(page, order); | 2615 | __free_pages_ok(page, order); |
2612 | } | 2616 | } |
2613 | } | 2617 | } |
2614 | 2618 | ||
2615 | EXPORT_SYMBOL(__free_pages); | 2619 | EXPORT_SYMBOL(__free_pages); |
2616 | 2620 | ||
2617 | void free_pages(unsigned long addr, unsigned int order) | 2621 | void free_pages(unsigned long addr, unsigned int order) |
2618 | { | 2622 | { |
2619 | if (addr != 0) { | 2623 | if (addr != 0) { |
2620 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | 2624 | VM_BUG_ON(!virt_addr_valid((void *)addr)); |
2621 | __free_pages(virt_to_page((void *)addr), order); | 2625 | __free_pages(virt_to_page((void *)addr), order); |
2622 | } | 2626 | } |
2623 | } | 2627 | } |
2624 | 2628 | ||
2625 | EXPORT_SYMBOL(free_pages); | 2629 | EXPORT_SYMBOL(free_pages); |
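A typical pairing of these helpers, as a hedged illustration (a GFP_KERNEL allocation, so the __GFP_HIGHMEM check in __get_free_pages() cannot trigger):

	/* Allocate two zeroed, physically contiguous lowmem pages (order 1). */
	unsigned long buf = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);

	if (buf) {
		/* ... use the buffer through its kernel virtual address ... */
		free_pages(buf, 1);	/* order must match the allocation */
	}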
2626 | 2630 | ||
2627 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) | 2631 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) |
2628 | { | 2632 | { |
2629 | if (addr) { | 2633 | if (addr) { |
2630 | unsigned long alloc_end = addr + (PAGE_SIZE << order); | 2634 | unsigned long alloc_end = addr + (PAGE_SIZE << order); |
2631 | unsigned long used = addr + PAGE_ALIGN(size); | 2635 | unsigned long used = addr + PAGE_ALIGN(size); |
2632 | 2636 | ||
2633 | split_page(virt_to_page((void *)addr), order); | 2637 | split_page(virt_to_page((void *)addr), order); |
2634 | while (used < alloc_end) { | 2638 | while (used < alloc_end) { |
2635 | free_page(used); | 2639 | free_page(used); |
2636 | used += PAGE_SIZE; | 2640 | used += PAGE_SIZE; |
2637 | } | 2641 | } |
2638 | } | 2642 | } |
2639 | return (void *)addr; | 2643 | return (void *)addr; |
2640 | } | 2644 | } |
2641 | 2645 | ||
2642 | /** | 2646 | /** |
2643 | * alloc_pages_exact - allocate an exact number of physically-contiguous pages. | 2647 | * alloc_pages_exact - allocate an exact number of physically-contiguous pages. |
2644 | * @size: the number of bytes to allocate | 2648 | * @size: the number of bytes to allocate |
2645 | * @gfp_mask: GFP flags for the allocation | 2649 | * @gfp_mask: GFP flags for the allocation |
2646 | * | 2650 | * |
2647 | * This function is similar to alloc_pages(), except that it allocates the | 2651 | * This function is similar to alloc_pages(), except that it allocates the |
2648 | * minimum number of pages to satisfy the request. alloc_pages() can only | 2652 | * minimum number of pages to satisfy the request. alloc_pages() can only |
2649 | * allocate memory in power-of-two pages. | 2653 | * allocate memory in power-of-two pages. |
2650 | * | 2654 | * |
2651 | * This function is also limited by MAX_ORDER. | 2655 | * This function is also limited by MAX_ORDER. |
2652 | * | 2656 | * |
2653 | * Memory allocated by this function must be released by free_pages_exact(). | 2657 | * Memory allocated by this function must be released by free_pages_exact(). |
2654 | */ | 2658 | */ |
2655 | void *alloc_pages_exact(size_t size, gfp_t gfp_mask) | 2659 | void *alloc_pages_exact(size_t size, gfp_t gfp_mask) |
2656 | { | 2660 | { |
2657 | unsigned int order = get_order(size); | 2661 | unsigned int order = get_order(size); |
2658 | unsigned long addr; | 2662 | unsigned long addr; |
2659 | 2663 | ||
2660 | addr = __get_free_pages(gfp_mask, order); | 2664 | addr = __get_free_pages(gfp_mask, order); |
2661 | return make_alloc_exact(addr, order, size); | 2665 | return make_alloc_exact(addr, order, size); |
2662 | } | 2666 | } |
2663 | EXPORT_SYMBOL(alloc_pages_exact); | 2667 | EXPORT_SYMBOL(alloc_pages_exact); |
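As a worked example (assuming 4KB pages), a 10KB request becomes get_order(10240) == 2, i.e. a 16KB order-2 block; make_alloc_exact() then splits it and frees the trailing page, leaving 12KB allocated. The documented pairing looks like:

	void *buf = alloc_pages_exact(10 * 1024, GFP_KERNEL);

	if (buf) {
		/* buf points at 12KB of page-aligned, physically contiguous memory */
		free_pages_exact(buf, 10 * 1024);	/* pass the same size back */
	}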
2664 | 2668 | ||
2665 | /** | 2669 | /** |
2666 | * alloc_pages_exact_nid - allocate an exact number of physically-contiguous | 2670 | * alloc_pages_exact_nid - allocate an exact number of physically-contiguous |
2667 | * pages on a node. | 2671 | * pages on a node. |
2668 | * @nid: the preferred node ID where memory should be allocated | 2672 | * @nid: the preferred node ID where memory should be allocated |
2669 | * @size: the number of bytes to allocate | 2673 | * @size: the number of bytes to allocate |
2670 | * @gfp_mask: GFP flags for the allocation | 2674 | * @gfp_mask: GFP flags for the allocation |
2671 | * | 2675 | * |
2672 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling | 2676 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling |
2673 | * back. | 2677 | * back. |
2674 | * Note this is not alloc_pages_exact_node() which allocates on a specific node, | 2678 | * Note this is not alloc_pages_exact_node() which allocates on a specific node, |
2675 | * but is not exact. | 2679 | * but is not exact. |
2676 | */ | 2680 | */ |
2677 | void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) | 2681 | void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) |
2678 | { | 2682 | { |
2679 | unsigned order = get_order(size); | 2683 | unsigned order = get_order(size); |
2680 | struct page *p = alloc_pages_node(nid, gfp_mask, order); | 2684 | struct page *p = alloc_pages_node(nid, gfp_mask, order); |
2681 | if (!p) | 2685 | if (!p) |
2682 | return NULL; | 2686 | return NULL; |
2683 | return make_alloc_exact((unsigned long)page_address(p), order, size); | 2687 | return make_alloc_exact((unsigned long)page_address(p), order, size); |
2684 | } | 2688 | } |
2685 | EXPORT_SYMBOL(alloc_pages_exact_nid); | 2689 | EXPORT_SYMBOL(alloc_pages_exact_nid); |
2686 | 2690 | ||
2687 | /** | 2691 | /** |
2688 | * free_pages_exact - release memory allocated via alloc_pages_exact() | 2692 | * free_pages_exact - release memory allocated via alloc_pages_exact() |
2689 | * @virt: the value returned by alloc_pages_exact. | 2693 | * @virt: the value returned by alloc_pages_exact. |
2690 | * @size: size of allocation, same value as passed to alloc_pages_exact(). | 2694 | * @size: size of allocation, same value as passed to alloc_pages_exact(). |
2691 | * | 2695 | * |
2692 | * Release the memory allocated by a previous call to alloc_pages_exact. | 2696 | * Release the memory allocated by a previous call to alloc_pages_exact. |
2693 | */ | 2697 | */ |
2694 | void free_pages_exact(void *virt, size_t size) | 2698 | void free_pages_exact(void *virt, size_t size) |
2695 | { | 2699 | { |
2696 | unsigned long addr = (unsigned long)virt; | 2700 | unsigned long addr = (unsigned long)virt; |
2697 | unsigned long end = addr + PAGE_ALIGN(size); | 2701 | unsigned long end = addr + PAGE_ALIGN(size); |
2698 | 2702 | ||
2699 | while (addr < end) { | 2703 | while (addr < end) { |
2700 | free_page(addr); | 2704 | free_page(addr); |
2701 | addr += PAGE_SIZE; | 2705 | addr += PAGE_SIZE; |
2702 | } | 2706 | } |
2703 | } | 2707 | } |
2704 | EXPORT_SYMBOL(free_pages_exact); | 2708 | EXPORT_SYMBOL(free_pages_exact); |
2705 | 2709 | ||
2706 | static unsigned int nr_free_zone_pages(int offset) | 2710 | static unsigned int nr_free_zone_pages(int offset) |
2707 | { | 2711 | { |
2708 | struct zoneref *z; | 2712 | struct zoneref *z; |
2709 | struct zone *zone; | 2713 | struct zone *zone; |
2710 | 2714 | ||
2711 | /* Just pick one node, since fallback list is circular */ | 2715 | /* Just pick one node, since fallback list is circular */ |
2712 | unsigned int sum = 0; | 2716 | unsigned int sum = 0; |
2713 | 2717 | ||
2714 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); | 2718 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
2715 | 2719 | ||
2716 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 2720 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
2717 | unsigned long size = zone->present_pages; | 2721 | unsigned long size = zone->present_pages; |
2718 | unsigned long high = high_wmark_pages(zone); | 2722 | unsigned long high = high_wmark_pages(zone); |
2719 | if (size > high) | 2723 | if (size > high) |
2720 | sum += size - high; | 2724 | sum += size - high; |
2721 | } | 2725 | } |
2722 | 2726 | ||
2723 | return sum; | 2727 | return sum; |
2724 | } | 2728 | } |
2725 | 2729 | ||
2726 | /* | 2730 | /* |
2727 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL | 2731 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL |
2728 | */ | 2732 | */ |
2729 | unsigned int nr_free_buffer_pages(void) | 2733 | unsigned int nr_free_buffer_pages(void) |
2730 | { | 2734 | { |
2731 | return nr_free_zone_pages(gfp_zone(GFP_USER)); | 2735 | return nr_free_zone_pages(gfp_zone(GFP_USER)); |
2732 | } | 2736 | } |
2733 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); | 2737 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); |
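The sum is allocatable headroom rather than raw free memory: each zone contributes present_pages minus its high watermark, and a zone whose total size does not exceed the high watermark contributes nothing. A worked example with assumed zone sizes, purely for illustration:

	/*
	 * ZONE_DMA:    present =   4000 pages, high wmark =  128  ->   3872
	 * ZONE_NORMAL: present = 220000 pages, high wmark = 2048  -> 217952
	 * nr_free_zone_pages() would return 3872 + 217952 = 221824 pages.
	 */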
2734 | 2738 | ||
2735 | /* | 2739 | /* |
2736 | * Amount of free RAM allocatable within all zones | 2740 | * Amount of free RAM allocatable within all zones |
2737 | */ | 2741 | */ |
2738 | unsigned int nr_free_pagecache_pages(void) | 2742 | unsigned int nr_free_pagecache_pages(void) |
2739 | { | 2743 | { |
2740 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); | 2744 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); |
2741 | } | 2745 | } |
2742 | 2746 | ||
2743 | static inline void show_node(struct zone *zone) | 2747 | static inline void show_node(struct zone *zone) |
2744 | { | 2748 | { |
2745 | if (NUMA_BUILD) | 2749 | if (NUMA_BUILD) |
2746 | printk("Node %d ", zone_to_nid(zone)); | 2750 | printk("Node %d ", zone_to_nid(zone)); |
2747 | } | 2751 | } |
2748 | 2752 | ||
2749 | void si_meminfo(struct sysinfo *val) | 2753 | void si_meminfo(struct sysinfo *val) |
2750 | { | 2754 | { |
2751 | val->totalram = totalram_pages; | 2755 | val->totalram = totalram_pages; |
2752 | val->sharedram = 0; | 2756 | val->sharedram = 0; |
2753 | val->freeram = global_page_state(NR_FREE_PAGES); | 2757 | val->freeram = global_page_state(NR_FREE_PAGES); |
2754 | val->bufferram = nr_blockdev_pages(); | 2758 | val->bufferram = nr_blockdev_pages(); |
2755 | val->totalhigh = totalhigh_pages; | 2759 | val->totalhigh = totalhigh_pages; |
2756 | val->freehigh = nr_free_highpages(); | 2760 | val->freehigh = nr_free_highpages(); |
2757 | val->mem_unit = PAGE_SIZE; | 2761 | val->mem_unit = PAGE_SIZE; |
2758 | } | 2762 | } |
2759 | 2763 | ||
2760 | EXPORT_SYMBOL(si_meminfo); | 2764 | EXPORT_SYMBOL(si_meminfo); |
2761 | 2765 | ||
2762 | #ifdef CONFIG_NUMA | 2766 | #ifdef CONFIG_NUMA |
2763 | void si_meminfo_node(struct sysinfo *val, int nid) | 2767 | void si_meminfo_node(struct sysinfo *val, int nid) |
2764 | { | 2768 | { |
2765 | pg_data_t *pgdat = NODE_DATA(nid); | 2769 | pg_data_t *pgdat = NODE_DATA(nid); |
2766 | 2770 | ||
2767 | val->totalram = pgdat->node_present_pages; | 2771 | val->totalram = pgdat->node_present_pages; |
2768 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 2772 | val->freeram = node_page_state(nid, NR_FREE_PAGES); |
2769 | #ifdef CONFIG_HIGHMEM | 2773 | #ifdef CONFIG_HIGHMEM |
2770 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 2774 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; |
2771 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], | 2775 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], |
2772 | NR_FREE_PAGES); | 2776 | NR_FREE_PAGES); |
2773 | #else | 2777 | #else |
2774 | val->totalhigh = 0; | 2778 | val->totalhigh = 0; |
2775 | val->freehigh = 0; | 2779 | val->freehigh = 0; |
2776 | #endif | 2780 | #endif |
2777 | val->mem_unit = PAGE_SIZE; | 2781 | val->mem_unit = PAGE_SIZE; |
2778 | } | 2782 | } |
2779 | #endif | 2783 | #endif |
2780 | 2784 | ||
2781 | /* | 2785 | /* |
2782 | * Determine whether the node should be displayed or not, depending on whether | 2786 | * Determine whether the node should be displayed or not, depending on whether |
2783 | * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). | 2787 | * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). |
2784 | */ | 2788 | */ |
2785 | bool skip_free_areas_node(unsigned int flags, int nid) | 2789 | bool skip_free_areas_node(unsigned int flags, int nid) |
2786 | { | 2790 | { |
2787 | bool ret = false; | 2791 | bool ret = false; |
2788 | unsigned int cpuset_mems_cookie; | 2792 | unsigned int cpuset_mems_cookie; |
2789 | 2793 | ||
2790 | if (!(flags & SHOW_MEM_FILTER_NODES)) | 2794 | if (!(flags & SHOW_MEM_FILTER_NODES)) |
2791 | goto out; | 2795 | goto out; |
2792 | 2796 | ||
2793 | do { | 2797 | do { |
2794 | cpuset_mems_cookie = get_mems_allowed(); | 2798 | cpuset_mems_cookie = get_mems_allowed(); |
2795 | ret = !node_isset(nid, cpuset_current_mems_allowed); | 2799 | ret = !node_isset(nid, cpuset_current_mems_allowed); |
2796 | } while (!put_mems_allowed(cpuset_mems_cookie)); | 2800 | } while (!put_mems_allowed(cpuset_mems_cookie)); |
2797 | out: | 2801 | out: |
2798 | return ret; | 2802 | return ret; |
2799 | } | 2803 | } |
2800 | 2804 | ||
2801 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 2805 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
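K() converts a page count to kilobytes by shifting; with the common PAGE_SHIFT of 12 (4KB pages) it reduces to x << 2, i.e. x * 4. For instance:

	/* Assuming PAGE_SHIFT == 12: K(25) == 25 << 2 == 100, i.e. 25 pages = 100kB. */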
2802 | 2806 | ||
2803 | /* | 2807 | /* |
2804 | * Show free area list (used inside shift_scroll-lock stuff) | 2808 | * Show free area list (used inside shift_scroll-lock stuff) |
2805 | * We also calculate the percentage fragmentation. We do this by counting the | 2809 | * We also calculate the percentage fragmentation. We do this by counting the |
2806 | * memory on each free list with the exception of the first item on the list. | 2810 | * memory on each free list with the exception of the first item on the list. |
2807 | * Suppresses nodes that are not allowed by current's cpuset if | 2811 | * Suppresses nodes that are not allowed by current's cpuset if |
2808 | * SHOW_MEM_FILTER_NODES is passed. | 2812 | * SHOW_MEM_FILTER_NODES is passed. |
2809 | */ | 2813 | */ |
2810 | void show_free_areas(unsigned int filter) | 2814 | void show_free_areas(unsigned int filter) |
2811 | { | 2815 | { |
2812 | int cpu; | 2816 | int cpu; |
2813 | struct zone *zone; | 2817 | struct zone *zone; |
2814 | 2818 | ||
2815 | for_each_populated_zone(zone) { | 2819 | for_each_populated_zone(zone) { |
2816 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 2820 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
2817 | continue; | 2821 | continue; |
2818 | show_node(zone); | 2822 | show_node(zone); |
2819 | printk("%s per-cpu:\n", zone->name); | 2823 | printk("%s per-cpu:\n", zone->name); |
2820 | 2824 | ||
2821 | for_each_online_cpu(cpu) { | 2825 | for_each_online_cpu(cpu) { |
2822 | struct per_cpu_pageset *pageset; | 2826 | struct per_cpu_pageset *pageset; |
2823 | 2827 | ||
2824 | pageset = per_cpu_ptr(zone->pageset, cpu); | 2828 | pageset = per_cpu_ptr(zone->pageset, cpu); |
2825 | 2829 | ||
2826 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | 2830 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
2827 | cpu, pageset->pcp.high, | 2831 | cpu, pageset->pcp.high, |
2828 | pageset->pcp.batch, pageset->pcp.count); | 2832 | pageset->pcp.batch, pageset->pcp.count); |
2829 | } | 2833 | } |
2830 | } | 2834 | } |
2831 | 2835 | ||
2832 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" | 2836 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" |
2833 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" | 2837 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" |
2834 | " unevictable:%lu" | 2838 | " unevictable:%lu" |
2835 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2839 | " dirty:%lu writeback:%lu unstable:%lu\n" |
2836 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" | 2840 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
2837 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", | 2841 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", |
2838 | global_page_state(NR_ACTIVE_ANON), | 2842 | global_page_state(NR_ACTIVE_ANON), |
2839 | global_page_state(NR_INACTIVE_ANON), | 2843 | global_page_state(NR_INACTIVE_ANON), |
2840 | global_page_state(NR_ISOLATED_ANON), | 2844 | global_page_state(NR_ISOLATED_ANON), |
2841 | global_page_state(NR_ACTIVE_FILE), | 2845 | global_page_state(NR_ACTIVE_FILE), |
2842 | global_page_state(NR_INACTIVE_FILE), | 2846 | global_page_state(NR_INACTIVE_FILE), |
2843 | global_page_state(NR_ISOLATED_FILE), | 2847 | global_page_state(NR_ISOLATED_FILE), |
2844 | global_page_state(NR_UNEVICTABLE), | 2848 | global_page_state(NR_UNEVICTABLE), |
2845 | global_page_state(NR_FILE_DIRTY), | 2849 | global_page_state(NR_FILE_DIRTY), |
2846 | global_page_state(NR_WRITEBACK), | 2850 | global_page_state(NR_WRITEBACK), |
2847 | global_page_state(NR_UNSTABLE_NFS), | 2851 | global_page_state(NR_UNSTABLE_NFS), |
2848 | global_page_state(NR_FREE_PAGES), | 2852 | global_page_state(NR_FREE_PAGES), |
2849 | global_page_state(NR_SLAB_RECLAIMABLE), | 2853 | global_page_state(NR_SLAB_RECLAIMABLE), |
2850 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 2854 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
2851 | global_page_state(NR_FILE_MAPPED), | 2855 | global_page_state(NR_FILE_MAPPED), |
2852 | global_page_state(NR_SHMEM), | 2856 | global_page_state(NR_SHMEM), |
2853 | global_page_state(NR_PAGETABLE), | 2857 | global_page_state(NR_PAGETABLE), |
2854 | global_page_state(NR_BOUNCE)); | 2858 | global_page_state(NR_BOUNCE)); |
2855 | 2859 | ||
2856 | for_each_populated_zone(zone) { | 2860 | for_each_populated_zone(zone) { |
2857 | int i; | 2861 | int i; |
2858 | 2862 | ||
2859 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 2863 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
2860 | continue; | 2864 | continue; |
2861 | show_node(zone); | 2865 | show_node(zone); |
2862 | printk("%s" | 2866 | printk("%s" |
2863 | " free:%lukB" | 2867 | " free:%lukB" |
2864 | " min:%lukB" | 2868 | " min:%lukB" |
2865 | " low:%lukB" | 2869 | " low:%lukB" |
2866 | " high:%lukB" | 2870 | " high:%lukB" |
2867 | " active_anon:%lukB" | 2871 | " active_anon:%lukB" |
2868 | " inactive_anon:%lukB" | 2872 | " inactive_anon:%lukB" |
2869 | " active_file:%lukB" | 2873 | " active_file:%lukB" |
2870 | " inactive_file:%lukB" | 2874 | " inactive_file:%lukB" |
2871 | " unevictable:%lukB" | 2875 | " unevictable:%lukB" |
2872 | " isolated(anon):%lukB" | 2876 | " isolated(anon):%lukB" |
2873 | " isolated(file):%lukB" | 2877 | " isolated(file):%lukB" |
2874 | " present:%lukB" | 2878 | " present:%lukB" |
2875 | " mlocked:%lukB" | 2879 | " mlocked:%lukB" |
2876 | " dirty:%lukB" | 2880 | " dirty:%lukB" |
2877 | " writeback:%lukB" | 2881 | " writeback:%lukB" |
2878 | " mapped:%lukB" | 2882 | " mapped:%lukB" |
2879 | " shmem:%lukB" | 2883 | " shmem:%lukB" |
2880 | " slab_reclaimable:%lukB" | 2884 | " slab_reclaimable:%lukB" |
2881 | " slab_unreclaimable:%lukB" | 2885 | " slab_unreclaimable:%lukB" |
2882 | " kernel_stack:%lukB" | 2886 | " kernel_stack:%lukB" |
2883 | " pagetables:%lukB" | 2887 | " pagetables:%lukB" |
2884 | " unstable:%lukB" | 2888 | " unstable:%lukB" |
2885 | " bounce:%lukB" | 2889 | " bounce:%lukB" |
2886 | " writeback_tmp:%lukB" | 2890 | " writeback_tmp:%lukB" |
2887 | " pages_scanned:%lu" | 2891 | " pages_scanned:%lu" |
2888 | " all_unreclaimable? %s" | 2892 | " all_unreclaimable? %s" |
2889 | "\n", | 2893 | "\n", |
2890 | zone->name, | 2894 | zone->name, |
2891 | K(zone_page_state(zone, NR_FREE_PAGES)), | 2895 | K(zone_page_state(zone, NR_FREE_PAGES)), |
2892 | K(min_wmark_pages(zone)), | 2896 | K(min_wmark_pages(zone)), |
2893 | K(low_wmark_pages(zone)), | 2897 | K(low_wmark_pages(zone)), |
2894 | K(high_wmark_pages(zone)), | 2898 | K(high_wmark_pages(zone)), |
2895 | K(zone_page_state(zone, NR_ACTIVE_ANON)), | 2899 | K(zone_page_state(zone, NR_ACTIVE_ANON)), |
2896 | K(zone_page_state(zone, NR_INACTIVE_ANON)), | 2900 | K(zone_page_state(zone, NR_INACTIVE_ANON)), |
2897 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | 2901 | K(zone_page_state(zone, NR_ACTIVE_FILE)), |
2898 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | 2902 | K(zone_page_state(zone, NR_INACTIVE_FILE)), |
2899 | K(zone_page_state(zone, NR_UNEVICTABLE)), | 2903 | K(zone_page_state(zone, NR_UNEVICTABLE)), |
2900 | K(zone_page_state(zone, NR_ISOLATED_ANON)), | 2904 | K(zone_page_state(zone, NR_ISOLATED_ANON)), |
2901 | K(zone_page_state(zone, NR_ISOLATED_FILE)), | 2905 | K(zone_page_state(zone, NR_ISOLATED_FILE)), |
2902 | K(zone->present_pages), | 2906 | K(zone->present_pages), |
2903 | K(zone_page_state(zone, NR_MLOCK)), | 2907 | K(zone_page_state(zone, NR_MLOCK)), |
2904 | K(zone_page_state(zone, NR_FILE_DIRTY)), | 2908 | K(zone_page_state(zone, NR_FILE_DIRTY)), |
2905 | K(zone_page_state(zone, NR_WRITEBACK)), | 2909 | K(zone_page_state(zone, NR_WRITEBACK)), |
2906 | K(zone_page_state(zone, NR_FILE_MAPPED)), | 2910 | K(zone_page_state(zone, NR_FILE_MAPPED)), |
2907 | K(zone_page_state(zone, NR_SHMEM)), | 2911 | K(zone_page_state(zone, NR_SHMEM)), |
2908 | K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), | 2912 | K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), |
2909 | K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), | 2913 | K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), |
2910 | zone_page_state(zone, NR_KERNEL_STACK) * | 2914 | zone_page_state(zone, NR_KERNEL_STACK) * |
2911 | THREAD_SIZE / 1024, | 2915 | THREAD_SIZE / 1024, |
2912 | K(zone_page_state(zone, NR_PAGETABLE)), | 2916 | K(zone_page_state(zone, NR_PAGETABLE)), |
2913 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | 2917 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), |
2914 | K(zone_page_state(zone, NR_BOUNCE)), | 2918 | K(zone_page_state(zone, NR_BOUNCE)), |
2915 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2919 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
2916 | zone->pages_scanned, | 2920 | zone->pages_scanned, |
2917 | (zone->all_unreclaimable ? "yes" : "no") | 2921 | (zone->all_unreclaimable ? "yes" : "no") |
2918 | ); | 2922 | ); |
2919 | printk("lowmem_reserve[]:"); | 2923 | printk("lowmem_reserve[]:"); |
2920 | for (i = 0; i < MAX_NR_ZONES; i++) | 2924 | for (i = 0; i < MAX_NR_ZONES; i++) |
2921 | printk(" %lu", zone->lowmem_reserve[i]); | 2925 | printk(" %lu", zone->lowmem_reserve[i]); |
2922 | printk("\n"); | 2926 | printk("\n"); |
2923 | } | 2927 | } |
2924 | 2928 | ||
2925 | for_each_populated_zone(zone) { | 2929 | for_each_populated_zone(zone) { |
2926 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 2930 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
2927 | 2931 | ||
2928 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 2932 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
2929 | continue; | 2933 | continue; |
2930 | show_node(zone); | 2934 | show_node(zone); |
2931 | printk("%s: ", zone->name); | 2935 | printk("%s: ", zone->name); |
2932 | 2936 | ||
2933 | spin_lock_irqsave(&zone->lock, flags); | 2937 | spin_lock_irqsave(&zone->lock, flags); |
2934 | for (order = 0; order < MAX_ORDER; order++) { | 2938 | for (order = 0; order < MAX_ORDER; order++) { |
2935 | nr[order] = zone->free_area[order].nr_free; | 2939 | nr[order] = zone->free_area[order].nr_free; |
2936 | total += nr[order] << order; | 2940 | total += nr[order] << order; |
2937 | } | 2941 | } |
2938 | spin_unlock_irqrestore(&zone->lock, flags); | 2942 | spin_unlock_irqrestore(&zone->lock, flags); |
2939 | for (order = 0; order < MAX_ORDER; order++) | 2943 | for (order = 0; order < MAX_ORDER; order++) |
2940 | printk("%lu*%lukB ", nr[order], K(1UL) << order); | 2944 | printk("%lu*%lukB ", nr[order], K(1UL) << order); |
2941 | printk("= %lukB\n", K(total)); | 2945 | printk("= %lukB\n", K(total)); |
2942 | } | 2946 | } |
2943 | 2947 | ||
2944 | printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); | 2948 | printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); |
2945 | 2949 | ||
2946 | show_swap_cache_info(); | 2950 | show_swap_cache_info(); |
2947 | } | 2951 | } |
2948 | 2952 | ||
2949 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) | 2953 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) |
2950 | { | 2954 | { |
2951 | zoneref->zone = zone; | 2955 | zoneref->zone = zone; |
2952 | zoneref->zone_idx = zone_idx(zone); | 2956 | zoneref->zone_idx = zone_idx(zone); |
2953 | } | 2957 | } |
2954 | 2958 | ||
2955 | /* | 2959 | /* |
2956 | * Builds allocation fallback zone lists. | 2960 | * Builds allocation fallback zone lists. |
2957 | * | 2961 | * |
2958 | * Add all populated zones of a node to the zonelist. | 2962 | * Add all populated zones of a node to the zonelist. |
2959 | */ | 2963 | */ |
2960 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, | 2964 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, |
2961 | int nr_zones, enum zone_type zone_type) | 2965 | int nr_zones, enum zone_type zone_type) |
2962 | { | 2966 | { |
2963 | struct zone *zone; | 2967 | struct zone *zone; |
2964 | 2968 | ||
2965 | BUG_ON(zone_type >= MAX_NR_ZONES); | 2969 | BUG_ON(zone_type >= MAX_NR_ZONES); |
2966 | zone_type++; | 2970 | zone_type++; |
2967 | 2971 | ||
2968 | do { | 2972 | do { |
2969 | zone_type--; | 2973 | zone_type--; |
2970 | zone = pgdat->node_zones + zone_type; | 2974 | zone = pgdat->node_zones + zone_type; |
2971 | if (populated_zone(zone)) { | 2975 | if (populated_zone(zone)) { |
2972 | zoneref_set_zone(zone, | 2976 | zoneref_set_zone(zone, |
2973 | &zonelist->_zonerefs[nr_zones++]); | 2977 | &zonelist->_zonerefs[nr_zones++]); |
2974 | check_highest_zone(zone_type); | 2978 | check_highest_zone(zone_type); |
2975 | } | 2979 | } |
2976 | 2980 | ||
2977 | } while (zone_type); | 2981 | } while (zone_type); |
2978 | return nr_zones; | 2982 | return nr_zones; |
2979 | } | 2983 | } |
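Because zone_type is walked downwards, each node contributes its populated zones from highest to lowest. For an assumed 32-bit-style node with HIGHMEM, NORMAL and DMA populated (illustration only), the appended zonerefs come out as:

	/*
	 * _zonerefs[n]   -> ZONE_HIGHMEM  (tried first)
	 * _zonerefs[n+1] -> ZONE_NORMAL
	 * _zonerefs[n+2] -> ZONE_DMA      (tried last, preserving low zones)
	 */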
2980 | 2984 | ||
2981 | 2985 | ||
2982 | /* | 2986 | /* |
2983 | * zonelist_order: | 2987 | * zonelist_order: |
2984 | * 0 = automatic detection of better ordering. | 2988 | * 0 = automatic detection of better ordering. |
2985 | * 1 = order by ([node] distance, -zonetype) | 2989 | * 1 = order by ([node] distance, -zonetype) |
2986 | * 2 = order by (-zonetype, [node] distance) | 2990 | * 2 = order by (-zonetype, [node] distance) |
2987 | * | 2991 | * |
2988 | * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create | 2992 | * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create |
2989 | * the same zonelist. So only NUMA can configure this param. | 2993 | * the same zonelist. So only NUMA can configure this param. |
2990 | */ | 2994 | */ |
2991 | #define ZONELIST_ORDER_DEFAULT 0 | 2995 | #define ZONELIST_ORDER_DEFAULT 0 |
2992 | #define ZONELIST_ORDER_NODE 1 | 2996 | #define ZONELIST_ORDER_NODE 1 |
2993 | #define ZONELIST_ORDER_ZONE 2 | 2997 | #define ZONELIST_ORDER_ZONE 2 |
2994 | 2998 | ||
2995 | /* zonelist order in the kernel. | 2999 | /* zonelist order in the kernel. |
2996 | * set_zonelist_order() will set this to NODE or ZONE. | 3000 | * set_zonelist_order() will set this to NODE or ZONE. |
2997 | */ | 3001 | */ |
2998 | static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; | 3002 | static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; |
2999 | static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; | 3003 | static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; |
3000 | 3004 | ||
3001 | 3005 | ||
3002 | #ifdef CONFIG_NUMA | 3006 | #ifdef CONFIG_NUMA |
3003 | /* The order the user specified; set by boot parameter or sysctl */ | 3007 | /* The order the user specified; set by boot parameter or sysctl */ |
3004 | static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; | 3008 | static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; |
3005 | /* string for sysctl */ | 3009 | /* string for sysctl */ |
3006 | #define NUMA_ZONELIST_ORDER_LEN 16 | 3010 | #define NUMA_ZONELIST_ORDER_LEN 16 |
3007 | char numa_zonelist_order[16] = "default"; | 3011 | char numa_zonelist_order[16] = "default"; |
3008 | 3012 | ||
3009 | /* | 3013 | /* |
3010 | * Interface for configuring zonelist ordering. | 3014 | * Interface for configuring zonelist ordering. |
3011 | * command line option "numa_zonelist_order" | 3015 | * command line option "numa_zonelist_order" |
3012 | * = "[dD]efault" - default, automatic configuration. | 3016 | * = "[dD]efault" - default, automatic configuration. |
3013 | * = "[nN]ode" - order by node locality, then by zone within node | 3017 | * = "[nN]ode" - order by node locality, then by zone within node |
3014 | * = "[zZ]one" - order by zone, then by locality within zone | 3018 | * = "[zZ]one" - order by zone, then by locality within zone |
3015 | */ | 3019 | */ |
3016 | 3020 | ||
3017 | static int __parse_numa_zonelist_order(char *s) | 3021 | static int __parse_numa_zonelist_order(char *s) |
3018 | { | 3022 | { |
3019 | if (*s == 'd' || *s == 'D') { | 3023 | if (*s == 'd' || *s == 'D') { |
3020 | user_zonelist_order = ZONELIST_ORDER_DEFAULT; | 3024 | user_zonelist_order = ZONELIST_ORDER_DEFAULT; |
3021 | } else if (*s == 'n' || *s == 'N') { | 3025 | } else if (*s == 'n' || *s == 'N') { |
3022 | user_zonelist_order = ZONELIST_ORDER_NODE; | 3026 | user_zonelist_order = ZONELIST_ORDER_NODE; |
3023 | } else if (*s == 'z' || *s == 'Z') { | 3027 | } else if (*s == 'z' || *s == 'Z') { |
3024 | user_zonelist_order = ZONELIST_ORDER_ZONE; | 3028 | user_zonelist_order = ZONELIST_ORDER_ZONE; |
3025 | } else { | 3029 | } else { |
3026 | printk(KERN_WARNING | 3030 | printk(KERN_WARNING |
3027 | "Ignoring invalid numa_zonelist_order value: " | 3031 | "Ignoring invalid numa_zonelist_order value: " |
3028 | "%s\n", s); | 3032 | "%s\n", s); |
3029 | return -EINVAL; | 3033 | return -EINVAL; |
3030 | } | 3034 | } |
3031 | return 0; | 3035 | return 0; |
3032 | } | 3036 | } |
3033 | 3037 | ||
3034 | static __init int setup_numa_zonelist_order(char *s) | 3038 | static __init int setup_numa_zonelist_order(char *s) |
3035 | { | 3039 | { |
3036 | int ret; | 3040 | int ret; |
3037 | 3041 | ||
3038 | if (!s) | 3042 | if (!s) |
3039 | return 0; | 3043 | return 0; |
3040 | 3044 | ||
3041 | ret = __parse_numa_zonelist_order(s); | 3045 | ret = __parse_numa_zonelist_order(s); |
3042 | if (ret == 0) | 3046 | if (ret == 0) |
3043 | strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); | 3047 | strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); |
3044 | 3048 | ||
3045 | return ret; | 3049 | return ret; |
3046 | } | 3050 | } |
3047 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | 3051 | early_param("numa_zonelist_order", setup_numa_zonelist_order); |
3048 | 3052 | ||
3049 | /* | 3053 | /* |
3050 | * sysctl handler for numa_zonelist_order | 3054 | * sysctl handler for numa_zonelist_order |
3051 | */ | 3055 | */ |
3052 | int numa_zonelist_order_handler(ctl_table *table, int write, | 3056 | int numa_zonelist_order_handler(ctl_table *table, int write, |
3053 | void __user *buffer, size_t *length, | 3057 | void __user *buffer, size_t *length, |
3054 | loff_t *ppos) | 3058 | loff_t *ppos) |
3055 | { | 3059 | { |
3056 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 3060 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; |
3057 | int ret; | 3061 | int ret; |
3058 | static DEFINE_MUTEX(zl_order_mutex); | 3062 | static DEFINE_MUTEX(zl_order_mutex); |
3059 | 3063 | ||
3060 | mutex_lock(&zl_order_mutex); | 3064 | mutex_lock(&zl_order_mutex); |
3061 | if (write) | 3065 | if (write) |
3062 | strcpy(saved_string, (char*)table->data); | 3066 | strcpy(saved_string, (char*)table->data); |
3063 | ret = proc_dostring(table, write, buffer, length, ppos); | 3067 | ret = proc_dostring(table, write, buffer, length, ppos); |
3064 | if (ret) | 3068 | if (ret) |
3065 | goto out; | 3069 | goto out; |
3066 | if (write) { | 3070 | if (write) { |
3067 | int oldval = user_zonelist_order; | 3071 | int oldval = user_zonelist_order; |
3068 | if (__parse_numa_zonelist_order((char*)table->data)) { | 3072 | if (__parse_numa_zonelist_order((char*)table->data)) { |
3069 | /* | 3073 | /* |
3070 | * bogus value. restore saved string | 3074 | * bogus value. restore saved string |
3071 | */ | 3075 | */ |
3072 | strncpy((char*)table->data, saved_string, | 3076 | strncpy((char*)table->data, saved_string, |
3073 | NUMA_ZONELIST_ORDER_LEN); | 3077 | NUMA_ZONELIST_ORDER_LEN); |
3074 | user_zonelist_order = oldval; | 3078 | user_zonelist_order = oldval; |
3075 | } else if (oldval != user_zonelist_order) { | 3079 | } else if (oldval != user_zonelist_order) { |
3076 | mutex_lock(&zonelists_mutex); | 3080 | mutex_lock(&zonelists_mutex); |
3077 | build_all_zonelists(NULL, NULL); | 3081 | build_all_zonelists(NULL, NULL); |
3078 | mutex_unlock(&zonelists_mutex); | 3082 | mutex_unlock(&zonelists_mutex); |
3079 | } | 3083 | } |
3080 | } | 3084 | } |
3081 | out: | 3085 | out: |
3082 | mutex_unlock(&zl_order_mutex); | 3086 | mutex_unlock(&zl_order_mutex); |
3083 | return ret; | 3087 | return ret; |
3084 | } | 3088 | } |
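In practice the order can be selected either at boot or at runtime; a brief sketch of both paths described by the comment above:

	/*
	 * Boot:    numa_zonelist_order=N   (parsed by setup_numa_zonelist_order())
	 * Runtime: echo Z > /proc/sys/vm/numa_zonelist_order
	 *          (numa_zonelist_order_handler() validates the string and rebuilds
	 *           the zonelists under zonelists_mutex when the value changes)
	 */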
3085 | 3089 | ||
3086 | 3090 | ||
3087 | #define MAX_NODE_LOAD (nr_online_nodes) | 3091 | #define MAX_NODE_LOAD (nr_online_nodes) |
3088 | static int node_load[MAX_NUMNODES]; | 3092 | static int node_load[MAX_NUMNODES]; |
3089 | 3093 | ||
3090 | /** | 3094 | /** |
3091 | * find_next_best_node - find the next node that should appear in a given node's fallback list | 3095 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
3092 | * @node: node whose fallback list we're appending | 3096 | * @node: node whose fallback list we're appending |
3093 | * @used_node_mask: nodemask_t of already used nodes | 3097 | * @used_node_mask: nodemask_t of already used nodes |
3094 | * | 3098 | * |
3095 | * We use a number of factors to determine which is the next node that should | 3099 | * We use a number of factors to determine which is the next node that should |
3096 | * appear on a given node's fallback list. The node should not have appeared | 3100 | * appear on a given node's fallback list. The node should not have appeared |
3097 | * already in @node's fallback list, and it should be the next closest node | 3101 | * already in @node's fallback list, and it should be the next closest node |
3098 | * according to the distance array (which contains arbitrary distance values | 3102 | * according to the distance array (which contains arbitrary distance values |
3099 | * from each node to each node in the system), and should also prefer nodes | 3103 | * from each node to each node in the system), and should also prefer nodes |
3100 | * with no CPUs, since presumably they'll have very little allocation pressure | 3104 | * with no CPUs, since presumably they'll have very little allocation pressure |
3101 | * on them otherwise. | 3105 | * on them otherwise. |
3102 | * It returns -1 if no node is found. | 3106 | * It returns -1 if no node is found. |
3103 | */ | 3107 | */ |
3104 | static int find_next_best_node(int node, nodemask_t *used_node_mask) | 3108 | static int find_next_best_node(int node, nodemask_t *used_node_mask) |
3105 | { | 3109 | { |
3106 | int n, val; | 3110 | int n, val; |
3107 | int min_val = INT_MAX; | 3111 | int min_val = INT_MAX; |
3108 | int best_node = -1; | 3112 | int best_node = -1; |
3109 | const struct cpumask *tmp = cpumask_of_node(0); | 3113 | const struct cpumask *tmp = cpumask_of_node(0); |
3110 | 3114 | ||
3111 | /* Use the local node if we haven't already */ | 3115 | /* Use the local node if we haven't already */ |
3112 | if (!node_isset(node, *used_node_mask)) { | 3116 | if (!node_isset(node, *used_node_mask)) { |
3113 | node_set(node, *used_node_mask); | 3117 | node_set(node, *used_node_mask); |
3114 | return node; | 3118 | return node; |
3115 | } | 3119 | } |
3116 | 3120 | ||
3117 | for_each_node_state(n, N_HIGH_MEMORY) { | 3121 | for_each_node_state(n, N_HIGH_MEMORY) { |
3118 | 3122 | ||
3119 | /* Don't want a node to appear more than once */ | 3123 | /* Don't want a node to appear more than once */ |
3120 | if (node_isset(n, *used_node_mask)) | 3124 | if (node_isset(n, *used_node_mask)) |
3121 | continue; | 3125 | continue; |
3122 | 3126 | ||
3123 | /* Use the distance array to find the distance */ | 3127 | /* Use the distance array to find the distance */ |
3124 | val = node_distance(node, n); | 3128 | val = node_distance(node, n); |
3125 | 3129 | ||
3126 | /* Penalize nodes under us ("prefer the next node") */ | 3130 | /* Penalize nodes under us ("prefer the next node") */ |
3127 | val += (n < node); | 3131 | val += (n < node); |
3128 | 3132 | ||
3129 | /* Give preference to headless and unused nodes */ | 3133 | /* Give preference to headless and unused nodes */ |
3130 | tmp = cpumask_of_node(n); | 3134 | tmp = cpumask_of_node(n); |
3131 | if (!cpumask_empty(tmp)) | 3135 | if (!cpumask_empty(tmp)) |
3132 | val += PENALTY_FOR_NODE_WITH_CPUS; | 3136 | val += PENALTY_FOR_NODE_WITH_CPUS; |
3133 | 3137 | ||
3134 | /* Slight preference for less loaded node */ | 3138 | /* Slight preference for less loaded node */ |
3135 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); | 3139 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); |
3136 | val += node_load[n]; | 3140 | val += node_load[n]; |
3137 | 3141 | ||
3138 | if (val < min_val) { | 3142 | if (val < min_val) { |
3139 | min_val = val; | 3143 | min_val = val; |
3140 | best_node = n; | 3144 | best_node = n; |
3141 | } | 3145 | } |
3142 | } | 3146 | } |
3143 | 3147 | ||
3144 | if (best_node >= 0) | 3148 | if (best_node >= 0) |
3145 | node_set(best_node, *used_node_mask); | 3149 | node_set(best_node, *used_node_mask); |
3146 | 3150 | ||
3147 | return best_node; | 3151 | return best_node; |
3148 | } | 3152 | } |
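The score is dominated by distance, scaled by MAX_NODE_LOAD*MAX_NUMNODES, with node_load[] as a tie-breaker and small penalties for lower-numbered nodes and for nodes that have CPUs. A worked example with assumed values (PENALTY_FOR_NODE_WITH_CPUS taken as 1, node_load[] zero, both candidates numbered above 'node'):

	/*
	 * Let S = MAX_NODE_LOAD * MAX_NUMNODES.
	 *   node A: distance 20, has CPUs  -> val = (20 + 1) * S = 21 * S
	 *   node B: distance 21, headless  -> val = (21 + 0) * S = 21 * S
	 * The headless node is effectively treated as one distance step closer.
	 */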
3149 | 3153 | ||
3150 | 3154 | ||
3151 | /* | 3155 | /* |
3152 | * Build zonelists ordered by node and zones within node. | 3156 | * Build zonelists ordered by node and zones within node. |
3153 | * This results in maximum locality--normal zone overflows into local | 3157 | * This results in maximum locality--normal zone overflows into local |
3154 | * DMA zone, if any--but risks exhausting DMA zone. | 3158 | * DMA zone, if any--but risks exhausting DMA zone. |
3155 | */ | 3159 | */ |
3156 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | 3160 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) |
3157 | { | 3161 | { |
3158 | int j; | 3162 | int j; |
3159 | struct zonelist *zonelist; | 3163 | struct zonelist *zonelist; |
3160 | 3164 | ||
3161 | zonelist = &pgdat->node_zonelists[0]; | 3165 | zonelist = &pgdat->node_zonelists[0]; |
3162 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) | 3166 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) |
3163 | ; | 3167 | ; |
3164 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 3168 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
3165 | MAX_NR_ZONES - 1); | 3169 | MAX_NR_ZONES - 1); |
3166 | zonelist->_zonerefs[j].zone = NULL; | 3170 | zonelist->_zonerefs[j].zone = NULL; |
3167 | zonelist->_zonerefs[j].zone_idx = 0; | 3171 | zonelist->_zonerefs[j].zone_idx = 0; |
3168 | } | 3172 | } |
3169 | 3173 | ||
3170 | /* | 3174 | /* |
3171 | * Build gfp_thisnode zonelists | 3175 | * Build gfp_thisnode zonelists |
3172 | */ | 3176 | */ |
3173 | static void build_thisnode_zonelists(pg_data_t *pgdat) | 3177 | static void build_thisnode_zonelists(pg_data_t *pgdat) |
3174 | { | 3178 | { |
3175 | int j; | 3179 | int j; |
3176 | struct zonelist *zonelist; | 3180 | struct zonelist *zonelist; |
3177 | 3181 | ||
3178 | zonelist = &pgdat->node_zonelists[1]; | 3182 | zonelist = &pgdat->node_zonelists[1]; |
3179 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); | 3183 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); |
3180 | zonelist->_zonerefs[j].zone = NULL; | 3184 | zonelist->_zonerefs[j].zone = NULL; |
3181 | zonelist->_zonerefs[j].zone_idx = 0; | 3185 | zonelist->_zonerefs[j].zone_idx = 0; |
3182 | } | 3186 | } |
3183 | 3187 | ||
3184 | /* | 3188 | /* |
3185 | * Build zonelists ordered by zone and nodes within zones. | 3189 | * Build zonelists ordered by zone and nodes within zones. |
3186 | * This results in conserving DMA zone[s] until all Normal memory is | 3190 | * This results in conserving DMA zone[s] until all Normal memory is |
3187 | * exhausted, but results in overflowing to remote node while memory | 3191 | * exhausted, but results in overflowing to remote node while memory |
3188 | * may still exist in local DMA zone. | 3192 | * may still exist in local DMA zone. |
3189 | */ | 3193 | */ |
3190 | static int node_order[MAX_NUMNODES]; | 3194 | static int node_order[MAX_NUMNODES]; |
3191 | 3195 | ||
3192 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | 3196 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) |
3193 | { | 3197 | { |
3194 | int pos, j, node; | 3198 | int pos, j, node; |
3195 | int zone_type; /* needs to be signed */ | 3199 | int zone_type; /* needs to be signed */ |
3196 | struct zone *z; | 3200 | struct zone *z; |
3197 | struct zonelist *zonelist; | 3201 | struct zonelist *zonelist; |
3198 | 3202 | ||
3199 | zonelist = &pgdat->node_zonelists[0]; | 3203 | zonelist = &pgdat->node_zonelists[0]; |
3200 | pos = 0; | 3204 | pos = 0; |
3201 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { | 3205 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { |
3202 | for (j = 0; j < nr_nodes; j++) { | 3206 | for (j = 0; j < nr_nodes; j++) { |
3203 | node = node_order[j]; | 3207 | node = node_order[j]; |
3204 | z = &NODE_DATA(node)->node_zones[zone_type]; | 3208 | z = &NODE_DATA(node)->node_zones[zone_type]; |
3205 | if (populated_zone(z)) { | 3209 | if (populated_zone(z)) { |
3206 | zoneref_set_zone(z, | 3210 | zoneref_set_zone(z, |
3207 | &zonelist->_zonerefs[pos++]); | 3211 | &zonelist->_zonerefs[pos++]); |
3208 | check_highest_zone(zone_type); | 3212 | check_highest_zone(zone_type); |
3209 | } | 3213 | } |
3210 | } | 3214 | } |
3211 | } | 3215 | } |
3212 | zonelist->_zonerefs[pos].zone = NULL; | 3216 | zonelist->_zonerefs[pos].zone = NULL; |
3213 | zonelist->_zonerefs[pos].zone_idx = 0; | 3217 | zonelist->_zonerefs[pos].zone_idx = 0; |
3214 | } | 3218 | } |
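The two orderings interleave the same zonerefs differently. For an assumed two-node box where node 0 has NORMAL and DMA populated and node 1 has only NORMAL (illustration only):

	/*
	 * Node order (build_zonelists_in_node_order):
	 *   node0/NORMAL, node0/DMA, node1/NORMAL
	 * Zone order (build_zonelists_in_zone_order):
	 *   node0/NORMAL, node1/NORMAL, node0/DMA   <- DMA kept as the last resort
	 */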
3215 | 3219 | ||
3216 | static int default_zonelist_order(void) | 3220 | static int default_zonelist_order(void) |
3217 | { | 3221 | { |
3218 | int nid, zone_type; | 3222 | int nid, zone_type; |
3219 | unsigned long low_kmem_size,total_size; | 3223 | unsigned long low_kmem_size,total_size; |
3220 | struct zone *z; | 3224 | struct zone *z; |
3221 | int average_size; | 3225 | int average_size; |
3222 | /* | 3226 | /* |
3223 | * ZONE_DMA and ZONE_DMA32 can be a very small area in the system. | 3227 | * ZONE_DMA and ZONE_DMA32 can be a very small area in the system. |
3224 | * If they are really small and used heavily, the system can fall | 3228 | * If they are really small and used heavily, the system can fall |
3225 | * into OOM very easily. | 3229 | * into OOM very easily. |
3226 | * This function detects the ZONE_DMA/DMA32 size and configures the zone order. | 3230 | * This function detects the ZONE_DMA/DMA32 size and configures the zone order. |
3227 | */ | 3231 | */ |
3228 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ | 3232 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ |
3229 | low_kmem_size = 0; | 3233 | low_kmem_size = 0; |
3230 | total_size = 0; | 3234 | total_size = 0; |
3231 | for_each_online_node(nid) { | 3235 | for_each_online_node(nid) { |
3232 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | 3236 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { |
3233 | z = &NODE_DATA(nid)->node_zones[zone_type]; | 3237 | z = &NODE_DATA(nid)->node_zones[zone_type]; |
3234 | if (populated_zone(z)) { | 3238 | if (populated_zone(z)) { |
3235 | if (zone_type < ZONE_NORMAL) | 3239 | if (zone_type < ZONE_NORMAL) |
3236 | low_kmem_size += z->present_pages; | 3240 | low_kmem_size += z->present_pages; |
3237 | total_size += z->present_pages; | 3241 | total_size += z->present_pages; |
3238 | } else if (zone_type == ZONE_NORMAL) { | 3242 | } else if (zone_type == ZONE_NORMAL) { |
3239 | /* | 3243 | /* |
3240 | * If any node has only lowmem, then node order | 3244 | * If any node has only lowmem, then node order |
3241 | * is preferred to allow kernel allocations | 3245 | * is preferred to allow kernel allocations |
3242 | * locally; otherwise, they can easily infringe | 3246 | * locally; otherwise, they can easily infringe |
3243 | * on other nodes when there is an abundance of | 3247 | * on other nodes when there is an abundance of |
3244 | * lowmem available to allocate from. | 3248 | * lowmem available to allocate from. |
3245 | */ | 3249 | */ |
3246 | return ZONELIST_ORDER_NODE; | 3250 | return ZONELIST_ORDER_NODE; |
3247 | } | 3251 | } |
3248 | } | 3252 | } |
3249 | } | 3253 | } |
3250 | if (!low_kmem_size || /* there is no DMA area. */ | 3254 | if (!low_kmem_size || /* there is no DMA area. */ |
3251 | low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ | 3255 | low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ |
3252 | return ZONELIST_ORDER_NODE; | 3256 | return ZONELIST_ORDER_NODE; |
3253 | /* | 3257 | /* |
3254 | * Look into each node's config. | 3258 | * Look into each node's config. |
3255 | * If there is a node whose DMA/DMA32 memory covers a very large part of | 3259 | * If there is a node whose DMA/DMA32 memory covers a very large part of |
3256 | * its local memory, NODE order may be suitable. | 3260 | * its local memory, NODE order may be suitable. |
3257 | */ | 3261 | */ |
3258 | average_size = total_size / | 3262 | average_size = total_size / |
3259 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | 3263 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); |
3260 | for_each_online_node(nid) { | 3264 | for_each_online_node(nid) { |
3261 | low_kmem_size = 0; | 3265 | low_kmem_size = 0; |
3262 | total_size = 0; | 3266 | total_size = 0; |
3263 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | 3267 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { |
3264 | z = &NODE_DATA(nid)->node_zones[zone_type]; | 3268 | z = &NODE_DATA(nid)->node_zones[zone_type]; |
3265 | if (populated_zone(z)) { | 3269 | if (populated_zone(z)) { |
3266 | if (zone_type < ZONE_NORMAL) | 3270 | if (zone_type < ZONE_NORMAL) |
3267 | low_kmem_size += z->present_pages; | 3271 | low_kmem_size += z->present_pages; |
3268 | total_size += z->present_pages; | 3272 | total_size += z->present_pages; |
3269 | } | 3273 | } |
3270 | } | 3274 | } |
3271 | if (low_kmem_size && | 3275 | if (low_kmem_size && |
3272 | total_size > average_size && /* ignore small node */ | 3276 | total_size > average_size && /* ignore small node */ |
3273 | low_kmem_size > total_size * 70/100) | 3277 | low_kmem_size > total_size * 70/100) |
3274 | return ZONELIST_ORDER_NODE; | 3278 | return ZONELIST_ORDER_NODE; |
3275 | } | 3279 | } |
3276 | return ZONELIST_ORDER_ZONE; | 3280 | return ZONELIST_ORDER_ZONE; |
3277 | } | 3281 | } |
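Two checks decide the default: globally, node order is chosen when there is no lowmem at all, when some node has no ZONE_NORMAL, or when lowmem exceeds half of total memory; per node, a larger-than-average node with more than 70% of its memory below ZONE_NORMAL also forces node order, otherwise zone order is used. A numeric illustration with assumed sizes:

	/*
	 * Two nodes of 4GB each; node 1 carries a 3GB DMA32 area:
	 *   node1: low_kmem = 3GB, total = 4GB, and 3GB > 4GB * 70/100 = 2.8GB
	 * so default_zonelist_order() picks ZONELIST_ORDER_NODE here.
	 */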
3278 | 3282 | ||
3279 | static void set_zonelist_order(void) | 3283 | static void set_zonelist_order(void) |
3280 | { | 3284 | { |
3281 | if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) | 3285 | if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) |
3282 | current_zonelist_order = default_zonelist_order(); | 3286 | current_zonelist_order = default_zonelist_order(); |
3283 | else | 3287 | else |
3284 | current_zonelist_order = user_zonelist_order; | 3288 | current_zonelist_order = user_zonelist_order; |
3285 | } | 3289 | } |
3286 | 3290 | ||
3287 | static void build_zonelists(pg_data_t *pgdat) | 3291 | static void build_zonelists(pg_data_t *pgdat) |
3288 | { | 3292 | { |
3289 | int j, node, load; | 3293 | int j, node, load; |
3290 | enum zone_type i; | 3294 | enum zone_type i; |
3291 | nodemask_t used_mask; | 3295 | nodemask_t used_mask; |
3292 | int local_node, prev_node; | 3296 | int local_node, prev_node; |
3293 | struct zonelist *zonelist; | 3297 | struct zonelist *zonelist; |
3294 | int order = current_zonelist_order; | 3298 | int order = current_zonelist_order; |
3295 | 3299 | ||
3296 | /* initialize zonelists */ | 3300 | /* initialize zonelists */ |
3297 | for (i = 0; i < MAX_ZONELISTS; i++) { | 3301 | for (i = 0; i < MAX_ZONELISTS; i++) { |
3298 | zonelist = pgdat->node_zonelists + i; | 3302 | zonelist = pgdat->node_zonelists + i; |
3299 | zonelist->_zonerefs[0].zone = NULL; | 3303 | zonelist->_zonerefs[0].zone = NULL; |
3300 | zonelist->_zonerefs[0].zone_idx = 0; | 3304 | zonelist->_zonerefs[0].zone_idx = 0; |
3301 | } | 3305 | } |
3302 | 3306 | ||
3303 | /* NUMA-aware ordering of nodes */ | 3307 | /* NUMA-aware ordering of nodes */ |
3304 | local_node = pgdat->node_id; | 3308 | local_node = pgdat->node_id; |
3305 | load = nr_online_nodes; | 3309 | load = nr_online_nodes; |
3306 | prev_node = local_node; | 3310 | prev_node = local_node; |
3307 | nodes_clear(used_mask); | 3311 | nodes_clear(used_mask); |
3308 | 3312 | ||
3309 | memset(node_order, 0, sizeof(node_order)); | 3313 | memset(node_order, 0, sizeof(node_order)); |
3310 | j = 0; | 3314 | j = 0; |
3311 | 3315 | ||
3312 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 3316 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
3313 | int distance = node_distance(local_node, node); | 3317 | int distance = node_distance(local_node, node); |
3314 | 3318 | ||
3315 | /* | 3319 | /* |
3316 | * If another node is sufficiently far away then it is better | 3320 | * If another node is sufficiently far away then it is better |
3317 | * to reclaim pages in a zone before going off node. | 3321 | * to reclaim pages in a zone before going off node. |
3318 | */ | 3322 | */ |
3319 | if (distance > RECLAIM_DISTANCE) | 3323 | if (distance > RECLAIM_DISTANCE) |
3320 | zone_reclaim_mode = 1; | 3324 | zone_reclaim_mode = 1; |
3321 | 3325 | ||
3322 | /* | 3326 | /* |
3323 | * We don't want to pressure a particular node. | 3327 | * We don't want to pressure a particular node. |
3324 | * So add a penalty to the first node in the same | 3328 | * So add a penalty to the first node in the same |
3325 | * distance group to make the selection round-robin. | 3329 | * distance group to make the selection round-robin. |
3326 | */ | 3330 | */ |
3327 | if (distance != node_distance(local_node, prev_node)) | 3331 | if (distance != node_distance(local_node, prev_node)) |
3328 | node_load[node] = load; | 3332 | node_load[node] = load; |
3329 | 3333 | ||
3330 | prev_node = node; | 3334 | prev_node = node; |
3331 | load--; | 3335 | load--; |
3332 | if (order == ZONELIST_ORDER_NODE) | 3336 | if (order == ZONELIST_ORDER_NODE) |
3333 | build_zonelists_in_node_order(pgdat, node); | 3337 | build_zonelists_in_node_order(pgdat, node); |
3334 | else | 3338 | else |
3335 | node_order[j++] = node; /* remember order */ | 3339 | node_order[j++] = node; /* remember order */ |
3336 | } | 3340 | } |
3337 | 3341 | ||
3338 | if (order == ZONELIST_ORDER_ZONE) { | 3342 | if (order == ZONELIST_ORDER_ZONE) { |
3339 | /* calculate node order -- i.e., DMA last! */ | 3343 | /* calculate node order -- i.e., DMA last! */ |
3340 | build_zonelists_in_zone_order(pgdat, j); | 3344 | build_zonelists_in_zone_order(pgdat, j); |
3341 | } | 3345 | } |
3342 | 3346 | ||
3343 | build_thisnode_zonelists(pgdat); | 3347 | build_thisnode_zonelists(pgdat); |
3344 | } | 3348 | } |
3345 | 3349 | ||
3346 | /* Construct the zonelist performance cache - see further mmzone.h */ | 3350 | /* Construct the zonelist performance cache - see further mmzone.h */ |
3347 | static void build_zonelist_cache(pg_data_t *pgdat) | 3351 | static void build_zonelist_cache(pg_data_t *pgdat) |
3348 | { | 3352 | { |
3349 | struct zonelist *zonelist; | 3353 | struct zonelist *zonelist; |
3350 | struct zonelist_cache *zlc; | 3354 | struct zonelist_cache *zlc; |
3351 | struct zoneref *z; | 3355 | struct zoneref *z; |
3352 | 3356 | ||
3353 | zonelist = &pgdat->node_zonelists[0]; | 3357 | zonelist = &pgdat->node_zonelists[0]; |
3354 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | 3358 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; |
3355 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 3359 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
3356 | for (z = zonelist->_zonerefs; z->zone; z++) | 3360 | for (z = zonelist->_zonerefs; z->zone; z++) |
3357 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); | 3361 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); |
3358 | } | 3362 | } |
3359 | 3363 | ||
3360 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | 3364 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
3361 | /* | 3365 | /* |
3362 | * Return node id of node used for "local" allocations. | 3366 | * Return node id of node used for "local" allocations. |
3363 | * I.e., first node id of first zone in arg node's generic zonelist. | 3367 | * I.e., first node id of first zone in arg node's generic zonelist. |
3364 | * Used for initializing percpu 'numa_mem', which is used primarily | 3368 | * Used for initializing percpu 'numa_mem', which is used primarily |
3365 | * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. | 3369 | * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. |
3366 | */ | 3370 | */ |
3367 | int local_memory_node(int node) | 3371 | int local_memory_node(int node) |
3368 | { | 3372 | { |
3369 | struct zone *zone; | 3373 | struct zone *zone; |
3370 | 3374 | ||
3371 | (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), | 3375 | (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), |
3372 | gfp_zone(GFP_KERNEL), | 3376 | gfp_zone(GFP_KERNEL), |
3373 | NULL, | 3377 | NULL, |
3374 | &zone); | 3378 | &zone); |
3375 | return zone->node; | 3379 | return zone->node; |
3376 | } | 3380 | } |
3377 | #endif | 3381 | #endif |
3378 | 3382 | ||
3379 | #else /* CONFIG_NUMA */ | 3383 | #else /* CONFIG_NUMA */ |
3380 | 3384 | ||
3381 | static void set_zonelist_order(void) | 3385 | static void set_zonelist_order(void) |
3382 | { | 3386 | { |
3383 | current_zonelist_order = ZONELIST_ORDER_ZONE; | 3387 | current_zonelist_order = ZONELIST_ORDER_ZONE; |
3384 | } | 3388 | } |
3385 | 3389 | ||
3386 | static void build_zonelists(pg_data_t *pgdat) | 3390 | static void build_zonelists(pg_data_t *pgdat) |
3387 | { | 3391 | { |
3388 | int node, local_node; | 3392 | int node, local_node; |
3389 | enum zone_type j; | 3393 | enum zone_type j; |
3390 | struct zonelist *zonelist; | 3394 | struct zonelist *zonelist; |
3391 | 3395 | ||
3392 | local_node = pgdat->node_id; | 3396 | local_node = pgdat->node_id; |
3393 | 3397 | ||
3394 | zonelist = &pgdat->node_zonelists[0]; | 3398 | zonelist = &pgdat->node_zonelists[0]; |
3395 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); | 3399 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); |
3396 | 3400 | ||
3397 | /* | 3401 | /* |
3398 | * Now we build the zonelist so that it contains the zones | 3402 | * Now we build the zonelist so that it contains the zones |
3399 | * of all the other nodes. | 3403 | * of all the other nodes. |
3400 | * We don't want to pressure a particular node, so when | 3404 | * We don't want to pressure a particular node, so when |
3401 | * building the zones for node N, we make sure that the | 3405 | * building the zones for node N, we make sure that the |
3402 | * zones coming right after the local ones are those from | 3406 | * zones coming right after the local ones are those from |
3403 | * node N+1 (modulo N) | 3407 | * node N+1 (modulo N) |
3404 | */ | 3408 | */ |
3405 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 3409 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { |
3406 | if (!node_online(node)) | 3410 | if (!node_online(node)) |
3407 | continue; | 3411 | continue; |
3408 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 3412 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
3409 | MAX_NR_ZONES - 1); | 3413 | MAX_NR_ZONES - 1); |
3410 | } | 3414 | } |
3411 | for (node = 0; node < local_node; node++) { | 3415 | for (node = 0; node < local_node; node++) { |
3412 | if (!node_online(node)) | 3416 | if (!node_online(node)) |
3413 | continue; | 3417 | continue; |
3414 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 3418 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
3415 | MAX_NR_ZONES - 1); | 3419 | MAX_NR_ZONES - 1); |
3416 | } | 3420 | } |
3417 | 3421 | ||
3418 | zonelist->_zonerefs[j].zone = NULL; | 3422 | zonelist->_zonerefs[j].zone = NULL; |
3419 | zonelist->_zonerefs[j].zone_idx = 0; | 3423 | zonelist->_zonerefs[j].zone_idx = 0; |
3420 | } | 3424 | } |
3421 | 3425 | ||
3422 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | 3426 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ |
3423 | static void build_zonelist_cache(pg_data_t *pgdat) | 3427 | static void build_zonelist_cache(pg_data_t *pgdat) |
3424 | { | 3428 | { |
3425 | pgdat->node_zonelists[0].zlcache_ptr = NULL; | 3429 | pgdat->node_zonelists[0].zlcache_ptr = NULL; |
3426 | } | 3430 | } |
3427 | 3431 | ||
3428 | #endif /* CONFIG_NUMA */ | 3432 | #endif /* CONFIG_NUMA */ |
3429 | 3433 | ||
3430 | /* | 3434 | /* |
3431 | * Boot pageset table. One per cpu which is going to be used for all | 3435 | * Boot pageset table. One per cpu which is going to be used for all |
3432 | * zones and all nodes. The parameters will be set in such a way | 3436 | * zones and all nodes. The parameters will be set in such a way |
3433 | * that an item put on a list will immediately be handed over to | 3437 | * that an item put on a list will immediately be handed over to |
3434 | * the buddy list. This is safe since pageset manipulation is done | 3438 | * the buddy list. This is safe since pageset manipulation is done |
3435 | * with interrupts disabled. | 3439 | * with interrupts disabled. |
3436 | * | 3440 | * |
3437 | * The boot_pagesets must be kept even after bootup is complete for | 3441 | * The boot_pagesets must be kept even after bootup is complete for |
3438 | * unused processors and/or zones. They do play a role for bootstrapping | 3442 | * unused processors and/or zones. They do play a role for bootstrapping |
3439 | * hotplugged processors. | 3443 | * hotplugged processors. |
3440 | * | 3444 | * |
3441 | * zoneinfo_show() and maybe other functions do | 3445 | * zoneinfo_show() and maybe other functions do |
3442 | * not check if the processor is online before following the pageset pointer. | 3446 | * not check if the processor is online before following the pageset pointer. |
3443 | * Other parts of the kernel may not check if the zone is available. | 3447 | * Other parts of the kernel may not check if the zone is available. |
3444 | */ | 3448 | */ |
3445 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | 3449 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); |
3446 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | 3450 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); |
3447 | static void setup_zone_pageset(struct zone *zone); | 3451 | static void setup_zone_pageset(struct zone *zone); |
3448 | 3452 | ||
3449 | /* | 3453 | /* |
3450 | * Global mutex to protect against size modification of zonelists | 3454 | * Global mutex to protect against size modification of zonelists |
3451 | * as well as to serialize pageset setup for the new populated zone. | 3455 | * as well as to serialize pageset setup for the new populated zone. |
3452 | */ | 3456 | */ |
3453 | DEFINE_MUTEX(zonelists_mutex); | 3457 | DEFINE_MUTEX(zonelists_mutex); |
3454 | 3458 | ||
3455 | /* return value is int just for stop_machine() */ | 3459 | /* return value is int just for stop_machine() */ |
3456 | static int __build_all_zonelists(void *data) | 3460 | static int __build_all_zonelists(void *data) |
3457 | { | 3461 | { |
3458 | int nid; | 3462 | int nid; |
3459 | int cpu; | 3463 | int cpu; |
3460 | pg_data_t *self = data; | 3464 | pg_data_t *self = data; |
3461 | 3465 | ||
3462 | #ifdef CONFIG_NUMA | 3466 | #ifdef CONFIG_NUMA |
3463 | memset(node_load, 0, sizeof(node_load)); | 3467 | memset(node_load, 0, sizeof(node_load)); |
3464 | #endif | 3468 | #endif |
3465 | 3469 | ||
3466 | if (self && !node_online(self->node_id)) { | 3470 | if (self && !node_online(self->node_id)) { |
3467 | build_zonelists(self); | 3471 | build_zonelists(self); |
3468 | build_zonelist_cache(self); | 3472 | build_zonelist_cache(self); |
3469 | } | 3473 | } |
3470 | 3474 | ||
3471 | for_each_online_node(nid) { | 3475 | for_each_online_node(nid) { |
3472 | pg_data_t *pgdat = NODE_DATA(nid); | 3476 | pg_data_t *pgdat = NODE_DATA(nid); |
3473 | 3477 | ||
3474 | build_zonelists(pgdat); | 3478 | build_zonelists(pgdat); |
3475 | build_zonelist_cache(pgdat); | 3479 | build_zonelist_cache(pgdat); |
3476 | } | 3480 | } |
3477 | 3481 | ||
3478 | /* | 3482 | /* |
3479 | * Initialize the boot_pagesets that are going to be used | 3483 | * Initialize the boot_pagesets that are going to be used |
3480 | * for bootstrapping processors. The real pagesets for | 3484 | * for bootstrapping processors. The real pagesets for |
3481 | * each zone will be allocated later when the per cpu | 3485 | * each zone will be allocated later when the per cpu |
3482 | * allocator is available. | 3486 | * allocator is available. |
3483 | * | 3487 | * |
3484 | * boot_pagesets are used also for bootstrapping offline | 3488 | * boot_pagesets are used also for bootstrapping offline |
3485 | * cpus if the system is already booted because the pagesets | 3489 | * cpus if the system is already booted because the pagesets |
3486 | * are needed to initialize allocators on a specific cpu too. | 3490 | * are needed to initialize allocators on a specific cpu too. |
3487 | * F.e. the percpu allocator needs the page allocator which | 3491 | * F.e. the percpu allocator needs the page allocator which |
3488 | * needs the percpu allocator in order to allocate its pagesets | 3492 | * needs the percpu allocator in order to allocate its pagesets |
3489 | * (a chicken-egg dilemma). | 3493 | * (a chicken-egg dilemma). |
3490 | */ | 3494 | */ |
3491 | for_each_possible_cpu(cpu) { | 3495 | for_each_possible_cpu(cpu) { |
3492 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | 3496 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); |
3493 | 3497 | ||
3494 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | 3498 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
3495 | /* | 3499 | /* |
3496 | * We now know the "local memory node" for each node-- | 3500 | * We now know the "local memory node" for each node-- |
3497 | * i.e., the node of the first zone in the generic zonelist. | 3501 | * i.e., the node of the first zone in the generic zonelist. |
3498 | * Set up numa_mem percpu variable for on-line cpus. During | 3502 | * Set up numa_mem percpu variable for on-line cpus. During |
3499 | * boot, only the boot cpu should be on-line; we'll init the | 3503 | * boot, only the boot cpu should be on-line; we'll init the |
3500 | * secondary cpus' numa_mem as they come on-line. During | 3504 | * secondary cpus' numa_mem as they come on-line. During |
3501 | * node/memory hotplug, we'll fixup all on-line cpus. | 3505 | * node/memory hotplug, we'll fixup all on-line cpus. |
3502 | */ | 3506 | */ |
3503 | if (cpu_online(cpu)) | 3507 | if (cpu_online(cpu)) |
3504 | set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); | 3508 | set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); |
3505 | #endif | 3509 | #endif |
3506 | } | 3510 | } |
3507 | 3511 | ||
3508 | return 0; | 3512 | return 0; |
3509 | } | 3513 | } |
3510 | 3514 | ||
3511 | /* | 3515 | /* |
3512 | * Called with zonelists_mutex held always | 3516 | * Called with zonelists_mutex held always |
3513 | * unless system_state == SYSTEM_BOOTING. | 3517 | * unless system_state == SYSTEM_BOOTING. |
3514 | */ | 3518 | */ |
3515 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | 3519 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
3516 | { | 3520 | { |
3517 | set_zonelist_order(); | 3521 | set_zonelist_order(); |
3518 | 3522 | ||
3519 | if (system_state == SYSTEM_BOOTING) { | 3523 | if (system_state == SYSTEM_BOOTING) { |
3520 | __build_all_zonelists(NULL); | 3524 | __build_all_zonelists(NULL); |
3521 | mminit_verify_zonelist(); | 3525 | mminit_verify_zonelist(); |
3522 | cpuset_init_current_mems_allowed(); | 3526 | cpuset_init_current_mems_allowed(); |
3523 | } else { | 3527 | } else { |
3524 | /* we have to stop all cpus to guarantee there is no user | 3528 | /* we have to stop all cpus to guarantee there is no user |
3525 | of zonelist */ | 3529 | of zonelist */ |
3526 | #ifdef CONFIG_MEMORY_HOTPLUG | 3530 | #ifdef CONFIG_MEMORY_HOTPLUG |
3527 | if (zone) | 3531 | if (zone) |
3528 | setup_zone_pageset(zone); | 3532 | setup_zone_pageset(zone); |
3529 | #endif | 3533 | #endif |
3530 | stop_machine(__build_all_zonelists, pgdat, NULL); | 3534 | stop_machine(__build_all_zonelists, pgdat, NULL); |
3531 | /* cpuset refresh routine should be here */ | 3535 | /* cpuset refresh routine should be here */ |
3532 | } | 3536 | } |
3533 | vm_total_pages = nr_free_pagecache_pages(); | 3537 | vm_total_pages = nr_free_pagecache_pages(); |
3534 | /* | 3538 | /* |
3535 | * Disable grouping by mobility if the number of pages in the | 3539 | * Disable grouping by mobility if the number of pages in the |
3536 | * system is too low to allow the mechanism to work. It would be | 3540 | * system is too low to allow the mechanism to work. It would be |
3537 | * more accurate, but expensive to check per-zone. This check is | 3541 | * more accurate, but expensive to check per-zone. This check is |
3538 | * made on memory-hotadd so a system can start with mobility | 3542 | * made on memory-hotadd so a system can start with mobility |
3539 | * disabled and enable it later | 3543 | * disabled and enable it later |
3540 | */ | 3544 | */ |
3541 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) | 3545 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) |
3542 | page_group_by_mobility_disabled = 1; | 3546 | page_group_by_mobility_disabled = 1; |
3543 | else | 3547 | else |
3544 | page_group_by_mobility_disabled = 0; | 3548 | page_group_by_mobility_disabled = 0; |
3545 | 3549 | ||
3546 | printk("Built %i zonelists in %s order, mobility grouping %s. " | 3550 | printk("Built %i zonelists in %s order, mobility grouping %s. " |
3547 | "Total pages: %ld\n", | 3551 | "Total pages: %ld\n", |
3548 | nr_online_nodes, | 3552 | nr_online_nodes, |
3549 | zonelist_order_name[current_zonelist_order], | 3553 | zonelist_order_name[current_zonelist_order], |
3550 | page_group_by_mobility_disabled ? "off" : "on", | 3554 | page_group_by_mobility_disabled ? "off" : "on", |
3551 | vm_total_pages); | 3555 | vm_total_pages); |
3552 | #ifdef CONFIG_NUMA | 3556 | #ifdef CONFIG_NUMA |
3553 | printk("Policy zone: %s\n", zone_names[policy_zone]); | 3557 | printk("Policy zone: %s\n", zone_names[policy_zone]); |
3554 | #endif | 3558 | #endif |
3555 | } | 3559 | } |
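The comment above build_all_zonelists() states the locking contract: after boot, callers must hold zonelists_mutex so the stop_machine() rebuild and the pageset setup are serialized. A hedged sketch, not part of this patch, of how a memory hot-add path would be expected to call it (example_online_zone() is a placeholder name for illustration only):

/*
 * Hedged sketch: a post-boot caller honouring the zonelists_mutex
 * contract documented above. Only zonelists_mutex and
 * build_all_zonelists() come from the code shown here.
 */
static int example_online_zone(pg_data_t *pgdat, struct zone *zone)
{
	mutex_lock(&zonelists_mutex);		/* serialize zonelist rebuilds */
	build_all_zonelists(pgdat, zone);	/* rebuilds via stop_machine() */
	mutex_unlock(&zonelists_mutex);
	return 0;
}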
3556 | 3560 | ||
3557 | /* | 3561 | /* |
3558 | * Helper functions to size the waitqueue hash table. | 3562 | * Helper functions to size the waitqueue hash table. |
3559 | * Essentially these want to choose hash table sizes sufficiently | 3563 | * Essentially these want to choose hash table sizes sufficiently |
3560 | * large so that collisions trying to wait on pages are rare. | 3564 | * large so that collisions trying to wait on pages are rare. |
3561 | * But in fact, the number of active page waitqueues on typical | 3565 | * But in fact, the number of active page waitqueues on typical |
3562 | * systems is ridiculously low, less than 200. So this is even | 3566 | * systems is ridiculously low, less than 200. So this is even |
3563 | * conservative, even though it seems large. | 3567 | * conservative, even though it seems large. |
3564 | * | 3568 | * |
3565 | * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to | 3569 | * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to |
3566 | * waitqueues, i.e. the size of the waitq table given the number of pages. | 3570 | * waitqueues, i.e. the size of the waitq table given the number of pages. |
3567 | */ | 3571 | */ |
3568 | #define PAGES_PER_WAITQUEUE 256 | 3572 | #define PAGES_PER_WAITQUEUE 256 |
3569 | 3573 | ||
3570 | #ifndef CONFIG_MEMORY_HOTPLUG | 3574 | #ifndef CONFIG_MEMORY_HOTPLUG |
3571 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | 3575 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) |
3572 | { | 3576 | { |
3573 | unsigned long size = 1; | 3577 | unsigned long size = 1; |
3574 | 3578 | ||
3575 | pages /= PAGES_PER_WAITQUEUE; | 3579 | pages /= PAGES_PER_WAITQUEUE; |
3576 | 3580 | ||
3577 | while (size < pages) | 3581 | while (size < pages) |
3578 | size <<= 1; | 3582 | size <<= 1; |
3579 | 3583 | ||
3580 | /* | 3584 | /* |
3581 | * Once we have dozens or even hundreds of threads sleeping | 3585 | * Once we have dozens or even hundreds of threads sleeping |
3582 | * on IO we've got bigger problems than wait queue collision. | 3586 | * on IO we've got bigger problems than wait queue collision. |
3583 | * Limit the size of the wait table to a reasonable size. | 3587 | * Limit the size of the wait table to a reasonable size. |
3584 | */ | 3588 | */ |
3585 | size = min(size, 4096UL); | 3589 | size = min(size, 4096UL); |
3586 | 3590 | ||
3587 | return max(size, 4UL); | 3591 | return max(size, 4UL); |
3588 | } | 3592 | } |
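Worked example (assuming 4 KiB pages): a 1 GiB zone has 262,144 pages, so dividing by PAGES_PER_WAITQUEUE gives 1,024; the doubling loop stops at size = 1,024, which already lies inside the [4, 4096] clamp, so the zone gets 1,024 hashed waitqueues.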
3589 | #else | 3593 | #else |
3590 | /* | 3594 | /* |
3591 | * A zone's size might be changed by hot-add, so it is not possible to determine | 3595 | * A zone's size might be changed by hot-add, so it is not possible to determine |
3592 | * a suitable size for its wait_table. So we use the maximum size now. | 3596 | * a suitable size for its wait_table. So we use the maximum size now. |
3593 | * | 3597 | * |
3594 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: | 3598 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: |
3595 | * | 3599 | * |
3596 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. | 3600 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. |
3597 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. | 3601 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. |
3598 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. | 3602 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. |
3599 | * | 3603 | * |
3600 | * The maximum entries are prepared when a zone's memory is (512K + 256) pages | 3604 | * The maximum entries are prepared when a zone's memory is (512K + 256) pages |
3601 | * or more by the traditional way. (See above). It equals: | 3605 | * or more by the traditional way. (See above). It equals: |
3602 | * | 3606 | * |
3603 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. | 3607 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. |
3604 | * ia64(16K page size) : = ( 8G + 4M)byte. | 3608 | * ia64(16K page size) : = ( 8G + 4M)byte. |
3605 | * powerpc (64K page size) : = (32G +16M)byte. | 3609 | * powerpc (64K page size) : = (32G +16M)byte. |
3606 | */ | 3610 | */ |
3607 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | 3611 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) |
3608 | { | 3612 | { |
3609 | return 4096UL; | 3613 | return 4096UL; |
3610 | } | 3614 | } |
3611 | #endif | 3615 | #endif |
3612 | 3616 | ||
3613 | /* | 3617 | /* |
3614 | * This is an integer logarithm so that shifts can be used later | 3618 | * This is an integer logarithm so that shifts can be used later |
3615 | * to extract the more random high bits from the multiplicative | 3619 | * to extract the more random high bits from the multiplicative |
3616 | * hash function before the remainder is taken. | 3620 | * hash function before the remainder is taken. |
3617 | */ | 3621 | */ |
3618 | static inline unsigned long wait_table_bits(unsigned long size) | 3622 | static inline unsigned long wait_table_bits(unsigned long size) |
3619 | { | 3623 | { |
3620 | return ffz(~size); | 3624 | return ffz(~size); |
3621 | } | 3625 | } |
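Worked example: ffz(~size) finds the first zero bit of ~size, i.e. the position of the lowest set bit of size; since the sizes produced above are powers of two, this is just log2(size), so a 4,096-entry table yields wait_table_bits = 12.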
3622 | 3626 | ||
3623 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 3627 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
3624 | 3628 | ||
3625 | /* | 3629 | /* |
3626 | * Check if a pageblock contains reserved pages | 3630 | * Check if a pageblock contains reserved pages |
3627 | */ | 3631 | */ |
3628 | static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) | 3632 | static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) |
3629 | { | 3633 | { |
3630 | unsigned long pfn; | 3634 | unsigned long pfn; |
3631 | 3635 | ||
3632 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 3636 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
3633 | if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) | 3637 | if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) |
3634 | return 1; | 3638 | return 1; |
3635 | } | 3639 | } |
3636 | return 0; | 3640 | return 0; |
3637 | } | 3641 | } |
3638 | 3642 | ||
3639 | /* | 3643 | /* |
3640 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | 3644 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number |
3641 | * of blocks reserved is based on min_wmark_pages(zone). The memory within | 3645 | * of blocks reserved is based on min_wmark_pages(zone). The memory within |
3642 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes | 3646 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes |
3643 | * higher will lead to a bigger reserve which will get freed as contiguous | 3647 | * higher will lead to a bigger reserve which will get freed as contiguous |
3644 | * blocks as reclaim kicks in | 3648 | * blocks as reclaim kicks in |
3645 | */ | 3649 | */ |
3646 | static void setup_zone_migrate_reserve(struct zone *zone) | 3650 | static void setup_zone_migrate_reserve(struct zone *zone) |
3647 | { | 3651 | { |
3648 | unsigned long start_pfn, pfn, end_pfn, block_end_pfn; | 3652 | unsigned long start_pfn, pfn, end_pfn, block_end_pfn; |
3649 | struct page *page; | 3653 | struct page *page; |
3650 | unsigned long block_migratetype; | 3654 | unsigned long block_migratetype; |
3651 | int reserve; | 3655 | int reserve; |
3652 | 3656 | ||
3653 | /* | 3657 | /* |
3654 | * Get the start pfn, end pfn and the number of blocks to reserve | 3658 | * Get the start pfn, end pfn and the number of blocks to reserve |
3655 | * We have to be careful to be aligned to pageblock_nr_pages to | 3659 | * We have to be careful to be aligned to pageblock_nr_pages to |
3656 | * make sure that we always check pfn_valid for the first page in | 3660 | * make sure that we always check pfn_valid for the first page in |
3657 | * the block. | 3661 | * the block. |
3658 | */ | 3662 | */ |
3659 | start_pfn = zone->zone_start_pfn; | 3663 | start_pfn = zone->zone_start_pfn; |
3660 | end_pfn = start_pfn + zone->spanned_pages; | 3664 | end_pfn = start_pfn + zone->spanned_pages; |
3661 | start_pfn = roundup(start_pfn, pageblock_nr_pages); | 3665 | start_pfn = roundup(start_pfn, pageblock_nr_pages); |
3662 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> | 3666 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
3663 | pageblock_order; | 3667 | pageblock_order; |
3664 | 3668 | ||
3665 | /* | 3669 | /* |
3666 | * Reserve blocks are generally in place to help high-order atomic | 3670 | * Reserve blocks are generally in place to help high-order atomic |
3667 | * allocations that are short-lived. A min_free_kbytes value that | 3671 | * allocations that are short-lived. A min_free_kbytes value that |
3668 | * would result in more than 2 reserve blocks for atomic allocations | 3672 | * would result in more than 2 reserve blocks for atomic allocations |
3669 | * is assumed to be in place to help anti-fragmentation for the | 3673 | * is assumed to be in place to help anti-fragmentation for the |
3670 | * future allocation of hugepages at runtime. | 3674 | * future allocation of hugepages at runtime. |
3671 | */ | 3675 | */ |
3672 | reserve = min(2, reserve); | 3676 | reserve = min(2, reserve); |
3673 | 3677 | ||
3674 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 3678 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
3675 | if (!pfn_valid(pfn)) | 3679 | if (!pfn_valid(pfn)) |
3676 | continue; | 3680 | continue; |
3677 | page = pfn_to_page(pfn); | 3681 | page = pfn_to_page(pfn); |
3678 | 3682 | ||
3679 | /* Watch out for overlapping nodes */ | 3683 | /* Watch out for overlapping nodes */ |
3680 | if (page_to_nid(page) != zone_to_nid(zone)) | 3684 | if (page_to_nid(page) != zone_to_nid(zone)) |
3681 | continue; | 3685 | continue; |
3682 | 3686 | ||
3683 | block_migratetype = get_pageblock_migratetype(page); | 3687 | block_migratetype = get_pageblock_migratetype(page); |
3684 | 3688 | ||
3685 | /* Only test what is necessary when the reserves are not met */ | 3689 | /* Only test what is necessary when the reserves are not met */ |
3686 | if (reserve > 0) { | 3690 | if (reserve > 0) { |
3687 | /* | 3691 | /* |
3688 | * Blocks with reserved pages will never be freed, skip | 3692 | * Blocks with reserved pages will never be freed, skip |
3689 | * them. | 3693 | * them. |
3690 | */ | 3694 | */ |
3691 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | 3695 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); |
3692 | if (pageblock_is_reserved(pfn, block_end_pfn)) | 3696 | if (pageblock_is_reserved(pfn, block_end_pfn)) |
3693 | continue; | 3697 | continue; |
3694 | 3698 | ||
3695 | /* If this block is reserved, account for it */ | 3699 | /* If this block is reserved, account for it */ |
3696 | if (block_migratetype == MIGRATE_RESERVE) { | 3700 | if (block_migratetype == MIGRATE_RESERVE) { |
3697 | reserve--; | 3701 | reserve--; |
3698 | continue; | 3702 | continue; |
3699 | } | 3703 | } |
3700 | 3704 | ||
3701 | /* Suitable for reserving if this block is movable */ | 3705 | /* Suitable for reserving if this block is movable */ |
3702 | if (block_migratetype == MIGRATE_MOVABLE) { | 3706 | if (block_migratetype == MIGRATE_MOVABLE) { |
3703 | set_pageblock_migratetype(page, | 3707 | set_pageblock_migratetype(page, |
3704 | MIGRATE_RESERVE); | 3708 | MIGRATE_RESERVE); |
3705 | move_freepages_block(zone, page, | 3709 | move_freepages_block(zone, page, |
3706 | MIGRATE_RESERVE); | 3710 | MIGRATE_RESERVE); |
3707 | reserve--; | 3711 | reserve--; |
3708 | continue; | 3712 | continue; |
3709 | } | 3713 | } |
3710 | } | 3714 | } |
3711 | 3715 | ||
3712 | /* | 3716 | /* |
3713 | * If the reserve is met and this is a previous reserved block, | 3717 | * If the reserve is met and this is a previous reserved block, |
3714 | * take it back | 3718 | * take it back |
3715 | */ | 3719 | */ |
3716 | if (block_migratetype == MIGRATE_RESERVE) { | 3720 | if (block_migratetype == MIGRATE_RESERVE) { |
3717 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 3721 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
3718 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | 3722 | move_freepages_block(zone, page, MIGRATE_MOVABLE); |
3719 | } | 3723 | } |
3720 | } | 3724 | } |
3721 | } | 3725 | } |
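Worked example (assuming pageblock_order = 9, i.e. 512 pages per block, with 4 KiB pages): a zone whose min watermark is 1,024 pages gets reserve = roundup(1024, 512) >> 9 = 2, which the min(2, reserve) clamp leaves unchanged, so at most two MIGRATE_MOVABLE pageblocks are converted to MIGRATE_RESERVE; a tiny zone with a 100-page watermark would end up with roundup(100, 512) >> 9 = 1 reserve block.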
3722 | 3726 | ||
3723 | /* | 3727 | /* |
3724 | * Initially all pages are reserved - free ones are freed | 3728 | * Initially all pages are reserved - free ones are freed |
3725 | * up by free_all_bootmem() once the early boot process is | 3729 | * up by free_all_bootmem() once the early boot process is |
3726 | * done. Non-atomic initialization, single-pass. | 3730 | * done. Non-atomic initialization, single-pass. |
3727 | */ | 3731 | */ |
3728 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 3732 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
3729 | unsigned long start_pfn, enum memmap_context context) | 3733 | unsigned long start_pfn, enum memmap_context context) |
3730 | { | 3734 | { |
3731 | struct page *page; | 3735 | struct page *page; |
3732 | unsigned long end_pfn = start_pfn + size; | 3736 | unsigned long end_pfn = start_pfn + size; |
3733 | unsigned long pfn; | 3737 | unsigned long pfn; |
3734 | struct zone *z; | 3738 | struct zone *z; |
3735 | 3739 | ||
3736 | if (highest_memmap_pfn < end_pfn - 1) | 3740 | if (highest_memmap_pfn < end_pfn - 1) |
3737 | highest_memmap_pfn = end_pfn - 1; | 3741 | highest_memmap_pfn = end_pfn - 1; |
3738 | 3742 | ||
3739 | z = &NODE_DATA(nid)->node_zones[zone]; | 3743 | z = &NODE_DATA(nid)->node_zones[zone]; |
3740 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 3744 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
3741 | /* | 3745 | /* |
3742 | * There can be holes in boot-time mem_map[]s | 3746 | * There can be holes in boot-time mem_map[]s |
3743 | * handed to this function. They do not | 3747 | * handed to this function. They do not |
3744 | * exist on hotplugged memory. | 3748 | * exist on hotplugged memory. |
3745 | */ | 3749 | */ |
3746 | if (context == MEMMAP_EARLY) { | 3750 | if (context == MEMMAP_EARLY) { |
3747 | if (!early_pfn_valid(pfn)) | 3751 | if (!early_pfn_valid(pfn)) |
3748 | continue; | 3752 | continue; |
3749 | if (!early_pfn_in_nid(pfn, nid)) | 3753 | if (!early_pfn_in_nid(pfn, nid)) |
3750 | continue; | 3754 | continue; |
3751 | } | 3755 | } |
3752 | page = pfn_to_page(pfn); | 3756 | page = pfn_to_page(pfn); |
3753 | set_page_links(page, zone, nid, pfn); | 3757 | set_page_links(page, zone, nid, pfn); |
3754 | mminit_verify_page_links(page, zone, nid, pfn); | 3758 | mminit_verify_page_links(page, zone, nid, pfn); |
3755 | init_page_count(page); | 3759 | init_page_count(page); |
3756 | reset_page_mapcount(page); | 3760 | reset_page_mapcount(page); |
3757 | SetPageReserved(page); | 3761 | SetPageReserved(page); |
3758 | /* | 3762 | /* |
3759 | * Mark the block movable so that blocks are reserved for | 3763 | * Mark the block movable so that blocks are reserved for |
3760 | * movable at startup. This will force kernel allocations | 3764 | * movable at startup. This will force kernel allocations |
3761 | * to reserve their blocks rather than leaking throughout | 3765 | * to reserve their blocks rather than leaking throughout |
3762 | * the address space during boot when many long-lived | 3766 | * the address space during boot when many long-lived |
3763 | * kernel allocations are made. Later some blocks near | 3767 | * kernel allocations are made. Later some blocks near |
3764 | * the start are marked MIGRATE_RESERVE by | 3768 | * the start are marked MIGRATE_RESERVE by |
3765 | * setup_zone_migrate_reserve() | 3769 | * setup_zone_migrate_reserve() |
3766 | * | 3770 | * |
3767 | * The pageblock bitmap is created for the zone's valid pfn range, but | 3771 | * The pageblock bitmap is created for the zone's valid pfn range, but |
3768 | * the memmap can be created for invalid pages (for alignment), so | 3772 | * the memmap can be created for invalid pages (for alignment), so |
3769 | * check here that set_pageblock_migratetype() is not called on a | 3773 | * check here that set_pageblock_migratetype() is not called on a |
3770 | * pfn outside the zone. | 3774 | * pfn outside the zone. |
3771 | */ | 3775 | */ |
3772 | if ((z->zone_start_pfn <= pfn) | 3776 | if ((z->zone_start_pfn <= pfn) |
3773 | && (pfn < z->zone_start_pfn + z->spanned_pages) | 3777 | && (pfn < z->zone_start_pfn + z->spanned_pages) |
3774 | && !(pfn & (pageblock_nr_pages - 1))) | 3778 | && !(pfn & (pageblock_nr_pages - 1))) |
3775 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 3779 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
3776 | 3780 | ||
3777 | INIT_LIST_HEAD(&page->lru); | 3781 | INIT_LIST_HEAD(&page->lru); |
3778 | #ifdef WANT_PAGE_VIRTUAL | 3782 | #ifdef WANT_PAGE_VIRTUAL |
3779 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | 3783 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
3780 | if (!is_highmem_idx(zone)) | 3784 | if (!is_highmem_idx(zone)) |
3781 | set_page_address(page, __va(pfn << PAGE_SHIFT)); | 3785 | set_page_address(page, __va(pfn << PAGE_SHIFT)); |
3782 | #endif | 3786 | #endif |
3783 | } | 3787 | } |
3784 | } | 3788 | } |
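Worked example: with pageblock_nr_pages = 512, the test !(pfn & (pageblock_nr_pages - 1)) is true only for pfns that are multiples of 512, so set_pageblock_migratetype() runs once per pageblock rather than once per page, and only when that block's first pfn actually lies inside the zone being initialised.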
3785 | 3789 | ||
3786 | static void __meminit zone_init_free_lists(struct zone *zone) | 3790 | static void __meminit zone_init_free_lists(struct zone *zone) |
3787 | { | 3791 | { |
3788 | int order, t; | 3792 | int order, t; |
3789 | for_each_migratetype_order(order, t) { | 3793 | for_each_migratetype_order(order, t) { |
3790 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); | 3794 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
3791 | zone->free_area[order].nr_free = 0; | 3795 | zone->free_area[order].nr_free = 0; |
3792 | } | 3796 | } |
3793 | } | 3797 | } |
3794 | 3798 | ||
3795 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 3799 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
3796 | #define memmap_init(size, nid, zone, start_pfn) \ | 3800 | #define memmap_init(size, nid, zone, start_pfn) \ |
3797 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 3801 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
3798 | #endif | 3802 | #endif |
3799 | 3803 | ||
3800 | static int __meminit zone_batchsize(struct zone *zone) | 3804 | static int __meminit zone_batchsize(struct zone *zone) |
3801 | { | 3805 | { |
3802 | #ifdef CONFIG_MMU | 3806 | #ifdef CONFIG_MMU |
3803 | int batch; | 3807 | int batch; |
3804 | 3808 | ||
3805 | /* | 3809 | /* |
3806 | * The per-cpu-pages pools are set to around 1000th of the | 3810 | * The per-cpu-pages pools are set to around 1000th of the |
3807 | * size of the zone. But no more than 1/2 of a meg. | 3811 | * size of the zone. But no more than 1/2 of a meg. |
3808 | * | 3812 | * |
3809 | * OK, so we don't know how big the cache is. So guess. | 3813 | * OK, so we don't know how big the cache is. So guess. |
3810 | */ | 3814 | */ |
3811 | batch = zone->present_pages / 1024; | 3815 | batch = zone->present_pages / 1024; |
3812 | if (batch * PAGE_SIZE > 512 * 1024) | 3816 | if (batch * PAGE_SIZE > 512 * 1024) |
3813 | batch = (512 * 1024) / PAGE_SIZE; | 3817 | batch = (512 * 1024) / PAGE_SIZE; |
3814 | batch /= 4; /* We effectively *= 4 below */ | 3818 | batch /= 4; /* We effectively *= 4 below */ |
3815 | if (batch < 1) | 3819 | if (batch < 1) |
3816 | batch = 1; | 3820 | batch = 1; |
3817 | 3821 | ||
3818 | /* | 3822 | /* |
3819 | * Clamp the batch to a 2^n - 1 value. Having a power | 3823 | * Clamp the batch to a 2^n - 1 value. Having a power |
3820 | * of 2 value was found to be more likely to have | 3824 | * of 2 value was found to be more likely to have |
3821 | * suboptimal cache aliasing properties in some cases. | 3825 | * suboptimal cache aliasing properties in some cases. |
3822 | * | 3826 | * |
3823 | * For example if 2 tasks are alternately allocating | 3827 | * For example if 2 tasks are alternately allocating |
3824 | * batches of pages, one task can end up with a lot | 3828 | * batches of pages, one task can end up with a lot |
3825 | * of pages of one half of the possible page colors | 3829 | * of pages of one half of the possible page colors |
3826 | * and the other with pages of the other colors. | 3830 | * and the other with pages of the other colors. |
3827 | */ | 3831 | */ |
3828 | batch = rounddown_pow_of_two(batch + batch/2) - 1; | 3832 | batch = rounddown_pow_of_two(batch + batch/2) - 1; |
3829 | 3833 | ||
3830 | return batch; | 3834 | return batch; |
3831 | 3835 | ||
3832 | #else | 3836 | #else |
3833 | /* The deferral and batching of frees should be suppressed under NOMMU | 3837 | /* The deferral and batching of frees should be suppressed under NOMMU |
3834 | * conditions. | 3838 | * conditions. |
3835 | * | 3839 | * |
3836 | * The problem is that NOMMU needs to be able to allocate large chunks | 3840 | * The problem is that NOMMU needs to be able to allocate large chunks |
3837 | * of contiguous memory as there's no hardware page translation to | 3841 | * of contiguous memory as there's no hardware page translation to |
3838 | * assemble apparent contiguous memory from discontiguous pages. | 3842 | * assemble apparent contiguous memory from discontiguous pages. |
3839 | * | 3843 | * |
3840 | * Queueing large contiguous runs of pages for batching, however, | 3844 | * Queueing large contiguous runs of pages for batching, however, |
3841 | * causes the pages to actually be freed in smaller chunks. As there | 3845 | * causes the pages to actually be freed in smaller chunks. As there |
3842 | * can be a significant delay between the individual batches being | 3846 | * can be a significant delay between the individual batches being |
3843 | * recycled, this leads to the once large chunks of space being | 3847 | * recycled, this leads to the once large chunks of space being |
3844 | * fragmented and becoming unavailable for high-order allocations. | 3848 | * fragmented and becoming unavailable for high-order allocations. |
3845 | */ | 3849 | */ |
3846 | return 0; | 3850 | return 0; |
3847 | #endif | 3851 | #endif |
3848 | } | 3852 | } |
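To make the arithmetic above concrete, here is a minimal userspace sketch (illustration only; rounddown_pow_of_two() is re-implemented locally, and the 1 GiB zone with 4 KiB pages is a hypothetical):

/*
 * Userspace illustration of the zone_batchsize() arithmetic above for a
 * hypothetical 1 GiB zone with 4 KiB pages. Not kernel code.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* illustrative stand-in for the kernel helper of the same name */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned long present_pages = 262144;	/* 1 GiB / 4 KiB */
	unsigned long batch;

	batch = present_pages / 1024;		/* 256 */
	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;	/* capped to 128 */
	batch /= 4;				/* 32 */
	if (batch < 1)
		batch = 1;
	batch = rounddown_pow_of_two(batch + batch / 2) - 1;	/* 31 */

	printf("batch = %lu, pcp->high = %lu\n", batch, 6 * batch);
	return 0;
}

It prints batch = 31, the value setup_pageset() below turns into the per-cpu high/batch pair.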
3849 | 3853 | ||
3850 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 3854 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) |
3851 | { | 3855 | { |
3852 | struct per_cpu_pages *pcp; | 3856 | struct per_cpu_pages *pcp; |
3853 | int migratetype; | 3857 | int migratetype; |
3854 | 3858 | ||
3855 | memset(p, 0, sizeof(*p)); | 3859 | memset(p, 0, sizeof(*p)); |
3856 | 3860 | ||
3857 | pcp = &p->pcp; | 3861 | pcp = &p->pcp; |
3858 | pcp->count = 0; | 3862 | pcp->count = 0; |
3859 | pcp->high = 6 * batch; | 3863 | pcp->high = 6 * batch; |
3860 | pcp->batch = max(1UL, 1 * batch); | 3864 | pcp->batch = max(1UL, 1 * batch); |
3861 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) | 3865 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) |
3862 | INIT_LIST_HEAD(&pcp->lists[migratetype]); | 3866 | INIT_LIST_HEAD(&pcp->lists[migratetype]); |
3863 | } | 3867 | } |
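For the batch = 31 example above, setup_pageset() leaves pcp->count at 0 and sets pcp->high = 186 and pcp->batch = 31; on a NOMMU build zone_batchsize() returns 0, so pcp->high becomes 0 and the max(1UL, ...) expression clamps pcp->batch up to 1.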
3864 | 3868 | ||
3865 | /* | 3869 | /* |
3866 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | 3870 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist |
3867 | * to the value high for the pageset p. | 3871 | * to the value high for the pageset p. |
3868 | */ | 3872 | */ |
3869 | 3873 | ||
3870 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | 3874 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, |
3871 | unsigned long high) | 3875 | unsigned long high) |
3872 | { | 3876 | { |
3873 | struct per_cpu_pages *pcp; | 3877 | struct per_cpu_pages *pcp; |
3874 | 3878 | ||
3875 | pcp = &p->pcp; | 3879 | pcp = &p->pcp; |
3876 | pcp->high = high; | 3880 | pcp->high = high; |
3877 | pcp->batch = max(1UL, high/4); | 3881 | pcp->batch = max(1UL, high/4); |
3878 | if ((high/4) > (PAGE_SHIFT * 8)) | 3882 | if ((high/4) > (PAGE_SHIFT * 8)) |
3879 | pcp->batch = PAGE_SHIFT * 8; | 3883 | pcp->batch = PAGE_SHIFT * 8; |
3880 | } | 3884 | } |
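Worked example: with percpu_pagelist_fraction set to 8 for the 262,144-page zone above, setup_zone_pageset() passes high = 32,768, so pcp->high = 32,768 and high/4 = 8,192; since that exceeds PAGE_SHIFT * 8 = 96 (for 4 KiB pages), pcp->batch is capped at 96.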
3881 | 3885 | ||
3882 | static void __meminit setup_zone_pageset(struct zone *zone) | 3886 | static void __meminit setup_zone_pageset(struct zone *zone) |
3883 | { | 3887 | { |
3884 | int cpu; | 3888 | int cpu; |
3885 | 3889 | ||
3886 | zone->pageset = alloc_percpu(struct per_cpu_pageset); | 3890 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
3887 | 3891 | ||
3888 | for_each_possible_cpu(cpu) { | 3892 | for_each_possible_cpu(cpu) { |
3889 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | 3893 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); |
3890 | 3894 | ||
3891 | setup_pageset(pcp, zone_batchsize(zone)); | 3895 | setup_pageset(pcp, zone_batchsize(zone)); |
3892 | 3896 | ||
3893 | if (percpu_pagelist_fraction) | 3897 | if (percpu_pagelist_fraction) |
3894 | setup_pagelist_highmark(pcp, | 3898 | setup_pagelist_highmark(pcp, |
3895 | (zone->present_pages / | 3899 | (zone->present_pages / |
3896 | percpu_pagelist_fraction)); | 3900 | percpu_pagelist_fraction)); |
3897 | } | 3901 | } |
3898 | } | 3902 | } |
3899 | 3903 | ||
3900 | /* | 3904 | /* |
3901 | * Allocate per cpu pagesets and initialize them. | 3905 | * Allocate per cpu pagesets and initialize them. |
3902 | * Before this call only boot pagesets were available. | 3906 | * Before this call only boot pagesets were available. |
3903 | */ | 3907 | */ |
3904 | void __init setup_per_cpu_pageset(void) | 3908 | void __init setup_per_cpu_pageset(void) |
3905 | { | 3909 | { |
3906 | struct zone *zone; | 3910 | struct zone *zone; |
3907 | 3911 | ||
3908 | for_each_populated_zone(zone) | 3912 | for_each_populated_zone(zone) |
3909 | setup_zone_pageset(zone); | 3913 | setup_zone_pageset(zone); |
3910 | } | 3914 | } |
3911 | 3915 | ||
3912 | static noinline __init_refok | 3916 | static noinline __init_refok |
3913 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 3917 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
3914 | { | 3918 | { |
3915 | int i; | 3919 | int i; |
3916 | struct pglist_data *pgdat = zone->zone_pgdat; | 3920 | struct pglist_data *pgdat = zone->zone_pgdat; |
3917 | size_t alloc_size; | 3921 | size_t alloc_size; |
3918 | 3922 | ||
3919 | /* | 3923 | /* |
3920 | * The per-page waitqueue mechanism uses hashed waitqueues | 3924 | * The per-page waitqueue mechanism uses hashed waitqueues |
3921 | * per zone. | 3925 | * per zone. |
3922 | */ | 3926 | */ |
3923 | zone->wait_table_hash_nr_entries = | 3927 | zone->wait_table_hash_nr_entries = |
3924 | wait_table_hash_nr_entries(zone_size_pages); | 3928 | wait_table_hash_nr_entries(zone_size_pages); |
3925 | zone->wait_table_bits = | 3929 | zone->wait_table_bits = |
3926 | wait_table_bits(zone->wait_table_hash_nr_entries); | 3930 | wait_table_bits(zone->wait_table_hash_nr_entries); |
3927 | alloc_size = zone->wait_table_hash_nr_entries | 3931 | alloc_size = zone->wait_table_hash_nr_entries |
3928 | * sizeof(wait_queue_head_t); | 3932 | * sizeof(wait_queue_head_t); |
3929 | 3933 | ||
3930 | if (!slab_is_available()) { | 3934 | if (!slab_is_available()) { |
3931 | zone->wait_table = (wait_queue_head_t *) | 3935 | zone->wait_table = (wait_queue_head_t *) |
3932 | alloc_bootmem_node_nopanic(pgdat, alloc_size); | 3936 | alloc_bootmem_node_nopanic(pgdat, alloc_size); |
3933 | } else { | 3937 | } else { |
3934 | /* | 3938 | /* |
3935 | * This case means that a zone whose size was 0 gets new memory | 3939 | * This case means that a zone whose size was 0 gets new memory |
3936 | * via memory hot-add. | 3940 | * via memory hot-add. |
3937 | * But it may be the case that a new node was hot-added. In | 3941 | * But it may be the case that a new node was hot-added. In |
3938 | * this case vmalloc() will not be able to use this new node's | 3942 | * this case vmalloc() will not be able to use this new node's |
3939 | * memory - this wait_table must be initialized to use this new | 3943 | * memory - this wait_table must be initialized to use this new |
3940 | * node itself as well. | 3944 | * node itself as well. |
3941 | * To use this new node's memory, further consideration will be | 3945 | * To use this new node's memory, further consideration will be |
3942 | * necessary. | 3946 | * necessary. |
3943 | */ | 3947 | */ |
3944 | zone->wait_table = vmalloc(alloc_size); | 3948 | zone->wait_table = vmalloc(alloc_size); |
3945 | } | 3949 | } |
3946 | if (!zone->wait_table) | 3950 | if (!zone->wait_table) |
3947 | return -ENOMEM; | 3951 | return -ENOMEM; |
3948 | 3952 | ||
3949 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) | 3953 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) |
3950 | init_waitqueue_head(zone->wait_table + i); | 3954 | init_waitqueue_head(zone->wait_table + i); |
3951 | 3955 | ||
3952 | return 0; | 3956 | return 0; |
3953 | } | 3957 | } |
3954 | 3958 | ||
3955 | static __meminit void zone_pcp_init(struct zone *zone) | 3959 | static __meminit void zone_pcp_init(struct zone *zone) |
3956 | { | 3960 | { |
3957 | /* | 3961 | /* |
3958 | * per cpu subsystem is not up at this point. The following code | 3962 | * per cpu subsystem is not up at this point. The following code |
3959 | * relies on the ability of the linker to provide the | 3963 | * relies on the ability of the linker to provide the |
3960 | * offset of a (static) per cpu variable into the per cpu area. | 3964 | * offset of a (static) per cpu variable into the per cpu area. |
3961 | */ | 3965 | */ |
3962 | zone->pageset = &boot_pageset; | 3966 | zone->pageset = &boot_pageset; |
3963 | 3967 | ||
3964 | if (zone->present_pages) | 3968 | if (zone->present_pages) |
3965 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", | 3969 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
3966 | zone->name, zone->present_pages, | 3970 | zone->name, zone->present_pages, |
3967 | zone_batchsize(zone)); | 3971 | zone_batchsize(zone)); |
3968 | } | 3972 | } |
3969 | 3973 | ||
3970 | int __meminit init_currently_empty_zone(struct zone *zone, | 3974 | int __meminit init_currently_empty_zone(struct zone *zone, |
3971 | unsigned long zone_start_pfn, | 3975 | unsigned long zone_start_pfn, |
3972 | unsigned long size, | 3976 | unsigned long size, |
3973 | enum memmap_context context) | 3977 | enum memmap_context context) |
3974 | { | 3978 | { |
3975 | struct pglist_data *pgdat = zone->zone_pgdat; | 3979 | struct pglist_data *pgdat = zone->zone_pgdat; |
3976 | int ret; | 3980 | int ret; |
3977 | ret = zone_wait_table_init(zone, size); | 3981 | ret = zone_wait_table_init(zone, size); |
3978 | if (ret) | 3982 | if (ret) |
3979 | return ret; | 3983 | return ret; |
3980 | pgdat->nr_zones = zone_idx(zone) + 1; | 3984 | pgdat->nr_zones = zone_idx(zone) + 1; |
3981 | 3985 | ||
3982 | zone->zone_start_pfn = zone_start_pfn; | 3986 | zone->zone_start_pfn = zone_start_pfn; |
3983 | 3987 | ||
3984 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | 3988 | mminit_dprintk(MMINIT_TRACE, "memmap_init", |
3985 | "Initialising map node %d zone %lu pfns %lu -> %lu\n", | 3989 | "Initialising map node %d zone %lu pfns %lu -> %lu\n", |
3986 | pgdat->node_id, | 3990 | pgdat->node_id, |
3987 | (unsigned long)zone_idx(zone), | 3991 | (unsigned long)zone_idx(zone), |
3988 | zone_start_pfn, (zone_start_pfn + size)); | 3992 | zone_start_pfn, (zone_start_pfn + size)); |
3989 | 3993 | ||
3990 | zone_init_free_lists(zone); | 3994 | zone_init_free_lists(zone); |
3991 | 3995 | ||
3992 | return 0; | 3996 | return 0; |
3993 | } | 3997 | } |
3994 | 3998 | ||
3995 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 3999 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
3996 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | 4000 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
3997 | /* | 4001 | /* |
3998 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | 4002 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
3999 | * Architectures may implement their own version but if add_active_range() | 4003 | * Architectures may implement their own version but if add_active_range() |
4000 | * was used and there are no special requirements, this is a convenient | 4004 | * was used and there are no special requirements, this is a convenient |
4001 | * alternative | 4005 | * alternative |
4002 | */ | 4006 | */ |
4003 | int __meminit __early_pfn_to_nid(unsigned long pfn) | 4007 | int __meminit __early_pfn_to_nid(unsigned long pfn) |
4004 | { | 4008 | { |
4005 | unsigned long start_pfn, end_pfn; | 4009 | unsigned long start_pfn, end_pfn; |
4006 | int i, nid; | 4010 | int i, nid; |
4007 | 4011 | ||
4008 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4012 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4009 | if (start_pfn <= pfn && pfn < end_pfn) | 4013 | if (start_pfn <= pfn && pfn < end_pfn) |
4010 | return nid; | 4014 | return nid; |
4011 | /* This is a memory hole */ | 4015 | /* This is a memory hole */ |
4012 | return -1; | 4016 | return -1; |
4013 | } | 4017 | } |
4014 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | 4018 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ |
4015 | 4019 | ||
4016 | int __meminit early_pfn_to_nid(unsigned long pfn) | 4020 | int __meminit early_pfn_to_nid(unsigned long pfn) |
4017 | { | 4021 | { |
4018 | int nid; | 4022 | int nid; |
4019 | 4023 | ||
4020 | nid = __early_pfn_to_nid(pfn); | 4024 | nid = __early_pfn_to_nid(pfn); |
4021 | if (nid >= 0) | 4025 | if (nid >= 0) |
4022 | return nid; | 4026 | return nid; |
4023 | /* just returns 0 */ | 4027 | /* just returns 0 */ |
4024 | return 0; | 4028 | return 0; |
4025 | } | 4029 | } |
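Worked example: for a pfn that falls in a hole between two registered ranges, __early_pfn_to_nid() returns -1 and early_pfn_to_nid() quietly maps that to node 0, whereas early_pfn_in_nid() below treats the hole as belonging to the node being initialised, returning false only when a definite, different nid is found.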
4026 | 4030 | ||
4027 | #ifdef CONFIG_NODES_SPAN_OTHER_NODES | 4031 | #ifdef CONFIG_NODES_SPAN_OTHER_NODES |
4028 | bool __meminit early_pfn_in_nid(unsigned long pfn, int node) | 4032 | bool __meminit early_pfn_in_nid(unsigned long pfn, int node) |
4029 | { | 4033 | { |
4030 | int nid; | 4034 | int nid; |
4031 | 4035 | ||
4032 | nid = __early_pfn_to_nid(pfn); | 4036 | nid = __early_pfn_to_nid(pfn); |
4033 | if (nid >= 0 && nid != node) | 4037 | if (nid >= 0 && nid != node) |
4034 | return false; | 4038 | return false; |
4035 | return true; | 4039 | return true; |
4036 | } | 4040 | } |
4037 | #endif | 4041 | #endif |
4038 | 4042 | ||
4039 | /** | 4043 | /** |
4040 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | 4044 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range |
4041 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 4045 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
4042 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | 4046 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node |
4043 | * | 4047 | * |
4044 | * If an architecture guarantees that all ranges registered with | 4048 | * If an architecture guarantees that all ranges registered with |
4045 | * add_active_ranges() contain no holes and may be freed, this | 4049 | * add_active_ranges() contain no holes and may be freed, this |
4046 | * function may be used instead of calling free_bootmem() manually. | 4050 | * function may be used instead of calling free_bootmem() manually. |
4047 | */ | 4051 | */ |
4048 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | 4052 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
4049 | { | 4053 | { |
4050 | unsigned long start_pfn, end_pfn; | 4054 | unsigned long start_pfn, end_pfn; |
4051 | int i, this_nid; | 4055 | int i, this_nid; |
4052 | 4056 | ||
4053 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { | 4057 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { |
4054 | start_pfn = min(start_pfn, max_low_pfn); | 4058 | start_pfn = min(start_pfn, max_low_pfn); |
4055 | end_pfn = min(end_pfn, max_low_pfn); | 4059 | end_pfn = min(end_pfn, max_low_pfn); |
4056 | 4060 | ||
4057 | if (start_pfn < end_pfn) | 4061 | if (start_pfn < end_pfn) |
4058 | free_bootmem_node(NODE_DATA(this_nid), | 4062 | free_bootmem_node(NODE_DATA(this_nid), |
4059 | PFN_PHYS(start_pfn), | 4063 | PFN_PHYS(start_pfn), |
4060 | (end_pfn - start_pfn) << PAGE_SHIFT); | 4064 | (end_pfn - start_pfn) << PAGE_SHIFT); |
4061 | } | 4065 | } |
4062 | } | 4066 | } |
4063 | 4067 | ||
4064 | /** | 4068 | /** |
4065 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | 4069 | * sparse_memory_present_with_active_regions - Call memory_present for each active range |
4066 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | 4070 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
4067 | * | 4071 | * |
4068 | * If an architecture guarantees that all ranges registered with | 4072 | * If an architecture guarantees that all ranges registered with |
4069 | * add_active_ranges() contain no holes and may be freed, this | 4073 | * add_active_ranges() contain no holes and may be freed, this |
4070 | * function may be used instead of calling memory_present() manually. | 4074 | * function may be used instead of calling memory_present() manually. |
4071 | */ | 4075 | */ |
4072 | void __init sparse_memory_present_with_active_regions(int nid) | 4076 | void __init sparse_memory_present_with_active_regions(int nid) |
4073 | { | 4077 | { |
4074 | unsigned long start_pfn, end_pfn; | 4078 | unsigned long start_pfn, end_pfn; |
4075 | int i, this_nid; | 4079 | int i, this_nid; |
4076 | 4080 | ||
4077 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) | 4081 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) |
4078 | memory_present(this_nid, start_pfn, end_pfn); | 4082 | memory_present(this_nid, start_pfn, end_pfn); |
4079 | } | 4083 | } |
4080 | 4084 | ||
4081 | /** | 4085 | /** |
4082 | * get_pfn_range_for_nid - Return the start and end page frames for a node | 4086 | * get_pfn_range_for_nid - Return the start and end page frames for a node |
4083 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. | 4087 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. |
4084 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. | 4088 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. |
4085 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. | 4089 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. |
4086 | * | 4090 | * |
4087 | * It returns the start and end page frame of a node based on information | 4091 | * It returns the start and end page frame of a node based on information |
4088 | * provided by an arch calling add_active_range(). If called for a node | 4092 | * provided by an arch calling add_active_range(). If called for a node |
4089 | * with no available memory, a warning is printed and the start and end | 4093 | * with no available memory, a warning is printed and the start and end |
4090 | * PFNs will be 0. | 4094 | * PFNs will be 0. |
4091 | */ | 4095 | */ |
4092 | void __meminit get_pfn_range_for_nid(unsigned int nid, | 4096 | void __meminit get_pfn_range_for_nid(unsigned int nid, |
4093 | unsigned long *start_pfn, unsigned long *end_pfn) | 4097 | unsigned long *start_pfn, unsigned long *end_pfn) |
4094 | { | 4098 | { |
4095 | unsigned long this_start_pfn, this_end_pfn; | 4099 | unsigned long this_start_pfn, this_end_pfn; |
4096 | int i; | 4100 | int i; |
4097 | 4101 | ||
4098 | *start_pfn = -1UL; | 4102 | *start_pfn = -1UL; |
4099 | *end_pfn = 0; | 4103 | *end_pfn = 0; |
4100 | 4104 | ||
4101 | for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { | 4105 | for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { |
4102 | *start_pfn = min(*start_pfn, this_start_pfn); | 4106 | *start_pfn = min(*start_pfn, this_start_pfn); |
4103 | *end_pfn = max(*end_pfn, this_end_pfn); | 4107 | *end_pfn = max(*end_pfn, this_end_pfn); |
4104 | } | 4108 | } |
4105 | 4109 | ||
4106 | if (*start_pfn == -1UL) | 4110 | if (*start_pfn == -1UL) |
4107 | *start_pfn = 0; | 4111 | *start_pfn = 0; |
4108 | } | 4112 | } |
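Worked example: if a node registered the pfn ranges [1024, 2048) and [4096, 8192), the loop leaves *start_pfn = 1024 and *end_pfn = 8192; a node with no registered memory keeps *start_pfn at -1UL, which the final check resets to 0 so both outputs read as zero.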
4109 | 4113 | ||
4110 | /* | 4114 | /* |
4111 | * This finds a zone that can be used for ZONE_MOVABLE pages. The | 4115 | * This finds a zone that can be used for ZONE_MOVABLE pages. The |
4112 | * assumption is made that zones within a node are ordered in monotonic | 4116 | * assumption is made that zones within a node are ordered in monotonic |
4113 | * increasing memory addresses so that the "highest" populated zone is used | 4117 | * increasing memory addresses so that the "highest" populated zone is used |
4114 | */ | 4118 | */ |
4115 | static void __init find_usable_zone_for_movable(void) | 4119 | static void __init find_usable_zone_for_movable(void) |
4116 | { | 4120 | { |
4117 | int zone_index; | 4121 | int zone_index; |
4118 | for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { | 4122 | for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { |
4119 | if (zone_index == ZONE_MOVABLE) | 4123 | if (zone_index == ZONE_MOVABLE) |
4120 | continue; | 4124 | continue; |
4121 | 4125 | ||
4122 | if (arch_zone_highest_possible_pfn[zone_index] > | 4126 | if (arch_zone_highest_possible_pfn[zone_index] > |
4123 | arch_zone_lowest_possible_pfn[zone_index]) | 4127 | arch_zone_lowest_possible_pfn[zone_index]) |
4124 | break; | 4128 | break; |
4125 | } | 4129 | } |
4126 | 4130 | ||
4127 | VM_BUG_ON(zone_index == -1); | 4131 | VM_BUG_ON(zone_index == -1); |
4128 | movable_zone = zone_index; | 4132 | movable_zone = zone_index; |
4129 | } | 4133 | } |
4130 | 4134 | ||
4131 | /* | 4135 | /* |
4132 | * The zone ranges provided by the architecture do not include ZONE_MOVABLE | 4136 | * The zone ranges provided by the architecture do not include ZONE_MOVABLE |
4133 | * because it is sized independent of architecture. Unlike the other zones, | 4137 | * because it is sized independent of architecture. Unlike the other zones, |
4134 | * the starting point for ZONE_MOVABLE is not fixed. It may be different | 4138 | * the starting point for ZONE_MOVABLE is not fixed. It may be different |
4135 | * in each node depending on the size of each node and how evenly kernelcore | 4139 | * in each node depending on the size of each node and how evenly kernelcore |
4136 | * is distributed. This helper function adjusts the zone ranges | 4140 | * is distributed. This helper function adjusts the zone ranges |
4137 | * provided by the architecture for a given node by using the end of the | 4141 | * provided by the architecture for a given node by using the end of the |
4138 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that | 4142 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that |
4139 | * zones within a node are in order of monotonically increasing memory addresses | 4143 | * zones within a node are in order of monotonically increasing memory addresses |
4140 | */ | 4144 | */ |
4141 | static void __meminit adjust_zone_range_for_zone_movable(int nid, | 4145 | static void __meminit adjust_zone_range_for_zone_movable(int nid, |
4142 | unsigned long zone_type, | 4146 | unsigned long zone_type, |
4143 | unsigned long node_start_pfn, | 4147 | unsigned long node_start_pfn, |
4144 | unsigned long node_end_pfn, | 4148 | unsigned long node_end_pfn, |
4145 | unsigned long *zone_start_pfn, | 4149 | unsigned long *zone_start_pfn, |
4146 | unsigned long *zone_end_pfn) | 4150 | unsigned long *zone_end_pfn) |
4147 | { | 4151 | { |
4148 | /* Only adjust if ZONE_MOVABLE is on this node */ | 4152 | /* Only adjust if ZONE_MOVABLE is on this node */ |
4149 | if (zone_movable_pfn[nid]) { | 4153 | if (zone_movable_pfn[nid]) { |
4150 | /* Size ZONE_MOVABLE */ | 4154 | /* Size ZONE_MOVABLE */ |
4151 | if (zone_type == ZONE_MOVABLE) { | 4155 | if (zone_type == ZONE_MOVABLE) { |
4152 | *zone_start_pfn = zone_movable_pfn[nid]; | 4156 | *zone_start_pfn = zone_movable_pfn[nid]; |
4153 | *zone_end_pfn = min(node_end_pfn, | 4157 | *zone_end_pfn = min(node_end_pfn, |
4154 | arch_zone_highest_possible_pfn[movable_zone]); | 4158 | arch_zone_highest_possible_pfn[movable_zone]); |
4155 | 4159 | ||
4156 | /* Adjust for ZONE_MOVABLE starting within this range */ | 4160 | /* Adjust for ZONE_MOVABLE starting within this range */ |
4157 | } else if (*zone_start_pfn < zone_movable_pfn[nid] && | 4161 | } else if (*zone_start_pfn < zone_movable_pfn[nid] && |
4158 | *zone_end_pfn > zone_movable_pfn[nid]) { | 4162 | *zone_end_pfn > zone_movable_pfn[nid]) { |
4159 | *zone_end_pfn = zone_movable_pfn[nid]; | 4163 | *zone_end_pfn = zone_movable_pfn[nid]; |
4160 | 4164 | ||
4161 | /* Check if this whole range is within ZONE_MOVABLE */ | 4165 | /* Check if this whole range is within ZONE_MOVABLE */ |
4162 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) | 4166 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) |
4163 | *zone_start_pfn = *zone_end_pfn; | 4167 | *zone_start_pfn = *zone_end_pfn; |
4164 | } | 4168 | } |
4165 | } | 4169 | } |
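Worked example (hypothetical node with zone_movable_pfn[nid] = 600,000): for ZONE_MOVABLE itself the range becomes [600,000, end), where end is the node end capped at the highest usable zone; a ZONE_NORMAL range of [200,000, 800,000) is truncated to [200,000, 600,000); and a zone starting at or above 600,000 collapses to an empty range because *zone_start_pfn is set equal to *zone_end_pfn.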
4166 | 4170 | ||
4167 | /* | 4171 | /* |
4168 | * Return the number of pages a zone spans in a node, including holes | 4172 | * Return the number of pages a zone spans in a node, including holes |
4169 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | 4173 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
4170 | */ | 4174 | */ |
4171 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, | 4175 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, |
4172 | unsigned long zone_type, | 4176 | unsigned long zone_type, |
4173 | unsigned long *ignored) | 4177 | unsigned long *ignored) |
4174 | { | 4178 | { |
4175 | unsigned long node_start_pfn, node_end_pfn; | 4179 | unsigned long node_start_pfn, node_end_pfn; |
4176 | unsigned long zone_start_pfn, zone_end_pfn; | 4180 | unsigned long zone_start_pfn, zone_end_pfn; |
4177 | 4181 | ||
4178 | /* Get the start and end of the node and zone */ | 4182 | /* Get the start and end of the node and zone */ |
4179 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | 4183 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); |
4180 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | 4184 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; |
4181 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | 4185 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; |
4182 | adjust_zone_range_for_zone_movable(nid, zone_type, | 4186 | adjust_zone_range_for_zone_movable(nid, zone_type, |
4183 | node_start_pfn, node_end_pfn, | 4187 | node_start_pfn, node_end_pfn, |
4184 | &zone_start_pfn, &zone_end_pfn); | 4188 | &zone_start_pfn, &zone_end_pfn); |
4185 | 4189 | ||
4186 | /* Check that this node has pages within the zone's required range */ | 4190 | /* Check that this node has pages within the zone's required range */ |
4187 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | 4191 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) |
4188 | return 0; | 4192 | return 0; |
4189 | 4193 | ||
4190 | /* Move the zone boundaries inside the node if necessary */ | 4194 | /* Move the zone boundaries inside the node if necessary */ |
4191 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | 4195 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); |
4192 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | 4196 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); |
4193 | 4197 | ||
4194 | /* Return the spanned pages */ | 4198 | /* Return the spanned pages */ |
4195 | return zone_end_pfn - zone_start_pfn; | 4199 | return zone_end_pfn - zone_start_pfn; |
4196 | } | 4200 | } |
4197 | 4201 | ||
4198 | /* | 4202 | /* |
4199 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 4203 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
4200 | * then all holes in the requested range will be accounted for. | 4204 | * then all holes in the requested range will be accounted for. |
4201 | */ | 4205 | */ |
4202 | unsigned long __meminit __absent_pages_in_range(int nid, | 4206 | unsigned long __meminit __absent_pages_in_range(int nid, |
4203 | unsigned long range_start_pfn, | 4207 | unsigned long range_start_pfn, |
4204 | unsigned long range_end_pfn) | 4208 | unsigned long range_end_pfn) |
4205 | { | 4209 | { |
4206 | unsigned long nr_absent = range_end_pfn - range_start_pfn; | 4210 | unsigned long nr_absent = range_end_pfn - range_start_pfn; |
4207 | unsigned long start_pfn, end_pfn; | 4211 | unsigned long start_pfn, end_pfn; |
4208 | int i; | 4212 | int i; |
4209 | 4213 | ||
4210 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { | 4214 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { |
4211 | start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); | 4215 | start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); |
4212 | end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); | 4216 | end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); |
4213 | nr_absent -= end_pfn - start_pfn; | 4217 | nr_absent -= end_pfn - start_pfn; |
4214 | } | 4218 | } |
4215 | return nr_absent; | 4219 | return nr_absent; |
4216 | } | 4220 | } |
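
As a sanity check on the hole accounting above, here is a small stand-alone sketch of the same clamp-and-subtract logic; the memory ranges are invented: present memory at PFNs [0, 100) and [150, 200) inside a requested range of [0, 200) leaves a 50-page hole.

/* User-space sketch of __absent_pages_in_range()'s accounting. */
#include <stdio.h>

#define CLAMP(v, lo, hi)  ((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))

struct pfn_range { unsigned long start, end; };

int main(void)
{
	struct pfn_range present[] = { { 0, 100 }, { 150, 200 } };
	unsigned long range_start = 0, range_end = 200;
	unsigned long nr_absent = range_end - range_start;
	unsigned int i;

	for (i = 0; i < sizeof(present) / sizeof(present[0]); i++) {
		/* intersect each present range with the requested range */
		unsigned long s = CLAMP(present[i].start, range_start, range_end);
		unsigned long e = CLAMP(present[i].end, range_start, range_end);
		nr_absent -= e - s;
	}

	printf("%lu\n", nr_absent);	/* prints 50 */
	return 0;
}
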
4217 | 4221 | ||
4218 | /** | 4222 | /** |
4219 | * absent_pages_in_range - Return number of page frames in holes within a range | 4223 | * absent_pages_in_range - Return number of page frames in holes within a range |
4220 | * @start_pfn: The start PFN to start searching for holes | 4224 | * @start_pfn: The start PFN to start searching for holes |
4221 | * @end_pfn: The end PFN to stop searching for holes | 4225 | * @end_pfn: The end PFN to stop searching for holes |
4222 | * | 4226 | * |
4223 | * It returns the number of page frames in memory holes within a range. | 4227 | * It returns the number of page frames in memory holes within a range. |
4224 | */ | 4228 | */ |
4225 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, | 4229 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, |
4226 | unsigned long end_pfn) | 4230 | unsigned long end_pfn) |
4227 | { | 4231 | { |
4228 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); | 4232 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); |
4229 | } | 4233 | } |
4230 | 4234 | ||
4231 | /* Return the number of page frames in holes in a zone on a node */ | 4235 | /* Return the number of page frames in holes in a zone on a node */ |
4232 | static unsigned long __meminit zone_absent_pages_in_node(int nid, | 4236 | static unsigned long __meminit zone_absent_pages_in_node(int nid, |
4233 | unsigned long zone_type, | 4237 | unsigned long zone_type, |
4234 | unsigned long *ignored) | 4238 | unsigned long *ignored) |
4235 | { | 4239 | { |
4236 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; | 4240 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; |
4237 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; | 4241 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
4238 | unsigned long node_start_pfn, node_end_pfn; | 4242 | unsigned long node_start_pfn, node_end_pfn; |
4239 | unsigned long zone_start_pfn, zone_end_pfn; | 4243 | unsigned long zone_start_pfn, zone_end_pfn; |
4240 | 4244 | ||
4241 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | 4245 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); |
4242 | zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); | 4246 | zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); |
4243 | zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); | 4247 | zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); |
4244 | 4248 | ||
4245 | adjust_zone_range_for_zone_movable(nid, zone_type, | 4249 | adjust_zone_range_for_zone_movable(nid, zone_type, |
4246 | node_start_pfn, node_end_pfn, | 4250 | node_start_pfn, node_end_pfn, |
4247 | &zone_start_pfn, &zone_end_pfn); | 4251 | &zone_start_pfn, &zone_end_pfn); |
4248 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | 4252 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); |
4249 | } | 4253 | } |
4250 | 4254 | ||
4251 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4255 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4252 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | 4256 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
4253 | unsigned long zone_type, | 4257 | unsigned long zone_type, |
4254 | unsigned long *zones_size) | 4258 | unsigned long *zones_size) |
4255 | { | 4259 | { |
4256 | return zones_size[zone_type]; | 4260 | return zones_size[zone_type]; |
4257 | } | 4261 | } |
4258 | 4262 | ||
4259 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, | 4263 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, |
4260 | unsigned long zone_type, | 4264 | unsigned long zone_type, |
4261 | unsigned long *zholes_size) | 4265 | unsigned long *zholes_size) |
4262 | { | 4266 | { |
4263 | if (!zholes_size) | 4267 | if (!zholes_size) |
4264 | return 0; | 4268 | return 0; |
4265 | 4269 | ||
4266 | return zholes_size[zone_type]; | 4270 | return zholes_size[zone_type]; |
4267 | } | 4271 | } |
4268 | 4272 | ||
4269 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4273 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4270 | 4274 | ||
4271 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | 4275 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
4272 | unsigned long *zones_size, unsigned long *zholes_size) | 4276 | unsigned long *zones_size, unsigned long *zholes_size) |
4273 | { | 4277 | { |
4274 | unsigned long realtotalpages, totalpages = 0; | 4278 | unsigned long realtotalpages, totalpages = 0; |
4275 | enum zone_type i; | 4279 | enum zone_type i; |
4276 | 4280 | ||
4277 | for (i = 0; i < MAX_NR_ZONES; i++) | 4281 | for (i = 0; i < MAX_NR_ZONES; i++) |
4278 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | 4282 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, |
4279 | zones_size); | 4283 | zones_size); |
4280 | pgdat->node_spanned_pages = totalpages; | 4284 | pgdat->node_spanned_pages = totalpages; |
4281 | 4285 | ||
4282 | realtotalpages = totalpages; | 4286 | realtotalpages = totalpages; |
4283 | for (i = 0; i < MAX_NR_ZONES; i++) | 4287 | for (i = 0; i < MAX_NR_ZONES; i++) |
4284 | realtotalpages -= | 4288 | realtotalpages -= |
4285 | zone_absent_pages_in_node(pgdat->node_id, i, | 4289 | zone_absent_pages_in_node(pgdat->node_id, i, |
4286 | zholes_size); | 4290 | zholes_size); |
4287 | pgdat->node_present_pages = realtotalpages; | 4291 | pgdat->node_present_pages = realtotalpages; |
4288 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | 4292 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, |
4289 | realtotalpages); | 4293 | realtotalpages); |
4290 | } | 4294 | } |
4291 | 4295 | ||
4292 | #ifndef CONFIG_SPARSEMEM | 4296 | #ifndef CONFIG_SPARSEMEM |
4293 | /* | 4297 | /* |
4294 | * Calculate the size of the zone->blockflags rounded to an unsigned long | 4298 | * Calculate the size of the zone->blockflags rounded to an unsigned long |
4295 | * Start by making sure zonesize is a multiple of pageblock_nr_pages by | 4299 | * Start by making sure zonesize is a multiple of pageblock_nr_pages by |
4296 | * rounding up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, | 4300 | * rounding up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, |
4297 | * finally round what is now in bits up to the nearest long in bits, then | 4301 | * finally round what is now in bits up to the nearest long in bits, then |
4298 | * return it in bytes. | 4302 | * return it in bytes. |
4299 | */ | 4303 | */ |
4300 | static unsigned long __init usemap_size(unsigned long zonesize) | 4304 | static unsigned long __init usemap_size(unsigned long zonesize) |
4301 | { | 4305 | { |
4302 | unsigned long usemapsize; | 4306 | unsigned long usemapsize; |
4303 | 4307 | ||
4304 | usemapsize = roundup(zonesize, pageblock_nr_pages); | 4308 | usemapsize = roundup(zonesize, pageblock_nr_pages); |
4305 | usemapsize = usemapsize >> pageblock_order; | 4309 | usemapsize = usemapsize >> pageblock_order; |
4306 | usemapsize *= NR_PAGEBLOCK_BITS; | 4310 | usemapsize *= NR_PAGEBLOCK_BITS; |
4307 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); | 4311 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); |
4308 | 4312 | ||
4309 | return usemapsize / 8; | 4313 | return usemapsize / 8; |
4310 | } | 4314 | } |
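
To make the arithmetic above concrete, the following stand-alone sketch mirrors usemap_size() with illustrative constants; pageblock_order = 9 and NR_PAGEBLOCK_BITS = 4 are assumptions for the example only, since both depend on the kernel configuration.

/* User-space sketch of the usemap_size() calculation, assumed constants. */
#include <stdio.h>

#define PAGEBLOCK_ORDER      9UL			/* assumption for the example */
#define PAGEBLOCK_NR_PAGES   (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS    4UL			/* assumption for the example */
#define ROUNDUP(x, y)        ((((x) + (y) - 1) / (y)) * (y))

static unsigned long usemap_size(unsigned long zonesize)
{
	unsigned long usemapsize;

	usemapsize = ROUNDUP(zonesize, PAGEBLOCK_NR_PAGES);
	usemapsize >>= PAGEBLOCK_ORDER;		/* number of pageblocks */
	usemapsize *= NR_PAGEBLOCK_BITS;	/* bits needed */
	usemapsize = ROUNDUP(usemapsize, 8 * sizeof(unsigned long));

	return usemapsize / 8;			/* bytes */
}

int main(void)
{
	/* With 64-bit longs, a 1,000,000-page zone needs 984 bytes of flags. */
	printf("%lu\n", usemap_size(1000000UL));
	return 0;
}
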
4311 | 4315 | ||
4312 | static void __init setup_usemap(struct pglist_data *pgdat, | 4316 | static void __init setup_usemap(struct pglist_data *pgdat, |
4313 | struct zone *zone, unsigned long zonesize) | 4317 | struct zone *zone, unsigned long zonesize) |
4314 | { | 4318 | { |
4315 | unsigned long usemapsize = usemap_size(zonesize); | 4319 | unsigned long usemapsize = usemap_size(zonesize); |
4316 | zone->pageblock_flags = NULL; | 4320 | zone->pageblock_flags = NULL; |
4317 | if (usemapsize) | 4321 | if (usemapsize) |
4318 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, | 4322 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, |
4319 | usemapsize); | 4323 | usemapsize); |
4320 | } | 4324 | } |
4321 | #else | 4325 | #else |
4322 | static inline void setup_usemap(struct pglist_data *pgdat, | 4326 | static inline void setup_usemap(struct pglist_data *pgdat, |
4323 | struct zone *zone, unsigned long zonesize) {} | 4327 | struct zone *zone, unsigned long zonesize) {} |
4324 | #endif /* CONFIG_SPARSEMEM */ | 4328 | #endif /* CONFIG_SPARSEMEM */ |
4325 | 4329 | ||
4326 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4330 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4327 | 4331 | ||
4328 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4332 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4329 | void __init set_pageblock_order(void) | 4333 | void __init set_pageblock_order(void) |
4330 | { | 4334 | { |
4331 | unsigned int order; | 4335 | unsigned int order; |
4332 | 4336 | ||
4333 | /* Check that pageblock_nr_pages has not already been set up */ | 4337 | /* Check that pageblock_nr_pages has not already been set up */ |
4334 | if (pageblock_order) | 4338 | if (pageblock_order) |
4335 | return; | 4339 | return; |
4336 | 4340 | ||
4337 | if (HPAGE_SHIFT > PAGE_SHIFT) | 4341 | if (HPAGE_SHIFT > PAGE_SHIFT) |
4338 | order = HUGETLB_PAGE_ORDER; | 4342 | order = HUGETLB_PAGE_ORDER; |
4339 | else | 4343 | else |
4340 | order = MAX_ORDER - 1; | 4344 | order = MAX_ORDER - 1; |
4341 | 4345 | ||
4342 | /* | 4346 | /* |
4343 | * Assume the largest contiguous order of interest is a huge page. | 4347 | * Assume the largest contiguous order of interest is a huge page. |
4344 | * This value may be variable depending on boot parameters on IA64 and | 4348 | * This value may be variable depending on boot parameters on IA64 and |
4345 | * powerpc. | 4349 | * powerpc. |
4346 | */ | 4350 | */ |
4347 | pageblock_order = order; | 4351 | pageblock_order = order; |
4348 | } | 4352 | } |
4349 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4353 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4350 | 4354 | ||
4351 | /* | 4355 | /* |
4352 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() | 4356 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() |
4353 | * is unused as pageblock_order is set at compile-time. See | 4357 | * is unused as pageblock_order is set at compile-time. See |
4354 | * include/linux/pageblock-flags.h for the values of pageblock_order based on | 4358 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4355 | * the kernel config | 4359 | * the kernel config |
4356 | */ | 4360 | */ |
4357 | void __init set_pageblock_order(void) | 4361 | void __init set_pageblock_order(void) |
4358 | { | 4362 | { |
4359 | } | 4363 | } |
4360 | 4364 | ||
4361 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4365 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4362 | 4366 | ||
4363 | /* | 4367 | /* |
4364 | * Set up the zone data structures: | 4368 | * Set up the zone data structures: |
4365 | * - mark all pages reserved | 4369 | * - mark all pages reserved |
4366 | * - mark all memory queues empty | 4370 | * - mark all memory queues empty |
4367 | * - clear the memory bitmaps | 4371 | * - clear the memory bitmaps |
4368 | */ | 4372 | */ |
4369 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 4373 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, |
4370 | unsigned long *zones_size, unsigned long *zholes_size) | 4374 | unsigned long *zones_size, unsigned long *zholes_size) |
4371 | { | 4375 | { |
4372 | enum zone_type j; | 4376 | enum zone_type j; |
4373 | int nid = pgdat->node_id; | 4377 | int nid = pgdat->node_id; |
4374 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 4378 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
4375 | int ret; | 4379 | int ret; |
4376 | 4380 | ||
4377 | pgdat_resize_init(pgdat); | 4381 | pgdat_resize_init(pgdat); |
4378 | pgdat->nr_zones = 0; | 4382 | pgdat->nr_zones = 0; |
4379 | init_waitqueue_head(&pgdat->kswapd_wait); | 4383 | init_waitqueue_head(&pgdat->kswapd_wait); |
4380 | pgdat->kswapd_max_order = 0; | 4384 | pgdat->kswapd_max_order = 0; |
4381 | pgdat_page_cgroup_init(pgdat); | 4385 | pgdat_page_cgroup_init(pgdat); |
4382 | 4386 | ||
4383 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4387 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4384 | struct zone *zone = pgdat->node_zones + j; | 4388 | struct zone *zone = pgdat->node_zones + j; |
4385 | unsigned long size, realsize, memmap_pages; | 4389 | unsigned long size, realsize, memmap_pages; |
4386 | 4390 | ||
4387 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4391 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
4388 | realsize = size - zone_absent_pages_in_node(nid, j, | 4392 | realsize = size - zone_absent_pages_in_node(nid, j, |
4389 | zholes_size); | 4393 | zholes_size); |
4390 | 4394 | ||
4391 | /* | 4395 | /* |
4392 | * Adjust realsize so that it accounts for how much memory | 4396 | * Adjust realsize so that it accounts for how much memory |
4393 | * is used by this zone for memmap. This affects the watermark | 4397 | * is used by this zone for memmap. This affects the watermark |
4394 | * and per-cpu initialisations | 4398 | * and per-cpu initialisations |
4395 | */ | 4399 | */ |
4396 | memmap_pages = | 4400 | memmap_pages = |
4397 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 4401 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; |
4398 | if (realsize >= memmap_pages) { | 4402 | if (realsize >= memmap_pages) { |
4399 | realsize -= memmap_pages; | 4403 | realsize -= memmap_pages; |
4400 | if (memmap_pages) | 4404 | if (memmap_pages) |
4401 | printk(KERN_DEBUG | 4405 | printk(KERN_DEBUG |
4402 | " %s zone: %lu pages used for memmap\n", | 4406 | " %s zone: %lu pages used for memmap\n", |
4403 | zone_names[j], memmap_pages); | 4407 | zone_names[j], memmap_pages); |
4404 | } else | 4408 | } else |
4405 | printk(KERN_WARNING | 4409 | printk(KERN_WARNING |
4406 | " %s zone: %lu pages exceeds realsize %lu\n", | 4410 | " %s zone: %lu pages exceeds realsize %lu\n", |
4407 | zone_names[j], memmap_pages, realsize); | 4411 | zone_names[j], memmap_pages, realsize); |
4408 | 4412 | ||
4409 | /* Account for reserved pages */ | 4413 | /* Account for reserved pages */ |
4410 | if (j == 0 && realsize > dma_reserve) { | 4414 | if (j == 0 && realsize > dma_reserve) { |
4411 | realsize -= dma_reserve; | 4415 | realsize -= dma_reserve; |
4412 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", | 4416 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", |
4413 | zone_names[0], dma_reserve); | 4417 | zone_names[0], dma_reserve); |
4414 | } | 4418 | } |
4415 | 4419 | ||
4416 | if (!is_highmem_idx(j)) | 4420 | if (!is_highmem_idx(j)) |
4417 | nr_kernel_pages += realsize; | 4421 | nr_kernel_pages += realsize; |
4418 | nr_all_pages += realsize; | 4422 | nr_all_pages += realsize; |
4419 | 4423 | ||
4420 | zone->spanned_pages = size; | 4424 | zone->spanned_pages = size; |
4421 | zone->present_pages = realsize; | 4425 | zone->present_pages = realsize; |
4422 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 4426 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
4423 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | 4427 | zone->compact_cached_free_pfn = zone->zone_start_pfn + |
4424 | zone->spanned_pages; | 4428 | zone->spanned_pages; |
4425 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | 4429 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); |
4426 | #endif | 4430 | #endif |
4427 | #ifdef CONFIG_NUMA | 4431 | #ifdef CONFIG_NUMA |
4428 | zone->node = nid; | 4432 | zone->node = nid; |
4429 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4433 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
4430 | / 100; | 4434 | / 100; |
4431 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 4435 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; |
4432 | #endif | 4436 | #endif |
4433 | zone->name = zone_names[j]; | 4437 | zone->name = zone_names[j]; |
4434 | spin_lock_init(&zone->lock); | 4438 | spin_lock_init(&zone->lock); |
4435 | spin_lock_init(&zone->lru_lock); | 4439 | spin_lock_init(&zone->lru_lock); |
4436 | zone_seqlock_init(zone); | 4440 | zone_seqlock_init(zone); |
4437 | zone->zone_pgdat = pgdat; | 4441 | zone->zone_pgdat = pgdat; |
4438 | 4442 | ||
4439 | zone_pcp_init(zone); | 4443 | zone_pcp_init(zone); |
4440 | lruvec_init(&zone->lruvec, zone); | 4444 | lruvec_init(&zone->lruvec, zone); |
4441 | zap_zone_vm_stats(zone); | 4445 | zap_zone_vm_stats(zone); |
4442 | zone->flags = 0; | 4446 | zone->flags = 0; |
4443 | #ifdef CONFIG_MEMORY_ISOLATION | 4447 | #ifdef CONFIG_MEMORY_ISOLATION |
4444 | zone->nr_pageblock_isolate = 0; | 4448 | zone->nr_pageblock_isolate = 0; |
4445 | #endif | 4449 | #endif |
4446 | if (!size) | 4450 | if (!size) |
4447 | continue; | 4451 | continue; |
4448 | 4452 | ||
4449 | set_pageblock_order(); | 4453 | set_pageblock_order(); |
4450 | setup_usemap(pgdat, zone, size); | 4454 | setup_usemap(pgdat, zone, size); |
4451 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 4455 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
4452 | size, MEMMAP_EARLY); | 4456 | size, MEMMAP_EARLY); |
4453 | BUG_ON(ret); | 4457 | BUG_ON(ret); |
4454 | memmap_init(size, nid, j, zone_start_pfn); | 4458 | memmap_init(size, nid, j, zone_start_pfn); |
4455 | zone_start_pfn += size; | 4459 | zone_start_pfn += size; |
4456 | } | 4460 | } |
4457 | } | 4461 | } |
4458 | 4462 | ||
4459 | static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | 4463 | static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) |
4460 | { | 4464 | { |
4461 | /* Skip empty nodes */ | 4465 | /* Skip empty nodes */ |
4462 | if (!pgdat->node_spanned_pages) | 4466 | if (!pgdat->node_spanned_pages) |
4463 | return; | 4467 | return; |
4464 | 4468 | ||
4465 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 4469 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
4466 | /* ia64 gets its own node_mem_map, before this, without bootmem */ | 4470 | /* ia64 gets its own node_mem_map, before this, without bootmem */ |
4467 | if (!pgdat->node_mem_map) { | 4471 | if (!pgdat->node_mem_map) { |
4468 | unsigned long size, start, end; | 4472 | unsigned long size, start, end; |
4469 | struct page *map; | 4473 | struct page *map; |
4470 | 4474 | ||
4471 | /* | 4475 | /* |
4472 | * The zone's endpoints aren't required to be MAX_ORDER | 4476 | * The zone's endpoints aren't required to be MAX_ORDER |
4473 | * aligned but the node_mem_map endpoints must be in order | 4477 | * aligned but the node_mem_map endpoints must be in order |
4474 | * for the buddy allocator to function correctly. | 4478 | * for the buddy allocator to function correctly. |
4475 | */ | 4479 | */ |
4476 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); | 4480 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); |
4477 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; | 4481 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; |
4478 | end = ALIGN(end, MAX_ORDER_NR_PAGES); | 4482 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
4479 | size = (end - start) * sizeof(struct page); | 4483 | size = (end - start) * sizeof(struct page); |
4480 | map = alloc_remap(pgdat->node_id, size); | 4484 | map = alloc_remap(pgdat->node_id, size); |
4481 | if (!map) | 4485 | if (!map) |
4482 | map = alloc_bootmem_node_nopanic(pgdat, size); | 4486 | map = alloc_bootmem_node_nopanic(pgdat, size); |
4483 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 4487 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
4484 | } | 4488 | } |
4485 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4489 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
4486 | /* | 4490 | /* |
4487 | * With no DISCONTIG, the global mem_map is just set as node 0's | 4491 | * With no DISCONTIG, the global mem_map is just set as node 0's |
4488 | */ | 4492 | */ |
4489 | if (pgdat == NODE_DATA(0)) { | 4493 | if (pgdat == NODE_DATA(0)) { |
4490 | mem_map = NODE_DATA(0)->node_mem_map; | 4494 | mem_map = NODE_DATA(0)->node_mem_map; |
4491 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4495 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4492 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) | 4496 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) |
4493 | mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); | 4497 | mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); |
4494 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4498 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4495 | } | 4499 | } |
4496 | #endif | 4500 | #endif |
4497 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 4501 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
4498 | } | 4502 | } |
4499 | 4503 | ||
4500 | void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | 4504 | void __paginginit free_area_init_node(int nid, unsigned long *zones_size, |
4501 | unsigned long node_start_pfn, unsigned long *zholes_size) | 4505 | unsigned long node_start_pfn, unsigned long *zholes_size) |
4502 | { | 4506 | { |
4503 | pg_data_t *pgdat = NODE_DATA(nid); | 4507 | pg_data_t *pgdat = NODE_DATA(nid); |
4504 | 4508 | ||
4505 | pgdat->node_id = nid; | 4509 | pgdat->node_id = nid; |
4506 | pgdat->node_start_pfn = node_start_pfn; | 4510 | pgdat->node_start_pfn = node_start_pfn; |
4507 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4511 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
4508 | 4512 | ||
4509 | alloc_node_mem_map(pgdat); | 4513 | alloc_node_mem_map(pgdat); |
4510 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 4514 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
4511 | printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", | 4515 | printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", |
4512 | nid, (unsigned long)pgdat, | 4516 | nid, (unsigned long)pgdat, |
4513 | (unsigned long)pgdat->node_mem_map); | 4517 | (unsigned long)pgdat->node_mem_map); |
4514 | #endif | 4518 | #endif |
4515 | 4519 | ||
4516 | free_area_init_core(pgdat, zones_size, zholes_size); | 4520 | free_area_init_core(pgdat, zones_size, zholes_size); |
4517 | } | 4521 | } |
4518 | 4522 | ||
4519 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4523 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4520 | 4524 | ||
4521 | #if MAX_NUMNODES > 1 | 4525 | #if MAX_NUMNODES > 1 |
4522 | /* | 4526 | /* |
4523 | * Figure out the number of possible node ids. | 4527 | * Figure out the number of possible node ids. |
4524 | */ | 4528 | */ |
4525 | static void __init setup_nr_node_ids(void) | 4529 | static void __init setup_nr_node_ids(void) |
4526 | { | 4530 | { |
4527 | unsigned int node; | 4531 | unsigned int node; |
4528 | unsigned int highest = 0; | 4532 | unsigned int highest = 0; |
4529 | 4533 | ||
4530 | for_each_node_mask(node, node_possible_map) | 4534 | for_each_node_mask(node, node_possible_map) |
4531 | highest = node; | 4535 | highest = node; |
4532 | nr_node_ids = highest + 1; | 4536 | nr_node_ids = highest + 1; |
4533 | } | 4537 | } |
4534 | #else | 4538 | #else |
4535 | static inline void setup_nr_node_ids(void) | 4539 | static inline void setup_nr_node_ids(void) |
4536 | { | 4540 | { |
4537 | } | 4541 | } |
4538 | #endif | 4542 | #endif |
4539 | 4543 | ||
4540 | /** | 4544 | /** |
4541 | * node_map_pfn_alignment - determine the maximum internode alignment | 4545 | * node_map_pfn_alignment - determine the maximum internode alignment |
4542 | * | 4546 | * |
4543 | * This function should be called after node map is populated and sorted. | 4547 | * This function should be called after node map is populated and sorted. |
4544 | * It calculates the maximum power of two alignment which can distinguish | 4548 | * It calculates the maximum power of two alignment which can distinguish |
4545 | * all the nodes. | 4549 | * all the nodes. |
4546 | * | 4550 | * |
4547 | * For example, if all nodes are 1GiB and aligned to 1GiB, the return value | 4551 | * For example, if all nodes are 1GiB and aligned to 1GiB, the return value |
4548 | * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the | 4552 | * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the |
4549 | * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is | 4553 | * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is |
4550 | * shifted, 1GiB is enough and this function will indicate so. | 4554 | * shifted, 1GiB is enough and this function will indicate so. |
4551 | * | 4555 | * |
4552 | * This is used to test whether pfn -> nid mapping of the chosen memory | 4556 | * This is used to test whether pfn -> nid mapping of the chosen memory |
4553 | * model has fine enough granularity to avoid incorrect mapping for the | 4557 | * model has fine enough granularity to avoid incorrect mapping for the |
4554 | * populated node map. | 4558 | * populated node map. |
4555 | * | 4559 | * |
4556 | * Returns the determined alignment in pfn's. 0 if there is no alignment | 4560 | * Returns the determined alignment in pfn's. 0 if there is no alignment |
4557 | * requirement (single node). | 4561 | * requirement (single node). |
4558 | */ | 4562 | */ |
4559 | unsigned long __init node_map_pfn_alignment(void) | 4563 | unsigned long __init node_map_pfn_alignment(void) |
4560 | { | 4564 | { |
4561 | unsigned long accl_mask = 0, last_end = 0; | 4565 | unsigned long accl_mask = 0, last_end = 0; |
4562 | unsigned long start, end, mask; | 4566 | unsigned long start, end, mask; |
4563 | int last_nid = -1; | 4567 | int last_nid = -1; |
4564 | int i, nid; | 4568 | int i, nid; |
4565 | 4569 | ||
4566 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { | 4570 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { |
4567 | if (!start || last_nid < 0 || last_nid == nid) { | 4571 | if (!start || last_nid < 0 || last_nid == nid) { |
4568 | last_nid = nid; | 4572 | last_nid = nid; |
4569 | last_end = end; | 4573 | last_end = end; |
4570 | continue; | 4574 | continue; |
4571 | } | 4575 | } |
4572 | 4576 | ||
4573 | /* | 4577 | /* |
4574 | * Start with a mask granular enough to pin-point to the | 4578 | * Start with a mask granular enough to pin-point to the |
4575 | * start pfn and tick off bits one-by-one until it becomes | 4579 | * start pfn and tick off bits one-by-one until it becomes |
4576 | * too coarse to separate the current node from the last. | 4580 | * too coarse to separate the current node from the last. |
4577 | */ | 4581 | */ |
4578 | mask = ~((1 << __ffs(start)) - 1); | 4582 | mask = ~((1 << __ffs(start)) - 1); |
4579 | while (mask && last_end <= (start & (mask << 1))) | 4583 | while (mask && last_end <= (start & (mask << 1))) |
4580 | mask <<= 1; | 4584 | mask <<= 1; |
4581 | 4585 | ||
4582 | /* accumulate all internode masks */ | 4586 | /* accumulate all internode masks */ |
4583 | accl_mask |= mask; | 4587 | accl_mask |= mask; |
4584 | } | 4588 | } |
4585 | 4589 | ||
4586 | /* convert mask to number of pages */ | 4590 | /* convert mask to number of pages */ |
4587 | return ~accl_mask + 1; | 4591 | return ~accl_mask + 1; |
4588 | } | 4592 | } |
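
The mask-widening loop above is the subtle part of node_map_pfn_alignment(). The stand-alone sketch below walks one made-up two-node layout through it (node 0 spans PFNs [0, 0x40000), node 1 starts at PFN 0x40000, i.e. 1GiB boundaries with 4KiB pages) and reproduces the 1GiB result mentioned in the comment block; __builtin_ctzl stands in for the kernel's __ffs().

/* User-space sketch of the internode mask-widening loop. */
#include <stdio.h>

int main(void)
{
	unsigned long last_end = 0x40000;	/* end of node 0 */
	unsigned long start = 0x40000;		/* start of node 1 */
	unsigned long mask, accl_mask = 0;

	/* mask granular enough to pin-point start, then widen it */
	mask = ~((1UL << __builtin_ctzl(start)) - 1);
	while (mask && last_end <= (start & (mask << 1)))
		mask <<= 1;
	accl_mask |= mask;

	/* prints 0x40000: 1GiB alignment expressed in 4KiB pages */
	printf("%#lx\n", ~accl_mask + 1);
	return 0;
}
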
4589 | 4593 | ||
4590 | /* Find the lowest pfn for a node */ | 4594 | /* Find the lowest pfn for a node */ |
4591 | static unsigned long __init find_min_pfn_for_node(int nid) | 4595 | static unsigned long __init find_min_pfn_for_node(int nid) |
4592 | { | 4596 | { |
4593 | unsigned long min_pfn = ULONG_MAX; | 4597 | unsigned long min_pfn = ULONG_MAX; |
4594 | unsigned long start_pfn; | 4598 | unsigned long start_pfn; |
4595 | int i; | 4599 | int i; |
4596 | 4600 | ||
4597 | for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) | 4601 | for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) |
4598 | min_pfn = min(min_pfn, start_pfn); | 4602 | min_pfn = min(min_pfn, start_pfn); |
4599 | 4603 | ||
4600 | if (min_pfn == ULONG_MAX) { | 4604 | if (min_pfn == ULONG_MAX) { |
4601 | printk(KERN_WARNING | 4605 | printk(KERN_WARNING |
4602 | "Could not find start_pfn for node %d\n", nid); | 4606 | "Could not find start_pfn for node %d\n", nid); |
4603 | return 0; | 4607 | return 0; |
4604 | } | 4608 | } |
4605 | 4609 | ||
4606 | return min_pfn; | 4610 | return min_pfn; |
4607 | } | 4611 | } |
4608 | 4612 | ||
4609 | /** | 4613 | /** |
4610 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | 4614 | * find_min_pfn_with_active_regions - Find the minimum PFN registered |
4611 | * | 4615 | * |
4612 | * It returns the minimum PFN based on information provided via | 4616 | * It returns the minimum PFN based on information provided via |
4613 | * add_active_range(). | 4617 | * add_active_range(). |
4614 | */ | 4618 | */ |
4615 | unsigned long __init find_min_pfn_with_active_regions(void) | 4619 | unsigned long __init find_min_pfn_with_active_regions(void) |
4616 | { | 4620 | { |
4617 | return find_min_pfn_for_node(MAX_NUMNODES); | 4621 | return find_min_pfn_for_node(MAX_NUMNODES); |
4618 | } | 4622 | } |
4619 | 4623 | ||
4620 | /* | 4624 | /* |
4621 | * early_calculate_totalpages() | 4625 | * early_calculate_totalpages() |
4622 | * Sum pages in active regions for movable zone. | 4626 | * Sum pages in active regions for movable zone. |
4623 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | 4627 | * Populate N_HIGH_MEMORY for calculating usable_nodes. |
4624 | */ | 4628 | */ |
4625 | static unsigned long __init early_calculate_totalpages(void) | 4629 | static unsigned long __init early_calculate_totalpages(void) |
4626 | { | 4630 | { |
4627 | unsigned long totalpages = 0; | 4631 | unsigned long totalpages = 0; |
4628 | unsigned long start_pfn, end_pfn; | 4632 | unsigned long start_pfn, end_pfn; |
4629 | int i, nid; | 4633 | int i, nid; |
4630 | 4634 | ||
4631 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { | 4635 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { |
4632 | unsigned long pages = end_pfn - start_pfn; | 4636 | unsigned long pages = end_pfn - start_pfn; |
4633 | 4637 | ||
4634 | totalpages += pages; | 4638 | totalpages += pages; |
4635 | if (pages) | 4639 | if (pages) |
4636 | node_set_state(nid, N_HIGH_MEMORY); | 4640 | node_set_state(nid, N_HIGH_MEMORY); |
4637 | } | 4641 | } |
4638 | return totalpages; | 4642 | return totalpages; |
4639 | } | 4643 | } |
4640 | 4644 | ||
4641 | /* | 4645 | /* |
4642 | * Find the PFN the Movable zone begins in each node. Kernel memory | 4646 | * Find the PFN the Movable zone begins in each node. Kernel memory |
4643 | * is spread evenly between nodes as long as the nodes have enough | 4647 | * is spread evenly between nodes as long as the nodes have enough |
4644 | * memory. When they don't, some nodes will have more kernelcore than | 4648 | * memory. When they don't, some nodes will have more kernelcore than |
4645 | * others | 4649 | * others |
4646 | */ | 4650 | */ |
4647 | static void __init find_zone_movable_pfns_for_nodes(void) | 4651 | static void __init find_zone_movable_pfns_for_nodes(void) |
4648 | { | 4652 | { |
4649 | int i, nid; | 4653 | int i, nid; |
4650 | unsigned long usable_startpfn; | 4654 | unsigned long usable_startpfn; |
4651 | unsigned long kernelcore_node, kernelcore_remaining; | 4655 | unsigned long kernelcore_node, kernelcore_remaining; |
4652 | /* save the state before borrowing the nodemask */ | 4656 | /* save the state before borrowing the nodemask */ |
4653 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; | 4657 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; |
4654 | unsigned long totalpages = early_calculate_totalpages(); | 4658 | unsigned long totalpages = early_calculate_totalpages(); |
4655 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 4659 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
4656 | 4660 | ||
4657 | /* | 4661 | /* |
4658 | * If movablecore was specified, calculate what size of | 4662 | * If movablecore was specified, calculate what size of |
4659 | * kernelcore corresponds so that memory usable for | 4663 | * kernelcore corresponds so that memory usable for |
4660 | * any allocation type is evenly spread. If both kernelcore | 4664 | * any allocation type is evenly spread. If both kernelcore |
4661 | * and movablecore are specified, then the value of kernelcore | 4665 | * and movablecore are specified, then the value of kernelcore |
4662 | * will be used for required_kernelcore if it's greater than | 4666 | * will be used for required_kernelcore if it's greater than |
4663 | * what movablecore would have allowed. | 4667 | * what movablecore would have allowed. |
4664 | */ | 4668 | */ |
4665 | if (required_movablecore) { | 4669 | if (required_movablecore) { |
4666 | unsigned long corepages; | 4670 | unsigned long corepages; |
4667 | 4671 | ||
4668 | /* | 4672 | /* |
4669 | * Round-up so that ZONE_MOVABLE is at least as large as what | 4673 | * Round-up so that ZONE_MOVABLE is at least as large as what |
4670 | * was requested by the user | 4674 | * was requested by the user |
4671 | */ | 4675 | */ |
4672 | required_movablecore = | 4676 | required_movablecore = |
4673 | roundup(required_movablecore, MAX_ORDER_NR_PAGES); | 4677 | roundup(required_movablecore, MAX_ORDER_NR_PAGES); |
4674 | corepages = totalpages - required_movablecore; | 4678 | corepages = totalpages - required_movablecore; |
4675 | 4679 | ||
4676 | required_kernelcore = max(required_kernelcore, corepages); | 4680 | required_kernelcore = max(required_kernelcore, corepages); |
4677 | } | 4681 | } |
4678 | 4682 | ||
4679 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ | 4683 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ |
4680 | if (!required_kernelcore) | 4684 | if (!required_kernelcore) |
4681 | goto out; | 4685 | goto out; |
4682 | 4686 | ||
4683 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 4687 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
4684 | find_usable_zone_for_movable(); | 4688 | find_usable_zone_for_movable(); |
4685 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 4689 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
4686 | 4690 | ||
4687 | restart: | 4691 | restart: |
4688 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 4692 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
4689 | kernelcore_node = required_kernelcore / usable_nodes; | 4693 | kernelcore_node = required_kernelcore / usable_nodes; |
4690 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4694 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4691 | unsigned long start_pfn, end_pfn; | 4695 | unsigned long start_pfn, end_pfn; |
4692 | 4696 | ||
4693 | /* | 4697 | /* |
4694 | * Recalculate kernelcore_node if the division per node | 4698 | * Recalculate kernelcore_node if the division per node |
4695 | * now exceeds what is necessary to satisfy the requested | 4699 | * now exceeds what is necessary to satisfy the requested |
4696 | * amount of memory for the kernel | 4700 | * amount of memory for the kernel |
4697 | */ | 4701 | */ |
4698 | if (required_kernelcore < kernelcore_node) | 4702 | if (required_kernelcore < kernelcore_node) |
4699 | kernelcore_node = required_kernelcore / usable_nodes; | 4703 | kernelcore_node = required_kernelcore / usable_nodes; |
4700 | 4704 | ||
4701 | /* | 4705 | /* |
4702 | * As the map is walked, we track how much memory is usable | 4706 | * As the map is walked, we track how much memory is usable |
4703 | * by the kernel using kernelcore_remaining. When it is | 4707 | * by the kernel using kernelcore_remaining. When it is |
4704 | * 0, the rest of the node is usable by ZONE_MOVABLE | 4708 | * 0, the rest of the node is usable by ZONE_MOVABLE |
4705 | */ | 4709 | */ |
4706 | kernelcore_remaining = kernelcore_node; | 4710 | kernelcore_remaining = kernelcore_node; |
4707 | 4711 | ||
4708 | /* Go through each range of PFNs within this node */ | 4712 | /* Go through each range of PFNs within this node */ |
4709 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { | 4713 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { |
4710 | unsigned long size_pages; | 4714 | unsigned long size_pages; |
4711 | 4715 | ||
4712 | start_pfn = max(start_pfn, zone_movable_pfn[nid]); | 4716 | start_pfn = max(start_pfn, zone_movable_pfn[nid]); |
4713 | if (start_pfn >= end_pfn) | 4717 | if (start_pfn >= end_pfn) |
4714 | continue; | 4718 | continue; |
4715 | 4719 | ||
4716 | /* Account for what is only usable for kernelcore */ | 4720 | /* Account for what is only usable for kernelcore */ |
4717 | if (start_pfn < usable_startpfn) { | 4721 | if (start_pfn < usable_startpfn) { |
4718 | unsigned long kernel_pages; | 4722 | unsigned long kernel_pages; |
4719 | kernel_pages = min(end_pfn, usable_startpfn) | 4723 | kernel_pages = min(end_pfn, usable_startpfn) |
4720 | - start_pfn; | 4724 | - start_pfn; |
4721 | 4725 | ||
4722 | kernelcore_remaining -= min(kernel_pages, | 4726 | kernelcore_remaining -= min(kernel_pages, |
4723 | kernelcore_remaining); | 4727 | kernelcore_remaining); |
4724 | required_kernelcore -= min(kernel_pages, | 4728 | required_kernelcore -= min(kernel_pages, |
4725 | required_kernelcore); | 4729 | required_kernelcore); |
4726 | 4730 | ||
4727 | /* Continue if range is now fully accounted */ | 4731 | /* Continue if range is now fully accounted */ |
4728 | if (end_pfn <= usable_startpfn) { | 4732 | if (end_pfn <= usable_startpfn) { |
4729 | 4733 | ||
4730 | /* | 4734 | /* |
4731 | * Push zone_movable_pfn to the end so | 4735 | * Push zone_movable_pfn to the end so |
4732 | * that if we have to rebalance | 4736 | * that if we have to rebalance |
4733 | * kernelcore across nodes, we will | 4737 | * kernelcore across nodes, we will |
4734 | * not double account here | 4738 | * not double account here |
4735 | */ | 4739 | */ |
4736 | zone_movable_pfn[nid] = end_pfn; | 4740 | zone_movable_pfn[nid] = end_pfn; |
4737 | continue; | 4741 | continue; |
4738 | } | 4742 | } |
4739 | start_pfn = usable_startpfn; | 4743 | start_pfn = usable_startpfn; |
4740 | } | 4744 | } |
4741 | 4745 | ||
4742 | /* | 4746 | /* |
4743 | * The usable PFN range for ZONE_MOVABLE is from | 4747 | * The usable PFN range for ZONE_MOVABLE is from |
4744 | * start_pfn->end_pfn. Calculate size_pages as the | 4748 | * start_pfn->end_pfn. Calculate size_pages as the |
4745 | * number of pages used as kernelcore | 4749 | * number of pages used as kernelcore |
4746 | */ | 4750 | */ |
4747 | size_pages = end_pfn - start_pfn; | 4751 | size_pages = end_pfn - start_pfn; |
4748 | if (size_pages > kernelcore_remaining) | 4752 | if (size_pages > kernelcore_remaining) |
4749 | size_pages = kernelcore_remaining; | 4753 | size_pages = kernelcore_remaining; |
4750 | zone_movable_pfn[nid] = start_pfn + size_pages; | 4754 | zone_movable_pfn[nid] = start_pfn + size_pages; |
4751 | 4755 | ||
4752 | /* | 4756 | /* |
4753 | * Some kernelcore has been met, update counts and | 4757 | * Some kernelcore has been met, update counts and |
4754 | * break if the kernelcore for this node has been | 4758 | * break if the kernelcore for this node has been |
4755 | * satisfied | 4759 | * satisfied |
4756 | */ | 4760 | */ |
4757 | required_kernelcore -= min(required_kernelcore, | 4761 | required_kernelcore -= min(required_kernelcore, |
4758 | size_pages); | 4762 | size_pages); |
4759 | kernelcore_remaining -= size_pages; | 4763 | kernelcore_remaining -= size_pages; |
4760 | if (!kernelcore_remaining) | 4764 | if (!kernelcore_remaining) |
4761 | break; | 4765 | break; |
4762 | } | 4766 | } |
4763 | } | 4767 | } |
4764 | 4768 | ||
4765 | /* | 4769 | /* |
4766 | * If there is still required_kernelcore, we do another pass with one | 4770 | * If there is still required_kernelcore, we do another pass with one |
4767 | * less node in the count. This will push zone_movable_pfn[nid] further | 4771 | * less node in the count. This will push zone_movable_pfn[nid] further |
4768 | * along on the nodes that still have memory until kernelcore is | 4772 | * along on the nodes that still have memory until kernelcore is |
4769 | * satisfied | 4773 | * satisfied |
4770 | */ | 4774 | */ |
4771 | usable_nodes--; | 4775 | usable_nodes--; |
4772 | if (usable_nodes && required_kernelcore > usable_nodes) | 4776 | if (usable_nodes && required_kernelcore > usable_nodes) |
4773 | goto restart; | 4777 | goto restart; |
4774 | 4778 | ||
4775 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 4779 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
4776 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 4780 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
4777 | zone_movable_pfn[nid] = | 4781 | zone_movable_pfn[nid] = |
4778 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 4782 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
4779 | 4783 | ||
4780 | out: | 4784 | out: |
4781 | /* restore the node_state */ | 4785 | /* restore the node_state */ |
4782 | node_states[N_HIGH_MEMORY] = saved_node_state; | 4786 | node_states[N_HIGH_MEMORY] = saved_node_state; |
4783 | } | 4787 | } |
4784 | 4788 | ||
4785 | /* Any regular memory on that node? */ | 4789 | /* Any regular memory on that node? */ |
4786 | static void __init check_for_regular_memory(pg_data_t *pgdat) | 4790 | static void __init check_for_regular_memory(pg_data_t *pgdat) |
4787 | { | 4791 | { |
4788 | #ifdef CONFIG_HIGHMEM | 4792 | #ifdef CONFIG_HIGHMEM |
4789 | enum zone_type zone_type; | 4793 | enum zone_type zone_type; |
4790 | 4794 | ||
4791 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | 4795 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { |
4792 | struct zone *zone = &pgdat->node_zones[zone_type]; | 4796 | struct zone *zone = &pgdat->node_zones[zone_type]; |
4793 | if (zone->present_pages) { | 4797 | if (zone->present_pages) { |
4794 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 4798 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); |
4795 | break; | 4799 | break; |
4796 | } | 4800 | } |
4797 | } | 4801 | } |
4798 | #endif | 4802 | #endif |
4799 | } | 4803 | } |
4800 | 4804 | ||
4801 | /** | 4805 | /** |
4802 | * free_area_init_nodes - Initialise all pg_data_t and zone data | 4806 | * free_area_init_nodes - Initialise all pg_data_t and zone data |
4803 | * @max_zone_pfn: an array of max PFNs for each zone | 4807 | * @max_zone_pfn: an array of max PFNs for each zone |
4804 | * | 4808 | * |
4805 | * This will call free_area_init_node() for each active node in the system. | 4809 | * This will call free_area_init_node() for each active node in the system. |
4806 | * Using the page ranges provided by add_active_range(), the size of each | 4810 | * Using the page ranges provided by add_active_range(), the size of each |
4807 | * zone in each node and its holes is calculated. If the maximum PFNs of | 4811 | * zone in each node and its holes is calculated. If the maximum PFNs of |
4808 | * two adjacent zones match, it is assumed that the zone is empty. | 4812 | * two adjacent zones match, it is assumed that the zone is empty. |
4809 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | 4813 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed |
4810 | * that arch_max_dma32_pfn has no pages. It is also assumed that a zone | 4814 | * that arch_max_dma32_pfn has no pages. It is also assumed that a zone |
4811 | * starts where the previous one ended. For example, ZONE_DMA32 starts | 4815 | * starts where the previous one ended. For example, ZONE_DMA32 starts |
4812 | * at arch_max_dma_pfn. | 4816 | * at arch_max_dma_pfn. |
4813 | */ | 4817 | */ |
4814 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | 4818 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) |
4815 | { | 4819 | { |
4816 | unsigned long start_pfn, end_pfn; | 4820 | unsigned long start_pfn, end_pfn; |
4817 | int i, nid; | 4821 | int i, nid; |
4818 | 4822 | ||
4819 | /* Record where the zone boundaries are */ | 4823 | /* Record where the zone boundaries are */ |
4820 | memset(arch_zone_lowest_possible_pfn, 0, | 4824 | memset(arch_zone_lowest_possible_pfn, 0, |
4821 | sizeof(arch_zone_lowest_possible_pfn)); | 4825 | sizeof(arch_zone_lowest_possible_pfn)); |
4822 | memset(arch_zone_highest_possible_pfn, 0, | 4826 | memset(arch_zone_highest_possible_pfn, 0, |
4823 | sizeof(arch_zone_highest_possible_pfn)); | 4827 | sizeof(arch_zone_highest_possible_pfn)); |
4824 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); | 4828 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); |
4825 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; | 4829 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; |
4826 | for (i = 1; i < MAX_NR_ZONES; i++) { | 4830 | for (i = 1; i < MAX_NR_ZONES; i++) { |
4827 | if (i == ZONE_MOVABLE) | 4831 | if (i == ZONE_MOVABLE) |
4828 | continue; | 4832 | continue; |
4829 | arch_zone_lowest_possible_pfn[i] = | 4833 | arch_zone_lowest_possible_pfn[i] = |
4830 | arch_zone_highest_possible_pfn[i-1]; | 4834 | arch_zone_highest_possible_pfn[i-1]; |
4831 | arch_zone_highest_possible_pfn[i] = | 4835 | arch_zone_highest_possible_pfn[i] = |
4832 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | 4836 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); |
4833 | } | 4837 | } |
4834 | arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; | 4838 | arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; |
4835 | arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; | 4839 | arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; |
4836 | 4840 | ||
4837 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ | 4841 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ |
4838 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); | 4842 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); |
4839 | find_zone_movable_pfns_for_nodes(); | 4843 | find_zone_movable_pfns_for_nodes(); |
4840 | 4844 | ||
4841 | /* Print out the zone ranges */ | 4845 | /* Print out the zone ranges */ |
4842 | printk("Zone ranges:\n"); | 4846 | printk("Zone ranges:\n"); |
4843 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4847 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4844 | if (i == ZONE_MOVABLE) | 4848 | if (i == ZONE_MOVABLE) |
4845 | continue; | 4849 | continue; |
4846 | printk(KERN_CONT " %-8s ", zone_names[i]); | 4850 | printk(KERN_CONT " %-8s ", zone_names[i]); |
4847 | if (arch_zone_lowest_possible_pfn[i] == | 4851 | if (arch_zone_lowest_possible_pfn[i] == |
4848 | arch_zone_highest_possible_pfn[i]) | 4852 | arch_zone_highest_possible_pfn[i]) |
4849 | printk(KERN_CONT "empty\n"); | 4853 | printk(KERN_CONT "empty\n"); |
4850 | else | 4854 | else |
4851 | printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", | 4855 | printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", |
4852 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, | 4856 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, |
4853 | (arch_zone_highest_possible_pfn[i] | 4857 | (arch_zone_highest_possible_pfn[i] |
4854 | << PAGE_SHIFT) - 1); | 4858 | << PAGE_SHIFT) - 1); |
4855 | } | 4859 | } |
4856 | 4860 | ||
4857 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ | 4861 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ |
4858 | printk("Movable zone start for each node\n"); | 4862 | printk("Movable zone start for each node\n"); |
4859 | for (i = 0; i < MAX_NUMNODES; i++) { | 4863 | for (i = 0; i < MAX_NUMNODES; i++) { |
4860 | if (zone_movable_pfn[i]) | 4864 | if (zone_movable_pfn[i]) |
4861 | printk(" Node %d: %#010lx\n", i, | 4865 | printk(" Node %d: %#010lx\n", i, |
4862 | zone_movable_pfn[i] << PAGE_SHIFT); | 4866 | zone_movable_pfn[i] << PAGE_SHIFT); |
4863 | } | 4867 | } |
4864 | 4868 | ||
4865 | /* Print out the early_node_map[] */ | 4869 | /* Print out the early_node_map[] */ |
4866 | printk("Early memory node ranges\n"); | 4870 | printk("Early memory node ranges\n"); |
4867 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4871 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4868 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 4872 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
4869 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | 4873 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); |
4870 | 4874 | ||
4871 | /* Initialise every node */ | 4875 | /* Initialise every node */ |
4872 | mminit_verify_pageflags_layout(); | 4876 | mminit_verify_pageflags_layout(); |
4873 | setup_nr_node_ids(); | 4877 | setup_nr_node_ids(); |
4874 | for_each_online_node(nid) { | 4878 | for_each_online_node(nid) { |
4875 | pg_data_t *pgdat = NODE_DATA(nid); | 4879 | pg_data_t *pgdat = NODE_DATA(nid); |
4876 | free_area_init_node(nid, NULL, | 4880 | free_area_init_node(nid, NULL, |
4877 | find_min_pfn_for_node(nid), NULL); | 4881 | find_min_pfn_for_node(nid), NULL); |
4878 | 4882 | ||
4879 | /* Any memory on that node */ | 4883 | /* Any memory on that node */ |
4880 | if (pgdat->node_present_pages) | 4884 | if (pgdat->node_present_pages) |
4881 | node_set_state(nid, N_HIGH_MEMORY); | 4885 | node_set_state(nid, N_HIGH_MEMORY); |
4882 | check_for_regular_memory(pgdat); | 4886 | check_for_regular_memory(pgdat); |
4883 | } | 4887 | } |
4884 | } | 4888 | } |
4885 | 4889 | ||
4886 | static int __init cmdline_parse_core(char *p, unsigned long *core) | 4890 | static int __init cmdline_parse_core(char *p, unsigned long *core) |
4887 | { | 4891 | { |
4888 | unsigned long long coremem; | 4892 | unsigned long long coremem; |
4889 | if (!p) | 4893 | if (!p) |
4890 | return -EINVAL; | 4894 | return -EINVAL; |
4891 | 4895 | ||
4892 | coremem = memparse(p, &p); | 4896 | coremem = memparse(p, &p); |
4893 | *core = coremem >> PAGE_SHIFT; | 4897 | *core = coremem >> PAGE_SHIFT; |
4894 | 4898 | ||
4895 | /* Paranoid check that UL is enough for the coremem value */ | 4899 | /* Paranoid check that UL is enough for the coremem value */ |
4896 | WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); | 4900 | WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); |
4897 | 4901 | ||
4898 | return 0; | 4902 | return 0; |
4899 | } | 4903 | } |
4900 | 4904 | ||
4901 | /* | 4905 | /* |
4902 | * kernelcore=size sets the amount of memory to use for allocations that | 4906 | * kernelcore=size sets the amount of memory to use for allocations that |
4903 | * cannot be reclaimed or migrated. | 4907 | * cannot be reclaimed or migrated. |
4904 | */ | 4908 | */ |
4905 | static int __init cmdline_parse_kernelcore(char *p) | 4909 | static int __init cmdline_parse_kernelcore(char *p) |
4906 | { | 4910 | { |
4907 | return cmdline_parse_core(p, &required_kernelcore); | 4911 | return cmdline_parse_core(p, &required_kernelcore); |
4908 | } | 4912 | } |
4909 | 4913 | ||
4910 | /* | 4914 | /* |
4911 | * movablecore=size sets the amount of memory to use for allocations that | 4915 | * movablecore=size sets the amount of memory to use for allocations that |
4912 | * can be reclaimed or migrated. | 4916 | * can be reclaimed or migrated. |
4913 | */ | 4917 | */ |
4914 | static int __init cmdline_parse_movablecore(char *p) | 4918 | static int __init cmdline_parse_movablecore(char *p) |
4915 | { | 4919 | { |
4916 | return cmdline_parse_core(p, &required_movablecore); | 4920 | return cmdline_parse_core(p, &required_movablecore); |
4917 | } | 4921 | } |
4918 | 4922 | ||
4919 | early_param("kernelcore", cmdline_parse_kernelcore); | 4923 | early_param("kernelcore", cmdline_parse_kernelcore); |
4920 | early_param("movablecore", cmdline_parse_movablecore); | 4924 | early_param("movablecore", cmdline_parse_movablecore); |
4921 | 4925 | ||
4922 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4926 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4923 | 4927 | ||
4924 | /** | 4928 | /** |
4925 | * set_dma_reserve - set the specified number of pages reserved in the first zone | 4929 | * set_dma_reserve - set the specified number of pages reserved in the first zone |
4926 | * @new_dma_reserve: The number of pages to mark reserved | 4930 | * @new_dma_reserve: The number of pages to mark reserved |
4927 | * | 4931 | * |
4928 | * The per-cpu batchsize and zone watermarks are determined by present_pages. | 4932 | * The per-cpu batchsize and zone watermarks are determined by present_pages. |
4929 | * In the DMA zone, a significant percentage may be consumed by kernel image | 4933 | * In the DMA zone, a significant percentage may be consumed by kernel image |
4930 | * and other unfreeable allocations which can skew the watermarks badly. This | 4934 | * and other unfreeable allocations which can skew the watermarks badly. This |
4931 | * function may optionally be used to account for unfreeable pages in the | 4935 | * function may optionally be used to account for unfreeable pages in the |
4932 | * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and | 4936 | * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and |
4933 | * smaller per-cpu batchsize. | 4937 | * smaller per-cpu batchsize. |
4934 | */ | 4938 | */ |
4935 | void __init set_dma_reserve(unsigned long new_dma_reserve) | 4939 | void __init set_dma_reserve(unsigned long new_dma_reserve) |
4936 | { | 4940 | { |
4937 | dma_reserve = new_dma_reserve; | 4941 | dma_reserve = new_dma_reserve; |
4938 | } | 4942 | } |
4939 | 4943 | ||
4940 | void __init free_area_init(unsigned long *zones_size) | 4944 | void __init free_area_init(unsigned long *zones_size) |
4941 | { | 4945 | { |
4942 | free_area_init_node(0, zones_size, | 4946 | free_area_init_node(0, zones_size, |
4943 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 4947 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
4944 | } | 4948 | } |
4945 | 4949 | ||
4946 | static int page_alloc_cpu_notify(struct notifier_block *self, | 4950 | static int page_alloc_cpu_notify(struct notifier_block *self, |
4947 | unsigned long action, void *hcpu) | 4951 | unsigned long action, void *hcpu) |
4948 | { | 4952 | { |
4949 | int cpu = (unsigned long)hcpu; | 4953 | int cpu = (unsigned long)hcpu; |
4950 | 4954 | ||
4951 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 4955 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
4952 | lru_add_drain_cpu(cpu); | 4956 | lru_add_drain_cpu(cpu); |
4953 | drain_pages(cpu); | 4957 | drain_pages(cpu); |
4954 | 4958 | ||
4955 | /* | 4959 | /* |
4956 | * Spill the event counters of the dead processor | 4960 | * Spill the event counters of the dead processor |
4957 | * into the current processor's event counters. | 4961 | * into the current processor's event counters. |
4958 | * This artificially elevates the count of the current | 4962 | * This artificially elevates the count of the current |
4959 | * processor. | 4963 | * processor. |
4960 | */ | 4964 | */ |
4961 | vm_events_fold_cpu(cpu); | 4965 | vm_events_fold_cpu(cpu); |
4962 | 4966 | ||
4963 | /* | 4967 | /* |
4964 | * Zero the differential counters of the dead processor | 4968 | * Zero the differential counters of the dead processor |
4965 | * so that the vm statistics are consistent. | 4969 | * so that the vm statistics are consistent. |
4966 | * | 4970 | * |
4967 | * This is only okay since the processor is dead and cannot | 4971 | * This is only okay since the processor is dead and cannot |
4968 | * race with what we are doing. | 4972 | * race with what we are doing. |
4969 | */ | 4973 | */ |
4970 | refresh_cpu_vm_stats(cpu); | 4974 | refresh_cpu_vm_stats(cpu); |
4971 | } | 4975 | } |
4972 | return NOTIFY_OK; | 4976 | return NOTIFY_OK; |
4973 | } | 4977 | } |
4974 | 4978 | ||
4975 | void __init page_alloc_init(void) | 4979 | void __init page_alloc_init(void) |
4976 | { | 4980 | { |
4977 | hotcpu_notifier(page_alloc_cpu_notify, 0); | 4981 | hotcpu_notifier(page_alloc_cpu_notify, 0); |
4978 | } | 4982 | } |
4979 | 4983 | ||
4980 | /* | 4984 | /* |
4981 | * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio | 4985 | * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio |
4982 | * or min_free_kbytes changes. | 4986 | * or min_free_kbytes changes. |
4983 | */ | 4987 | */ |
4984 | static void calculate_totalreserve_pages(void) | 4988 | static void calculate_totalreserve_pages(void) |
4985 | { | 4989 | { |
4986 | struct pglist_data *pgdat; | 4990 | struct pglist_data *pgdat; |
4987 | unsigned long reserve_pages = 0; | 4991 | unsigned long reserve_pages = 0; |
4988 | enum zone_type i, j; | 4992 | enum zone_type i, j; |
4989 | 4993 | ||
4990 | for_each_online_pgdat(pgdat) { | 4994 | for_each_online_pgdat(pgdat) { |
4991 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4995 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4992 | struct zone *zone = pgdat->node_zones + i; | 4996 | struct zone *zone = pgdat->node_zones + i; |
4993 | unsigned long max = 0; | 4997 | unsigned long max = 0; |
4994 | 4998 | ||
4995 | /* Find valid and maximum lowmem_reserve in the zone */ | 4999 | /* Find valid and maximum lowmem_reserve in the zone */ |
4996 | for (j = i; j < MAX_NR_ZONES; j++) { | 5000 | for (j = i; j < MAX_NR_ZONES; j++) { |
4997 | if (zone->lowmem_reserve[j] > max) | 5001 | if (zone->lowmem_reserve[j] > max) |
4998 | max = zone->lowmem_reserve[j]; | 5002 | max = zone->lowmem_reserve[j]; |
4999 | } | 5003 | } |
5000 | 5004 | ||
5001 | /* we treat the high watermark as reserved pages. */ | 5005 | /* we treat the high watermark as reserved pages. */ |
5002 | max += high_wmark_pages(zone); | 5006 | max += high_wmark_pages(zone); |
5003 | 5007 | ||
5004 | if (max > zone->present_pages) | 5008 | if (max > zone->present_pages) |
5005 | max = zone->present_pages; | 5009 | max = zone->present_pages; |
5006 | reserve_pages += max; | 5010 | reserve_pages += max; |
5007 | /* | 5011 | /* |
5008 | * Lowmem reserves are not available to | 5012 | * Lowmem reserves are not available to |
5009 | * GFP_HIGHUSER page cache allocations and | 5013 | * GFP_HIGHUSER page cache allocations and |
5010 | * kswapd tries to balance zones to their high | 5014 | * kswapd tries to balance zones to their high |
5011 | * watermark. As a result, neither should be | 5015 | * watermark. As a result, neither should be |
5012 | * regarded as dirtyable memory, to prevent a | 5016 | * regarded as dirtyable memory, to prevent a |
5013 | * situation where reclaim has to clean pages | 5017 | * situation where reclaim has to clean pages |
5014 | * in order to balance the zones. | 5018 | * in order to balance the zones. |
5015 | */ | 5019 | */ |
5016 | zone->dirty_balance_reserve = max; | 5020 | zone->dirty_balance_reserve = max; |
5017 | } | 5021 | } |
5018 | } | 5022 | } |
5019 | dirty_balance_reserve = reserve_pages; | 5023 | dirty_balance_reserve = reserve_pages; |
5020 | totalreserve_pages = reserve_pages; | 5024 | totalreserve_pages = reserve_pages; |
5021 | } | 5025 | } |
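For reference, here is a minimal userspace sketch of the per-zone reserve accounting above; the zone sizes, lowmem_reserve entries and high watermark are invented example values, not taken from any real configuration.

/*
 * Illustrative sketch of the totalreserve contribution of one zone:
 * largest lowmem_reserve entry plus the high watermark, capped at
 * present_pages. All numbers below are made-up examples.
 */
#include <stdio.h>

int main(void)
{
	unsigned long lowmem_reserve[3] = { 0, 256, 32768 };	/* pages */
	unsigned long high_wmark = 1024;			/* pages */
	unsigned long present_pages = 262144;			/* 1GB of 4KiB pages */
	unsigned long max = 0;
	int j;

	/* find the largest lowmem_reserve entry for this zone */
	for (j = 0; j < 3; j++)
		if (lowmem_reserve[j] > max)
			max = lowmem_reserve[j];

	/* the high watermark is treated as reserved as well */
	max += high_wmark;
	if (max > present_pages)
		max = present_pages;

	printf("reserve contribution of this zone: %lu pages\n", max);
	return 0;	/* prints 33792 with these inputs */
}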
5022 | 5026 | ||
5023 | /* | 5027 | /* |
5024 | * setup_per_zone_lowmem_reserve - called whenever | 5028 | * setup_per_zone_lowmem_reserve - called whenever |
5025 | * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone | 5029 | * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone |
5026 | * has a correct pages reserved value, so an adequate number of | 5030 | * has a correct pages reserved value, so an adequate number of |
5027 | * pages are left in the zone after a successful __alloc_pages(). | 5031 | * pages are left in the zone after a successful __alloc_pages(). |
5028 | */ | 5032 | */ |
5029 | static void setup_per_zone_lowmem_reserve(void) | 5033 | static void setup_per_zone_lowmem_reserve(void) |
5030 | { | 5034 | { |
5031 | struct pglist_data *pgdat; | 5035 | struct pglist_data *pgdat; |
5032 | enum zone_type j, idx; | 5036 | enum zone_type j, idx; |
5033 | 5037 | ||
5034 | for_each_online_pgdat(pgdat) { | 5038 | for_each_online_pgdat(pgdat) { |
5035 | for (j = 0; j < MAX_NR_ZONES; j++) { | 5039 | for (j = 0; j < MAX_NR_ZONES; j++) { |
5036 | struct zone *zone = pgdat->node_zones + j; | 5040 | struct zone *zone = pgdat->node_zones + j; |
5037 | unsigned long present_pages = zone->present_pages; | 5041 | unsigned long present_pages = zone->present_pages; |
5038 | 5042 | ||
5039 | zone->lowmem_reserve[j] = 0; | 5043 | zone->lowmem_reserve[j] = 0; |
5040 | 5044 | ||
5041 | idx = j; | 5045 | idx = j; |
5042 | while (idx) { | 5046 | while (idx) { |
5043 | struct zone *lower_zone; | 5047 | struct zone *lower_zone; |
5044 | 5048 | ||
5045 | idx--; | 5049 | idx--; |
5046 | 5050 | ||
5047 | if (sysctl_lowmem_reserve_ratio[idx] < 1) | 5051 | if (sysctl_lowmem_reserve_ratio[idx] < 1) |
5048 | sysctl_lowmem_reserve_ratio[idx] = 1; | 5052 | sysctl_lowmem_reserve_ratio[idx] = 1; |
5049 | 5053 | ||
5050 | lower_zone = pgdat->node_zones + idx; | 5054 | lower_zone = pgdat->node_zones + idx; |
5051 | lower_zone->lowmem_reserve[j] = present_pages / | 5055 | lower_zone->lowmem_reserve[j] = present_pages / |
5052 | sysctl_lowmem_reserve_ratio[idx]; | 5056 | sysctl_lowmem_reserve_ratio[idx]; |
5053 | present_pages += lower_zone->present_pages; | 5057 | present_pages += lower_zone->present_pages; |
5054 | } | 5058 | } |
5055 | } | 5059 | } |
5056 | } | 5060 | } |
5057 | 5061 | ||
5058 | /* update totalreserve_pages */ | 5062 | /* update totalreserve_pages */ |
5059 | calculate_totalreserve_pages(); | 5063 | calculate_totalreserve_pages(); |
5060 | } | 5064 | } |
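A hedged illustration of the cumulative lowmem_reserve arithmetic follows; the two-zone (DMA + Normal) layout and the ratio of 256 are assumed example inputs (256 matches the usual DMA default, but nothing here is read from a real system).

/*
 * Sketch of setup_per_zone_lowmem_reserve() for two zones: the lower
 * zone withholds present_pages / ratio pages from allocations that
 * could have used the higher zone.
 */
#include <stdio.h>

int main(void)
{
	unsigned long present[2] = { 4096, 253952 };	/* DMA, Normal (pages) */
	int ratio[2] = { 256, 32 };			/* only the DMA ratio is used here */
	unsigned long reserve[2][2] = { { 0 } };
	int j, idx;

	for (j = 0; j < 2; j++) {
		unsigned long pages = present[j];

		reserve[j][j] = 0;
		for (idx = j - 1; idx >= 0; idx--) {
			/* pages of the lower zone held back from zone-j allocations */
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}

	printf("DMA pages reserved from Normal-capable allocations: %lu\n",
	       reserve[0][1]);	/* 253952 / 256 = 992 */
	return 0;
}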
5061 | 5065 | ||
5062 | static void __setup_per_zone_wmarks(void) | 5066 | static void __setup_per_zone_wmarks(void) |
5063 | { | 5067 | { |
5064 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 5068 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
5065 | unsigned long lowmem_pages = 0; | 5069 | unsigned long lowmem_pages = 0; |
5066 | struct zone *zone; | 5070 | struct zone *zone; |
5067 | unsigned long flags; | 5071 | unsigned long flags; |
5068 | 5072 | ||
5069 | /* Calculate total number of !ZONE_HIGHMEM pages */ | 5073 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
5070 | for_each_zone(zone) { | 5074 | for_each_zone(zone) { |
5071 | if (!is_highmem(zone)) | 5075 | if (!is_highmem(zone)) |
5072 | lowmem_pages += zone->present_pages; | 5076 | lowmem_pages += zone->present_pages; |
5073 | } | 5077 | } |
5074 | 5078 | ||
5075 | for_each_zone(zone) { | 5079 | for_each_zone(zone) { |
5076 | u64 tmp; | 5080 | u64 tmp; |
5077 | 5081 | ||
5078 | spin_lock_irqsave(&zone->lock, flags); | 5082 | spin_lock_irqsave(&zone->lock, flags); |
5079 | tmp = (u64)pages_min * zone->present_pages; | 5083 | tmp = (u64)pages_min * zone->present_pages; |
5080 | do_div(tmp, lowmem_pages); | 5084 | do_div(tmp, lowmem_pages); |
5081 | if (is_highmem(zone)) { | 5085 | if (is_highmem(zone)) { |
5082 | /* | 5086 | /* |
5083 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't | 5087 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't |
5084 | * need highmem pages, so cap pages_min to a small | 5088 | * need highmem pages, so cap pages_min to a small |
5085 | * value here. | 5089 | * value here. |
5086 | * | 5090 | * |
5087 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) | 5091 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
5088 | * deltas control async page reclaim, and so should | 5092 | * deltas control async page reclaim, and so should |
5089 | * not be capped for highmem. | 5093 | * not be capped for highmem. |
5090 | */ | 5094 | */ |
5091 | int min_pages; | 5095 | int min_pages; |
5092 | 5096 | ||
5093 | min_pages = zone->present_pages / 1024; | 5097 | min_pages = zone->present_pages / 1024; |
5094 | if (min_pages < SWAP_CLUSTER_MAX) | 5098 | if (min_pages < SWAP_CLUSTER_MAX) |
5095 | min_pages = SWAP_CLUSTER_MAX; | 5099 | min_pages = SWAP_CLUSTER_MAX; |
5096 | if (min_pages > 128) | 5100 | if (min_pages > 128) |
5097 | min_pages = 128; | 5101 | min_pages = 128; |
5098 | zone->watermark[WMARK_MIN] = min_pages; | 5102 | zone->watermark[WMARK_MIN] = min_pages; |
5099 | } else { | 5103 | } else { |
5100 | /* | 5104 | /* |
5101 | * If it's a lowmem zone, reserve a number of pages | 5105 | * If it's a lowmem zone, reserve a number of pages |
5102 | * proportionate to the zone's size. | 5106 | * proportionate to the zone's size. |
5103 | */ | 5107 | */ |
5104 | zone->watermark[WMARK_MIN] = tmp; | 5108 | zone->watermark[WMARK_MIN] = tmp; |
5105 | } | 5109 | } |
5106 | 5110 | ||
5107 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 5111 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
5108 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 5112 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
5109 | 5113 | ||
5110 | zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); | 5114 | zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); |
5111 | zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); | 5115 | zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); |
5112 | zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); | 5116 | zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); |
5113 | 5117 | ||
5114 | setup_zone_migrate_reserve(zone); | 5118 | setup_zone_migrate_reserve(zone); |
5115 | spin_unlock_irqrestore(&zone->lock, flags); | 5119 | spin_unlock_irqrestore(&zone->lock, flags); |
5116 | } | 5120 | } |
5117 | 5121 | ||
5118 | /* update totalreserve_pages */ | 5122 | /* update totalreserve_pages */ |
5119 | calculate_totalreserve_pages(); | 5123 | calculate_totalreserve_pages(); |
5120 | } | 5124 | } |
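The sketch below walks the same watermark arithmetic for a single hypothetical lowmem zone, assuming min_free_kbytes = 4096 and 4KiB pages; it is illustrative only, not kernel code.

/*
 * Minimal sketch of the lowmem watermark arithmetic above: the zone's
 * share of pages_min becomes WMARK_MIN, with LOW and HIGH 25% and 50%
 * above it. Inputs are assumed example values.
 */
#include <stdio.h>

int main(void)
{
	unsigned long min_free_kbytes = 4096;
	unsigned long page_shift = 12;			/* 4KiB pages */
	unsigned long zone_present = 262144;		/* pages in this zone */
	unsigned long lowmem_pages = 262144;		/* all lowmem pages */

	unsigned long pages_min = min_free_kbytes >> (page_shift - 10);
	unsigned long long tmp = (unsigned long long)pages_min * zone_present
				 / lowmem_pages;

	unsigned long wmark_min = tmp;				/* 1024 pages */
	unsigned long wmark_low = wmark_min + (tmp >> 2);	/* +25% -> 1280 */
	unsigned long wmark_high = wmark_min + (tmp >> 1);	/* +50% -> 1536 */

	printf("min=%lu low=%lu high=%lu\n", wmark_min, wmark_low, wmark_high);
	return 0;
}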
5121 | 5125 | ||
5122 | /** | 5126 | /** |
5123 | * setup_per_zone_wmarks - called when min_free_kbytes changes | 5127 | * setup_per_zone_wmarks - called when min_free_kbytes changes |
5124 | * or when memory is hot-{added|removed} | 5128 | * or when memory is hot-{added|removed} |
5125 | * | 5129 | * |
5126 | * Ensures that the watermark[min,low,high] values for each zone are set | 5130 | * Ensures that the watermark[min,low,high] values for each zone are set |
5127 | * correctly with respect to min_free_kbytes. | 5131 | * correctly with respect to min_free_kbytes. |
5128 | */ | 5132 | */ |
5129 | void setup_per_zone_wmarks(void) | 5133 | void setup_per_zone_wmarks(void) |
5130 | { | 5134 | { |
5131 | mutex_lock(&zonelists_mutex); | 5135 | mutex_lock(&zonelists_mutex); |
5132 | __setup_per_zone_wmarks(); | 5136 | __setup_per_zone_wmarks(); |
5133 | mutex_unlock(&zonelists_mutex); | 5137 | mutex_unlock(&zonelists_mutex); |
5134 | } | 5138 | } |
5135 | 5139 | ||
5136 | /* | 5140 | /* |
5137 | * The inactive anon list should be small enough that the VM never has to | 5141 | * The inactive anon list should be small enough that the VM never has to |
5138 | * do too much work, but large enough that each inactive page has a chance | 5142 | * do too much work, but large enough that each inactive page has a chance |
5139 | * to be referenced again before it is swapped out. | 5143 | * to be referenced again before it is swapped out. |
5140 | * | 5144 | * |
5141 | * The inactive_anon ratio is the target ratio of ACTIVE_ANON to | 5145 | * The inactive_anon ratio is the target ratio of ACTIVE_ANON to |
5142 | * INACTIVE_ANON pages on this zone's LRU, maintained by the | 5146 | * INACTIVE_ANON pages on this zone's LRU, maintained by the |
5143 | * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of | 5147 | * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of |
5144 | * the anonymous pages are kept on the inactive list. | 5148 | * the anonymous pages are kept on the inactive list. |
5145 | * | 5149 | * |
5146 | * total target max | 5150 | * total target max |
5147 | * memory ratio inactive anon | 5151 | * memory ratio inactive anon |
5148 | * ------------------------------------- | 5152 | * ------------------------------------- |
5149 | * 10MB 1 5MB | 5153 | * 10MB 1 5MB |
5150 | * 100MB 1 50MB | 5154 | * 100MB 1 50MB |
5151 | * 1GB 3 250MB | 5155 | * 1GB 3 250MB |
5152 | * 10GB 10 0.9GB | 5156 | * 10GB 10 0.9GB |
5153 | * 100GB 31 3GB | 5157 | * 100GB 31 3GB |
5154 | * 1TB 101 10GB | 5158 | * 1TB 101 10GB |
5155 | * 10TB 320 32GB | 5159 | * 10TB 320 32GB |
5156 | */ | 5160 | */ |
5157 | static void __meminit calculate_zone_inactive_ratio(struct zone *zone) | 5161 | static void __meminit calculate_zone_inactive_ratio(struct zone *zone) |
5158 | { | 5162 | { |
5159 | unsigned int gb, ratio; | 5163 | unsigned int gb, ratio; |
5160 | 5164 | ||
5161 | /* Zone size in gigabytes */ | 5165 | /* Zone size in gigabytes */ |
5162 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | 5166 | gb = zone->present_pages >> (30 - PAGE_SHIFT); |
5163 | if (gb) | 5167 | if (gb) |
5164 | ratio = int_sqrt(10 * gb); | 5168 | ratio = int_sqrt(10 * gb); |
5165 | else | 5169 | else |
5166 | ratio = 1; | 5170 | ratio = 1; |
5167 | 5171 | ||
5168 | zone->inactive_ratio = ratio; | 5172 | zone->inactive_ratio = ratio; |
5169 | } | 5173 | } |
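The following small program reproduces the inactive_ratio figures from the table above, using a naive stand-in for int_sqrt(); it is a sketch, not kernel code.

/*
 * inactive_ratio = int_sqrt(10 * zone_size_in_GB), minimum 1.
 * The int_sqrt() stand-in below is a trivial integer loop.
 */
#include <stdio.h>

static unsigned int int_sqrt_approx(unsigned long x)
{
	unsigned int r = 0;

	while ((unsigned long)(r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* zone sizes in gigabytes, mirroring rows of the table above */
	unsigned long sizes_gb[] = { 1, 10, 100, 1024 };
	int i;

	for (i = 0; i < 4; i++) {
		unsigned int ratio = sizes_gb[i] ?
			int_sqrt_approx(10 * sizes_gb[i]) : 1;
		printf("%4luGB -> inactive_ratio %u\n", sizes_gb[i], ratio);
	}
	return 0;	/* prints 3, 10, 31, 101 */
}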
5170 | 5174 | ||
5171 | static void __meminit setup_per_zone_inactive_ratio(void) | 5175 | static void __meminit setup_per_zone_inactive_ratio(void) |
5172 | { | 5176 | { |
5173 | struct zone *zone; | 5177 | struct zone *zone; |
5174 | 5178 | ||
5175 | for_each_zone(zone) | 5179 | for_each_zone(zone) |
5176 | calculate_zone_inactive_ratio(zone); | 5180 | calculate_zone_inactive_ratio(zone); |
5177 | } | 5181 | } |
5178 | 5182 | ||
5179 | /* | 5183 | /* |
5180 | * Initialise min_free_kbytes. | 5184 | * Initialise min_free_kbytes. |
5181 | * | 5185 | * |
5182 | * For small machines we want it small (128k min). For large machines | 5186 | * For small machines we want it small (128k min). For large machines |
5183 | * we want it large (64MB max). But it is not linear, because network | 5187 | * we want it large (64MB max). But it is not linear, because network |
5184 | * bandwidth does not increase linearly with machine size. We use | 5188 | * bandwidth does not increase linearly with machine size. We use |
5185 | * | 5189 | * |
5186 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: | 5190 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: |
5187 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) | 5191 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) |
5188 | * | 5192 | * |
5189 | * which yields | 5193 | * which yields |
5190 | * | 5194 | * |
5191 | * 16MB: 512k | 5195 | * 16MB: 512k |
5192 | * 32MB: 724k | 5196 | * 32MB: 724k |
5193 | * 64MB: 1024k | 5197 | * 64MB: 1024k |
5194 | * 128MB: 1448k | 5198 | * 128MB: 1448k |
5195 | * 256MB: 2048k | 5199 | * 256MB: 2048k |
5196 | * 512MB: 2896k | 5200 | * 512MB: 2896k |
5197 | * 1024MB: 4096k | 5201 | * 1024MB: 4096k |
5198 | * 2048MB: 5792k | 5202 | * 2048MB: 5792k |
5199 | * 4096MB: 8192k | 5203 | * 4096MB: 8192k |
5200 | * 8192MB: 11584k | 5204 | * 8192MB: 11584k |
5201 | * 16384MB: 16384k | 5205 | * 16384MB: 16384k |
5202 | */ | 5206 | */ |
5203 | int __meminit init_per_zone_wmark_min(void) | 5207 | int __meminit init_per_zone_wmark_min(void) |
5204 | { | 5208 | { |
5205 | unsigned long lowmem_kbytes; | 5209 | unsigned long lowmem_kbytes; |
5206 | 5210 | ||
5207 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); | 5211 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); |
5208 | 5212 | ||
5209 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); | 5213 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); |
5210 | if (min_free_kbytes < 128) | 5214 | if (min_free_kbytes < 128) |
5211 | min_free_kbytes = 128; | 5215 | min_free_kbytes = 128; |
5212 | if (min_free_kbytes > 65536) | 5216 | if (min_free_kbytes > 65536) |
5213 | min_free_kbytes = 65536; | 5217 | min_free_kbytes = 65536; |
5214 | setup_per_zone_wmarks(); | 5218 | setup_per_zone_wmarks(); |
5215 | refresh_zone_stat_thresholds(); | 5219 | refresh_zone_stat_thresholds(); |
5216 | setup_per_zone_lowmem_reserve(); | 5220 | setup_per_zone_lowmem_reserve(); |
5217 | setup_per_zone_inactive_ratio(); | 5221 | setup_per_zone_inactive_ratio(); |
5218 | return 0; | 5222 | return 0; |
5219 | } | 5223 | } |
5220 | module_init(init_per_zone_wmark_min) | 5224 | module_init(init_per_zone_wmark_min) |
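As a sanity check of the heuristic documented above, this userspace snippet (link with -lm) recomputes a few rows of the table from assumed lowmem sizes; it is a sketch of the formula, not of the kernel function.

/*
 * min_free_kbytes = sqrt(lowmem_kbytes * 16), clamped to [128, 65536].
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
	unsigned long lowmem_mb[] = { 16, 1024, 16384 };	/* example inputs */
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long lowmem_kbytes = lowmem_mb[i] * 1024;
		unsigned long min_free_kbytes =
			(unsigned long)sqrt((double)lowmem_kbytes * 16);

		if (min_free_kbytes < 128)
			min_free_kbytes = 128;
		if (min_free_kbytes > 65536)
			min_free_kbytes = 65536;

		printf("%6luMB lowmem -> min_free_kbytes = %luk\n",
		       lowmem_mb[i], min_free_kbytes);
	}
	return 0;	/* prints 512k, 4096k, 16384k as in the table */
}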
5221 | 5225 | ||
5222 | /* | 5226 | /* |
5223 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 5227 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
5224 | * that we can call two helper functions whenever min_free_kbytes | 5228 | * that we can call two helper functions whenever min_free_kbytes |
5225 | * changes. | 5229 | * changes. |
5226 | */ | 5230 | */ |
5227 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 5231 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
5228 | void __user *buffer, size_t *length, loff_t *ppos) | 5232 | void __user *buffer, size_t *length, loff_t *ppos) |
5229 | { | 5233 | { |
5230 | proc_dointvec(table, write, buffer, length, ppos); | 5234 | proc_dointvec(table, write, buffer, length, ppos); |
5231 | if (write) | 5235 | if (write) |
5232 | setup_per_zone_wmarks(); | 5236 | setup_per_zone_wmarks(); |
5233 | return 0; | 5237 | return 0; |
5234 | } | 5238 | } |
5235 | 5239 | ||
5236 | #ifdef CONFIG_NUMA | 5240 | #ifdef CONFIG_NUMA |
5237 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | 5241 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, |
5238 | void __user *buffer, size_t *length, loff_t *ppos) | 5242 | void __user *buffer, size_t *length, loff_t *ppos) |
5239 | { | 5243 | { |
5240 | struct zone *zone; | 5244 | struct zone *zone; |
5241 | int rc; | 5245 | int rc; |
5242 | 5246 | ||
5243 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5247 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5244 | if (rc) | 5248 | if (rc) |
5245 | return rc; | 5249 | return rc; |
5246 | 5250 | ||
5247 | for_each_zone(zone) | 5251 | for_each_zone(zone) |
5248 | zone->min_unmapped_pages = (zone->present_pages * | 5252 | zone->min_unmapped_pages = (zone->present_pages * |
5249 | sysctl_min_unmapped_ratio) / 100; | 5253 | sysctl_min_unmapped_ratio) / 100; |
5250 | return 0; | 5254 | return 0; |
5251 | } | 5255 | } |
5252 | 5256 | ||
5253 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | 5257 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, |
5254 | void __user *buffer, size_t *length, loff_t *ppos) | 5258 | void __user *buffer, size_t *length, loff_t *ppos) |
5255 | { | 5259 | { |
5256 | struct zone *zone; | 5260 | struct zone *zone; |
5257 | int rc; | 5261 | int rc; |
5258 | 5262 | ||
5259 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5263 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5260 | if (rc) | 5264 | if (rc) |
5261 | return rc; | 5265 | return rc; |
5262 | 5266 | ||
5263 | for_each_zone(zone) | 5267 | for_each_zone(zone) |
5264 | zone->min_slab_pages = (zone->present_pages * | 5268 | zone->min_slab_pages = (zone->present_pages * |
5265 | sysctl_min_slab_ratio) / 100; | 5269 | sysctl_min_slab_ratio) / 100; |
5266 | return 0; | 5270 | return 0; |
5267 | } | 5271 | } |
5268 | #endif | 5272 | #endif |
5269 | 5273 | ||
5270 | /* | 5274 | /* |
5271 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around | 5275 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around |
5272 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() | 5276 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() |
5273 | * whenever sysctl_lowmem_reserve_ratio changes. | 5277 | * whenever sysctl_lowmem_reserve_ratio changes. |
5274 | * | 5278 | * |
5275 | * The reserve ratio obviously has absolutely no relation with the | 5279 | * The reserve ratio obviously has absolutely no relation with the |
5276 | * minimum watermarks. The lowmem reserve ratio only makes sense | 5280 | * minimum watermarks. The lowmem reserve ratio only makes sense |
5277 | * as a function of the boot-time zone sizes. | 5281 | * as a function of the boot-time zone sizes. |
5278 | */ | 5282 | */ |
5279 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 5283 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
5280 | void __user *buffer, size_t *length, loff_t *ppos) | 5284 | void __user *buffer, size_t *length, loff_t *ppos) |
5281 | { | 5285 | { |
5282 | proc_dointvec_minmax(table, write, buffer, length, ppos); | 5286 | proc_dointvec_minmax(table, write, buffer, length, ppos); |
5283 | setup_per_zone_lowmem_reserve(); | 5287 | setup_per_zone_lowmem_reserve(); |
5284 | return 0; | 5288 | return 0; |
5285 | } | 5289 | } |
5286 | 5290 | ||
5287 | /* | 5291 | /* |
5288 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each | 5292 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each |
5289 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | 5293 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist |
5290 | * can have before it gets flushed back to the buddy allocator. | 5294 | * can have before it gets flushed back to the buddy allocator. |
5291 | */ | 5295 | */ |
5292 | 5296 | ||
5293 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | 5297 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, |
5294 | void __user *buffer, size_t *length, loff_t *ppos) | 5298 | void __user *buffer, size_t *length, loff_t *ppos) |
5295 | { | 5299 | { |
5296 | struct zone *zone; | 5300 | struct zone *zone; |
5297 | unsigned int cpu; | 5301 | unsigned int cpu; |
5298 | int ret; | 5302 | int ret; |
5299 | 5303 | ||
5300 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5304 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5301 | if (!write || (ret < 0)) | 5305 | if (!write || (ret < 0)) |
5302 | return ret; | 5306 | return ret; |
5303 | for_each_populated_zone(zone) { | 5307 | for_each_populated_zone(zone) { |
5304 | for_each_possible_cpu(cpu) { | 5308 | for_each_possible_cpu(cpu) { |
5305 | unsigned long high; | 5309 | unsigned long high; |
5306 | high = zone->present_pages / percpu_pagelist_fraction; | 5310 | high = zone->present_pages / percpu_pagelist_fraction; |
5307 | setup_pagelist_highmark( | 5311 | setup_pagelist_highmark( |
5308 | per_cpu_ptr(zone->pageset, cpu), high); | 5312 | per_cpu_ptr(zone->pageset, cpu), high); |
5309 | } | 5313 | } |
5310 | } | 5314 | } |
5311 | return 0; | 5315 | return 0; |
5312 | } | 5316 | } |
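A tiny worked example of the pcp->high derivation, using an assumed zone size and an assumed fraction of 8:

/*
 * With a hypothetical fraction of 8, each per-cpu pagelist may hold
 * 1/8 of the zone before being drained back to the buddy allocator.
 */
#include <stdio.h>

int main(void)
{
	unsigned long present_pages = 262144;	/* example zone size */
	int fraction = 8;			/* example sysctl value */

	printf("pcp high mark: %lu pages\n", present_pages / fraction);
	return 0;	/* 32768 pages */
}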
5313 | 5317 | ||
5314 | int hashdist = HASHDIST_DEFAULT; | 5318 | int hashdist = HASHDIST_DEFAULT; |
5315 | 5319 | ||
5316 | #ifdef CONFIG_NUMA | 5320 | #ifdef CONFIG_NUMA |
5317 | static int __init set_hashdist(char *str) | 5321 | static int __init set_hashdist(char *str) |
5318 | { | 5322 | { |
5319 | if (!str) | 5323 | if (!str) |
5320 | return 0; | 5324 | return 0; |
5321 | hashdist = simple_strtoul(str, &str, 0); | 5325 | hashdist = simple_strtoul(str, &str, 0); |
5322 | return 1; | 5326 | return 1; |
5323 | } | 5327 | } |
5324 | __setup("hashdist=", set_hashdist); | 5328 | __setup("hashdist=", set_hashdist); |
5325 | #endif | 5329 | #endif |
5326 | 5330 | ||
5327 | /* | 5331 | /* |
5328 | * allocate a large system hash table from bootmem | 5332 | * allocate a large system hash table from bootmem |
5329 | * - it is assumed that the hash table must contain an exact power-of-2 | 5333 | * - it is assumed that the hash table must contain an exact power-of-2 |
5330 | * quantity of entries | 5334 | * quantity of entries |
5331 | * - limit is the number of hash buckets, not the total allocation size | 5335 | * - limit is the number of hash buckets, not the total allocation size |
5332 | */ | 5336 | */ |
5333 | void *__init alloc_large_system_hash(const char *tablename, | 5337 | void *__init alloc_large_system_hash(const char *tablename, |
5334 | unsigned long bucketsize, | 5338 | unsigned long bucketsize, |
5335 | unsigned long numentries, | 5339 | unsigned long numentries, |
5336 | int scale, | 5340 | int scale, |
5337 | int flags, | 5341 | int flags, |
5338 | unsigned int *_hash_shift, | 5342 | unsigned int *_hash_shift, |
5339 | unsigned int *_hash_mask, | 5343 | unsigned int *_hash_mask, |
5340 | unsigned long low_limit, | 5344 | unsigned long low_limit, |
5341 | unsigned long high_limit) | 5345 | unsigned long high_limit) |
5342 | { | 5346 | { |
5343 | unsigned long long max = high_limit; | 5347 | unsigned long long max = high_limit; |
5344 | unsigned long log2qty, size; | 5348 | unsigned long log2qty, size; |
5345 | void *table = NULL; | 5349 | void *table = NULL; |
5346 | 5350 | ||
5347 | /* allow the kernel cmdline to have a say */ | 5351 | /* allow the kernel cmdline to have a say */ |
5348 | if (!numentries) { | 5352 | if (!numentries) { |
5349 | /* round applicable memory size up to nearest megabyte */ | 5353 | /* round applicable memory size up to nearest megabyte */ |
5350 | numentries = nr_kernel_pages; | 5354 | numentries = nr_kernel_pages; |
5351 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 5355 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; |
5352 | numentries >>= 20 - PAGE_SHIFT; | 5356 | numentries >>= 20 - PAGE_SHIFT; |
5353 | numentries <<= 20 - PAGE_SHIFT; | 5357 | numentries <<= 20 - PAGE_SHIFT; |
5354 | 5358 | ||
5355 | /* limit to 1 bucket per 2^scale bytes of low memory */ | 5359 | /* limit to 1 bucket per 2^scale bytes of low memory */ |
5356 | if (scale > PAGE_SHIFT) | 5360 | if (scale > PAGE_SHIFT) |
5357 | numentries >>= (scale - PAGE_SHIFT); | 5361 | numentries >>= (scale - PAGE_SHIFT); |
5358 | else | 5362 | else |
5359 | numentries <<= (PAGE_SHIFT - scale); | 5363 | numentries <<= (PAGE_SHIFT - scale); |
5360 | 5364 | ||
5361 | /* Make sure we've got at least a 0-order allocation.. */ | 5365 | /* Make sure we've got at least a 0-order allocation.. */ |
5362 | if (unlikely(flags & HASH_SMALL)) { | 5366 | if (unlikely(flags & HASH_SMALL)) { |
5363 | /* Makes no sense without HASH_EARLY */ | 5367 | /* Makes no sense without HASH_EARLY */ |
5364 | WARN_ON(!(flags & HASH_EARLY)); | 5368 | WARN_ON(!(flags & HASH_EARLY)); |
5365 | if (!(numentries >> *_hash_shift)) { | 5369 | if (!(numentries >> *_hash_shift)) { |
5366 | numentries = 1UL << *_hash_shift; | 5370 | numentries = 1UL << *_hash_shift; |
5367 | BUG_ON(!numentries); | 5371 | BUG_ON(!numentries); |
5368 | } | 5372 | } |
5369 | } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) | 5373 | } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) |
5370 | numentries = PAGE_SIZE / bucketsize; | 5374 | numentries = PAGE_SIZE / bucketsize; |
5371 | } | 5375 | } |
5372 | numentries = roundup_pow_of_two(numentries); | 5376 | numentries = roundup_pow_of_two(numentries); |
5373 | 5377 | ||
5374 | /* limit allocation size to 1/16 total memory by default */ | 5378 | /* limit allocation size to 1/16 total memory by default */ |
5375 | if (max == 0) { | 5379 | if (max == 0) { |
5376 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; | 5380 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; |
5377 | do_div(max, bucketsize); | 5381 | do_div(max, bucketsize); |
5378 | } | 5382 | } |
5379 | max = min(max, 0x80000000ULL); | 5383 | max = min(max, 0x80000000ULL); |
5380 | 5384 | ||
5381 | if (numentries < low_limit) | 5385 | if (numentries < low_limit) |
5382 | numentries = low_limit; | 5386 | numentries = low_limit; |
5383 | if (numentries > max) | 5387 | if (numentries > max) |
5384 | numentries = max; | 5388 | numentries = max; |
5385 | 5389 | ||
5386 | log2qty = ilog2(numentries); | 5390 | log2qty = ilog2(numentries); |
5387 | 5391 | ||
5388 | do { | 5392 | do { |
5389 | size = bucketsize << log2qty; | 5393 | size = bucketsize << log2qty; |
5390 | if (flags & HASH_EARLY) | 5394 | if (flags & HASH_EARLY) |
5391 | table = alloc_bootmem_nopanic(size); | 5395 | table = alloc_bootmem_nopanic(size); |
5392 | else if (hashdist) | 5396 | else if (hashdist) |
5393 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 5397 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
5394 | else { | 5398 | else { |
5395 | /* | 5399 | /* |
5396 | * If bucketsize is not a power-of-two, we may free | 5400 | * If bucketsize is not a power-of-two, we may free |
5397 | * some pages at the end of the hash table, which | 5401 | * some pages at the end of the hash table, which |
5398 | * alloc_pages_exact() does automatically | 5402 | * alloc_pages_exact() does automatically |
5399 | */ | 5403 | */ |
5400 | if (get_order(size) < MAX_ORDER) { | 5404 | if (get_order(size) < MAX_ORDER) { |
5401 | table = alloc_pages_exact(size, GFP_ATOMIC); | 5405 | table = alloc_pages_exact(size, GFP_ATOMIC); |
5402 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); | 5406 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); |
5403 | } | 5407 | } |
5404 | } | 5408 | } |
5405 | } while (!table && size > PAGE_SIZE && --log2qty); | 5409 | } while (!table && size > PAGE_SIZE && --log2qty); |
5406 | 5410 | ||
5407 | if (!table) | 5411 | if (!table) |
5408 | panic("Failed to allocate %s hash table\n", tablename); | 5412 | panic("Failed to allocate %s hash table\n", tablename); |
5409 | 5413 | ||
5410 | printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", | 5414 | printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", |
5411 | tablename, | 5415 | tablename, |
5412 | (1UL << log2qty), | 5416 | (1UL << log2qty), |
5413 | ilog2(size) - PAGE_SHIFT, | 5417 | ilog2(size) - PAGE_SHIFT, |
5414 | size); | 5418 | size); |
5415 | 5419 | ||
5416 | if (_hash_shift) | 5420 | if (_hash_shift) |
5417 | *_hash_shift = log2qty; | 5421 | *_hash_shift = log2qty; |
5418 | if (_hash_mask) | 5422 | if (_hash_mask) |
5419 | *_hash_mask = (1 << log2qty) - 1; | 5423 | *_hash_mask = (1 << log2qty) - 1; |
5420 | 5424 | ||
5421 | return table; | 5425 | return table; |
5422 | } | 5426 | } |
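The sketch below models the bucket-count calculation above for an imagined 1GB machine, a 16-byte bucket and scale = 14; the helper roundup_pow_of_two_ul() is a local stand-in for the kernel's roundup_pow_of_two(), and all inputs are assumptions of this example.

/*
 * Rough userspace model of the hash sizing: round memory up to whole
 * megabytes, allow one bucket per 2^scale bytes, round the entry count
 * up to a power of two, then size = bucketsize << log2qty.
 */
#include <stdio.h>

static unsigned long roundup_pow_of_two_ul(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned long page_shift = 12;			/* 4KiB pages */
	unsigned long nr_kernel_pages = 262144;		/* ~1GB of lowmem */
	unsigned long bucketsize = 16;
	int scale = 14;					/* one bucket per 16KB */
	unsigned long numentries, size;
	unsigned int log2qty = 0;

	/* round up to whole megabytes of memory */
	numentries = nr_kernel_pages;
	numentries += (1UL << (20 - page_shift)) - 1;
	numentries >>= 20 - page_shift;
	numentries <<= 20 - page_shift;

	/* limit to one bucket per 2^scale bytes of low memory */
	if (scale > (int)page_shift)
		numentries >>= (scale - page_shift);
	else
		numentries <<= (page_shift - scale);

	numentries = roundup_pow_of_two_ul(numentries);
	while ((1UL << (log2qty + 1)) <= numentries)
		log2qty++;
	size = bucketsize << log2qty;

	printf("entries=%lu (2^%u), table size=%lu bytes\n",
	       numentries, log2qty, size);
	return 0;	/* 65536 entries, 1MB table with these inputs */
}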
5423 | 5427 | ||
5424 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | 5428 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ |
5425 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | 5429 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, |
5426 | unsigned long pfn) | 5430 | unsigned long pfn) |
5427 | { | 5431 | { |
5428 | #ifdef CONFIG_SPARSEMEM | 5432 | #ifdef CONFIG_SPARSEMEM |
5429 | return __pfn_to_section(pfn)->pageblock_flags; | 5433 | return __pfn_to_section(pfn)->pageblock_flags; |
5430 | #else | 5434 | #else |
5431 | return zone->pageblock_flags; | 5435 | return zone->pageblock_flags; |
5432 | #endif /* CONFIG_SPARSEMEM */ | 5436 | #endif /* CONFIG_SPARSEMEM */ |
5433 | } | 5437 | } |
5434 | 5438 | ||
5435 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | 5439 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) |
5436 | { | 5440 | { |
5437 | #ifdef CONFIG_SPARSEMEM | 5441 | #ifdef CONFIG_SPARSEMEM |
5438 | pfn &= (PAGES_PER_SECTION-1); | 5442 | pfn &= (PAGES_PER_SECTION-1); |
5439 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 5443 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
5440 | #else | 5444 | #else |
5441 | pfn = pfn - zone->zone_start_pfn; | 5445 | pfn = pfn - zone->zone_start_pfn; |
5442 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 5446 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
5443 | #endif /* CONFIG_SPARSEMEM */ | 5447 | #endif /* CONFIG_SPARSEMEM */ |
5444 | } | 5448 | } |
5445 | 5449 | ||
5446 | /** | 5450 | /** |
5447 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages | 5451 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages |
5448 | * @page: The page within the block of interest | 5452 | * @page: The page within the block of interest |
5449 | * @start_bitidx: The first bit of interest to retrieve | 5453 | * @start_bitidx: The first bit of interest to retrieve |
5450 | * @end_bitidx: The last bit of interest | 5454 | * @end_bitidx: The last bit of interest |
5451 | * returns pageblock_bits flags | 5455 | * returns pageblock_bits flags |
5452 | */ | 5456 | */ |
5453 | unsigned long get_pageblock_flags_group(struct page *page, | 5457 | unsigned long get_pageblock_flags_group(struct page *page, |
5454 | int start_bitidx, int end_bitidx) | 5458 | int start_bitidx, int end_bitidx) |
5455 | { | 5459 | { |
5456 | struct zone *zone; | 5460 | struct zone *zone; |
5457 | unsigned long *bitmap; | 5461 | unsigned long *bitmap; |
5458 | unsigned long pfn, bitidx; | 5462 | unsigned long pfn, bitidx; |
5459 | unsigned long flags = 0; | 5463 | unsigned long flags = 0; |
5460 | unsigned long value = 1; | 5464 | unsigned long value = 1; |
5461 | 5465 | ||
5462 | zone = page_zone(page); | 5466 | zone = page_zone(page); |
5463 | pfn = page_to_pfn(page); | 5467 | pfn = page_to_pfn(page); |
5464 | bitmap = get_pageblock_bitmap(zone, pfn); | 5468 | bitmap = get_pageblock_bitmap(zone, pfn); |
5465 | bitidx = pfn_to_bitidx(zone, pfn); | 5469 | bitidx = pfn_to_bitidx(zone, pfn); |
5466 | 5470 | ||
5467 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 5471 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
5468 | if (test_bit(bitidx + start_bitidx, bitmap)) | 5472 | if (test_bit(bitidx + start_bitidx, bitmap)) |
5469 | flags |= value; | 5473 | flags |= value; |
5470 | 5474 | ||
5471 | return flags; | 5475 | return flags; |
5472 | } | 5476 | } |
5473 | 5477 | ||
5474 | /** | 5478 | /** |
5475 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | 5479 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages |
5476 | * @page: The page within the block of interest | 5480 | * @page: The page within the block of interest |
5477 | * @start_bitidx: The first bit of interest | 5481 | * @start_bitidx: The first bit of interest |
5478 | * @end_bitidx: The last bit of interest | 5482 | * @end_bitidx: The last bit of interest |
5479 | * @flags: The flags to set | 5483 | * @flags: The flags to set |
5480 | */ | 5484 | */ |
5481 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | 5485 | void set_pageblock_flags_group(struct page *page, unsigned long flags, |
5482 | int start_bitidx, int end_bitidx) | 5486 | int start_bitidx, int end_bitidx) |
5483 | { | 5487 | { |
5484 | struct zone *zone; | 5488 | struct zone *zone; |
5485 | unsigned long *bitmap; | 5489 | unsigned long *bitmap; |
5486 | unsigned long pfn, bitidx; | 5490 | unsigned long pfn, bitidx; |
5487 | unsigned long value = 1; | 5491 | unsigned long value = 1; |
5488 | 5492 | ||
5489 | zone = page_zone(page); | 5493 | zone = page_zone(page); |
5490 | pfn = page_to_pfn(page); | 5494 | pfn = page_to_pfn(page); |
5491 | bitmap = get_pageblock_bitmap(zone, pfn); | 5495 | bitmap = get_pageblock_bitmap(zone, pfn); |
5492 | bitidx = pfn_to_bitidx(zone, pfn); | 5496 | bitidx = pfn_to_bitidx(zone, pfn); |
5493 | VM_BUG_ON(pfn < zone->zone_start_pfn); | 5497 | VM_BUG_ON(pfn < zone->zone_start_pfn); |
5494 | VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); | 5498 | VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); |
5495 | 5499 | ||
5496 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 5500 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
5497 | if (flags & value) | 5501 | if (flags & value) |
5498 | __set_bit(bitidx + start_bitidx, bitmap); | 5502 | __set_bit(bitidx + start_bitidx, bitmap); |
5499 | else | 5503 | else |
5500 | __clear_bit(bitidx + start_bitidx, bitmap); | 5504 | __clear_bit(bitidx + start_bitidx, bitmap); |
5501 | } | 5505 | } |
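To make the bit indexing concrete, here is a sketch that maps a pfn to its pageblock bits, assuming a pageblock_order of 9 and 4 bits per pageblock; both values are assumptions of this example, not read from any real configuration.

/*
 * Same arithmetic as pfn_to_bitidx() in the !SPARSEMEM case: each
 * pageblock owns a fixed-size group of bits in the zone's bitmap.
 */
#include <stdio.h>

int main(void)
{
	unsigned long zone_start_pfn = 0x10000;	/* example zone start */
	unsigned long pfn = 0x10a37;		/* example page frame */
	int pageblock_order = 9;		/* 512 pages per block (assumed) */
	int nr_pageblock_bits = 4;		/* bits per block in this sketch */
	unsigned long bitidx;

	bitidx = ((pfn - zone_start_pfn) >> pageblock_order)
		 * nr_pageblock_bits;

	printf("pfn %#lx uses bitmap bits %lu..%lu\n",
	       pfn, bitidx, bitidx + nr_pageblock_bits - 1);
	return 0;	/* bits 20..23 with these inputs */
}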
5502 | 5506 | ||
5503 | /* | 5507 | /* |
5504 | * This function checks whether the pageblock includes unmovable pages or not. | 5508 | * This function checks whether the pageblock includes unmovable pages or not. |
5505 | * If @count is not zero, it is okay for the block to include up to @count unmovable pages. | 5509 | * If @count is not zero, it is okay for the block to include up to @count unmovable pages. |
5506 | * | 5510 | * |
5507 | * A PageLRU check without isolation or lru_lock could race so that a | 5511 | * A PageLRU check without isolation or lru_lock could race so that a |
5508 | * MIGRATE_MOVABLE block might include unmovable pages, so you cannot | 5512 | * MIGRATE_MOVABLE block might include unmovable pages, so you cannot |
5509 | * expect this function to be exact. | 5513 | * expect this function to be exact. |
5510 | */ | 5514 | */ |
5511 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | 5515 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count) |
5512 | { | 5516 | { |
5513 | unsigned long pfn, iter, found; | 5517 | unsigned long pfn, iter, found; |
5514 | int mt; | 5518 | int mt; |
5515 | 5519 | ||
5516 | /* | 5520 | /* |
5517 | * To avoid noisy data, lru_add_drain_all() should be called first. | 5521 | * To avoid noisy data, lru_add_drain_all() should be called first. |
5518 | * If the zone is ZONE_MOVABLE, it never contains unmovable pages. | 5522 | * If the zone is ZONE_MOVABLE, it never contains unmovable pages. |
5519 | */ | 5523 | */ |
5520 | if (zone_idx(zone) == ZONE_MOVABLE) | 5524 | if (zone_idx(zone) == ZONE_MOVABLE) |
5521 | return false; | 5525 | return false; |
5522 | mt = get_pageblock_migratetype(page); | 5526 | mt = get_pageblock_migratetype(page); |
5523 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) | 5527 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) |
5524 | return false; | 5528 | return false; |
5525 | 5529 | ||
5526 | pfn = page_to_pfn(page); | 5530 | pfn = page_to_pfn(page); |
5527 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { | 5531 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { |
5528 | unsigned long check = pfn + iter; | 5532 | unsigned long check = pfn + iter; |
5529 | 5533 | ||
5530 | if (!pfn_valid_within(check)) | 5534 | if (!pfn_valid_within(check)) |
5531 | continue; | 5535 | continue; |
5532 | 5536 | ||
5533 | page = pfn_to_page(check); | 5537 | page = pfn_to_page(check); |
5534 | /* | 5538 | /* |
5535 | * We can't use page_count without pinning the page | 5539 | * We can't use page_count without pinning the page |
5536 | * because another CPU can free the compound page. | 5540 | * because another CPU can free the compound page. |
5537 | * This check already skips compound tails of THP | 5541 | * This check already skips compound tails of THP |
5538 | * because their page->_count is zero at all times. | 5542 | * because their page->_count is zero at all times. |
5539 | */ | 5543 | */ |
5540 | if (!atomic_read(&page->_count)) { | 5544 | if (!atomic_read(&page->_count)) { |
5541 | if (PageBuddy(page)) | 5545 | if (PageBuddy(page)) |
5542 | iter += (1 << page_order(page)) - 1; | 5546 | iter += (1 << page_order(page)) - 1; |
5543 | continue; | 5547 | continue; |
5544 | } | 5548 | } |
5545 | 5549 | ||
5546 | if (!PageLRU(page)) | 5550 | if (!PageLRU(page)) |
5547 | found++; | 5551 | found++; |
5548 | /* | 5552 | /* |
5549 | * If there are RECLAIMABLE pages, we need to check them too. | 5553 | * If there are RECLAIMABLE pages, we need to check them too. |
5550 | * But for now, memory offline itself doesn't call shrink_slab() | 5554 | * But for now, memory offline itself doesn't call shrink_slab() |
5551 | * and this still needs to be fixed. | 5555 | * and this still needs to be fixed. |
5552 | */ | 5556 | */ |
5553 | /* | 5557 | /* |
5554 | * If the page is not RAM, page_count() should be 0. | 5558 | * If the page is not RAM, page_count() should be 0. |
5555 | * We don't need any further checks; this is a _used_, non-movable page. | 5559 | * We don't need any further checks; this is a _used_, non-movable page. |
5556 | * | 5560 | * |
5557 | * The problematic thing here is PG_reserved pages. PG_reserved | 5561 | * The problematic thing here is PG_reserved pages. PG_reserved |
5558 | * is set to both of a memory hole page and a _used_ kernel | 5562 | * is set to both of a memory hole page and a _used_ kernel |
5559 | * page at boot. | 5563 | * page at boot. |
5560 | */ | 5564 | */ |
5561 | if (found > count) | 5565 | if (found > count) |
5562 | return true; | 5566 | return true; |
5563 | } | 5567 | } |
5564 | return false; | 5568 | return false; |
5565 | } | 5569 | } |
5566 | 5570 | ||
5567 | bool is_pageblock_removable_nolock(struct page *page) | 5571 | bool is_pageblock_removable_nolock(struct page *page) |
5568 | { | 5572 | { |
5569 | struct zone *zone; | 5573 | struct zone *zone; |
5570 | unsigned long pfn; | 5574 | unsigned long pfn; |
5571 | 5575 | ||
5572 | /* | 5576 | /* |
5573 | * We have to be careful here because we are iterating over memory | 5577 | * We have to be careful here because we are iterating over memory |
5574 | * sections which are not zone aware so we might end up outside of | 5578 | * sections which are not zone aware so we might end up outside of |
5575 | * the zone but still within the section. | 5579 | * the zone but still within the section. |
5576 | * We have to take care about the node as well. If the node is offline | 5580 | * We have to take care about the node as well. If the node is offline |
5577 | * its NODE_DATA will be NULL - see page_zone. | 5581 | * its NODE_DATA will be NULL - see page_zone. |
5578 | */ | 5582 | */ |
5579 | if (!node_online(page_to_nid(page))) | 5583 | if (!node_online(page_to_nid(page))) |
5580 | return false; | 5584 | return false; |
5581 | 5585 | ||
5582 | zone = page_zone(page); | 5586 | zone = page_zone(page); |
5583 | pfn = page_to_pfn(page); | 5587 | pfn = page_to_pfn(page); |
5584 | if (zone->zone_start_pfn > pfn || | 5588 | if (zone->zone_start_pfn > pfn || |
5585 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | 5589 | zone->zone_start_pfn + zone->spanned_pages <= pfn) |
5586 | return false; | 5590 | return false; |
5587 | 5591 | ||
5588 | return !has_unmovable_pages(zone, page, 0); | 5592 | return !has_unmovable_pages(zone, page, 0); |
5589 | } | 5593 | } |
5590 | 5594 | ||
5591 | #ifdef CONFIG_CMA | 5595 | #ifdef CONFIG_CMA |
5592 | 5596 | ||
5593 | static unsigned long pfn_max_align_down(unsigned long pfn) | 5597 | static unsigned long pfn_max_align_down(unsigned long pfn) |
5594 | { | 5598 | { |
5595 | return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, | 5599 | return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, |
5596 | pageblock_nr_pages) - 1); | 5600 | pageblock_nr_pages) - 1); |
5597 | } | 5601 | } |
5598 | 5602 | ||
5599 | static unsigned long pfn_max_align_up(unsigned long pfn) | 5603 | static unsigned long pfn_max_align_up(unsigned long pfn) |
5600 | { | 5604 | { |
5601 | return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, | 5605 | return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, |
5602 | pageblock_nr_pages)); | 5606 | pageblock_nr_pages)); |
5603 | } | 5607 | } |
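A small example of what the alignment helpers compute, assuming the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages is 1024 pages on some imagined configuration:

/*
 * pfn_max_align_down() masks the pfn down to the alignment boundary;
 * pfn_max_align_up() rounds it up to the next boundary.
 */
#include <stdio.h>

int main(void)
{
	unsigned long align = 1024;		/* assumed alignment in pages */
	unsigned long pfn = 5000;		/* example pfn */

	unsigned long down = pfn & ~(align - 1);
	unsigned long up = (pfn + align - 1) & ~(align - 1);

	printf("pfn %lu aligns down to %lu and up to %lu\n", pfn, down, up);
	return 0;	/* 4096 and 5120 */
}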
5604 | 5608 | ||
5605 | static struct page * | 5609 | static struct page * |
5606 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | 5610 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, |
5607 | int **resultp) | 5611 | int **resultp) |
5608 | { | 5612 | { |
5609 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | 5613 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; |
5610 | 5614 | ||
5611 | if (PageHighMem(page)) | 5615 | if (PageHighMem(page)) |
5612 | gfp_mask |= __GFP_HIGHMEM; | 5616 | gfp_mask |= __GFP_HIGHMEM; |
5613 | 5617 | ||
5614 | return alloc_page(gfp_mask); | 5618 | return alloc_page(gfp_mask); |
5615 | } | 5619 | } |
5616 | 5620 | ||
5617 | /* [start, end) must belong to a single zone. */ | 5621 | /* [start, end) must belong to a single zone. */ |
5618 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | 5622 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) |
5619 | { | 5623 | { |
5620 | /* This function is based on compact_zone() from compaction.c. */ | 5624 | /* This function is based on compact_zone() from compaction.c. */ |
5621 | 5625 | ||
5622 | unsigned long pfn = start; | 5626 | unsigned long pfn = start; |
5623 | unsigned int tries = 0; | 5627 | unsigned int tries = 0; |
5624 | int ret = 0; | 5628 | int ret = 0; |
5625 | 5629 | ||
5626 | struct compact_control cc = { | 5630 | struct compact_control cc = { |
5627 | .nr_migratepages = 0, | 5631 | .nr_migratepages = 0, |
5628 | .order = -1, | 5632 | .order = -1, |
5629 | .zone = page_zone(pfn_to_page(start)), | 5633 | .zone = page_zone(pfn_to_page(start)), |
5630 | .sync = true, | 5634 | .sync = true, |
5631 | }; | 5635 | }; |
5632 | INIT_LIST_HEAD(&cc.migratepages); | 5636 | INIT_LIST_HEAD(&cc.migratepages); |
5633 | 5637 | ||
5634 | migrate_prep_local(); | 5638 | migrate_prep_local(); |
5635 | 5639 | ||
5636 | while (pfn < end || !list_empty(&cc.migratepages)) { | 5640 | while (pfn < end || !list_empty(&cc.migratepages)) { |
5637 | if (fatal_signal_pending(current)) { | 5641 | if (fatal_signal_pending(current)) { |
5638 | ret = -EINTR; | 5642 | ret = -EINTR; |
5639 | break; | 5643 | break; |
5640 | } | 5644 | } |
5641 | 5645 | ||
5642 | if (list_empty(&cc.migratepages)) { | 5646 | if (list_empty(&cc.migratepages)) { |
5643 | cc.nr_migratepages = 0; | 5647 | cc.nr_migratepages = 0; |
5644 | pfn = isolate_migratepages_range(cc.zone, &cc, | 5648 | pfn = isolate_migratepages_range(cc.zone, &cc, |
5645 | pfn, end); | 5649 | pfn, end); |
5646 | if (!pfn) { | 5650 | if (!pfn) { |
5647 | ret = -EINTR; | 5651 | ret = -EINTR; |
5648 | break; | 5652 | break; |
5649 | } | 5653 | } |
5650 | tries = 0; | 5654 | tries = 0; |
5651 | } else if (++tries == 5) { | 5655 | } else if (++tries == 5) { |
5652 | ret = ret < 0 ? ret : -EBUSY; | 5656 | ret = ret < 0 ? ret : -EBUSY; |
5653 | break; | 5657 | break; |
5654 | } | 5658 | } |
5655 | 5659 | ||
5656 | ret = migrate_pages(&cc.migratepages, | 5660 | ret = migrate_pages(&cc.migratepages, |
5657 | __alloc_contig_migrate_alloc, | 5661 | __alloc_contig_migrate_alloc, |
5658 | 0, false, MIGRATE_SYNC); | 5662 | 0, false, MIGRATE_SYNC); |
5659 | } | 5663 | } |
5660 | 5664 | ||
5661 | putback_lru_pages(&cc.migratepages); | 5665 | putback_lru_pages(&cc.migratepages); |
5662 | return ret > 0 ? 0 : ret; | 5666 | return ret > 0 ? 0 : ret; |
5663 | } | 5667 | } |
5664 | 5668 | ||
5665 | /* | 5669 | /* |
5666 | * Update zone's cma pages counter used for watermark level calculation. | 5670 | * Update zone's cma pages counter used for watermark level calculation. |
5667 | */ | 5671 | */ |
5668 | static inline void __update_cma_watermarks(struct zone *zone, int count) | 5672 | static inline void __update_cma_watermarks(struct zone *zone, int count) |
5669 | { | 5673 | { |
5670 | unsigned long flags; | 5674 | unsigned long flags; |
5671 | spin_lock_irqsave(&zone->lock, flags); | 5675 | spin_lock_irqsave(&zone->lock, flags); |
5672 | zone->min_cma_pages += count; | 5676 | zone->min_cma_pages += count; |
5673 | spin_unlock_irqrestore(&zone->lock, flags); | 5677 | spin_unlock_irqrestore(&zone->lock, flags); |
5674 | setup_per_zone_wmarks(); | 5678 | setup_per_zone_wmarks(); |
5675 | } | 5679 | } |
5676 | 5680 | ||
5677 | /* | 5681 | /* |
5678 | * Trigger memory pressure bump to reclaim some pages in order to be able to | 5682 | * Trigger memory pressure bump to reclaim some pages in order to be able to |
5679 | * allocate 'count' pages in single page units. Does similar work to the | 5683 | * allocate 'count' pages in single page units. Does similar work to the |
5680 | * __alloc_pages_slowpath() function. | 5684 | * __alloc_pages_slowpath() function. |
5681 | */ | 5685 | */ |
5682 | static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | 5686 | static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) |
5683 | { | 5687 | { |
5684 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 5688 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
5685 | struct zonelist *zonelist = node_zonelist(0, gfp_mask); | 5689 | struct zonelist *zonelist = node_zonelist(0, gfp_mask); |
5686 | int did_some_progress = 0; | 5690 | int did_some_progress = 0; |
5687 | int order = 1; | 5691 | int order = 1; |
5688 | 5692 | ||
5689 | /* | 5693 | /* |
5690 | * Increase the watermark levels to force kswapd to do its job and | 5694 | * Increase the watermark levels to force kswapd to do its job and |
5691 | * stabilise at the new watermark level. | 5695 | * stabilise at the new watermark level. |
5692 | */ | 5696 | */ |
5693 | __update_cma_watermarks(zone, count); | 5697 | __update_cma_watermarks(zone, count); |
5694 | 5698 | ||
5695 | /* Obey watermarks as if the page was being allocated */ | 5699 | /* Obey watermarks as if the page was being allocated */ |
5696 | while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { | 5700 | while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { |
5697 | wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); | 5701 | wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); |
5698 | 5702 | ||
5699 | did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | 5703 | did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, |
5700 | NULL); | 5704 | NULL); |
5701 | if (!did_some_progress) { | 5705 | if (!did_some_progress) { |
5702 | /* Exhausted what can be done so it's blamo time */ | 5706 | /* Exhausted what can be done so it's blamo time */ |
5703 | out_of_memory(zonelist, gfp_mask, order, NULL, false); | 5707 | out_of_memory(zonelist, gfp_mask, order, NULL, false); |
5704 | } | 5708 | } |
5705 | } | 5709 | } |
5706 | 5710 | ||
5707 | /* Restore original watermark levels. */ | 5711 | /* Restore original watermark levels. */ |
5708 | __update_cma_watermarks(zone, -count); | 5712 | __update_cma_watermarks(zone, -count); |
5709 | 5713 | ||
5710 | return count; | 5714 | return count; |
5711 | } | 5715 | } |
5712 | 5716 | ||
5713 | /** | 5717 | /** |
5714 | * alloc_contig_range() -- tries to allocate given range of pages | 5718 | * alloc_contig_range() -- tries to allocate given range of pages |
5715 | * @start: start PFN to allocate | 5719 | * @start: start PFN to allocate |
5716 | * @end: one-past-the-last PFN to allocate | 5720 | * @end: one-past-the-last PFN to allocate |
5717 | * @migratetype: migratetype of the underlying pageblocks (either | 5721 | * @migratetype: migratetype of the underlying pageblocks (either |
5718 | * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks | 5722 | * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks |
5719 | * in range must have the same migratetype and it must | 5723 | * in range must have the same migratetype and it must |
5720 | * be either of the two. | 5724 | * be either of the two. |
5721 | * | 5725 | * |
5722 | * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES | 5726 | * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES |
5723 | * aligned, however it's the caller's responsibility to guarantee that | 5727 | * aligned, however it's the caller's responsibility to guarantee that |
5724 | * we are the only thread that changes migrate type of pageblocks the | 5728 | * we are the only thread that changes migrate type of pageblocks the |
5725 | * pages fall in. | 5729 | * pages fall in. |
5726 | * | 5730 | * |
5727 | * The PFN range must belong to a single zone. | 5731 | * The PFN range must belong to a single zone. |
5728 | * | 5732 | * |
5729 | * Returns zero on success or negative error code. On success all | 5733 | * Returns zero on success or negative error code. On success all |
5730 | * pages which PFN is in [start, end) are allocated for the caller and | 5734 | * pages which PFN is in [start, end) are allocated for the caller and |
5731 | * need to be freed with free_contig_range(). | 5735 | * need to be freed with free_contig_range(). |
5732 | */ | 5736 | */ |
5733 | int alloc_contig_range(unsigned long start, unsigned long end, | 5737 | int alloc_contig_range(unsigned long start, unsigned long end, |
5734 | unsigned migratetype) | 5738 | unsigned migratetype) |
5735 | { | 5739 | { |
5736 | struct zone *zone = page_zone(pfn_to_page(start)); | 5740 | struct zone *zone = page_zone(pfn_to_page(start)); |
5737 | unsigned long outer_start, outer_end; | 5741 | unsigned long outer_start, outer_end; |
5738 | int ret = 0, order; | 5742 | int ret = 0, order; |
5739 | 5743 | ||
5740 | /* | 5744 | /* |
5741 | * What we do here is we mark all pageblocks in range as | 5745 | * What we do here is we mark all pageblocks in range as |
5742 | * MIGRATE_ISOLATE. Because pageblock and max order pages may | 5746 | * MIGRATE_ISOLATE. Because pageblock and max order pages may |
5743 | * have different sizes, and due to the way the page allocator | 5747 | * have different sizes, and due to the way the page allocator |
5744 | * works, we align the range to the bigger of the two sizes so | 5748 | * works, we align the range to the bigger of the two sizes so |
5745 | * that the page allocator won't try to merge buddies from | 5749 | * that the page allocator won't try to merge buddies from |
5746 | * different pageblocks and change MIGRATE_ISOLATE to some | 5750 | * different pageblocks and change MIGRATE_ISOLATE to some |
5747 | * other migration type. | 5751 | * other migration type. |
5748 | * | 5752 | * |
5749 | * Once the pageblocks are marked as MIGRATE_ISOLATE, we | 5753 | * Once the pageblocks are marked as MIGRATE_ISOLATE, we |
5750 | * migrate the pages from an unaligned range (ie. pages that | 5754 | * migrate the pages from an unaligned range (ie. pages that |
5751 | * we are interested in). This will put all the pages in | 5755 | * we are interested in). This will put all the pages in |
5752 | * range back to the page allocator as MIGRATE_ISOLATE. | 5756 | * range back to the page allocator as MIGRATE_ISOLATE. |
5753 | * | 5757 | * |
5754 | * When this is done, we take the pages in range from the page | 5758 | * When this is done, we take the pages in range from the page |
5755 | * allocator, removing them from the buddy system. This way the | 5759 | * allocator, removing them from the buddy system. This way the |
5756 | * page allocator will never consider using them. | 5760 | * page allocator will never consider using them. |
5757 | * | 5761 | * |
5758 | * This lets us mark the pageblocks back as | 5762 | * This lets us mark the pageblocks back as |
5759 | * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the | 5763 | * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the |
5760 | * aligned range but not in the unaligned, original range are | 5764 | * aligned range but not in the unaligned, original range are |
5761 | * put back to page allocator so that buddy can use them. | 5765 | * put back to page allocator so that buddy can use them. |
5762 | * can use them. */ | 5766 | * can use them. */ |
5763 | 5767 | ||
5764 | ret = start_isolate_page_range(pfn_max_align_down(start), | 5768 | ret = start_isolate_page_range(pfn_max_align_down(start), |
5765 | pfn_max_align_up(end), migratetype); | 5769 | pfn_max_align_up(end), migratetype); |
5766 | if (ret) | 5770 | if (ret) |
5767 | goto done; | 5771 | goto done; |
5768 | 5772 | ||
5769 | ret = __alloc_contig_migrate_range(start, end); | 5773 | ret = __alloc_contig_migrate_range(start, end); |
5770 | if (ret) | 5774 | if (ret) |
5771 | goto done; | 5775 | goto done; |
5772 | 5776 | ||
5773 | /* | 5777 | /* |
5774 | * Pages from [start, end) are within MAX_ORDER_NR_PAGES | 5778 | * Pages from [start, end) are within MAX_ORDER_NR_PAGES |
5775 | * aligned blocks that are marked as MIGRATE_ISOLATE. What's | 5779 | * aligned blocks that are marked as MIGRATE_ISOLATE. What's |
5776 | * more, all pages in [start, end) are free in the page allocator. | 5780 | * more, all pages in [start, end) are free in the page allocator. |
5777 | * What we are going to do is allocate all pages from | 5781 | * What we are going to do is allocate all pages from |
5778 | * [start, end) (that is, remove them from the page allocator). | 5782 | * [start, end) (that is, remove them from the page allocator). |
5779 | * | 5783 | * |
5780 | * The only problem is that pages at the beginning and at the | 5784 | * The only problem is that pages at the beginning and at the |
5781 | * end of the range of interest may not be aligned with pages | 5785 | * end of the range of interest may not be aligned with pages |
5782 | * the page allocator holds, i.e. they can be part of higher-order | 5786 | * the page allocator holds, i.e. they can be part of higher-order |
5783 | * pages. Because of this, we reserve the bigger range and, | 5787 | * pages. Because of this, we reserve the bigger range and, |
5784 | * once this is done, free the pages we are not interested in. | 5788 | * once this is done, free the pages we are not interested in. |
5785 | * | 5789 | * |
5786 | * We don't have to hold zone->lock here because the pages are | 5790 | * We don't have to hold zone->lock here because the pages are |
5787 | * isolated and thus won't be removed from the buddy system. | 5791 | * isolated and thus won't be removed from the buddy system. |
5788 | */ | 5792 | */ |
5789 | 5793 | ||
5790 | lru_add_drain_all(); | 5794 | lru_add_drain_all(); |
5791 | drain_all_pages(); | 5795 | drain_all_pages(); |
5792 | 5796 | ||
5793 | order = 0; | 5797 | order = 0; |
5794 | outer_start = start; | 5798 | outer_start = start; |
5795 | while (!PageBuddy(pfn_to_page(outer_start))) { | 5799 | while (!PageBuddy(pfn_to_page(outer_start))) { |
5796 | if (++order >= MAX_ORDER) { | 5800 | if (++order >= MAX_ORDER) { |
5797 | ret = -EBUSY; | 5801 | ret = -EBUSY; |
5798 | goto done; | 5802 | goto done; |
5799 | } | 5803 | } |
5800 | outer_start &= ~0UL << order; | 5804 | outer_start &= ~0UL << order; |
5801 | } | 5805 | } |
5802 | 5806 | ||
5803 | /* Make sure the range is really isolated. */ | 5807 | /* Make sure the range is really isolated. */ |
5804 | if (test_pages_isolated(outer_start, end)) { | 5808 | if (test_pages_isolated(outer_start, end)) { |
5805 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", | 5809 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", |
5806 | outer_start, end); | 5810 | outer_start, end); |
5807 | ret = -EBUSY; | 5811 | ret = -EBUSY; |
5808 | goto done; | 5812 | goto done; |
5809 | } | 5813 | } |
5810 | 5814 | ||
5811 | /* | 5815 | /* |
5812 | * Reclaim enough pages to make sure that contiguous allocation | 5816 | * Reclaim enough pages to make sure that contiguous allocation |
5813 | * will not starve the system. | 5817 | * will not starve the system. |
5814 | */ | 5818 | */ |
5815 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | 5819 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); |
5816 | 5820 | ||
5817 | /* Grab isolated pages from freelists. */ | 5821 | /* Grab isolated pages from freelists. */ |
5818 | outer_end = isolate_freepages_range(outer_start, end); | 5822 | outer_end = isolate_freepages_range(outer_start, end); |
5819 | if (!outer_end) { | 5823 | if (!outer_end) { |
5820 | ret = -EBUSY; | 5824 | ret = -EBUSY; |
5821 | goto done; | 5825 | goto done; |
5822 | } | 5826 | } |
5823 | 5827 | ||
5824 | /* Free head and tail (if any) */ | 5828 | /* Free head and tail (if any) */ |
5825 | if (start != outer_start) | 5829 | if (start != outer_start) |
5826 | free_contig_range(outer_start, start - outer_start); | 5830 | free_contig_range(outer_start, start - outer_start); |
5827 | if (end != outer_end) | 5831 | if (end != outer_end) |
5828 | free_contig_range(end, outer_end - end); | 5832 | free_contig_range(end, outer_end - end); |
5829 | 5833 | ||
5830 | done: | 5834 | done: |
5831 | undo_isolate_page_range(pfn_max_align_down(start), | 5835 | undo_isolate_page_range(pfn_max_align_down(start), |
5832 | pfn_max_align_up(end), migratetype); | 5836 | pfn_max_align_up(end), migratetype); |
5833 | return ret; | 5837 | return ret; |
5834 | } | 5838 | } |
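
The function above is largely PFN arithmetic around the buddy allocator: the requested [start, end) window is widened with pfn_max_align_down()/pfn_max_align_up() before isolation, and the outer_start loop strips ever more low-order bits until it lands on the head of the buddy block containing start. A minimal userspace sketch of that arithmetic follows; the alignment constant, the helper names and the MAX_ORDER value of 11 are assumptions for illustration, not the kernel's definitions.

#include <stdio.h>

/* Stand-ins for pfn_max_align_down()/pfn_max_align_up(); the real helpers
 * align to max(MAX_ORDER_NR_PAGES, pageblock_nr_pages). The value 1024 is
 * an assumption for illustration only. */
#define MAX_ALIGN_NR_PAGES 1024UL

static unsigned long pfn_align_down(unsigned long pfn)
{
	return pfn & ~(MAX_ALIGN_NR_PAGES - 1);
}

static unsigned long pfn_align_up(unsigned long pfn)
{
	return (pfn + MAX_ALIGN_NR_PAGES - 1) & ~(MAX_ALIGN_NR_PAGES - 1);
}

int main(void)
{
	unsigned long start = 0x12345, end = 0x12400;
	unsigned long outer_start;
	int order;

	printf("isolate [%lx, %lx)\n", pfn_align_down(start), pfn_align_up(end));

	/* Mimic the outer_start walk: clearing the low 'order' bits gives the
	 * candidate head PFN of an order-sized block containing 'start'. */
	for (order = 0; order < 11; order++) {	/* MAX_ORDER is typically 11 */
		outer_start = start & (~0UL << order);
		printf("order %2d -> candidate head pfn %lx\n", order, outer_start);
	}
	return 0;
}

Each step widens the candidate block, just as the real loop keeps raising the order until PageBuddy() confirms a free head or MAX_ORDER is exceeded.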
5835 | 5839 | ||
5836 | void free_contig_range(unsigned long pfn, unsigned nr_pages) | 5840 | void free_contig_range(unsigned long pfn, unsigned nr_pages) |
5837 | { | 5841 | { |
5838 | for (; nr_pages--; ++pfn) | 5842 | for (; nr_pages--; ++pfn) |
5839 | __free_page(pfn_to_page(pfn)); | 5843 | __free_page(pfn_to_page(pfn)); |
5840 | } | 5844 | } |
5841 | #endif | 5845 | #endif |
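
A caller pairs alloc_contig_range() with free_contig_range() over the same PFN window; the in-tree user at this point is the CMA allocator. The fragment below is a hedged, kernel-style sketch of such a caller, not code from this commit and not buildable on its own; the function names and the assumption that the range was reserved as MIGRATE_CMA pageblocks are purely illustrative.

/* Hypothetical sketch of a caller; not from this patch. Assumes the range
 * [start_pfn, start_pfn + nr_pages) lies in one zone and was reserved as
 * MIGRATE_CMA pageblocks at boot, as the CMA allocator arranges. */
static struct page *grab_contig_pages(unsigned long start_pfn,
				      unsigned long nr_pages)
{
	int ret;

	ret = alloc_contig_range(start_pfn, start_pfn + nr_pages, MIGRATE_CMA);
	if (ret)
		return NULL;

	return pfn_to_page(start_pfn);
}

static void release_contig_pages(unsigned long start_pfn,
				 unsigned long nr_pages)
{
	free_contig_range(start_pfn, nr_pages);
}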
5842 | 5846 | ||
5843 | #ifdef CONFIG_MEMORY_HOTPLUG | 5847 | #ifdef CONFIG_MEMORY_HOTPLUG |
5844 | static int __meminit __zone_pcp_update(void *data) | 5848 | static int __meminit __zone_pcp_update(void *data) |
5845 | { | 5849 | { |
5846 | struct zone *zone = data; | 5850 | struct zone *zone = data; |
5847 | int cpu; | 5851 | int cpu; |
5848 | unsigned long batch = zone_batchsize(zone), flags; | 5852 | unsigned long batch = zone_batchsize(zone), flags; |
5849 | 5853 | ||
5850 | for_each_possible_cpu(cpu) { | 5854 | for_each_possible_cpu(cpu) { |
5851 | struct per_cpu_pageset *pset; | 5855 | struct per_cpu_pageset *pset; |
5852 | struct per_cpu_pages *pcp; | 5856 | struct per_cpu_pages *pcp; |
5853 | 5857 | ||
5854 | pset = per_cpu_ptr(zone->pageset, cpu); | 5858 | pset = per_cpu_ptr(zone->pageset, cpu); |
5855 | pcp = &pset->pcp; | 5859 | pcp = &pset->pcp; |
5856 | 5860 | ||
5857 | local_irq_save(flags); | 5861 | local_irq_save(flags); |
5858 | if (pcp->count > 0) | 5862 | if (pcp->count > 0) |
5859 | free_pcppages_bulk(zone, pcp->count, pcp); | 5863 | free_pcppages_bulk(zone, pcp->count, pcp); |
5860 | setup_pageset(pset, batch); | 5864 | setup_pageset(pset, batch); |
5861 | local_irq_restore(flags); | 5865 | local_irq_restore(flags); |
5862 | } | 5866 | } |
5863 | return 0; | 5867 | return 0; |
5864 | } | 5868 | } |
5865 | 5869 | ||
5866 | void __meminit zone_pcp_update(struct zone *zone) | 5870 | void __meminit zone_pcp_update(struct zone *zone) |
5867 | { | 5871 | { |
5868 | stop_machine(__zone_pcp_update, zone, NULL); | 5872 | stop_machine(__zone_pcp_update, zone, NULL); |
5869 | } | 5873 | } |
5870 | #endif | 5874 | #endif |
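
__zone_pcp_update() runs under stop_machine() and, for each possible CPU, drains the per-CPU page list back to the zone and installs a freshly computed batch size, all with local interrupts disabled. The toy userspace model below mirrors only that drain-then-resize pattern; the cache sizes, batch values and structure names are invented for the example.

#include <stdio.h>

#define NR_CPUS 4

/* Toy per-CPU cache: 'count' pages held locally, 'batch' pages moved per
 * refill or drain, loosely modelling struct per_cpu_pages. */
struct pcp_cache {
	int count;
	int batch;
};

static long shared_free_pages = 1000;
static struct pcp_cache pcp[NR_CPUS] = {
	{ .count = 31, .batch = 31 },
	{ .count =  7, .batch = 31 },
	{ .count =  0, .batch = 31 },
	{ .count = 15, .batch = 31 },
};

static void toy_zone_pcp_update(int new_batch)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		/* drain: give cached pages back to the shared pool */
		shared_free_pages += pcp[cpu].count;
		pcp[cpu].count = 0;
		/* re-setup with the recomputed batch size */
		pcp[cpu].batch = new_batch;
	}
}

int main(void)
{
	toy_zone_pcp_update(63);
	printf("shared free pages after drain: %ld\n", shared_free_pages);
	return 0;
}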
5871 | 5875 | ||
5872 | #ifdef CONFIG_MEMORY_HOTREMOVE | 5876 | #ifdef CONFIG_MEMORY_HOTREMOVE |
5873 | void zone_pcp_reset(struct zone *zone) | 5877 | void zone_pcp_reset(struct zone *zone) |
5874 | { | 5878 | { |
5875 | unsigned long flags; | 5879 | unsigned long flags; |
5876 | 5880 | ||
5877 | /* avoid races with drain_pages() */ | 5881 | /* avoid races with drain_pages() */ |
5878 | local_irq_save(flags); | 5882 | local_irq_save(flags); |
5879 | if (zone->pageset != &boot_pageset) { | 5883 | if (zone->pageset != &boot_pageset) { |
5880 | free_percpu(zone->pageset); | 5884 | free_percpu(zone->pageset); |
5881 | zone->pageset = &boot_pageset; | 5885 | zone->pageset = &boot_pageset; |
5882 | } | 5886 | } |
5883 | local_irq_restore(flags); | 5887 | local_irq_restore(flags); |
5884 | } | 5888 | } |
5885 | 5889 | ||
5886 | /* | 5890 | /* |
5887 | * All pages in the range must be isolated before calling this. | 5891 | * All pages in the range must be isolated before calling this. |
5888 | */ | 5892 | */ |
5889 | void | 5893 | void |
5890 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | 5894 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) |
5891 | { | 5895 | { |
5892 | struct page *page; | 5896 | struct page *page; |
5893 | struct zone *zone; | 5897 | struct zone *zone; |
5894 | int order, i; | 5898 | int order, i; |
5895 | unsigned long pfn; | 5899 | unsigned long pfn; |
5896 | unsigned long flags; | 5900 | unsigned long flags; |
5897 | /* find the first valid pfn */ | 5901 | /* find the first valid pfn */ |
5898 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | 5902 | for (pfn = start_pfn; pfn < end_pfn; pfn++) |
5899 | if (pfn_valid(pfn)) | 5903 | if (pfn_valid(pfn)) |
5900 | break; | 5904 | break; |
5901 | if (pfn == end_pfn) | 5905 | if (pfn == end_pfn) |
5902 | return; | 5906 | return; |
5903 | zone = page_zone(pfn_to_page(pfn)); | 5907 | zone = page_zone(pfn_to_page(pfn)); |
5904 | spin_lock_irqsave(&zone->lock, flags); | 5908 | spin_lock_irqsave(&zone->lock, flags); |
5905 | pfn = start_pfn; | 5909 | pfn = start_pfn; |
5906 | while (pfn < end_pfn) { | 5910 | while (pfn < end_pfn) { |
5907 | if (!pfn_valid(pfn)) { | 5911 | if (!pfn_valid(pfn)) { |
5908 | pfn++; | 5912 | pfn++; |
5909 | continue; | 5913 | continue; |
5910 | } | 5914 | } |
5911 | page = pfn_to_page(pfn); | 5915 | page = pfn_to_page(pfn); |
5912 | BUG_ON(page_count(page)); | 5916 | BUG_ON(page_count(page)); |
5913 | BUG_ON(!PageBuddy(page)); | 5917 | BUG_ON(!PageBuddy(page)); |
5914 | order = page_order(page); | 5918 | order = page_order(page); |
5915 | #ifdef CONFIG_DEBUG_VM | 5919 | #ifdef CONFIG_DEBUG_VM |
5916 | printk(KERN_INFO "remove from free list %lx %d %lx\n", | 5920 | printk(KERN_INFO "remove from free list %lx %d %lx\n", |
5917 | pfn, 1 << order, end_pfn); | 5921 | pfn, 1 << order, end_pfn); |
5918 | #endif | 5922 | #endif |
5919 | list_del(&page->lru); | 5923 | list_del(&page->lru); |
5920 | rmv_page_order(page); | 5924 | rmv_page_order(page); |
5921 | zone->free_area[order].nr_free--; | 5925 | zone->free_area[order].nr_free--; |
5922 | __mod_zone_page_state(zone, NR_FREE_PAGES, | 5926 | __mod_zone_page_state(zone, NR_FREE_PAGES, |
5923 | - (1UL << order)); | 5927 | - (1UL << order)); |
5924 | for (i = 0; i < (1 << order); i++) | 5928 | for (i = 0; i < (1 << order); i++) |
5925 | SetPageReserved((page+i)); | 5929 | SetPageReserved((page+i)); |
5926 | pfn += (1 << order); | 5930 | pfn += (1 << order); |
5927 | } | 5931 | } |
5928 | spin_unlock_irqrestore(&zone->lock, flags); | 5932 | spin_unlock_irqrestore(&zone->lock, flags); |
5929 | } | 5933 | } |
5930 | #endif | 5934 | #endif |
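
__offline_isolated_pages() walks the PFN range one free buddy block at a time: each iteration unlinks a block, adjusts NR_FREE_PAGES by 1 << order, and advances pfn by the same amount. The small userspace sketch below demonstrates only that variable-stride walk; the PFN range and the sample orders are made up.

#include <stdio.h>

/* Illustrative only: walk a PFN range consuming one "buddy block" per step
 * and advancing by its size, as __offline_isolated_pages() does. The
 * sample_orders[] values stand in for page_order() results. */
int main(void)
{
	unsigned long pfn = 0x1000, end_pfn = 0x1040;
	int sample_orders[] = { 4, 3, 3, 5 };
	int i = 0;
	long removed = 0;

	while (pfn < end_pfn) {
		int order = sample_orders[i++ % 4];

		removed += 1UL << order;	/* NR_FREE_PAGES adjustment */
		pfn += 1UL << order;		/* skip the whole buddy block */
	}
	printf("removed %ld pages from the free lists\n", removed);
	return 0;
}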
5931 | 5935 | ||
5932 | #ifdef CONFIG_MEMORY_FAILURE | 5936 | #ifdef CONFIG_MEMORY_FAILURE |
5933 | bool is_free_buddy_page(struct page *page) | 5937 | bool is_free_buddy_page(struct page *page) |
5934 | { | 5938 | { |
5935 | struct zone *zone = page_zone(page); | 5939 | struct zone *zone = page_zone(page); |
5936 | unsigned long pfn = page_to_pfn(page); | 5940 | unsigned long pfn = page_to_pfn(page); |
5937 | unsigned long flags; | 5941 | unsigned long flags; |
5938 | int order; | 5942 | int order; |
5939 | 5943 | ||
5940 | spin_lock_irqsave(&zone->lock, flags); | 5944 | spin_lock_irqsave(&zone->lock, flags); |
5941 | for (order = 0; order < MAX_ORDER; order++) { | 5945 | for (order = 0; order < MAX_ORDER; order++) { |
5942 | struct page *page_head = page - (pfn & ((1 << order) - 1)); | 5946 | struct page *page_head = page - (pfn & ((1 << order) - 1)); |
5943 | 5947 | ||
5944 | if (PageBuddy(page_head) && page_order(page_head) >= order) | 5948 | if (PageBuddy(page_head) && page_order(page_head) >= order) |
5945 | break; | 5949 | break; |
5946 | } | 5950 | } |
5947 | spin_unlock_irqrestore(&zone->lock, flags); | 5951 | spin_unlock_irqrestore(&zone->lock, flags); |
5948 | 5952 | ||
5949 | return order < MAX_ORDER; | 5953 | return order < MAX_ORDER; |
5950 | } | 5954 | } |
5951 | #endif | 5955 | #endif |
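
is_free_buddy_page() probes each order in turn: it computes the would-be head of the order-sized block containing the page via page - (pfn & ((1 << order) - 1)) and checks whether that head is a free buddy of at least that order. The userspace sketch below reproduces just the head-PFN computation; the sample PFN and the MAX_ORDER value of 11 are assumptions.

#include <stdio.h>

#define MAX_ORDER 11	/* the usual kernel default; an assumption here */

/* Illustrative only: for a given pfn, compute the PFN of the buddy-block
 * head at each order, mirroring the arithmetic in is_free_buddy_page(). */
int main(void)
{
	unsigned long pfn = 0x12345;

	for (int order = 0; order < MAX_ORDER; order++) {
		unsigned long head = pfn - (pfn & ((1UL << order) - 1));

		printf("order %2d: head pfn %lx\n", order, head);
	}
	return 0;
}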
5952 | 5956 | ||
5953 | static const struct trace_print_flags pageflag_names[] = { | 5957 | static const struct trace_print_flags pageflag_names[] = { |
5954 | {1UL << PG_locked, "locked" }, | 5958 | {1UL << PG_locked, "locked" }, |
5955 | {1UL << PG_error, "error" }, | 5959 | {1UL << PG_error, "error" }, |
5956 | {1UL << PG_referenced, "referenced" }, | 5960 | {1UL << PG_referenced, "referenced" }, |
5957 | {1UL << PG_uptodate, "uptodate" }, | 5961 | {1UL << PG_uptodate, "uptodate" }, |
5958 | {1UL << PG_dirty, "dirty" }, | 5962 | {1UL << PG_dirty, "dirty" }, |
5959 | {1UL << PG_lru, "lru" }, | 5963 | {1UL << PG_lru, "lru" }, |
5960 | {1UL << PG_active, "active" }, | 5964 | {1UL << PG_active, "active" }, |
5961 | {1UL << PG_slab, "slab" }, | 5965 | {1UL << PG_slab, "slab" }, |
5962 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | 5966 | {1UL << PG_owner_priv_1, "owner_priv_1" }, |
5963 | {1UL << PG_arch_1, "arch_1" }, | 5967 | {1UL << PG_arch_1, "arch_1" }, |
5964 | {1UL << PG_reserved, "reserved" }, | 5968 | {1UL << PG_reserved, "reserved" }, |
5965 | {1UL << PG_private, "private" }, | 5969 | {1UL << PG_private, "private" }, |
5966 | {1UL << PG_private_2, "private_2" }, | 5970 | {1UL << PG_private_2, "private_2" }, |
5967 | {1UL << PG_writeback, "writeback" }, | 5971 | {1UL << PG_writeback, "writeback" }, |
5968 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | 5972 | #ifdef CONFIG_PAGEFLAGS_EXTENDED |
5969 | {1UL << PG_head, "head" }, | 5973 | {1UL << PG_head, "head" }, |
5970 | {1UL << PG_tail, "tail" }, | 5974 | {1UL << PG_tail, "tail" }, |
5971 | #else | 5975 | #else |
5972 | {1UL << PG_compound, "compound" }, | 5976 | {1UL << PG_compound, "compound" }, |
5973 | #endif | 5977 | #endif |
5974 | {1UL << PG_swapcache, "swapcache" }, | 5978 | {1UL << PG_swapcache, "swapcache" }, |
5975 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | 5979 | {1UL << PG_mappedtodisk, "mappedtodisk" }, |
5976 | {1UL << PG_reclaim, "reclaim" }, | 5980 | {1UL << PG_reclaim, "reclaim" }, |
5977 | {1UL << PG_swapbacked, "swapbacked" }, | 5981 | {1UL << PG_swapbacked, "swapbacked" }, |
5978 | {1UL << PG_unevictable, "unevictable" }, | 5982 | {1UL << PG_unevictable, "unevictable" }, |
5979 | #ifdef CONFIG_MMU | 5983 | #ifdef CONFIG_MMU |
5980 | {1UL << PG_mlocked, "mlocked" }, | 5984 | {1UL << PG_mlocked, "mlocked" }, |
5981 | #endif | 5985 | #endif |
5982 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | 5986 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED |
5983 | {1UL << PG_uncached, "uncached" }, | 5987 | {1UL << PG_uncached, "uncached" }, |
5984 | #endif | 5988 | #endif |
5985 | #ifdef CONFIG_MEMORY_FAILURE | 5989 | #ifdef CONFIG_MEMORY_FAILURE |
5986 | {1UL << PG_hwpoison, "hwpoison" }, | 5990 | {1UL << PG_hwpoison, "hwpoison" }, |
5987 | #endif | 5991 | #endif |
5988 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 5992 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
5989 | {1UL << PG_compound_lock, "compound_lock" }, | 5993 | {1UL << PG_compound_lock, "compound_lock" }, |
5990 | #endif | 5994 | #endif |
5991 | }; | 5995 | }; |
5992 | 5996 | ||
5993 | static void dump_page_flags(unsigned long flags) | 5997 | static void dump_page_flags(unsigned long flags) |
5994 | { | 5998 | { |
5995 | const char *delim = ""; | 5999 | const char *delim = ""; |
5996 | unsigned long mask; | 6000 | unsigned long mask; |
5997 | int i; | 6001 | int i; |
5998 | 6002 | ||
5999 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | 6003 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); |
6000 | 6004 | ||
6001 | printk(KERN_ALERT "page flags: %#lx(", flags); | 6005 | printk(KERN_ALERT "page flags: %#lx(", flags); |
6002 | 6006 | ||
6003 | /* remove zone id */ | 6007 | /* remove zone id */ |
6004 | flags &= (1UL << NR_PAGEFLAGS) - 1; | 6008 | flags &= (1UL << NR_PAGEFLAGS) - 1; |
6005 | 6009 | ||
6006 | for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { | 6010 | for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { |
6007 | 6011 | ||
6008 | mask = pageflag_names[i].mask; | 6012 | mask = pageflag_names[i].mask; |
6009 | if ((flags & mask) != mask) | 6013 | if ((flags & mask) != mask) |
6010 | continue; | 6014 | continue; |
6011 | 6015 | ||
6012 | flags &= ~mask; | 6016 | flags &= ~mask; |
6013 | printk("%s%s", delim, pageflag_names[i].name); | 6017 | printk("%s%s", delim, pageflag_names[i].name); |
6014 | delim = "|"; | 6018 | delim = "|"; |
6015 | } | 6019 | } |
6016 | 6020 | ||
6018 | /* check for leftover flags */ | 6022 | /* check for leftover flags */ |
6018 | if (flags) | 6022 | if (flags) |
6019 | printk("%s%#lx", delim, flags); | 6023 | printk("%s%#lx", delim, flags); |
6020 | 6024 | ||
6021 | printk(")\n"); | 6025 | printk(")\n"); |
6022 | } | 6026 | } |
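
dump_page_flags() decodes page->flags by walking the mask/name table, printing each matched name separated by '|' and finally, in hex, any bits the table does not cover. The userspace analog below shows the same decoding pattern with a small hypothetical flag table; it is not the kernel's pageflag_names[].

#include <stdio.h>

/* Hypothetical mask -> name table for the demo, not the kernel's. */
struct flag_name {
	unsigned long mask;
	const char *name;
};

static const struct flag_name names[] = {
	{ 1UL << 0, "locked" },
	{ 1UL << 4, "dirty"  },
	{ 1UL << 5, "lru"    },
	{ 1UL << 6, "active" },
};

static void dump_flags(unsigned long flags)
{
	const char *delim = "";

	printf("flags: %#lx(", flags);
	for (unsigned int i = 0; i < sizeof(names) / sizeof(names[0]) && flags; i++) {
		if ((flags & names[i].mask) != names[i].mask)
			continue;
		flags &= ~names[i].mask;
		printf("%s%s", delim, names[i].name);
		delim = "|";
	}
	if (flags)			/* bits with no name in the table */
		printf("%s%#lx", delim, flags);
	printf(")\n");
}

int main(void)
{
	dump_flags((1UL << 0) | (1UL << 5) | (1UL << 9));
	return 0;
}

Running it prints "flags: 0x221(locked|lru|0x200)", matching the leftover-bits behaviour of the kernel function.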
6023 | 6027 | ||
6024 | void dump_page(struct page *page) | 6028 | void dump_page(struct page *page) |
6025 | { | 6029 | { |
6026 | printk(KERN_ALERT | 6030 | printk(KERN_ALERT |
6027 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 6031 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", |
6028 | page, atomic_read(&page->_count), page_mapcount(page), | 6032 | page, atomic_read(&page->_count), page_mapcount(page), |
6029 | page->mapping, page->index); | 6033 | page->mapping, page->index); |
6030 | dump_page_flags(page->flags); | 6034 | dump_page_flags(page->flags); |
6031 | mem_cgroup_print_bad_page(page); | 6035 | mem_cgroup_print_bad_page(page); |
6032 | } | 6036 | } |
6033 | 6037 |