Commit 95c354fe9f7d6decc08a92aa26eb233ecc2155bf
Committed by Ingo Molnar
1 parent a95d67f87e
Exists in master and in 4 other branches
spinlock: lockbreak cleanup
The break_lock data structure and code for spinlocks is quite nasty. Not only does it double the size of a spinlock, but it changes locking to a potentially less optimal trylock.

Put all of that under CONFIG_GENERIC_LOCKBREAK, and introduce a __raw_spin_is_contended that uses the lock data itself to determine whether there are waiters on the lock, to be used if CONFIG_GENERIC_LOCKBREAK is not set.

Rename need_lockbreak to spin_needbreak, make it use spin_is_contended to decouple it from the spinlock implementation, and make it typesafe (rwlocks do not have any need_lockbreak sites -- why do they even get bloated up with that break_lock then?).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
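For readers who want the shape of the interface change without walking the per-file hunks below, here is a minimal C sketch. It is reconstructed from this commit message, so the exact guards and placement are assumptions rather than a verbatim excerpt of the patch:

/*
 * Sketch only -- reconstructed from the commit message above, not
 * copied from the diff.  spin_is_contended() picks its implementation
 * based on CONFIG_GENERIC_LOCKBREAK; spin_needbreak() sits on top.
 */
#ifdef CONFIG_GENERIC_LOCKBREAK
/* Old scheme: waiters set a separate break_lock word in the spinlock. */
#define spin_is_contended(lock)	((lock)->break_lock)
#else
/* New scheme: the architecture inspects the lock word itself. */
#define spin_is_contended(lock)	__raw_spin_is_contended(&(lock)->raw_lock)
#endif

/*
 * need_lockbreak() becomes spin_needbreak(): typesafe (spinlock_t only,
 * since rwlocks have no lockbreak sites) and decoupled from the lock
 * internals via spin_is_contended().
 */
static inline int spin_needbreak(spinlock_t *lock)
{
#ifdef CONFIG_PREEMPT
	return spin_is_contended(lock);
#else
	return 0;
#endif
}

Architectures that cannot (or do not yet) report contention from the lock word keep the old behaviour by enabling GENERIC_LOCKBREAK, which is exactly what the arch/*/Kconfig hunks below add (default y, depends on SMP && PREEMPT).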
Showing 19 changed files with 72 additions and 37 deletions
- arch/arm/Kconfig
- arch/ia64/Kconfig
- arch/m32r/Kconfig
- arch/mips/Kconfig
- arch/parisc/Kconfig
- arch/powerpc/Kconfig
- arch/sparc64/Kconfig
- arch/x86/Kconfig
- fs/jbd/checkpoint.c
- fs/jbd/commit.c
- fs/jbd2/checkpoint.c
- fs/jbd2/commit.c
- include/linux/sched.h
- include/linux/spinlock.h
- include/linux/spinlock_types.h
- include/linux/spinlock_up.h
- kernel/sched.c
- kernel/spinlock.c
- mm/memory.c
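The fs/jbd*, kernel/sched.c and mm/memory.c entries above are the call sites that switch from need_lockbreak() to spin_needbreak(). A hedged, illustrative sketch of the lock-break pattern at such a site follows; the loop body and helper names are invented for illustration and do not appear in the diff:

#include <linux/sched.h>	/* need_resched(), cond_resched(), spin_needbreak() */
#include <linux/spinlock.h>

/* Illustrative only: drop a contended lock so waiters (or a pending
 * reschedule) can run, then retake it and continue the loop. */
static void process_items(spinlock_t *lock)
{
	spin_lock(lock);
	while (have_more_items()) {	/* hypothetical predicate */
		handle_one_item();	/* hypothetical unit of work */

		if (need_resched() || spin_needbreak(lock)) {
			spin_unlock(lock);
			cond_resched();
			spin_lock(lock);
		}
	}
	spin_unlock(lock);
}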
arch/arm/Kconfig
1 | # | 1 | # |
2 | # For a description of the syntax of this configuration file, | 2 | # For a description of the syntax of this configuration file, |
3 | # see Documentation/kbuild/kconfig-language.txt. | 3 | # see Documentation/kbuild/kconfig-language.txt. |
4 | # | 4 | # |
5 | 5 | ||
6 | mainmenu "Linux Kernel Configuration" | 6 | mainmenu "Linux Kernel Configuration" |
7 | 7 | ||
8 | config ARM | 8 | config ARM |
9 | bool | 9 | bool |
10 | default y | 10 | default y |
11 | select RTC_LIB | 11 | select RTC_LIB |
12 | select SYS_SUPPORTS_APM_EMULATION | 12 | select SYS_SUPPORTS_APM_EMULATION |
13 | help | 13 | help |
14 | The ARM series is a line of low-power-consumption RISC chip designs | 14 | The ARM series is a line of low-power-consumption RISC chip designs |
15 | licensed by ARM Ltd and targeted at embedded applications and | 15 | licensed by ARM Ltd and targeted at embedded applications and |
16 | handhelds such as the Compaq IPAQ. ARM-based PCs are no longer | 16 | handhelds such as the Compaq IPAQ. ARM-based PCs are no longer |
17 | manufactured, but legacy ARM-based PC hardware remains popular in | 17 | manufactured, but legacy ARM-based PC hardware remains popular in |
18 | Europe. There is an ARM Linux project with a web page at | 18 | Europe. There is an ARM Linux project with a web page at |
19 | <http://www.arm.linux.org.uk/>. | 19 | <http://www.arm.linux.org.uk/>. |
20 | 20 | ||
21 | config SYS_SUPPORTS_APM_EMULATION | 21 | config SYS_SUPPORTS_APM_EMULATION |
22 | bool | 22 | bool |
23 | 23 | ||
24 | config GENERIC_GPIO | 24 | config GENERIC_GPIO |
25 | bool | 25 | bool |
26 | default n | 26 | default n |
27 | 27 | ||
28 | config GENERIC_TIME | 28 | config GENERIC_TIME |
29 | bool | 29 | bool |
30 | default n | 30 | default n |
31 | 31 | ||
32 | config GENERIC_CLOCKEVENTS | 32 | config GENERIC_CLOCKEVENTS |
33 | bool | 33 | bool |
34 | default n | 34 | default n |
35 | 35 | ||
36 | config MMU | 36 | config MMU |
37 | bool | 37 | bool |
38 | default y | 38 | default y |
39 | 39 | ||
40 | config NO_IOPORT | 40 | config NO_IOPORT |
41 | bool | 41 | bool |
42 | default n | 42 | default n |
43 | 43 | ||
44 | config EISA | 44 | config EISA |
45 | bool | 45 | bool |
46 | ---help--- | 46 | ---help--- |
47 | The Extended Industry Standard Architecture (EISA) bus was | 47 | The Extended Industry Standard Architecture (EISA) bus was |
48 | developed as an open alternative to the IBM MicroChannel bus. | 48 | developed as an open alternative to the IBM MicroChannel bus. |
49 | 49 | ||
50 | The EISA bus provided some of the features of the IBM MicroChannel | 50 | The EISA bus provided some of the features of the IBM MicroChannel |
51 | bus while maintaining backward compatibility with cards made for | 51 | bus while maintaining backward compatibility with cards made for |
52 | the older ISA bus. The EISA bus saw limited use between 1988 and | 52 | the older ISA bus. The EISA bus saw limited use between 1988 and |
53 | 1995 when it was made obsolete by the PCI bus. | 53 | 1995 when it was made obsolete by the PCI bus. |
54 | 54 | ||
55 | Say Y here if you are building a kernel for an EISA-based machine. | 55 | Say Y here if you are building a kernel for an EISA-based machine. |
56 | 56 | ||
57 | Otherwise, say N. | 57 | Otherwise, say N. |
58 | 58 | ||
59 | config SBUS | 59 | config SBUS |
60 | bool | 60 | bool |
61 | 61 | ||
62 | config MCA | 62 | config MCA |
63 | bool | 63 | bool |
64 | help | 64 | help |
65 | MicroChannel Architecture is found in some IBM PS/2 machines and | 65 | MicroChannel Architecture is found in some IBM PS/2 machines and |
66 | laptops. It is a bus system similar to PCI or ISA. See | 66 | laptops. It is a bus system similar to PCI or ISA. See |
67 | <file:Documentation/mca.txt> (and especially the web page given | 67 | <file:Documentation/mca.txt> (and especially the web page given |
68 | there) before attempting to build an MCA bus kernel. | 68 | there) before attempting to build an MCA bus kernel. |
69 | 69 | ||
70 | config GENERIC_HARDIRQS | 70 | config GENERIC_HARDIRQS |
71 | bool | 71 | bool |
72 | default y | 72 | default y |
73 | 73 | ||
74 | config STACKTRACE_SUPPORT | 74 | config STACKTRACE_SUPPORT |
75 | bool | 75 | bool |
76 | default y | 76 | default y |
77 | 77 | ||
78 | config LOCKDEP_SUPPORT | 78 | config LOCKDEP_SUPPORT |
79 | bool | 79 | bool |
80 | default y | 80 | default y |
81 | 81 | ||
82 | config TRACE_IRQFLAGS_SUPPORT | 82 | config TRACE_IRQFLAGS_SUPPORT |
83 | bool | 83 | bool |
84 | default y | 84 | default y |
85 | 85 | ||
86 | config HARDIRQS_SW_RESEND | 86 | config HARDIRQS_SW_RESEND |
87 | bool | 87 | bool |
88 | default y | 88 | default y |
89 | 89 | ||
90 | config GENERIC_IRQ_PROBE | 90 | config GENERIC_IRQ_PROBE |
91 | bool | 91 | bool |
92 | default y | 92 | default y |
93 | 93 | ||
94 | config GENERIC_LOCKBREAK | ||
95 | bool | ||
96 | default y | ||
97 | depends on SMP && PREEMPT | ||
98 | |||
94 | config RWSEM_GENERIC_SPINLOCK | 99 | config RWSEM_GENERIC_SPINLOCK |
95 | bool | 100 | bool |
96 | default y | 101 | default y |
97 | 102 | ||
98 | config RWSEM_XCHGADD_ALGORITHM | 103 | config RWSEM_XCHGADD_ALGORITHM |
99 | bool | 104 | bool |
100 | 105 | ||
101 | config ARCH_HAS_ILOG2_U32 | 106 | config ARCH_HAS_ILOG2_U32 |
102 | bool | 107 | bool |
103 | default n | 108 | default n |
104 | 109 | ||
105 | config ARCH_HAS_ILOG2_U64 | 110 | config ARCH_HAS_ILOG2_U64 |
106 | bool | 111 | bool |
107 | default n | 112 | default n |
108 | 113 | ||
109 | config GENERIC_HWEIGHT | 114 | config GENERIC_HWEIGHT |
110 | bool | 115 | bool |
111 | default y | 116 | default y |
112 | 117 | ||
113 | config GENERIC_CALIBRATE_DELAY | 118 | config GENERIC_CALIBRATE_DELAY |
114 | bool | 119 | bool |
115 | default y | 120 | default y |
116 | 121 | ||
117 | config ARCH_MAY_HAVE_PC_FDC | 122 | config ARCH_MAY_HAVE_PC_FDC |
118 | bool | 123 | bool |
119 | 124 | ||
120 | config ZONE_DMA | 125 | config ZONE_DMA |
121 | bool | 126 | bool |
122 | default y | 127 | default y |
123 | 128 | ||
124 | config GENERIC_ISA_DMA | 129 | config GENERIC_ISA_DMA |
125 | bool | 130 | bool |
126 | 131 | ||
127 | config FIQ | 132 | config FIQ |
128 | bool | 133 | bool |
129 | 134 | ||
130 | config ARCH_MTD_XIP | 135 | config ARCH_MTD_XIP |
131 | bool | 136 | bool |
132 | 137 | ||
133 | config VECTORS_BASE | 138 | config VECTORS_BASE |
134 | hex | 139 | hex |
135 | default 0xffff0000 if MMU || CPU_HIGH_VECTOR | 140 | default 0xffff0000 if MMU || CPU_HIGH_VECTOR |
136 | default DRAM_BASE if REMAP_VECTORS_TO_RAM | 141 | default DRAM_BASE if REMAP_VECTORS_TO_RAM |
137 | default 0x00000000 | 142 | default 0x00000000 |
138 | help | 143 | help |
139 | The base address of exception vectors. | 144 | The base address of exception vectors. |
140 | 145 | ||
141 | source "init/Kconfig" | 146 | source "init/Kconfig" |
142 | 147 | ||
143 | menu "System Type" | 148 | menu "System Type" |
144 | 149 | ||
145 | choice | 150 | choice |
146 | prompt "ARM system type" | 151 | prompt "ARM system type" |
147 | default ARCH_VERSATILE | 152 | default ARCH_VERSATILE |
148 | 153 | ||
149 | config ARCH_AAEC2000 | 154 | config ARCH_AAEC2000 |
150 | bool "Agilent AAEC-2000 based" | 155 | bool "Agilent AAEC-2000 based" |
151 | select ARM_AMBA | 156 | select ARM_AMBA |
152 | help | 157 | help |
153 | This enables support for systems based on the Agilent AAEC-2000 | 158 | This enables support for systems based on the Agilent AAEC-2000 |
154 | 159 | ||
155 | config ARCH_INTEGRATOR | 160 | config ARCH_INTEGRATOR |
156 | bool "ARM Ltd. Integrator family" | 161 | bool "ARM Ltd. Integrator family" |
157 | select ARM_AMBA | 162 | select ARM_AMBA |
158 | select ICST525 | 163 | select ICST525 |
159 | help | 164 | help |
160 | Support for ARM's Integrator platform. | 165 | Support for ARM's Integrator platform. |
161 | 166 | ||
162 | config ARCH_REALVIEW | 167 | config ARCH_REALVIEW |
163 | bool "ARM Ltd. RealView family" | 168 | bool "ARM Ltd. RealView family" |
164 | select ARM_AMBA | 169 | select ARM_AMBA |
165 | select ICST307 | 170 | select ICST307 |
166 | help | 171 | help |
167 | This enables support for ARM Ltd RealView boards. | 172 | This enables support for ARM Ltd RealView boards. |
168 | 173 | ||
169 | config ARCH_VERSATILE | 174 | config ARCH_VERSATILE |
170 | bool "ARM Ltd. Versatile family" | 175 | bool "ARM Ltd. Versatile family" |
171 | select ARM_AMBA | 176 | select ARM_AMBA |
172 | select ARM_VIC | 177 | select ARM_VIC |
173 | select ICST307 | 178 | select ICST307 |
174 | select GENERIC_TIME | 179 | select GENERIC_TIME |
175 | select GENERIC_CLOCKEVENTS | 180 | select GENERIC_CLOCKEVENTS |
176 | help | 181 | help |
177 | This enables support for ARM Ltd Versatile board. | 182 | This enables support for ARM Ltd Versatile board. |
178 | 183 | ||
179 | config ARCH_AT91 | 184 | config ARCH_AT91 |
180 | bool "Atmel AT91" | 185 | bool "Atmel AT91" |
181 | select GENERIC_GPIO | 186 | select GENERIC_GPIO |
182 | help | 187 | help |
183 | This enables support for systems based on the Atmel AT91RM9200, | 188 | This enables support for systems based on the Atmel AT91RM9200, |
184 | AT91SAM9 and AT91CAP9 processors. | 189 | AT91SAM9 and AT91CAP9 processors. |
185 | 190 | ||
186 | config ARCH_CLPS7500 | 191 | config ARCH_CLPS7500 |
187 | bool "Cirrus CL-PS7500FE" | 192 | bool "Cirrus CL-PS7500FE" |
188 | select TIMER_ACORN | 193 | select TIMER_ACORN |
189 | select ISA | 194 | select ISA |
190 | select NO_IOPORT | 195 | select NO_IOPORT |
191 | help | 196 | help |
192 | Support for the Cirrus Logic PS7500FE system-on-a-chip. | 197 | Support for the Cirrus Logic PS7500FE system-on-a-chip. |
193 | 198 | ||
194 | config ARCH_CLPS711X | 199 | config ARCH_CLPS711X |
195 | bool "Cirrus Logic CLPS711x/EP721x-based" | 200 | bool "Cirrus Logic CLPS711x/EP721x-based" |
196 | help | 201 | help |
197 | Support for Cirrus Logic 711x/721x based boards. | 202 | Support for Cirrus Logic 711x/721x based boards. |
198 | 203 | ||
199 | config ARCH_CO285 | 204 | config ARCH_CO285 |
200 | bool "Co-EBSA285" | 205 | bool "Co-EBSA285" |
201 | select FOOTBRIDGE | 206 | select FOOTBRIDGE |
202 | select FOOTBRIDGE_ADDIN | 207 | select FOOTBRIDGE_ADDIN |
203 | help | 208 | help |
204 | Support for Intel's EBSA285 companion chip. | 209 | Support for Intel's EBSA285 companion chip. |
205 | 210 | ||
206 | config ARCH_EBSA110 | 211 | config ARCH_EBSA110 |
207 | bool "EBSA-110" | 212 | bool "EBSA-110" |
208 | select ISA | 213 | select ISA |
209 | select NO_IOPORT | 214 | select NO_IOPORT |
210 | help | 215 | help |
211 | This is an evaluation board for the StrongARM processor available | 216 | This is an evaluation board for the StrongARM processor available |
212 | from Digital. It has limited hardware on-board, including an | 217 | from Digital. It has limited hardware on-board, including an |
213 | Ethernet interface, two PCMCIA sockets, two serial ports and a | 218 | Ethernet interface, two PCMCIA sockets, two serial ports and a |
214 | parallel port. | 219 | parallel port. |
215 | 220 | ||
216 | config ARCH_EP93XX | 221 | config ARCH_EP93XX |
217 | bool "EP93xx-based" | 222 | bool "EP93xx-based" |
218 | select ARM_AMBA | 223 | select ARM_AMBA |
219 | select ARM_VIC | 224 | select ARM_VIC |
220 | select GENERIC_GPIO | 225 | select GENERIC_GPIO |
221 | help | 226 | help |
222 | This enables support for the Cirrus EP93xx series of CPUs. | 227 | This enables support for the Cirrus EP93xx series of CPUs. |
223 | 228 | ||
224 | config ARCH_FOOTBRIDGE | 229 | config ARCH_FOOTBRIDGE |
225 | bool "FootBridge" | 230 | bool "FootBridge" |
226 | select FOOTBRIDGE | 231 | select FOOTBRIDGE |
227 | help | 232 | help |
228 | Support for systems based on the DC21285 companion chip | 233 | Support for systems based on the DC21285 companion chip |
229 | ("FootBridge"), such as the Simtec CATS and the Rebel NetWinder. | 234 | ("FootBridge"), such as the Simtec CATS and the Rebel NetWinder. |
230 | 235 | ||
231 | config ARCH_NETX | 236 | config ARCH_NETX |
232 | bool "Hilscher NetX based" | 237 | bool "Hilscher NetX based" |
233 | select ARM_VIC | 238 | select ARM_VIC |
234 | help | 239 | help |
235 | This enables support for systems based on the Hilscher NetX Soc | 240 | This enables support for systems based on the Hilscher NetX Soc |
236 | 241 | ||
237 | config ARCH_H720X | 242 | config ARCH_H720X |
238 | bool "Hynix HMS720x-based" | 243 | bool "Hynix HMS720x-based" |
239 | select ISA_DMA_API | 244 | select ISA_DMA_API |
240 | help | 245 | help |
241 | This enables support for systems based on the Hynix HMS720x | 246 | This enables support for systems based on the Hynix HMS720x |
242 | 247 | ||
243 | config ARCH_IMX | 248 | config ARCH_IMX |
244 | bool "IMX" | 249 | bool "IMX" |
245 | select GENERIC_GPIO | 250 | select GENERIC_GPIO |
246 | select GENERIC_TIME | 251 | select GENERIC_TIME |
247 | select GENERIC_CLOCKEVENTS | 252 | select GENERIC_CLOCKEVENTS |
248 | help | 253 | help |
249 | Support for Motorola's i.MX family of processors (MX1, MXL). | 254 | Support for Motorola's i.MX family of processors (MX1, MXL). |
250 | 255 | ||
251 | config ARCH_IOP13XX | 256 | config ARCH_IOP13XX |
252 | bool "IOP13xx-based" | 257 | bool "IOP13xx-based" |
253 | depends on MMU | 258 | depends on MMU |
254 | select PLAT_IOP | 259 | select PLAT_IOP |
255 | select PCI | 260 | select PCI |
256 | select ARCH_SUPPORTS_MSI | 261 | select ARCH_SUPPORTS_MSI |
257 | help | 262 | help |
258 | Support for Intel's IOP13XX (XScale) family of processors. | 263 | Support for Intel's IOP13XX (XScale) family of processors. |
259 | 264 | ||
260 | config ARCH_IOP32X | 265 | config ARCH_IOP32X |
261 | bool "IOP32x-based" | 266 | bool "IOP32x-based" |
262 | depends on MMU | 267 | depends on MMU |
263 | select PLAT_IOP | 268 | select PLAT_IOP |
264 | select PCI | 269 | select PCI |
265 | help | 270 | help |
266 | Support for Intel's 80219 and IOP32X (XScale) family of | 271 | Support for Intel's 80219 and IOP32X (XScale) family of |
267 | processors. | 272 | processors. |
268 | 273 | ||
269 | config ARCH_IOP33X | 274 | config ARCH_IOP33X |
270 | bool "IOP33x-based" | 275 | bool "IOP33x-based" |
271 | depends on MMU | 276 | depends on MMU |
272 | select PLAT_IOP | 277 | select PLAT_IOP |
273 | select PCI | 278 | select PCI |
274 | help | 279 | help |
275 | Support for Intel's IOP33X (XScale) family of processors. | 280 | Support for Intel's IOP33X (XScale) family of processors. |
276 | 281 | ||
277 | config ARCH_IXP23XX | 282 | config ARCH_IXP23XX |
278 | bool "IXP23XX-based" | 283 | bool "IXP23XX-based" |
279 | depends on MMU | 284 | depends on MMU |
280 | select PCI | 285 | select PCI |
281 | help | 286 | help |
282 | Support for Intel's IXP23xx (XScale) family of processors. | 287 | Support for Intel's IXP23xx (XScale) family of processors. |
283 | 288 | ||
284 | config ARCH_IXP2000 | 289 | config ARCH_IXP2000 |
285 | bool "IXP2400/2800-based" | 290 | bool "IXP2400/2800-based" |
286 | depends on MMU | 291 | depends on MMU |
287 | select PCI | 292 | select PCI |
288 | help | 293 | help |
289 | Support for Intel's IXP2400/2800 (XScale) family of processors. | 294 | Support for Intel's IXP2400/2800 (XScale) family of processors. |
290 | 295 | ||
291 | config ARCH_IXP4XX | 296 | config ARCH_IXP4XX |
292 | bool "IXP4xx-based" | 297 | bool "IXP4xx-based" |
293 | depends on MMU | 298 | depends on MMU |
294 | select GENERIC_GPIO | 299 | select GENERIC_GPIO |
295 | select GENERIC_TIME | 300 | select GENERIC_TIME |
296 | select GENERIC_CLOCKEVENTS | 301 | select GENERIC_CLOCKEVENTS |
297 | help | 302 | help |
298 | Support for Intel's IXP4XX (XScale) family of processors. | 303 | Support for Intel's IXP4XX (XScale) family of processors. |
299 | 304 | ||
300 | config ARCH_L7200 | 305 | config ARCH_L7200 |
301 | bool "LinkUp-L7200" | 306 | bool "LinkUp-L7200" |
302 | select FIQ | 307 | select FIQ |
303 | help | 308 | help |
304 | Say Y here if you intend to run this kernel on a LinkUp Systems | 309 | Say Y here if you intend to run this kernel on a LinkUp Systems |
305 | L7200 Software Development Board which uses an ARM720T processor. | 310 | L7200 Software Development Board which uses an ARM720T processor. |
306 | Information on this board can be obtained at: | 311 | Information on this board can be obtained at: |
307 | 312 | ||
308 | <http://www.linkupsys.com/> | 313 | <http://www.linkupsys.com/> |
309 | 314 | ||
310 | If you have any questions or comments about the Linux kernel port | 315 | If you have any questions or comments about the Linux kernel port |
311 | to this board, send e-mail to <sjhill@cotw.com>. | 316 | to this board, send e-mail to <sjhill@cotw.com>. |
312 | 317 | ||
313 | config ARCH_KS8695 | 318 | config ARCH_KS8695 |
314 | bool "Micrel/Kendin KS8695" | 319 | bool "Micrel/Kendin KS8695" |
315 | select GENERIC_GPIO | 320 | select GENERIC_GPIO |
316 | help | 321 | help |
317 | Support for Micrel/Kendin KS8695 "Centaur" (ARM922T) based | 322 | Support for Micrel/Kendin KS8695 "Centaur" (ARM922T) based |
318 | System-on-Chip devices. | 323 | System-on-Chip devices. |
319 | 324 | ||
320 | config ARCH_NS9XXX | 325 | config ARCH_NS9XXX |
321 | bool "NetSilicon NS9xxx" | 326 | bool "NetSilicon NS9xxx" |
322 | select GENERIC_GPIO | 327 | select GENERIC_GPIO |
323 | select GENERIC_TIME | 328 | select GENERIC_TIME |
324 | select GENERIC_CLOCKEVENTS | 329 | select GENERIC_CLOCKEVENTS |
325 | help | 330 | help |
326 | Say Y here if you intend to run this kernel on a NetSilicon NS9xxx | 331 | Say Y here if you intend to run this kernel on a NetSilicon NS9xxx |
327 | System. | 332 | System. |
328 | 333 | ||
329 | <http://www.digi.com/products/microprocessors/index.jsp> | 334 | <http://www.digi.com/products/microprocessors/index.jsp> |
330 | 335 | ||
331 | config ARCH_MXC | 336 | config ARCH_MXC |
332 | bool "Freescale MXC/iMX-based" | 337 | bool "Freescale MXC/iMX-based" |
333 | select ARCH_MTD_XIP | 338 | select ARCH_MTD_XIP |
334 | help | 339 | help |
335 | Support for Freescale MXC/iMX-based family of processors | 340 | Support for Freescale MXC/iMX-based family of processors |
336 | 341 | ||
337 | config ARCH_ORION | 342 | config ARCH_ORION |
338 | bool "Marvell Orion" | 343 | bool "Marvell Orion" |
339 | depends on MMU | 344 | depends on MMU |
340 | select PCI | 345 | select PCI |
341 | select GENERIC_GPIO | 346 | select GENERIC_GPIO |
342 | select GENERIC_TIME | 347 | select GENERIC_TIME |
343 | select GENERIC_CLOCKEVENTS | 348 | select GENERIC_CLOCKEVENTS |
344 | help | 349 | help |
345 | Support for Marvell Orion System on Chip family. | 350 | Support for Marvell Orion System on Chip family. |
346 | 351 | ||
347 | config ARCH_PNX4008 | 352 | config ARCH_PNX4008 |
348 | bool "Philips Nexperia PNX4008 Mobile" | 353 | bool "Philips Nexperia PNX4008 Mobile" |
349 | help | 354 | help |
350 | This enables support for Philips PNX4008 mobile platform. | 355 | This enables support for Philips PNX4008 mobile platform. |
351 | 356 | ||
352 | config ARCH_PXA | 357 | config ARCH_PXA |
353 | bool "PXA2xx/PXA3xx-based" | 358 | bool "PXA2xx/PXA3xx-based" |
354 | depends on MMU | 359 | depends on MMU |
355 | select ARCH_MTD_XIP | 360 | select ARCH_MTD_XIP |
356 | select GENERIC_GPIO | 361 | select GENERIC_GPIO |
357 | select GENERIC_TIME | 362 | select GENERIC_TIME |
358 | select GENERIC_CLOCKEVENTS | 363 | select GENERIC_CLOCKEVENTS |
359 | select TICK_ONESHOT | 364 | select TICK_ONESHOT |
360 | help | 365 | help |
361 | Support for Intel/Marvell's PXA2xx/PXA3xx processor line. | 366 | Support for Intel/Marvell's PXA2xx/PXA3xx processor line. |
362 | 367 | ||
363 | config ARCH_RPC | 368 | config ARCH_RPC |
364 | bool "RiscPC" | 369 | bool "RiscPC" |
365 | select ARCH_ACORN | 370 | select ARCH_ACORN |
366 | select FIQ | 371 | select FIQ |
367 | select TIMER_ACORN | 372 | select TIMER_ACORN |
368 | select ARCH_MAY_HAVE_PC_FDC | 373 | select ARCH_MAY_HAVE_PC_FDC |
369 | select ISA_DMA_API | 374 | select ISA_DMA_API |
370 | select NO_IOPORT | 375 | select NO_IOPORT |
371 | help | 376 | help |
372 | On the Acorn Risc-PC, Linux can support the internal IDE disk and | 377 | On the Acorn Risc-PC, Linux can support the internal IDE disk and |
373 | CD-ROM interface, serial and parallel port, and the floppy drive. | 378 | CD-ROM interface, serial and parallel port, and the floppy drive. |
374 | 379 | ||
375 | config ARCH_SA1100 | 380 | config ARCH_SA1100 |
376 | bool "SA1100-based" | 381 | bool "SA1100-based" |
377 | select ISA | 382 | select ISA |
378 | select ARCH_DISCONTIGMEM_ENABLE | 383 | select ARCH_DISCONTIGMEM_ENABLE |
379 | select ARCH_MTD_XIP | 384 | select ARCH_MTD_XIP |
380 | select GENERIC_GPIO | 385 | select GENERIC_GPIO |
381 | select GENERIC_TIME | 386 | select GENERIC_TIME |
382 | help | 387 | help |
383 | Support for StrongARM 11x0 based boards. | 388 | Support for StrongARM 11x0 based boards. |
384 | 389 | ||
385 | config ARCH_S3C2410 | 390 | config ARCH_S3C2410 |
386 | bool "Samsung S3C2410, S3C2412, S3C2413, S3C2440, S3C2442, S3C2443" | 391 | bool "Samsung S3C2410, S3C2412, S3C2413, S3C2440, S3C2442, S3C2443" |
387 | select GENERIC_GPIO | 392 | select GENERIC_GPIO |
388 | help | 393 | help |
389 | Samsung S3C2410X CPU based systems, such as the Simtec Electronics | 394 | Samsung S3C2410X CPU based systems, such as the Simtec Electronics |
390 | BAST (<http://www.simtec.co.uk/products/EB110ITX/>), the IPAQ 1940 or | 395 | BAST (<http://www.simtec.co.uk/products/EB110ITX/>), the IPAQ 1940 or |
391 | the Samsung SMDK2410 development board (and derivatives). | 396 | the Samsung SMDK2410 development board (and derivatives). |
392 | 397 | ||
393 | config ARCH_SHARK | 398 | config ARCH_SHARK |
394 | bool "Shark" | 399 | bool "Shark" |
395 | select ISA | 400 | select ISA |
396 | select ISA_DMA | 401 | select ISA_DMA |
397 | select PCI | 402 | select PCI |
398 | help | 403 | help |
399 | Support for the StrongARM based Digital DNARD machine, also known | 404 | Support for the StrongARM based Digital DNARD machine, also known |
400 | as "Shark" (<http://www.shark-linux.de/shark.html>). | 405 | as "Shark" (<http://www.shark-linux.de/shark.html>). |
401 | 406 | ||
402 | config ARCH_LH7A40X | 407 | config ARCH_LH7A40X |
403 | bool "Sharp LH7A40X" | 408 | bool "Sharp LH7A40X" |
404 | help | 409 | help |
405 | Say Y here for systems based on one of the Sharp LH7A40X | 410 | Say Y here for systems based on one of the Sharp LH7A40X |
406 | System on a Chip processors. These CPUs include an ARM922T | 411 | System on a Chip processors. These CPUs include an ARM922T |
407 | core with a wide array of integrated devices for | 412 | core with a wide array of integrated devices for |
408 | hand-held and low-power applications. | 413 | hand-held and low-power applications. |
409 | 414 | ||
410 | config ARCH_DAVINCI | 415 | config ARCH_DAVINCI |
411 | bool "TI DaVinci" | 416 | bool "TI DaVinci" |
412 | select GENERIC_TIME | 417 | select GENERIC_TIME |
413 | select GENERIC_CLOCKEVENTS | 418 | select GENERIC_CLOCKEVENTS |
414 | select GENERIC_GPIO | 419 | select GENERIC_GPIO |
415 | help | 420 | help |
416 | Support for TI's DaVinci platform. | 421 | Support for TI's DaVinci platform. |
417 | 422 | ||
418 | config ARCH_OMAP | 423 | config ARCH_OMAP |
419 | bool "TI OMAP" | 424 | bool "TI OMAP" |
420 | select GENERIC_GPIO | 425 | select GENERIC_GPIO |
421 | select GENERIC_TIME | 426 | select GENERIC_TIME |
422 | help | 427 | help |
423 | Support for TI's OMAP platform (OMAP1 and OMAP2). | 428 | Support for TI's OMAP platform (OMAP1 and OMAP2). |
424 | 429 | ||
425 | config ARCH_MSM7X00A | 430 | config ARCH_MSM7X00A |
426 | bool "Qualcomm MSM7X00A" | 431 | bool "Qualcomm MSM7X00A" |
427 | select GENERIC_TIME | 432 | select GENERIC_TIME |
428 | select GENERIC_CLOCKEVENTS | 433 | select GENERIC_CLOCKEVENTS |
429 | help | 434 | help |
430 | Support for Qualcomm MSM7X00A based systems. This runs on the ARM11 | 435 | Support for Qualcomm MSM7X00A based systems. This runs on the ARM11 |
431 | apps processor of the MSM7X00A and depends on a shared memory | 436 | apps processor of the MSM7X00A and depends on a shared memory |
432 | interface to the ARM9 modem processor which runs the baseband stack | 437 | interface to the ARM9 modem processor which runs the baseband stack |
433 | and controls some vital subsystems (clock and power control, etc). | 438 | and controls some vital subsystems (clock and power control, etc). |
434 | <http://www.cdmatech.com/products/msm7200_chipset_solution.jsp> | 439 | <http://www.cdmatech.com/products/msm7200_chipset_solution.jsp> |
435 | 440 | ||
436 | endchoice | 441 | endchoice |
437 | 442 | ||
438 | source "arch/arm/mach-clps711x/Kconfig" | 443 | source "arch/arm/mach-clps711x/Kconfig" |
439 | 444 | ||
440 | source "arch/arm/mach-ep93xx/Kconfig" | 445 | source "arch/arm/mach-ep93xx/Kconfig" |
441 | 446 | ||
442 | source "arch/arm/mach-footbridge/Kconfig" | 447 | source "arch/arm/mach-footbridge/Kconfig" |
443 | 448 | ||
444 | source "arch/arm/mach-integrator/Kconfig" | 449 | source "arch/arm/mach-integrator/Kconfig" |
445 | 450 | ||
446 | source "arch/arm/mach-iop32x/Kconfig" | 451 | source "arch/arm/mach-iop32x/Kconfig" |
447 | 452 | ||
448 | source "arch/arm/mach-iop33x/Kconfig" | 453 | source "arch/arm/mach-iop33x/Kconfig" |
449 | 454 | ||
450 | source "arch/arm/mach-iop13xx/Kconfig" | 455 | source "arch/arm/mach-iop13xx/Kconfig" |
451 | 456 | ||
452 | source "arch/arm/mach-ixp4xx/Kconfig" | 457 | source "arch/arm/mach-ixp4xx/Kconfig" |
453 | 458 | ||
454 | source "arch/arm/mach-ixp2000/Kconfig" | 459 | source "arch/arm/mach-ixp2000/Kconfig" |
455 | 460 | ||
456 | source "arch/arm/mach-ixp23xx/Kconfig" | 461 | source "arch/arm/mach-ixp23xx/Kconfig" |
457 | 462 | ||
458 | source "arch/arm/mach-pxa/Kconfig" | 463 | source "arch/arm/mach-pxa/Kconfig" |
459 | 464 | ||
460 | source "arch/arm/mach-sa1100/Kconfig" | 465 | source "arch/arm/mach-sa1100/Kconfig" |
461 | 466 | ||
462 | source "arch/arm/plat-omap/Kconfig" | 467 | source "arch/arm/plat-omap/Kconfig" |
463 | 468 | ||
464 | source "arch/arm/mach-omap1/Kconfig" | 469 | source "arch/arm/mach-omap1/Kconfig" |
465 | 470 | ||
466 | source "arch/arm/mach-omap2/Kconfig" | 471 | source "arch/arm/mach-omap2/Kconfig" |
467 | 472 | ||
468 | source "arch/arm/mach-orion/Kconfig" | 473 | source "arch/arm/mach-orion/Kconfig" |
469 | 474 | ||
470 | source "arch/arm/plat-s3c24xx/Kconfig" | 475 | source "arch/arm/plat-s3c24xx/Kconfig" |
471 | source "arch/arm/plat-s3c/Kconfig" | 476 | source "arch/arm/plat-s3c/Kconfig" |
472 | 477 | ||
473 | if ARCH_S3C2410 | 478 | if ARCH_S3C2410 |
474 | source "arch/arm/mach-s3c2400/Kconfig" | 479 | source "arch/arm/mach-s3c2400/Kconfig" |
475 | source "arch/arm/mach-s3c2410/Kconfig" | 480 | source "arch/arm/mach-s3c2410/Kconfig" |
476 | source "arch/arm/mach-s3c2412/Kconfig" | 481 | source "arch/arm/mach-s3c2412/Kconfig" |
477 | source "arch/arm/mach-s3c2440/Kconfig" | 482 | source "arch/arm/mach-s3c2440/Kconfig" |
478 | source "arch/arm/mach-s3c2442/Kconfig" | 483 | source "arch/arm/mach-s3c2442/Kconfig" |
479 | source "arch/arm/mach-s3c2443/Kconfig" | 484 | source "arch/arm/mach-s3c2443/Kconfig" |
480 | endif | 485 | endif |
481 | 486 | ||
482 | source "arch/arm/mach-lh7a40x/Kconfig" | 487 | source "arch/arm/mach-lh7a40x/Kconfig" |
483 | 488 | ||
484 | source "arch/arm/mach-imx/Kconfig" | 489 | source "arch/arm/mach-imx/Kconfig" |
485 | 490 | ||
486 | source "arch/arm/mach-h720x/Kconfig" | 491 | source "arch/arm/mach-h720x/Kconfig" |
487 | 492 | ||
488 | source "arch/arm/mach-versatile/Kconfig" | 493 | source "arch/arm/mach-versatile/Kconfig" |
489 | 494 | ||
490 | source "arch/arm/mach-aaec2000/Kconfig" | 495 | source "arch/arm/mach-aaec2000/Kconfig" |
491 | 496 | ||
492 | source "arch/arm/mach-realview/Kconfig" | 497 | source "arch/arm/mach-realview/Kconfig" |
493 | 498 | ||
494 | source "arch/arm/mach-at91/Kconfig" | 499 | source "arch/arm/mach-at91/Kconfig" |
495 | 500 | ||
496 | source "arch/arm/plat-mxc/Kconfig" | 501 | source "arch/arm/plat-mxc/Kconfig" |
497 | 502 | ||
498 | source "arch/arm/mach-netx/Kconfig" | 503 | source "arch/arm/mach-netx/Kconfig" |
499 | 504 | ||
500 | source "arch/arm/mach-ns9xxx/Kconfig" | 505 | source "arch/arm/mach-ns9xxx/Kconfig" |
501 | 506 | ||
502 | source "arch/arm/mach-davinci/Kconfig" | 507 | source "arch/arm/mach-davinci/Kconfig" |
503 | 508 | ||
504 | source "arch/arm/mach-ks8695/Kconfig" | 509 | source "arch/arm/mach-ks8695/Kconfig" |
505 | 510 | ||
506 | source "arch/arm/mach-msm/Kconfig" | 511 | source "arch/arm/mach-msm/Kconfig" |
507 | 512 | ||
508 | # Definitions to make life easier | 513 | # Definitions to make life easier |
509 | config ARCH_ACORN | 514 | config ARCH_ACORN |
510 | bool | 515 | bool |
511 | 516 | ||
512 | config PLAT_IOP | 517 | config PLAT_IOP |
513 | bool | 518 | bool |
514 | 519 | ||
515 | source arch/arm/mm/Kconfig | 520 | source arch/arm/mm/Kconfig |
516 | 521 | ||
517 | config IWMMXT | 522 | config IWMMXT |
518 | bool "Enable iWMMXt support" | 523 | bool "Enable iWMMXt support" |
519 | depends on CPU_XSCALE || CPU_XSC3 | 524 | depends on CPU_XSCALE || CPU_XSC3 |
520 | default y if PXA27x || PXA3xx | 525 | default y if PXA27x || PXA3xx |
521 | help | 526 | help |
522 | Enable support for iWMMXt context switching at run time if | 527 | Enable support for iWMMXt context switching at run time if |
523 | running on a CPU that supports it. | 528 | running on a CPU that supports it. |
524 | 529 | ||
525 | # bool 'Use XScale PMU as timer source' CONFIG_XSCALE_PMU_TIMER | 530 | # bool 'Use XScale PMU as timer source' CONFIG_XSCALE_PMU_TIMER |
526 | config XSCALE_PMU | 531 | config XSCALE_PMU |
527 | bool | 532 | bool |
528 | depends on CPU_XSCALE && !XSCALE_PMU_TIMER | 533 | depends on CPU_XSCALE && !XSCALE_PMU_TIMER |
529 | default y | 534 | default y |
530 | 535 | ||
531 | if !MMU | 536 | if !MMU |
532 | source "arch/arm/Kconfig-nommu" | 537 | source "arch/arm/Kconfig-nommu" |
533 | endif | 538 | endif |
534 | 539 | ||
535 | endmenu | 540 | endmenu |
536 | 541 | ||
537 | source "arch/arm/common/Kconfig" | 542 | source "arch/arm/common/Kconfig" |
538 | 543 | ||
539 | config FORCE_MAX_ZONEORDER | 544 | config FORCE_MAX_ZONEORDER |
540 | int | 545 | int |
541 | depends on SA1111 | 546 | depends on SA1111 |
542 | default "9" | 547 | default "9" |
543 | 548 | ||
544 | menu "Bus support" | 549 | menu "Bus support" |
545 | 550 | ||
546 | config ARM_AMBA | 551 | config ARM_AMBA |
547 | bool | 552 | bool |
548 | 553 | ||
549 | config ISA | 554 | config ISA |
550 | bool | 555 | bool |
551 | help | 556 | help |
552 | Find out whether you have ISA slots on your motherboard. ISA is the | 557 | Find out whether you have ISA slots on your motherboard. ISA is the |
553 | name of a bus system, i.e. the way the CPU talks to the other stuff | 558 | name of a bus system, i.e. the way the CPU talks to the other stuff |
554 | inside your box. Other bus systems are PCI, EISA, MicroChannel | 559 | inside your box. Other bus systems are PCI, EISA, MicroChannel |
555 | (MCA) or VESA. ISA is an older system, now being displaced by PCI; | 560 | (MCA) or VESA. ISA is an older system, now being displaced by PCI; |
556 | newer boards don't support it. If you have ISA, say Y, otherwise N. | 561 | newer boards don't support it. If you have ISA, say Y, otherwise N. |
557 | 562 | ||
558 | # Select ISA DMA controller support | 563 | # Select ISA DMA controller support |
559 | config ISA_DMA | 564 | config ISA_DMA |
560 | bool | 565 | bool |
561 | select ISA_DMA_API | 566 | select ISA_DMA_API |
562 | 567 | ||
563 | # Select ISA DMA interface | 568 | # Select ISA DMA interface |
564 | config ISA_DMA_API | 569 | config ISA_DMA_API |
565 | bool | 570 | bool |
566 | 571 | ||
567 | config PCI | 572 | config PCI |
568 | bool "PCI support" if ARCH_INTEGRATOR_AP || ARCH_VERSATILE_PB || ARCH_IXP4XX || ARCH_KS8695 || MACH_ARMCORE | 573 | bool "PCI support" if ARCH_INTEGRATOR_AP || ARCH_VERSATILE_PB || ARCH_IXP4XX || ARCH_KS8695 || MACH_ARMCORE |
569 | help | 574 | help |
570 | Find out whether you have a PCI motherboard. PCI is the name of a | 575 | Find out whether you have a PCI motherboard. PCI is the name of a |
571 | bus system, i.e. the way the CPU talks to the other stuff inside | 576 | bus system, i.e. the way the CPU talks to the other stuff inside |
572 | your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or | 577 | your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or |
573 | VESA. If you have PCI, say Y, otherwise N. | 578 | VESA. If you have PCI, say Y, otherwise N. |
574 | 579 | ||
575 | The PCI-HOWTO, available from | 580 | The PCI-HOWTO, available from |
576 | <http://www.tldp.org/docs.html#howto>, contains valuable | 581 | <http://www.tldp.org/docs.html#howto>, contains valuable |
577 | information about which PCI hardware does work under Linux and which | 582 | information about which PCI hardware does work under Linux and which |
578 | doesn't. | 583 | doesn't. |
579 | 584 | ||
580 | config PCI_SYSCALL | 585 | config PCI_SYSCALL |
581 | def_bool PCI | 586 | def_bool PCI |
582 | 587 | ||
583 | # Select the host bridge type | 588 | # Select the host bridge type |
584 | config PCI_HOST_VIA82C505 | 589 | config PCI_HOST_VIA82C505 |
585 | bool | 590 | bool |
586 | depends on PCI && ARCH_SHARK | 591 | depends on PCI && ARCH_SHARK |
587 | default y | 592 | default y |
588 | 593 | ||
589 | config PCI_HOST_ITE8152 | 594 | config PCI_HOST_ITE8152 |
590 | bool | 595 | bool |
591 | depends on PCI && MACH_ARMCORE | 596 | depends on PCI && MACH_ARMCORE |
592 | default y | 597 | default y |
593 | select DMABOUNCE | 598 | select DMABOUNCE |
594 | 599 | ||
595 | source "drivers/pci/Kconfig" | 600 | source "drivers/pci/Kconfig" |
596 | 601 | ||
597 | source "drivers/pcmcia/Kconfig" | 602 | source "drivers/pcmcia/Kconfig" |
598 | 603 | ||
599 | endmenu | 604 | endmenu |
600 | 605 | ||
601 | menu "Kernel Features" | 606 | menu "Kernel Features" |
602 | 607 | ||
603 | source "kernel/time/Kconfig" | 608 | source "kernel/time/Kconfig" |
604 | 609 | ||
605 | config SMP | 610 | config SMP |
606 | bool "Symmetric Multi-Processing (EXPERIMENTAL)" | 611 | bool "Symmetric Multi-Processing (EXPERIMENTAL)" |
607 | depends on EXPERIMENTAL && REALVIEW_MPCORE | 612 | depends on EXPERIMENTAL && REALVIEW_MPCORE |
608 | help | 613 | help |
609 | This enables support for systems with more than one CPU. If you have | 614 | This enables support for systems with more than one CPU. If you have |
610 | a system with only one CPU, like most personal computers, say N. If | 615 | a system with only one CPU, like most personal computers, say N. If |
611 | you have a system with more than one CPU, say Y. | 616 | you have a system with more than one CPU, say Y. |
612 | 617 | ||
613 | If you say N here, the kernel will run on single and multiprocessor | 618 | If you say N here, the kernel will run on single and multiprocessor |
614 | machines, but will use only one CPU of a multiprocessor machine. If | 619 | machines, but will use only one CPU of a multiprocessor machine. If |
615 | you say Y here, the kernel will run on many, but not all, single | 620 | you say Y here, the kernel will run on many, but not all, single |
616 | processor machines. On a single processor machine, the kernel will | 621 | processor machines. On a single processor machine, the kernel will |
617 | run faster if you say N here. | 622 | run faster if you say N here. |
618 | 623 | ||
619 | See also the <file:Documentation/smp.txt>, | 624 | See also the <file:Documentation/smp.txt>, |
620 | <file:Documentation/i386/IO-APIC.txt>, | 625 | <file:Documentation/i386/IO-APIC.txt>, |
621 | <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at | 626 | <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at |
622 | <http://www.linuxdoc.org/docs.html#howto>. | 627 | <http://www.linuxdoc.org/docs.html#howto>. |
623 | 628 | ||
624 | If you don't know what to do here, say N. | 629 | If you don't know what to do here, say N. |
625 | 630 | ||
626 | config NR_CPUS | 631 | config NR_CPUS |
627 | int "Maximum number of CPUs (2-32)" | 632 | int "Maximum number of CPUs (2-32)" |
628 | range 2 32 | 633 | range 2 32 |
629 | depends on SMP | 634 | depends on SMP |
630 | default "4" | 635 | default "4" |
631 | 636 | ||
632 | config HOTPLUG_CPU | 637 | config HOTPLUG_CPU |
633 | bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" | 638 | bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" |
634 | depends on SMP && HOTPLUG && EXPERIMENTAL | 639 | depends on SMP && HOTPLUG && EXPERIMENTAL |
635 | help | 640 | help |
636 | Say Y here to experiment with turning CPUs off and on. CPUs | 641 | Say Y here to experiment with turning CPUs off and on. CPUs |
637 | can be controlled through /sys/devices/system/cpu. | 642 | can be controlled through /sys/devices/system/cpu. |
638 | 643 | ||
639 | config LOCAL_TIMERS | 644 | config LOCAL_TIMERS |
640 | bool "Use local timer interrupts" | 645 | bool "Use local timer interrupts" |
641 | depends on SMP && REALVIEW_MPCORE | 646 | depends on SMP && REALVIEW_MPCORE |
642 | default y | 647 | default y |
643 | help | 648 | help |
644 | Enable support for local timers on SMP platforms, rather then the | 649 | Enable support for local timers on SMP platforms, rather then the |
645 | legacy IPI broadcast method. Local timers allows the system | 650 | legacy IPI broadcast method. Local timers allows the system |
646 | accounting to be spread across the timer interval, preventing a | 651 | accounting to be spread across the timer interval, preventing a |
647 | "thundering herd" at every timer tick. | 652 | "thundering herd" at every timer tick. |
648 | 653 | ||
649 | config PREEMPT | 654 | config PREEMPT |
650 | bool "Preemptible Kernel (EXPERIMENTAL)" | 655 | bool "Preemptible Kernel (EXPERIMENTAL)" |
651 | depends on EXPERIMENTAL | 656 | depends on EXPERIMENTAL |
652 | help | 657 | help |
653 | This option reduces the latency of the kernel when reacting to | 658 | This option reduces the latency of the kernel when reacting to |
654 | real-time or interactive events by allowing a low priority process to | 659 | real-time or interactive events by allowing a low priority process to |
655 | be preempted even if it is in kernel mode executing a system call. | 660 | be preempted even if it is in kernel mode executing a system call. |
656 | This allows applications to run more reliably even when the system is | 661 | This allows applications to run more reliably even when the system is |
657 | under load. | 662 | under load. |
658 | 663 | ||
659 | Say Y here if you are building a kernel for a desktop, embedded | 664 | Say Y here if you are building a kernel for a desktop, embedded |
660 | or real-time system. Say N if you are unsure. | 665 | or real-time system. Say N if you are unsure. |
661 | 666 | ||
662 | config NO_IDLE_HZ | 667 | config NO_IDLE_HZ |
663 | bool "Dynamic tick timer" | 668 | bool "Dynamic tick timer" |
664 | depends on !GENERIC_CLOCKEVENTS | 669 | depends on !GENERIC_CLOCKEVENTS |
665 | help | 670 | help |
666 | Select this option if you want to disable continuous timer ticks | 671 | Select this option if you want to disable continuous timer ticks |
667 | and have them programmed to occur as required. This option saves | 672 | and have them programmed to occur as required. This option saves |
668 | power as the system can remain in idle state for longer. | 673 | power as the system can remain in idle state for longer. |
669 | 674 | ||
670 | By default dynamic tick is disabled during the boot, and can be | 675 | By default dynamic tick is disabled during the boot, and can be |
671 | manually enabled with: | 676 | manually enabled with: |
672 | 677 | ||
673 | echo 1 > /sys/devices/system/timer/timer0/dyn_tick | 678 | echo 1 > /sys/devices/system/timer/timer0/dyn_tick |
674 | 679 | ||
675 | Alternatively, if you want dynamic tick automatically enabled | 680 | Alternatively, if you want dynamic tick automatically enabled |
676 | during boot, pass "dyntick=enable" via the kernel command string. | 681 | during boot, pass "dyntick=enable" via the kernel command string. |
677 | 682 | ||
678 | Please note that dynamic tick may affect the accuracy of | 683 | Please note that dynamic tick may affect the accuracy of |
679 | timekeeping on some platforms depending on the implementation. | 684 | timekeeping on some platforms depending on the implementation. |
680 | Currently at least OMAP, PXA2xx and SA11x0 platforms are known | 685 | Currently at least OMAP, PXA2xx and SA11x0 platforms are known |
681 | to have accurate timekeeping with dynamic tick. | 686 | to have accurate timekeeping with dynamic tick. |
682 | 687 | ||
683 | config HZ | 688 | config HZ |
684 | int | 689 | int |
685 | default 128 if ARCH_L7200 | 690 | default 128 if ARCH_L7200 |
686 | default 200 if ARCH_EBSA110 || ARCH_S3C2410 | 691 | default 200 if ARCH_EBSA110 || ARCH_S3C2410 |
687 | default OMAP_32K_TIMER_HZ if ARCH_OMAP && OMAP_32K_TIMER | 692 | default OMAP_32K_TIMER_HZ if ARCH_OMAP && OMAP_32K_TIMER |
688 | default AT91_TIMER_HZ if ARCH_AT91 | 693 | default AT91_TIMER_HZ if ARCH_AT91 |
689 | default 100 | 694 | default 100 |
690 | 695 | ||
691 | config AEABI | 696 | config AEABI |
692 | bool "Use the ARM EABI to compile the kernel" | 697 | bool "Use the ARM EABI to compile the kernel" |
693 | help | 698 | help |
694 | This option allows for the kernel to be compiled using the latest | 699 | This option allows for the kernel to be compiled using the latest |
695 | ARM ABI (aka EABI). This is only useful if you are using a user | 700 | ARM ABI (aka EABI). This is only useful if you are using a user |
696 | space environment that is also compiled with EABI. | 701 | space environment that is also compiled with EABI. |
697 | 702 | ||
698 | Since there are major incompatibilities between the legacy ABI and | 703 | Since there are major incompatibilities between the legacy ABI and |
699 | EABI, especially with regard to structure member alignment, this | 704 | EABI, especially with regard to structure member alignment, this |
700 | option also changes the kernel syscall calling convention to | 705 | option also changes the kernel syscall calling convention to |
701 | disambiguate both ABIs and allow for backward compatibility support | 706 | disambiguate both ABIs and allow for backward compatibility support |
702 | (selected with CONFIG_OABI_COMPAT). | 707 | (selected with CONFIG_OABI_COMPAT). |
703 | 708 | ||
704 | To use this you need GCC version 4.0.0 or later. | 709 | To use this you need GCC version 4.0.0 or later. |
705 | 710 | ||
706 | config OABI_COMPAT | 711 | config OABI_COMPAT |
707 | bool "Allow old ABI binaries to run with this kernel (EXPERIMENTAL)" | 712 | bool "Allow old ABI binaries to run with this kernel (EXPERIMENTAL)" |
708 | depends on AEABI && EXPERIMENTAL | 713 | depends on AEABI && EXPERIMENTAL |
709 | default y | 714 | default y |
710 | help | 715 | help |
711 | This option preserves the old syscall interface along with the | 716 | This option preserves the old syscall interface along with the |
712 | new (ARM EABI) one. It also provides a compatibility layer to | 717 | new (ARM EABI) one. It also provides a compatibility layer to |
713 | intercept syscalls that have structure arguments which layout | 718 | intercept syscalls that have structure arguments which layout |
714 | in memory differs between the legacy ABI and the new ARM EABI | 719 | in memory differs between the legacy ABI and the new ARM EABI |
715 | (only for non "thumb" binaries). This option adds a tiny | 720 | (only for non "thumb" binaries). This option adds a tiny |
716 | overhead to all syscalls and produces a slightly larger kernel. | 721 | overhead to all syscalls and produces a slightly larger kernel. |
717 | If you know you'll be using only pure EABI user space then you | 722 | If you know you'll be using only pure EABI user space then you |
718 | can say N here. If this option is not selected and you attempt | 723 | can say N here. If this option is not selected and you attempt |
719 | to execute a legacy ABI binary then the result will be | 724 | to execute a legacy ABI binary then the result will be |
720 | UNPREDICTABLE (in fact it can be predicted that it won't work | 725 | UNPREDICTABLE (in fact it can be predicted that it won't work |
721 | at all). If in doubt say Y. | 726 | at all). If in doubt say Y. |
722 | 727 | ||
723 | config ARCH_DISCONTIGMEM_ENABLE | 728 | config ARCH_DISCONTIGMEM_ENABLE |
724 | bool | 729 | bool |
725 | default (ARCH_LH7A40X && !LH7A40X_CONTIGMEM) | 730 | default (ARCH_LH7A40X && !LH7A40X_CONTIGMEM) |
726 | help | 731 | help |
727 | Say Y to support efficient handling of discontiguous physical memory, | 732 | Say Y to support efficient handling of discontiguous physical memory, |
728 | for architectures which are either NUMA (Non-Uniform Memory Access) | 733 | for architectures which are either NUMA (Non-Uniform Memory Access) |
729 | or have huge holes in the physical address space for other reasons. | 734 | or have huge holes in the physical address space for other reasons. |
730 | See <file:Documentation/vm/numa> for more. | 735 | See <file:Documentation/vm/numa> for more. |
731 | 736 | ||
732 | config NODES_SHIFT | 737 | config NODES_SHIFT |
733 | int | 738 | int |
734 | default "4" if ARCH_LH7A40X | 739 | default "4" if ARCH_LH7A40X |
735 | default "2" | 740 | default "2" |
736 | depends on NEED_MULTIPLE_NODES | 741 | depends on NEED_MULTIPLE_NODES |
737 | 742 | ||
738 | source "mm/Kconfig" | 743 | source "mm/Kconfig" |
739 | 744 | ||
740 | config LEDS | 745 | config LEDS |
741 | bool "Timer and CPU usage LEDs" | 746 | bool "Timer and CPU usage LEDs" |
742 | depends on ARCH_CDB89712 || ARCH_CO285 || ARCH_EBSA110 || \ | 747 | depends on ARCH_CDB89712 || ARCH_CO285 || ARCH_EBSA110 || \ |
743 | ARCH_EBSA285 || ARCH_IMX || ARCH_INTEGRATOR || \ | 748 | ARCH_EBSA285 || ARCH_IMX || ARCH_INTEGRATOR || \ |
744 | ARCH_LUBBOCK || MACH_MAINSTONE || ARCH_NETWINDER || \ | 749 | ARCH_LUBBOCK || MACH_MAINSTONE || ARCH_NETWINDER || \ |
745 | ARCH_OMAP || ARCH_P720T || ARCH_PXA_IDP || \ | 750 | ARCH_OMAP || ARCH_P720T || ARCH_PXA_IDP || \ |
746 | ARCH_SA1100 || ARCH_SHARK || ARCH_VERSATILE || \ | 751 | ARCH_SA1100 || ARCH_SHARK || ARCH_VERSATILE || \ |
747 | ARCH_AT91 || MACH_TRIZEPS4 || ARCH_DAVINCI || \ | 752 | ARCH_AT91 || MACH_TRIZEPS4 || ARCH_DAVINCI || \ |
748 | ARCH_KS8695 || MACH_RD88F5182 | 753 | ARCH_KS8695 || MACH_RD88F5182 |
749 | help | 754 | help |
750 | If you say Y here, the LEDs on your machine will be used | 755 | If you say Y here, the LEDs on your machine will be used |
751 | to provide useful information about your current system status. | 756 | to provide useful information about your current system status. |
752 | 757 | ||
753 | If you are compiling a kernel for a NetWinder or EBSA-285, you will | 758 | If you are compiling a kernel for a NetWinder or EBSA-285, you will |
754 | be able to select which LEDs are active using the options below. If | 759 | be able to select which LEDs are active using the options below. If |
755 | you are compiling a kernel for the EBSA-110 or the LART however, the | 760 | you are compiling a kernel for the EBSA-110 or the LART however, the |
756 | red LED will simply flash regularly to indicate that the system is | 761 | red LED will simply flash regularly to indicate that the system is |
757 | still functional. It is safe to say Y here if you have a CATS | 762 | still functional. It is safe to say Y here if you have a CATS |
758 | system, but the driver will do nothing. | 763 | system, but the driver will do nothing. |
759 | 764 | ||
760 | config LEDS_TIMER | 765 | config LEDS_TIMER |
761 | bool "Timer LED" if (!ARCH_CDB89712 && !ARCH_OMAP) || \ | 766 | bool "Timer LED" if (!ARCH_CDB89712 && !ARCH_OMAP) || \ |
762 | OMAP_OSK_MISTRAL || MACH_OMAP_H2 \ | 767 | OMAP_OSK_MISTRAL || MACH_OMAP_H2 \ |
763 | || MACH_OMAP_PERSEUS2 | 768 | || MACH_OMAP_PERSEUS2 |
764 | depends on LEDS | 769 | depends on LEDS |
765 | depends on !GENERIC_CLOCKEVENTS | 770 | depends on !GENERIC_CLOCKEVENTS |
766 | default y if ARCH_EBSA110 | 771 | default y if ARCH_EBSA110 |
767 | help | 772 | help |
768 | If you say Y here, one of the system LEDs (the green one on the | 773 | If you say Y here, one of the system LEDs (the green one on the |
769 | NetWinder, the amber one on the EBSA285, or the red one on the LART) | 774 | NetWinder, the amber one on the EBSA285, or the red one on the LART) |
770 | will flash regularly to indicate that the system is still | 775 | will flash regularly to indicate that the system is still |
771 | operational. This is mainly useful to kernel hackers who are | 776 | operational. This is mainly useful to kernel hackers who are |
772 | debugging unstable kernels. | 777 | debugging unstable kernels. |
773 | 778 | ||
774 | The LART uses the same LED for both Timer LED and CPU usage LED | 779 | The LART uses the same LED for both Timer LED and CPU usage LED |
775 | functions. You may choose to use both, but the Timer LED function | 780 | functions. You may choose to use both, but the Timer LED function |
776 | will overrule the CPU usage LED. | 781 | will overrule the CPU usage LED. |
777 | 782 | ||
778 | config LEDS_CPU | 783 | config LEDS_CPU |
779 | bool "CPU usage LED" if (!ARCH_CDB89712 && !ARCH_EBSA110 && \ | 784 | bool "CPU usage LED" if (!ARCH_CDB89712 && !ARCH_EBSA110 && \ |
780 | !ARCH_OMAP) \ | 785 | !ARCH_OMAP) \ |
781 | || OMAP_OSK_MISTRAL || MACH_OMAP_H2 \ | 786 | || OMAP_OSK_MISTRAL || MACH_OMAP_H2 \ |
782 | || MACH_OMAP_PERSEUS2 | 787 | || MACH_OMAP_PERSEUS2 |
783 | depends on LEDS | 788 | depends on LEDS |
784 | help | 789 | help |
785 | If you say Y here, the red LED will be used to give a good real | 790 | If you say Y here, the red LED will be used to give a good real |
786 | time indication of CPU usage, by lighting whenever the idle task | 791 | time indication of CPU usage, by lighting whenever the idle task |
787 | is not currently executing. | 792 | is not currently executing. |
788 | 793 | ||
789 | The LART uses the same LED for both Timer LED and CPU usage LED | 794 | The LART uses the same LED for both Timer LED and CPU usage LED |
790 | functions. You may choose to use both, but the Timer LED function | 795 | functions. You may choose to use both, but the Timer LED function |
791 | will overrule the CPU usage LED. | 796 | will overrule the CPU usage LED. |
792 | 797 | ||
793 | config ALIGNMENT_TRAP | 798 | config ALIGNMENT_TRAP |
794 | bool | 799 | bool |
795 | depends on CPU_CP15_MMU | 800 | depends on CPU_CP15_MMU |
796 | default y if !ARCH_EBSA110 | 801 | default y if !ARCH_EBSA110 |
797 | help | 802 | help |
798 | ARM processors cannot fetch/store information which is not | 803 | ARM processors cannot fetch/store information which is not |
799 | naturally aligned on the bus, i.e., a 4 byte fetch must start at an | 804 | naturally aligned on the bus, i.e., a 4 byte fetch must start at an |
800 | address divisible by 4. On 32-bit ARM processors, these non-aligned | 805 | address divisible by 4. On 32-bit ARM processors, these non-aligned |
801 | fetch/store instructions will be emulated in software if you say | 806 | fetch/store instructions will be emulated in software if you say |
802 | here, which has a severe performance impact. This is necessary for | 807 | here, which has a severe performance impact. This is necessary for |
803 | correct operation of some network protocols. With an IP-only | 808 | correct operation of some network protocols. With an IP-only |
804 | configuration it is safe to say N, otherwise say Y. | 809 | configuration it is safe to say N, otherwise say Y. |
805 | 810 | ||
806 | endmenu | 811 | endmenu |
807 | 812 | ||
808 | menu "Boot options" | 813 | menu "Boot options" |
809 | 814 | ||
810 | # Compressed boot loader in ROM. Yes, we really want to ask about | 815 | # Compressed boot loader in ROM. Yes, we really want to ask about |
811 | # TEXT and BSS so we preserve their values in the config files. | 816 | # TEXT and BSS so we preserve their values in the config files. |
812 | config ZBOOT_ROM_TEXT | 817 | config ZBOOT_ROM_TEXT |
813 | hex "Compressed ROM boot loader base address" | 818 | hex "Compressed ROM boot loader base address" |
814 | default "0" | 819 | default "0" |
815 | help | 820 | help |
816 | The physical address at which the ROM-able zImage is to be | 821 | The physical address at which the ROM-able zImage is to be |
817 | placed in the target. Platforms which normally make use of | 822 | placed in the target. Platforms which normally make use of |
818 | ROM-able zImage formats normally set this to a suitable | 823 | ROM-able zImage formats normally set this to a suitable |
819 | value in their defconfig file. | 824 | value in their defconfig file. |
820 | 825 | ||
821 | If ZBOOT_ROM is not enabled, this has no effect. | 826 | If ZBOOT_ROM is not enabled, this has no effect. |
822 | 827 | ||
823 | config ZBOOT_ROM_BSS | 828 | config ZBOOT_ROM_BSS |
824 | hex "Compressed ROM boot loader BSS address" | 829 | hex "Compressed ROM boot loader BSS address" |
825 | default "0" | 830 | default "0" |
826 | help | 831 | help |
827 | The base address of an area of read/write memory in the target | 832 | The base address of an area of read/write memory in the target |
828 | for the ROM-able zImage which must be available while the | 833 | for the ROM-able zImage which must be available while the |
829 | decompressor is running. It must be large enough to hold the | 834 | decompressor is running. It must be large enough to hold the |
830 | entire decompressed kernel plus an additional 128 KiB. | 835 | entire decompressed kernel plus an additional 128 KiB. |
831 | Platforms which normally make use of ROM-able zImage formats | 836 | Platforms which normally make use of ROM-able zImage formats |
832 | normally set this to a suitable value in their defconfig file. | 837 | normally set this to a suitable value in their defconfig file. |
833 | 838 | ||
834 | If ZBOOT_ROM is not enabled, this has no effect. | 839 | If ZBOOT_ROM is not enabled, this has no effect. |
835 | 840 | ||
836 | config ZBOOT_ROM | 841 | config ZBOOT_ROM |
837 | bool "Compressed boot loader in ROM/flash" | 842 | bool "Compressed boot loader in ROM/flash" |
838 | depends on ZBOOT_ROM_TEXT != ZBOOT_ROM_BSS | 843 | depends on ZBOOT_ROM_TEXT != ZBOOT_ROM_BSS |
839 | help | 844 | help |
840 | Say Y here if you intend to execute your compressed kernel image | 845 | Say Y here if you intend to execute your compressed kernel image |
841 | (zImage) directly from ROM or flash. If unsure, say N. | 846 | (zImage) directly from ROM or flash. If unsure, say N. |
842 | 847 | ||
843 | config CMDLINE | 848 | config CMDLINE |
844 | string "Default kernel command string" | 849 | string "Default kernel command string" |
845 | default "" | 850 | default "" |
846 | help | 851 | help |
847 | On some architectures (EBSA110 and CATS), there is currently no way | 852 | On some architectures (EBSA110 and CATS), there is currently no way |
848 | for the boot loader to pass arguments to the kernel. For these | 853 | for the boot loader to pass arguments to the kernel. For these |
849 | architectures, you should supply some command-line options at build | 854 | architectures, you should supply some command-line options at build |
850 | time by entering them here. As a minimum, you should specify the | 855 | time by entering them here. As a minimum, you should specify the |
851 | memory size and the root device (e.g., mem=64M root=/dev/nfs). | 856 | memory size and the root device (e.g., mem=64M root=/dev/nfs). |
852 | 857 | ||
853 | config XIP_KERNEL | 858 | config XIP_KERNEL |
854 | bool "Kernel Execute-In-Place from ROM" | 859 | bool "Kernel Execute-In-Place from ROM" |
855 | depends on !ZBOOT_ROM | 860 | depends on !ZBOOT_ROM |
856 | help | 861 | help |
857 | Execute-In-Place allows the kernel to run from non-volatile storage | 862 | Execute-In-Place allows the kernel to run from non-volatile storage |
858 | directly addressable by the CPU, such as NOR flash. This saves RAM | 863 | directly addressable by the CPU, such as NOR flash. This saves RAM |
859 | space since the text section of the kernel is not loaded from flash | 864 | space since the text section of the kernel is not loaded from flash |
860 | to RAM. Read-write sections, such as the data section and stack, | 865 | to RAM. Read-write sections, such as the data section and stack, |
861 | are still copied to RAM. The XIP kernel is not compressed since | 866 | are still copied to RAM. The XIP kernel is not compressed since |
862 | it has to run directly from flash, so it will take more space to | 867 | it has to run directly from flash, so it will take more space to |
863 | store it. The flash address used to link the kernel object files, | 868 | store it. The flash address used to link the kernel object files, |
864 | and for storing it, is configuration dependent. Therefore, if you | 869 | and for storing it, is configuration dependent. Therefore, if you |
865 | say Y here, you must know the proper physical address at which to | 870 | say Y here, you must know the proper physical address at which to |
866 | store the kernel image, depending on your own flash memory usage. | 871 | store the kernel image, depending on your own flash memory usage. |
867 | 872 | ||
868 | Also note that the make target becomes "make xipImage" rather than | 873 | Also note that the make target becomes "make xipImage" rather than |
869 | "make zImage" or "make Image". The final kernel binary to put in | 874 | "make zImage" or "make Image". The final kernel binary to put in |
870 | ROM memory will be arch/arm/boot/xipImage. | 875 | ROM memory will be arch/arm/boot/xipImage. |
871 | 876 | ||
872 | If unsure, say N. | 877 | If unsure, say N. |
873 | 878 | ||
874 | config XIP_PHYS_ADDR | 879 | config XIP_PHYS_ADDR |
875 | hex "XIP Kernel Physical Location" | 880 | hex "XIP Kernel Physical Location" |
876 | depends on XIP_KERNEL | 881 | depends on XIP_KERNEL |
877 | default "0x00080000" | 882 | default "0x00080000" |
878 | help | 883 | help |
879 | This is the physical address in your flash memory the kernel will | 884 | This is the physical address in your flash memory the kernel will |
880 | be linked for and stored to. This address is dependent on your | 885 | be linked for and stored to. This address is dependent on your |
881 | own flash usage. | 886 | own flash usage. |
882 | 887 | ||
883 | config KEXEC | 888 | config KEXEC |
884 | bool "Kexec system call (EXPERIMENTAL)" | 889 | bool "Kexec system call (EXPERIMENTAL)" |
885 | depends on EXPERIMENTAL | 890 | depends on EXPERIMENTAL |
886 | help | 891 | help |
887 | kexec is a system call that implements the ability to shut down your | 892 | kexec is a system call that implements the ability to shut down your |
888 | current kernel, and to start another kernel. It is like a reboot | 893 | current kernel, and to start another kernel. It is like a reboot |
889 | but it is independent of the system firmware. And like a reboot | 894 | but it is independent of the system firmware. And like a reboot |
890 | you can start any kernel with it, not just Linux. | 895 | you can start any kernel with it, not just Linux. |
891 | 896 | ||
892 | It is an ongoing process to be certain the hardware in a machine | 897 | It is an ongoing process to be certain the hardware in a machine |
893 | is properly shut down, so do not be surprised if this code does not | 898 | is properly shut down, so do not be surprised if this code does not |
894 | initially work for you. It may help to enable device hotplugging | 899 | initially work for you. It may help to enable device hotplugging |
895 | support. | 900 | support. |
896 | 901 | ||
897 | endmenu | 902 | endmenu |
898 | 903 | ||
899 | if (ARCH_SA1100 || ARCH_INTEGRATOR || ARCH_OMAP || ARCH_IMX || ARCH_PXA) | 904 | if (ARCH_SA1100 || ARCH_INTEGRATOR || ARCH_OMAP || ARCH_IMX || ARCH_PXA) |
900 | 905 | ||
901 | menu "CPU Frequency scaling" | 906 | menu "CPU Frequency scaling" |
902 | 907 | ||
903 | source "drivers/cpufreq/Kconfig" | 908 | source "drivers/cpufreq/Kconfig" |
904 | 909 | ||
905 | config CPU_FREQ_SA1100 | 910 | config CPU_FREQ_SA1100 |
906 | bool | 911 | bool |
907 | depends on CPU_FREQ && (SA1100_H3100 || SA1100_H3600 || SA1100_H3800 || SA1100_LART || SA1100_PLEB || SA1100_BADGE4 || SA1100_HACKKIT) | 912 | depends on CPU_FREQ && (SA1100_H3100 || SA1100_H3600 || SA1100_H3800 || SA1100_LART || SA1100_PLEB || SA1100_BADGE4 || SA1100_HACKKIT) |
908 | default y | 913 | default y |
909 | 914 | ||
910 | config CPU_FREQ_SA1110 | 915 | config CPU_FREQ_SA1110 |
911 | bool | 916 | bool |
912 | depends on CPU_FREQ && (SA1100_ASSABET || SA1100_CERF || SA1100_PT_SYSTEM3) | 917 | depends on CPU_FREQ && (SA1100_ASSABET || SA1100_CERF || SA1100_PT_SYSTEM3) |
913 | default y | 918 | default y |
914 | 919 | ||
915 | config CPU_FREQ_INTEGRATOR | 920 | config CPU_FREQ_INTEGRATOR |
916 | tristate "CPUfreq driver for ARM Integrator CPUs" | 921 | tristate "CPUfreq driver for ARM Integrator CPUs" |
917 | depends on ARCH_INTEGRATOR && CPU_FREQ | 922 | depends on ARCH_INTEGRATOR && CPU_FREQ |
918 | default y | 923 | default y |
919 | help | 924 | help |
920 | This enables the CPUfreq driver for ARM Integrator CPUs. | 925 | This enables the CPUfreq driver for ARM Integrator CPUs. |
921 | 926 | ||
922 | For details, take a look at <file:Documentation/cpu-freq>. | 927 | For details, take a look at <file:Documentation/cpu-freq>. |
923 | 928 | ||
924 | If in doubt, say Y. | 929 | If in doubt, say Y. |
925 | 930 | ||
926 | config CPU_FREQ_IMX | 931 | config CPU_FREQ_IMX |
927 | tristate "CPUfreq driver for i.MX CPUs" | 932 | tristate "CPUfreq driver for i.MX CPUs" |
928 | depends on ARCH_IMX && CPU_FREQ | 933 | depends on ARCH_IMX && CPU_FREQ |
929 | default n | 934 | default n |
930 | help | 935 | help |
931 | This enables the CPUfreq driver for i.MX CPUs. | 936 | This enables the CPUfreq driver for i.MX CPUs. |
932 | 937 | ||
933 | If in doubt, say N. | 938 | If in doubt, say N. |
934 | 939 | ||
935 | config CPU_FREQ_PXA | 940 | config CPU_FREQ_PXA |
936 | bool | 941 | bool |
937 | depends on CPU_FREQ && ARCH_PXA && PXA25x | 942 | depends on CPU_FREQ && ARCH_PXA && PXA25x |
938 | default y | 943 | default y |
939 | select CPU_FREQ_DEFAULT_GOV_USERSPACE | 944 | select CPU_FREQ_DEFAULT_GOV_USERSPACE |
940 | 945 | ||
941 | endmenu | 946 | endmenu |
942 | 947 | ||
943 | endif | 948 | endif |
944 | 949 | ||
945 | menu "Floating point emulation" | 950 | menu "Floating point emulation" |
946 | 951 | ||
947 | comment "At least one emulation must be selected" | 952 | comment "At least one emulation must be selected" |
948 | 953 | ||
949 | config FPE_NWFPE | 954 | config FPE_NWFPE |
950 | bool "NWFPE math emulation" | 955 | bool "NWFPE math emulation" |
951 | depends on !AEABI || OABI_COMPAT | 956 | depends on !AEABI || OABI_COMPAT |
952 | ---help--- | 957 | ---help--- |
953 | Say Y to include the NWFPE floating point emulator in the kernel. | 958 | Say Y to include the NWFPE floating point emulator in the kernel. |
954 | This is necessary to run most binaries. Linux does not currently | 959 | This is necessary to run most binaries. Linux does not currently |
955 | support floating point hardware so you need to say Y here even if | 960 | support floating point hardware so you need to say Y here even if |
956 | your machine has an FPA or floating point co-processor podule. | 961 | your machine has an FPA or floating point co-processor podule. |
957 | 962 | ||
958 | You may say N here if you are going to load the Acorn FPEmulator | 963 | You may say N here if you are going to load the Acorn FPEmulator |
959 | early in the bootup. | 964 | early in the bootup. |
960 | 965 | ||
961 | config FPE_NWFPE_XP | 966 | config FPE_NWFPE_XP |
962 | bool "Support extended precision" | 967 | bool "Support extended precision" |
963 | depends on FPE_NWFPE | 968 | depends on FPE_NWFPE |
964 | help | 969 | help |
965 | Say Y to include 80-bit support in the kernel floating-point | 970 | Say Y to include 80-bit support in the kernel floating-point |
966 | emulator. Otherwise, only 32 and 64-bit support is compiled in. | 971 | emulator. Otherwise, only 32 and 64-bit support is compiled in. |
967 | Note that gcc does not generate 80-bit operations by default, | 972 | Note that gcc does not generate 80-bit operations by default, |
968 | so in most cases this option only enlarges the size of the | 973 | so in most cases this option only enlarges the size of the |
969 | floating point emulator without any good reason. | 974 | floating point emulator without any good reason. |
970 | 975 | ||
971 | You almost surely want to say N here. | 976 | You almost surely want to say N here. |
972 | 977 | ||
973 | config FPE_FASTFPE | 978 | config FPE_FASTFPE |
974 | bool "FastFPE math emulation (EXPERIMENTAL)" | 979 | bool "FastFPE math emulation (EXPERIMENTAL)" |
975 | depends on (!AEABI || OABI_COMPAT) && !CPU_32v3 && EXPERIMENTAL | 980 | depends on (!AEABI || OABI_COMPAT) && !CPU_32v3 && EXPERIMENTAL |
976 | ---help--- | 981 | ---help--- |
977 | Say Y here to include the FAST floating point emulator in the kernel. | 982 | Say Y here to include the FAST floating point emulator in the kernel. |
978 | This is an experimental much faster emulator which now also has full | 983 | This is an experimental much faster emulator which now also has full |
979 | precision for the mantissa. It does not support any exceptions. | 984 | precision for the mantissa. It does not support any exceptions. |
980 | It is very simple, and approximately 3-6 times faster than NWFPE. | 985 | It is very simple, and approximately 3-6 times faster than NWFPE. |
981 | 986 | ||
982 | It should be sufficient for most programs. It may not be suitable | 987 | It should be sufficient for most programs. It may not be suitable |
983 | for scientific calculations, but you have to check this for yourself. | 988 | for scientific calculations, but you have to check this for yourself. |
984 | If you do not feel you need faster FP emulation, you should | 989 | If you do not feel you need faster FP emulation, you should |
985 | choose NWFPE instead. | 990 | choose NWFPE instead. |
986 | 991 | ||
987 | config VFP | 992 | config VFP |
988 | bool "VFP-format floating point maths" | 993 | bool "VFP-format floating point maths" |
989 | depends on CPU_V6 || CPU_ARM926T || CPU_V7 || CPU_FEROCEON | 994 | depends on CPU_V6 || CPU_ARM926T || CPU_V7 || CPU_FEROCEON |
990 | help | 995 | help |
991 | Say Y to include VFP support code in the kernel. This is needed | 996 | Say Y to include VFP support code in the kernel. This is needed |
992 | if your hardware includes a VFP unit. | 997 | if your hardware includes a VFP unit. |
993 | 998 | ||
994 | Please see <file:Documentation/arm/VFP/release-notes.txt> for | 999 | Please see <file:Documentation/arm/VFP/release-notes.txt> for |
995 | release notes and additional status information. | 1000 | release notes and additional status information. |
996 | 1001 | ||
997 | Say N if your target does not have VFP hardware. | 1002 | Say N if your target does not have VFP hardware. |
998 | 1003 | ||
999 | config VFPv3 | 1004 | config VFPv3 |
1000 | bool | 1005 | bool |
1001 | depends on VFP | 1006 | depends on VFP |
1002 | default y if CPU_V7 | 1007 | default y if CPU_V7 |
1003 | 1008 | ||
1004 | config NEON | 1009 | config NEON |
1005 | bool "Advanced SIMD (NEON) Extension support" | 1010 | bool "Advanced SIMD (NEON) Extension support" |
1006 | depends on VFPv3 && CPU_V7 | 1011 | depends on VFPv3 && CPU_V7 |
1007 | help | 1012 | help |
1008 | Say Y to include support code for NEON, the ARMv7 Advanced SIMD | 1013 | Say Y to include support code for NEON, the ARMv7 Advanced SIMD |
1009 | Extension. | 1014 | Extension. |
1010 | 1015 | ||
1011 | endmenu | 1016 | endmenu |
1012 | 1017 | ||
1013 | menu "Userspace binary formats" | 1018 | menu "Userspace binary formats" |
1014 | 1019 | ||
1015 | source "fs/Kconfig.binfmt" | 1020 | source "fs/Kconfig.binfmt" |
1016 | 1021 | ||
1017 | config ARTHUR | 1022 | config ARTHUR |
1018 | tristate "RISC OS personality" | 1023 | tristate "RISC OS personality" |
1019 | depends on !AEABI | 1024 | depends on !AEABI |
1020 | help | 1025 | help |
1021 | Say Y here to include the kernel code necessary if you want to run | 1026 | Say Y here to include the kernel code necessary if you want to run |
1022 | Acorn RISC OS/Arthur binaries under Linux. This code is still very | 1027 | Acorn RISC OS/Arthur binaries under Linux. This code is still very |
1023 | experimental; if this sounds frightening, say N and sleep in peace. | 1028 | experimental; if this sounds frightening, say N and sleep in peace. |
1024 | You can also say M here to compile this support as a module (which | 1029 | You can also say M here to compile this support as a module (which |
1025 | will be called arthur). | 1030 | will be called arthur). |
1026 | 1031 | ||
1027 | endmenu | 1032 | endmenu |
1028 | 1033 | ||
1029 | menu "Power management options" | 1034 | menu "Power management options" |
1030 | 1035 | ||
1031 | source "kernel/power/Kconfig" | 1036 | source "kernel/power/Kconfig" |
1032 | 1037 | ||
1033 | endmenu | 1038 | endmenu |
1034 | 1039 | ||
1035 | source "net/Kconfig" | 1040 | source "net/Kconfig" |
1036 | 1041 | ||
1037 | menu "Device Drivers" | 1042 | menu "Device Drivers" |
1038 | 1043 | ||
1039 | source "drivers/base/Kconfig" | 1044 | source "drivers/base/Kconfig" |
1040 | 1045 | ||
1041 | source "drivers/connector/Kconfig" | 1046 | source "drivers/connector/Kconfig" |
1042 | 1047 | ||
1043 | if ALIGNMENT_TRAP || !CPU_CP15_MMU | 1048 | if ALIGNMENT_TRAP || !CPU_CP15_MMU |
1044 | source "drivers/mtd/Kconfig" | 1049 | source "drivers/mtd/Kconfig" |
1045 | endif | 1050 | endif |
1046 | 1051 | ||
1047 | source "drivers/parport/Kconfig" | 1052 | source "drivers/parport/Kconfig" |
1048 | 1053 | ||
1049 | source "drivers/pnp/Kconfig" | 1054 | source "drivers/pnp/Kconfig" |
1050 | 1055 | ||
1051 | source "drivers/block/Kconfig" | 1056 | source "drivers/block/Kconfig" |
1052 | 1057 | ||
1053 | # misc before ide - BLK_DEV_SGIIOC4 depends on SGI_IOC4 | 1058 | # misc before ide - BLK_DEV_SGIIOC4 depends on SGI_IOC4 |
1054 | 1059 | ||
1055 | source "drivers/misc/Kconfig" | 1060 | source "drivers/misc/Kconfig" |
1056 | 1061 | ||
1057 | if PCMCIA || ARCH_CLPS7500 || ARCH_IOP32X || ARCH_IOP33X || ARCH_IXP4XX \ | 1062 | if PCMCIA || ARCH_CLPS7500 || ARCH_IOP32X || ARCH_IOP33X || ARCH_IXP4XX \ |
1058 | || ARCH_L7200 || ARCH_LH7A40X || ARCH_PXA || ARCH_RPC \ | 1063 | || ARCH_L7200 || ARCH_LH7A40X || ARCH_PXA || ARCH_RPC \ |
1059 | || ARCH_S3C2410 || ARCH_SA1100 || ARCH_SHARK || FOOTBRIDGE \ | 1064 | || ARCH_S3C2410 || ARCH_SA1100 || ARCH_SHARK || FOOTBRIDGE \ |
1060 | || ARCH_IXP23XX | 1065 | || ARCH_IXP23XX |
1061 | source "drivers/ide/Kconfig" | 1066 | source "drivers/ide/Kconfig" |
1062 | endif | 1067 | endif |
1063 | 1068 | ||
1064 | source "drivers/scsi/Kconfig" | 1069 | source "drivers/scsi/Kconfig" |
1065 | 1070 | ||
1066 | source "drivers/ata/Kconfig" | 1071 | source "drivers/ata/Kconfig" |
1067 | 1072 | ||
1068 | source "drivers/md/Kconfig" | 1073 | source "drivers/md/Kconfig" |
1069 | 1074 | ||
1070 | source "drivers/message/fusion/Kconfig" | 1075 | source "drivers/message/fusion/Kconfig" |
1071 | 1076 | ||
1072 | source "drivers/ieee1394/Kconfig" | 1077 | source "drivers/ieee1394/Kconfig" |
1073 | 1078 | ||
1074 | source "drivers/message/i2o/Kconfig" | 1079 | source "drivers/message/i2o/Kconfig" |
1075 | 1080 | ||
1076 | source "drivers/net/Kconfig" | 1081 | source "drivers/net/Kconfig" |
1077 | 1082 | ||
1078 | source "drivers/isdn/Kconfig" | 1083 | source "drivers/isdn/Kconfig" |
1079 | 1084 | ||
1080 | # input before char - char/joystick depends on it. As does USB. | 1085 | # input before char - char/joystick depends on it. As does USB. |
1081 | 1086 | ||
1082 | source "drivers/input/Kconfig" | 1087 | source "drivers/input/Kconfig" |
1083 | 1088 | ||
1084 | source "drivers/char/Kconfig" | 1089 | source "drivers/char/Kconfig" |
1085 | 1090 | ||
1086 | source "drivers/i2c/Kconfig" | 1091 | source "drivers/i2c/Kconfig" |
1087 | 1092 | ||
1088 | source "drivers/spi/Kconfig" | 1093 | source "drivers/spi/Kconfig" |
1089 | 1094 | ||
1090 | source "drivers/w1/Kconfig" | 1095 | source "drivers/w1/Kconfig" |
1091 | 1096 | ||
1092 | source "drivers/power/Kconfig" | 1097 | source "drivers/power/Kconfig" |
1093 | 1098 | ||
1094 | source "drivers/hwmon/Kconfig" | 1099 | source "drivers/hwmon/Kconfig" |
1095 | 1100 | ||
1096 | source "drivers/watchdog/Kconfig" | 1101 | source "drivers/watchdog/Kconfig" |
1097 | 1102 | ||
1098 | source "drivers/ssb/Kconfig" | 1103 | source "drivers/ssb/Kconfig" |
1099 | 1104 | ||
1100 | #source "drivers/l3/Kconfig" | 1105 | #source "drivers/l3/Kconfig" |
1101 | 1106 | ||
1102 | source "drivers/mfd/Kconfig" | 1107 | source "drivers/mfd/Kconfig" |
1103 | 1108 | ||
1104 | source "drivers/media/Kconfig" | 1109 | source "drivers/media/Kconfig" |
1105 | 1110 | ||
1106 | source "drivers/video/Kconfig" | 1111 | source "drivers/video/Kconfig" |
1107 | 1112 | ||
1108 | source "sound/Kconfig" | 1113 | source "sound/Kconfig" |
1109 | 1114 | ||
1110 | source "drivers/hid/Kconfig" | 1115 | source "drivers/hid/Kconfig" |
1111 | 1116 | ||
1112 | source "drivers/usb/Kconfig" | 1117 | source "drivers/usb/Kconfig" |
1113 | 1118 | ||
1114 | source "drivers/mmc/Kconfig" | 1119 | source "drivers/mmc/Kconfig" |
1115 | 1120 | ||
1116 | source "drivers/leds/Kconfig" | 1121 | source "drivers/leds/Kconfig" |
1117 | 1122 | ||
1118 | source "drivers/rtc/Kconfig" | 1123 | source "drivers/rtc/Kconfig" |
1119 | 1124 | ||
1120 | source "drivers/dma/Kconfig" | 1125 | source "drivers/dma/Kconfig" |
1121 | 1126 | ||
1122 | source "drivers/dca/Kconfig" | 1127 | source "drivers/dca/Kconfig" |
1123 | 1128 | ||
1124 | endmenu | 1129 | endmenu |
1125 | 1130 | ||
1126 | source "fs/Kconfig" | 1131 | source "fs/Kconfig" |
1127 | 1132 | ||
1128 | source "arch/arm/Kconfig.instrumentation" | 1133 | source "arch/arm/Kconfig.instrumentation" |
1129 | 1134 | ||
1130 | source "arch/arm/Kconfig.debug" | 1135 | source "arch/arm/Kconfig.debug" |
1131 | 1136 | ||
1132 | source "security/Kconfig" | 1137 | source "security/Kconfig" |
1133 | 1138 | ||
1134 | source "crypto/Kconfig" | 1139 | source "crypto/Kconfig" |
1135 | 1140 | ||
1136 | source "lib/Kconfig" | 1141 | source "lib/Kconfig" |
1137 | 1142 |
arch/ia64/Kconfig
1 | # | 1 | # |
2 | # For a description of the syntax of this configuration file, | 2 | # For a description of the syntax of this configuration file, |
3 | # see Documentation/kbuild/kconfig-language.txt. | 3 | # see Documentation/kbuild/kconfig-language.txt. |
4 | # | 4 | # |
5 | 5 | ||
6 | mainmenu "IA-64 Linux Kernel Configuration" | 6 | mainmenu "IA-64 Linux Kernel Configuration" |
7 | 7 | ||
8 | source "init/Kconfig" | 8 | source "init/Kconfig" |
9 | 9 | ||
10 | menu "Processor type and features" | 10 | menu "Processor type and features" |
11 | 11 | ||
12 | config IA64 | 12 | config IA64 |
13 | bool | 13 | bool |
14 | select PCI if (!IA64_HP_SIM) | 14 | select PCI if (!IA64_HP_SIM) |
15 | select ACPI if (!IA64_HP_SIM) | 15 | select ACPI if (!IA64_HP_SIM) |
16 | select PM if (!IA64_HP_SIM) | 16 | select PM if (!IA64_HP_SIM) |
17 | select ARCH_SUPPORTS_MSI | 17 | select ARCH_SUPPORTS_MSI |
18 | default y | 18 | default y |
19 | help | 19 | help |
20 | The Itanium Processor Family is Intel's 64-bit successor to | 20 | The Itanium Processor Family is Intel's 64-bit successor to |
21 | the 32-bit X86 line. The IA-64 Linux project has a home | 21 | the 32-bit X86 line. The IA-64 Linux project has a home |
22 | page at <http://www.linuxia64.org/> and a mailing list at | 22 | page at <http://www.linuxia64.org/> and a mailing list at |
23 | <linux-ia64@vger.kernel.org>. | 23 | <linux-ia64@vger.kernel.org>. |
24 | 24 | ||
25 | config 64BIT | 25 | config 64BIT |
26 | bool | 26 | bool |
27 | select ATA_NONSTANDARD if ATA | 27 | select ATA_NONSTANDARD if ATA |
28 | default y | 28 | default y |
29 | 29 | ||
30 | config ZONE_DMA | 30 | config ZONE_DMA |
31 | def_bool y | 31 | def_bool y |
32 | depends on !IA64_SGI_SN2 | 32 | depends on !IA64_SGI_SN2 |
33 | 33 | ||
34 | config QUICKLIST | 34 | config QUICKLIST |
35 | bool | 35 | bool |
36 | default y | 36 | default y |
37 | 37 | ||
38 | config MMU | 38 | config MMU |
39 | bool | 39 | bool |
40 | default y | 40 | default y |
41 | 41 | ||
42 | config SWIOTLB | 42 | config SWIOTLB |
43 | bool | 43 | bool |
44 | 44 | ||
45 | config GENERIC_LOCKBREAK | ||
46 | bool | ||
47 | default y | ||
48 | depends on SMP && PREEMPT | ||
49 | |||
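
The hunk above is the whole per-architecture opt-in on ia64: GENERIC_LOCKBREAK is a silent bool (no prompt) that only becomes y on SMP && PREEMPT builds, and Kconfig exposes it to C code as the CONFIG_GENERIC_LOCKBREAK preprocessor symbol. What follows is a minimal standalone sketch of how a silent symbol like this is typically consumed from C; the demo_lock struct, the break_requested field, and demo_lock_needbreak are invented names for illustration and are not code from the kernel tree.

    /*
     * Illustrative sketch only: consuming a silent Kconfig bool from C.
     * Build once as-is and once with -DCONFIG_GENERIC_LOCKBREAK to see
     * the two variants the option selects between.
     */
    #include <stdio.h>

    struct demo_lock {
    	volatile int locked;              /* 0 = free, 1 = held */
    #ifdef CONFIG_GENERIC_LOCKBREAK
    	unsigned int break_requested;     /* extra field, only in this variant */
    #endif
    };

    /* Should the current holder drop the lock early? */
    static int demo_lock_needbreak(const struct demo_lock *l)
    {
    #ifdef CONFIG_GENERIC_LOCKBREAK
    	return l->break_requested != 0;   /* explicit flag set by waiters */
    #else
    	return l->locked != 0;            /* no flag: approximate via the lock word */
    #endif
    }

    int main(void)
    {
    	struct demo_lock l = { .locked = 1 };

    #ifdef CONFIG_GENERIC_LOCKBREAK
    	l.break_requested = 1;
    #endif
    	printf("sizeof(struct demo_lock) = %zu, needbreak = %d\n",
    	       sizeof(l), demo_lock_needbreak(&l));
    	return 0;
    }

Compiling the sketch both ways (for example, gcc demo.c versus gcc -DCONFIG_GENERIC_LOCKBREAK demo.c, file name hypothetical) makes the size and behaviour difference between the two configurations easy to see.
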
45 | config RWSEM_XCHGADD_ALGORITHM | 50 | config RWSEM_XCHGADD_ALGORITHM |
46 | bool | 51 | bool |
47 | default y | 52 | default y |
48 | 53 | ||
49 | config ARCH_HAS_ILOG2_U32 | 54 | config ARCH_HAS_ILOG2_U32 |
50 | bool | 55 | bool |
51 | default n | 56 | default n |
52 | 57 | ||
53 | config ARCH_HAS_ILOG2_U64 | 58 | config ARCH_HAS_ILOG2_U64 |
54 | bool | 59 | bool |
55 | default n | 60 | default n |
56 | 61 | ||
57 | config HUGETLB_PAGE_SIZE_VARIABLE | 62 | config HUGETLB_PAGE_SIZE_VARIABLE |
58 | bool | 63 | bool |
59 | depends on HUGETLB_PAGE | 64 | depends on HUGETLB_PAGE |
60 | default y | 65 | default y |
61 | 66 | ||
62 | config GENERIC_FIND_NEXT_BIT | 67 | config GENERIC_FIND_NEXT_BIT |
63 | bool | 68 | bool |
64 | default y | 69 | default y |
65 | 70 | ||
66 | config GENERIC_CALIBRATE_DELAY | 71 | config GENERIC_CALIBRATE_DELAY |
67 | bool | 72 | bool |
68 | default y | 73 | default y |
69 | 74 | ||
70 | config GENERIC_TIME | 75 | config GENERIC_TIME |
71 | bool | 76 | bool |
72 | default y | 77 | default y |
73 | 78 | ||
74 | config GENERIC_TIME_VSYSCALL | 79 | config GENERIC_TIME_VSYSCALL |
75 | bool | 80 | bool |
76 | default y | 81 | default y |
77 | 82 | ||
78 | config DMI | 83 | config DMI |
79 | bool | 84 | bool |
80 | default y | 85 | default y |
81 | 86 | ||
82 | config EFI | 87 | config EFI |
83 | bool | 88 | bool |
84 | default y | 89 | default y |
85 | 90 | ||
86 | config GENERIC_IOMAP | 91 | config GENERIC_IOMAP |
87 | bool | 92 | bool |
88 | default y | 93 | default y |
89 | 94 | ||
90 | config SCHED_NO_NO_OMIT_FRAME_POINTER | 95 | config SCHED_NO_NO_OMIT_FRAME_POINTER |
91 | bool | 96 | bool |
92 | default y | 97 | default y |
93 | 98 | ||
94 | config IA64_UNCACHED_ALLOCATOR | 99 | config IA64_UNCACHED_ALLOCATOR |
95 | bool | 100 | bool |
96 | select GENERIC_ALLOCATOR | 101 | select GENERIC_ALLOCATOR |
97 | 102 | ||
98 | config AUDIT_ARCH | 103 | config AUDIT_ARCH |
99 | bool | 104 | bool |
100 | default y | 105 | default y |
101 | 106 | ||
102 | choice | 107 | choice |
103 | prompt "System type" | 108 | prompt "System type" |
104 | default IA64_GENERIC | 109 | default IA64_GENERIC |
105 | 110 | ||
106 | config IA64_GENERIC | 111 | config IA64_GENERIC |
107 | bool "generic" | 112 | bool "generic" |
108 | select NUMA | 113 | select NUMA |
109 | select ACPI_NUMA | 114 | select ACPI_NUMA |
110 | select SWIOTLB | 115 | select SWIOTLB |
111 | help | 116 | help |
112 | This selects the system type of your hardware. A "generic" kernel | 117 | This selects the system type of your hardware. A "generic" kernel |
113 | will run on any supported IA-64 system. However, if you configure | 118 | will run on any supported IA-64 system. However, if you configure |
114 | a kernel for your specific system, it will be faster and smaller. | 119 | a kernel for your specific system, it will be faster and smaller. |
115 | 120 | ||
116 | generic For any supported IA-64 system | 121 | generic For any supported IA-64 system |
117 | DIG-compliant For DIG ("Developer's Interface Guide") compliant systems | 122 | DIG-compliant For DIG ("Developer's Interface Guide") compliant systems |
118 | HP-zx1/sx1000 For HP systems | 123 | HP-zx1/sx1000 For HP systems |
119 | HP-zx1/sx1000+swiotlb For HP systems with (broken) DMA-constrained devices. | 124 | HP-zx1/sx1000+swiotlb For HP systems with (broken) DMA-constrained devices. |
120 | SGI-SN2 For SGI Altix systems | 125 | SGI-SN2 For SGI Altix systems |
121 | Ski-simulator For the HP simulator <http://www.hpl.hp.com/research/linux/ski/> | 126 | Ski-simulator For the HP simulator <http://www.hpl.hp.com/research/linux/ski/> |
122 | 127 | ||
123 | If you don't know what to do, choose "generic". | 128 | If you don't know what to do, choose "generic". |
124 | 129 | ||
125 | config IA64_DIG | 130 | config IA64_DIG |
126 | bool "DIG-compliant" | 131 | bool "DIG-compliant" |
127 | select SWIOTLB | 132 | select SWIOTLB |
128 | 133 | ||
129 | config IA64_HP_ZX1 | 134 | config IA64_HP_ZX1 |
130 | bool "HP-zx1/sx1000" | 135 | bool "HP-zx1/sx1000" |
131 | help | 136 | help |
132 | Build a kernel that runs on HP zx1 and sx1000 systems. This adds | 137 | Build a kernel that runs on HP zx1 and sx1000 systems. This adds |
133 | support for the HP I/O MMU. | 138 | support for the HP I/O MMU. |
134 | 139 | ||
135 | config IA64_HP_ZX1_SWIOTLB | 140 | config IA64_HP_ZX1_SWIOTLB |
136 | bool "HP-zx1/sx1000 with software I/O TLB" | 141 | bool "HP-zx1/sx1000 with software I/O TLB" |
137 | select SWIOTLB | 142 | select SWIOTLB |
138 | help | 143 | help |
139 | Build a kernel that runs on HP zx1 and sx1000 systems even when they | 144 | Build a kernel that runs on HP zx1 and sx1000 systems even when they |
140 | have broken PCI devices which cannot DMA to full 32 bits. Apart | 145 | have broken PCI devices which cannot DMA to full 32 bits. Apart |
141 | from support for the HP I/O MMU, this includes support for the software | 146 | from support for the HP I/O MMU, this includes support for the software |
142 | I/O TLB, which allows supporting the broken devices at the expense of | 147 | I/O TLB, which allows supporting the broken devices at the expense of |
143 | wasting some kernel memory (about 2MB by default). | 148 | wasting some kernel memory (about 2MB by default). |
144 | 149 | ||
145 | config IA64_SGI_SN2 | 150 | config IA64_SGI_SN2 |
146 | bool "SGI-SN2" | 151 | bool "SGI-SN2" |
147 | help | 152 | help |
148 | Selecting this option will optimize the kernel for use on sn2 based | 153 | Selecting this option will optimize the kernel for use on sn2 based |
149 | systems, but the resulting kernel binary will not run on other | 154 | systems, but the resulting kernel binary will not run on other |
150 | types of ia64 systems. If you have an SGI Altix system, it's safe | 155 | types of ia64 systems. If you have an SGI Altix system, it's safe |
151 | to select this option. If in doubt, select ia64 generic support | 156 | to select this option. If in doubt, select ia64 generic support |
152 | instead. | 157 | instead. |
153 | 158 | ||
154 | config IA64_HP_SIM | 159 | config IA64_HP_SIM |
155 | bool "Ski-simulator" | 160 | bool "Ski-simulator" |
156 | select SWIOTLB | 161 | select SWIOTLB |
157 | 162 | ||
158 | endchoice | 163 | endchoice |
159 | 164 | ||
160 | choice | 165 | choice |
161 | prompt "Processor type" | 166 | prompt "Processor type" |
162 | default ITANIUM | 167 | default ITANIUM |
163 | 168 | ||
164 | config ITANIUM | 169 | config ITANIUM |
165 | bool "Itanium" | 170 | bool "Itanium" |
166 | help | 171 | help |
167 | Select your IA-64 processor type. The default is Itanium. | 172 | Select your IA-64 processor type. The default is Itanium. |
168 | This choice is safe for all IA-64 systems, but may not perform | 173 | This choice is safe for all IA-64 systems, but may not perform |
169 | optimally on systems with, say, Itanium 2 or newer processors. | 174 | optimally on systems with, say, Itanium 2 or newer processors. |
170 | 175 | ||
171 | config MCKINLEY | 176 | config MCKINLEY |
172 | bool "Itanium 2" | 177 | bool "Itanium 2" |
173 | help | 178 | help |
174 | Select this to configure for an Itanium 2 (McKinley) processor. | 179 | Select this to configure for an Itanium 2 (McKinley) processor. |
175 | 180 | ||
176 | endchoice | 181 | endchoice |
177 | 182 | ||
178 | choice | 183 | choice |
179 | prompt "Kernel page size" | 184 | prompt "Kernel page size" |
180 | default IA64_PAGE_SIZE_16KB | 185 | default IA64_PAGE_SIZE_16KB |
181 | 186 | ||
182 | config IA64_PAGE_SIZE_4KB | 187 | config IA64_PAGE_SIZE_4KB |
183 | bool "4KB" | 188 | bool "4KB" |
184 | help | 189 | help |
185 | This lets you select the page size of the kernel. For best IA-64 | 190 | This lets you select the page size of the kernel. For best IA-64 |
186 | performance, a page size of 8KB or 16KB is recommended. For best | 191 | performance, a page size of 8KB or 16KB is recommended. For best |
187 | IA-32 compatibility, a page size of 4KB should be selected (the vast | 192 | IA-32 compatibility, a page size of 4KB should be selected (the vast |
188 | majority of IA-32 binaries work perfectly fine with a larger page | 193 | majority of IA-32 binaries work perfectly fine with a larger page |
189 | size). For Itanium 2 or newer systems, a page size of 64KB can also | 194 | size). For Itanium 2 or newer systems, a page size of 64KB can also |
190 | be selected. | 195 | be selected. |
191 | 196 | ||
192 | 4KB For best IA-32 compatibility | 197 | 4KB For best IA-32 compatibility |
193 | 8KB For best IA-64 performance | 198 | 8KB For best IA-64 performance |
194 | 16KB For best IA-64 performance | 199 | 16KB For best IA-64 performance |
195 | 64KB Requires Itanium 2 or newer processor. | 200 | 64KB Requires Itanium 2 or newer processor. |
196 | 201 | ||
197 | If you don't know what to do, choose 16KB. | 202 | If you don't know what to do, choose 16KB. |
198 | 203 | ||
199 | config IA64_PAGE_SIZE_8KB | 204 | config IA64_PAGE_SIZE_8KB |
200 | bool "8KB" | 205 | bool "8KB" |
201 | 206 | ||
202 | config IA64_PAGE_SIZE_16KB | 207 | config IA64_PAGE_SIZE_16KB |
203 | bool "16KB" | 208 | bool "16KB" |
204 | 209 | ||
205 | config IA64_PAGE_SIZE_64KB | 210 | config IA64_PAGE_SIZE_64KB |
206 | depends on !ITANIUM | 211 | depends on !ITANIUM |
207 | bool "64KB" | 212 | bool "64KB" |
208 | 213 | ||
209 | endchoice | 214 | endchoice |
210 | 215 | ||
211 | choice | 216 | choice |
212 | prompt "Page Table Levels" | 217 | prompt "Page Table Levels" |
213 | default PGTABLE_3 | 218 | default PGTABLE_3 |
214 | 219 | ||
215 | config PGTABLE_3 | 220 | config PGTABLE_3 |
216 | bool "3 Levels" | 221 | bool "3 Levels" |
217 | 222 | ||
218 | config PGTABLE_4 | 223 | config PGTABLE_4 |
219 | depends on !IA64_PAGE_SIZE_64KB | 224 | depends on !IA64_PAGE_SIZE_64KB |
220 | bool "4 Levels" | 225 | bool "4 Levels" |
221 | 226 | ||
222 | endchoice | 227 | endchoice |
223 | 228 | ||
224 | source kernel/Kconfig.hz | 229 | source kernel/Kconfig.hz |
225 | 230 | ||
226 | config IA64_BRL_EMU | 231 | config IA64_BRL_EMU |
227 | bool | 232 | bool |
228 | depends on ITANIUM | 233 | depends on ITANIUM |
229 | default y | 234 | default y |
230 | 235 | ||
231 | # align cache-sensitive data to 128 bytes | 236 | # align cache-sensitive data to 128 bytes |
232 | config IA64_L1_CACHE_SHIFT | 237 | config IA64_L1_CACHE_SHIFT |
233 | int | 238 | int |
234 | default "7" if MCKINLEY | 239 | default "7" if MCKINLEY |
235 | default "6" if ITANIUM | 240 | default "6" if ITANIUM |
236 | 241 | ||
237 | config IA64_CYCLONE | 242 | config IA64_CYCLONE |
238 | bool "Cyclone (EXA) Time Source support" | 243 | bool "Cyclone (EXA) Time Source support" |
239 | help | 244 | help |
240 | Say Y here to enable support for IBM EXA Cyclone time source. | 245 | Say Y here to enable support for IBM EXA Cyclone time source. |
241 | If you're unsure, answer N. | 246 | If you're unsure, answer N. |
242 | 247 | ||
243 | config IOSAPIC | 248 | config IOSAPIC |
244 | bool | 249 | bool |
245 | depends on !IA64_HP_SIM | 250 | depends on !IA64_HP_SIM |
246 | default y | 251 | default y |
247 | 252 | ||
248 | config IA64_SGI_SN_XP | 253 | config IA64_SGI_SN_XP |
249 | tristate "Support communication between SGI SSIs" | 254 | tristate "Support communication between SGI SSIs" |
250 | depends on IA64_GENERIC || IA64_SGI_SN2 | 255 | depends on IA64_GENERIC || IA64_SGI_SN2 |
251 | select IA64_UNCACHED_ALLOCATOR | 256 | select IA64_UNCACHED_ALLOCATOR |
252 | help | 257 | help |
253 | An SGI machine can be divided into multiple Single System | 258 | An SGI machine can be divided into multiple Single System |
254 | Images which act independently of each other and have | 259 | Images which act independently of each other and have |
255 | hardware-based memory protection from the others. Enabling | 260 | hardware-based memory protection from the others. Enabling |
256 | this feature will allow for direct communication between SSIs | 261 | this feature will allow for direct communication between SSIs |
257 | based on a network adapter and DMA messaging. | 262 | based on a network adapter and DMA messaging. |
258 | 263 | ||
259 | config FORCE_MAX_ZONEORDER | 264 | config FORCE_MAX_ZONEORDER |
260 | int "MAX_ORDER (11 - 17)" if !HUGETLB_PAGE | 265 | int "MAX_ORDER (11 - 17)" if !HUGETLB_PAGE |
261 | range 11 17 if !HUGETLB_PAGE | 266 | range 11 17 if !HUGETLB_PAGE |
262 | default "17" if HUGETLB_PAGE | 267 | default "17" if HUGETLB_PAGE |
263 | default "11" | 268 | default "11" |
264 | 269 | ||
265 | config SMP | 270 | config SMP |
266 | bool "Symmetric multi-processing support" | 271 | bool "Symmetric multi-processing support" |
267 | help | 272 | help |
268 | This enables support for systems with more than one CPU. If you have | 273 | This enables support for systems with more than one CPU. If you have |
269 | a system with only one CPU, say N. If you have a system with more | 274 | a system with only one CPU, say N. If you have a system with more |
270 | than one CPU, say Y. | 275 | than one CPU, say Y. |
271 | 276 | ||
272 | If you say N here, the kernel will run on single and multiprocessor | 277 | If you say N here, the kernel will run on single and multiprocessor |
273 | systems, but will use only one CPU of a multiprocessor system. If | 278 | systems, but will use only one CPU of a multiprocessor system. If |
274 | you say Y here, the kernel will run on many, but not all, | 279 | you say Y here, the kernel will run on many, but not all, |
275 | single processor systems. On a single processor system, the kernel | 280 | single processor systems. On a single processor system, the kernel |
276 | will run faster if you say N here. | 281 | will run faster if you say N here. |
277 | 282 | ||
278 | See also the <file:Documentation/smp.txt> and the SMP-HOWTO | 283 | See also the <file:Documentation/smp.txt> and the SMP-HOWTO |
279 | available at <http://www.tldp.org/docs.html#howto>. | 284 | available at <http://www.tldp.org/docs.html#howto>. |
280 | 285 | ||
281 | If you don't know what to do here, say N. | 286 | If you don't know what to do here, say N. |
282 | 287 | ||
283 | config NR_CPUS | 288 | config NR_CPUS |
284 | int "Maximum number of CPUs (2-1024)" | 289 | int "Maximum number of CPUs (2-1024)" |
285 | range 2 1024 | 290 | range 2 1024 |
286 | depends on SMP | 291 | depends on SMP |
287 | default "1024" | 292 | default "1024" |
288 | help | 293 | help |
289 | You should set this to the number of CPUs in your system, but | 294 | You should set this to the number of CPUs in your system, but |
290 | keep in mind that a kernel compiled for, e.g., 2 CPUs will boot but | 295 | keep in mind that a kernel compiled for, e.g., 2 CPUs will boot but |
291 | only use 2 CPUs on a >2 CPU system. Setting this to a value larger | 296 | only use 2 CPUs on a >2 CPU system. Setting this to a value larger |
292 | than 64 will cause the use of a CPU mask array, causing a small | 297 | than 64 will cause the use of a CPU mask array, causing a small |
293 | performance hit. | 298 | performance hit. |
294 | 299 | ||
295 | config HOTPLUG_CPU | 300 | config HOTPLUG_CPU |
296 | bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" | 301 | bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" |
297 | depends on SMP && EXPERIMENTAL | 302 | depends on SMP && EXPERIMENTAL |
298 | select HOTPLUG | 303 | select HOTPLUG |
299 | default n | 304 | default n |
300 | ---help--- | 305 | ---help--- |
301 | Say Y here to experiment with turning CPUs off and on. CPUs | 306 | Say Y here to experiment with turning CPUs off and on. CPUs |
302 | can be controlled through /sys/devices/system/cpu/cpu#. | 307 | can be controlled through /sys/devices/system/cpu/cpu#. |
303 | Say N if you want to disable CPU hotplug. | 308 | Say N if you want to disable CPU hotplug. |
304 | 309 | ||
305 | config ARCH_ENABLE_MEMORY_HOTPLUG | 310 | config ARCH_ENABLE_MEMORY_HOTPLUG |
306 | def_bool y | 311 | def_bool y |
307 | 312 | ||
308 | config ARCH_ENABLE_MEMORY_HOTREMOVE | 313 | config ARCH_ENABLE_MEMORY_HOTREMOVE |
309 | def_bool y | 314 | def_bool y |
310 | 315 | ||
311 | config SCHED_SMT | 316 | config SCHED_SMT |
312 | bool "SMT scheduler support" | 317 | bool "SMT scheduler support" |
313 | depends on SMP | 318 | depends on SMP |
314 | help | 319 | help |
315 | Improves the CPU scheduler's decision making when dealing with | 320 | Improves the CPU scheduler's decision making when dealing with |
316 | Intel IA64 chips with MultiThreading at a cost of slightly increased | 321 | Intel IA64 chips with MultiThreading at a cost of slightly increased |
317 | overhead in some places. If unsure say N here. | 322 | overhead in some places. If unsure say N here. |
318 | 323 | ||
319 | config PERMIT_BSP_REMOVE | 324 | config PERMIT_BSP_REMOVE |
320 | bool "Support removal of Bootstrap Processor" | 325 | bool "Support removal of Bootstrap Processor" |
321 | depends on HOTPLUG_CPU | 326 | depends on HOTPLUG_CPU |
322 | default n | 327 | default n |
323 | ---help--- | 328 | ---help--- |
324 | Say Y here if your platform SAL will support removal of BSP with HOTPLUG_CPU | 329 | Say Y here if your platform SAL will support removal of BSP with HOTPLUG_CPU |
325 | support. | 330 | support. |
326 | 331 | ||
327 | config FORCE_CPEI_RETARGET | 332 | config FORCE_CPEI_RETARGET |
328 | bool "Force assumption that CPEI can be re-targetted" | 333 | bool "Force assumption that CPEI can be re-targetted" |
329 | depends on PERMIT_BSP_REMOVE | 334 | depends on PERMIT_BSP_REMOVE |
330 | default n | 335 | default n |
331 | ---help--- | 336 | ---help--- |
332 | Say Y if you need to force the assumption that CPEI can be re-targetted to | 337 | Say Y if you need to force the assumption that CPEI can be re-targetted to |
333 | any CPU in the system. This hint is available via the ACPI 3.0 specification. | 338 | any CPU in the system. This hint is available via the ACPI 3.0 specification. |
334 | Tiger4 systems are capable of re-directing CPEI to any CPU other than the BSP. | 339 | Tiger4 systems are capable of re-directing CPEI to any CPU other than the BSP. |
335 | This option is useful to enable this feature on older BIOSes as well. | 340 | This option is useful to enable this feature on older BIOSes as well. |
336 | You can also enable this by using boot command line option force_cpei=1. | 341 | You can also enable this by using boot command line option force_cpei=1. |
337 | 342 | ||
338 | source "kernel/Kconfig.preempt" | 343 | source "kernel/Kconfig.preempt" |
339 | 344 | ||
340 | source "mm/Kconfig" | 345 | source "mm/Kconfig" |
341 | 346 | ||
342 | config ARCH_SELECT_MEMORY_MODEL | 347 | config ARCH_SELECT_MEMORY_MODEL |
343 | def_bool y | 348 | def_bool y |
344 | 349 | ||
345 | config ARCH_DISCONTIGMEM_ENABLE | 350 | config ARCH_DISCONTIGMEM_ENABLE |
346 | def_bool y | 351 | def_bool y |
347 | help | 352 | help |
348 | Say Y to support efficient handling of discontiguous physical memory, | 353 | Say Y to support efficient handling of discontiguous physical memory, |
349 | for architectures which are either NUMA (Non-Uniform Memory Access) | 354 | for architectures which are either NUMA (Non-Uniform Memory Access) |
350 | or have huge holes in the physical address space for other reasons. | 355 | or have huge holes in the physical address space for other reasons. |
351 | See <file:Documentation/vm/numa> for more. | 356 | See <file:Documentation/vm/numa> for more. |
352 | 357 | ||
353 | config ARCH_FLATMEM_ENABLE | 358 | config ARCH_FLATMEM_ENABLE |
354 | def_bool y | 359 | def_bool y |
355 | 360 | ||
356 | config ARCH_SPARSEMEM_ENABLE | 361 | config ARCH_SPARSEMEM_ENABLE |
357 | def_bool y | 362 | def_bool y |
358 | depends on ARCH_DISCONTIGMEM_ENABLE | 363 | depends on ARCH_DISCONTIGMEM_ENABLE |
359 | select SPARSEMEM_VMEMMAP_ENABLE | 364 | select SPARSEMEM_VMEMMAP_ENABLE |
360 | 365 | ||
361 | config ARCH_DISCONTIGMEM_DEFAULT | 366 | config ARCH_DISCONTIGMEM_DEFAULT |
362 | def_bool y if (IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB) | 367 | def_bool y if (IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB) |
363 | depends on ARCH_DISCONTIGMEM_ENABLE | 368 | depends on ARCH_DISCONTIGMEM_ENABLE |
364 | 369 | ||
365 | config NUMA | 370 | config NUMA |
366 | bool "NUMA support" | 371 | bool "NUMA support" |
367 | depends on !IA64_HP_SIM && !FLATMEM | 372 | depends on !IA64_HP_SIM && !FLATMEM |
368 | default y if IA64_SGI_SN2 | 373 | default y if IA64_SGI_SN2 |
369 | select ACPI_NUMA if ACPI | 374 | select ACPI_NUMA if ACPI |
370 | help | 375 | help |
371 | Say Y to compile the kernel to support NUMA (Non-Uniform Memory | 376 | Say Y to compile the kernel to support NUMA (Non-Uniform Memory |
372 | Access). This option is for configuring high-end multiprocessor | 377 | Access). This option is for configuring high-end multiprocessor |
373 | server systems. If in doubt, say N. | 378 | server systems. If in doubt, say N. |
374 | 379 | ||
375 | config NODES_SHIFT | 380 | config NODES_SHIFT |
376 | int "Max num nodes shift(3-10)" | 381 | int "Max num nodes shift(3-10)" |
377 | range 3 10 | 382 | range 3 10 |
378 | default "10" | 383 | default "10" |
379 | depends on NEED_MULTIPLE_NODES | 384 | depends on NEED_MULTIPLE_NODES |
380 | help | 385 | help |
381 | This option specifies the maximum number of nodes in your SSI system. | 386 | This option specifies the maximum number of nodes in your SSI system. |
382 | MAX_NUMNODES will be 2^(This value). | 387 | MAX_NUMNODES will be 2^(This value). |
383 | If in doubt, use the default. | 388 | If in doubt, use the default. |
384 | 389 | ||
385 | config ARCH_POPULATES_NODE_MAP | 390 | config ARCH_POPULATES_NODE_MAP |
386 | def_bool y | 391 | def_bool y |
387 | 392 | ||
388 | # VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent. | 393 | # VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent. |
389 | # VIRTUAL_MEM_MAP has been retained for historical reasons. | 394 | # VIRTUAL_MEM_MAP has been retained for historical reasons. |
390 | config VIRTUAL_MEM_MAP | 395 | config VIRTUAL_MEM_MAP |
391 | bool "Virtual mem map" | 396 | bool "Virtual mem map" |
392 | depends on !SPARSEMEM | 397 | depends on !SPARSEMEM |
393 | default y if !IA64_HP_SIM | 398 | default y if !IA64_HP_SIM |
394 | help | 399 | help |
395 | Say Y to compile the kernel with support for a virtual mem map. | 400 | Say Y to compile the kernel with support for a virtual mem map. |
396 | This code also only takes effect if a memory hole of greater than | 401 | This code also only takes effect if a memory hole of greater than |
397 | 1 GB is found during boot. You must turn this option on if you | 402 | 1 GB is found during boot. You must turn this option on if you |
398 | require the DISCONTIGMEM option for your machine. If you are | 403 | require the DISCONTIGMEM option for your machine. If you are |
399 | unsure, say Y. | 404 | unsure, say Y. |
400 | 405 | ||
401 | config HOLES_IN_ZONE | 406 | config HOLES_IN_ZONE |
402 | bool | 407 | bool |
403 | default y if VIRTUAL_MEM_MAP | 408 | default y if VIRTUAL_MEM_MAP |
404 | 409 | ||
405 | config HAVE_ARCH_EARLY_PFN_TO_NID | 410 | config HAVE_ARCH_EARLY_PFN_TO_NID |
406 | def_bool y | 411 | def_bool y |
407 | depends on NEED_MULTIPLE_NODES | 412 | depends on NEED_MULTIPLE_NODES |
408 | 413 | ||
409 | config HAVE_ARCH_NODEDATA_EXTENSION | 414 | config HAVE_ARCH_NODEDATA_EXTENSION |
410 | def_bool y | 415 | def_bool y |
411 | depends on NUMA | 416 | depends on NUMA |
412 | 417 | ||
413 | config IA32_SUPPORT | 418 | config IA32_SUPPORT |
414 | bool "Support for Linux/x86 binaries" | 419 | bool "Support for Linux/x86 binaries" |
415 | help | 420 | help |
416 | IA-64 processors can execute IA-32 (X86) instructions. By | 421 | IA-64 processors can execute IA-32 (X86) instructions. By |
417 | saying Y here, the kernel will include IA-32 system call | 422 | saying Y here, the kernel will include IA-32 system call |
418 | emulation support which makes it possible to transparently | 423 | emulation support which makes it possible to transparently |
419 | run IA-32 Linux binaries on an IA-64 Linux system. | 424 | run IA-32 Linux binaries on an IA-64 Linux system. |
420 | If in doubt, say Y. | 425 | If in doubt, say Y. |
421 | 426 | ||
422 | config COMPAT | 427 | config COMPAT |
423 | bool | 428 | bool |
424 | depends on IA32_SUPPORT | 429 | depends on IA32_SUPPORT |
425 | default y | 430 | default y |
426 | 431 | ||
427 | config COMPAT_FOR_U64_ALIGNMENT | 432 | config COMPAT_FOR_U64_ALIGNMENT |
428 | def_bool COMPAT | 433 | def_bool COMPAT |
429 | 434 | ||
430 | config IA64_MCA_RECOVERY | 435 | config IA64_MCA_RECOVERY |
431 | tristate "MCA recovery from errors other than TLB." | 436 | tristate "MCA recovery from errors other than TLB." |
432 | 437 | ||
433 | config PERFMON | 438 | config PERFMON |
434 | bool "Performance monitor support" | 439 | bool "Performance monitor support" |
435 | help | 440 | help |
436 | Selects whether support for the IA-64 performance monitor hardware | 441 | Selects whether support for the IA-64 performance monitor hardware |
437 | is included in the kernel. This makes some kernel data-structures a | 442 | is included in the kernel. This makes some kernel data-structures a |
438 | little bigger and slows down execution a bit, but it is generally | 443 | little bigger and slows down execution a bit, but it is generally |
439 | a good idea to turn this on. If you're unsure, say Y. | 444 | a good idea to turn this on. If you're unsure, say Y. |
440 | 445 | ||
441 | config IA64_PALINFO | 446 | config IA64_PALINFO |
442 | tristate "/proc/pal support" | 447 | tristate "/proc/pal support" |
443 | help | 448 | help |
444 | If you say Y here, you are able to get PAL (Processor Abstraction | 449 | If you say Y here, you are able to get PAL (Processor Abstraction |
445 | Layer) information in /proc/pal. This contains useful information | 450 | Layer) information in /proc/pal. This contains useful information |
446 | about the processors in your systems, such as cache and TLB sizes | 451 | about the processors in your systems, such as cache and TLB sizes |
447 | and the PAL firmware version in use. | 452 | and the PAL firmware version in use. |
448 | 453 | ||
449 | To use this option, you have to ensure that the "/proc file system | 454 | To use this option, you have to ensure that the "/proc file system |
450 | support" (CONFIG_PROC_FS) is enabled, too. | 455 | support" (CONFIG_PROC_FS) is enabled, too. |
451 | 456 | ||
452 | config IA64_MC_ERR_INJECT | 457 | config IA64_MC_ERR_INJECT |
453 | tristate "MC error injection support" | 458 | tristate "MC error injection support" |
454 | help | 459 | help |
455 | Adds support for MC error injection. If enabled, the kernel | 460 | Adds support for MC error injection. If enabled, the kernel |
456 | will provide a sysfs interface for user applications to | 461 | will provide a sysfs interface for user applications to |
457 | call MC error injection PAL procedures to inject various errors. | 462 | call MC error injection PAL procedures to inject various errors. |
458 | This is a useful tool for MCA testing. | 463 | This is a useful tool for MCA testing. |
459 | 464 | ||
460 | If you're unsure, do not select this option. | 465 | If you're unsure, do not select this option. |
461 | 466 | ||
462 | config SGI_SN | 467 | config SGI_SN |
463 | def_bool y if (IA64_SGI_SN2 || IA64_GENERIC) | 468 | def_bool y if (IA64_SGI_SN2 || IA64_GENERIC) |
464 | 469 | ||
465 | config IA64_ESI | 470 | config IA64_ESI |
466 | bool "ESI (Extensible SAL Interface) support" | 471 | bool "ESI (Extensible SAL Interface) support" |
467 | help | 472 | help |
468 | If you say Y here, support is built into the kernel to | 473 | If you say Y here, support is built into the kernel to |
469 | make ESI calls. ESI calls are used to support vendor-specific | 474 | make ESI calls. ESI calls are used to support vendor-specific |
470 | firmware extensions, such as the ability to inject memory errors | 475 | firmware extensions, such as the ability to inject memory errors |
471 | for test purposes. If you're unsure, say N. | 476 | for test purposes. If you're unsure, say N. |
472 | 477 | ||
473 | config IA64_HP_AML_NFW | 478 | config IA64_HP_AML_NFW |
474 | bool "Support ACPI AML calls to native firmware" | 479 | bool "Support ACPI AML calls to native firmware" |
475 | help | 480 | help |
476 | This driver installs a global ACPI Operation Region handler for | 481 | This driver installs a global ACPI Operation Region handler for |
477 | region 0xA1. AML methods can use this OpRegion to call arbitrary | 482 | region 0xA1. AML methods can use this OpRegion to call arbitrary |
478 | native firmware functions. The driver installs the OpRegion | 483 | native firmware functions. The driver installs the OpRegion |
479 | handler if there is an HPQ5001 device or if the user supplies | 484 | handler if there is an HPQ5001 device or if the user supplies |
480 | the "force" module parameter, e.g., with the "aml_nfw.force" | 485 | the "force" module parameter, e.g., with the "aml_nfw.force" |
481 | kernel command line option. | 486 | kernel command line option. |
482 | 487 | ||
483 | source "drivers/sn/Kconfig" | 488 | source "drivers/sn/Kconfig" |
484 | 489 | ||
485 | config KEXEC | 490 | config KEXEC |
486 | bool "kexec system call (EXPERIMENTAL)" | 491 | bool "kexec system call (EXPERIMENTAL)" |
487 | depends on EXPERIMENTAL && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) | 492 | depends on EXPERIMENTAL && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) |
488 | help | 493 | help |
489 | kexec is a system call that implements the ability to shut down your | 494 | kexec is a system call that implements the ability to shut down your |
490 | current kernel, and to start another kernel. It is like a reboot | 495 | current kernel, and to start another kernel. It is like a reboot |
491 | but it is independent of the system firmware. And like a reboot | 496 | but it is independent of the system firmware. And like a reboot |
492 | you can start any kernel with it, not just Linux. | 497 | you can start any kernel with it, not just Linux. |
493 | 498 | ||
494 | The name comes from the similarity to the exec system call. | 499 | The name comes from the similarity to the exec system call. |
495 | 500 | ||
496 | It is an ongoing process to be certain the hardware in a machine | 501 | It is an ongoing process to be certain the hardware in a machine |
497 | is properly shut down, so do not be surprised if this code does not | 502 | is properly shut down, so do not be surprised if this code does not |
498 | initially work for you. It may help to enable device hotplugging | 503 | initially work for you. It may help to enable device hotplugging |
499 | support. As of this writing the exact hardware interface is | 504 | support. As of this writing the exact hardware interface is |
500 | strongly in flux, so no good recommendation can be made. | 505 | strongly in flux, so no good recommendation can be made. |
501 | 506 | ||
502 | config CRASH_DUMP | 507 | config CRASH_DUMP |
503 | bool "kernel crash dumps (EXPERIMENTAL)" | 508 | bool "kernel crash dumps (EXPERIMENTAL)" |
504 | depends on EXPERIMENTAL && IA64_MCA_RECOVERY && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) | 509 | depends on EXPERIMENTAL && IA64_MCA_RECOVERY && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) |
505 | help | 510 | help |
506 | Generate crash dump after being started by kexec. | 511 | Generate crash dump after being started by kexec. |
507 | 512 | ||
508 | source "drivers/firmware/Kconfig" | 513 | source "drivers/firmware/Kconfig" |
509 | 514 | ||
510 | source "fs/Kconfig.binfmt" | 515 | source "fs/Kconfig.binfmt" |
511 | 516 | ||
512 | endmenu | 517 | endmenu |
513 | 518 | ||
514 | menu "Power management and ACPI" | 519 | menu "Power management and ACPI" |
515 | 520 | ||
516 | source "kernel/power/Kconfig" | 521 | source "kernel/power/Kconfig" |
517 | 522 | ||
518 | source "drivers/acpi/Kconfig" | 523 | source "drivers/acpi/Kconfig" |
519 | 524 | ||
520 | if PM | 525 | if PM |
521 | 526 | ||
522 | source "arch/ia64/kernel/cpufreq/Kconfig" | 527 | source "arch/ia64/kernel/cpufreq/Kconfig" |
523 | 528 | ||
524 | endif | 529 | endif |
525 | 530 | ||
526 | endmenu | 531 | endmenu |
527 | 532 | ||
528 | if !IA64_HP_SIM | 533 | if !IA64_HP_SIM |
529 | 534 | ||
530 | menu "Bus options (PCI, PCMCIA)" | 535 | menu "Bus options (PCI, PCMCIA)" |
531 | 536 | ||
532 | config PCI | 537 | config PCI |
533 | bool "PCI support" | 538 | bool "PCI support" |
534 | help | 539 | help |
535 | Real IA-64 machines all have PCI/PCI-X/PCI Express busses. Say Y | 540 | Real IA-64 machines all have PCI/PCI-X/PCI Express busses. Say Y |
536 | here unless you are using a simulator without PCI support. | 541 | here unless you are using a simulator without PCI support. |
537 | 542 | ||
538 | config PCI_DOMAINS | 543 | config PCI_DOMAINS |
539 | def_bool PCI | 544 | def_bool PCI |
540 | 545 | ||
541 | config PCI_SYSCALL | 546 | config PCI_SYSCALL |
542 | def_bool PCI | 547 | def_bool PCI |
543 | 548 | ||
544 | source "drivers/pci/pcie/Kconfig" | 549 | source "drivers/pci/pcie/Kconfig" |
545 | 550 | ||
546 | source "drivers/pci/Kconfig" | 551 | source "drivers/pci/Kconfig" |
547 | 552 | ||
548 | source "drivers/pci/hotplug/Kconfig" | 553 | source "drivers/pci/hotplug/Kconfig" |
549 | 554 | ||
550 | source "drivers/pcmcia/Kconfig" | 555 | source "drivers/pcmcia/Kconfig" |
551 | 556 | ||
552 | endmenu | 557 | endmenu |
553 | 558 | ||
554 | endif | 559 | endif |
555 | 560 | ||
556 | source "net/Kconfig" | 561 | source "net/Kconfig" |
557 | 562 | ||
558 | source "drivers/Kconfig" | 563 | source "drivers/Kconfig" |
559 | 564 | ||
560 | config MSPEC | 565 | config MSPEC |
561 | tristate "Memory special operations driver" | 566 | tristate "Memory special operations driver" |
562 | depends on IA64 | 567 | depends on IA64 |
563 | select IA64_UNCACHED_ALLOCATOR | 568 | select IA64_UNCACHED_ALLOCATOR |
564 | help | 569 | help |
565 | If you have an ia64 and you want to enable memory special | 570 | If you have an ia64 and you want to enable memory special |
566 | operations support (formerly known as fetchop), say Y here, | 571 | operations support (formerly known as fetchop), say Y here, |
567 | otherwise say N. | 572 | otherwise say N. |
568 | 573 | ||
569 | source "fs/Kconfig" | 574 | source "fs/Kconfig" |
570 | 575 | ||
571 | source "lib/Kconfig" | 576 | source "lib/Kconfig" |
572 | 577 | ||
573 | # | 578 | # |
574 | # Use the generic interrupt handling code in kernel/irq/: | 579 | # Use the generic interrupt handling code in kernel/irq/: |
575 | # | 580 | # |
576 | config GENERIC_HARDIRQS | 581 | config GENERIC_HARDIRQS |
577 | bool | 582 | bool |
578 | default y | 583 | default y |
579 | 584 | ||
580 | config GENERIC_IRQ_PROBE | 585 | config GENERIC_IRQ_PROBE |
581 | bool | 586 | bool |
582 | default y | 587 | default y |
583 | 588 | ||
584 | config GENERIC_PENDING_IRQ | 589 | config GENERIC_PENDING_IRQ |
585 | bool | 590 | bool |
586 | depends on GENERIC_HARDIRQS && SMP | 591 | depends on GENERIC_HARDIRQS && SMP |
587 | default y | 592 | default y |
588 | 593 | ||
589 | config IRQ_PER_CPU | 594 | config IRQ_PER_CPU |
590 | bool | 595 | bool |
591 | default y | 596 | default y |
592 | 597 | ||
593 | source "arch/ia64/hp/sim/Kconfig" | 598 | source "arch/ia64/hp/sim/Kconfig" |
594 | 599 | ||
595 | source "kernel/Kconfig.instrumentation" | 600 | source "kernel/Kconfig.instrumentation" |
596 | 601 | ||
597 | source "arch/ia64/Kconfig.debug" | 602 | source "arch/ia64/Kconfig.debug" |
598 | 603 | ||
599 | source "security/Kconfig" | 604 | source "security/Kconfig" |
600 | 605 | ||
601 | source "crypto/Kconfig" | 606 | source "crypto/Kconfig" |
602 | 607 |
arch/m32r/Kconfig
1 | # | 1 | # |
2 | # For a description of the syntax of this configuration file, | 2 | # For a description of the syntax of this configuration file, |
3 | # see Documentation/kbuild/kconfig-language.txt. | 3 | # see Documentation/kbuild/kconfig-language.txt. |
4 | # | 4 | # |
5 | 5 | ||
6 | mainmenu "Linux/M32R Kernel Configuration" | 6 | mainmenu "Linux/M32R Kernel Configuration" |
7 | 7 | ||
8 | config M32R | 8 | config M32R |
9 | bool | 9 | bool |
10 | default y | 10 | default y |
11 | 11 | ||
12 | config SBUS | 12 | config SBUS |
13 | bool | 13 | bool |
14 | 14 | ||
15 | config GENERIC_ISA_DMA | 15 | config GENERIC_ISA_DMA |
16 | bool | 16 | bool |
17 | default y | 17 | default y |
18 | 18 | ||
19 | config ZONE_DMA | 19 | config ZONE_DMA |
20 | bool | 20 | bool |
21 | default y | 21 | default y |
22 | 22 | ||
23 | config GENERIC_HARDIRQS | 23 | config GENERIC_HARDIRQS |
24 | bool | 24 | bool |
25 | default y | 25 | default y |
26 | 26 | ||
27 | config GENERIC_IRQ_PROBE | 27 | config GENERIC_IRQ_PROBE |
28 | bool | 28 | bool |
29 | default y | 29 | default y |
30 | 30 | ||
31 | config NO_IOPORT | 31 | config NO_IOPORT |
32 | def_bool y | 32 | def_bool y |
33 | 33 | ||
34 | config NO_DMA | 34 | config NO_DMA |
35 | def_bool y | 35 | def_bool y |
36 | 36 | ||
37 | source "init/Kconfig" | 37 | source "init/Kconfig" |
38 | 38 | ||
39 | 39 | ||
40 | menu "Processor type and features" | 40 | menu "Processor type and features" |
41 | 41 | ||
42 | choice | 42 | choice |
43 | prompt "Platform Type" | 43 | prompt "Platform Type" |
44 | default PLAT_MAPPI | 44 | default PLAT_MAPPI |
45 | 45 | ||
46 | config PLAT_MAPPI | 46 | config PLAT_MAPPI |
47 | bool "Mappi-I" | 47 | bool "Mappi-I" |
48 | help | 48 | help |
49 | The Mappi-I is an FPGA board for SOC (System-On-a-Chip) prototyping. | 49 | The Mappi-I is an FPGA board for SOC (System-On-a-Chip) prototyping. |
50 | You can operate a Linux system on this board by using an M32R | 50 | You can operate a Linux system on this board by using an M32R |
51 | softmacro core, which is a fully-synthesizable functional model | 51 | softmacro core, which is a fully-synthesizable functional model |
52 | described in Verilog-HDL. | 52 | described in Verilog-HDL. |
53 | 53 | ||
54 | The Mappi-I board was the first platform, which had been used | 54 | The Mappi-I board was the first platform, which had been used |
55 | to port and develop a Linux system for the M32R processor. | 55 | to port and develop a Linux system for the M32R processor. |
56 | Currently, the Mappi-II, an heir to the Mappi-I, is available. | 56 | Currently, the Mappi-II, an heir to the Mappi-I, is available. |
57 | 57 | ||
58 | config PLAT_USRV | 58 | config PLAT_USRV |
59 | bool "uServer" | 59 | bool "uServer" |
60 | select PLAT_HAS_INT1ICU | 60 | select PLAT_HAS_INT1ICU |
61 | 61 | ||
62 | config PLAT_M32700UT | 62 | config PLAT_M32700UT |
63 | bool "M32700UT" | 63 | bool "M32700UT" |
64 | select PLAT_HAS_INT0ICU | 64 | select PLAT_HAS_INT0ICU |
65 | select PLAT_HAS_INT1ICU | 65 | select PLAT_HAS_INT1ICU |
66 | select PLAT_HAS_INT2ICU | 66 | select PLAT_HAS_INT2ICU |
67 | help | 67 | help |
68 | The M3T-M32700UT is an evaluation board based on uT-Engine | 68 | The M3T-M32700UT is an evaluation board based on uT-Engine |
69 | specification. This board has an M32700 (Chaos) evaluation chip. | 69 | specification. This board has an M32700 (Chaos) evaluation chip. |
70 | You can say Y for SMP, because the M32700 is a single chip | 70 | You can say Y for SMP, because the M32700 is a single chip |
71 | multiprocessor. | 71 | multiprocessor. |
72 | 72 | ||
73 | config PLAT_OPSPUT | 73 | config PLAT_OPSPUT |
74 | bool "OPSPUT" | 74 | bool "OPSPUT" |
75 | select PLAT_HAS_INT0ICU | 75 | select PLAT_HAS_INT0ICU |
76 | select PLAT_HAS_INT1ICU | 76 | select PLAT_HAS_INT1ICU |
77 | select PLAT_HAS_INT2ICU | 77 | select PLAT_HAS_INT2ICU |
78 | help | 78 | help |
79 | The OPSPUT is an evaluation board based on uT-Engine | 79 | The OPSPUT is an evaluation board based on uT-Engine |
80 | specification. This board has an OPSP-REP chip. | 80 | specification. This board has an OPSP-REP chip. |
81 | 81 | ||
82 | config PLAT_OAKS32R | 82 | config PLAT_OAKS32R |
83 | bool "OAKS32R" | 83 | bool "OAKS32R" |
84 | help | 84 | help |
85 | The OAKS32R is a tiny, inexpensive evaluation board. | 85 | The OAKS32R is a tiny, inexpensive evaluation board. |
86 | Please note that if you say Y here and choose chip "M32102", | 86 | Please note that if you say Y here and choose chip "M32102", |
87 | say N for MMU and select a no-MMU version kernel, otherwise | 87 | say N for MMU and select a no-MMU version kernel, otherwise |
88 | a kernel with MMU support will not work, because the M32102 | 88 | a kernel with MMU support will not work, because the M32102 |
89 | is a microcontroller for embedded systems and it has no MMU. | 89 | is a microcontroller for embedded systems and it has no MMU. |
90 | 90 | ||
91 | config PLAT_MAPPI2 | 91 | config PLAT_MAPPI2 |
92 | bool "Mappi-II(M3A-ZA36/M3A-ZA52)" | 92 | bool "Mappi-II(M3A-ZA36/M3A-ZA52)" |
93 | 93 | ||
94 | config PLAT_MAPPI3 | 94 | config PLAT_MAPPI3 |
95 | bool "Mappi-III(M3A-2170)" | 95 | bool "Mappi-III(M3A-2170)" |
96 | 96 | ||
97 | config PLAT_M32104UT | 97 | config PLAT_M32104UT |
98 | bool "M32104UT" | 98 | bool "M32104UT" |
99 | select PLAT_HAS_INT1ICU | 99 | select PLAT_HAS_INT1ICU |
100 | help | 100 | help |
101 | The M3T-M32104UT is a reference board based on uT-Engine | 101 | The M3T-M32104UT is a reference board based on uT-Engine |
102 | specification. This board has an M32104 chip. | 102 | specification. This board has an M32104 chip. |
103 | 103 | ||
104 | endchoice | 104 | endchoice |
105 | 105 | ||
106 | choice | 106 | choice |
107 | prompt "Processor family" | 107 | prompt "Processor family" |
108 | default CHIP_M32700 | 108 | default CHIP_M32700 |
109 | 109 | ||
110 | config CHIP_M32700 | 110 | config CHIP_M32700 |
111 | bool "M32700 (Chaos)" | 111 | bool "M32700 (Chaos)" |
112 | 112 | ||
113 | config CHIP_M32102 | 113 | config CHIP_M32102 |
114 | bool "M32102" | 114 | bool "M32102" |
115 | 115 | ||
116 | config CHIP_M32104 | 116 | config CHIP_M32104 |
117 | bool "M32104" | 117 | bool "M32104" |
118 | depends on PLAT_M32104UT | 118 | depends on PLAT_M32104UT |
119 | 119 | ||
120 | config CHIP_VDEC2 | 120 | config CHIP_VDEC2 |
121 | bool "VDEC2" | 121 | bool "VDEC2" |
122 | 122 | ||
123 | config CHIP_OPSP | 123 | config CHIP_OPSP |
124 | bool "OPSP" | 124 | bool "OPSP" |
125 | 125 | ||
126 | endchoice | 126 | endchoice |
127 | 127 | ||
128 | config MMU | 128 | config MMU |
129 | bool "Support for memory management hardware" | 129 | bool "Support for memory management hardware" |
130 | depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP | 130 | depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP |
131 | default y | 131 | default y |
132 | 132 | ||
133 | config TLB_ENTRIES | 133 | config TLB_ENTRIES |
134 | int "TLB Entries" | 134 | int "TLB Entries" |
135 | depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP | 135 | depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP |
136 | default 32 if CHIP_M32700 || CHIP_OPSP | 136 | default 32 if CHIP_M32700 || CHIP_OPSP |
137 | default 16 if CHIP_VDEC2 | 137 | default 16 if CHIP_VDEC2 |
138 | 138 | ||
139 | 139 | ||
140 | config ISA_M32R | 140 | config ISA_M32R |
141 | bool | 141 | bool |
142 | depends on CHIP_M32102 || CHIP_M32104 | 142 | depends on CHIP_M32102 || CHIP_M32104 |
143 | default y | 143 | default y |
144 | 144 | ||
145 | config ISA_M32R2 | 145 | config ISA_M32R2 |
146 | bool | 146 | bool |
147 | depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP | 147 | depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP |
148 | default y | 148 | default y |
149 | 149 | ||
150 | config ISA_DSP_LEVEL2 | 150 | config ISA_DSP_LEVEL2 |
151 | bool | 151 | bool |
152 | depends on CHIP_M32700 || CHIP_OPSP | 152 | depends on CHIP_M32700 || CHIP_OPSP |
153 | default y | 153 | default y |
154 | 154 | ||
155 | config ISA_DUAL_ISSUE | 155 | config ISA_DUAL_ISSUE |
156 | bool | 156 | bool |
157 | depends on CHIP_M32700 || CHIP_OPSP | 157 | depends on CHIP_M32700 || CHIP_OPSP |
158 | default y | 158 | default y |
159 | 159 | ||
160 | config PLAT_HAS_INT0ICU | 160 | config PLAT_HAS_INT0ICU |
161 | bool | 161 | bool |
162 | default n | 162 | default n |
163 | 163 | ||
164 | config PLAT_HAS_INT1ICU | 164 | config PLAT_HAS_INT1ICU |
165 | bool | 165 | bool |
166 | default n | 166 | default n |
167 | 167 | ||
168 | config PLAT_HAS_INT2ICU | 168 | config PLAT_HAS_INT2ICU |
169 | bool | 169 | bool |
170 | default n | 170 | default n |
171 | 171 | ||
172 | config BUS_CLOCK | 172 | config BUS_CLOCK |
173 | int "Bus Clock [Hz] (integer)" | 173 | int "Bus Clock [Hz] (integer)" |
174 | default "70000000" if PLAT_MAPPI | 174 | default "70000000" if PLAT_MAPPI |
175 | default "25000000" if PLAT_USRV | 175 | default "25000000" if PLAT_USRV |
176 | default "50000000" if PLAT_MAPPI3 | 176 | default "50000000" if PLAT_MAPPI3 |
177 | default "50000000" if PLAT_M32700UT | 177 | default "50000000" if PLAT_M32700UT |
178 | default "50000000" if PLAT_OPSPUT | 178 | default "50000000" if PLAT_OPSPUT |
179 | default "54000000" if PLAT_M32104UT | 179 | default "54000000" if PLAT_M32104UT |
180 | default "33333333" if PLAT_OAKS32R | 180 | default "33333333" if PLAT_OAKS32R |
181 | default "20000000" if PLAT_MAPPI2 | 181 | default "20000000" if PLAT_MAPPI2 |
182 | 182 | ||
183 | config TIMER_DIVIDE | 183 | config TIMER_DIVIDE |
184 | int "Timer divider (integer)" | 184 | int "Timer divider (integer)" |
185 | default "128" | 185 | default "128" |
186 | 186 | ||
187 | config CPU_LITTLE_ENDIAN | 187 | config CPU_LITTLE_ENDIAN |
188 | bool "Generate little endian code" | 188 | bool "Generate little endian code" |
189 | default n | 189 | default n |
190 | 190 | ||
191 | config MEMORY_START | 191 | config MEMORY_START |
192 | hex "Physical memory start address (hex)" | 192 | hex "Physical memory start address (hex)" |
193 | default "08000000" if PLAT_MAPPI || PLAT_MAPPI2 || PLAT_MAPPI3 | 193 | default "08000000" if PLAT_MAPPI || PLAT_MAPPI2 || PLAT_MAPPI3 |
194 | default "08000000" if PLAT_USRV | 194 | default "08000000" if PLAT_USRV |
195 | default "08000000" if PLAT_M32700UT | 195 | default "08000000" if PLAT_M32700UT |
196 | default "08000000" if PLAT_OPSPUT | 196 | default "08000000" if PLAT_OPSPUT |
197 | default "04000000" if PLAT_M32104UT | 197 | default "04000000" if PLAT_M32104UT |
198 | default "01000000" if PLAT_OAKS32R | 198 | default "01000000" if PLAT_OAKS32R |
199 | 199 | ||
200 | config MEMORY_SIZE | 200 | config MEMORY_SIZE |
201 | hex "Physical memory size (hex)" | 201 | hex "Physical memory size (hex)" |
202 | default "08000000" if PLAT_MAPPI3 | 202 | default "08000000" if PLAT_MAPPI3 |
203 | default "04000000" if PLAT_MAPPI || PLAT_MAPPI2 | 203 | default "04000000" if PLAT_MAPPI || PLAT_MAPPI2 |
204 | default "02000000" if PLAT_USRV | 204 | default "02000000" if PLAT_USRV |
205 | default "01000000" if PLAT_M32700UT | 205 | default "01000000" if PLAT_M32700UT |
206 | default "01000000" if PLAT_OPSPUT | 206 | default "01000000" if PLAT_OPSPUT |
207 | default "01000000" if PLAT_M32104UT | 207 | default "01000000" if PLAT_M32104UT |
208 | default "00800000" if PLAT_OAKS32R | 208 | default "00800000" if PLAT_OAKS32R |
209 | 209 | ||
210 | config NOHIGHMEM | 210 | config NOHIGHMEM |
211 | bool | 211 | bool |
212 | default y | 212 | default y |
213 | 213 | ||
214 | config ARCH_DISCONTIGMEM_ENABLE | 214 | config ARCH_DISCONTIGMEM_ENABLE |
215 | bool "Internal RAM Support" | 215 | bool "Internal RAM Support" |
216 | depends on CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104 | 216 | depends on CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104 |
217 | default y | 217 | default y |
218 | 218 | ||
219 | source "mm/Kconfig" | 219 | source "mm/Kconfig" |
220 | 220 | ||
221 | config IRAM_START | 221 | config IRAM_START |
222 | hex "Internal memory start address (hex)" | 222 | hex "Internal memory start address (hex)" |
223 | default "00f00000" if !CHIP_M32104 | 223 | default "00f00000" if !CHIP_M32104 |
224 | default "00700000" if CHIP_M32104 | 224 | default "00700000" if CHIP_M32104 |
225 | depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM | 225 | depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM |
226 | 226 | ||
227 | config IRAM_SIZE | 227 | config IRAM_SIZE |
228 | hex "Internal memory size (hex)" | 228 | hex "Internal memory size (hex)" |
229 | depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM | 229 | depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM |
230 | default "00080000" if CHIP_M32700 | 230 | default "00080000" if CHIP_M32700 |
231 | default "00010000" if CHIP_M32102 || CHIP_OPSP || CHIP_M32104 | 231 | default "00010000" if CHIP_M32102 || CHIP_OPSP || CHIP_M32104 |
232 | default "00008000" if CHIP_VDEC2 | 232 | default "00008000" if CHIP_VDEC2 |
233 | 233 | ||
234 | # | 234 | # |
235 | # Define implied options from the CPU selection here | 235 | # Define implied options from the CPU selection here |
236 | # | 236 | # |
237 | 237 | ||
238 | config GENERIC_LOCKBREAK | ||
239 | bool | ||
240 | default y | ||
241 | depends on SMP && PREEMPT | ||
242 | |||
238 | config RWSEM_GENERIC_SPINLOCK | 243 | config RWSEM_GENERIC_SPINLOCK |
239 | bool | 244 | bool |
240 | depends on M32R | 245 | depends on M32R |
241 | default y | 246 | default y |
242 | 247 | ||
243 | config RWSEM_XCHGADD_ALGORITHM | 248 | config RWSEM_XCHGADD_ALGORITHM |
244 | bool | 249 | bool |
245 | default n | 250 | default n |
246 | 251 | ||
247 | config ARCH_HAS_ILOG2_U32 | 252 | config ARCH_HAS_ILOG2_U32 |
248 | bool | 253 | bool |
249 | default n | 254 | default n |
250 | 255 | ||
251 | config ARCH_HAS_ILOG2_U64 | 256 | config ARCH_HAS_ILOG2_U64 |
252 | bool | 257 | bool |
253 | default n | 258 | default n |
254 | 259 | ||
255 | config GENERIC_FIND_NEXT_BIT | 260 | config GENERIC_FIND_NEXT_BIT |
256 | bool | 261 | bool |
257 | default y | 262 | default y |
258 | 263 | ||
259 | config GENERIC_HWEIGHT | 264 | config GENERIC_HWEIGHT |
260 | bool | 265 | bool |
261 | default y | 266 | default y |
262 | 267 | ||
263 | config GENERIC_CALIBRATE_DELAY | 268 | config GENERIC_CALIBRATE_DELAY |
264 | bool | 269 | bool |
265 | default y | 270 | default y |
266 | 271 | ||
267 | config SCHED_NO_NO_OMIT_FRAME_POINTER | 272 | config SCHED_NO_NO_OMIT_FRAME_POINTER |
268 | bool | 273 | bool |
269 | default y | 274 | default y |
270 | 275 | ||
271 | config PREEMPT | 276 | config PREEMPT |
272 | bool "Preemptible Kernel" | 277 | bool "Preemptible Kernel" |
273 | help | 278 | help |
274 | This option reduces the latency of the kernel when reacting to | 279 | This option reduces the latency of the kernel when reacting to |
275 | real-time or interactive events by allowing a low priority process to | 280 | real-time or interactive events by allowing a low priority process to |
276 | be preempted even if it is in kernel mode executing a system call. | 281 | be preempted even if it is in kernel mode executing a system call. |
277 | This allows applications to run more reliably even when the system is | 282 | This allows applications to run more reliably even when the system is |
278 | under load. | 283 | under load. |
279 | 284 | ||
280 | Say Y here if you are building a kernel for a desktop, embedded | 285 | Say Y here if you are building a kernel for a desktop, embedded |
281 | or real-time system. Say N if you are unsure. | 286 | or real-time system. Say N if you are unsure. |
282 | 287 | ||
283 | config SMP | 288 | config SMP |
284 | bool "Symmetric multi-processing support" | 289 | bool "Symmetric multi-processing support" |
285 | ---help--- | 290 | ---help--- |
286 | This enables support for systems with more than one CPU. If you have | 291 | This enables support for systems with more than one CPU. If you have |
287 | a system with only one CPU, like most personal computers, say N. If | 292 | a system with only one CPU, like most personal computers, say N. If |
288 | you have a system with more than one CPU, say Y. | 293 | you have a system with more than one CPU, say Y. |
289 | 294 | ||
290 | If you say N here, the kernel will run on single and multiprocessor | 295 | If you say N here, the kernel will run on single and multiprocessor |
291 | machines, but will use only one CPU of a multiprocessor machine. If | 296 | machines, but will use only one CPU of a multiprocessor machine. If |
292 | you say Y here, the kernel will run on many, but not all, | 297 | you say Y here, the kernel will run on many, but not all, |
293 | singleprocessor machines. On a singleprocessor machine, the kernel | 298 | singleprocessor machines. On a singleprocessor machine, the kernel |
294 | will run faster if you say N here. | 299 | will run faster if you say N here. |
295 | 300 | ||
296 | People using multiprocessor machines who say Y here should also say | 301 | People using multiprocessor machines who say Y here should also say |
297 | Y to "Enhanced Real Time Clock Support", below. The "Advanced Power | 302 | Y to "Enhanced Real Time Clock Support", below. The "Advanced Power |
298 | Management" code will be disabled if you say Y here. | 303 | Management" code will be disabled if you say Y here. |
299 | 304 | ||
300 | See also the <file:Documentation/smp.txt>, | 305 | See also the <file:Documentation/smp.txt>, |
301 | and the SMP-HOWTO available at | 306 | and the SMP-HOWTO available at |
302 | <http://www.linuxdoc.org/docs.html#howto>. | 307 | <http://www.linuxdoc.org/docs.html#howto>. |
303 | 308 | ||
304 | If you don't know what to do here, say N. | 309 | If you don't know what to do here, say N. |
305 | 310 | ||
306 | config CHIP_M32700_TS1 | 311 | config CHIP_M32700_TS1 |
307 | bool "Workaround code for the M32700 TS1 chip's bug" | 312 | bool "Workaround code for the M32700 TS1 chip's bug" |
308 | depends on (CHIP_M32700 && SMP) | 313 | depends on (CHIP_M32700 && SMP) |
309 | default n | 314 | default n |
310 | 315 | ||
311 | config NR_CPUS | 316 | config NR_CPUS |
312 | int "Maximum number of CPUs (2-32)" | 317 | int "Maximum number of CPUs (2-32)" |
313 | range 2 32 | 318 | range 2 32 |
314 | depends on SMP | 319 | depends on SMP |
315 | default "2" | 320 | default "2" |
316 | help | 321 | help |
317 | This allows you to specify the maximum number of CPUs which this | 322 | This allows you to specify the maximum number of CPUs which this |
318 | kernel will support. The maximum supported value is 32 and the | 323 | kernel will support. The maximum supported value is 32 and the |
319 | minimum value which makes sense is 2. | 324 | minimum value which makes sense is 2. |
320 | 325 | ||
321 | This is purely to save memory - each supported CPU adds | 326 | This is purely to save memory - each supported CPU adds |
322 | approximately eight kilobytes to the kernel image. | 327 | approximately eight kilobytes to the kernel image. |
323 | 328 | ||
324 | # Common NUMA Features | 329 | # Common NUMA Features |
325 | config NUMA | 330 | config NUMA |
326 | bool "Numa Memory Allocation Support" | 331 | bool "Numa Memory Allocation Support" |
327 | depends on SMP && BROKEN | 332 | depends on SMP && BROKEN |
328 | default n | 333 | default n |
329 | 334 | ||
330 | config NODES_SHIFT | 335 | config NODES_SHIFT |
331 | int | 336 | int |
332 | default "1" | 337 | default "1" |
333 | depends on NEED_MULTIPLE_NODES | 338 | depends on NEED_MULTIPLE_NODES |
334 | 339 | ||
335 | # turning this on wastes a bunch of space. | 340 | # turning this on wastes a bunch of space. |
336 | # Summit needs it only when NUMA is on | 341 | # Summit needs it only when NUMA is on |
337 | config BOOT_IOREMAP | 342 | config BOOT_IOREMAP |
338 | bool | 343 | bool |
339 | depends on NUMA | 344 | depends on NUMA |
340 | default n | 345 | default n |
341 | 346 | ||
342 | endmenu | 347 | endmenu |
343 | 348 | ||
344 | 349 | ||
345 | menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" | 350 | menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" |
346 | 351 | ||
347 | config PCI | 352 | config PCI |
348 | bool "PCI support" | 353 | bool "PCI support" |
349 | depends on BROKEN | 354 | depends on BROKEN |
350 | default n | 355 | default n |
351 | help | 356 | help |
352 | Find out whether you have a PCI motherboard. PCI is the name of a | 357 | Find out whether you have a PCI motherboard. PCI is the name of a |
353 | bus system, i.e. the way the CPU talks to the other stuff inside | 358 | bus system, i.e. the way the CPU talks to the other stuff inside |
354 | your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or | 359 | your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or |
355 | VESA. If you have PCI, say Y, otherwise N. | 360 | VESA. If you have PCI, say Y, otherwise N. |
356 | 361 | ||
357 | The PCI-HOWTO, available from | 362 | The PCI-HOWTO, available from |
358 | <http://www.linuxdoc.org/docs.html#howto>, contains valuable | 363 | <http://www.linuxdoc.org/docs.html#howto>, contains valuable |
359 | information about which PCI hardware does work under Linux and which | 364 | information about which PCI hardware does work under Linux and which |
360 | doesn't. | 365 | doesn't. |
361 | 366 | ||
362 | choice | 367 | choice |
363 | prompt "PCI access mode" | 368 | prompt "PCI access mode" |
364 | depends on PCI | 369 | depends on PCI |
365 | default PCI_GOANY | 370 | default PCI_GOANY |
366 | 371 | ||
367 | config PCI_GOBIOS | 372 | config PCI_GOBIOS |
368 | bool "BIOS" | 373 | bool "BIOS" |
369 | ---help--- | 374 | ---help--- |
370 | On PCI systems, the BIOS can be used to detect the PCI devices and | 375 | On PCI systems, the BIOS can be used to detect the PCI devices and |
371 | determine their configuration. However, some old PCI motherboards | 376 | determine their configuration. However, some old PCI motherboards |
372 | have BIOS bugs and may crash if this is done. Also, some embedded | 377 | have BIOS bugs and may crash if this is done. Also, some embedded |
373 | PCI-based systems don't have any BIOS at all. Linux can also try to | 378 | PCI-based systems don't have any BIOS at all. Linux can also try to |
374 | detect the PCI hardware directly without using the BIOS. | 379 | detect the PCI hardware directly without using the BIOS. |
375 | 380 | ||
376 | With this option, you can specify how Linux should detect the PCI | 381 | With this option, you can specify how Linux should detect the PCI |
377 | devices. If you choose "BIOS", the BIOS will be used, if you choose | 382 | devices. If you choose "BIOS", the BIOS will be used, if you choose |
378 | "Direct", the BIOS won't be used, and if you choose "Any", the | 383 | "Direct", the BIOS won't be used, and if you choose "Any", the |
379 | kernel will try the direct access method and falls back to the BIOS | 384 | kernel will try the direct access method and falls back to the BIOS |
380 | if that doesn't work. If unsure, go with the default, which is | 385 | if that doesn't work. If unsure, go with the default, which is |
381 | "Any". | 386 | "Any". |
382 | 387 | ||
383 | config PCI_GODIRECT | 388 | config PCI_GODIRECT |
384 | bool "Direct" | 389 | bool "Direct" |
385 | 390 | ||
386 | config PCI_GOANY | 391 | config PCI_GOANY |
387 | bool "Any" | 392 | bool "Any" |
388 | 393 | ||
389 | endchoice | 394 | endchoice |
390 | 395 | ||
391 | config PCI_BIOS | 396 | config PCI_BIOS |
392 | bool | 397 | bool |
393 | depends on PCI && (PCI_GOBIOS || PCI_GOANY) | 398 | depends on PCI && (PCI_GOBIOS || PCI_GOANY) |
394 | default y | 399 | default y |
395 | 400 | ||
396 | config PCI_DIRECT | 401 | config PCI_DIRECT |
397 | bool | 402 | bool |
398 | depends on PCI && (PCI_GODIRECT || PCI_GOANY) | 403 | depends on PCI && (PCI_GODIRECT || PCI_GOANY) |
399 | default y | 404 | default y |
400 | 405 | ||
401 | source "drivers/pci/Kconfig" | 406 | source "drivers/pci/Kconfig" |
402 | 407 | ||
403 | config ISA | 408 | config ISA |
404 | bool "ISA support" | 409 | bool "ISA support" |
405 | help | 410 | help |
406 | Find out whether you have ISA slots on your motherboard. ISA is the | 411 | Find out whether you have ISA slots on your motherboard. ISA is the |
407 | name of a bus system, i.e. the way the CPU talks to the other stuff | 412 | name of a bus system, i.e. the way the CPU talks to the other stuff |
408 | inside your box. If you have ISA, say Y, otherwise N. | 413 | inside your box. If you have ISA, say Y, otherwise N. |
409 | 414 | ||
410 | source "drivers/pcmcia/Kconfig" | 415 | source "drivers/pcmcia/Kconfig" |
411 | 416 | ||
412 | source "drivers/pci/hotplug/Kconfig" | 417 | source "drivers/pci/hotplug/Kconfig" |
413 | 418 | ||
414 | endmenu | 419 | endmenu |
415 | 420 | ||
416 | 421 | ||
417 | menu "Executable file formats" | 422 | menu "Executable file formats" |
418 | 423 | ||
419 | source "fs/Kconfig.binfmt" | 424 | source "fs/Kconfig.binfmt" |
420 | 425 | ||
421 | endmenu | 426 | endmenu |
422 | 427 | ||
423 | source "net/Kconfig" | 428 | source "net/Kconfig" |
424 | 429 | ||
425 | source "drivers/Kconfig" | 430 | source "drivers/Kconfig" |
426 | 431 | ||
427 | source "fs/Kconfig" | 432 | source "fs/Kconfig" |
428 | 433 | ||
429 | source "kernel/Kconfig.instrumentation" | 434 | source "kernel/Kconfig.instrumentation" |
430 | 435 | ||
431 | source "arch/m32r/Kconfig.debug" | 436 | source "arch/m32r/Kconfig.debug" |
432 | 437 | ||
433 | source "security/Kconfig" | 438 | source "security/Kconfig" |
434 | 439 | ||
435 | source "crypto/Kconfig" | 440 | source "crypto/Kconfig" |
436 | 441 | ||
437 | source "lib/Kconfig" | 442 | source "lib/Kconfig" |
438 | 443 |
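
The arch/m32r/Kconfig hunk above is this commit's only change to that file: it adds the GENERIC_LOCKBREAK bool, default y, restricted to SMP && PREEMPT builds. As a hedged illustration of how such a per-architecture bool is typically consumed once kbuild exports it as a CONFIG_ preprocessor macro, here is a minimal C sketch; the struct and field names are assumptions for illustration, not code taken from the diff shown here:

    /* Hedged sketch: kbuild turns the bool symbol above into the
     * CONFIG_GENERIC_LOCKBREAK macro, so generic code can compile
     * optional per-lock state in or out of the lock structure. */
    struct demo_spinlock {
            volatile unsigned int slock;    /* the lock word itself */
    #ifdef CONFIG_GENERIC_LOCKBREAK
            /* extra "someone is waiting" flag, built only for SMP && PREEMPT */
            unsigned int break_lock;
    #endif
    };

Because the symbol has no prompt string, users never see it in menuconfig; it is simply y whenever both SMP and PREEMPT are enabled for the platform.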
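
Similarly, the PCI access-mode choice in this file (PCI_GOBIOS / PCI_GODIRECT / PCI_GOANY) only sets the two derived symbols PCI_BIOS and PCI_DIRECT; the "Any" behaviour described in the help text, trying direct access first and falling back to the BIOS, would be implemented by arch code keyed off those macros. A hedged, self-contained sketch of that fallback order follows; the probe helpers are stubs invented for illustration, not functions from this commit:

    /* Hedged sketch of the "Any" fallback; helper names are assumptions. */
    static int demo_probe_direct(void) { return 0; }  /* stub: direct access failed */
    static int demo_probe_bios(void)   { return 1; }  /* stub: BIOS method worked */

    static int demo_pci_probe(void)
    {
    #ifdef CONFIG_PCI_DIRECT
            if (demo_probe_direct())
                    return 0;           /* "Direct"/"Any": direct access tried first */
    #endif
    #ifdef CONFIG_PCI_BIOS
            if (demo_probe_bios())
                    return 0;           /* "BIOS"/"Any": fall back to the BIOS */
    #endif
            return -1;                  /* neither access method available */
    }

With PCI_GOANY both derived symbols default to y, so both branches are compiled in and tried in that order.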
arch/mips/Kconfig
1 | config MIPS | 1 | config MIPS |
2 | bool | 2 | bool |
3 | default y | 3 | default y |
4 | # Horrible source of confusion. Die, die, die ... | 4 | # Horrible source of confusion. Die, die, die ... |
5 | select EMBEDDED | 5 | select EMBEDDED |
6 | select RTC_LIB | 6 | select RTC_LIB |
7 | 7 | ||
8 | mainmenu "Linux/MIPS Kernel Configuration" | 8 | mainmenu "Linux/MIPS Kernel Configuration" |
9 | 9 | ||
10 | menu "Machine selection" | 10 | menu "Machine selection" |
11 | 11 | ||
12 | config ZONE_DMA | 12 | config ZONE_DMA |
13 | bool | 13 | bool |
14 | 14 | ||
15 | choice | 15 | choice |
16 | prompt "System type" | 16 | prompt "System type" |
17 | default SGI_IP22 | 17 | default SGI_IP22 |
18 | 18 | ||
19 | config MACH_ALCHEMY | 19 | config MACH_ALCHEMY |
20 | bool "Alchemy processor based machines" | 20 | bool "Alchemy processor based machines" |
21 | 21 | ||
22 | config BASLER_EXCITE | 22 | config BASLER_EXCITE |
23 | bool "Basler eXcite smart camera" | 23 | bool "Basler eXcite smart camera" |
24 | select CEVT_R4K | 24 | select CEVT_R4K |
25 | select CSRC_R4K | 25 | select CSRC_R4K |
26 | select DMA_COHERENT | 26 | select DMA_COHERENT |
27 | select HW_HAS_PCI | 27 | select HW_HAS_PCI |
28 | select IRQ_CPU | 28 | select IRQ_CPU |
29 | select IRQ_CPU_RM7K | 29 | select IRQ_CPU_RM7K |
30 | select IRQ_CPU_RM9K | 30 | select IRQ_CPU_RM9K |
31 | select MIPS_RM9122 | 31 | select MIPS_RM9122 |
32 | select SYS_HAS_CPU_RM9000 | 32 | select SYS_HAS_CPU_RM9000 |
33 | select SYS_SUPPORTS_32BIT_KERNEL | 33 | select SYS_SUPPORTS_32BIT_KERNEL |
34 | select SYS_SUPPORTS_BIG_ENDIAN | 34 | select SYS_SUPPORTS_BIG_ENDIAN |
35 | select SYS_SUPPORTS_KGDB | 35 | select SYS_SUPPORTS_KGDB |
36 | help | 36 | help |
37 | The eXcite is a smart camera platform manufactured by | 37 | The eXcite is a smart camera platform manufactured by |
38 | Basler Vision Technologies AG. | 38 | Basler Vision Technologies AG. |
39 | 39 | ||
40 | config BCM47XX | 40 | config BCM47XX |
41 | bool "BCM47XX based boards" | 41 | bool "BCM47XX based boards" |
42 | select CEVT_R4K | 42 | select CEVT_R4K |
43 | select CSRC_R4K | 43 | select CSRC_R4K |
44 | select DMA_NONCOHERENT | 44 | select DMA_NONCOHERENT |
45 | select HW_HAS_PCI | 45 | select HW_HAS_PCI |
46 | select IRQ_CPU | 46 | select IRQ_CPU |
47 | select SYS_HAS_CPU_MIPS32_R1 | 47 | select SYS_HAS_CPU_MIPS32_R1 |
48 | select SYS_SUPPORTS_32BIT_KERNEL | 48 | select SYS_SUPPORTS_32BIT_KERNEL |
49 | select SYS_SUPPORTS_LITTLE_ENDIAN | 49 | select SYS_SUPPORTS_LITTLE_ENDIAN |
50 | select SSB | 50 | select SSB |
51 | select SSB_DRIVER_MIPS | 51 | select SSB_DRIVER_MIPS |
52 | select SSB_DRIVER_EXTIF | 52 | select SSB_DRIVER_EXTIF |
53 | select SSB_PCICORE_HOSTMODE if PCI | 53 | select SSB_PCICORE_HOSTMODE if PCI |
54 | select GENERIC_GPIO | 54 | select GENERIC_GPIO |
55 | select SYS_HAS_EARLY_PRINTK | 55 | select SYS_HAS_EARLY_PRINTK |
56 | select CFE | 56 | select CFE |
57 | help | 57 | help |
58 | Support for BCM47XX based boards | 58 | Support for BCM47XX based boards |
59 | 59 | ||
60 | config MIPS_COBALT | 60 | config MIPS_COBALT |
61 | bool "Cobalt Server" | 61 | bool "Cobalt Server" |
62 | select CEVT_R4K | 62 | select CEVT_R4K |
63 | select CSRC_R4K | 63 | select CSRC_R4K |
64 | select CEVT_GT641XX | 64 | select CEVT_GT641XX |
65 | select DMA_NONCOHERENT | 65 | select DMA_NONCOHERENT |
66 | select HW_HAS_PCI | 66 | select HW_HAS_PCI |
67 | select I8253 | 67 | select I8253 |
68 | select I8259 | 68 | select I8259 |
69 | select IRQ_CPU | 69 | select IRQ_CPU |
70 | select IRQ_GT641XX | 70 | select IRQ_GT641XX |
71 | select PCI_GT64XXX_PCI0 | 71 | select PCI_GT64XXX_PCI0 |
72 | select SYS_HAS_CPU_NEVADA | 72 | select SYS_HAS_CPU_NEVADA |
73 | select SYS_HAS_EARLY_PRINTK | 73 | select SYS_HAS_EARLY_PRINTK |
74 | select SYS_SUPPORTS_32BIT_KERNEL | 74 | select SYS_SUPPORTS_32BIT_KERNEL |
75 | select SYS_SUPPORTS_64BIT_KERNEL | 75 | select SYS_SUPPORTS_64BIT_KERNEL |
76 | select SYS_SUPPORTS_LITTLE_ENDIAN | 76 | select SYS_SUPPORTS_LITTLE_ENDIAN |
77 | select GENERIC_HARDIRQS_NO__DO_IRQ | 77 | select GENERIC_HARDIRQS_NO__DO_IRQ |
78 | 78 | ||
79 | config MACH_DECSTATION | 79 | config MACH_DECSTATION |
80 | bool "DECstations" | 80 | bool "DECstations" |
81 | select BOOT_ELF32 | 81 | select BOOT_ELF32 |
82 | select CEVT_R4K | 82 | select CEVT_R4K |
83 | select CSRC_R4K | 83 | select CSRC_R4K |
84 | select CPU_DADDI_WORKAROUNDS if 64BIT | 84 | select CPU_DADDI_WORKAROUNDS if 64BIT |
85 | select CPU_R4000_WORKAROUNDS if 64BIT | 85 | select CPU_R4000_WORKAROUNDS if 64BIT |
86 | select CPU_R4400_WORKAROUNDS if 64BIT | 86 | select CPU_R4400_WORKAROUNDS if 64BIT |
87 | select DMA_NONCOHERENT | 87 | select DMA_NONCOHERENT |
88 | select NO_IOPORT | 88 | select NO_IOPORT |
89 | select IRQ_CPU | 89 | select IRQ_CPU |
90 | select SYS_HAS_CPU_R3000 | 90 | select SYS_HAS_CPU_R3000 |
91 | select SYS_HAS_CPU_R4X00 | 91 | select SYS_HAS_CPU_R4X00 |
92 | select SYS_SUPPORTS_32BIT_KERNEL | 92 | select SYS_SUPPORTS_32BIT_KERNEL |
93 | select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL | 93 | select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL |
94 | select SYS_SUPPORTS_LITTLE_ENDIAN | 94 | select SYS_SUPPORTS_LITTLE_ENDIAN |
95 | select SYS_SUPPORTS_128HZ | 95 | select SYS_SUPPORTS_128HZ |
96 | select SYS_SUPPORTS_256HZ | 96 | select SYS_SUPPORTS_256HZ |
97 | select SYS_SUPPORTS_1024HZ | 97 | select SYS_SUPPORTS_1024HZ |
98 | help | 98 | help |
99 | This enables support for DEC's MIPS based workstations. For details | 99 | This enables support for DEC's MIPS based workstations. For details |
100 | see the Linux/MIPS FAQ on <http://www.linux-mips.org/> and the | 100 | see the Linux/MIPS FAQ on <http://www.linux-mips.org/> and the |
101 | DECstation porting pages on <http://decstation.unix-ag.org/>. | 101 | DECstation porting pages on <http://decstation.unix-ag.org/>. |
102 | 102 | ||
103 | If you have one of the following DECstation Models you definitely | 103 | If you have one of the following DECstation Models you definitely |
104 | want to choose R4xx0 for the CPU Type: | 104 | want to choose R4xx0 for the CPU Type: |
105 | 105 | ||
106 | DECstation 5000/50 | 106 | DECstation 5000/50 |
107 | DECstation 5000/150 | 107 | DECstation 5000/150 |
108 | DECstation 5000/260 | 108 | DECstation 5000/260 |
109 | DECsystem 5900/260 | 109 | DECsystem 5900/260 |
110 | 110 | ||
111 | otherwise choose R3000. | 111 | otherwise choose R3000. |
112 | 112 | ||
113 | config MACH_JAZZ | 113 | config MACH_JAZZ |
114 | bool "Jazz family of machines" | 114 | bool "Jazz family of machines" |
115 | select ARC | 115 | select ARC |
116 | select ARC32 | 116 | select ARC32 |
117 | select ARCH_MAY_HAVE_PC_FDC | 117 | select ARCH_MAY_HAVE_PC_FDC |
118 | select CEVT_R4K | 118 | select CEVT_R4K |
119 | select CSRC_R4K | 119 | select CSRC_R4K |
120 | select DEFAULT_SGI_PARTITION if CPU_BIG_ENDIAN | 120 | select DEFAULT_SGI_PARTITION if CPU_BIG_ENDIAN |
121 | select GENERIC_ISA_DMA | 121 | select GENERIC_ISA_DMA |
122 | select IRQ_CPU | 122 | select IRQ_CPU |
123 | select I8253 | 123 | select I8253 |
124 | select I8259 | 124 | select I8259 |
125 | select ISA | 125 | select ISA |
126 | select SYS_HAS_CPU_R4X00 | 126 | select SYS_HAS_CPU_R4X00 |
127 | select SYS_SUPPORTS_32BIT_KERNEL | 127 | select SYS_SUPPORTS_32BIT_KERNEL |
128 | select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL | 128 | select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL |
129 | select SYS_SUPPORTS_100HZ | 129 | select SYS_SUPPORTS_100HZ |
130 | select GENERIC_HARDIRQS_NO__DO_IRQ | 130 | select GENERIC_HARDIRQS_NO__DO_IRQ |
131 | help | 131 | help |
132 | This is a family of machines based on the MIPS R4030 chipset which was | 132 | This is a family of machines based on the MIPS R4030 chipset which was |
133 | used by several vendors to build RISC/os and Windows NT workstations. | 133 | used by several vendors to build RISC/os and Windows NT workstations. |
134 | Members include the Acer PICA, MIPS Magnum 4000, MIPS Millenium and | 134 | Members include the Acer PICA, MIPS Magnum 4000, MIPS Millenium and |
135 | Olivetti M700-10 workstations. | 135 | Olivetti M700-10 workstations. |
136 | 136 | ||
137 | config LASAT | 137 | config LASAT |
138 | bool "LASAT Networks platforms" | 138 | bool "LASAT Networks platforms" |
139 | select CEVT_R4K | 139 | select CEVT_R4K |
140 | select CSRC_R4K | 140 | select CSRC_R4K |
141 | select DMA_NONCOHERENT | 141 | select DMA_NONCOHERENT |
142 | select SYS_HAS_EARLY_PRINTK | 142 | select SYS_HAS_EARLY_PRINTK |
143 | select HW_HAS_PCI | 143 | select HW_HAS_PCI |
144 | select IRQ_CPU | 144 | select IRQ_CPU |
145 | select PCI_GT64XXX_PCI0 | 145 | select PCI_GT64XXX_PCI0 |
146 | select MIPS_NILE4 | 146 | select MIPS_NILE4 |
147 | select R5000_CPU_SCACHE | 147 | select R5000_CPU_SCACHE |
148 | select SYS_HAS_CPU_R5000 | 148 | select SYS_HAS_CPU_R5000 |
149 | select SYS_SUPPORTS_32BIT_KERNEL | 149 | select SYS_SUPPORTS_32BIT_KERNEL |
150 | select SYS_SUPPORTS_64BIT_KERNEL if BROKEN | 150 | select SYS_SUPPORTS_64BIT_KERNEL if BROKEN |
151 | select SYS_SUPPORTS_LITTLE_ENDIAN | 151 | select SYS_SUPPORTS_LITTLE_ENDIAN |
152 | select GENERIC_HARDIRQS_NO__DO_IRQ | 152 | select GENERIC_HARDIRQS_NO__DO_IRQ |
153 | 153 | ||
154 | config LEMOTE_FULONG | 154 | config LEMOTE_FULONG |
155 | bool "Lemote Fulong mini-PC" | 155 | bool "Lemote Fulong mini-PC" |
156 | select ARCH_SPARSEMEM_ENABLE | 156 | select ARCH_SPARSEMEM_ENABLE |
157 | select CEVT_R4K | 157 | select CEVT_R4K |
158 | select CSRC_R4K | 158 | select CSRC_R4K |
159 | select SYS_HAS_CPU_LOONGSON2 | 159 | select SYS_HAS_CPU_LOONGSON2 |
160 | select DMA_NONCOHERENT | 160 | select DMA_NONCOHERENT |
161 | select BOOT_ELF32 | 161 | select BOOT_ELF32 |
162 | select BOARD_SCACHE | 162 | select BOARD_SCACHE |
163 | select HAVE_STD_PC_SERIAL_PORT | 163 | select HAVE_STD_PC_SERIAL_PORT |
164 | select HW_HAS_PCI | 164 | select HW_HAS_PCI |
165 | select I8259 | 165 | select I8259 |
166 | select ISA | 166 | select ISA |
167 | select IRQ_CPU | 167 | select IRQ_CPU |
168 | select SYS_SUPPORTS_32BIT_KERNEL | 168 | select SYS_SUPPORTS_32BIT_KERNEL |
169 | select SYS_SUPPORTS_64BIT_KERNEL | 169 | select SYS_SUPPORTS_64BIT_KERNEL |
170 | select SYS_SUPPORTS_LITTLE_ENDIAN | 170 | select SYS_SUPPORTS_LITTLE_ENDIAN |
171 | select SYS_SUPPORTS_HIGHMEM | 171 | select SYS_SUPPORTS_HIGHMEM |
172 | select SYS_HAS_EARLY_PRINTK | 172 | select SYS_HAS_EARLY_PRINTK |
173 | select GENERIC_HARDIRQS_NO__DO_IRQ | 173 | select GENERIC_HARDIRQS_NO__DO_IRQ |
174 | select GENERIC_ISA_DMA_SUPPORT_BROKEN | 174 | select GENERIC_ISA_DMA_SUPPORT_BROKEN |
175 | select CPU_HAS_WB | 175 | select CPU_HAS_WB |
176 | help | 176 | help |
177 | Lemote Fulong mini-PC board based on the Chinese Loongson-2E CPU and | 177 | Lemote Fulong mini-PC board based on the Chinese Loongson-2E CPU and |
178 | an FPGA northbridge. | 178 | an FPGA northbridge. |
179 | 179 | ||
180 | config MIPS_ATLAS | 180 | config MIPS_ATLAS |
181 | bool "MIPS Atlas board" | 181 | bool "MIPS Atlas board" |
182 | select BOOT_ELF32 | 182 | select BOOT_ELF32 |
183 | select BOOT_RAW | 183 | select BOOT_RAW |
184 | select CEVT_R4K | 184 | select CEVT_R4K |
185 | select CSRC_R4K | 185 | select CSRC_R4K |
186 | select DMA_NONCOHERENT | 186 | select DMA_NONCOHERENT |
187 | select SYS_HAS_EARLY_PRINTK | 187 | select SYS_HAS_EARLY_PRINTK |
188 | select IRQ_CPU | 188 | select IRQ_CPU |
189 | select HW_HAS_PCI | 189 | select HW_HAS_PCI |
190 | select MIPS_BOARDS_GEN | 190 | select MIPS_BOARDS_GEN |
191 | select MIPS_BONITO64 | 191 | select MIPS_BONITO64 |
192 | select PCI_GT64XXX_PCI0 | 192 | select PCI_GT64XXX_PCI0 |
193 | select MIPS_MSC | 193 | select MIPS_MSC |
194 | select RM7000_CPU_SCACHE | 194 | select RM7000_CPU_SCACHE |
195 | select SWAP_IO_SPACE | 195 | select SWAP_IO_SPACE |
196 | select SYS_HAS_CPU_MIPS32_R1 | 196 | select SYS_HAS_CPU_MIPS32_R1 |
197 | select SYS_HAS_CPU_MIPS32_R2 | 197 | select SYS_HAS_CPU_MIPS32_R2 |
198 | select SYS_HAS_CPU_MIPS64_R1 | 198 | select SYS_HAS_CPU_MIPS64_R1 |
199 | select SYS_HAS_CPU_NEVADA | 199 | select SYS_HAS_CPU_NEVADA |
200 | select SYS_HAS_CPU_RM7000 | 200 | select SYS_HAS_CPU_RM7000 |
201 | select SYS_SUPPORTS_32BIT_KERNEL | 201 | select SYS_SUPPORTS_32BIT_KERNEL |
202 | select SYS_SUPPORTS_64BIT_KERNEL | 202 | select SYS_SUPPORTS_64BIT_KERNEL |
203 | select SYS_SUPPORTS_BIG_ENDIAN | 203 | select SYS_SUPPORTS_BIG_ENDIAN |
204 | select SYS_SUPPORTS_LITTLE_ENDIAN | 204 | select SYS_SUPPORTS_LITTLE_ENDIAN |
205 | select SYS_SUPPORTS_MULTITHREADING if EXPERIMENTAL | 205 | select SYS_SUPPORTS_MULTITHREADING if EXPERIMENTAL |
206 | select SYS_SUPPORTS_SMARTMIPS | 206 | select SYS_SUPPORTS_SMARTMIPS |
207 | select GENERIC_HARDIRQS_NO__DO_IRQ | 207 | select GENERIC_HARDIRQS_NO__DO_IRQ |
208 | help | 208 | help |
209 | This enables support for the MIPS Technologies Atlas evaluation | 209 | This enables support for the MIPS Technologies Atlas evaluation |
210 | board. | 210 | board. |
211 | 211 | ||
212 | config MIPS_MALTA | 212 | config MIPS_MALTA |
213 | bool "MIPS Malta board" | 213 | bool "MIPS Malta board" |
214 | select ARCH_MAY_HAVE_PC_FDC | 214 | select ARCH_MAY_HAVE_PC_FDC |
215 | select BOOT_ELF32 | 215 | select BOOT_ELF32 |
216 | select BOOT_RAW | 216 | select BOOT_RAW |
217 | select CEVT_R4K | 217 | select CEVT_R4K |
218 | select CSRC_R4K | 218 | select CSRC_R4K |
219 | select DMA_NONCOHERENT | 219 | select DMA_NONCOHERENT |
220 | select GENERIC_ISA_DMA | 220 | select GENERIC_ISA_DMA |
221 | select IRQ_CPU | 221 | select IRQ_CPU |
222 | select HW_HAS_PCI | 222 | select HW_HAS_PCI |
223 | select I8253 | 223 | select I8253 |
224 | select I8259 | 224 | select I8259 |
225 | select MIPS_BOARDS_GEN | 225 | select MIPS_BOARDS_GEN |
226 | select MIPS_BONITO64 | 226 | select MIPS_BONITO64 |
227 | select MIPS_CPU_SCACHE | 227 | select MIPS_CPU_SCACHE |
228 | select PCI_GT64XXX_PCI0 | 228 | select PCI_GT64XXX_PCI0 |
229 | select MIPS_MSC | 229 | select MIPS_MSC |
230 | select SWAP_IO_SPACE | 230 | select SWAP_IO_SPACE |
231 | select SYS_HAS_CPU_MIPS32_R1 | 231 | select SYS_HAS_CPU_MIPS32_R1 |
232 | select SYS_HAS_CPU_MIPS32_R2 | 232 | select SYS_HAS_CPU_MIPS32_R2 |
233 | select SYS_HAS_CPU_MIPS64_R1 | 233 | select SYS_HAS_CPU_MIPS64_R1 |
234 | select SYS_HAS_CPU_NEVADA | 234 | select SYS_HAS_CPU_NEVADA |
235 | select SYS_HAS_CPU_RM7000 | 235 | select SYS_HAS_CPU_RM7000 |
236 | select SYS_HAS_EARLY_PRINTK | 236 | select SYS_HAS_EARLY_PRINTK |
237 | select SYS_SUPPORTS_32BIT_KERNEL | 237 | select SYS_SUPPORTS_32BIT_KERNEL |
238 | select SYS_SUPPORTS_64BIT_KERNEL | 238 | select SYS_SUPPORTS_64BIT_KERNEL |
239 | select SYS_SUPPORTS_BIG_ENDIAN | 239 | select SYS_SUPPORTS_BIG_ENDIAN |
240 | select SYS_SUPPORTS_LITTLE_ENDIAN | 240 | select SYS_SUPPORTS_LITTLE_ENDIAN |
241 | select SYS_SUPPORTS_MULTITHREADING | 241 | select SYS_SUPPORTS_MULTITHREADING |
242 | select SYS_SUPPORTS_SMARTMIPS | 242 | select SYS_SUPPORTS_SMARTMIPS |
243 | help | 243 | help |
244 | This enables support for the MIPS Technologies Malta evaluation | 244 | This enables support for the MIPS Technologies Malta evaluation |
245 | board. | 245 | board. |
246 | 246 | ||
247 | config MIPS_SEAD | 247 | config MIPS_SEAD |
248 | bool "MIPS SEAD board" | 248 | bool "MIPS SEAD board" |
249 | select CEVT_R4K | 249 | select CEVT_R4K |
250 | select CSRC_R4K | 250 | select CSRC_R4K |
251 | select IRQ_CPU | 251 | select IRQ_CPU |
252 | select DMA_NONCOHERENT | 252 | select DMA_NONCOHERENT |
253 | select SYS_HAS_EARLY_PRINTK | 253 | select SYS_HAS_EARLY_PRINTK |
254 | select MIPS_BOARDS_GEN | 254 | select MIPS_BOARDS_GEN |
255 | select SYS_HAS_CPU_MIPS32_R1 | 255 | select SYS_HAS_CPU_MIPS32_R1 |
256 | select SYS_HAS_CPU_MIPS32_R2 | 256 | select SYS_HAS_CPU_MIPS32_R2 |
257 | select SYS_HAS_CPU_MIPS64_R1 | 257 | select SYS_HAS_CPU_MIPS64_R1 |
258 | select SYS_SUPPORTS_32BIT_KERNEL | 258 | select SYS_SUPPORTS_32BIT_KERNEL |
259 | select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL | 259 | select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL |
260 | select SYS_SUPPORTS_BIG_ENDIAN | 260 | select SYS_SUPPORTS_BIG_ENDIAN |
261 | select SYS_SUPPORTS_LITTLE_ENDIAN | 261 | select SYS_SUPPORTS_LITTLE_ENDIAN |
262 | select SYS_SUPPORTS_SMARTMIPS | 262 | select SYS_SUPPORTS_SMARTMIPS |
263 | help | 263 | help |
264 | This enables support for the MIPS Technologies SEAD evaluation | 264 | This enables support for the MIPS Technologies SEAD evaluation |
265 | board. | 265 | board. |
266 | 266 | ||
267 | config MIPS_SIM | 267 | config MIPS_SIM |
268 | bool 'MIPS simulator (MIPSsim)' | 268 | bool 'MIPS simulator (MIPSsim)' |
269 | select CEVT_R4K | 269 | select CEVT_R4K |
270 | select CSRC_R4K | 270 | select CSRC_R4K |
271 | select DMA_NONCOHERENT | 271 | select DMA_NONCOHERENT |
272 | select SYS_HAS_EARLY_PRINTK | 272 | select SYS_HAS_EARLY_PRINTK |
273 | select IRQ_CPU | 273 | select IRQ_CPU |
274 | select BOOT_RAW | 274 | select BOOT_RAW |
275 | select SYS_HAS_CPU_MIPS32_R1 | 275 | select SYS_HAS_CPU_MIPS32_R1 |
276 | select SYS_HAS_CPU_MIPS32_R2 | 276 | select SYS_HAS_CPU_MIPS32_R2 |
277 | select SYS_HAS_EARLY_PRINTK | 277 | select SYS_HAS_EARLY_PRINTK |
278 | select SYS_SUPPORTS_32BIT_KERNEL | 278 | select SYS_SUPPORTS_32BIT_KERNEL |
279 | select SYS_SUPPORTS_BIG_ENDIAN | 279 | select SYS_SUPPORTS_BIG_ENDIAN |
280 | select SYS_SUPPORTS_MULTITHREADING | 280 | select SYS_SUPPORTS_MULTITHREADING |
281 | select SYS_SUPPORTS_LITTLE_ENDIAN | 281 | select SYS_SUPPORTS_LITTLE_ENDIAN |
282 | help | 282 | help |
283 | This option enables support for MIPS Technologies MIPSsim software | 283 | This option enables support for MIPS Technologies MIPSsim software |
284 | emulator. | 284 | emulator. |
285 | 285 | ||
286 | config MARKEINS | 286 | config MARKEINS |
287 | bool "NEC EMMA2RH Mark-eins" | 287 | bool "NEC EMMA2RH Mark-eins" |
288 | select CEVT_R4K | 288 | select CEVT_R4K |
289 | select CSRC_R4K | 289 | select CSRC_R4K |
290 | select DMA_NONCOHERENT | 290 | select DMA_NONCOHERENT |
291 | select HW_HAS_PCI | 291 | select HW_HAS_PCI |
292 | select IRQ_CPU | 292 | select IRQ_CPU |
293 | select SWAP_IO_SPACE | 293 | select SWAP_IO_SPACE |
294 | select SYS_SUPPORTS_32BIT_KERNEL | 294 | select SYS_SUPPORTS_32BIT_KERNEL |
295 | select SYS_SUPPORTS_BIG_ENDIAN | 295 | select SYS_SUPPORTS_BIG_ENDIAN |
296 | select SYS_SUPPORTS_LITTLE_ENDIAN | 296 | select SYS_SUPPORTS_LITTLE_ENDIAN |
297 | select SYS_HAS_CPU_R5000 | 297 | select SYS_HAS_CPU_R5000 |
298 | help | 298 | help |
299 | This enables support for the R5432-based NEC Mark-eins | 299 | This enables support for the R5432-based NEC Mark-eins |
300 | boards with R5500 CPU. | 300 | boards with R5500 CPU. |
301 | 301 | ||
302 | config MACH_VR41XX | 302 | config MACH_VR41XX |
303 | bool "NEC VR4100 series based machines" | 303 | bool "NEC VR4100 series based machines" |
304 | select CEVT_R4K | 304 | select CEVT_R4K |
305 | select CSRC_R4K | 305 | select CSRC_R4K |
306 | select SYS_HAS_CPU_VR41XX | 306 | select SYS_HAS_CPU_VR41XX |
307 | select GENERIC_HARDIRQS_NO__DO_IRQ | 307 | select GENERIC_HARDIRQS_NO__DO_IRQ |
308 | 308 | ||
309 | config PNX8550_JBS | 309 | config PNX8550_JBS |
310 | bool "Philips PNX8550 based JBS board" | 310 | bool "Philips PNX8550 based JBS board" |
311 | select PNX8550 | 311 | select PNX8550 |
312 | select SYS_SUPPORTS_LITTLE_ENDIAN | 312 | select SYS_SUPPORTS_LITTLE_ENDIAN |
313 | 313 | ||
314 | config PNX8550_STB810 | 314 | config PNX8550_STB810 |
315 | bool "Philips PNX8550 based STB810 board" | 315 | bool "Philips PNX8550 based STB810 board" |
316 | select PNX8550 | 316 | select PNX8550 |
317 | select SYS_SUPPORTS_LITTLE_ENDIAN | 317 | select SYS_SUPPORTS_LITTLE_ENDIAN |
318 | 318 | ||
319 | config PMC_MSP | 319 | config PMC_MSP |
320 | bool "PMC-Sierra MSP chipsets" | 320 | bool "PMC-Sierra MSP chipsets" |
321 | depends on EXPERIMENTAL | 321 | depends on EXPERIMENTAL |
322 | select DMA_NONCOHERENT | 322 | select DMA_NONCOHERENT |
323 | select SWAP_IO_SPACE | 323 | select SWAP_IO_SPACE |
324 | select NO_EXCEPT_FILL | 324 | select NO_EXCEPT_FILL |
325 | select BOOT_RAW | 325 | select BOOT_RAW |
326 | select SYS_HAS_CPU_MIPS32_R1 | 326 | select SYS_HAS_CPU_MIPS32_R1 |
327 | select SYS_HAS_CPU_MIPS32_R2 | 327 | select SYS_HAS_CPU_MIPS32_R2 |
328 | select SYS_SUPPORTS_32BIT_KERNEL | 328 | select SYS_SUPPORTS_32BIT_KERNEL |
329 | select SYS_SUPPORTS_BIG_ENDIAN | 329 | select SYS_SUPPORTS_BIG_ENDIAN |
330 | select SYS_SUPPORTS_KGDB | 330 | select SYS_SUPPORTS_KGDB |
331 | select IRQ_CPU | 331 | select IRQ_CPU |
332 | select SERIAL_8250 | 332 | select SERIAL_8250 |
333 | select SERIAL_8250_CONSOLE | 333 | select SERIAL_8250_CONSOLE |
334 | help | 334 | help |
335 | This adds support for the PMC-Sierra family of Multi-Service | 335 | This adds support for the PMC-Sierra family of Multi-Service |
336 | Processor System-On-A-Chips. These parts include a number | 336 | Processor System-On-A-Chips. These parts include a number |
337 | of integrated peripherals, interfaces and DSPs in addition to | 337 | of integrated peripherals, interfaces and DSPs in addition to |
338 | a variety of MIPS cores. | 338 | a variety of MIPS cores. |
339 | 339 | ||
340 | config PMC_YOSEMITE | 340 | config PMC_YOSEMITE |
341 | bool "PMC-Sierra Yosemite eval board" | 341 | bool "PMC-Sierra Yosemite eval board" |
342 | select CEVT_R4K | 342 | select CEVT_R4K |
343 | select CSRC_R4K | 343 | select CSRC_R4K |
344 | select DMA_COHERENT | 344 | select DMA_COHERENT |
345 | select HW_HAS_PCI | 345 | select HW_HAS_PCI |
346 | select IRQ_CPU | 346 | select IRQ_CPU |
347 | select IRQ_CPU_RM7K | 347 | select IRQ_CPU_RM7K |
348 | select IRQ_CPU_RM9K | 348 | select IRQ_CPU_RM9K |
349 | select SWAP_IO_SPACE | 349 | select SWAP_IO_SPACE |
350 | select SYS_HAS_CPU_RM9000 | 350 | select SYS_HAS_CPU_RM9000 |
351 | select SYS_HAS_EARLY_PRINTK | 351 | select SYS_HAS_EARLY_PRINTK |
352 | select SYS_SUPPORTS_32BIT_KERNEL | 352 | select SYS_SUPPORTS_32BIT_KERNEL |
353 | select SYS_SUPPORTS_64BIT_KERNEL | 353 | select SYS_SUPPORTS_64BIT_KERNEL |
354 | select SYS_SUPPORTS_BIG_ENDIAN | 354 | select SYS_SUPPORTS_BIG_ENDIAN |
355 | select SYS_SUPPORTS_HIGHMEM | 355 | select SYS_SUPPORTS_HIGHMEM |
356 | select SYS_SUPPORTS_KGDB | 356 | select SYS_SUPPORTS_KGDB |
357 | select SYS_SUPPORTS_SMP | 357 | select SYS_SUPPORTS_SMP |
358 | help | 358 | help |
359 | Yosemite is an evaluation board for the RM9000x2 processor | 359 | Yosemite is an evaluation board for the RM9000x2 processor |
360 | manufactured by PMC-Sierra. | 360 | manufactured by PMC-Sierra. |
361 | 361 | ||
362 | config SGI_IP22 | 362 | config SGI_IP22 |
363 | bool "SGI IP22 (Indy/Indigo2)" | 363 | bool "SGI IP22 (Indy/Indigo2)" |
364 | select ARC | 364 | select ARC |
365 | select ARC32 | 365 | select ARC32 |
366 | select BOOT_ELF32 | 366 | select BOOT_ELF32 |
367 | select CEVT_R4K | 367 | select CEVT_R4K |
368 | select CSRC_R4K | 368 | select CSRC_R4K |
369 | select DEFAULT_SGI_PARTITION | 369 | select DEFAULT_SGI_PARTITION |
370 | select DMA_NONCOHERENT | 370 | select DMA_NONCOHERENT |
371 | select HW_HAS_EISA | 371 | select HW_HAS_EISA |
372 | select I8253 | 372 | select I8253 |
373 | select I8259 | 373 | select I8259 |
374 | select IP22_CPU_SCACHE | 374 | select IP22_CPU_SCACHE |
375 | select IRQ_CPU | 375 | select IRQ_CPU |
376 | select GENERIC_ISA_DMA_SUPPORT_BROKEN | 376 | select GENERIC_ISA_DMA_SUPPORT_BROKEN |
377 | select SGI_HAS_DS1286 | 377 | select SGI_HAS_DS1286 |
378 | select SGI_HAS_I8042 | 378 | select SGI_HAS_I8042 |
379 | select SGI_HAS_INDYDOG | 379 | select SGI_HAS_INDYDOG |
380 | select SGI_HAS_SEEQ | 380 | select SGI_HAS_SEEQ |
381 | select SGI_HAS_WD93 | 381 | select SGI_HAS_WD93 |
382 | select SGI_HAS_ZILOG | 382 | select SGI_HAS_ZILOG |
383 | select SWAP_IO_SPACE | 383 | select SWAP_IO_SPACE |
384 | select SYS_HAS_CPU_R4X00 | 384 | select SYS_HAS_CPU_R4X00 |
385 | select SYS_HAS_CPU_R5000 | 385 | select SYS_HAS_CPU_R5000 |
386 | select SYS_HAS_EARLY_PRINTK | 386 | select SYS_HAS_EARLY_PRINTK |
387 | select SYS_SUPPORTS_32BIT_KERNEL | 387 | select SYS_SUPPORTS_32BIT_KERNEL |
388 | select SYS_SUPPORTS_64BIT_KERNEL | 388 | select SYS_SUPPORTS_64BIT_KERNEL |
389 | select SYS_SUPPORTS_BIG_ENDIAN | 389 | select SYS_SUPPORTS_BIG_ENDIAN |
390 | help | 390 | help |
391 | These are the SGI Indy, Challenge S and Indigo2, as well as certain | 391 | These are the SGI Indy, Challenge S and Indigo2, as well as certain |
392 | OEM variants like the Tandem CMN B006S. To compile a Linux kernel | 392 | OEM variants like the Tandem CMN B006S. To compile a Linux kernel |
393 | that runs on these, say Y here. | 393 | that runs on these, say Y here. |
394 | 394 | ||
395 | config SGI_IP27 | 395 | config SGI_IP27 |
396 | bool "SGI IP27 (Origin200/2000)" | 396 | bool "SGI IP27 (Origin200/2000)" |
397 | select ARC | 397 | select ARC |
398 | select ARC64 | 398 | select ARC64 |
399 | select BOOT_ELF64 | 399 | select BOOT_ELF64 |
400 | select DEFAULT_SGI_PARTITION | 400 | select DEFAULT_SGI_PARTITION |
401 | select DMA_IP27 | 401 | select DMA_IP27 |
402 | select SYS_HAS_EARLY_PRINTK | 402 | select SYS_HAS_EARLY_PRINTK |
403 | select HW_HAS_PCI | 403 | select HW_HAS_PCI |
404 | select NR_CPUS_DEFAULT_64 | 404 | select NR_CPUS_DEFAULT_64 |
405 | select SYS_HAS_CPU_R10000 | 405 | select SYS_HAS_CPU_R10000 |
406 | select SYS_SUPPORTS_64BIT_KERNEL | 406 | select SYS_SUPPORTS_64BIT_KERNEL |
407 | select SYS_SUPPORTS_BIG_ENDIAN | 407 | select SYS_SUPPORTS_BIG_ENDIAN |
408 | select SYS_SUPPORTS_KGDB | 408 | select SYS_SUPPORTS_KGDB |
409 | select SYS_SUPPORTS_NUMA | 409 | select SYS_SUPPORTS_NUMA |
410 | select SYS_SUPPORTS_SMP | 410 | select SYS_SUPPORTS_SMP |
411 | select GENERIC_HARDIRQS_NO__DO_IRQ | 411 | select GENERIC_HARDIRQS_NO__DO_IRQ |
412 | help | 412 | help |
413 | These are the SGI Origin 200, Origin 2000 and Onyx 2 Graphics | 413 | These are the SGI Origin 200, Origin 2000 and Onyx 2 Graphics |
414 | workstations. To compile a Linux kernel that runs on these, say Y | 414 | workstations. To compile a Linux kernel that runs on these, say Y |
415 | here. | 415 | here. |
416 | 416 | ||
417 | config SGI_IP28 | 417 | config SGI_IP28 |
418 | bool "SGI IP28 (Indigo2 R10k) (EXPERIMENTAL)" | 418 | bool "SGI IP28 (Indigo2 R10k) (EXPERIMENTAL)" |
419 | depends on EXPERIMENTAL | 419 | depends on EXPERIMENTAL |
420 | select ARC | 420 | select ARC |
421 | select ARC64 | 421 | select ARC64 |
422 | select BOOT_ELF64 | 422 | select BOOT_ELF64 |
423 | select CEVT_R4K | 423 | select CEVT_R4K |
424 | select CSRC_R4K | 424 | select CSRC_R4K |
425 | select DEFAULT_SGI_PARTITION | 425 | select DEFAULT_SGI_PARTITION |
426 | select DMA_NONCOHERENT | 426 | select DMA_NONCOHERENT |
427 | select GENERIC_ISA_DMA_SUPPORT_BROKEN | 427 | select GENERIC_ISA_DMA_SUPPORT_BROKEN |
428 | select IRQ_CPU | 428 | select IRQ_CPU |
429 | select HW_HAS_EISA | 429 | select HW_HAS_EISA |
430 | select I8253 | 430 | select I8253 |
431 | select I8259 | 431 | select I8259 |
432 | select SGI_HAS_DS1286 | 432 | select SGI_HAS_DS1286 |
433 | select SGI_HAS_I8042 | 433 | select SGI_HAS_I8042 |
434 | select SGI_HAS_INDYDOG | 434 | select SGI_HAS_INDYDOG |
435 | select SGI_HAS_SEEQ | 435 | select SGI_HAS_SEEQ |
436 | select SGI_HAS_WD93 | 436 | select SGI_HAS_WD93 |
437 | select SGI_HAS_ZILOG | 437 | select SGI_HAS_ZILOG |
438 | select SWAP_IO_SPACE | 438 | select SWAP_IO_SPACE |
439 | select SYS_HAS_CPU_R10000 | 439 | select SYS_HAS_CPU_R10000 |
440 | select SYS_HAS_EARLY_PRINTK | 440 | select SYS_HAS_EARLY_PRINTK |
441 | select SYS_SUPPORTS_64BIT_KERNEL | 441 | select SYS_SUPPORTS_64BIT_KERNEL |
442 | select SYS_SUPPORTS_BIG_ENDIAN | 442 | select SYS_SUPPORTS_BIG_ENDIAN |
443 | help | 443 | help |
444 | This is the SGI Indigo2 with R10000 processor. To compile a Linux | 444 | This is the SGI Indigo2 with R10000 processor. To compile a Linux |
445 | kernel that runs on these, say Y here. | 445 | kernel that runs on these, say Y here. |
446 | 446 | ||
447 | config SGI_IP32 | 447 | config SGI_IP32 |
448 | bool "SGI IP32 (O2)" | 448 | bool "SGI IP32 (O2)" |
449 | select ARC | 449 | select ARC |
450 | select ARC32 | 450 | select ARC32 |
451 | select BOOT_ELF32 | 451 | select BOOT_ELF32 |
452 | select CEVT_R4K | 452 | select CEVT_R4K |
453 | select CSRC_R4K | 453 | select CSRC_R4K |
454 | select DMA_NONCOHERENT | 454 | select DMA_NONCOHERENT |
455 | select HW_HAS_PCI | 455 | select HW_HAS_PCI |
456 | select IRQ_CPU | 456 | select IRQ_CPU |
457 | select R5000_CPU_SCACHE | 457 | select R5000_CPU_SCACHE |
458 | select RM7000_CPU_SCACHE | 458 | select RM7000_CPU_SCACHE |
459 | select SYS_HAS_CPU_R5000 | 459 | select SYS_HAS_CPU_R5000 |
460 | select SYS_HAS_CPU_R10000 if BROKEN | 460 | select SYS_HAS_CPU_R10000 if BROKEN |
461 | select SYS_HAS_CPU_RM7000 | 461 | select SYS_HAS_CPU_RM7000 |
462 | select SYS_HAS_CPU_NEVADA | 462 | select SYS_HAS_CPU_NEVADA |
463 | select SYS_SUPPORTS_64BIT_KERNEL | 463 | select SYS_SUPPORTS_64BIT_KERNEL |
464 | select SYS_SUPPORTS_BIG_ENDIAN | 464 | select SYS_SUPPORTS_BIG_ENDIAN |
465 | help | 465 | help |
466 | If you want this kernel to run on an SGI O2 workstation, say Y here. | 466 | If you want this kernel to run on an SGI O2 workstation, say Y here. |
467 | 467 | ||
468 | config SIBYTE_CRHINE | 468 | config SIBYTE_CRHINE |
469 | bool "Sibyte BCM91120C-CRhine" | 469 | bool "Sibyte BCM91120C-CRhine" |
470 | depends on EXPERIMENTAL | 470 | depends on EXPERIMENTAL |
471 | select BOOT_ELF32 | 471 | select BOOT_ELF32 |
472 | select DMA_COHERENT | 472 | select DMA_COHERENT |
473 | select SIBYTE_BCM1120 | 473 | select SIBYTE_BCM1120 |
474 | select SWAP_IO_SPACE | 474 | select SWAP_IO_SPACE |
475 | select SYS_HAS_CPU_SB1 | 475 | select SYS_HAS_CPU_SB1 |
476 | select SYS_SUPPORTS_BIG_ENDIAN | 476 | select SYS_SUPPORTS_BIG_ENDIAN |
477 | select SYS_SUPPORTS_LITTLE_ENDIAN | 477 | select SYS_SUPPORTS_LITTLE_ENDIAN |
478 | 478 | ||
479 | config SIBYTE_CARMEL | 479 | config SIBYTE_CARMEL |
480 | bool "Sibyte BCM91120x-Carmel" | 480 | bool "Sibyte BCM91120x-Carmel" |
481 | depends on EXPERIMENTAL | 481 | depends on EXPERIMENTAL |
482 | select BOOT_ELF32 | 482 | select BOOT_ELF32 |
483 | select DMA_COHERENT | 483 | select DMA_COHERENT |
484 | select SIBYTE_BCM1120 | 484 | select SIBYTE_BCM1120 |
485 | select SWAP_IO_SPACE | 485 | select SWAP_IO_SPACE |
486 | select SYS_HAS_CPU_SB1 | 486 | select SYS_HAS_CPU_SB1 |
487 | select SYS_SUPPORTS_BIG_ENDIAN | 487 | select SYS_SUPPORTS_BIG_ENDIAN |
488 | select SYS_SUPPORTS_LITTLE_ENDIAN | 488 | select SYS_SUPPORTS_LITTLE_ENDIAN |
489 | 489 | ||
490 | config SIBYTE_CRHONE | 490 | config SIBYTE_CRHONE |
491 | bool "Sibyte BCM91125C-CRhone" | 491 | bool "Sibyte BCM91125C-CRhone" |
492 | depends on EXPERIMENTAL | 492 | depends on EXPERIMENTAL |
493 | select BOOT_ELF32 | 493 | select BOOT_ELF32 |
494 | select DMA_COHERENT | 494 | select DMA_COHERENT |
495 | select SIBYTE_BCM1125 | 495 | select SIBYTE_BCM1125 |
496 | select SWAP_IO_SPACE | 496 | select SWAP_IO_SPACE |
497 | select SYS_HAS_CPU_SB1 | 497 | select SYS_HAS_CPU_SB1 |
498 | select SYS_SUPPORTS_BIG_ENDIAN | 498 | select SYS_SUPPORTS_BIG_ENDIAN |
499 | select SYS_SUPPORTS_HIGHMEM | 499 | select SYS_SUPPORTS_HIGHMEM |
500 | select SYS_SUPPORTS_LITTLE_ENDIAN | 500 | select SYS_SUPPORTS_LITTLE_ENDIAN |
501 | 501 | ||
502 | config SIBYTE_RHONE | 502 | config SIBYTE_RHONE |
503 | bool "Sibyte BCM91125E-Rhone" | 503 | bool "Sibyte BCM91125E-Rhone" |
504 | depends on EXPERIMENTAL | 504 | depends on EXPERIMENTAL |
505 | select BOOT_ELF32 | 505 | select BOOT_ELF32 |
506 | select DMA_COHERENT | 506 | select DMA_COHERENT |
507 | select SIBYTE_BCM1125H | 507 | select SIBYTE_BCM1125H |
508 | select SWAP_IO_SPACE | 508 | select SWAP_IO_SPACE |
509 | select SYS_HAS_CPU_SB1 | 509 | select SYS_HAS_CPU_SB1 |
510 | select SYS_SUPPORTS_BIG_ENDIAN | 510 | select SYS_SUPPORTS_BIG_ENDIAN |
511 | select SYS_SUPPORTS_LITTLE_ENDIAN | 511 | select SYS_SUPPORTS_LITTLE_ENDIAN |
512 | 512 | ||
513 | config SIBYTE_SWARM | 513 | config SIBYTE_SWARM |
514 | bool "Sibyte BCM91250A-SWARM" | 514 | bool "Sibyte BCM91250A-SWARM" |
515 | select BOOT_ELF32 | 515 | select BOOT_ELF32 |
516 | select DMA_COHERENT | 516 | select DMA_COHERENT |
517 | select NR_CPUS_DEFAULT_2 | 517 | select NR_CPUS_DEFAULT_2 |
518 | select SIBYTE_SB1250 | 518 | select SIBYTE_SB1250 |
519 | select SWAP_IO_SPACE | 519 | select SWAP_IO_SPACE |
520 | select SYS_HAS_CPU_SB1 | 520 | select SYS_HAS_CPU_SB1 |
521 | select SYS_SUPPORTS_BIG_ENDIAN | 521 | select SYS_SUPPORTS_BIG_ENDIAN |
522 | select SYS_SUPPORTS_HIGHMEM | 522 | select SYS_SUPPORTS_HIGHMEM |
523 | select SYS_SUPPORTS_KGDB | 523 | select SYS_SUPPORTS_KGDB |
524 | select SYS_SUPPORTS_LITTLE_ENDIAN | 524 | select SYS_SUPPORTS_LITTLE_ENDIAN |
525 | select ZONE_DMA32 if 64BIT | 525 | select ZONE_DMA32 if 64BIT |
526 | 526 | ||
527 | config SIBYTE_LITTLESUR | 527 | config SIBYTE_LITTLESUR |
528 | bool "Sibyte BCM91250C2-LittleSur" | 528 | bool "Sibyte BCM91250C2-LittleSur" |
529 | depends on EXPERIMENTAL | 529 | depends on EXPERIMENTAL |
530 | select BOOT_ELF32 | 530 | select BOOT_ELF32 |
531 | select DMA_COHERENT | 531 | select DMA_COHERENT |
532 | select NR_CPUS_DEFAULT_2 | 532 | select NR_CPUS_DEFAULT_2 |
533 | select SIBYTE_SB1250 | 533 | select SIBYTE_SB1250 |
534 | select SWAP_IO_SPACE | 534 | select SWAP_IO_SPACE |
535 | select SYS_HAS_CPU_SB1 | 535 | select SYS_HAS_CPU_SB1 |
536 | select SYS_SUPPORTS_BIG_ENDIAN | 536 | select SYS_SUPPORTS_BIG_ENDIAN |
537 | select SYS_SUPPORTS_HIGHMEM | 537 | select SYS_SUPPORTS_HIGHMEM |
538 | select SYS_SUPPORTS_LITTLE_ENDIAN | 538 | select SYS_SUPPORTS_LITTLE_ENDIAN |
539 | 539 | ||
540 | config SIBYTE_SENTOSA | 540 | config SIBYTE_SENTOSA |
541 | bool "Sibyte BCM91250E-Sentosa" | 541 | bool "Sibyte BCM91250E-Sentosa" |
542 | depends on EXPERIMENTAL | 542 | depends on EXPERIMENTAL |
543 | select BOOT_ELF32 | 543 | select BOOT_ELF32 |
544 | select DMA_COHERENT | 544 | select DMA_COHERENT |
545 | select NR_CPUS_DEFAULT_2 | 545 | select NR_CPUS_DEFAULT_2 |
546 | select SIBYTE_SB1250 | 546 | select SIBYTE_SB1250 |
547 | select SWAP_IO_SPACE | 547 | select SWAP_IO_SPACE |
548 | select SYS_HAS_CPU_SB1 | 548 | select SYS_HAS_CPU_SB1 |
549 | select SYS_SUPPORTS_BIG_ENDIAN | 549 | select SYS_SUPPORTS_BIG_ENDIAN |
550 | select SYS_SUPPORTS_LITTLE_ENDIAN | 550 | select SYS_SUPPORTS_LITTLE_ENDIAN |
551 | 551 | ||
552 | config SIBYTE_BIGSUR | 552 | config SIBYTE_BIGSUR |
553 | bool "Sibyte BCM91480B-BigSur" | 553 | bool "Sibyte BCM91480B-BigSur" |
554 | select BOOT_ELF32 | 554 | select BOOT_ELF32 |
555 | select DMA_COHERENT | 555 | select DMA_COHERENT |
556 | select NR_CPUS_DEFAULT_4 | 556 | select NR_CPUS_DEFAULT_4 |
557 | select SIBYTE_BCM1x80 | 557 | select SIBYTE_BCM1x80 |
558 | select SWAP_IO_SPACE | 558 | select SWAP_IO_SPACE |
559 | select SYS_HAS_CPU_SB1 | 559 | select SYS_HAS_CPU_SB1 |
560 | select SYS_SUPPORTS_BIG_ENDIAN | 560 | select SYS_SUPPORTS_BIG_ENDIAN |
561 | select SYS_SUPPORTS_HIGHMEM | 561 | select SYS_SUPPORTS_HIGHMEM |
562 | select SYS_SUPPORTS_LITTLE_ENDIAN | 562 | select SYS_SUPPORTS_LITTLE_ENDIAN |
563 | select ZONE_DMA32 if 64BIT | 563 | select ZONE_DMA32 if 64BIT |
564 | 564 | ||
565 | config SNI_RM | 565 | config SNI_RM |
566 | bool "SNI RM200/300/400" | 566 | bool "SNI RM200/300/400" |
567 | select ARC if CPU_LITTLE_ENDIAN | 567 | select ARC if CPU_LITTLE_ENDIAN |
568 | select ARC32 if CPU_LITTLE_ENDIAN | 568 | select ARC32 if CPU_LITTLE_ENDIAN |
569 | select SNIPROM if CPU_BIG_ENDIAN | 569 | select SNIPROM if CPU_BIG_ENDIAN |
570 | select ARCH_MAY_HAVE_PC_FDC | 570 | select ARCH_MAY_HAVE_PC_FDC |
571 | select BOOT_ELF32 | 571 | select BOOT_ELF32 |
572 | select CEVT_R4K | 572 | select CEVT_R4K |
573 | select CSRC_R4K | 573 | select CSRC_R4K |
574 | select DEFAULT_SGI_PARTITION if CPU_BIG_ENDIAN | 574 | select DEFAULT_SGI_PARTITION if CPU_BIG_ENDIAN |
575 | select DMA_NONCOHERENT | 575 | select DMA_NONCOHERENT |
576 | select GENERIC_ISA_DMA | 576 | select GENERIC_ISA_DMA |
577 | select HW_HAS_EISA | 577 | select HW_HAS_EISA |
578 | select HW_HAS_PCI | 578 | select HW_HAS_PCI |
579 | select IRQ_CPU | 579 | select IRQ_CPU |
580 | select I8253 | 580 | select I8253 |
581 | select I8259 | 581 | select I8259 |
582 | select ISA | 582 | select ISA |
583 | select SWAP_IO_SPACE if CPU_BIG_ENDIAN | 583 | select SWAP_IO_SPACE if CPU_BIG_ENDIAN |
584 | select SYS_HAS_CPU_R4X00 | 584 | select SYS_HAS_CPU_R4X00 |
585 | select SYS_HAS_CPU_R5000 | 585 | select SYS_HAS_CPU_R5000 |
586 | select SYS_HAS_CPU_R10000 | 586 | select SYS_HAS_CPU_R10000 |
587 | select R5000_CPU_SCACHE | 587 | select R5000_CPU_SCACHE |
588 | select SYS_HAS_EARLY_PRINTK | 588 | select SYS_HAS_EARLY_PRINTK |
589 | select SYS_SUPPORTS_32BIT_KERNEL | 589 | select SYS_SUPPORTS_32BIT_KERNEL |
590 | select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL | 590 | select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL |
591 | select SYS_SUPPORTS_BIG_ENDIAN | 591 | select SYS_SUPPORTS_BIG_ENDIAN |
592 | select SYS_SUPPORTS_HIGHMEM | 592 | select SYS_SUPPORTS_HIGHMEM |
593 | select SYS_SUPPORTS_LITTLE_ENDIAN | 593 | select SYS_SUPPORTS_LITTLE_ENDIAN |
594 | help | 594 | help |
595 | The SNI RM200/300/400 are MIPS-based machines manufactured by | 595 | The SNI RM200/300/400 are MIPS-based machines manufactured by |
596 | Siemens Nixdorf Informationssysteme (SNI), parent company of Pyramid | 596 | Siemens Nixdorf Informationssysteme (SNI), parent company of Pyramid |
597 | Technology and now in turn merged with Fujitsu. Say Y here to | 597 | Technology and now in turn merged with Fujitsu. Say Y here to |
598 | support this machine type. | 598 | support this machine type. |
599 | 599 | ||
600 | config TOSHIBA_JMR3927 | 600 | config TOSHIBA_JMR3927 |
601 | bool "Toshiba JMR-TX3927 board" | 601 | bool "Toshiba JMR-TX3927 board" |
602 | select CEVT_TXX9 | 602 | select CEVT_TXX9 |
603 | select DMA_NONCOHERENT | 603 | select DMA_NONCOHERENT |
604 | select HW_HAS_PCI | 604 | select HW_HAS_PCI |
605 | select MIPS_TX3927 | 605 | select MIPS_TX3927 |
606 | select IRQ_TXX9 | 606 | select IRQ_TXX9 |
607 | select SWAP_IO_SPACE | 607 | select SWAP_IO_SPACE |
608 | select SYS_HAS_CPU_TX39XX | 608 | select SYS_HAS_CPU_TX39XX |
609 | select SYS_SUPPORTS_32BIT_KERNEL | 609 | select SYS_SUPPORTS_32BIT_KERNEL |
610 | select SYS_SUPPORTS_LITTLE_ENDIAN | 610 | select SYS_SUPPORTS_LITTLE_ENDIAN |
611 | select SYS_SUPPORTS_BIG_ENDIAN | 611 | select SYS_SUPPORTS_BIG_ENDIAN |
612 | select GENERIC_HARDIRQS_NO__DO_IRQ | 612 | select GENERIC_HARDIRQS_NO__DO_IRQ |
613 | 613 | ||
614 | config TOSHIBA_RBTX4927 | 614 | config TOSHIBA_RBTX4927 |
615 | bool "Toshiba RBTX49[23]7 board" | 615 | bool "Toshiba RBTX49[23]7 board" |
616 | select CEVT_R4K | 616 | select CEVT_R4K |
617 | select CSRC_R4K | 617 | select CSRC_R4K |
618 | select CEVT_TXX9 | 618 | select CEVT_TXX9 |
619 | select DMA_NONCOHERENT | 619 | select DMA_NONCOHERENT |
620 | select HAS_TXX9_SERIAL | 620 | select HAS_TXX9_SERIAL |
621 | select HW_HAS_PCI | 621 | select HW_HAS_PCI |
622 | select IRQ_CPU | 622 | select IRQ_CPU |
623 | select IRQ_TXX9 | 623 | select IRQ_TXX9 |
624 | select I8259 if TOSHIBA_FPCIB0 | 624 | select I8259 if TOSHIBA_FPCIB0 |
625 | select SWAP_IO_SPACE | 625 | select SWAP_IO_SPACE |
626 | select SYS_HAS_CPU_TX49XX | 626 | select SYS_HAS_CPU_TX49XX |
627 | select SYS_SUPPORTS_32BIT_KERNEL | 627 | select SYS_SUPPORTS_32BIT_KERNEL |
628 | select SYS_SUPPORTS_64BIT_KERNEL | 628 | select SYS_SUPPORTS_64BIT_KERNEL |
629 | select SYS_SUPPORTS_LITTLE_ENDIAN | 629 | select SYS_SUPPORTS_LITTLE_ENDIAN |
630 | select SYS_SUPPORTS_BIG_ENDIAN | 630 | select SYS_SUPPORTS_BIG_ENDIAN |
631 | select SYS_SUPPORTS_KGDB | 631 | select SYS_SUPPORTS_KGDB |
632 | select GENERIC_HARDIRQS_NO__DO_IRQ | 632 | select GENERIC_HARDIRQS_NO__DO_IRQ |
633 | help | 633 | help |
634 | This Toshiba board is based on the TX4927 processor. Say Y here to | 634 | This Toshiba board is based on the TX4927 processor. Say Y here to |
635 | support this machine type. | 635 | support this machine type. |
636 | 636 | ||
637 | config TOSHIBA_RBTX4938 | 637 | config TOSHIBA_RBTX4938 |
638 | bool "Toshiba RBTX4938 board" | 638 | bool "Toshiba RBTX4938 board" |
639 | select CEVT_R4K | 639 | select CEVT_R4K |
640 | select CSRC_R4K | 640 | select CSRC_R4K |
641 | select CEVT_TXX9 | 641 | select CEVT_TXX9 |
642 | select DMA_NONCOHERENT | 642 | select DMA_NONCOHERENT |
643 | select HAS_TXX9_SERIAL | 643 | select HAS_TXX9_SERIAL |
644 | select HW_HAS_PCI | 644 | select HW_HAS_PCI |
645 | select IRQ_CPU | 645 | select IRQ_CPU |
646 | select IRQ_TXX9 | 646 | select IRQ_TXX9 |
647 | select SWAP_IO_SPACE | 647 | select SWAP_IO_SPACE |
648 | select SYS_HAS_CPU_TX49XX | 648 | select SYS_HAS_CPU_TX49XX |
649 | select SYS_SUPPORTS_32BIT_KERNEL | 649 | select SYS_SUPPORTS_32BIT_KERNEL |
650 | select SYS_SUPPORTS_LITTLE_ENDIAN | 650 | select SYS_SUPPORTS_LITTLE_ENDIAN |
651 | select SYS_SUPPORTS_BIG_ENDIAN | 651 | select SYS_SUPPORTS_BIG_ENDIAN |
652 | select SYS_SUPPORTS_KGDB | 652 | select SYS_SUPPORTS_KGDB |
653 | select GENERIC_HARDIRQS_NO__DO_IRQ | 653 | select GENERIC_HARDIRQS_NO__DO_IRQ |
654 | select GENERIC_GPIO | 654 | select GENERIC_GPIO |
655 | help | 655 | help |
656 | This Toshiba board is based on the TX4938 processor. Say Y here to | 656 | This Toshiba board is based on the TX4938 processor. Say Y here to |
657 | support this machine type. | 657 | support this machine type. |
658 | 658 | ||
659 | config WR_PPMC | 659 | config WR_PPMC |
660 | bool "Wind River PPMC board" | 660 | bool "Wind River PPMC board" |
661 | select CEVT_R4K | 661 | select CEVT_R4K |
662 | select CSRC_R4K | 662 | select CSRC_R4K |
663 | select IRQ_CPU | 663 | select IRQ_CPU |
664 | select BOOT_ELF32 | 664 | select BOOT_ELF32 |
665 | select DMA_NONCOHERENT | 665 | select DMA_NONCOHERENT |
666 | select HW_HAS_PCI | 666 | select HW_HAS_PCI |
667 | select PCI_GT64XXX_PCI0 | 667 | select PCI_GT64XXX_PCI0 |
668 | select SWAP_IO_SPACE | 668 | select SWAP_IO_SPACE |
669 | select SYS_HAS_CPU_MIPS32_R1 | 669 | select SYS_HAS_CPU_MIPS32_R1 |
670 | select SYS_HAS_CPU_MIPS32_R2 | 670 | select SYS_HAS_CPU_MIPS32_R2 |
671 | select SYS_HAS_CPU_MIPS64_R1 | 671 | select SYS_HAS_CPU_MIPS64_R1 |
672 | select SYS_HAS_CPU_NEVADA | 672 | select SYS_HAS_CPU_NEVADA |
673 | select SYS_HAS_CPU_RM7000 | 673 | select SYS_HAS_CPU_RM7000 |
674 | select SYS_SUPPORTS_32BIT_KERNEL | 674 | select SYS_SUPPORTS_32BIT_KERNEL |
675 | select SYS_SUPPORTS_64BIT_KERNEL | 675 | select SYS_SUPPORTS_64BIT_KERNEL |
676 | select SYS_SUPPORTS_BIG_ENDIAN | 676 | select SYS_SUPPORTS_BIG_ENDIAN |
677 | select SYS_SUPPORTS_LITTLE_ENDIAN | 677 | select SYS_SUPPORTS_LITTLE_ENDIAN |
678 | help | 678 | help |
679 | This enables support for the Wind River MIPS32 4KC PPMC evaluation | 679 | This enables support for the Wind River MIPS32 4KC PPMC evaluation |
680 | board, which is based on the GT64120 bridge chip. | 680 | board, which is based on the GT64120 bridge chip. |
681 | 681 | ||
682 | endchoice | 682 | endchoice |
683 | 683 | ||
684 | source "arch/mips/au1000/Kconfig" | 684 | source "arch/mips/au1000/Kconfig" |
685 | source "arch/mips/basler/excite/Kconfig" | 685 | source "arch/mips/basler/excite/Kconfig" |
686 | source "arch/mips/jazz/Kconfig" | 686 | source "arch/mips/jazz/Kconfig" |
687 | source "arch/mips/lasat/Kconfig" | 687 | source "arch/mips/lasat/Kconfig" |
688 | source "arch/mips/pmc-sierra/Kconfig" | 688 | source "arch/mips/pmc-sierra/Kconfig" |
689 | source "arch/mips/sgi-ip27/Kconfig" | 689 | source "arch/mips/sgi-ip27/Kconfig" |
690 | source "arch/mips/sibyte/Kconfig" | 690 | source "arch/mips/sibyte/Kconfig" |
691 | source "arch/mips/tx4927/Kconfig" | 691 | source "arch/mips/tx4927/Kconfig" |
692 | source "arch/mips/tx4938/Kconfig" | 692 | source "arch/mips/tx4938/Kconfig" |
693 | source "arch/mips/vr41xx/Kconfig" | 693 | source "arch/mips/vr41xx/Kconfig" |
694 | 694 | ||
695 | endmenu | 695 | endmenu |
696 | 696 | ||
697 | config GENERIC_LOCKBREAK | ||
698 | bool | ||
699 | default y | ||
700 | depends on SMP && PREEMPT | ||
701 | |||
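
The hunk above adds GENERIC_LOCKBREAK for MIPS as a hidden bool: it defaults to y but its dependency means the symbol is only y when both SMP and PREEMPT are enabled, and stays off otherwise. As a minimal, hedged sketch of how such a symbol is typically consumed (assuming the usual Kconfig flow, where the symbol becomes a CONFIG_GENERIC_LOCKBREAK preprocessor define that C code can test) — the struct and names below are purely illustrative and not code taken from this commit:

/*
 * Illustrative only: with the MIPS entry above, CONFIG_GENERIC_LOCKBREAK is
 * defined for SMP && PREEMPT builds and undefined otherwise, so per-lock
 * lockbreak state can be compiled in conditionally.
 */
struct demo_lock {			/* hypothetical type, for illustration */
	volatile unsigned int slock;	/* the lock word itself */
#ifdef CONFIG_GENERIC_LOCKBREAK
	unsigned int break_lock;	/* extra field built only when the symbol is y */
#endif
};
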
697 | config RWSEM_GENERIC_SPINLOCK | 702 | config RWSEM_GENERIC_SPINLOCK |
698 | bool | 703 | bool |
699 | default y | 704 | default y |
700 | 705 | ||
701 | config RWSEM_XCHGADD_ALGORITHM | 706 | config RWSEM_XCHGADD_ALGORITHM |
702 | bool | 707 | bool |
703 | 708 | ||
704 | config ARCH_HAS_ILOG2_U32 | 709 | config ARCH_HAS_ILOG2_U32 |
705 | bool | 710 | bool |
706 | default n | 711 | default n |
707 | 712 | ||
708 | config ARCH_HAS_ILOG2_U64 | 713 | config ARCH_HAS_ILOG2_U64 |
709 | bool | 714 | bool |
710 | default n | 715 | default n |
711 | 716 | ||
712 | config ARCH_SUPPORTS_OPROFILE | 717 | config ARCH_SUPPORTS_OPROFILE |
713 | bool | 718 | bool |
714 | default y if !MIPS_MT_SMTC | 719 | default y if !MIPS_MT_SMTC |
715 | 720 | ||
716 | config GENERIC_FIND_NEXT_BIT | 721 | config GENERIC_FIND_NEXT_BIT |
717 | bool | 722 | bool |
718 | default y | 723 | default y |
719 | 724 | ||
720 | config GENERIC_HWEIGHT | 725 | config GENERIC_HWEIGHT |
721 | bool | 726 | bool |
722 | default y | 727 | default y |
723 | 728 | ||
724 | config GENERIC_CALIBRATE_DELAY | 729 | config GENERIC_CALIBRATE_DELAY |
725 | bool | 730 | bool |
726 | default y | 731 | default y |
727 | 732 | ||
728 | config GENERIC_CLOCKEVENTS | 733 | config GENERIC_CLOCKEVENTS |
729 | bool | 734 | bool |
730 | default y | 735 | default y |
731 | 736 | ||
732 | config GENERIC_TIME | 737 | config GENERIC_TIME |
733 | bool | 738 | bool |
734 | default y | 739 | default y |
735 | 740 | ||
736 | config GENERIC_CMOS_UPDATE | 741 | config GENERIC_CMOS_UPDATE |
737 | bool | 742 | bool |
738 | default y | 743 | default y |
739 | 744 | ||
740 | config SCHED_NO_NO_OMIT_FRAME_POINTER | 745 | config SCHED_NO_NO_OMIT_FRAME_POINTER |
741 | bool | 746 | bool |
742 | default y | 747 | default y |
743 | 748 | ||
744 | config GENERIC_HARDIRQS_NO__DO_IRQ | 749 | config GENERIC_HARDIRQS_NO__DO_IRQ |
745 | bool | 750 | bool |
746 | default n | 751 | default n |
747 | 752 | ||
748 | # | 753 | # |
749 | # Select some configuration options automatically based on user selections. | 754 | # Select some configuration options automatically based on user selections. |
750 | # | 755 | # |
751 | config ARC | 756 | config ARC |
752 | bool | 757 | bool |
753 | 758 | ||
754 | config ARCH_MAY_HAVE_PC_FDC | 759 | config ARCH_MAY_HAVE_PC_FDC |
755 | bool | 760 | bool |
756 | 761 | ||
757 | config BOOT_RAW | 762 | config BOOT_RAW |
758 | bool | 763 | bool |
759 | 764 | ||
760 | config CEVT_BCM1480 | 765 | config CEVT_BCM1480 |
761 | bool | 766 | bool |
762 | 767 | ||
763 | config CEVT_GT641XX | 768 | config CEVT_GT641XX |
764 | bool | 769 | bool |
765 | 770 | ||
766 | config CEVT_R4K | 771 | config CEVT_R4K |
767 | bool | 772 | bool |
768 | 773 | ||
769 | config CEVT_SB1250 | 774 | config CEVT_SB1250 |
770 | bool | 775 | bool |
771 | 776 | ||
772 | config CEVT_TXX9 | 777 | config CEVT_TXX9 |
773 | bool | 778 | bool |
774 | 779 | ||
775 | config CSRC_BCM1480 | 780 | config CSRC_BCM1480 |
776 | bool | 781 | bool |
777 | 782 | ||
778 | config CSRC_R4K | 783 | config CSRC_R4K |
779 | bool | 784 | bool |
780 | 785 | ||
781 | config CSRC_SB1250 | 786 | config CSRC_SB1250 |
782 | bool | 787 | bool |
783 | 788 | ||
784 | config CFE | 789 | config CFE |
785 | bool | 790 | bool |
786 | 791 | ||
787 | config DMA_COHERENT | 792 | config DMA_COHERENT |
788 | bool | 793 | bool |
789 | 794 | ||
790 | config DMA_IP27 | 795 | config DMA_IP27 |
791 | bool | 796 | bool |
792 | 797 | ||
793 | config DMA_NONCOHERENT | 798 | config DMA_NONCOHERENT |
794 | bool | 799 | bool |
795 | select DMA_NEED_PCI_MAP_STATE | 800 | select DMA_NEED_PCI_MAP_STATE |
796 | 801 | ||
797 | config DMA_NEED_PCI_MAP_STATE | 802 | config DMA_NEED_PCI_MAP_STATE |
798 | bool | 803 | bool |
799 | 804 | ||
800 | config EARLY_PRINTK | 805 | config EARLY_PRINTK |
801 | bool "Early printk" if EMBEDDED && DEBUG_KERNEL | 806 | bool "Early printk" if EMBEDDED && DEBUG_KERNEL |
802 | depends on SYS_HAS_EARLY_PRINTK | 807 | depends on SYS_HAS_EARLY_PRINTK |
803 | default y | 808 | default y |
804 | help | 809 | help |
805 | This option enables special console drivers which allow the kernel | 810 | This option enables special console drivers which allow the kernel |
806 | to print messages very early in the bootup process. | 811 | to print messages very early in the bootup process. |
807 | 812 | ||
808 | This is useful for kernel debugging when your machine crashes very | 813 | This is useful for kernel debugging when your machine crashes very |
809 | early before the console code is initialized. For normal operation, | 814 | early before the console code is initialized. For normal operation, |
810 | it is not recommended because it looks ugly on some machines and | 815 | it is not recommended because it looks ugly on some machines and |
811 | doesn't cooperate with an X server. You should normally say N here, | 816 | doesn't cooperate with an X server. You should normally say N here, |
812 | unless you want to debug such a crash. | 817 | unless you want to debug such a crash. |
813 | 818 | ||
814 | config SYS_HAS_EARLY_PRINTK | 819 | config SYS_HAS_EARLY_PRINTK |
815 | bool | 820 | bool |
816 | 821 | ||
817 | config HOTPLUG_CPU | 822 | config HOTPLUG_CPU |
818 | bool | 823 | bool |
819 | default n | 824 | default n |
820 | 825 | ||
821 | config I8259 | 826 | config I8259 |
822 | bool | 827 | bool |
823 | 828 | ||
824 | config MIPS_BONITO64 | 829 | config MIPS_BONITO64 |
825 | bool | 830 | bool |
826 | 831 | ||
827 | config MIPS_MSC | 832 | config MIPS_MSC |
828 | bool | 833 | bool |
829 | 834 | ||
830 | config MIPS_NILE4 | 835 | config MIPS_NILE4 |
831 | bool | 836 | bool |
832 | 837 | ||
833 | config MIPS_DISABLE_OBSOLETE_IDE | 838 | config MIPS_DISABLE_OBSOLETE_IDE |
834 | bool | 839 | bool |
835 | 840 | ||
836 | config NO_IOPORT | 841 | config NO_IOPORT |
837 | def_bool n | 842 | def_bool n |
838 | 843 | ||
839 | config GENERIC_ISA_DMA | 844 | config GENERIC_ISA_DMA |
840 | bool | 845 | bool |
841 | select ZONE_DMA if GENERIC_ISA_DMA_SUPPORT_BROKEN=n | 846 | select ZONE_DMA if GENERIC_ISA_DMA_SUPPORT_BROKEN=n |
842 | 847 | ||
843 | config GENERIC_ISA_DMA_SUPPORT_BROKEN | 848 | config GENERIC_ISA_DMA_SUPPORT_BROKEN |
844 | bool | 849 | bool |
845 | select GENERIC_ISA_DMA | 850 | select GENERIC_ISA_DMA |
846 | 851 | ||
847 | config GENERIC_GPIO | 852 | config GENERIC_GPIO |
848 | bool | 853 | bool |
849 | 854 | ||
850 | # | 855 | # |
851 | # Endianness selection. Sufficiently obscure that many users don't know what to | 856 | # Endianness selection. Sufficiently obscure that many users don't know what to |
852 | # answer, so we try hard to limit the available choices. Also the use of a | 857 | # answer, so we try hard to limit the available choices. Also the use of a |
853 | # choice statement should be more obvious to the user. | 858 | # choice statement should be more obvious to the user. |
854 | # | 859 | # |
855 | choice | 860 | choice |
856 | prompt "Endianness selection" | 861 | prompt "Endianness selection" |
857 | help | 862 | help |
858 | Some MIPS machines can be configured for either little or big endian | 863 | Some MIPS machines can be configured for either little or big endian |
859 | byte order. These modes require different kernels and a different | 864 | byte order. These modes require different kernels and a different |
860 | Linux distribution. In general there is one preferred byte order for a | 865 | Linux distribution. In general there is one preferred byte order for a |
861 | particular system, but some systems are just as commonly used in | 866 | particular system, but some systems are just as commonly used in |
862 | either endianness. | 867 | either endianness. |
863 | 868 | ||
864 | config CPU_BIG_ENDIAN | 869 | config CPU_BIG_ENDIAN |
865 | bool "Big endian" | 870 | bool "Big endian" |
866 | depends on SYS_SUPPORTS_BIG_ENDIAN | 871 | depends on SYS_SUPPORTS_BIG_ENDIAN |
867 | 872 | ||
868 | config CPU_LITTLE_ENDIAN | 873 | config CPU_LITTLE_ENDIAN |
869 | bool "Little endian" | 874 | bool "Little endian" |
870 | depends on SYS_SUPPORTS_LITTLE_ENDIAN | 875 | depends on SYS_SUPPORTS_LITTLE_ENDIAN |
871 | help | 876 | help |
872 | 877 | ||
873 | endchoice | 878 | endchoice |
874 | 879 | ||
875 | config SYS_SUPPORTS_APM_EMULATION | 880 | config SYS_SUPPORTS_APM_EMULATION |
876 | bool | 881 | bool |
877 | 882 | ||
878 | config SYS_SUPPORTS_BIG_ENDIAN | 883 | config SYS_SUPPORTS_BIG_ENDIAN |
879 | bool | 884 | bool |
880 | 885 | ||
881 | config SYS_SUPPORTS_LITTLE_ENDIAN | 886 | config SYS_SUPPORTS_LITTLE_ENDIAN |
882 | bool | 887 | bool |
883 | 888 | ||
884 | config IRQ_CPU | 889 | config IRQ_CPU |
885 | bool | 890 | bool |
886 | 891 | ||
887 | config IRQ_CPU_RM7K | 892 | config IRQ_CPU_RM7K |
888 | bool | 893 | bool |
889 | 894 | ||
890 | config IRQ_CPU_RM9K | 895 | config IRQ_CPU_RM9K |
891 | bool | 896 | bool |
892 | 897 | ||
893 | config IRQ_MSP_SLP | 898 | config IRQ_MSP_SLP |
894 | bool | 899 | bool |
895 | 900 | ||
896 | config IRQ_MSP_CIC | 901 | config IRQ_MSP_CIC |
897 | bool | 902 | bool |
898 | 903 | ||
899 | config IRQ_TXX9 | 904 | config IRQ_TXX9 |
900 | bool | 905 | bool |
901 | 906 | ||
902 | config IRQ_GT641XX | 907 | config IRQ_GT641XX |
903 | bool | 908 | bool |
904 | 909 | ||
905 | config MIPS_BOARDS_GEN | 910 | config MIPS_BOARDS_GEN |
906 | bool | 911 | bool |
907 | 912 | ||
908 | config PCI_GT64XXX_PCI0 | 913 | config PCI_GT64XXX_PCI0 |
909 | bool | 914 | bool |
910 | 915 | ||
911 | config NO_EXCEPT_FILL | 916 | config NO_EXCEPT_FILL |
912 | bool | 917 | bool |
913 | 918 | ||
914 | config MIPS_TX3927 | 919 | config MIPS_TX3927 |
915 | bool | 920 | bool |
916 | select HAS_TXX9_SERIAL | 921 | select HAS_TXX9_SERIAL |
917 | 922 | ||
918 | config MIPS_RM9122 | 923 | config MIPS_RM9122 |
919 | bool | 924 | bool |
920 | select SERIAL_RM9000 | 925 | select SERIAL_RM9000 |
921 | 926 | ||
922 | config PNX8550 | 927 | config PNX8550 |
923 | bool | 928 | bool |
924 | select SOC_PNX8550 | 929 | select SOC_PNX8550 |
925 | 930 | ||
926 | config SOC_PNX8550 | 931 | config SOC_PNX8550 |
927 | bool | 932 | bool |
928 | select DMA_NONCOHERENT | 933 | select DMA_NONCOHERENT |
929 | select HW_HAS_PCI | 934 | select HW_HAS_PCI |
930 | select SYS_HAS_CPU_MIPS32_R1 | 935 | select SYS_HAS_CPU_MIPS32_R1 |
931 | select SYS_HAS_EARLY_PRINTK | 936 | select SYS_HAS_EARLY_PRINTK |
932 | select SYS_SUPPORTS_32BIT_KERNEL | 937 | select SYS_SUPPORTS_32BIT_KERNEL |
933 | select GENERIC_HARDIRQS_NO__DO_IRQ | 938 | select GENERIC_HARDIRQS_NO__DO_IRQ |
934 | select SYS_SUPPORTS_KGDB | 939 | select SYS_SUPPORTS_KGDB |
935 | select GENERIC_GPIO | 940 | select GENERIC_GPIO |
936 | 941 | ||
937 | config SWAP_IO_SPACE | 942 | config SWAP_IO_SPACE |
938 | bool | 943 | bool |
939 | 944 | ||
940 | config EMMA2RH | 945 | config EMMA2RH |
941 | bool | 946 | bool |
942 | depends on MARKEINS | 947 | depends on MARKEINS |
943 | default y | 948 | default y |
944 | 949 | ||
945 | config SERIAL_RM9000 | 950 | config SERIAL_RM9000 |
946 | bool | 951 | bool |
947 | 952 | ||
948 | config SGI_HAS_DS1286 | 953 | config SGI_HAS_DS1286 |
949 | bool | 954 | bool |
950 | 955 | ||
951 | config SGI_HAS_INDYDOG | 956 | config SGI_HAS_INDYDOG |
952 | bool | 957 | bool |
953 | 958 | ||
954 | config SGI_HAS_SEEQ | 959 | config SGI_HAS_SEEQ |
955 | bool | 960 | bool |
956 | 961 | ||
957 | config SGI_HAS_WD93 | 962 | config SGI_HAS_WD93 |
958 | bool | 963 | bool |
959 | 964 | ||
960 | config SGI_HAS_ZILOG | 965 | config SGI_HAS_ZILOG |
961 | bool | 966 | bool |
962 | 967 | ||
963 | config SGI_HAS_I8042 | 968 | config SGI_HAS_I8042 |
964 | bool | 969 | bool |
965 | 970 | ||
966 | config DEFAULT_SGI_PARTITION | 971 | config DEFAULT_SGI_PARTITION |
967 | bool | 972 | bool |
968 | 973 | ||
969 | config ARC32 | 974 | config ARC32 |
970 | bool | 975 | bool |
971 | 976 | ||
972 | config SNIPROM | 977 | config SNIPROM |
973 | bool | 978 | bool |
974 | 979 | ||
975 | config BOOT_ELF32 | 980 | config BOOT_ELF32 |
976 | bool | 981 | bool |
977 | 982 | ||
978 | config MIPS_L1_CACHE_SHIFT | 983 | config MIPS_L1_CACHE_SHIFT |
979 | int | 984 | int |
980 | default "4" if MACH_DECSTATION | 985 | default "4" if MACH_DECSTATION |
981 | default "7" if SGI_IP27 || SGI_IP28 || SNI_RM | 986 | default "7" if SGI_IP27 || SGI_IP28 || SNI_RM |
982 | default "4" if PMC_MSP4200_EVAL | 987 | default "4" if PMC_MSP4200_EVAL |
983 | default "5" | 988 | default "5" |
984 | 989 | ||
985 | config HAVE_STD_PC_SERIAL_PORT | 990 | config HAVE_STD_PC_SERIAL_PORT |
986 | bool | 991 | bool |
987 | 992 | ||
988 | config ARC_CONSOLE | 993 | config ARC_CONSOLE |
989 | bool "ARC console support" | 994 | bool "ARC console support" |
990 | depends on SGI_IP22 || SGI_IP28 || (SNI_RM && CPU_LITTLE_ENDIAN) | 995 | depends on SGI_IP22 || SGI_IP28 || (SNI_RM && CPU_LITTLE_ENDIAN) |
991 | 996 | ||
992 | config ARC_MEMORY | 997 | config ARC_MEMORY |
993 | bool | 998 | bool |
994 | depends on MACH_JAZZ || SNI_RM || SGI_IP32 | 999 | depends on MACH_JAZZ || SNI_RM || SGI_IP32 |
995 | default y | 1000 | default y |
996 | 1001 | ||
997 | config ARC_PROMLIB | 1002 | config ARC_PROMLIB |
998 | bool | 1003 | bool |
999 | depends on MACH_JAZZ || SNI_RM || SGI_IP22 || SGI_IP28 || SGI_IP32 | 1004 | depends on MACH_JAZZ || SNI_RM || SGI_IP22 || SGI_IP28 || SGI_IP32 |
1000 | default y | 1005 | default y |
1001 | 1006 | ||
1002 | config ARC64 | 1007 | config ARC64 |
1003 | bool | 1008 | bool |
1004 | 1009 | ||
1005 | config BOOT_ELF64 | 1010 | config BOOT_ELF64 |
1006 | bool | 1011 | bool |
1007 | 1012 | ||
1008 | menu "CPU selection" | 1013 | menu "CPU selection" |
1009 | 1014 | ||
1010 | choice | 1015 | choice |
1011 | prompt "CPU type" | 1016 | prompt "CPU type" |
1012 | default CPU_R4X00 | 1017 | default CPU_R4X00 |
1013 | 1018 | ||
1014 | config CPU_LOONGSON2 | 1019 | config CPU_LOONGSON2 |
1015 | bool "Loongson 2" | 1020 | bool "Loongson 2" |
1016 | depends on SYS_HAS_CPU_LOONGSON2 | 1021 | depends on SYS_HAS_CPU_LOONGSON2 |
1017 | select CPU_SUPPORTS_32BIT_KERNEL | 1022 | select CPU_SUPPORTS_32BIT_KERNEL |
1018 | select CPU_SUPPORTS_64BIT_KERNEL | 1023 | select CPU_SUPPORTS_64BIT_KERNEL |
1019 | select CPU_SUPPORTS_HIGHMEM | 1024 | select CPU_SUPPORTS_HIGHMEM |
1020 | help | 1025 | help |
1021 | The Loongson 2E processor implements the MIPS III instruction set | 1026 | The Loongson 2E processor implements the MIPS III instruction set |
1022 | with many extensions. | 1027 | with many extensions. |
1023 | 1028 | ||
1024 | config CPU_MIPS32_R1 | 1029 | config CPU_MIPS32_R1 |
1025 | bool "MIPS32 Release 1" | 1030 | bool "MIPS32 Release 1" |
1026 | depends on SYS_HAS_CPU_MIPS32_R1 | 1031 | depends on SYS_HAS_CPU_MIPS32_R1 |
1027 | select CPU_HAS_LLSC | 1032 | select CPU_HAS_LLSC |
1028 | select CPU_HAS_PREFETCH | 1033 | select CPU_HAS_PREFETCH |
1029 | select CPU_SUPPORTS_32BIT_KERNEL | 1034 | select CPU_SUPPORTS_32BIT_KERNEL |
1030 | select CPU_SUPPORTS_HIGHMEM | 1035 | select CPU_SUPPORTS_HIGHMEM |
1031 | help | 1036 | help |
1032 | Choose this option to build a kernel for release 1 or later of the | 1037 | Choose this option to build a kernel for release 1 or later of the |
1033 | MIPS32 architecture. Most modern embedded systems with a 32-bit | 1038 | MIPS32 architecture. Most modern embedded systems with a 32-bit |
1034 | MIPS processor are based on a MIPS32 processor. If you know the | 1039 | MIPS processor are based on a MIPS32 processor. If you know the |
1035 | specific type of processor in your system, choose those that one | 1040 | specific type of processor in your system, choose those that one |
1036 | otherwise CPU_MIPS32_R1 is a safe bet for any MIPS32 system. | 1041 | otherwise CPU_MIPS32_R1 is a safe bet for any MIPS32 system. |
1037 | Release 2 of the MIPS32 architecture is available since several | 1042 | Release 2 of the MIPS32 architecture is available since several |
1038 | years so chances are you even have a MIPS32 Release 2 processor | 1043 | years so chances are you even have a MIPS32 Release 2 processor |
1039 | in which case you should choose CPU_MIPS32_R2 instead for better | 1044 | in which case you should choose CPU_MIPS32_R2 instead for better |
1040 | performance. | 1045 | performance. |
1041 | 1046 | ||
1042 | config CPU_MIPS32_R2 | 1047 | config CPU_MIPS32_R2 |
1043 | bool "MIPS32 Release 2" | 1048 | bool "MIPS32 Release 2" |
1044 | depends on SYS_HAS_CPU_MIPS32_R2 | 1049 | depends on SYS_HAS_CPU_MIPS32_R2 |
1045 | select CPU_HAS_LLSC | 1050 | select CPU_HAS_LLSC |
1046 | select CPU_HAS_PREFETCH | 1051 | select CPU_HAS_PREFETCH |
1047 | select CPU_SUPPORTS_32BIT_KERNEL | 1052 | select CPU_SUPPORTS_32BIT_KERNEL |
1048 | select CPU_SUPPORTS_HIGHMEM | 1053 | select CPU_SUPPORTS_HIGHMEM |
1049 | help | 1054 | help |
1050 | Choose this option to build a kernel for release 2 or later of the | 1055 | Choose this option to build a kernel for release 2 or later of the |
1051 | MIPS32 architecture. Most modern embedded systems with a 32-bit | 1056 | MIPS32 architecture. Most modern embedded systems with a 32-bit |
1052 | MIPS processor are based on a MIPS32 processor. If you know the | 1057 | MIPS processor are based on a MIPS32 processor. If you know the |
1053 | specific type of processor in your system, choose that one; | 1058 | specific type of processor in your system, choose that one; |
1054 | otherwise CPU_MIPS32_R1 is a safe bet for any MIPS32 system. | 1059 | otherwise CPU_MIPS32_R1 is a safe bet for any MIPS32 system. |
1055 | 1060 | ||
1056 | config CPU_MIPS64_R1 | 1061 | config CPU_MIPS64_R1 |
1057 | bool "MIPS64 Release 1" | 1062 | bool "MIPS64 Release 1" |
1058 | depends on SYS_HAS_CPU_MIPS64_R1 | 1063 | depends on SYS_HAS_CPU_MIPS64_R1 |
1059 | select CPU_HAS_LLSC | 1064 | select CPU_HAS_LLSC |
1060 | select CPU_HAS_PREFETCH | 1065 | select CPU_HAS_PREFETCH |
1061 | select CPU_SUPPORTS_32BIT_KERNEL | 1066 | select CPU_SUPPORTS_32BIT_KERNEL |
1062 | select CPU_SUPPORTS_64BIT_KERNEL | 1067 | select CPU_SUPPORTS_64BIT_KERNEL |
1063 | select CPU_SUPPORTS_HIGHMEM | 1068 | select CPU_SUPPORTS_HIGHMEM |
1064 | help | 1069 | help |
1065 | Choose this option to build a kernel for release 1 or later of the | 1070 | Choose this option to build a kernel for release 1 or later of the |
1066 | MIPS64 architecture. Many modern embedded systems with a 64-bit | 1071 | MIPS64 architecture. Many modern embedded systems with a 64-bit |
1067 | MIPS processor are based on a MIPS64 processor. If you know the | 1072 | MIPS processor are based on a MIPS64 processor. If you know the |
1068 | specific type of processor in your system, choose that one; | 1073 | specific type of processor in your system, choose that one; |
1069 | otherwise CPU_MIPS64_R1 is a safe bet for any MIPS64 system. | 1074 | otherwise CPU_MIPS64_R1 is a safe bet for any MIPS64 system. |
1070 | Release 2 of the MIPS64 architecture has been available for several | 1075 | Release 2 of the MIPS64 architecture has been available for several |
1071 | years now, so chances are you even have a MIPS64 Release 2 processor, | 1076 | years now, so chances are you even have a MIPS64 Release 2 processor, |
1072 | in which case you should choose CPU_MIPS64_R2 instead for better | 1077 | in which case you should choose CPU_MIPS64_R2 instead for better |
1073 | performance. | 1078 | performance. |
1074 | 1079 | ||
1075 | config CPU_MIPS64_R2 | 1080 | config CPU_MIPS64_R2 |
1076 | bool "MIPS64 Release 2" | 1081 | bool "MIPS64 Release 2" |
1077 | depends on SYS_HAS_CPU_MIPS64_R2 | 1082 | depends on SYS_HAS_CPU_MIPS64_R2 |
1078 | select CPU_HAS_LLSC | 1083 | select CPU_HAS_LLSC |
1079 | select CPU_HAS_PREFETCH | 1084 | select CPU_HAS_PREFETCH |
1080 | select CPU_SUPPORTS_32BIT_KERNEL | 1085 | select CPU_SUPPORTS_32BIT_KERNEL |
1081 | select CPU_SUPPORTS_64BIT_KERNEL | 1086 | select CPU_SUPPORTS_64BIT_KERNEL |
1082 | select CPU_SUPPORTS_HIGHMEM | 1087 | select CPU_SUPPORTS_HIGHMEM |
1083 | help | 1088 | help |
1084 | Choose this option to build a kernel for release 2 or later of the | 1089 | Choose this option to build a kernel for release 2 or later of the |
1085 | MIPS64 architecture. Many modern embedded systems with a 64-bit | 1090 | MIPS64 architecture. Many modern embedded systems with a 64-bit |
1086 | MIPS processor are based on a MIPS64 processor. If you know the | 1091 | MIPS processor are based on a MIPS64 processor. If you know the |
1087 | specific type of processor in your system, choose that one; | 1092 | specific type of processor in your system, choose that one; |
1088 | otherwise CPU_MIPS64_R1 is a safe bet for any MIPS64 system. | 1093 | otherwise CPU_MIPS64_R1 is a safe bet for any MIPS64 system. |
1089 | 1094 | ||
1090 | config CPU_R3000 | 1095 | config CPU_R3000 |
1091 | bool "R3000" | 1096 | bool "R3000" |
1092 | depends on SYS_HAS_CPU_R3000 | 1097 | depends on SYS_HAS_CPU_R3000 |
1093 | select CPU_HAS_WB | 1098 | select CPU_HAS_WB |
1094 | select CPU_SUPPORTS_32BIT_KERNEL | 1099 | select CPU_SUPPORTS_32BIT_KERNEL |
1095 | select CPU_SUPPORTS_HIGHMEM | 1100 | select CPU_SUPPORTS_HIGHMEM |
1096 | help | 1101 | help |
1097 | Please make sure to pick the right CPU type. Linux/MIPS is not | 1102 | Please make sure to pick the right CPU type. Linux/MIPS is not |
1098 | designed to be generic, i.e. kernels compiled for R3000 CPUs will | 1103 | designed to be generic, i.e. kernels compiled for R3000 CPUs will |
1099 | *not* work on R4000 machines and vice versa. However, since most | 1104 | *not* work on R4000 machines and vice versa. However, since most |
1100 | of the supported machines have an R4000 (or similar) CPU, R4x00 | 1105 | of the supported machines have an R4000 (or similar) CPU, R4x00 |
1101 | might be a safe bet. If the resulting kernel does not work, | 1106 | might be a safe bet. If the resulting kernel does not work, |
1102 | try to recompile with R3000. | 1107 | try to recompile with R3000. |
1103 | 1108 | ||
1104 | config CPU_TX39XX | 1109 | config CPU_TX39XX |
1105 | bool "R39XX" | 1110 | bool "R39XX" |
1106 | depends on SYS_HAS_CPU_TX39XX | 1111 | depends on SYS_HAS_CPU_TX39XX |
1107 | select CPU_SUPPORTS_32BIT_KERNEL | 1112 | select CPU_SUPPORTS_32BIT_KERNEL |
1108 | 1113 | ||
1109 | config CPU_VR41XX | 1114 | config CPU_VR41XX |
1110 | bool "R41xx" | 1115 | bool "R41xx" |
1111 | depends on SYS_HAS_CPU_VR41XX | 1116 | depends on SYS_HAS_CPU_VR41XX |
1112 | select CPU_SUPPORTS_32BIT_KERNEL | 1117 | select CPU_SUPPORTS_32BIT_KERNEL |
1113 | select CPU_SUPPORTS_64BIT_KERNEL | 1118 | select CPU_SUPPORTS_64BIT_KERNEL |
1114 | help | 1119 | help |
1115 | This option selects support for the NEC VR4100 series of processors. | 1120 | This option selects support for the NEC VR4100 series of processors. |
1116 | Only choose this option if you have one of these processors, as a | 1121 | Only choose this option if you have one of these processors, as a |
1117 | kernel built with this option will not run on any other type of | 1122 | kernel built with this option will not run on any other type of |
1118 | processor or vice versa. | 1123 | processor or vice versa. |
1119 | 1124 | ||
1120 | config CPU_R4300 | 1125 | config CPU_R4300 |
1121 | bool "R4300" | 1126 | bool "R4300" |
1122 | depends on SYS_HAS_CPU_R4300 | 1127 | depends on SYS_HAS_CPU_R4300 |
1123 | select CPU_HAS_LLSC | 1128 | select CPU_HAS_LLSC |
1124 | select CPU_SUPPORTS_32BIT_KERNEL | 1129 | select CPU_SUPPORTS_32BIT_KERNEL |
1125 | select CPU_SUPPORTS_64BIT_KERNEL | 1130 | select CPU_SUPPORTS_64BIT_KERNEL |
1126 | help | 1131 | help |
1127 | MIPS Technologies R4300-series processors. | 1132 | MIPS Technologies R4300-series processors. |
1128 | 1133 | ||
1129 | config CPU_R4X00 | 1134 | config CPU_R4X00 |
1130 | bool "R4x00" | 1135 | bool "R4x00" |
1131 | depends on SYS_HAS_CPU_R4X00 | 1136 | depends on SYS_HAS_CPU_R4X00 |
1132 | select CPU_HAS_LLSC | 1137 | select CPU_HAS_LLSC |
1133 | select CPU_SUPPORTS_32BIT_KERNEL | 1138 | select CPU_SUPPORTS_32BIT_KERNEL |
1134 | select CPU_SUPPORTS_64BIT_KERNEL | 1139 | select CPU_SUPPORTS_64BIT_KERNEL |
1135 | help | 1140 | help |
1136 | MIPS Technologies R4000-series processors other than 4300, including | 1141 | MIPS Technologies R4000-series processors other than 4300, including |
1137 | the R4000, R4400, R4600, and 4700. | 1142 | the R4000, R4400, R4600, and 4700. |
1138 | 1143 | ||
1139 | config CPU_TX49XX | 1144 | config CPU_TX49XX |
1140 | bool "R49XX" | 1145 | bool "R49XX" |
1141 | depends on SYS_HAS_CPU_TX49XX | 1146 | depends on SYS_HAS_CPU_TX49XX |
1142 | select CPU_HAS_LLSC | 1147 | select CPU_HAS_LLSC |
1143 | select CPU_HAS_PREFETCH | 1148 | select CPU_HAS_PREFETCH |
1144 | select CPU_SUPPORTS_32BIT_KERNEL | 1149 | select CPU_SUPPORTS_32BIT_KERNEL |
1145 | select CPU_SUPPORTS_64BIT_KERNEL | 1150 | select CPU_SUPPORTS_64BIT_KERNEL |
1146 | 1151 | ||
1147 | config CPU_R5000 | 1152 | config CPU_R5000 |
1148 | bool "R5000" | 1153 | bool "R5000" |
1149 | depends on SYS_HAS_CPU_R5000 | 1154 | depends on SYS_HAS_CPU_R5000 |
1150 | select CPU_HAS_LLSC | 1155 | select CPU_HAS_LLSC |
1151 | select CPU_SUPPORTS_32BIT_KERNEL | 1156 | select CPU_SUPPORTS_32BIT_KERNEL |
1152 | select CPU_SUPPORTS_64BIT_KERNEL | 1157 | select CPU_SUPPORTS_64BIT_KERNEL |
1153 | help | 1158 | help |
1154 | MIPS Technologies R5000-series processors other than the Nevada. | 1159 | MIPS Technologies R5000-series processors other than the Nevada. |
1155 | 1160 | ||
1156 | config CPU_R5432 | 1161 | config CPU_R5432 |
1157 | bool "R5432" | 1162 | bool "R5432" |
1158 | depends on SYS_HAS_CPU_R5432 | 1163 | depends on SYS_HAS_CPU_R5432 |
1159 | select CPU_HAS_LLSC | 1164 | select CPU_HAS_LLSC |
1160 | select CPU_SUPPORTS_32BIT_KERNEL | 1165 | select CPU_SUPPORTS_32BIT_KERNEL |
1161 | select CPU_SUPPORTS_64BIT_KERNEL | 1166 | select CPU_SUPPORTS_64BIT_KERNEL |
1162 | 1167 | ||
1163 | config CPU_R6000 | 1168 | config CPU_R6000 |
1164 | bool "R6000" | 1169 | bool "R6000" |
1165 | depends on EXPERIMENTAL | 1170 | depends on EXPERIMENTAL |
1166 | select CPU_HAS_LLSC | 1171 | select CPU_HAS_LLSC |
1167 | depends on SYS_HAS_CPU_R6000 | 1172 | depends on SYS_HAS_CPU_R6000 |
1168 | select CPU_SUPPORTS_32BIT_KERNEL | 1173 | select CPU_SUPPORTS_32BIT_KERNEL |
1169 | help | 1174 | help |
1170 | MIPS Technologies R6000 and R6000A series processors. Note these | 1175 | MIPS Technologies R6000 and R6000A series processors. Note these |
1171 | processors are extremely rare and the support for them is incomplete. | 1176 | processors are extremely rare and the support for them is incomplete. |
1172 | 1177 | ||
1173 | config CPU_NEVADA | 1178 | config CPU_NEVADA |
1174 | bool "RM52xx" | 1179 | bool "RM52xx" |
1175 | depends on SYS_HAS_CPU_NEVADA | 1180 | depends on SYS_HAS_CPU_NEVADA |
1176 | select CPU_HAS_LLSC | 1181 | select CPU_HAS_LLSC |
1177 | select CPU_SUPPORTS_32BIT_KERNEL | 1182 | select CPU_SUPPORTS_32BIT_KERNEL |
1178 | select CPU_SUPPORTS_64BIT_KERNEL | 1183 | select CPU_SUPPORTS_64BIT_KERNEL |
1179 | help | 1184 | help |
1180 | QED / PMC-Sierra RM52xx-series ("Nevada") processors. | 1185 | QED / PMC-Sierra RM52xx-series ("Nevada") processors. |
1181 | 1186 | ||
1182 | config CPU_R8000 | 1187 | config CPU_R8000 |
1183 | bool "R8000" | 1188 | bool "R8000" |
1184 | depends on EXPERIMENTAL | 1189 | depends on EXPERIMENTAL |
1185 | depends on SYS_HAS_CPU_R8000 | 1190 | depends on SYS_HAS_CPU_R8000 |
1186 | select CPU_HAS_LLSC | 1191 | select CPU_HAS_LLSC |
1187 | select CPU_HAS_PREFETCH | 1192 | select CPU_HAS_PREFETCH |
1188 | select CPU_SUPPORTS_64BIT_KERNEL | 1193 | select CPU_SUPPORTS_64BIT_KERNEL |
1189 | help | 1194 | help |
1190 | MIPS Technologies R8000 processors. Note these processors are | 1195 | MIPS Technologies R8000 processors. Note these processors are |
1191 | uncommon and the support for them is incomplete. | 1196 | uncommon and the support for them is incomplete. |
1192 | 1197 | ||
1193 | config CPU_R10000 | 1198 | config CPU_R10000 |
1194 | bool "R10000" | 1199 | bool "R10000" |
1195 | depends on SYS_HAS_CPU_R10000 | 1200 | depends on SYS_HAS_CPU_R10000 |
1196 | select CPU_HAS_LLSC | 1201 | select CPU_HAS_LLSC |
1197 | select CPU_HAS_PREFETCH | 1202 | select CPU_HAS_PREFETCH |
1198 | select CPU_SUPPORTS_32BIT_KERNEL | 1203 | select CPU_SUPPORTS_32BIT_KERNEL |
1199 | select CPU_SUPPORTS_64BIT_KERNEL | 1204 | select CPU_SUPPORTS_64BIT_KERNEL |
1200 | select CPU_SUPPORTS_HIGHMEM | 1205 | select CPU_SUPPORTS_HIGHMEM |
1201 | help | 1206 | help |
1202 | MIPS Technologies R10000-series processors. | 1207 | MIPS Technologies R10000-series processors. |
1203 | 1208 | ||
1204 | config CPU_RM7000 | 1209 | config CPU_RM7000 |
1205 | bool "RM7000" | 1210 | bool "RM7000" |
1206 | depends on SYS_HAS_CPU_RM7000 | 1211 | depends on SYS_HAS_CPU_RM7000 |
1207 | select CPU_HAS_LLSC | 1212 | select CPU_HAS_LLSC |
1208 | select CPU_HAS_PREFETCH | 1213 | select CPU_HAS_PREFETCH |
1209 | select CPU_SUPPORTS_32BIT_KERNEL | 1214 | select CPU_SUPPORTS_32BIT_KERNEL |
1210 | select CPU_SUPPORTS_64BIT_KERNEL | 1215 | select CPU_SUPPORTS_64BIT_KERNEL |
1211 | select CPU_SUPPORTS_HIGHMEM | 1216 | select CPU_SUPPORTS_HIGHMEM |
1212 | 1217 | ||
1213 | config CPU_RM9000 | 1218 | config CPU_RM9000 |
1214 | bool "RM9000" | 1219 | bool "RM9000" |
1215 | depends on SYS_HAS_CPU_RM9000 | 1220 | depends on SYS_HAS_CPU_RM9000 |
1216 | select CPU_HAS_LLSC | 1221 | select CPU_HAS_LLSC |
1217 | select CPU_HAS_PREFETCH | 1222 | select CPU_HAS_PREFETCH |
1218 | select CPU_SUPPORTS_32BIT_KERNEL | 1223 | select CPU_SUPPORTS_32BIT_KERNEL |
1219 | select CPU_SUPPORTS_64BIT_KERNEL | 1224 | select CPU_SUPPORTS_64BIT_KERNEL |
1220 | select CPU_SUPPORTS_HIGHMEM | 1225 | select CPU_SUPPORTS_HIGHMEM |
1221 | select WEAK_ORDERING | 1226 | select WEAK_ORDERING |
1222 | 1227 | ||
1223 | config CPU_SB1 | 1228 | config CPU_SB1 |
1224 | bool "SB1" | 1229 | bool "SB1" |
1225 | depends on SYS_HAS_CPU_SB1 | 1230 | depends on SYS_HAS_CPU_SB1 |
1226 | select CPU_HAS_LLSC | 1231 | select CPU_HAS_LLSC |
1227 | select CPU_SUPPORTS_32BIT_KERNEL | 1232 | select CPU_SUPPORTS_32BIT_KERNEL |
1228 | select CPU_SUPPORTS_64BIT_KERNEL | 1233 | select CPU_SUPPORTS_64BIT_KERNEL |
1229 | select CPU_SUPPORTS_HIGHMEM | 1234 | select CPU_SUPPORTS_HIGHMEM |
1230 | select WEAK_ORDERING | 1235 | select WEAK_ORDERING |
1231 | 1236 | ||
1232 | endchoice | 1237 | endchoice |
1233 | 1238 | ||
1234 | config SYS_HAS_CPU_LOONGSON2 | 1239 | config SYS_HAS_CPU_LOONGSON2 |
1235 | bool | 1240 | bool |
1236 | 1241 | ||
1237 | config SYS_HAS_CPU_MIPS32_R1 | 1242 | config SYS_HAS_CPU_MIPS32_R1 |
1238 | bool | 1243 | bool |
1239 | 1244 | ||
1240 | config SYS_HAS_CPU_MIPS32_R2 | 1245 | config SYS_HAS_CPU_MIPS32_R2 |
1241 | bool | 1246 | bool |
1242 | 1247 | ||
1243 | config SYS_HAS_CPU_MIPS64_R1 | 1248 | config SYS_HAS_CPU_MIPS64_R1 |
1244 | bool | 1249 | bool |
1245 | 1250 | ||
1246 | config SYS_HAS_CPU_MIPS64_R2 | 1251 | config SYS_HAS_CPU_MIPS64_R2 |
1247 | bool | 1252 | bool |
1248 | 1253 | ||
1249 | config SYS_HAS_CPU_R3000 | 1254 | config SYS_HAS_CPU_R3000 |
1250 | bool | 1255 | bool |
1251 | 1256 | ||
1252 | config SYS_HAS_CPU_TX39XX | 1257 | config SYS_HAS_CPU_TX39XX |
1253 | bool | 1258 | bool |
1254 | 1259 | ||
1255 | config SYS_HAS_CPU_VR41XX | 1260 | config SYS_HAS_CPU_VR41XX |
1256 | bool | 1261 | bool |
1257 | 1262 | ||
1258 | config SYS_HAS_CPU_R4300 | 1263 | config SYS_HAS_CPU_R4300 |
1259 | bool | 1264 | bool |
1260 | 1265 | ||
1261 | config SYS_HAS_CPU_R4X00 | 1266 | config SYS_HAS_CPU_R4X00 |
1262 | bool | 1267 | bool |
1263 | 1268 | ||
1264 | config SYS_HAS_CPU_TX49XX | 1269 | config SYS_HAS_CPU_TX49XX |
1265 | bool | 1270 | bool |
1266 | 1271 | ||
1267 | config SYS_HAS_CPU_R5000 | 1272 | config SYS_HAS_CPU_R5000 |
1268 | bool | 1273 | bool |
1269 | 1274 | ||
1270 | config SYS_HAS_CPU_R5432 | 1275 | config SYS_HAS_CPU_R5432 |
1271 | bool | 1276 | bool |
1272 | 1277 | ||
1273 | config SYS_HAS_CPU_R6000 | 1278 | config SYS_HAS_CPU_R6000 |
1274 | bool | 1279 | bool |
1275 | 1280 | ||
1276 | config SYS_HAS_CPU_NEVADA | 1281 | config SYS_HAS_CPU_NEVADA |
1277 | bool | 1282 | bool |
1278 | 1283 | ||
1279 | config SYS_HAS_CPU_R8000 | 1284 | config SYS_HAS_CPU_R8000 |
1280 | bool | 1285 | bool |
1281 | 1286 | ||
1282 | config SYS_HAS_CPU_R10000 | 1287 | config SYS_HAS_CPU_R10000 |
1283 | bool | 1288 | bool |
1284 | 1289 | ||
1285 | config SYS_HAS_CPU_RM7000 | 1290 | config SYS_HAS_CPU_RM7000 |
1286 | bool | 1291 | bool |
1287 | 1292 | ||
1288 | config SYS_HAS_CPU_RM9000 | 1293 | config SYS_HAS_CPU_RM9000 |
1289 | bool | 1294 | bool |
1290 | 1295 | ||
1291 | config SYS_HAS_CPU_SB1 | 1296 | config SYS_HAS_CPU_SB1 |
1292 | bool | 1297 | bool |
1293 | 1298 | ||
1294 | # | 1299 | # |
1295 | # CPU may reorder R->R, R->W, W->R, W->W | 1300 | # CPU may reorder R->R, R->W, W->R, W->W |
1296 | # Reordering beyond LL and SC is handled in WEAK_REORDERING_BEYOND_LLSC | 1301 | # Reordering beyond LL and SC is handled in WEAK_REORDERING_BEYOND_LLSC |
1297 | # | 1302 | # |
1298 | config WEAK_ORDERING | 1303 | config WEAK_ORDERING |
1299 | bool | 1304 | bool |
1300 | 1305 | ||
1301 | # | 1306 | # |
1302 | # CPU may reorder reads and writes beyond LL/SC | 1307 | # CPU may reorder reads and writes beyond LL/SC |
1303 | # CPU may reorder R->LL, R->LL, W->LL, W->LL, R->SC, R->SC, W->SC, W->SC | 1308 | # CPU may reorder R->LL, R->LL, W->LL, W->LL, R->SC, R->SC, W->SC, W->SC |
1304 | # | 1309 | # |
1305 | config WEAK_REORDERING_BEYOND_LLSC | 1310 | config WEAK_REORDERING_BEYOND_LLSC |
1306 | bool | 1311 | bool |
1307 | endmenu | 1312 | endmenu |
1308 | 1313 | ||
1309 | # | 1314 | # |
1310 | # These two indicate any level of the MIPS32 and MIPS64 architecture | 1315 | # These two indicate any level of the MIPS32 and MIPS64 architecture |
1311 | # | 1316 | # |
1312 | config CPU_MIPS32 | 1317 | config CPU_MIPS32 |
1313 | bool | 1318 | bool |
1314 | default y if CPU_MIPS32_R1 || CPU_MIPS32_R2 | 1319 | default y if CPU_MIPS32_R1 || CPU_MIPS32_R2 |
1315 | 1320 | ||
1316 | config CPU_MIPS64 | 1321 | config CPU_MIPS64 |
1317 | bool | 1322 | bool |
1318 | default y if CPU_MIPS64_R1 || CPU_MIPS64_R2 | 1323 | default y if CPU_MIPS64_R1 || CPU_MIPS64_R2 |
1319 | 1324 | ||
1320 | # | 1325 | # |
1321 | # These two indicate the revision of the architecture, either Release 1 or Release 2 | 1326 | # These two indicate the revision of the architecture, either Release 1 or Release 2 |
1322 | # | 1327 | # |
1323 | config CPU_MIPSR1 | 1328 | config CPU_MIPSR1 |
1324 | bool | 1329 | bool |
1325 | default y if CPU_MIPS32_R1 || CPU_MIPS64_R1 | 1330 | default y if CPU_MIPS32_R1 || CPU_MIPS64_R1 |
1326 | 1331 | ||
1327 | config CPU_MIPSR2 | 1332 | config CPU_MIPSR2 |
1328 | bool | 1333 | bool |
1329 | default y if CPU_MIPS32_R2 || CPU_MIPS64_R2 | 1334 | default y if CPU_MIPS32_R2 || CPU_MIPS64_R2 |
1330 | 1335 | ||
1331 | config SYS_SUPPORTS_32BIT_KERNEL | 1336 | config SYS_SUPPORTS_32BIT_KERNEL |
1332 | bool | 1337 | bool |
1333 | config SYS_SUPPORTS_64BIT_KERNEL | 1338 | config SYS_SUPPORTS_64BIT_KERNEL |
1334 | bool | 1339 | bool |
1335 | config CPU_SUPPORTS_32BIT_KERNEL | 1340 | config CPU_SUPPORTS_32BIT_KERNEL |
1336 | bool | 1341 | bool |
1337 | config CPU_SUPPORTS_64BIT_KERNEL | 1342 | config CPU_SUPPORTS_64BIT_KERNEL |
1338 | bool | 1343 | bool |
1339 | 1344 | ||
1340 | menu "Kernel type" | 1345 | menu "Kernel type" |
1341 | 1346 | ||
1342 | choice | 1347 | choice |
1343 | 1348 | ||
1344 | prompt "Kernel code model" | 1349 | prompt "Kernel code model" |
1345 | help | 1350 | help |
1346 | You should only select this option if you have a workload that | 1351 | You should only select this option if you have a workload that |
1347 | actually benefits from 64-bit processing or if your machine has | 1352 | actually benefits from 64-bit processing or if your machine has |
1348 | large memory. You will only be presented a single option in this | 1353 | large memory. You will only be presented a single option in this |
1349 | menu if your system does not support both 32-bit and 64-bit kernels. | 1354 | menu if your system does not support both 32-bit and 64-bit kernels. |
1350 | 1355 | ||
1351 | config 32BIT | 1356 | config 32BIT |
1352 | bool "32-bit kernel" | 1357 | bool "32-bit kernel" |
1353 | depends on CPU_SUPPORTS_32BIT_KERNEL && SYS_SUPPORTS_32BIT_KERNEL | 1358 | depends on CPU_SUPPORTS_32BIT_KERNEL && SYS_SUPPORTS_32BIT_KERNEL |
1354 | select TRAD_SIGNALS | 1359 | select TRAD_SIGNALS |
1355 | help | 1360 | help |
1356 | Select this option if you want to build a 32-bit kernel. | 1361 | Select this option if you want to build a 32-bit kernel. |
1357 | config 64BIT | 1362 | config 64BIT |
1358 | bool "64-bit kernel" | 1363 | bool "64-bit kernel" |
1359 | depends on CPU_SUPPORTS_64BIT_KERNEL && SYS_SUPPORTS_64BIT_KERNEL | 1364 | depends on CPU_SUPPORTS_64BIT_KERNEL && SYS_SUPPORTS_64BIT_KERNEL |
1360 | help | 1365 | help |
1361 | Select this option if you want to build a 64-bit kernel. | 1366 | Select this option if you want to build a 64-bit kernel. |
1362 | 1367 | ||
1363 | endchoice | 1368 | endchoice |
1364 | 1369 | ||
1365 | choice | 1370 | choice |
1366 | prompt "Kernel page size" | 1371 | prompt "Kernel page size" |
1367 | default PAGE_SIZE_4KB | 1372 | default PAGE_SIZE_4KB |
1368 | 1373 | ||
1369 | config PAGE_SIZE_4KB | 1374 | config PAGE_SIZE_4KB |
1370 | bool "4kB" | 1375 | bool "4kB" |
1371 | help | 1376 | help |
1372 | This option selects the standard 4kB Linux page size. On some | 1377 | This option selects the standard 4kB Linux page size. On some |
1373 | R3000-family processors this is the only available page size. Using | 1378 | R3000-family processors this is the only available page size. Using |
1374 | 4kB page size will minimize memory consumption and is therefore | 1379 | 4kB page size will minimize memory consumption and is therefore |
1375 | recommended for low memory systems. | 1380 | recommended for low memory systems. |
1376 | 1381 | ||
1377 | config PAGE_SIZE_8KB | 1382 | config PAGE_SIZE_8KB |
1378 | bool "8kB" | 1383 | bool "8kB" |
1379 | depends on EXPERIMENTAL && CPU_R8000 | 1384 | depends on EXPERIMENTAL && CPU_R8000 |
1380 | help | 1385 | help |
1381 | Using 8kB page size will result in higher performance kernel at | 1386 | Using 8kB page size will result in higher performance kernel at |
1382 | the price of higher memory consumption. This option is available | 1387 | the price of higher memory consumption. This option is available |
1383 | only on the R8000 processor. Note that at the time of this writing | 1388 | only on the R8000 processor. Note that at the time of this writing |
1384 | this option is still highly experimental; there are also issues with | 1389 | this option is still highly experimental; there are also issues with |
1385 | compatibility of user applications. | 1390 | compatibility of user applications. |
1386 | 1391 | ||
1387 | config PAGE_SIZE_16KB | 1392 | config PAGE_SIZE_16KB |
1388 | bool "16kB" | 1393 | bool "16kB" |
1389 | depends on !CPU_R3000 && !CPU_TX39XX | 1394 | depends on !CPU_R3000 && !CPU_TX39XX |
1390 | help | 1395 | help |
1391 | Using 16kB page size will result in higher performance kernel at | 1396 | Using 16kB page size will result in higher performance kernel at |
1392 | the price of higher memory consumption. This option is available on | 1397 | the price of higher memory consumption. This option is available on |
1393 | all non-R3000 family processors. Note that you will need a suitable | 1398 | all non-R3000 family processors. Note that you will need a suitable |
1394 | Linux distribution to support this. | 1399 | Linux distribution to support this. |
1395 | 1400 | ||
1396 | config PAGE_SIZE_64KB | 1401 | config PAGE_SIZE_64KB |
1397 | bool "64kB" | 1402 | bool "64kB" |
1398 | depends on EXPERIMENTAL && !CPU_R3000 && !CPU_TX39XX | 1403 | depends on EXPERIMENTAL && !CPU_R3000 && !CPU_TX39XX |
1399 | help | 1404 | help |
1400 | Using 64kB page size will result in higher performance kernel at | 1405 | Using 64kB page size will result in higher performance kernel at |
1401 | the price of higher memory consumption. This option is available on | 1406 | the price of higher memory consumption. This option is available on |
1402 | all non-R3000 family processors. Note that at the time of this | 1407 | all non-R3000 family processors. Note that at the time of this |
1403 | writing this option is still highly experimental. | 1408 | writing this option is still highly experimental. |
1404 | 1409 | ||
1405 | endchoice | 1410 | endchoice |
1406 | 1411 | ||
1407 | config BOARD_SCACHE | 1412 | config BOARD_SCACHE |
1408 | bool | 1413 | bool |
1409 | 1414 | ||
1410 | config IP22_CPU_SCACHE | 1415 | config IP22_CPU_SCACHE |
1411 | bool | 1416 | bool |
1412 | select BOARD_SCACHE | 1417 | select BOARD_SCACHE |
1413 | 1418 | ||
1414 | # | 1419 | # |
1415 | # Support for a MIPS32 / MIPS64 style S-caches | 1420 | # Support for a MIPS32 / MIPS64 style S-caches |
1416 | # | 1421 | # |
1417 | config MIPS_CPU_SCACHE | 1422 | config MIPS_CPU_SCACHE |
1418 | bool | 1423 | bool |
1419 | select BOARD_SCACHE | 1424 | select BOARD_SCACHE |
1420 | 1425 | ||
1421 | config R5000_CPU_SCACHE | 1426 | config R5000_CPU_SCACHE |
1422 | bool | 1427 | bool |
1423 | select BOARD_SCACHE | 1428 | select BOARD_SCACHE |
1424 | 1429 | ||
1425 | config RM7000_CPU_SCACHE | 1430 | config RM7000_CPU_SCACHE |
1426 | bool | 1431 | bool |
1427 | select BOARD_SCACHE | 1432 | select BOARD_SCACHE |
1428 | 1433 | ||
1429 | config SIBYTE_DMA_PAGEOPS | 1434 | config SIBYTE_DMA_PAGEOPS |
1430 | bool "Use DMA to clear/copy pages" | 1435 | bool "Use DMA to clear/copy pages" |
1431 | depends on CPU_SB1 | 1436 | depends on CPU_SB1 |
1432 | help | 1437 | help |
1433 | Instead of using the CPU to zero and copy pages, use a Data Mover | 1438 | Instead of using the CPU to zero and copy pages, use a Data Mover |
1434 | channel. These DMA channels are otherwise unused by the standard | 1439 | channel. These DMA channels are otherwise unused by the standard |
1435 | SiByte Linux port. Seems to give a small performance benefit. | 1440 | SiByte Linux port. Seems to give a small performance benefit. |
1436 | 1441 | ||
1437 | config CPU_HAS_PREFETCH | 1442 | config CPU_HAS_PREFETCH |
1438 | bool | 1443 | bool |
1439 | 1444 | ||
1440 | choice | 1445 | choice |
1441 | prompt "MIPS MT options" | 1446 | prompt "MIPS MT options" |
1442 | 1447 | ||
1443 | config MIPS_MT_DISABLED | 1448 | config MIPS_MT_DISABLED |
1444 | bool "Disable multithreading support." | 1449 | bool "Disable multithreading support." |
1445 | help | 1450 | help |
1446 | Use this option if your workload can't take advantage of | 1451 | Use this option if your workload can't take advantage of |
1447 | MIPS hardware multithreading support. On systems that don't have | 1452 | MIPS hardware multithreading support. On systems that don't have |
1448 | the option of an MT-enabled processor this option will be the only | 1453 | the option of an MT-enabled processor this option will be the only |
1449 | option in this menu. | 1454 | option in this menu. |
1450 | 1455 | ||
1451 | config MIPS_MT_SMP | 1456 | config MIPS_MT_SMP |
1452 | bool "Use 1 TC on each available VPE for SMP" | 1457 | bool "Use 1 TC on each available VPE for SMP" |
1453 | depends on SYS_SUPPORTS_MULTITHREADING | 1458 | depends on SYS_SUPPORTS_MULTITHREADING |
1454 | select CPU_MIPSR2_IRQ_VI | 1459 | select CPU_MIPSR2_IRQ_VI |
1455 | select CPU_MIPSR2_IRQ_EI | 1460 | select CPU_MIPSR2_IRQ_EI |
1456 | select MIPS_MT | 1461 | select MIPS_MT |
1457 | select NR_CPUS_DEFAULT_2 | 1462 | select NR_CPUS_DEFAULT_2 |
1458 | select SMP | 1463 | select SMP |
1459 | select SYS_SUPPORTS_SCHED_SMT if SMP | 1464 | select SYS_SUPPORTS_SCHED_SMT if SMP |
1460 | select SYS_SUPPORTS_SMP | 1465 | select SYS_SUPPORTS_SMP |
1461 | select SMP_UP | 1466 | select SMP_UP |
1462 | help | 1467 | help |
1463 | This is a kernel model which is also known as VSMP or has lately | 1468 | This is a kernel model which is also known as VSMP or has lately |
1464 | been marketed as SMVP. | 1469 | been marketed as SMVP. |
1465 | 1470 | ||
1466 | config MIPS_MT_SMTC | 1471 | config MIPS_MT_SMTC |
1467 | bool "SMTC: Use all TCs on all VPEs for SMP" | 1472 | bool "SMTC: Use all TCs on all VPEs for SMP" |
1468 | depends on CPU_MIPS32_R2 | 1473 | depends on CPU_MIPS32_R2 |
1469 | #depends on CPU_MIPS64_R2 # once there is hardware ... | 1474 | #depends on CPU_MIPS64_R2 # once there is hardware ... |
1470 | depends on SYS_SUPPORTS_MULTITHREADING | 1475 | depends on SYS_SUPPORTS_MULTITHREADING |
1471 | select GENERIC_CLOCKEVENTS_BROADCAST | 1476 | select GENERIC_CLOCKEVENTS_BROADCAST |
1472 | select CPU_MIPSR2_IRQ_VI | 1477 | select CPU_MIPSR2_IRQ_VI |
1473 | select CPU_MIPSR2_IRQ_EI | 1478 | select CPU_MIPSR2_IRQ_EI |
1474 | select MIPS_MT | 1479 | select MIPS_MT |
1475 | select NR_CPUS_DEFAULT_8 | 1480 | select NR_CPUS_DEFAULT_8 |
1476 | select SMP | 1481 | select SMP |
1477 | select SYS_SUPPORTS_SMP | 1482 | select SYS_SUPPORTS_SMP |
1478 | select SMP_UP | 1483 | select SMP_UP |
1479 | help | 1484 | help |
1480 | This is a kernel model which is known as SMTC or has lately been | 1485 | This is a kernel model which is known as SMTC or has lately been |
1481 | marketed as SMVP. | 1486 | marketed as SMVP. |
1482 | 1487 | ||
1483 | endchoice | 1488 | endchoice |
1484 | 1489 | ||
1485 | config MIPS_MT | 1490 | config MIPS_MT |
1486 | bool | 1491 | bool |
1487 | 1492 | ||
1488 | config SCHED_SMT | 1493 | config SCHED_SMT |
1489 | bool "SMT (multithreading) scheduler support" | 1494 | bool "SMT (multithreading) scheduler support" |
1490 | depends on SYS_SUPPORTS_SCHED_SMT | 1495 | depends on SYS_SUPPORTS_SCHED_SMT |
1491 | default n | 1496 | default n |
1492 | help | 1497 | help |
1493 | SMT scheduler support improves the CPU scheduler's decision making | 1498 | SMT scheduler support improves the CPU scheduler's decision making |
1494 | when dealing with MIPS MT enabled cores at a cost of slightly | 1499 | when dealing with MIPS MT enabled cores at a cost of slightly |
1495 | increased overhead in some places. If unsure, say N here. | 1500 | increased overhead in some places. If unsure, say N here. |
1496 | 1501 | ||
1497 | config SYS_SUPPORTS_SCHED_SMT | 1502 | config SYS_SUPPORTS_SCHED_SMT |
1498 | bool | 1503 | bool |
1499 | 1504 | ||
1500 | 1505 | ||
1501 | config SYS_SUPPORTS_MULTITHREADING | 1506 | config SYS_SUPPORTS_MULTITHREADING |
1502 | bool | 1507 | bool |
1503 | 1508 | ||
1504 | config MIPS_MT_FPAFF | 1509 | config MIPS_MT_FPAFF |
1505 | bool "Dynamic FPU affinity for FP-intensive threads" | 1510 | bool "Dynamic FPU affinity for FP-intensive threads" |
1506 | default y | 1511 | default y |
1507 | depends on MIPS_MT_SMP || MIPS_MT_SMTC | 1512 | depends on MIPS_MT_SMP || MIPS_MT_SMTC |
1508 | 1513 | ||
1509 | config MIPS_VPE_LOADER | 1514 | config MIPS_VPE_LOADER |
1510 | bool "VPE loader support." | 1515 | bool "VPE loader support." |
1511 | depends on SYS_SUPPORTS_MULTITHREADING | 1516 | depends on SYS_SUPPORTS_MULTITHREADING |
1512 | select CPU_MIPSR2_IRQ_VI | 1517 | select CPU_MIPSR2_IRQ_VI |
1513 | select CPU_MIPSR2_IRQ_EI | 1518 | select CPU_MIPSR2_IRQ_EI |
1514 | select MIPS_MT | 1519 | select MIPS_MT |
1515 | help | 1520 | help |
1516 | Includes a loader for loading an ELF relocatable object | 1521 | Includes a loader for loading an ELF relocatable object |
1517 | onto another VPE and running it. | 1522 | onto another VPE and running it. |
1518 | 1523 | ||
1519 | config MIPS_MT_SMTC_INSTANT_REPLAY | 1524 | config MIPS_MT_SMTC_INSTANT_REPLAY |
1520 | bool "Low-latency Dispatch of Deferred SMTC IPIs" | 1525 | bool "Low-latency Dispatch of Deferred SMTC IPIs" |
1521 | depends on MIPS_MT_SMTC && !PREEMPT | 1526 | depends on MIPS_MT_SMTC && !PREEMPT |
1522 | default y | 1527 | default y |
1523 | help | 1528 | help |
1524 | SMTC pseudo-interrupts between TCs are deferred and queued | 1529 | SMTC pseudo-interrupts between TCs are deferred and queued |
1525 | if the target TC is interrupt-inhibited (IXMT). In the first | 1530 | if the target TC is interrupt-inhibited (IXMT). In the first |
1526 | SMTC prototypes, these queued IPIs were serviced on return | 1531 | SMTC prototypes, these queued IPIs were serviced on return |
1527 | to user mode, or on entry into the kernel idle loop. The | 1532 | to user mode, or on entry into the kernel idle loop. The |
1528 | INSTANT_REPLAY option dispatches them as part of local_irq_restore() | 1533 | INSTANT_REPLAY option dispatches them as part of local_irq_restore() |
1529 | processing, which adds runtime overhead (hence the option to turn | 1534 | processing, which adds runtime overhead (hence the option to turn |
1530 | it off), but ensures that IPIs are handled promptly even under | 1535 | it off), but ensures that IPIs are handled promptly even under |
1531 | heavy I/O interrupt load. | 1536 | heavy I/O interrupt load. |
1532 | 1537 | ||
1533 | config MIPS_MT_SMTC_IM_BACKSTOP | 1538 | config MIPS_MT_SMTC_IM_BACKSTOP |
1534 | bool "Use per-TC register bits as backstop for inhibited IM bits" | 1539 | bool "Use per-TC register bits as backstop for inhibited IM bits" |
1535 | depends on MIPS_MT_SMTC | 1540 | depends on MIPS_MT_SMTC |
1536 | default y | 1541 | default y |
1537 | help | 1542 | help |
1538 | To support multiple TC microthreads acting as "CPUs" within | 1543 | To support multiple TC microthreads acting as "CPUs" within |
1539 | a VPE, VPE-wide interrupt mask bits must be specially manipulated | 1544 | a VPE, VPE-wide interrupt mask bits must be specially manipulated |
1540 | during interrupt handling. To support legacy drivers and interrupt | 1545 | during interrupt handling. To support legacy drivers and interrupt |
1541 | controller management code, SMTC has a "backstop" to track and | 1546 | controller management code, SMTC has a "backstop" to track and |
1542 | if necessary restore the interrupt mask. This has some performance | 1547 | if necessary restore the interrupt mask. This has some performance |
1543 | impact on interrupt service overhead. Disable it only if you know | 1548 | impact on interrupt service overhead. Disable it only if you know |
1544 | what you are doing. | 1549 | what you are doing. |
1545 | 1550 | ||
1546 | config MIPS_MT_SMTC_IRQAFF | 1551 | config MIPS_MT_SMTC_IRQAFF |
1547 | bool "Support IRQ affinity API" | 1552 | bool "Support IRQ affinity API" |
1548 | depends on MIPS_MT_SMTC | 1553 | depends on MIPS_MT_SMTC |
1549 | default n | 1554 | default n |
1550 | help | 1555 | help |
1551 | Enables SMP IRQ affinity API (/proc/irq/*/smp_affinity, etc.) | 1556 | Enables SMP IRQ affinity API (/proc/irq/*/smp_affinity, etc.) |
1552 | for the SMTC Linux kernel. Requires platform support, of which | 1557 | for the SMTC Linux kernel. Requires platform support, of which |
1553 | an example can be found in the MIPS kernel i8259 and Malta | 1558 | an example can be found in the MIPS kernel i8259 and Malta |
1554 | platform code. It is recommended that MIPS_MT_SMTC_INSTANT_REPLAY | 1559 | platform code. It is recommended that MIPS_MT_SMTC_INSTANT_REPLAY |
1555 | be enabled if MIPS_MT_SMTC_IRQAFF is used. Adds overhead to | 1560 | be enabled if MIPS_MT_SMTC_IRQAFF is used. Adds overhead to |
1556 | interrupt dispatch, and should be used only if you know what | 1561 | interrupt dispatch, and should be used only if you know what |
1557 | you are doing. | 1562 | you are doing. |
1558 | 1563 | ||
1559 | config MIPS_VPE_LOADER_TOM | 1564 | config MIPS_VPE_LOADER_TOM |
1560 | bool "Load VPE program into memory hidden from linux" | 1565 | bool "Load VPE program into memory hidden from linux" |
1561 | depends on MIPS_VPE_LOADER | 1566 | depends on MIPS_VPE_LOADER |
1562 | default y | 1567 | default y |
1563 | help | 1568 | help |
1564 | The loader can use memory that is present but has been hidden from | 1569 | The loader can use memory that is present but has been hidden from |
1565 | Linux using the kernel command line option "mem=xxMB". It's up to | 1570 | Linux using the kernel command line option "mem=xxMB". It's up to |
1566 | you to ensure the amount you put in the option and the space your | 1571 | you to ensure the amount you put in the option and the space your |
1567 | program requires is less than or equal to the amount physically present. | 1572 | program requires is less than or equal to the amount physically present. |
1568 | 1573 | ||
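(Illustrative example for the "mem=" scheme above, with made-up sizes: on a board with 256 MB of RAM, booting with mem=240MB leaves the top 16 MB invisible to Linux, and a VPE program of at most 16 MB can then be loaded into that hidden region.)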
1569 | # this should possibly be in drivers/char, but it is rather cpu related. Hmmm | 1574 | # this should possibly be in drivers/char, but it is rather cpu related. Hmmm |
1570 | config MIPS_VPE_APSP_API | 1575 | config MIPS_VPE_APSP_API |
1571 | bool "Enable support for AP/SP API (RTLX)" | 1576 | bool "Enable support for AP/SP API (RTLX)" |
1572 | depends on MIPS_VPE_LOADER | 1577 | depends on MIPS_VPE_LOADER |
1573 | help | 1578 | help |
1574 | 1579 | ||
1575 | config MIPS_APSP_KSPD | 1580 | config MIPS_APSP_KSPD |
1576 | bool "Enable KSPD" | 1581 | bool "Enable KSPD" |
1577 | depends on MIPS_VPE_APSP_API | 1582 | depends on MIPS_VPE_APSP_API |
1578 | default y | 1583 | default y |
1579 | help | 1584 | help |
1580 | KSPD is a kernel daemon that accepts syscall requests from the SP | 1585 | KSPD is a kernel daemon that accepts syscall requests from the SP |
1581 | side, actions them and returns the results. It also handles the | 1586 | side, actions them and returns the results. It also handles the |
1582 | "exit" syscall notifying other kernel modules the SP program is | 1587 | "exit" syscall notifying other kernel modules the SP program is |
1583 | exiting. You probably want to say yes here. | 1588 | exiting. You probably want to say yes here. |
1584 | 1589 | ||
1585 | config SB1_PASS_1_WORKAROUNDS | 1590 | config SB1_PASS_1_WORKAROUNDS |
1586 | bool | 1591 | bool |
1587 | depends on CPU_SB1_PASS_1 | 1592 | depends on CPU_SB1_PASS_1 |
1588 | default y | 1593 | default y |
1589 | 1594 | ||
1590 | config SB1_PASS_2_WORKAROUNDS | 1595 | config SB1_PASS_2_WORKAROUNDS |
1591 | bool | 1596 | bool |
1592 | depends on CPU_SB1 && (CPU_SB1_PASS_2_2 || CPU_SB1_PASS_2) | 1597 | depends on CPU_SB1 && (CPU_SB1_PASS_2_2 || CPU_SB1_PASS_2) |
1593 | default y | 1598 | default y |
1594 | 1599 | ||
1595 | config SB1_PASS_2_1_WORKAROUNDS | 1600 | config SB1_PASS_2_1_WORKAROUNDS |
1596 | bool | 1601 | bool |
1597 | depends on CPU_SB1 && CPU_SB1_PASS_2 | 1602 | depends on CPU_SB1 && CPU_SB1_PASS_2 |
1598 | default y | 1603 | default y |
1599 | 1604 | ||
1600 | config 64BIT_PHYS_ADDR | 1605 | config 64BIT_PHYS_ADDR |
1601 | bool | 1606 | bool |
1602 | 1607 | ||
1603 | config CPU_HAS_LLSC | 1608 | config CPU_HAS_LLSC |
1604 | bool | 1609 | bool |
1605 | 1610 | ||
1606 | config CPU_HAS_SMARTMIPS | 1611 | config CPU_HAS_SMARTMIPS |
1607 | depends on SYS_SUPPORTS_SMARTMIPS | 1612 | depends on SYS_SUPPORTS_SMARTMIPS |
1608 | bool "Support for the SmartMIPS ASE" | 1613 | bool "Support for the SmartMIPS ASE" |
1609 | help | 1614 | help |
1610 | SmartMIPS is an extension of the MIPS32 architecture aimed at | 1615 | SmartMIPS is an extension of the MIPS32 architecture aimed at |
1611 | increased security at both the hardware and software level for | 1616 | increased security at both the hardware and software level for |
1612 | smartcards. Enabling this option will allow proper use of the | 1617 | smartcards. Enabling this option will allow proper use of the |
1613 | SmartMIPS instructions by Linux applications. However, a kernel with | 1618 | SmartMIPS instructions by Linux applications. However, a kernel with |
1614 | this option will not work on a MIPS core without the SmartMIPS ASE. If | 1619 | this option will not work on a MIPS core without the SmartMIPS ASE. If |
1615 | you don't know, you probably don't have SmartMIPS and should say N | 1620 | you don't know, you probably don't have SmartMIPS and should say N |
1616 | here. | 1621 | here. |
1617 | 1622 | ||
1618 | config CPU_HAS_WB | 1623 | config CPU_HAS_WB |
1619 | bool | 1624 | bool |
1620 | 1625 | ||
1621 | # | 1626 | # |
1622 | # Vectored interrupt mode is an R2 feature | 1627 | # Vectored interrupt mode is an R2 feature |
1623 | # | 1628 | # |
1624 | config CPU_MIPSR2_IRQ_VI | 1629 | config CPU_MIPSR2_IRQ_VI |
1625 | bool | 1630 | bool |
1626 | 1631 | ||
1627 | # | 1632 | # |
1628 | # Extended interrupt mode is an R2 feature | 1633 | # Extended interrupt mode is an R2 feature |
1629 | # | 1634 | # |
1630 | config CPU_MIPSR2_IRQ_EI | 1635 | config CPU_MIPSR2_IRQ_EI |
1631 | bool | 1636 | bool |
1632 | 1637 | ||
1633 | config CPU_HAS_SYNC | 1638 | config CPU_HAS_SYNC |
1634 | bool | 1639 | bool |
1635 | depends on !CPU_R3000 | 1640 | depends on !CPU_R3000 |
1636 | default y | 1641 | default y |
1637 | 1642 | ||
1638 | config GENERIC_CLOCKEVENTS_BROADCAST | 1643 | config GENERIC_CLOCKEVENTS_BROADCAST |
1639 | bool | 1644 | bool |
1640 | 1645 | ||
1641 | # | 1646 | # |
1642 | # CPU non-features | 1647 | # CPU non-features |
1643 | # | 1648 | # |
1644 | config CPU_DADDI_WORKAROUNDS | 1649 | config CPU_DADDI_WORKAROUNDS |
1645 | bool | 1650 | bool |
1646 | 1651 | ||
1647 | config CPU_R4000_WORKAROUNDS | 1652 | config CPU_R4000_WORKAROUNDS |
1648 | bool | 1653 | bool |
1649 | select CPU_R4400_WORKAROUNDS | 1654 | select CPU_R4400_WORKAROUNDS |
1650 | 1655 | ||
1651 | config CPU_R4400_WORKAROUNDS | 1656 | config CPU_R4400_WORKAROUNDS |
1652 | bool | 1657 | bool |
1653 | 1658 | ||
1654 | # | 1659 | # |
1655 | # Use the generic interrupt handling code in kernel/irq/: | 1660 | # Use the generic interrupt handling code in kernel/irq/: |
1656 | # | 1661 | # |
1657 | config GENERIC_HARDIRQS | 1662 | config GENERIC_HARDIRQS |
1658 | bool | 1663 | bool |
1659 | default y | 1664 | default y |
1660 | 1665 | ||
1661 | config GENERIC_IRQ_PROBE | 1666 | config GENERIC_IRQ_PROBE |
1662 | bool | 1667 | bool |
1663 | default y | 1668 | default y |
1664 | 1669 | ||
1665 | config IRQ_PER_CPU | 1670 | config IRQ_PER_CPU |
1666 | bool | 1671 | bool |
1667 | 1672 | ||
1668 | # | 1673 | # |
1669 | # - Highmem only makes sense for the 32-bit kernel. | 1674 | # - Highmem only makes sense for the 32-bit kernel. |
1670 | # - The current highmem code will only work properly on physically indexed | 1675 | # - The current highmem code will only work properly on physically indexed |
1671 | # caches such as R3000, SB1, R7000 or those that look like they're virtually | 1676 | # caches such as R3000, SB1, R7000 or those that look like they're virtually |
1672 | # indexed such as R4000/R4400 SC and MC versions or R10000. So for the | 1677 | # indexed such as R4000/R4400 SC and MC versions or R10000. So for the |
1673 | # moment we protect the user and offer the highmem option only on machines | 1678 | # moment we protect the user and offer the highmem option only on machines |
1674 | # where it's known to be safe. This will not offer highmem on a few systems | 1679 | # where it's known to be safe. This will not offer highmem on a few systems |
1675 | # such as MIPS32 and MIPS64 CPUs which may have virtually and physically | 1680 | # such as MIPS32 and MIPS64 CPUs which may have virtually and physically |
1676 | # indexed caches, but we're playing safe. | 1681 | # indexed caches, but we're playing safe. |
1677 | # - We use SYS_SUPPORTS_HIGHMEM to offer highmem only for systems where we | 1682 | # - We use SYS_SUPPORTS_HIGHMEM to offer highmem only for systems where we |
1678 | # know they might have memory configurations that could make use of highmem | 1683 | # know they might have memory configurations that could make use of highmem |
1679 | # support. | 1684 | # support. |
1680 | # | 1685 | # |
1681 | config HIGHMEM | 1686 | config HIGHMEM |
1682 | bool "High Memory Support" | 1687 | bool "High Memory Support" |
1683 | depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM | 1688 | depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM |
1684 | 1689 | ||
1685 | config CPU_SUPPORTS_HIGHMEM | 1690 | config CPU_SUPPORTS_HIGHMEM |
1686 | bool | 1691 | bool |
1687 | 1692 | ||
1688 | config SYS_SUPPORTS_HIGHMEM | 1693 | config SYS_SUPPORTS_HIGHMEM |
1689 | bool | 1694 | bool |
1690 | 1695 | ||
1691 | config SYS_SUPPORTS_SMARTMIPS | 1696 | config SYS_SUPPORTS_SMARTMIPS |
1692 | bool | 1697 | bool |
1693 | 1698 | ||
1694 | config ARCH_FLATMEM_ENABLE | 1699 | config ARCH_FLATMEM_ENABLE |
1695 | def_bool y | 1700 | def_bool y |
1696 | depends on !NUMA | 1701 | depends on !NUMA |
1697 | 1702 | ||
1698 | config ARCH_DISCONTIGMEM_ENABLE | 1703 | config ARCH_DISCONTIGMEM_ENABLE |
1699 | bool | 1704 | bool |
1700 | default y if SGI_IP27 | 1705 | default y if SGI_IP27 |
1701 | help | 1706 | help |
1702 | Say Y to support efficient handling of discontiguous physical memory, | 1707 | Say Y to support efficient handling of discontiguous physical memory, |
1703 | for architectures which are either NUMA (Non-Uniform Memory Access) | 1708 | for architectures which are either NUMA (Non-Uniform Memory Access) |
1704 | or have huge holes in the physical address space for other reasons. | 1709 | or have huge holes in the physical address space for other reasons. |
1705 | See <file:Documentation/vm/numa> for more. | 1710 | See <file:Documentation/vm/numa> for more. |
1706 | 1711 | ||
1707 | config ARCH_POPULATES_NODE_MAP | 1712 | config ARCH_POPULATES_NODE_MAP |
1708 | def_bool y | 1713 | def_bool y |
1709 | 1714 | ||
1710 | config ARCH_SPARSEMEM_ENABLE | 1715 | config ARCH_SPARSEMEM_ENABLE |
1711 | bool | 1716 | bool |
1712 | select SPARSEMEM_STATIC | 1717 | select SPARSEMEM_STATIC |
1713 | 1718 | ||
1714 | config NUMA | 1719 | config NUMA |
1715 | bool "NUMA Support" | 1720 | bool "NUMA Support" |
1716 | depends on SYS_SUPPORTS_NUMA | 1721 | depends on SYS_SUPPORTS_NUMA |
1717 | help | 1722 | help |
1718 | Say Y to compile the kernel to support NUMA (Non-Uniform Memory | 1723 | Say Y to compile the kernel to support NUMA (Non-Uniform Memory |
1719 | Access). This option improves performance on systems with more | 1724 | Access). This option improves performance on systems with more |
1720 | than two nodes; on two-node systems it is generally better to | 1725 | than two nodes; on two-node systems it is generally better to |
1721 | leave it disabled; on single-node systems this option should be | 1726 | leave it disabled; on single-node systems this option should be |
1722 | left disabled. | 1727 | left disabled. |
1723 | 1728 | ||
1724 | config SYS_SUPPORTS_NUMA | 1729 | config SYS_SUPPORTS_NUMA |
1725 | bool | 1730 | bool |
1726 | 1731 | ||
1727 | config NODES_SHIFT | 1732 | config NODES_SHIFT |
1728 | int | 1733 | int |
1729 | default "6" | 1734 | default "6" |
1730 | depends on NEED_MULTIPLE_NODES | 1735 | depends on NEED_MULTIPLE_NODES |
1731 | 1736 | ||
1732 | source "mm/Kconfig" | 1737 | source "mm/Kconfig" |
1733 | 1738 | ||
1734 | config SMP | 1739 | config SMP |
1735 | bool "Multi-Processing support" | 1740 | bool "Multi-Processing support" |
1736 | depends on SYS_SUPPORTS_SMP | 1741 | depends on SYS_SUPPORTS_SMP |
1737 | select IRQ_PER_CPU | 1742 | select IRQ_PER_CPU |
1738 | help | 1743 | help |
1739 | This enables support for systems with more than one CPU. If you have | 1744 | This enables support for systems with more than one CPU. If you have |
1740 | a system with only one CPU, like most personal computers, say N. If | 1745 | a system with only one CPU, like most personal computers, say N. If |
1741 | you have a system with more than one CPU, say Y. | 1746 | you have a system with more than one CPU, say Y. |
1742 | 1747 | ||
1743 | If you say N here, the kernel will run on single and multiprocessor | 1748 | If you say N here, the kernel will run on single and multiprocessor |
1744 | machines, but will use only one CPU of a multiprocessor machine. If | 1749 | machines, but will use only one CPU of a multiprocessor machine. If |
1745 | you say Y here, the kernel will run on many, but not all, | 1750 | you say Y here, the kernel will run on many, but not all, |
1746 | singleprocessor machines. On a singleprocessor machine, the kernel | 1751 | singleprocessor machines. On a singleprocessor machine, the kernel |
1747 | will run faster if you say N here. | 1752 | will run faster if you say N here. |
1748 | 1753 | ||
1749 | People using multiprocessor machines who say Y here should also say | 1754 | People using multiprocessor machines who say Y here should also say |
1750 | Y to "Enhanced Real Time Clock Support", below. | 1755 | Y to "Enhanced Real Time Clock Support", below. |
1751 | 1756 | ||
1752 | See also the <file:Documentation/smp.txt> and the SMP-HOWTO | 1757 | See also the <file:Documentation/smp.txt> and the SMP-HOWTO |
1753 | available at <http://www.tldp.org/docs.html#howto>. | 1758 | available at <http://www.tldp.org/docs.html#howto>. |
1754 | 1759 | ||
1755 | If you don't know what to do here, say N. | 1760 | If you don't know what to do here, say N. |
1756 | 1761 | ||
1757 | config SMP_UP | 1762 | config SMP_UP |
1758 | bool | 1763 | bool |
1759 | 1764 | ||
1760 | config SYS_SUPPORTS_SMP | 1765 | config SYS_SUPPORTS_SMP |
1761 | bool | 1766 | bool |
1762 | 1767 | ||
1763 | config NR_CPUS_DEFAULT_1 | 1768 | config NR_CPUS_DEFAULT_1 |
1764 | bool | 1769 | bool |
1765 | 1770 | ||
1766 | config NR_CPUS_DEFAULT_2 | 1771 | config NR_CPUS_DEFAULT_2 |
1767 | bool | 1772 | bool |
1768 | 1773 | ||
1769 | config NR_CPUS_DEFAULT_4 | 1774 | config NR_CPUS_DEFAULT_4 |
1770 | bool | 1775 | bool |
1771 | 1776 | ||
1772 | config NR_CPUS_DEFAULT_8 | 1777 | config NR_CPUS_DEFAULT_8 |
1773 | bool | 1778 | bool |
1774 | 1779 | ||
1775 | config NR_CPUS_DEFAULT_16 | 1780 | config NR_CPUS_DEFAULT_16 |
1776 | bool | 1781 | bool |
1777 | 1782 | ||
1778 | config NR_CPUS_DEFAULT_32 | 1783 | config NR_CPUS_DEFAULT_32 |
1779 | bool | 1784 | bool |
1780 | 1785 | ||
1781 | config NR_CPUS_DEFAULT_64 | 1786 | config NR_CPUS_DEFAULT_64 |
1782 | bool | 1787 | bool |
1783 | 1788 | ||
1784 | config NR_CPUS | 1789 | config NR_CPUS |
1785 | int "Maximum number of CPUs (2-64)" | 1790 | int "Maximum number of CPUs (2-64)" |
1786 | range 1 64 if NR_CPUS_DEFAULT_1 | 1791 | range 1 64 if NR_CPUS_DEFAULT_1 |
1787 | depends on SMP | 1792 | depends on SMP |
1788 | default "1" if NR_CPUS_DEFAULT_1 | 1793 | default "1" if NR_CPUS_DEFAULT_1 |
1789 | default "2" if NR_CPUS_DEFAULT_2 | 1794 | default "2" if NR_CPUS_DEFAULT_2 |
1790 | default "4" if NR_CPUS_DEFAULT_4 | 1795 | default "4" if NR_CPUS_DEFAULT_4 |
1791 | default "8" if NR_CPUS_DEFAULT_8 | 1796 | default "8" if NR_CPUS_DEFAULT_8 |
1792 | default "16" if NR_CPUS_DEFAULT_16 | 1797 | default "16" if NR_CPUS_DEFAULT_16 |
1793 | default "32" if NR_CPUS_DEFAULT_32 | 1798 | default "32" if NR_CPUS_DEFAULT_32 |
1794 | default "64" if NR_CPUS_DEFAULT_64 | 1799 | default "64" if NR_CPUS_DEFAULT_64 |
1795 | help | 1800 | help |
1796 | This allows you to specify the maximum number of CPUs which this | 1801 | This allows you to specify the maximum number of CPUs which this |
1797 | kernel will support. The maximum supported value is 32 for 32-bit | 1802 | kernel will support. The maximum supported value is 32 for 32-bit |
1798 | kernels and 64 for 64-bit kernels; the minimum value which makes | 1803 | kernels and 64 for 64-bit kernels; the minimum value which makes |
1799 | sense is 1 for Qemu (useful only for kernel debugging purposes) | 1804 | sense is 1 for Qemu (useful only for kernel debugging purposes) |
1800 | and 2 for all others. | 1805 | and 2 for all others. |
1801 | 1806 | ||
1802 | This is purely to save memory - each supported CPU adds | 1807 | This is purely to save memory - each supported CPU adds |
1803 | approximately eight kilobytes to the kernel image. For best | 1808 | approximately eight kilobytes to the kernel image. For best |
1804 | performance you should round up your number of processors to the | 1809 | performance you should round up your number of processors to the |
1805 | next power of two. | 1810 | next power of two. |
1806 | 1811 | ||
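(Rough arithmetic for the figure quoted above, purely illustrative: at roughly 8 KB per supported CPU, an NR_CPUS=64 build pays about 64 x 8 KB = 512 KB of the kernel image, while NR_CPUS=2 costs only about 16 KB.)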
1807 | source "kernel/time/Kconfig" | 1812 | source "kernel/time/Kconfig" |
1808 | 1813 | ||
1809 | # | 1814 | # |
1810 | # Timer Interrupt Frequency Configuration | 1815 | # Timer Interrupt Frequency Configuration |
1811 | # | 1816 | # |
1812 | 1817 | ||
1813 | choice | 1818 | choice |
1814 | prompt "Timer frequency" | 1819 | prompt "Timer frequency" |
1815 | default HZ_250 | 1820 | default HZ_250 |
1816 | help | 1821 | help |
1817 | Allows the configuration of the timer frequency. | 1822 | Allows the configuration of the timer frequency. |
1818 | 1823 | ||
1819 | config HZ_48 | 1824 | config HZ_48 |
1820 | bool "48 HZ" if SYS_SUPPORTS_48HZ | 1825 | bool "48 HZ" if SYS_SUPPORTS_48HZ |
1821 | 1826 | ||
1822 | config HZ_100 | 1827 | config HZ_100 |
1823 | bool "100 HZ" if SYS_SUPPORTS_100HZ || SYS_SUPPORTS_ARBIT_HZ | 1828 | bool "100 HZ" if SYS_SUPPORTS_100HZ || SYS_SUPPORTS_ARBIT_HZ |
1824 | 1829 | ||
1825 | config HZ_128 | 1830 | config HZ_128 |
1826 | bool "128 HZ" if SYS_SUPPORTS_128HZ || SYS_SUPPORTS_ARBIT_HZ | 1831 | bool "128 HZ" if SYS_SUPPORTS_128HZ || SYS_SUPPORTS_ARBIT_HZ |
1827 | 1832 | ||
1828 | config HZ_250 | 1833 | config HZ_250 |
1829 | bool "250 HZ" if SYS_SUPPORTS_250HZ || SYS_SUPPORTS_ARBIT_HZ | 1834 | bool "250 HZ" if SYS_SUPPORTS_250HZ || SYS_SUPPORTS_ARBIT_HZ |
1830 | 1835 | ||
1831 | config HZ_256 | 1836 | config HZ_256 |
1832 | bool "256 HZ" if SYS_SUPPORTS_256HZ || SYS_SUPPORTS_ARBIT_HZ | 1837 | bool "256 HZ" if SYS_SUPPORTS_256HZ || SYS_SUPPORTS_ARBIT_HZ |
1833 | 1838 | ||
1834 | config HZ_1000 | 1839 | config HZ_1000 |
1835 | bool "1000 HZ" if SYS_SUPPORTS_1000HZ || SYS_SUPPORTS_ARBIT_HZ | 1840 | bool "1000 HZ" if SYS_SUPPORTS_1000HZ || SYS_SUPPORTS_ARBIT_HZ |
1836 | 1841 | ||
1837 | config HZ_1024 | 1842 | config HZ_1024 |
1838 | bool "1024 HZ" if SYS_SUPPORTS_1024HZ || SYS_SUPPORTS_ARBIT_HZ | 1843 | bool "1024 HZ" if SYS_SUPPORTS_1024HZ || SYS_SUPPORTS_ARBIT_HZ |
1839 | 1844 | ||
1840 | endchoice | 1845 | endchoice |
1841 | 1846 | ||
1842 | config SYS_SUPPORTS_48HZ | 1847 | config SYS_SUPPORTS_48HZ |
1843 | bool | 1848 | bool |
1844 | 1849 | ||
1845 | config SYS_SUPPORTS_100HZ | 1850 | config SYS_SUPPORTS_100HZ |
1846 | bool | 1851 | bool |
1847 | 1852 | ||
1848 | config SYS_SUPPORTS_128HZ | 1853 | config SYS_SUPPORTS_128HZ |
1849 | bool | 1854 | bool |
1850 | 1855 | ||
1851 | config SYS_SUPPORTS_250HZ | 1856 | config SYS_SUPPORTS_250HZ |
1852 | bool | 1857 | bool |
1853 | 1858 | ||
1854 | config SYS_SUPPORTS_256HZ | 1859 | config SYS_SUPPORTS_256HZ |
1855 | bool | 1860 | bool |
1856 | 1861 | ||
1857 | config SYS_SUPPORTS_1000HZ | 1862 | config SYS_SUPPORTS_1000HZ |
1858 | bool | 1863 | bool |
1859 | 1864 | ||
1860 | config SYS_SUPPORTS_1024HZ | 1865 | config SYS_SUPPORTS_1024HZ |
1861 | bool | 1866 | bool |
1862 | 1867 | ||
1863 | config SYS_SUPPORTS_ARBIT_HZ | 1868 | config SYS_SUPPORTS_ARBIT_HZ |
1864 | bool | 1869 | bool |
1865 | default y if !SYS_SUPPORTS_48HZ && !SYS_SUPPORTS_100HZ && \ | 1870 | default y if !SYS_SUPPORTS_48HZ && !SYS_SUPPORTS_100HZ && \ |
1866 | !SYS_SUPPORTS_128HZ && !SYS_SUPPORTS_250HZ && \ | 1871 | !SYS_SUPPORTS_128HZ && !SYS_SUPPORTS_250HZ && \ |
1867 | !SYS_SUPPORTS_256HZ && !SYS_SUPPORTS_1000HZ && \ | 1872 | !SYS_SUPPORTS_256HZ && !SYS_SUPPORTS_1000HZ && \ |
1868 | !SYS_SUPPORTS_1024HZ | 1873 | !SYS_SUPPORTS_1024HZ |
1869 | 1874 | ||
1870 | config HZ | 1875 | config HZ |
1871 | int | 1876 | int |
1872 | default 48 if HZ_48 | 1877 | default 48 if HZ_48 |
1873 | default 100 if HZ_100 | 1878 | default 100 if HZ_100 |
1874 | default 128 if HZ_128 | 1879 | default 128 if HZ_128 |
1875 | default 250 if HZ_250 | 1880 | default 250 if HZ_250 |
1876 | default 256 if HZ_256 | 1881 | default 256 if HZ_256 |
1877 | default 1000 if HZ_1000 | 1882 | default 1000 if HZ_1000 |
1878 | default 1024 if HZ_1024 | 1883 | default 1024 if HZ_1024 |
1879 | 1884 | ||
1880 | source "kernel/Kconfig.preempt" | 1885 | source "kernel/Kconfig.preempt" |
1881 | 1886 | ||
1882 | config MIPS_INSANE_LARGE | 1887 | config MIPS_INSANE_LARGE |
1883 | bool "Support for large 64-bit configurations" | 1888 | bool "Support for large 64-bit configurations" |
1884 | depends on CPU_R10000 && 64BIT | 1889 | depends on CPU_R10000 && 64BIT |
1885 | help | 1890 | help |
1886 | MIPS R10000 does support a 44 bit / 16TB address space as opposed to | 1891 | MIPS R10000 does support a 44 bit / 16TB address space as opposed to |
1887 | previous 64-bit processors which only supported 40 bit / 1TB. If you | 1892 | previous 64-bit processors which only supported 40 bit / 1TB. If you |
1888 | need processes with more than 1TB of virtual address space, say Y here. | 1893 | need processes with more than 1TB of virtual address space, say Y here. |
1889 | This will result in additional memory usage, so it is not | 1894 | This will result in additional memory usage, so it is not |
1890 | recommended for normal users. | 1895 | recommended for normal users. |
1891 | 1896 | ||
1892 | config KEXEC | 1897 | config KEXEC |
1893 | bool "Kexec system call (EXPERIMENTAL)" | 1898 | bool "Kexec system call (EXPERIMENTAL)" |
1894 | depends on EXPERIMENTAL | 1899 | depends on EXPERIMENTAL |
1895 | help | 1900 | help |
1896 | kexec is a system call that implements the ability to shut down your | 1901 | kexec is a system call that implements the ability to shut down your |
1897 | current kernel and to start another kernel. It is like a reboot, | 1902 | current kernel and to start another kernel. It is like a reboot, |
1898 | but it is independent of the system firmware. And, like a reboot, | 1903 | but it is independent of the system firmware. And, like a reboot, |
1899 | you can start any kernel with it, not just Linux. | 1904 | you can start any kernel with it, not just Linux. |
1900 | 1905 | ||
1901 | The name comes from the similarity to the exec system call. | 1906 | The name comes from the similarity to the exec system call. |
1902 | 1907 | ||
1903 | It is an ongoing process to be certain the hardware in a machine | 1908 | It is an ongoing process to be certain the hardware in a machine |
1904 | is properly shut down, so do not be surprised if this code does not | 1909 | is properly shut down, so do not be surprised if this code does not |
1905 | initially work for you. It may help to enable device hotplugging | 1910 | initially work for you. It may help to enable device hotplugging |
1906 | support. As of this writing the exact hardware interface is | 1911 | support. As of this writing the exact hardware interface is |
1907 | strongly in flux, so no good recommendation can be made. | 1912 | strongly in flux, so no good recommendation can be made. |
1908 | 1913 | ||
1909 | config SECCOMP | 1914 | config SECCOMP |
1910 | bool "Enable seccomp to safely compute untrusted bytecode" | 1915 | bool "Enable seccomp to safely compute untrusted bytecode" |
1911 | depends on PROC_FS | 1916 | depends on PROC_FS |
1912 | default y | 1917 | default y |
1913 | help | 1918 | help |
1914 | This kernel feature is useful for number crunching applications | 1919 | This kernel feature is useful for number crunching applications |
1915 | that may need to compute untrusted bytecode during their | 1920 | that may need to compute untrusted bytecode during their |
1916 | execution. By using pipes or other transports made available to | 1921 | execution. By using pipes or other transports made available to |
1917 | the process as file descriptors supporting the read/write | 1922 | the process as file descriptors supporting the read/write |
1918 | syscalls, it's possible to isolate those applications in | 1923 | syscalls, it's possible to isolate those applications in |
1919 | their own address space using seccomp. Once seccomp is | 1924 | their own address space using seccomp. Once seccomp is |
1920 | enabled via /proc/<pid>/seccomp, it cannot be disabled | 1925 | enabled via /proc/<pid>/seccomp, it cannot be disabled |
1921 | and the task is only allowed to execute a few safe syscalls | 1926 | and the task is only allowed to execute a few safe syscalls |
1922 | defined by each seccomp mode. | 1927 | defined by each seccomp mode. |
1923 | 1928 | ||
1924 | If unsure, say Y. Only embedded systems should say N here. | 1929 | If unsure, say Y. Only embedded systems should say N here. |
1925 | 1930 | ||
1926 | endmenu | 1931 | endmenu |
1927 | 1932 | ||
1928 | config RWSEM_GENERIC_SPINLOCK | 1933 | config RWSEM_GENERIC_SPINLOCK |
1929 | bool | 1934 | bool |
1930 | default y | 1935 | default y |
1931 | 1936 | ||
1932 | config LOCKDEP_SUPPORT | 1937 | config LOCKDEP_SUPPORT |
1933 | bool | 1938 | bool |
1934 | default y | 1939 | default y |
1935 | 1940 | ||
1936 | config STACKTRACE_SUPPORT | 1941 | config STACKTRACE_SUPPORT |
1937 | bool | 1942 | bool |
1938 | default y | 1943 | default y |
1939 | 1944 | ||
1940 | source "init/Kconfig" | 1945 | source "init/Kconfig" |
1941 | 1946 | ||
1942 | menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" | 1947 | menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" |
1943 | 1948 | ||
1944 | config HW_HAS_EISA | 1949 | config HW_HAS_EISA |
1945 | bool | 1950 | bool |
1946 | config HW_HAS_PCI | 1951 | config HW_HAS_PCI |
1947 | bool | 1952 | bool |
1948 | 1953 | ||
1949 | config PCI | 1954 | config PCI |
1950 | bool "Support for PCI controller" | 1955 | bool "Support for PCI controller" |
1951 | depends on HW_HAS_PCI | 1956 | depends on HW_HAS_PCI |
1952 | select PCI_DOMAINS | 1957 | select PCI_DOMAINS |
1953 | help | 1958 | help |
1954 | Find out whether you have a PCI motherboard. PCI is the name of a | 1959 | Find out whether you have a PCI motherboard. PCI is the name of a |
1955 | bus system, i.e. the way the CPU talks to the other stuff inside | 1960 | bus system, i.e. the way the CPU talks to the other stuff inside |
1956 | your box. Other bus systems are ISA, EISA, or VESA. If you have PCI, | 1961 | your box. Other bus systems are ISA, EISA, or VESA. If you have PCI, |
1957 | say Y, otherwise N. | 1962 | say Y, otherwise N. |
1958 | 1963 | ||
1959 | The PCI-HOWTO, available from | 1964 | The PCI-HOWTO, available from |
1960 | <http://www.tldp.org/docs.html#howto>, contains valuable | 1965 | <http://www.tldp.org/docs.html#howto>, contains valuable |
1961 | information about which PCI hardware does work under Linux and which | 1966 | information about which PCI hardware does work under Linux and which |
1962 | doesn't. | 1967 | doesn't. |
1963 | 1968 | ||
1964 | config PCI_DOMAINS | 1969 | config PCI_DOMAINS |
1965 | bool | 1970 | bool |
1966 | 1971 | ||
1967 | source "drivers/pci/Kconfig" | 1972 | source "drivers/pci/Kconfig" |
1968 | 1973 | ||
1969 | # | 1974 | # |
1970 | # ISA support is now enabled via select. Too many systems still have the one | 1975 | # ISA support is now enabled via select. Too many systems still have the one |
1971 | # or other ISA chip on the board that users don't know about so don't expect | 1976 | # or other ISA chip on the board that users don't know about so don't expect |
1972 | # users to choose the right thing ... | 1977 | # users to choose the right thing ... |
1973 | # | 1978 | # |
1974 | config ISA | 1979 | config ISA |
1975 | bool | 1980 | bool |
1976 | 1981 | ||
1977 | config EISA | 1982 | config EISA |
1978 | bool "EISA support" | 1983 | bool "EISA support" |
1979 | depends on HW_HAS_EISA | 1984 | depends on HW_HAS_EISA |
1980 | select ISA | 1985 | select ISA |
1981 | select GENERIC_ISA_DMA | 1986 | select GENERIC_ISA_DMA |
1982 | ---help--- | 1987 | ---help--- |
1983 | The Extended Industry Standard Architecture (EISA) bus was | 1988 | The Extended Industry Standard Architecture (EISA) bus was |
1984 | developed as an open alternative to the IBM MicroChannel bus. | 1989 | developed as an open alternative to the IBM MicroChannel bus. |
1985 | 1990 | ||
1986 | The EISA bus provided some of the features of the IBM MicroChannel | 1991 | The EISA bus provided some of the features of the IBM MicroChannel |
1987 | bus while maintaining backward compatibility with cards made for | 1992 | bus while maintaining backward compatibility with cards made for |
1988 | the older ISA bus. The EISA bus saw limited use between 1988 and | 1993 | the older ISA bus. The EISA bus saw limited use between 1988 and |
1989 | 1995 when it was made obsolete by the PCI bus. | 1994 | 1995 when it was made obsolete by the PCI bus. |
1990 | 1995 | ||
1991 | Say Y here if you are building a kernel for an EISA-based machine. | 1996 | Say Y here if you are building a kernel for an EISA-based machine. |
1992 | 1997 | ||
1993 | Otherwise, say N. | 1998 | Otherwise, say N. |
1994 | 1999 | ||
1995 | source "drivers/eisa/Kconfig" | 2000 | source "drivers/eisa/Kconfig" |
1996 | 2001 | ||
1997 | config TC | 2002 | config TC |
1998 | bool "TURBOchannel support" | 2003 | bool "TURBOchannel support" |
1999 | depends on MACH_DECSTATION | 2004 | depends on MACH_DECSTATION |
2000 | help | 2005 | help |
2001 | TurboChannel is a DEC (now Compaq (now HP)) bus for Alpha and MIPS | 2006 | TurboChannel is a DEC (now Compaq (now HP)) bus for Alpha and MIPS |
2002 | processors. Documentation on writing device drivers for TurboChannel | 2007 | processors. Documentation on writing device drivers for TurboChannel |
2003 | is available at: | 2008 | is available at: |
2004 | <http://www.cs.arizona.edu/computer.help/policy/DIGITAL_unix/AA-PS3HD-TET1_html/TITLE.html>. | 2009 | <http://www.cs.arizona.edu/computer.help/policy/DIGITAL_unix/AA-PS3HD-TET1_html/TITLE.html>. |
2005 | 2010 | ||
2006 | #config ACCESSBUS | 2011 | #config ACCESSBUS |
2007 | # bool "Access.Bus support" | 2012 | # bool "Access.Bus support" |
2008 | # depends on TC | 2013 | # depends on TC |
2009 | 2014 | ||
2010 | config MMU | 2015 | config MMU |
2011 | bool | 2016 | bool |
2012 | default y | 2017 | default y |
2013 | 2018 | ||
2014 | config I8253 | 2019 | config I8253 |
2015 | bool | 2020 | bool |
2016 | 2021 | ||
2017 | config ZONE_DMA32 | 2022 | config ZONE_DMA32 |
2018 | bool | 2023 | bool |
2019 | 2024 | ||
2020 | source "drivers/pcmcia/Kconfig" | 2025 | source "drivers/pcmcia/Kconfig" |
2021 | 2026 | ||
2022 | source "drivers/pci/hotplug/Kconfig" | 2027 | source "drivers/pci/hotplug/Kconfig" |
2023 | 2028 | ||
2024 | endmenu | 2029 | endmenu |
2025 | 2030 | ||
2026 | menu "Executable file formats" | 2031 | menu "Executable file formats" |
2027 | 2032 | ||
2028 | source "fs/Kconfig.binfmt" | 2033 | source "fs/Kconfig.binfmt" |
2029 | 2034 | ||
2030 | config TRAD_SIGNALS | 2035 | config TRAD_SIGNALS |
2031 | bool | 2036 | bool |
2032 | 2037 | ||
2033 | config BINFMT_IRIX | 2038 | config BINFMT_IRIX |
2034 | bool "Include IRIX binary compatibility" | 2039 | bool "Include IRIX binary compatibility" |
2035 | depends on CPU_BIG_ENDIAN && 32BIT && BROKEN | 2040 | depends on CPU_BIG_ENDIAN && 32BIT && BROKEN |
2036 | 2041 | ||
2037 | config MIPS32_COMPAT | 2042 | config MIPS32_COMPAT |
2038 | bool "Kernel support for Linux/MIPS 32-bit binary compatibility" | 2043 | bool "Kernel support for Linux/MIPS 32-bit binary compatibility" |
2039 | depends on 64BIT | 2044 | depends on 64BIT |
2040 | help | 2045 | help |
2041 | Select this option if you want Linux/MIPS 32-bit binary | 2046 | Select this option if you want Linux/MIPS 32-bit binary |
2042 | compatibility. Since all software available for Linux/MIPS is | 2047 | compatibility. Since all software available for Linux/MIPS is |
2043 | currently 32-bit, you should say Y here. | 2048 | currently 32-bit, you should say Y here. |
2044 | 2049 | ||
2045 | config COMPAT | 2050 | config COMPAT |
2046 | bool | 2051 | bool |
2047 | depends on MIPS32_COMPAT | 2052 | depends on MIPS32_COMPAT |
2048 | default y | 2053 | default y |
2049 | 2054 | ||
2050 | config SYSVIPC_COMPAT | 2055 | config SYSVIPC_COMPAT |
2051 | bool | 2056 | bool |
2052 | depends on COMPAT && SYSVIPC | 2057 | depends on COMPAT && SYSVIPC |
2053 | default y | 2058 | default y |
2054 | 2059 | ||
2055 | config MIPS32_O32 | 2060 | config MIPS32_O32 |
2056 | bool "Kernel support for o32 binaries" | 2061 | bool "Kernel support for o32 binaries" |
2057 | depends on MIPS32_COMPAT | 2062 | depends on MIPS32_COMPAT |
2058 | help | 2063 | help |
2059 | Select this option if you want to run o32 binaries. These are pure | 2064 | Select this option if you want to run o32 binaries. These are pure |
2060 | 32-bit binaries as used by the 32-bit Linux/MIPS port. Most of the | 2065 | 32-bit binaries as used by the 32-bit Linux/MIPS port. Most of the |
2061 | existing binaries are in this format. | 2066 | existing binaries are in this format. |
2062 | 2067 | ||
2063 | If unsure, say Y. | 2068 | If unsure, say Y. |
2064 | 2069 | ||
2065 | config MIPS32_N32 | 2070 | config MIPS32_N32 |
2066 | bool "Kernel support for n32 binaries" | 2071 | bool "Kernel support for n32 binaries" |
2067 | depends on MIPS32_COMPAT | 2072 | depends on MIPS32_COMPAT |
2068 | help | 2073 | help |
2069 | Select this option if you want to run n32 binaries. These are | 2074 | Select this option if you want to run n32 binaries. These are |
2070 | 64-bit binaries using 32-bit quantities for addressing and certain | 2075 | 64-bit binaries using 32-bit quantities for addressing and certain |
2071 | data that would normally be 64-bit. They are used in special | 2076 | data that would normally be 64-bit. They are used in special |
2072 | cases. | 2077 | cases. |
2073 | 2078 | ||
2074 | If unsure, say N. | 2079 | If unsure, say N. |
2075 | 2080 | ||
2076 | config BINFMT_ELF32 | 2081 | config BINFMT_ELF32 |
2077 | bool | 2082 | bool |
2078 | default y if MIPS32_O32 || MIPS32_N32 | 2083 | default y if MIPS32_O32 || MIPS32_N32 |
2079 | 2084 | ||
2080 | endmenu | 2085 | endmenu |
2081 | 2086 | ||
2082 | menu "Power management options" | 2087 | menu "Power management options" |
2083 | 2088 | ||
2084 | source "kernel/power/Kconfig" | 2089 | source "kernel/power/Kconfig" |
2085 | 2090 | ||
2086 | endmenu | 2091 | endmenu |
2087 | 2092 | ||
2088 | source "net/Kconfig" | 2093 | source "net/Kconfig" |
2089 | 2094 | ||
2090 | source "drivers/Kconfig" | 2095 | source "drivers/Kconfig" |
2091 | 2096 | ||
2092 | source "fs/Kconfig" | 2097 | source "fs/Kconfig" |
2093 | 2098 | ||
2094 | source "kernel/Kconfig.instrumentation" | 2099 | source "kernel/Kconfig.instrumentation" |
2095 | 2100 | ||
2096 | source "arch/mips/Kconfig.debug" | 2101 | source "arch/mips/Kconfig.debug" |
2097 | 2102 | ||
2098 | source "security/Kconfig" | 2103 | source "security/Kconfig" |
2099 | 2104 | ||
2100 | source "crypto/Kconfig" | 2105 | source "crypto/Kconfig" |
2101 | 2106 | ||
2102 | source "lib/Kconfig" | 2107 | source "lib/Kconfig" |
2103 | 2108 |
arch/parisc/Kconfig
1 | # | 1 | # |
2 | # For a description of the syntax of this configuration file, | 2 | # For a description of the syntax of this configuration file, |
3 | # see Documentation/kbuild/kconfig-language.txt. | 3 | # see Documentation/kbuild/kconfig-language.txt. |
4 | # | 4 | # |
5 | 5 | ||
6 | mainmenu "Linux/PA-RISC Kernel Configuration" | 6 | mainmenu "Linux/PA-RISC Kernel Configuration" |
7 | 7 | ||
8 | config PARISC | 8 | config PARISC |
9 | def_bool y | 9 | def_bool y |
10 | help | 10 | help |
11 | The PA-RISC microprocessor is designed by Hewlett-Packard and used | 11 | The PA-RISC microprocessor is designed by Hewlett-Packard and used |
12 | in many of their workstations & servers (HP9000 700 and 800 series, | 12 | in many of their workstations & servers (HP9000 700 and 800 series, |
13 | and later HP3000 series). The PA-RISC Linux project home page is | 13 | and later HP3000 series). The PA-RISC Linux project home page is |
14 | at <http://www.parisc-linux.org/>. | 14 | at <http://www.parisc-linux.org/>. |
15 | 15 | ||
16 | config MMU | 16 | config MMU |
17 | def_bool y | 17 | def_bool y |
18 | 18 | ||
19 | config STACK_GROWSUP | 19 | config STACK_GROWSUP |
20 | def_bool y | 20 | def_bool y |
21 | 21 | ||
22 | config GENERIC_LOCKBREAK | ||
23 | bool | ||
24 | default y | ||
25 | depends on SMP && PREEMPT | ||
26 | |||
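The hunk above is the per-architecture side of the change: parisc now defines GENERIC_LOCKBREAK, turned on only for SMP && PREEMPT builds. As a rough sketch of how generic header code can key off such a Kconfig symbol (a hedged reconstruction, not a quote of the patched files; the break_lock field and the __raw_spin_is_contended() helper are assumptions here), it might look like:

    #ifdef CONFIG_GENERIC_LOCKBREAK
    /* The lock carries an extra break_lock word that waiters set while spinning. */
    #define spin_is_contended(lock)  ((lock)->break_lock)
    #else
    /* No extra word: ask the architecture to inspect the lock value itself. */
    #define spin_is_contended(lock)  __raw_spin_is_contended(&(lock)->raw_lock)
    #endif

Either way callers see only spin_is_contended(), so per-architecture Kconfig entries like the one above are the only place the choice between the two variants shows up.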
22 | config RWSEM_GENERIC_SPINLOCK | 27 | config RWSEM_GENERIC_SPINLOCK |
23 | def_bool y | 28 | def_bool y |
24 | 29 | ||
25 | config RWSEM_XCHGADD_ALGORITHM | 30 | config RWSEM_XCHGADD_ALGORITHM |
26 | bool | 31 | bool |
27 | 32 | ||
28 | config ARCH_HAS_ILOG2_U32 | 33 | config ARCH_HAS_ILOG2_U32 |
29 | bool | 34 | bool |
30 | default n | 35 | default n |
31 | 36 | ||
32 | config ARCH_HAS_ILOG2_U64 | 37 | config ARCH_HAS_ILOG2_U64 |
33 | bool | 38 | bool |
34 | default n | 39 | default n |
35 | 40 | ||
36 | config GENERIC_FIND_NEXT_BIT | 41 | config GENERIC_FIND_NEXT_BIT |
37 | bool | 42 | bool |
38 | default y | 43 | default y |
39 | 44 | ||
40 | config GENERIC_BUG | 45 | config GENERIC_BUG |
41 | bool | 46 | bool |
42 | default y | 47 | default y |
43 | depends on BUG | 48 | depends on BUG |
44 | 49 | ||
45 | config GENERIC_HWEIGHT | 50 | config GENERIC_HWEIGHT |
46 | bool | 51 | bool |
47 | default y | 52 | default y |
48 | 53 | ||
49 | config GENERIC_CALIBRATE_DELAY | 54 | config GENERIC_CALIBRATE_DELAY |
50 | bool | 55 | bool |
51 | default y | 56 | default y |
52 | 57 | ||
53 | config GENERIC_TIME | 58 | config GENERIC_TIME |
54 | bool | 59 | bool |
55 | default y | 60 | default y |
56 | 61 | ||
57 | config TIME_LOW_RES | 62 | config TIME_LOW_RES |
58 | bool | 63 | bool |
59 | depends on SMP | 64 | depends on SMP |
60 | default y | 65 | default y |
61 | 66 | ||
62 | config GENERIC_HARDIRQS | 67 | config GENERIC_HARDIRQS |
63 | def_bool y | 68 | def_bool y |
64 | 69 | ||
65 | config GENERIC_IRQ_PROBE | 70 | config GENERIC_IRQ_PROBE |
66 | def_bool y | 71 | def_bool y |
67 | 72 | ||
68 | config IRQ_PER_CPU | 73 | config IRQ_PER_CPU |
69 | bool | 74 | bool |
70 | default y | 75 | default y |
71 | 76 | ||
72 | # unless you want to implement ACPI on PA-RISC ... ;-) | 77 | # unless you want to implement ACPI on PA-RISC ... ;-) |
73 | config PM | 78 | config PM |
74 | bool | 79 | bool |
75 | 80 | ||
76 | config ISA_DMA_API | 81 | config ISA_DMA_API |
77 | bool | 82 | bool |
78 | 83 | ||
79 | config ARCH_MAY_HAVE_PC_FDC | 84 | config ARCH_MAY_HAVE_PC_FDC |
80 | bool | 85 | bool |
81 | depends on BROKEN | 86 | depends on BROKEN |
82 | default y | 87 | default y |
83 | 88 | ||
84 | source "init/Kconfig" | 89 | source "init/Kconfig" |
85 | 90 | ||
86 | 91 | ||
87 | menu "Processor type and features" | 92 | menu "Processor type and features" |
88 | 93 | ||
89 | choice | 94 | choice |
90 | prompt "Processor type" | 95 | prompt "Processor type" |
91 | default PA7000 | 96 | default PA7000 |
92 | 97 | ||
93 | config PA7000 | 98 | config PA7000 |
94 | bool "PA7000/PA7100" | 99 | bool "PA7000/PA7100" |
95 | ---help--- | 100 | ---help--- |
96 | This is the processor type of your CPU. This information is | 101 | This is the processor type of your CPU. This information is |
97 | used for optimizing purposes. In order to compile a kernel | 102 | used for optimizing purposes. In order to compile a kernel |
98 | that can run on all 32-bit PA CPUs (albeit not optimally fast), | 103 | that can run on all 32-bit PA CPUs (albeit not optimally fast), |
99 | you can specify "PA7000" here. | 104 | you can specify "PA7000" here. |
100 | 105 | ||
101 | Specifying "PA8000" here will allow you to select a 64-bit kernel | 106 | Specifying "PA8000" here will allow you to select a 64-bit kernel |
102 | which is required on some machines. | 107 | which is required on some machines. |
103 | 108 | ||
104 | config PA7100LC | 109 | config PA7100LC |
105 | bool "PA7100LC" | 110 | bool "PA7100LC" |
106 | help | 111 | help |
107 | Select this option for the PCX-L processor, as used in the | 112 | Select this option for the PCX-L processor, as used in the |
108 | 712, 715/64, 715/80, 715/100, 715/100XC, 725/100, 743, 748, | 113 | 712, 715/64, 715/80, 715/100, 715/100XC, 725/100, 743, 748, |
109 | D200, D210, D300, D310 and E-class | 114 | D200, D210, D300, D310 and E-class |
110 | 115 | ||
111 | config PA7200 | 116 | config PA7200 |
112 | bool "PA7200" | 117 | bool "PA7200" |
113 | help | 118 | help |
114 | Select this option for the PCX-T' processor, as used in the | 119 | Select this option for the PCX-T' processor, as used in the |
115 | C100, C110, J100, J110, J210XC, D250, D260, D350, D360, | 120 | C100, C110, J100, J110, J210XC, D250, D260, D350, D360, |
116 | K100, K200, K210, K220, K400, K410 and K420 | 121 | K100, K200, K210, K220, K400, K410 and K420 |
117 | 122 | ||
118 | config PA7300LC | 123 | config PA7300LC |
119 | bool "PA7300LC" | 124 | bool "PA7300LC" |
120 | help | 125 | help |
121 | Select this option for the PCX-L2 processor, as used in the | 126 | Select this option for the PCX-L2 processor, as used in the |
122 | 744, A180, B132L, B160L, B180L, C132L, C160L, C180L, | 127 | 744, A180, B132L, B160L, B180L, C132L, C160L, C180L, |
123 | D220, D230, D320 and D330. | 128 | D220, D230, D320 and D330. |
124 | 129 | ||
125 | config PA8X00 | 130 | config PA8X00 |
126 | bool "PA8000 and up" | 131 | bool "PA8000 and up" |
127 | help | 132 | help |
128 | Select this option for PCX-U to PCX-W2 processors. | 133 | Select this option for PCX-U to PCX-W2 processors. |
129 | 134 | ||
130 | endchoice | 135 | endchoice |
131 | 136 | ||
132 | # Define implied options from the CPU selection here | 137 | # Define implied options from the CPU selection here |
133 | 138 | ||
134 | config PA20 | 139 | config PA20 |
135 | def_bool y | 140 | def_bool y |
136 | depends on PA8X00 | 141 | depends on PA8X00 |
137 | 142 | ||
138 | config PA11 | 143 | config PA11 |
139 | def_bool y | 144 | def_bool y |
140 | depends on PA7000 || PA7100LC || PA7200 || PA7300LC | 145 | depends on PA7000 || PA7100LC || PA7200 || PA7300LC |
141 | 146 | ||
142 | config PREFETCH | 147 | config PREFETCH |
143 | def_bool y | 148 | def_bool y |
144 | depends on PA8X00 || PA7200 | 149 | depends on PA8X00 || PA7200 |
145 | 150 | ||
146 | config 64BIT | 151 | config 64BIT |
147 | bool "64-bit kernel" | 152 | bool "64-bit kernel" |
148 | depends on PA8X00 | 153 | depends on PA8X00 |
149 | help | 154 | help |
150 | Enable this if you want to support a 64bit kernel on the PA-RISC platform. | 155 | Enable this if you want to support a 64bit kernel on the PA-RISC platform. |
151 | 156 | ||
152 | At the moment, only people willing to use more than 2GB of RAM, | 157 | At the moment, only people willing to use more than 2GB of RAM, |
153 | or having a 64bit-only capable PA-RISC machine should say Y here. | 158 | or having a 64bit-only capable PA-RISC machine should say Y here. |
154 | 159 | ||
155 | Since there is no 64bit userland on PA-RISC, there is no point in | 160 | Since there is no 64bit userland on PA-RISC, there is no point in |
156 | enabling this option otherwise. The 64bit kernel is significantly bigger | 161 | enabling this option otherwise. The 64bit kernel is significantly bigger |
157 | and slower than the 32bit one. | 162 | and slower than the 32bit one. |
158 | 163 | ||
159 | choice | 164 | choice |
160 | prompt "Kernel page size" | 165 | prompt "Kernel page size" |
161 | default PARISC_PAGE_SIZE_4KB if !64BIT | 166 | default PARISC_PAGE_SIZE_4KB if !64BIT |
162 | default PARISC_PAGE_SIZE_4KB if 64BIT | 167 | default PARISC_PAGE_SIZE_4KB if 64BIT |
163 | # default PARISC_PAGE_SIZE_16KB if 64BIT | 168 | # default PARISC_PAGE_SIZE_16KB if 64BIT |
164 | 169 | ||
165 | config PARISC_PAGE_SIZE_4KB | 170 | config PARISC_PAGE_SIZE_4KB |
166 | bool "4KB" | 171 | bool "4KB" |
167 | help | 172 | help |
168 | This lets you select the page size of the kernel. For best | 173 | This lets you select the page size of the kernel. For best |
169 | performance, a page size of 16KB is recommended. For best | 174 | performance, a page size of 16KB is recommended. For best |
170 | compatibility with 32bit applications, a page size of 4KB should be | 175 | compatibility with 32bit applications, a page size of 4KB should be |
171 | selected (the vast majority of 32bit binaries work perfectly fine | 176 | selected (the vast majority of 32bit binaries work perfectly fine |
172 | with a larger page size). | 177 | with a larger page size). |
173 | 178 | ||
174 | 4KB For best 32bit compatibility | 179 | 4KB For best 32bit compatibility |
175 | 16KB For best performance | 180 | 16KB For best performance |
176 | 64KB For best performance, but may incur more overhead. | 181 | 64KB For best performance, but may incur more overhead. |
177 | 182 | ||
178 | If you don't know what to do, choose 4KB. | 183 | If you don't know what to do, choose 4KB. |
179 | 184 | ||
180 | config PARISC_PAGE_SIZE_16KB | 185 | config PARISC_PAGE_SIZE_16KB |
181 | bool "16KB (EXPERIMENTAL)" | 186 | bool "16KB (EXPERIMENTAL)" |
182 | depends on PA8X00 && EXPERIMENTAL | 187 | depends on PA8X00 && EXPERIMENTAL |
183 | 188 | ||
184 | config PARISC_PAGE_SIZE_64KB | 189 | config PARISC_PAGE_SIZE_64KB |
185 | bool "64KB (EXPERIMENTAL)" | 190 | bool "64KB (EXPERIMENTAL)" |
186 | depends on PA8X00 && EXPERIMENTAL | 191 | depends on PA8X00 && EXPERIMENTAL |
187 | 192 | ||
188 | endchoice | 193 | endchoice |
189 | 194 | ||
190 | config SMP | 195 | config SMP |
191 | bool "Symmetric multi-processing support" | 196 | bool "Symmetric multi-processing support" |
192 | ---help--- | 197 | ---help--- |
193 | This enables support for systems with more than one CPU. If you have | 198 | This enables support for systems with more than one CPU. If you have |
194 | a system with only one CPU, like most personal computers, say N. If | 199 | a system with only one CPU, like most personal computers, say N. If |
195 | you have a system with more than one CPU, say Y. | 200 | you have a system with more than one CPU, say Y. |
196 | 201 | ||
197 | If you say N here, the kernel will run on single and multiprocessor | 202 | If you say N here, the kernel will run on single and multiprocessor |
198 | machines, but will use only one CPU of a multiprocessor machine. If | 203 | machines, but will use only one CPU of a multiprocessor machine. If |
199 | you say Y here, the kernel will run on many, but not all, | 204 | you say Y here, the kernel will run on many, but not all, |
200 | singleprocessor machines. On a singleprocessor machine, the kernel | 205 | singleprocessor machines. On a singleprocessor machine, the kernel |
201 | will run faster if you say N here. | 206 | will run faster if you say N here. |
202 | 207 | ||
203 | See also the <file:Documentation/smp.txt>, | 208 | See also the <file:Documentation/smp.txt>, |
204 | <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available | 209 | <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available |
205 | at <http://www.tldp.org/docs.html#howto>. | 210 | at <http://www.tldp.org/docs.html#howto>. |
206 | 211 | ||
207 | If you don't know what to do here, say N. | 212 | If you don't know what to do here, say N. |
208 | 213 | ||
209 | config HOTPLUG_CPU | 214 | config HOTPLUG_CPU |
210 | bool | 215 | bool |
211 | default y if SMP | 216 | default y if SMP |
212 | select HOTPLUG | 217 | select HOTPLUG |
213 | 218 | ||
214 | config ARCH_SELECT_MEMORY_MODEL | 219 | config ARCH_SELECT_MEMORY_MODEL |
215 | def_bool y | 220 | def_bool y |
216 | depends on 64BIT | 221 | depends on 64BIT |
217 | 222 | ||
218 | config ARCH_DISCONTIGMEM_ENABLE | 223 | config ARCH_DISCONTIGMEM_ENABLE |
219 | def_bool y | 224 | def_bool y |
220 | depends on 64BIT | 225 | depends on 64BIT |
221 | 226 | ||
222 | config ARCH_FLATMEM_ENABLE | 227 | config ARCH_FLATMEM_ENABLE |
223 | def_bool y | 228 | def_bool y |
224 | 229 | ||
225 | config ARCH_DISCONTIGMEM_DEFAULT | 230 | config ARCH_DISCONTIGMEM_DEFAULT |
226 | def_bool y | 231 | def_bool y |
227 | depends on ARCH_DISCONTIGMEM_ENABLE | 232 | depends on ARCH_DISCONTIGMEM_ENABLE |
228 | 233 | ||
229 | config NODES_SHIFT | 234 | config NODES_SHIFT |
230 | int | 235 | int |
231 | default "3" | 236 | default "3" |
232 | depends on NEED_MULTIPLE_NODES | 237 | depends on NEED_MULTIPLE_NODES |
233 | 238 | ||
234 | source "kernel/Kconfig.preempt" | 239 | source "kernel/Kconfig.preempt" |
235 | source "kernel/Kconfig.hz" | 240 | source "kernel/Kconfig.hz" |
236 | source "mm/Kconfig" | 241 | source "mm/Kconfig" |
237 | 242 | ||
238 | config COMPAT | 243 | config COMPAT |
239 | def_bool y | 244 | def_bool y |
240 | depends on 64BIT | 245 | depends on 64BIT |
241 | 246 | ||
242 | config HPUX | 247 | config HPUX |
243 | bool "Support for HP-UX binaries" | 248 | bool "Support for HP-UX binaries" |
244 | depends on !64BIT | 249 | depends on !64BIT |
245 | 250 | ||
246 | config NR_CPUS | 251 | config NR_CPUS |
247 | int "Maximum number of CPUs (2-32)" | 252 | int "Maximum number of CPUs (2-32)" |
248 | range 2 32 | 253 | range 2 32 |
249 | depends on SMP | 254 | depends on SMP |
250 | default "32" | 255 | default "32" |
251 | 256 | ||
252 | endmenu | 257 | endmenu |
253 | 258 | ||
254 | 259 | ||
255 | source "drivers/parisc/Kconfig" | 260 | source "drivers/parisc/Kconfig" |
256 | 261 | ||
257 | 262 | ||
258 | menu "Executable file formats" | 263 | menu "Executable file formats" |
259 | 264 | ||
260 | source "fs/Kconfig.binfmt" | 265 | source "fs/Kconfig.binfmt" |
261 | 266 | ||
262 | endmenu | 267 | endmenu |
263 | 268 | ||
264 | source "net/Kconfig" | 269 | source "net/Kconfig" |
265 | 270 | ||
266 | source "drivers/Kconfig" | 271 | source "drivers/Kconfig" |
267 | 272 | ||
268 | source "fs/Kconfig" | 273 | source "fs/Kconfig" |
269 | 274 | ||
270 | source "kernel/Kconfig.instrumentation" | 275 | source "kernel/Kconfig.instrumentation" |
271 | 276 | ||
272 | source "arch/parisc/Kconfig.debug" | 277 | source "arch/parisc/Kconfig.debug" |
273 | 278 | ||
274 | source "security/Kconfig" | 279 | source "security/Kconfig" |
275 | 280 | ||
276 | source "crypto/Kconfig" | 281 | source "crypto/Kconfig" |
277 | 282 | ||
278 | source "lib/Kconfig" | 283 | source "lib/Kconfig" |
279 | 284 |
arch/powerpc/Kconfig
1 | # For a description of the syntax of this configuration file, | 1 | # For a description of the syntax of this configuration file, |
2 | # see Documentation/kbuild/kconfig-language.txt. | 2 | # see Documentation/kbuild/kconfig-language.txt. |
3 | # | 3 | # |
4 | 4 | ||
5 | mainmenu "Linux/PowerPC Kernel Configuration" | 5 | mainmenu "Linux/PowerPC Kernel Configuration" |
6 | 6 | ||
7 | source "arch/powerpc/platforms/Kconfig.cputype" | 7 | source "arch/powerpc/platforms/Kconfig.cputype" |
8 | 8 | ||
9 | config PPC32 | 9 | config PPC32 |
10 | bool | 10 | bool |
11 | default y if !PPC64 | 11 | default y if !PPC64 |
12 | 12 | ||
13 | config 64BIT | 13 | config 64BIT |
14 | bool | 14 | bool |
15 | default y if PPC64 | 15 | default y if PPC64 |
16 | 16 | ||
17 | config WORD_SIZE | 17 | config WORD_SIZE |
18 | int | 18 | int |
19 | default 64 if PPC64 | 19 | default 64 if PPC64 |
20 | default 32 if !PPC64 | 20 | default 32 if !PPC64 |
21 | 21 | ||
22 | config PPC_MERGE | 22 | config PPC_MERGE |
23 | def_bool y | 23 | def_bool y |
24 | 24 | ||
25 | config MMU | 25 | config MMU |
26 | bool | 26 | bool |
27 | default y | 27 | default y |
28 | 28 | ||
29 | config GENERIC_CMOS_UPDATE | 29 | config GENERIC_CMOS_UPDATE |
30 | def_bool y | 30 | def_bool y |
31 | 31 | ||
32 | config GENERIC_TIME | 32 | config GENERIC_TIME |
33 | def_bool y | 33 | def_bool y |
34 | 34 | ||
35 | config GENERIC_TIME_VSYSCALL | 35 | config GENERIC_TIME_VSYSCALL |
36 | def_bool y | 36 | def_bool y |
37 | 37 | ||
38 | config GENERIC_CLOCKEVENTS | 38 | config GENERIC_CLOCKEVENTS |
39 | def_bool y | 39 | def_bool y |
40 | 40 | ||
41 | config GENERIC_HARDIRQS | 41 | config GENERIC_HARDIRQS |
42 | bool | 42 | bool |
43 | default y | 43 | default y |
44 | 44 | ||
45 | config IRQ_PER_CPU | 45 | config IRQ_PER_CPU |
46 | bool | 46 | bool |
47 | default y | 47 | default y |
48 | 48 | ||
49 | config RWSEM_GENERIC_SPINLOCK | 49 | config RWSEM_GENERIC_SPINLOCK |
50 | bool | 50 | bool |
51 | 51 | ||
52 | config RWSEM_XCHGADD_ALGORITHM | 52 | config RWSEM_XCHGADD_ALGORITHM |
53 | bool | 53 | bool |
54 | default y | 54 | default y |
55 | 55 | ||
56 | config GENERIC_LOCKBREAK | ||
57 | bool | ||
58 | default y | ||
59 | depends on SMP && PREEMPT | ||
60 | |||
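The block added above enables GENERIC_LOCKBREAK only for SMP preemptible kernels. As a minimal sketch of what that buys (illustrative only, not taken from this diff; the struct and field names here are placeholders), generic lock code can keep its extra lock-break state behind the resulting CONFIG_GENERIC_LOCKBREAK symbol, so the field compiles away on UP or non-preemptible builds:

	/*
	 * Illustrative sketch only, not part of this commit.  example_lock and
	 * break_requested are made-up names.  With the Kconfig entry above,
	 * CONFIG_GENERIC_LOCKBREAK is defined only when both CONFIG_SMP and
	 * CONFIG_PREEMPT are set.
	 */
	struct example_lock {
		volatile unsigned int slock;	/* the lock word itself */
	#ifdef CONFIG_GENERIC_LOCKBREAK
		unsigned int break_requested;	/* nonzero while another CPU is waiting */
	#endif
	};
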
56 | config ARCH_HAS_ILOG2_U32 | 61 | config ARCH_HAS_ILOG2_U32 |
57 | bool | 62 | bool |
58 | default y | 63 | default y |
59 | 64 | ||
60 | config ARCH_HAS_ILOG2_U64 | 65 | config ARCH_HAS_ILOG2_U64 |
61 | bool | 66 | bool |
62 | default y if 64BIT | 67 | default y if 64BIT |
63 | 68 | ||
64 | config GENERIC_HWEIGHT | 69 | config GENERIC_HWEIGHT |
65 | bool | 70 | bool |
66 | default y | 71 | default y |
67 | 72 | ||
68 | config GENERIC_CALIBRATE_DELAY | 73 | config GENERIC_CALIBRATE_DELAY |
69 | bool | 74 | bool |
70 | default y | 75 | default y |
71 | 76 | ||
72 | config GENERIC_FIND_NEXT_BIT | 77 | config GENERIC_FIND_NEXT_BIT |
73 | bool | 78 | bool |
74 | default y | 79 | default y |
75 | 80 | ||
76 | config ARCH_NO_VIRT_TO_BUS | 81 | config ARCH_NO_VIRT_TO_BUS |
77 | def_bool PPC64 | 82 | def_bool PPC64 |
78 | 83 | ||
79 | config PPC | 84 | config PPC |
80 | bool | 85 | bool |
81 | default y | 86 | default y |
82 | 87 | ||
83 | config EARLY_PRINTK | 88 | config EARLY_PRINTK |
84 | bool | 89 | bool |
85 | default y | 90 | default y |
86 | 91 | ||
87 | config COMPAT | 92 | config COMPAT |
88 | bool | 93 | bool |
89 | default y if PPC64 | 94 | default y if PPC64 |
90 | 95 | ||
91 | config SYSVIPC_COMPAT | 96 | config SYSVIPC_COMPAT |
92 | bool | 97 | bool |
93 | depends on COMPAT && SYSVIPC | 98 | depends on COMPAT && SYSVIPC |
94 | default y | 99 | default y |
95 | 100 | ||
96 | # All PPC32s use generic nvram driver through ppc_md | 101 | # All PPC32s use generic nvram driver through ppc_md |
97 | config GENERIC_NVRAM | 102 | config GENERIC_NVRAM |
98 | bool | 103 | bool |
99 | default y if PPC32 | 104 | default y if PPC32 |
100 | 105 | ||
101 | config SCHED_NO_NO_OMIT_FRAME_POINTER | 106 | config SCHED_NO_NO_OMIT_FRAME_POINTER |
102 | bool | 107 | bool |
103 | default y | 108 | default y |
104 | 109 | ||
105 | config ARCH_MAY_HAVE_PC_FDC | 110 | config ARCH_MAY_HAVE_PC_FDC |
106 | bool | 111 | bool |
107 | default !PPC_PSERIES || PCI | 112 | default !PPC_PSERIES || PCI |
108 | 113 | ||
109 | config PPC_OF | 114 | config PPC_OF |
110 | def_bool y | 115 | def_bool y |
111 | 116 | ||
112 | config OF | 117 | config OF |
113 | def_bool y | 118 | def_bool y |
114 | 119 | ||
115 | config PPC_UDBG_16550 | 120 | config PPC_UDBG_16550 |
116 | bool | 121 | bool |
117 | default n | 122 | default n |
118 | 123 | ||
119 | config GENERIC_TBSYNC | 124 | config GENERIC_TBSYNC |
120 | bool | 125 | bool |
121 | default y if PPC32 && SMP | 126 | default y if PPC32 && SMP |
122 | default n | 127 | default n |
123 | 128 | ||
124 | config AUDIT_ARCH | 129 | config AUDIT_ARCH |
125 | bool | 130 | bool |
126 | default y | 131 | default y |
127 | 132 | ||
128 | config GENERIC_BUG | 133 | config GENERIC_BUG |
129 | bool | 134 | bool |
130 | default y | 135 | default y |
131 | depends on BUG | 136 | depends on BUG |
132 | 137 | ||
133 | config SYS_SUPPORTS_APM_EMULATION | 138 | config SYS_SUPPORTS_APM_EMULATION |
134 | default y if PMAC_APM_EMU | 139 | default y if PMAC_APM_EMU |
135 | bool | 140 | bool |
136 | 141 | ||
137 | config DEFAULT_UIMAGE | 142 | config DEFAULT_UIMAGE |
138 | bool | 143 | bool |
139 | help | 144 | help |
140 | Used to allow a board to specify it wants a uImage built by default | 145 | Used to allow a board to specify it wants a uImage built by default |
141 | default n | 146 | default n |
142 | 147 | ||
143 | config PPC64_SWSUSP | 148 | config PPC64_SWSUSP |
144 | bool | 149 | bool |
145 | depends on PPC64 && (BROKEN || (PPC_PMAC64 && EXPERIMENTAL)) | 150 | depends on PPC64 && (BROKEN || (PPC_PMAC64 && EXPERIMENTAL)) |
146 | default y | 151 | default y |
147 | 152 | ||
148 | config PPC_DCR_NATIVE | 153 | config PPC_DCR_NATIVE |
149 | bool | 154 | bool |
150 | default n | 155 | default n |
151 | 156 | ||
152 | config PPC_DCR_MMIO | 157 | config PPC_DCR_MMIO |
153 | bool | 158 | bool |
154 | default n | 159 | default n |
155 | 160 | ||
156 | config PPC_DCR | 161 | config PPC_DCR |
157 | bool | 162 | bool |
158 | depends on PPC_DCR_NATIVE || PPC_DCR_MMIO | 163 | depends on PPC_DCR_NATIVE || PPC_DCR_MMIO |
159 | default y | 164 | default y |
160 | 165 | ||
161 | config PPC_OF_PLATFORM_PCI | 166 | config PPC_OF_PLATFORM_PCI |
162 | bool | 167 | bool |
163 | depends on PPC64 # not supported on 32 bits yet | 168 | depends on PPC64 # not supported on 32 bits yet |
164 | default n | 169 | default n |
165 | 170 | ||
166 | source "init/Kconfig" | 171 | source "init/Kconfig" |
167 | 172 | ||
168 | source "arch/powerpc/platforms/Kconfig" | 173 | source "arch/powerpc/platforms/Kconfig" |
169 | 174 | ||
170 | menu "Kernel options" | 175 | menu "Kernel options" |
171 | 176 | ||
172 | config HIGHMEM | 177 | config HIGHMEM |
173 | bool "High memory support" | 178 | bool "High memory support" |
174 | depends on PPC32 | 179 | depends on PPC32 |
175 | 180 | ||
176 | source kernel/time/Kconfig | 181 | source kernel/time/Kconfig |
177 | source kernel/Kconfig.hz | 182 | source kernel/Kconfig.hz |
178 | source kernel/Kconfig.preempt | 183 | source kernel/Kconfig.preempt |
179 | source "fs/Kconfig.binfmt" | 184 | source "fs/Kconfig.binfmt" |
180 | 185 | ||
181 | # We optimistically allocate largepages from the VM, so make the limit | 186 | # We optimistically allocate largepages from the VM, so make the limit |
182 | # large enough (16MB). This badly named config option is actually | 187 | # large enough (16MB). This badly named config option is actually |
183 | # max order + 1 | 188 | # max order + 1 |
184 | config FORCE_MAX_ZONEORDER | 189 | config FORCE_MAX_ZONEORDER |
185 | int | 190 | int |
186 | depends on PPC64 | 191 | depends on PPC64 |
187 | default "9" if PPC_64K_PAGES | 192 | default "9" if PPC_64K_PAGES |
188 | default "13" | 193 | default "13" |
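A quick sanity check of the two defaults above (an illustrative userspace snippet, not part of this diff): since the option is the maximum order plus one, the largest buddy allocation is 2^(value - 1) pages, and both defaults land on the same 16MB ceiling mentioned in the comment.

	#include <assert.h>

	int main(void)
	{
		/* largest allocation = 2^(FORCE_MAX_ZONEORDER - 1) pages */
		assert((1UL << (9  - 1)) * 64 * 1024 == 16UL * 1024 * 1024);	/* 64K pages */
		assert((1UL << (13 - 1)) *  4 * 1024 == 16UL * 1024 * 1024);	/* 4K pages */
		return 0;
	}
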
189 | 194 | ||
190 | config HUGETLB_PAGE_SIZE_VARIABLE | 195 | config HUGETLB_PAGE_SIZE_VARIABLE |
191 | bool | 196 | bool |
192 | depends on HUGETLB_PAGE | 197 | depends on HUGETLB_PAGE |
193 | default y | 198 | default y |
194 | 199 | ||
195 | config MATH_EMULATION | 200 | config MATH_EMULATION |
196 | bool "Math emulation" | 201 | bool "Math emulation" |
197 | depends on 4xx || 8xx || E200 || PPC_MPC832x || E500 | 202 | depends on 4xx || 8xx || E200 || PPC_MPC832x || E500 |
198 | ---help--- | 203 | ---help--- |
199 | Some PowerPC chips designed for embedded applications do not have | 204 | Some PowerPC chips designed for embedded applications do not have |
200 | a floating-point unit and therefore do not implement the | 205 | a floating-point unit and therefore do not implement the |
201 | floating-point instructions in the PowerPC instruction set. If you | 206 | floating-point instructions in the PowerPC instruction set. If you |
202 | say Y here, the kernel will include code to emulate a floating-point | 207 | say Y here, the kernel will include code to emulate a floating-point |
203 | unit, which will allow programs that use floating-point | 208 | unit, which will allow programs that use floating-point |
204 | instructions to run. | 209 | instructions to run. |
205 | 210 | ||
206 | config 8XX_MINIMAL_FPEMU | 211 | config 8XX_MINIMAL_FPEMU |
207 | bool "Minimal math emulation for 8xx" | 212 | bool "Minimal math emulation for 8xx" |
208 | depends on 8xx && !MATH_EMULATION | 213 | depends on 8xx && !MATH_EMULATION |
209 | help | 214 | help |
210 | Older arch/ppc kernels still emulated a few floating point | 215 | Older arch/ppc kernels still emulated a few floating point |
211 | instructions such as load and store, even when full math | 216 | instructions such as load and store, even when full math |
212 | emulation is disabled. Say "Y" here if you want to preserve | 217 | emulation is disabled. Say "Y" here if you want to preserve |
213 | this behavior. | 218 | this behavior. |
214 | 219 | ||
215 | It is recommended that you build a soft-float userspace instead. | 220 | It is recommended that you build a soft-float userspace instead. |
216 | 221 | ||
217 | config IOMMU_VMERGE | 222 | config IOMMU_VMERGE |
218 | bool "Enable IOMMU virtual merging" | 223 | bool "Enable IOMMU virtual merging" |
219 | depends on PPC64 | 224 | depends on PPC64 |
220 | default y | 225 | default y |
221 | help | 226 | help |
222 | Cause IO segments sent to a device for DMA to be merged virtually | 227 | Cause IO segments sent to a device for DMA to be merged virtually |
223 | by the IOMMU when they happen to have been allocated contiguously. | 228 | by the IOMMU when they happen to have been allocated contiguously. |
224 | This doesn't add pressure to the IOMMU allocator. However, some | 229 | This doesn't add pressure to the IOMMU allocator. However, some |
225 | drivers don't support getting large merged segments coming back | 230 | drivers don't support getting large merged segments coming back |
226 | from *_map_sg(). | 231 | from *_map_sg(). |
227 | 232 | ||
228 | Most drivers don't have this problem; it is safe to say Y here. | 233 | Most drivers don't have this problem; it is safe to say Y here. |
229 | 234 | ||
230 | config HOTPLUG_CPU | 235 | config HOTPLUG_CPU |
231 | bool "Support for enabling/disabling CPUs" | 236 | bool "Support for enabling/disabling CPUs" |
232 | depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) | 237 | depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) |
233 | ---help--- | 238 | ---help--- |
234 | Say Y here to be able to disable and re-enable individual | 239 | Say Y here to be able to disable and re-enable individual |
235 | CPUs at runtime on SMP machines. | 240 | CPUs at runtime on SMP machines. |
236 | 241 | ||
237 | Say N if you are unsure. | 242 | Say N if you are unsure. |
238 | 243 | ||
239 | config ARCH_ENABLE_MEMORY_HOTPLUG | 244 | config ARCH_ENABLE_MEMORY_HOTPLUG |
240 | def_bool y | 245 | def_bool y |
241 | 246 | ||
242 | config KEXEC | 247 | config KEXEC |
243 | bool "kexec system call (EXPERIMENTAL)" | 248 | bool "kexec system call (EXPERIMENTAL)" |
244 | depends on (PPC_PRPMC2800 || PPC_MULTIPLATFORM) && EXPERIMENTAL | 249 | depends on (PPC_PRPMC2800 || PPC_MULTIPLATFORM) && EXPERIMENTAL |
245 | help | 250 | help |
246 | kexec is a system call that implements the ability to shutdown your | 251 | kexec is a system call that implements the ability to shutdown your |
247 | current kernel, and to start another kernel. It is like a reboot | 252 | current kernel, and to start another kernel. It is like a reboot |
248 | but it is independent of the system firmware. And like a reboot | 253 | but it is independent of the system firmware. And like a reboot |
249 | you can start any kernel with it, not just Linux. | 254 | you can start any kernel with it, not just Linux. |
250 | 255 | ||
251 | The name comes from the similarity to the exec system call. | 256 | The name comes from the similarity to the exec system call. |
252 | 257 | ||
253 | It is an ongoing process to be certain the hardware in a machine | 258 | It is an ongoing process to be certain the hardware in a machine |
254 | is properly shutdown, so do not be surprised if this code does not | 259 | is properly shutdown, so do not be surprised if this code does not |
255 | initially work for you. It may help to enable device hotplugging | 260 | initially work for you. It may help to enable device hotplugging |
256 | support. As of this writing the exact hardware interface is | 261 | support. As of this writing the exact hardware interface is |
257 | strongly in flux, so no good recommendation can be made. | 262 | strongly in flux, so no good recommendation can be made. |
258 | 263 | ||
259 | config CRASH_DUMP | 264 | config CRASH_DUMP |
260 | bool "Build a kdump crash kernel (EXPERIMENTAL)" | 265 | bool "Build a kdump crash kernel (EXPERIMENTAL)" |
261 | depends on PPC_MULTIPLATFORM && PPC64 && EXPERIMENTAL | 266 | depends on PPC_MULTIPLATFORM && PPC64 && EXPERIMENTAL |
262 | help | 267 | help |
263 | Build a kernel suitable for use as a kdump capture kernel. | 268 | Build a kernel suitable for use as a kdump capture kernel. |
264 | The kernel will be linked at a different address than normal, and | 269 | The kernel will be linked at a different address than normal, and |
265 | so can only be used for Kdump. | 270 | so can only be used for Kdump. |
266 | 271 | ||
267 | Don't change this unless you know what you are doing. | 272 | Don't change this unless you know what you are doing. |
268 | 273 | ||
269 | config PPCBUG_NVRAM | 274 | config PPCBUG_NVRAM |
270 | bool "Enable reading PPCBUG NVRAM during boot" if PPLUS || LOPEC | 275 | bool "Enable reading PPCBUG NVRAM during boot" if PPLUS || LOPEC |
271 | default y if PPC_PREP | 276 | default y if PPC_PREP |
272 | 277 | ||
273 | config IRQ_ALL_CPUS | 278 | config IRQ_ALL_CPUS |
274 | bool "Distribute interrupts on all CPUs by default" | 279 | bool "Distribute interrupts on all CPUs by default" |
275 | depends on SMP && !MV64360 | 280 | depends on SMP && !MV64360 |
276 | help | 281 | help |
277 | This option gives the kernel permission to distribute IRQs across | 282 | This option gives the kernel permission to distribute IRQs across |
278 | multiple CPUs. Saying N here will route all IRQs to the first | 283 | multiple CPUs. Saying N here will route all IRQs to the first |
279 | CPU. Generally saying Y is safe, although some problems have been | 284 | CPU. Generally saying Y is safe, although some problems have been |
280 | reported with SMP Power Macintoshes with this option enabled. | 285 | reported with SMP Power Macintoshes with this option enabled. |
281 | 286 | ||
282 | config NUMA | 287 | config NUMA |
283 | bool "NUMA support" | 288 | bool "NUMA support" |
284 | depends on PPC64 | 289 | depends on PPC64 |
285 | default y if SMP && PPC_PSERIES | 290 | default y if SMP && PPC_PSERIES |
286 | 291 | ||
287 | config NODES_SHIFT | 292 | config NODES_SHIFT |
288 | int | 293 | int |
289 | default "4" | 294 | default "4" |
290 | depends on NEED_MULTIPLE_NODES | 295 | depends on NEED_MULTIPLE_NODES |
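For reference (based on the usual meaning of this symbol rather than anything stated in the diff), NODES_SHIFT bounds the node count as a power of two, so the default of 4 here allows up to 16 NUMA nodes, and the parisc default of 3 earlier in this diff allows 8:

	#include <assert.h>

	int main(void)
	{
		/* MAX_NUMNODES is (1 << NODES_SHIFT) */
		assert((1 << 4) == 16);	/* powerpc default above */
		assert((1 << 3) ==  8);	/* parisc default */
		return 0;
	}
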
291 | 296 | ||
292 | config ARCH_SELECT_MEMORY_MODEL | 297 | config ARCH_SELECT_MEMORY_MODEL |
293 | def_bool y | 298 | def_bool y |
294 | depends on PPC64 | 299 | depends on PPC64 |
295 | 300 | ||
296 | config ARCH_FLATMEM_ENABLE | 301 | config ARCH_FLATMEM_ENABLE |
297 | def_bool y | 302 | def_bool y |
298 | depends on (PPC64 && !NUMA) || PPC32 | 303 | depends on (PPC64 && !NUMA) || PPC32 |
299 | 304 | ||
300 | config ARCH_SPARSEMEM_ENABLE | 305 | config ARCH_SPARSEMEM_ENABLE |
301 | def_bool y | 306 | def_bool y |
302 | depends on PPC64 | 307 | depends on PPC64 |
303 | select SPARSEMEM_VMEMMAP_ENABLE | 308 | select SPARSEMEM_VMEMMAP_ENABLE |
304 | 309 | ||
305 | config ARCH_SPARSEMEM_DEFAULT | 310 | config ARCH_SPARSEMEM_DEFAULT |
306 | def_bool y | 311 | def_bool y |
307 | depends on (SMP && PPC_PSERIES) || PPC_PS3 | 312 | depends on (SMP && PPC_PSERIES) || PPC_PS3 |
308 | 313 | ||
309 | config ARCH_POPULATES_NODE_MAP | 314 | config ARCH_POPULATES_NODE_MAP |
310 | def_bool y | 315 | def_bool y |
311 | 316 | ||
312 | source "mm/Kconfig" | 317 | source "mm/Kconfig" |
313 | 318 | ||
314 | config ARCH_MEMORY_PROBE | 319 | config ARCH_MEMORY_PROBE |
315 | def_bool y | 320 | def_bool y |
316 | depends on MEMORY_HOTPLUG | 321 | depends on MEMORY_HOTPLUG |
317 | 322 | ||
318 | # Some NUMA nodes have memory ranges that span | 323 | # Some NUMA nodes have memory ranges that span |
319 | # other nodes. Even though a pfn is valid and | 324 | # other nodes. Even though a pfn is valid and |
320 | # between a node's start and end pfns, it may not | 325 | # between a node's start and end pfns, it may not |
321 | # reside on that node. See memmap_init_zone() | 326 | # reside on that node. See memmap_init_zone() |
322 | # for details. | 327 | # for details. |
323 | config NODES_SPAN_OTHER_NODES | 328 | config NODES_SPAN_OTHER_NODES |
324 | def_bool y | 329 | def_bool y |
325 | depends on NEED_MULTIPLE_NODES | 330 | depends on NEED_MULTIPLE_NODES |
326 | 331 | ||
327 | config PPC_HAS_HASH_64K | 332 | config PPC_HAS_HASH_64K |
328 | bool | 333 | bool |
329 | depends on PPC64 | 334 | depends on PPC64 |
330 | default n | 335 | default n |
331 | 336 | ||
332 | config PPC_64K_PAGES | 337 | config PPC_64K_PAGES |
333 | bool "64k page size" | 338 | bool "64k page size" |
334 | depends on PPC64 | 339 | depends on PPC64 |
335 | select PPC_HAS_HASH_64K | 340 | select PPC_HAS_HASH_64K |
336 | help | 341 | help |
337 | This option changes the kernel logical page size to 64k. On machines | 342 | This option changes the kernel logical page size to 64k. On machines |
338 | without processor support for 64k pages, the kernel will simulate | 343 | without processor support for 64k pages, the kernel will simulate |
339 | them by loading each individual 4k page on demand transparently, | 344 | them by loading each individual 4k page on demand transparently, |
340 | while on hardware with such support, it will be used to map | 345 | while on hardware with such support, it will be used to map |
341 | normal application pages. | 346 | normal application pages. |
342 | 347 | ||
343 | config SCHED_SMT | 348 | config SCHED_SMT |
344 | bool "SMT (Hyperthreading) scheduler support" | 349 | bool "SMT (Hyperthreading) scheduler support" |
345 | depends on PPC64 && SMP | 350 | depends on PPC64 && SMP |
346 | help | 351 | help |
347 | SMT scheduler support improves the CPU scheduler's decision making | 352 | SMT scheduler support improves the CPU scheduler's decision making |
348 | when dealing with POWER5 cpus at a cost of slightly increased | 353 | when dealing with POWER5 cpus at a cost of slightly increased |
349 | overhead in some places. If unsure say N here. | 354 | overhead in some places. If unsure say N here. |
350 | 355 | ||
351 | config PROC_DEVICETREE | 356 | config PROC_DEVICETREE |
352 | bool "Support for device tree in /proc" | 357 | bool "Support for device tree in /proc" |
353 | depends on PROC_FS | 358 | depends on PROC_FS |
354 | help | 359 | help |
355 | This option adds a device-tree directory under /proc which contains | 360 | This option adds a device-tree directory under /proc which contains |
356 | an image of the device tree that the kernel copies from Open | 361 | an image of the device tree that the kernel copies from Open |
357 | Firmware or other boot firmware. If unsure, say Y here. | 362 | Firmware or other boot firmware. If unsure, say Y here. |
358 | 363 | ||
359 | config CMDLINE_BOOL | 364 | config CMDLINE_BOOL |
360 | bool "Default bootloader kernel arguments" | 365 | bool "Default bootloader kernel arguments" |
361 | 366 | ||
362 | config CMDLINE | 367 | config CMDLINE |
363 | string "Initial kernel command string" | 368 | string "Initial kernel command string" |
364 | depends on CMDLINE_BOOL | 369 | depends on CMDLINE_BOOL |
365 | default "console=ttyS0,9600 console=tty0 root=/dev/sda2" | 370 | default "console=ttyS0,9600 console=tty0 root=/dev/sda2" |
366 | help | 371 | help |
367 | On some platforms, there is currently no way for the boot loader to | 372 | On some platforms, there is currently no way for the boot loader to |
368 | pass arguments to the kernel. For these platforms, you can supply | 373 | pass arguments to the kernel. For these platforms, you can supply |
369 | some command-line options at build time by entering them here. In | 374 | some command-line options at build time by entering them here. In |
370 | most cases you will need to specify the root device here. | 375 | most cases you will need to specify the root device here. |
371 | 376 | ||
372 | if !44x || BROKEN | 377 | if !44x || BROKEN |
373 | source kernel/power/Kconfig | 378 | source kernel/power/Kconfig |
374 | endif | 379 | endif |
375 | 380 | ||
376 | config SECCOMP | 381 | config SECCOMP |
377 | bool "Enable seccomp to safely compute untrusted bytecode" | 382 | bool "Enable seccomp to safely compute untrusted bytecode" |
378 | depends on PROC_FS | 383 | depends on PROC_FS |
379 | default y | 384 | default y |
380 | help | 385 | help |
381 | This kernel feature is useful for number crunching applications | 386 | This kernel feature is useful for number crunching applications |
382 | that may need to compute untrusted bytecode during their | 387 | that may need to compute untrusted bytecode during their |
383 | execution. By using pipes or other transports made available to | 388 | execution. By using pipes or other transports made available to |
384 | the process as file descriptors supporting the read/write | 389 | the process as file descriptors supporting the read/write |
385 | syscalls, it's possible to isolate those applications in | 390 | syscalls, it's possible to isolate those applications in |
386 | their own address space using seccomp. Once seccomp is | 391 | their own address space using seccomp. Once seccomp is |
387 | enabled via /proc/<pid>/seccomp, it cannot be disabled | 392 | enabled via /proc/<pid>/seccomp, it cannot be disabled |
388 | and the task is only allowed to execute a few safe syscalls | 393 | and the task is only allowed to execute a few safe syscalls |
389 | defined by each seccomp mode. | 394 | defined by each seccomp mode. |
390 | 395 | ||
391 | If unsure, say Y. Only embedded should say N here. | 396 | If unsure, say Y. Only embedded should say N here. |
392 | 397 | ||
393 | config WANT_DEVICE_TREE | 398 | config WANT_DEVICE_TREE |
394 | bool | 399 | bool |
395 | default n | 400 | default n |
396 | 401 | ||
397 | config DEVICE_TREE | 402 | config DEVICE_TREE |
398 | string "Static device tree source file" | 403 | string "Static device tree source file" |
399 | depends on WANT_DEVICE_TREE | 404 | depends on WANT_DEVICE_TREE |
400 | help | 405 | help |
401 | This specifies the device tree source (.dts) file to be | 406 | This specifies the device tree source (.dts) file to be |
402 | compiled and included when building the bootwrapper. If a | 407 | compiled and included when building the bootwrapper. If a |
403 | relative filename is given, then it will be relative to | 408 | relative filename is given, then it will be relative to |
404 | arch/powerpc/boot/dts. If you are not using the bootwrapper, | 409 | arch/powerpc/boot/dts. If you are not using the bootwrapper, |
405 | or do not need to build a dts into the bootwrapper, this | 410 | or do not need to build a dts into the bootwrapper, this |
406 | field is ignored. | 411 | field is ignored. |
407 | 412 | ||
408 | For example, this is required when building a cuImage target | 413 | For example, this is required when building a cuImage target |
409 | for an older U-Boot, which cannot pass a device tree itself. | 414 | for an older U-Boot, which cannot pass a device tree itself. |
410 | Such a kernel will not work with a newer U-Boot that tries to | 415 | Such a kernel will not work with a newer U-Boot that tries to |
411 | pass a device tree (unless you tell it not to). If your U-Boot | 416 | pass a device tree (unless you tell it not to). If your U-Boot |
412 | does not mention a device tree in "help bootm", then use the | 417 | does not mention a device tree in "help bootm", then use the |
413 | cuImage target and specify a device tree here. Otherwise, use | 418 | cuImage target and specify a device tree here. Otherwise, use |
414 | the uImage target and leave this field blank. | 419 | the uImage target and leave this field blank. |
415 | 420 | ||
416 | endmenu | 421 | endmenu |
417 | 422 | ||
418 | config ISA_DMA_API | 423 | config ISA_DMA_API |
419 | bool | 424 | bool |
420 | default y | 425 | default y |
421 | 426 | ||
422 | menu "Bus options" | 427 | menu "Bus options" |
423 | 428 | ||
424 | config ISA | 429 | config ISA |
425 | bool "Support for ISA-bus hardware" | 430 | bool "Support for ISA-bus hardware" |
426 | depends on PPC_PREP || PPC_CHRP | 431 | depends on PPC_PREP || PPC_CHRP |
427 | select PPC_I8259 | 432 | select PPC_I8259 |
428 | help | 433 | help |
429 | Find out whether you have ISA slots on your motherboard. ISA is the | 434 | Find out whether you have ISA slots on your motherboard. ISA is the |
430 | name of a bus system, i.e. the way the CPU talks to the other stuff | 435 | name of a bus system, i.e. the way the CPU talks to the other stuff |
431 | inside your box. If you have an Apple machine, say N here; if you | 436 | inside your box. If you have an Apple machine, say N here; if you |
432 | have an IBM RS/6000 or pSeries machine or a PReP machine, say Y. If | 437 | have an IBM RS/6000 or pSeries machine or a PReP machine, say Y. If |
433 | you have an embedded board, consult your board documentation. | 438 | you have an embedded board, consult your board documentation. |
434 | 439 | ||
435 | config ZONE_DMA | 440 | config ZONE_DMA |
436 | bool | 441 | bool |
437 | default y | 442 | default y |
438 | 443 | ||
439 | config GENERIC_ISA_DMA | 444 | config GENERIC_ISA_DMA |
440 | bool | 445 | bool |
441 | depends on PPC64 || POWER4 || 6xx && !CPM2 | 446 | depends on PPC64 || POWER4 || 6xx && !CPM2 |
442 | default y | 447 | default y |
443 | 448 | ||
444 | config PPC_INDIRECT_PCI | 449 | config PPC_INDIRECT_PCI |
445 | bool | 450 | bool |
446 | depends on PCI | 451 | depends on PCI |
447 | default y if 40x || 44x | 452 | default y if 40x || 44x |
448 | default n | 453 | default n |
449 | 454 | ||
450 | config EISA | 455 | config EISA |
451 | bool | 456 | bool |
452 | 457 | ||
453 | config SBUS | 458 | config SBUS |
454 | bool | 459 | bool |
455 | 460 | ||
456 | config FSL_SOC | 461 | config FSL_SOC |
457 | bool | 462 | bool |
458 | 463 | ||
459 | config FSL_PCI | 464 | config FSL_PCI |
460 | bool | 465 | bool |
461 | select PPC_INDIRECT_PCI | 466 | select PPC_INDIRECT_PCI |
462 | 467 | ||
463 | # Yes MCA RS/6000s exist but Linux-PPC does not currently support any | 468 | # Yes MCA RS/6000s exist but Linux-PPC does not currently support any |
464 | config MCA | 469 | config MCA |
465 | bool | 470 | bool |
466 | 471 | ||
467 | config PCI | 472 | config PCI |
468 | bool "PCI support" if 40x || CPM2 || PPC_83xx || PPC_85xx || PPC_86xx \ | 473 | bool "PCI support" if 40x || CPM2 || PPC_83xx || PPC_85xx || PPC_86xx \ |
469 | || PPC_MPC52xx || (EMBEDDED && (PPC_PSERIES || PPC_ISERIES)) \ | 474 | || PPC_MPC52xx || (EMBEDDED && (PPC_PSERIES || PPC_ISERIES)) \ |
470 | || PPC_PS3 | 475 | || PPC_PS3 |
471 | default y if !40x && !CPM2 && !8xx && !PPC_83xx \ | 476 | default y if !40x && !CPM2 && !8xx && !PPC_83xx \ |
472 | && !PPC_85xx && !PPC_86xx | 477 | && !PPC_85xx && !PPC_86xx |
473 | default PCI_PERMEDIA if !4xx && !CPM2 && !8xx | 478 | default PCI_PERMEDIA if !4xx && !CPM2 && !8xx |
474 | default PCI_QSPAN if !4xx && !CPM2 && 8xx | 479 | default PCI_QSPAN if !4xx && !CPM2 && 8xx |
475 | select ARCH_SUPPORTS_MSI | 480 | select ARCH_SUPPORTS_MSI |
476 | help | 481 | help |
477 | Find out whether your system includes a PCI bus. PCI is the name of | 482 | Find out whether your system includes a PCI bus. PCI is the name of |
478 | a bus system, i.e. the way the CPU talks to the other stuff inside | 483 | a bus system, i.e. the way the CPU talks to the other stuff inside |
479 | your box. If you say Y here, the kernel will include drivers and | 484 | your box. If you say Y here, the kernel will include drivers and |
480 | infrastructure code to support PCI bus devices. | 485 | infrastructure code to support PCI bus devices. |
481 | 486 | ||
482 | config PCI_DOMAINS | 487 | config PCI_DOMAINS |
483 | def_bool PCI | 488 | def_bool PCI |
484 | 489 | ||
485 | config PCI_SYSCALL | 490 | config PCI_SYSCALL |
486 | def_bool PCI | 491 | def_bool PCI |
487 | 492 | ||
488 | config PCI_QSPAN | 493 | config PCI_QSPAN |
489 | bool "QSpan PCI" | 494 | bool "QSpan PCI" |
490 | depends on !4xx && !CPM2 && 8xx | 495 | depends on !4xx && !CPM2 && 8xx |
491 | select PPC_I8259 | 496 | select PPC_I8259 |
492 | help | 497 | help |
493 | Say Y here if you have a system based on a Motorola 8xx-series | 498 | Say Y here if you have a system based on a Motorola 8xx-series |
494 | embedded processor with a QSPAN PCI interface, otherwise say N. | 499 | embedded processor with a QSPAN PCI interface, otherwise say N. |
495 | 500 | ||
496 | config PCI_8260 | 501 | config PCI_8260 |
497 | bool | 502 | bool |
498 | depends on PCI && 8260 | 503 | depends on PCI && 8260 |
499 | select PPC_INDIRECT_PCI | 504 | select PPC_INDIRECT_PCI |
500 | default y | 505 | default y |
501 | 506 | ||
502 | config 8260_PCI9 | 507 | config 8260_PCI9 |
503 | bool "Enable workaround for MPC826x erratum PCI 9" | 508 | bool "Enable workaround for MPC826x erratum PCI 9" |
504 | depends on PCI_8260 && !8272 | 509 | depends on PCI_8260 && !8272 |
505 | default y | 510 | default y |
506 | 511 | ||
507 | choice | 512 | choice |
508 | prompt "IDMA channel for PCI 9 workaround" | 513 | prompt "IDMA channel for PCI 9 workaround" |
509 | depends on 8260_PCI9 | 514 | depends on 8260_PCI9 |
510 | 515 | ||
511 | config 8260_PCI9_IDMA1 | 516 | config 8260_PCI9_IDMA1 |
512 | bool "IDMA1" | 517 | bool "IDMA1" |
513 | 518 | ||
514 | config 8260_PCI9_IDMA2 | 519 | config 8260_PCI9_IDMA2 |
515 | bool "IDMA2" | 520 | bool "IDMA2" |
516 | 521 | ||
517 | config 8260_PCI9_IDMA3 | 522 | config 8260_PCI9_IDMA3 |
518 | bool "IDMA3" | 523 | bool "IDMA3" |
519 | 524 | ||
520 | config 8260_PCI9_IDMA4 | 525 | config 8260_PCI9_IDMA4 |
521 | bool "IDMA4" | 526 | bool "IDMA4" |
522 | 527 | ||
523 | endchoice | 528 | endchoice |
524 | 529 | ||
525 | source "drivers/pci/pcie/Kconfig" | 530 | source "drivers/pci/pcie/Kconfig" |
526 | 531 | ||
527 | source "drivers/pci/Kconfig" | 532 | source "drivers/pci/Kconfig" |
528 | 533 | ||
529 | source "drivers/pcmcia/Kconfig" | 534 | source "drivers/pcmcia/Kconfig" |
530 | 535 | ||
531 | source "drivers/pci/hotplug/Kconfig" | 536 | source "drivers/pci/hotplug/Kconfig" |
532 | 537 | ||
533 | endmenu | 538 | endmenu |
534 | 539 | ||
535 | menu "Advanced setup" | 540 | menu "Advanced setup" |
536 | depends on PPC32 | 541 | depends on PPC32 |
537 | 542 | ||
538 | config ADVANCED_OPTIONS | 543 | config ADVANCED_OPTIONS |
539 | bool "Prompt for advanced kernel configuration options" | 544 | bool "Prompt for advanced kernel configuration options" |
540 | help | 545 | help |
541 | This option will enable prompting for a variety of advanced kernel | 546 | This option will enable prompting for a variety of advanced kernel |
542 | configuration options. These options can cause the kernel to not | 547 | configuration options. These options can cause the kernel to not |
543 | work if they are set incorrectly, but can be used to optimize certain | 548 | work if they are set incorrectly, but can be used to optimize certain |
544 | aspects of kernel memory management. | 549 | aspects of kernel memory management. |
545 | 550 | ||
546 | Unless you know what you are doing, say N here. | 551 | Unless you know what you are doing, say N here. |
547 | 552 | ||
548 | comment "Default settings for advanced configuration options are used" | 553 | comment "Default settings for advanced configuration options are used" |
549 | depends on !ADVANCED_OPTIONS | 554 | depends on !ADVANCED_OPTIONS |
550 | 555 | ||
551 | config HIGHMEM_START_BOOL | 556 | config HIGHMEM_START_BOOL |
552 | bool "Set high memory pool address" | 557 | bool "Set high memory pool address" |
553 | depends on ADVANCED_OPTIONS && HIGHMEM | 558 | depends on ADVANCED_OPTIONS && HIGHMEM |
554 | help | 559 | help |
555 | This option allows you to set the base address of the kernel virtual | 560 | This option allows you to set the base address of the kernel virtual |
556 | area used to map high memory pages. This can be useful in | 561 | area used to map high memory pages. This can be useful in |
557 | optimizing the layout of kernel virtual memory. | 562 | optimizing the layout of kernel virtual memory. |
558 | 563 | ||
559 | Say N here unless you know what you are doing. | 564 | Say N here unless you know what you are doing. |
560 | 565 | ||
561 | config HIGHMEM_START | 566 | config HIGHMEM_START |
562 | hex "Virtual start address of high memory pool" if HIGHMEM_START_BOOL | 567 | hex "Virtual start address of high memory pool" if HIGHMEM_START_BOOL |
563 | default "0xfe000000" | 568 | default "0xfe000000" |
564 | 569 | ||
565 | config LOWMEM_SIZE_BOOL | 570 | config LOWMEM_SIZE_BOOL |
566 | bool "Set maximum low memory" | 571 | bool "Set maximum low memory" |
567 | depends on ADVANCED_OPTIONS | 572 | depends on ADVANCED_OPTIONS |
568 | help | 573 | help |
569 | This option allows you to set the maximum amount of memory which | 574 | This option allows you to set the maximum amount of memory which |
570 | will be used as "low memory", that is, memory which the kernel can | 575 | will be used as "low memory", that is, memory which the kernel can |
571 | access directly, without having to set up a kernel virtual mapping. | 576 | access directly, without having to set up a kernel virtual mapping. |
572 | This can be useful in optimizing the layout of kernel virtual | 577 | This can be useful in optimizing the layout of kernel virtual |
573 | memory. | 578 | memory. |
574 | 579 | ||
575 | Say N here unless you know what you are doing. | 580 | Say N here unless you know what you are doing. |
576 | 581 | ||
577 | config LOWMEM_SIZE | 582 | config LOWMEM_SIZE |
578 | hex "Maximum low memory size (in bytes)" if LOWMEM_SIZE_BOOL | 583 | hex "Maximum low memory size (in bytes)" if LOWMEM_SIZE_BOOL |
579 | default "0x30000000" | 584 | default "0x30000000" |
580 | 585 | ||
581 | config KERNEL_START_BOOL | 586 | config KERNEL_START_BOOL |
582 | bool "Set custom kernel base address" | 587 | bool "Set custom kernel base address" |
583 | depends on ADVANCED_OPTIONS | 588 | depends on ADVANCED_OPTIONS |
584 | help | 589 | help |
585 | This option allows you to set the kernel virtual address at which | 590 | This option allows you to set the kernel virtual address at which |
586 | the kernel will map low memory (the kernel image will be linked at | 591 | the kernel will map low memory (the kernel image will be linked at |
587 | this address). This can be useful in optimizing the virtual memory | 592 | this address). This can be useful in optimizing the virtual memory |
588 | layout of the system. | 593 | layout of the system. |
589 | 594 | ||
590 | Say N here unless you know what you are doing. | 595 | Say N here unless you know what you are doing. |
591 | 596 | ||
592 | config KERNEL_START | 597 | config KERNEL_START |
593 | hex "Virtual address of kernel base" if KERNEL_START_BOOL | 598 | hex "Virtual address of kernel base" if KERNEL_START_BOOL |
594 | default "0xc0000000" | 599 | default "0xc0000000" |
595 | 600 | ||
596 | config TASK_SIZE_BOOL | 601 | config TASK_SIZE_BOOL |
597 | bool "Set custom user task size" | 602 | bool "Set custom user task size" |
598 | depends on ADVANCED_OPTIONS | 603 | depends on ADVANCED_OPTIONS |
599 | help | 604 | help |
600 | This option allows you to set the amount of virtual address space | 605 | This option allows you to set the amount of virtual address space |
601 | allocated to user tasks. This can be useful in optimizing the | 606 | allocated to user tasks. This can be useful in optimizing the |
602 | virtual memory layout of the system. | 607 | virtual memory layout of the system. |
603 | 608 | ||
604 | Say N here unless you know what you are doing. | 609 | Say N here unless you know what you are doing. |
605 | 610 | ||
606 | config TASK_SIZE | 611 | config TASK_SIZE |
607 | hex "Size of user task space" if TASK_SIZE_BOOL | 612 | hex "Size of user task space" if TASK_SIZE_BOOL |
608 | default "0x80000000" if PPC_PREP || PPC_8xx | 613 | default "0x80000000" if PPC_PREP || PPC_8xx |
609 | default "0xc0000000" | 614 | default "0xc0000000" |
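Taken together, the defaults above describe the stock PPC32 layout. As a rough illustration (my reading of these values, not something the diff itself states): user space runs up to TASK_SIZE, the kernel maps low memory starting at KERNEL_START, and that low-memory mapping ends below the high-memory pool at HIGHMEM_START.

	#include <assert.h>

	int main(void)
	{
		unsigned long task_size     = 0xc0000000UL;	/* default TASK_SIZE (non-PReP/8xx) */
		unsigned long kernel_start  = 0xc0000000UL;	/* default KERNEL_START */
		unsigned long lowmem_size   = 0x30000000UL;	/* default LOWMEM_SIZE (768MB) */
		unsigned long highmem_start = 0xfe000000UL;	/* default HIGHMEM_START */

		/* user addresses end where the kernel's lowmem mapping begins ... */
		assert(task_size == kernel_start);
		/* ... and that mapping (0xc0000000-0xf0000000) stays below the highmem pool */
		assert(kernel_start + lowmem_size <= highmem_start);
		return 0;
	}
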
610 | 615 | ||
611 | config CONSISTENT_START_BOOL | 616 | config CONSISTENT_START_BOOL |
612 | bool "Set custom consistent memory pool address" | 617 | bool "Set custom consistent memory pool address" |
613 | depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE | 618 | depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE |
614 | help | 619 | help |
615 | This option allows you to set the base virtual address | 620 | This option allows you to set the base virtual address |
616 | of the consistent memory pool. This pool of virtual | 621 | of the consistent memory pool. This pool of virtual |
617 | memory is used to make consistent memory allocations. | 622 | memory is used to make consistent memory allocations. |
618 | 623 | ||
619 | config CONSISTENT_START | 624 | config CONSISTENT_START |
620 | hex "Base virtual address of consistent memory pool" if CONSISTENT_START_BOOL | 625 | hex "Base virtual address of consistent memory pool" if CONSISTENT_START_BOOL |
621 | default "0xfd000000" if (NOT_COHERENT_CACHE && 8xx) | 626 | default "0xfd000000" if (NOT_COHERENT_CACHE && 8xx) |
622 | default "0xff100000" if NOT_COHERENT_CACHE | 627 | default "0xff100000" if NOT_COHERENT_CACHE |
623 | 628 | ||
624 | config CONSISTENT_SIZE_BOOL | 629 | config CONSISTENT_SIZE_BOOL |
625 | bool "Set custom consistent memory pool size" | 630 | bool "Set custom consistent memory pool size" |
626 | depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE | 631 | depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE |
627 | help | 632 | help |
628 | This option allows you to set the size of the | 633 | This option allows you to set the size of the |
629 | consistent memory pool. This pool of virtual memory | 634 | consistent memory pool. This pool of virtual memory |
630 | is used to make consistent memory allocations. | 635 | is used to make consistent memory allocations. |
631 | 636 | ||
632 | config CONSISTENT_SIZE | 637 | config CONSISTENT_SIZE |
633 | hex "Size of consistent memory pool" if CONSISTENT_SIZE_BOOL | 638 | hex "Size of consistent memory pool" if CONSISTENT_SIZE_BOOL |
634 | default "0x00200000" if NOT_COHERENT_CACHE | 639 | default "0x00200000" if NOT_COHERENT_CACHE |
635 | 640 | ||
636 | config BOOT_LOAD_BOOL | 641 | config BOOT_LOAD_BOOL |
637 | bool "Set the boot link/load address" | 642 | bool "Set the boot link/load address" |
638 | depends on ADVANCED_OPTIONS && !PPC_MULTIPLATFORM | 643 | depends on ADVANCED_OPTIONS && !PPC_MULTIPLATFORM |
639 | help | 644 | help |
640 | This option allows you to set the initial load address of the zImage | 645 | This option allows you to set the initial load address of the zImage |
641 | or zImage.initrd file. This can be useful if you are on a board | 646 | or zImage.initrd file. This can be useful if you are on a board |
642 | which has a small amount of memory. | 647 | which has a small amount of memory. |
643 | 648 | ||
644 | Say N here unless you know what you are doing. | 649 | Say N here unless you know what you are doing. |
645 | 650 | ||
646 | config BOOT_LOAD | 651 | config BOOT_LOAD |
647 | hex "Link/load address for booting" if BOOT_LOAD_BOOL | 652 | hex "Link/load address for booting" if BOOT_LOAD_BOOL |
648 | default "0x00400000" if 40x || 8xx || 8260 | 653 | default "0x00400000" if 40x || 8xx || 8260 |
649 | default "0x01000000" if 44x | 654 | default "0x01000000" if 44x |
650 | default "0x00800000" | 655 | default "0x00800000" |
651 | 656 | ||
652 | config PIN_TLB | 657 | config PIN_TLB |
653 | bool "Pinned Kernel TLBs (860 ONLY)" | 658 | bool "Pinned Kernel TLBs (860 ONLY)" |
654 | depends on ADVANCED_OPTIONS && 8xx | 659 | depends on ADVANCED_OPTIONS && 8xx |
655 | endmenu | 660 | endmenu |
656 | 661 | ||
657 | if PPC64 | 662 | if PPC64 |
658 | config KERNEL_START | 663 | config KERNEL_START |
659 | hex | 664 | hex |
660 | default "0xc000000000000000" | 665 | default "0xc000000000000000" |
661 | endif | 666 | endif |
662 | 667 | ||
663 | source "net/Kconfig" | 668 | source "net/Kconfig" |
664 | 669 | ||
665 | source "drivers/Kconfig" | 670 | source "drivers/Kconfig" |
666 | 671 | ||
667 | source "fs/Kconfig" | 672 | source "fs/Kconfig" |
668 | 673 | ||
669 | # XXX source "arch/ppc/8xx_io/Kconfig" | 674 | # XXX source "arch/ppc/8xx_io/Kconfig" |
670 | 675 | ||
671 | # XXX source "arch/ppc/8260_io/Kconfig" | 676 | # XXX source "arch/ppc/8260_io/Kconfig" |
672 | 677 | ||
673 | source "arch/powerpc/sysdev/qe_lib/Kconfig" | 678 | source "arch/powerpc/sysdev/qe_lib/Kconfig" |
674 | 679 | ||
675 | source "lib/Kconfig" | 680 | source "lib/Kconfig" |
676 | 681 | ||
677 | source "kernel/Kconfig.instrumentation" | 682 | source "kernel/Kconfig.instrumentation" |
678 | 683 | ||
679 | source "arch/powerpc/Kconfig.debug" | 684 | source "arch/powerpc/Kconfig.debug" |
680 | 685 | ||
681 | source "security/Kconfig" | 686 | source "security/Kconfig" |
682 | 687 | ||
683 | config KEYS_COMPAT | 688 | config KEYS_COMPAT |
684 | bool | 689 | bool |
685 | depends on COMPAT && KEYS | 690 | depends on COMPAT && KEYS |
686 | default y | 691 | default y |
687 | 692 | ||
688 | source "crypto/Kconfig" | 693 | source "crypto/Kconfig" |
689 | 694 | ||
690 | config PPC_CLOCK | 695 | config PPC_CLOCK |
691 | bool | 696 | bool |
692 | default n | 697 | default n |
693 | 698 | ||
694 | config PPC_LIB_RHEAP | 699 | config PPC_LIB_RHEAP |
695 | bool | 700 | bool |
696 | 701 | ||
697 | 702 |
arch/sparc64/Kconfig
1 | # $Id: config.in,v 1.158 2002/01/24 22:14:44 davem Exp $ | 1 | # $Id: config.in,v 1.158 2002/01/24 22:14:44 davem Exp $ |
2 | # For a description of the syntax of this configuration file, | 2 | # For a description of the syntax of this configuration file, |
3 | # see the Configure script. | 3 | # see the Configure script. |
4 | # | 4 | # |
5 | 5 | ||
6 | mainmenu "Linux/UltraSPARC Kernel Configuration" | 6 | mainmenu "Linux/UltraSPARC Kernel Configuration" |
7 | 7 | ||
8 | config SPARC | 8 | config SPARC |
9 | bool | 9 | bool |
10 | default y | 10 | default y |
11 | 11 | ||
12 | config SPARC64 | 12 | config SPARC64 |
13 | bool | 13 | bool |
14 | default y | 14 | default y |
15 | help | 15 | help |
16 | SPARC is a family of RISC microprocessors designed and marketed by | 16 | SPARC is a family of RISC microprocessors designed and marketed by |
17 | Sun Microsystems, incorporated. This port covers the newer 64-bit | 17 | Sun Microsystems, incorporated. This port covers the newer 64-bit |
18 | UltraSPARC. The UltraLinux project maintains both the SPARC32 and | 18 | UltraSPARC. The UltraLinux project maintains both the SPARC32 and |
19 | SPARC64 ports; its web page is available at | 19 | SPARC64 ports; its web page is available at |
20 | <http://www.ultralinux.org/>. | 20 | <http://www.ultralinux.org/>. |
21 | 21 | ||
22 | config GENERIC_TIME | 22 | config GENERIC_TIME |
23 | bool | 23 | bool |
24 | default y | 24 | default y |
25 | 25 | ||
26 | config GENERIC_CMOS_UPDATE | 26 | config GENERIC_CMOS_UPDATE |
27 | bool | 27 | bool |
28 | default y | 28 | default y |
29 | 29 | ||
30 | config GENERIC_CLOCKEVENTS | 30 | config GENERIC_CLOCKEVENTS |
31 | bool | 31 | bool |
32 | default y | 32 | default y |
33 | 33 | ||
34 | config 64BIT | 34 | config 64BIT |
35 | def_bool y | 35 | def_bool y |
36 | 36 | ||
37 | config MMU | 37 | config MMU |
38 | bool | 38 | bool |
39 | default y | 39 | default y |
40 | 40 | ||
41 | config QUICKLIST | 41 | config QUICKLIST |
42 | bool | 42 | bool |
43 | default y | 43 | default y |
44 | 44 | ||
45 | config STACKTRACE_SUPPORT | 45 | config STACKTRACE_SUPPORT |
46 | bool | 46 | bool |
47 | default y | 47 | default y |
48 | 48 | ||
49 | config LOCKDEP_SUPPORT | 49 | config LOCKDEP_SUPPORT |
50 | bool | 50 | bool |
51 | default y | 51 | default y |
52 | 52 | ||
53 | config ARCH_MAY_HAVE_PC_FDC | 53 | config ARCH_MAY_HAVE_PC_FDC |
54 | bool | 54 | bool |
55 | default y | 55 | default y |
56 | 56 | ||
57 | config ARCH_HAS_ILOG2_U32 | 57 | config ARCH_HAS_ILOG2_U32 |
58 | bool | 58 | bool |
59 | default n | 59 | default n |
60 | 60 | ||
61 | config ARCH_HAS_ILOG2_U64 | 61 | config ARCH_HAS_ILOG2_U64 |
62 | bool | 62 | bool |
63 | default n | 63 | default n |
64 | 64 | ||
65 | config AUDIT_ARCH | 65 | config AUDIT_ARCH |
66 | bool | 66 | bool |
67 | default y | 67 | default y |
68 | 68 | ||
69 | config ARCH_NO_VIRT_TO_BUS | 69 | config ARCH_NO_VIRT_TO_BUS |
70 | def_bool y | 70 | def_bool y |
71 | 71 | ||
72 | config OF | 72 | config OF |
73 | def_bool y | 73 | def_bool y |
74 | 74 | ||
75 | config GENERIC_HARDIRQS_NO__DO_IRQ | 75 | config GENERIC_HARDIRQS_NO__DO_IRQ |
76 | bool | 76 | bool |
77 | def_bool y | 77 | def_bool y |
78 | 78 | ||
79 | choice | 79 | choice |
80 | prompt "Kernel page size" | 80 | prompt "Kernel page size" |
81 | default SPARC64_PAGE_SIZE_8KB | 81 | default SPARC64_PAGE_SIZE_8KB |
82 | 82 | ||
83 | config SPARC64_PAGE_SIZE_8KB | 83 | config SPARC64_PAGE_SIZE_8KB |
84 | bool "8KB" | 84 | bool "8KB" |
85 | help | 85 | help |
86 | This lets you select the page size of the kernel. | 86 | This lets you select the page size of the kernel. |
87 | 87 | ||
88 | 8KB and 64KB work quite well, since Sparc ELF sections | 88 | 8KB and 64KB work quite well, since Sparc ELF sections |
89 | provide for up to 64KB alignment. | 89 | provide for up to 64KB alignment. |
90 | 90 | ||
91 | Therefore, 512KB and 4MB are for expert hackers only. | 91 | Therefore, 512KB and 4MB are for expert hackers only. |
92 | 92 | ||
93 | If you don't know what to do, choose 8KB. | 93 | If you don't know what to do, choose 8KB. |
94 | 94 | ||
95 | config SPARC64_PAGE_SIZE_64KB | 95 | config SPARC64_PAGE_SIZE_64KB |
96 | bool "64KB" | 96 | bool "64KB" |
97 | 97 | ||
98 | config SPARC64_PAGE_SIZE_512KB | 98 | config SPARC64_PAGE_SIZE_512KB |
99 | bool "512KB" | 99 | bool "512KB" |
100 | 100 | ||
101 | config SPARC64_PAGE_SIZE_4MB | 101 | config SPARC64_PAGE_SIZE_4MB |
102 | bool "4MB" | 102 | bool "4MB" |
103 | 103 | ||
104 | endchoice | 104 | endchoice |
105 | 105 | ||
106 | config SECCOMP | 106 | config SECCOMP |
107 | bool "Enable seccomp to safely compute untrusted bytecode" | 107 | bool "Enable seccomp to safely compute untrusted bytecode" |
108 | depends on PROC_FS | 108 | depends on PROC_FS |
109 | default y | 109 | default y |
110 | help | 110 | help |
111 | This kernel feature is useful for number crunching applications | 111 | This kernel feature is useful for number crunching applications |
112 | that may need to compute untrusted bytecode during their | 112 | that may need to compute untrusted bytecode during their |
113 | execution. By using pipes or other transports made available to | 113 | execution. By using pipes or other transports made available to |
114 | the process as file descriptors supporting the read/write | 114 | the process as file descriptors supporting the read/write |
115 | syscalls, it's possible to isolate those applications in | 115 | syscalls, it's possible to isolate those applications in |
116 | their own address space using seccomp. Once seccomp is | 116 | their own address space using seccomp. Once seccomp is |
117 | enabled via /proc/<pid>/seccomp, it cannot be disabled | 117 | enabled via /proc/<pid>/seccomp, it cannot be disabled |
118 | and the task is only allowed to execute a few safe syscalls | 118 | and the task is only allowed to execute a few safe syscalls |
119 | defined by each seccomp mode. | 119 | defined by each seccomp mode. |
120 | 120 | ||
121 | If unsure, say Y. Only embedded should say N here. | 121 | If unsure, say Y. Only embedded should say N here. |
122 | 122 | ||
123 | source kernel/Kconfig.hz | 123 | source kernel/Kconfig.hz |
124 | 124 | ||
125 | config HOTPLUG_CPU | 125 | config HOTPLUG_CPU |
126 | bool "Support for hot-pluggable CPUs" | 126 | bool "Support for hot-pluggable CPUs" |
127 | depends on SMP | 127 | depends on SMP |
128 | select HOTPLUG | 128 | select HOTPLUG |
129 | ---help--- | 129 | ---help--- |
130 | Say Y here to experiment with turning CPUs off and on. CPUs | 130 | Say Y here to experiment with turning CPUs off and on. CPUs |
131 | can be controlled through /sys/devices/system/cpu/cpu#. | 131 | can be controlled through /sys/devices/system/cpu/cpu#. |
132 | Say N if you want to disable CPU hotplug. | 132 | Say N if you want to disable CPU hotplug. |
133 | 133 | ||
134 | source "init/Kconfig" | 134 | source "init/Kconfig" |
135 | 135 | ||
136 | config SYSVIPC_COMPAT | 136 | config SYSVIPC_COMPAT |
137 | bool | 137 | bool |
138 | depends on COMPAT && SYSVIPC | 138 | depends on COMPAT && SYSVIPC |
139 | default y | 139 | default y |
140 | 140 | ||
141 | config GENERIC_HARDIRQS | 141 | config GENERIC_HARDIRQS |
142 | bool | 142 | bool |
143 | default y | 143 | default y |
144 | 144 | ||
145 | menu "General machine setup" | 145 | menu "General machine setup" |
146 | 146 | ||
147 | source "kernel/time/Kconfig" | 147 | source "kernel/time/Kconfig" |
148 | 148 | ||
149 | config SMP | 149 | config SMP |
150 | bool "Symmetric multi-processing support" | 150 | bool "Symmetric multi-processing support" |
151 | ---help--- | 151 | ---help--- |
152 | This enables support for systems with more than one CPU. If you have | 152 | This enables support for systems with more than one CPU. If you have |
153 | a system with only one CPU, say N. If you have a system with more than | 153 | a system with only one CPU, say N. If you have a system with more than |
154 | one CPU, say Y. | 154 | one CPU, say Y. |
155 | 155 | ||
156 | If you say N here, the kernel will run on single and multiprocessor | 156 | If you say N here, the kernel will run on single and multiprocessor |
157 | machines, but will use only one CPU of a multiprocessor machine. If | 157 | machines, but will use only one CPU of a multiprocessor machine. If |
158 | you say Y here, the kernel will run on many, but not all, | 158 | you say Y here, the kernel will run on many, but not all, |
159 | singleprocessor machines. On a singleprocessor machine, the kernel | 159 | singleprocessor machines. On a singleprocessor machine, the kernel |
160 | will run faster if you say N here. | 160 | will run faster if you say N here. |
161 | 161 | ||
162 | People using multiprocessor machines who say Y here should also say | 162 | People using multiprocessor machines who say Y here should also say |
163 | Y to "Enhanced Real Time Clock Support", below. The "Advanced Power | 163 | Y to "Enhanced Real Time Clock Support", below. The "Advanced Power |
164 | Management" code will be disabled if you say Y here. | 164 | Management" code will be disabled if you say Y here. |
165 | 165 | ||
166 | See also the <file:Documentation/smp.txt>, | 166 | See also the <file:Documentation/smp.txt>, |
167 | <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at | 167 | <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at |
168 | <http://www.tldp.org/docs.html#howto>. | 168 | <http://www.tldp.org/docs.html#howto>. |
169 | 169 | ||
170 | If you don't know what to do here, say N. | 170 | If you don't know what to do here, say N. |
171 | 171 | ||
172 | config NR_CPUS | 172 | config NR_CPUS |
173 | int "Maximum number of CPUs (2-1024)" | 173 | int "Maximum number of CPUs (2-1024)" |
174 | range 2 1024 | 174 | range 2 1024 |
175 | depends on SMP | 175 | depends on SMP |
176 | default "64" | 176 | default "64" |
177 | 177 | ||
178 | source "drivers/cpufreq/Kconfig" | 178 | source "drivers/cpufreq/Kconfig" |
179 | 179 | ||
180 | config US3_FREQ | 180 | config US3_FREQ |
181 | tristate "UltraSPARC-III CPU Frequency driver" | 181 | tristate "UltraSPARC-III CPU Frequency driver" |
182 | depends on CPU_FREQ | 182 | depends on CPU_FREQ |
183 | select CPU_FREQ_TABLE | 183 | select CPU_FREQ_TABLE |
184 | help | 184 | help |
185 | This adds the CPUFreq driver for UltraSPARC-III processors. | 185 | This adds the CPUFreq driver for UltraSPARC-III processors. |
186 | 186 | ||
187 | For details, take a look at <file:Documentation/cpu-freq>. | 187 | For details, take a look at <file:Documentation/cpu-freq>. |
188 | 188 | ||
189 | If in doubt, say N. | 189 | If in doubt, say N. |
190 | 190 | ||
191 | config US2E_FREQ | 191 | config US2E_FREQ |
192 | tristate "UltraSPARC-IIe CPU Frequency driver" | 192 | tristate "UltraSPARC-IIe CPU Frequency driver" |
193 | depends on CPU_FREQ | 193 | depends on CPU_FREQ |
194 | select CPU_FREQ_TABLE | 194 | select CPU_FREQ_TABLE |
195 | help | 195 | help |
196 | This adds the CPUFreq driver for UltraSPARC-IIe processors. | 196 | This adds the CPUFreq driver for UltraSPARC-IIe processors. |
197 | 197 | ||
198 | For details, take a look at <file:Documentation/cpu-freq>. | 198 | For details, take a look at <file:Documentation/cpu-freq>. |
199 | 199 | ||
200 | If in doubt, say N. | 200 | If in doubt, say N. |
201 | 201 | ||
202 | # Global things across all Sun machines. | 202 | # Global things across all Sun machines. |
203 | config GENERIC_LOCKBREAK | ||
204 | bool | ||
205 | default y | ||
206 | depends on SMP && PREEMPT | ||
207 | |||
203 | config RWSEM_GENERIC_SPINLOCK | 208 | config RWSEM_GENERIC_SPINLOCK |
204 | bool | 209 | bool |
205 | 210 | ||
206 | config RWSEM_XCHGADD_ALGORITHM | 211 | config RWSEM_XCHGADD_ALGORITHM |
207 | bool | 212 | bool |
208 | default y | 213 | default y |
209 | 214 | ||
210 | config GENERIC_FIND_NEXT_BIT | 215 | config GENERIC_FIND_NEXT_BIT |
211 | bool | 216 | bool |
212 | default y | 217 | default y |
213 | 218 | ||
214 | config GENERIC_HWEIGHT | 219 | config GENERIC_HWEIGHT |
215 | bool | 220 | bool |
216 | default y if !ULTRA_HAS_POPULATION_COUNT | 221 | default y if !ULTRA_HAS_POPULATION_COUNT |
217 | 222 | ||
218 | config GENERIC_CALIBRATE_DELAY | 223 | config GENERIC_CALIBRATE_DELAY |
219 | bool | 224 | bool |
220 | default y | 225 | default y |
221 | 226 | ||
222 | choice | 227 | choice |
223 | prompt "SPARC64 Huge TLB Page Size" | 228 | prompt "SPARC64 Huge TLB Page Size" |
224 | depends on HUGETLB_PAGE | 229 | depends on HUGETLB_PAGE |
225 | default HUGETLB_PAGE_SIZE_4MB | 230 | default HUGETLB_PAGE_SIZE_4MB |
226 | 231 | ||
227 | config HUGETLB_PAGE_SIZE_4MB | 232 | config HUGETLB_PAGE_SIZE_4MB |
228 | bool "4MB" | 233 | bool "4MB" |
229 | 234 | ||
230 | config HUGETLB_PAGE_SIZE_512K | 235 | config HUGETLB_PAGE_SIZE_512K |
231 | depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB | 236 | depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB |
232 | bool "512K" | 237 | bool "512K" |
233 | 238 | ||
234 | config HUGETLB_PAGE_SIZE_64K | 239 | config HUGETLB_PAGE_SIZE_64K |
235 | depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB && !SPARC64_PAGE_SIZE_64KB | 240 | depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB && !SPARC64_PAGE_SIZE_64KB |
236 | bool "64K" | 241 | bool "64K" |
237 | 242 | ||
238 | endchoice | 243 | endchoice |
239 | 244 | ||
240 | endmenu | 245 | endmenu |
241 | 246 | ||
242 | config ARCH_SELECT_MEMORY_MODEL | 247 | config ARCH_SELECT_MEMORY_MODEL |
243 | def_bool y | 248 | def_bool y |
244 | 249 | ||
245 | config ARCH_SPARSEMEM_ENABLE | 250 | config ARCH_SPARSEMEM_ENABLE |
246 | def_bool y | 251 | def_bool y |
247 | select SPARSEMEM_VMEMMAP_ENABLE | 252 | select SPARSEMEM_VMEMMAP_ENABLE |
248 | 253 | ||
249 | config ARCH_SPARSEMEM_DEFAULT | 254 | config ARCH_SPARSEMEM_DEFAULT |
250 | def_bool y | 255 | def_bool y |
251 | 256 | ||
252 | source "mm/Kconfig" | 257 | source "mm/Kconfig" |
253 | 258 | ||
254 | config ISA | 259 | config ISA |
255 | bool | 260 | bool |
256 | help | 261 | help |
257 | Find out whether you have ISA slots on your motherboard. ISA is the | 262 | Find out whether you have ISA slots on your motherboard. ISA is the |
258 | name of a bus system, i.e. the way the CPU talks to the other stuff | 263 | name of a bus system, i.e. the way the CPU talks to the other stuff |
259 | inside your box. Other bus systems are PCI, EISA, MicroChannel | 264 | inside your box. Other bus systems are PCI, EISA, MicroChannel |
260 | (MCA) or VESA. ISA is an older system, now being displaced by PCI; | 265 | (MCA) or VESA. ISA is an older system, now being displaced by PCI; |
261 | newer boards don't support it. If you have ISA, say Y, otherwise N. | 266 | newer boards don't support it. If you have ISA, say Y, otherwise N. |
262 | 267 | ||
263 | config ISAPNP | 268 | config ISAPNP |
264 | bool | 269 | bool |
265 | help | 270 | help |
266 | Say Y here if you would like support for ISA Plug and Play devices. | 271 | Say Y here if you would like support for ISA Plug and Play devices. |
267 | Some information is in <file:Documentation/isapnp.txt>. | 272 | Some information is in <file:Documentation/isapnp.txt>. |
268 | 273 | ||
269 | To compile this driver as a module, choose M here: the | 274 | To compile this driver as a module, choose M here: the |
270 | module will be called isapnp. | 275 | module will be called isapnp. |
271 | 276 | ||
272 | If unsure, say Y. | 277 | If unsure, say Y. |
273 | 278 | ||
274 | config EISA | 279 | config EISA |
275 | bool | 280 | bool |
276 | ---help--- | 281 | ---help--- |
277 | The Extended Industry Standard Architecture (EISA) bus was | 282 | The Extended Industry Standard Architecture (EISA) bus was |
278 | developed as an open alternative to the IBM MicroChannel bus. | 283 | developed as an open alternative to the IBM MicroChannel bus. |
279 | 284 | ||
280 | The EISA bus provided some of the features of the IBM MicroChannel | 285 | The EISA bus provided some of the features of the IBM MicroChannel |
281 | bus while maintaining backward compatibility with cards made for | 286 | bus while maintaining backward compatibility with cards made for |
282 | the older ISA bus. The EISA bus saw limited use between 1988 and | 287 | the older ISA bus. The EISA bus saw limited use between 1988 and |
283 | 1995 when it was made obsolete by the PCI bus. | 288 | 1995 when it was made obsolete by the PCI bus. |
284 | 289 | ||
285 | Say Y here if you are building a kernel for an EISA-based machine. | 290 | Say Y here if you are building a kernel for an EISA-based machine. |
286 | 291 | ||
287 | Otherwise, say N. | 292 | Otherwise, say N. |
288 | 293 | ||
289 | config MCA | 294 | config MCA |
290 | bool | 295 | bool |
291 | help | 296 | help |
292 | MicroChannel Architecture is found in some IBM PS/2 machines and | 297 | MicroChannel Architecture is found in some IBM PS/2 machines and |
293 | laptops. It is a bus system similar to PCI or ISA. See | 298 | laptops. It is a bus system similar to PCI or ISA. See |
294 | <file:Documentation/mca.txt> (and especially the web page given | 299 | <file:Documentation/mca.txt> (and especially the web page given |
295 | there) before attempting to build an MCA bus kernel. | 300 | there) before attempting to build an MCA bus kernel. |
296 | 301 | ||
297 | config PCMCIA | 302 | config PCMCIA |
298 | tristate | 303 | tristate |
299 | ---help--- | 304 | ---help--- |
300 | Say Y here if you want to attach PCMCIA- or PC-cards to your Linux | 305 | Say Y here if you want to attach PCMCIA- or PC-cards to your Linux |
301 | computer. These are credit-card size devices such as network cards, | 306 | computer. These are credit-card size devices such as network cards, |
302 | modems or hard drives often used with laptop computers. There are | 307 | modems or hard drives often used with laptop computers. There are |
303 | actually two varieties of these cards: the older 16 bit PCMCIA cards | 308 | actually two varieties of these cards: the older 16 bit PCMCIA cards |
304 | and the newer 32 bit CardBus cards. If you want to use CardBus | 309 | and the newer 32 bit CardBus cards. If you want to use CardBus |
305 | cards, you need to say Y here and also to "CardBus support" below. | 310 | cards, you need to say Y here and also to "CardBus support" below. |
306 | 311 | ||
307 | To use your PC-cards, you will need supporting software from David | 312 | To use your PC-cards, you will need supporting software from David |
308 | Hinds' pcmcia-cs package (see the file <file:Documentation/Changes> | 313 | Hinds' pcmcia-cs package (see the file <file:Documentation/Changes> |
309 | for location). Please also read the PCMCIA-HOWTO, available from | 314 | for location). Please also read the PCMCIA-HOWTO, available from |
310 | <http://www.tldp.org/docs.html#howto>. | 315 | <http://www.tldp.org/docs.html#howto>. |
311 | 316 | ||
312 | To compile this driver as modules, choose M here: the | 317 | To compile this driver as modules, choose M here: the |
313 | modules will be called pcmcia_core and ds. | 318 | modules will be called pcmcia_core and ds. |
314 | 319 | ||
315 | config SBUS | 320 | config SBUS |
316 | bool | 321 | bool |
317 | default y | 322 | default y |
318 | 323 | ||
319 | config SBUSCHAR | 324 | config SBUSCHAR |
320 | bool | 325 | bool |
321 | default y | 326 | default y |
322 | 327 | ||
323 | config SUN_AUXIO | 328 | config SUN_AUXIO |
324 | bool | 329 | bool |
325 | default y | 330 | default y |
326 | 331 | ||
327 | config SUN_IO | 332 | config SUN_IO |
328 | bool | 333 | bool |
329 | default y | 334 | default y |
330 | 335 | ||
331 | config SUN_LDOMS | 336 | config SUN_LDOMS |
332 | bool "Sun Logical Domains support" | 337 | bool "Sun Logical Domains support" |
333 | help | 338 | help |
334 | Say Y here if you want to support virtual devices via | 339 | Say Y here if you want to support virtual devices via |
335 | Logical Domains. | 340 | Logical Domains. |
336 | 341 | ||
337 | config PCI | 342 | config PCI |
338 | bool "PCI support" | 343 | bool "PCI support" |
339 | select ARCH_SUPPORTS_MSI | 344 | select ARCH_SUPPORTS_MSI |
340 | help | 345 | help |
341 | Find out whether you have a PCI motherboard. PCI is the name of a | 346 | Find out whether you have a PCI motherboard. PCI is the name of a |
342 | bus system, i.e. the way the CPU talks to the other stuff inside | 347 | bus system, i.e. the way the CPU talks to the other stuff inside |
343 | your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or | 348 | your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or |
344 | VESA. If you have PCI, say Y, otherwise N. | 349 | VESA. If you have PCI, say Y, otherwise N. |
345 | 350 | ||
346 | The PCI-HOWTO, available from | 351 | The PCI-HOWTO, available from |
347 | <http://www.tldp.org/docs.html#howto>, contains valuable | 352 | <http://www.tldp.org/docs.html#howto>, contains valuable |
348 | information about which PCI hardware does work under Linux and which | 353 | information about which PCI hardware does work under Linux and which |
349 | doesn't. | 354 | doesn't. |
350 | 355 | ||
351 | config PCI_DOMAINS | 356 | config PCI_DOMAINS |
352 | def_bool PCI | 357 | def_bool PCI |
353 | 358 | ||
354 | config PCI_SYSCALL | 359 | config PCI_SYSCALL |
355 | def_bool PCI | 360 | def_bool PCI |
356 | 361 | ||
357 | source "drivers/pci/Kconfig" | 362 | source "drivers/pci/Kconfig" |
358 | 363 | ||
359 | config SUN_OPENPROMFS | 364 | config SUN_OPENPROMFS |
360 | tristate "Openprom tree appears in /proc/openprom" | 365 | tristate "Openprom tree appears in /proc/openprom" |
361 | help | 366 | help |
362 | If you say Y, the OpenPROM device tree will be available as a | 367 | If you say Y, the OpenPROM device tree will be available as a |
363 | virtual file system, which you can mount to /proc/openprom by "mount | 368 | virtual file system, which you can mount to /proc/openprom by "mount |
364 | -t openpromfs none /proc/openprom". | 369 | -t openpromfs none /proc/openprom". |
365 | 370 | ||
366 | To compile the /proc/openprom support as a module, choose M here: the | 371 | To compile the /proc/openprom support as a module, choose M here: the |
367 | module will be called openpromfs. If unsure, choose M. | 372 | module will be called openpromfs. If unsure, choose M. |
368 | 373 | ||
369 | config SPARC32_COMPAT | 374 | config SPARC32_COMPAT |
370 | bool "Kernel support for Linux/Sparc 32bit binary compatibility" | 375 | bool "Kernel support for Linux/Sparc 32bit binary compatibility" |
371 | help | 376 | help |
372 | This allows you to run 32-bit binaries on your Ultra. | 377 | This allows you to run 32-bit binaries on your Ultra. |
373 | Everybody wants this; say Y. | 378 | Everybody wants this; say Y. |
374 | 379 | ||
375 | config COMPAT | 380 | config COMPAT |
376 | bool | 381 | bool |
377 | depends on SPARC32_COMPAT | 382 | depends on SPARC32_COMPAT |
378 | default y | 383 | default y |
379 | 384 | ||
380 | config BINFMT_ELF32 | 385 | config BINFMT_ELF32 |
381 | bool "Kernel support for 32-bit ELF binaries" | 386 | bool "Kernel support for 32-bit ELF binaries" |
382 | depends on SPARC32_COMPAT | 387 | depends on SPARC32_COMPAT |
383 | help | 388 | help |
384 | This allows you to run 32-bit Linux/ELF binaries on your Ultra. | 389 | This allows you to run 32-bit Linux/ELF binaries on your Ultra. |
385 | Everybody wants this; say Y. | 390 | Everybody wants this; say Y. |
386 | 391 | ||
387 | config BINFMT_AOUT32 | 392 | config BINFMT_AOUT32 |
388 | bool "Kernel support for 32-bit (ie. SunOS) a.out binaries" | 393 | bool "Kernel support for 32-bit (ie. SunOS) a.out binaries" |
389 | depends on SPARC32_COMPAT | 394 | depends on SPARC32_COMPAT |
390 | help | 395 | help |
391 | This allows you to run 32-bit a.out format binaries on your Ultra. | 396 | This allows you to run 32-bit a.out format binaries on your Ultra. |
392 | If you want to run SunOS binaries (see SunOS binary emulation below) | 397 | If you want to run SunOS binaries (see SunOS binary emulation below) |
393 | or other a.out binaries, say Y. If unsure, say N. | 398 | or other a.out binaries, say Y. If unsure, say N. |
394 | 399 | ||
395 | menu "Executable file formats" | 400 | menu "Executable file formats" |
396 | 401 | ||
397 | source "fs/Kconfig.binfmt" | 402 | source "fs/Kconfig.binfmt" |
398 | 403 | ||
399 | config SUNOS_EMUL | 404 | config SUNOS_EMUL |
400 | bool "SunOS binary emulation" | 405 | bool "SunOS binary emulation" |
401 | depends on BINFMT_AOUT32 | 406 | depends on BINFMT_AOUT32 |
402 | help | 407 | help |
403 | This allows you to run most SunOS binaries. If you want to do this, | 408 | This allows you to run most SunOS binaries. If you want to do this, |
404 | say Y here and place appropriate files in /usr/gnemul/sunos. See | 409 | say Y here and place appropriate files in /usr/gnemul/sunos. See |
405 | <http://www.ultralinux.org/faq.html> for more information. If you | 410 | <http://www.ultralinux.org/faq.html> for more information. If you |
406 | want to run SunOS binaries on an Ultra you must also say Y to | 411 | want to run SunOS binaries on an Ultra you must also say Y to |
407 | "Kernel support for 32-bit a.out binaries" above. | 412 | "Kernel support for 32-bit a.out binaries" above. |
408 | 413 | ||
409 | config SOLARIS_EMUL | 414 | config SOLARIS_EMUL |
410 | tristate "Solaris binary emulation (EXPERIMENTAL)" | 415 | tristate "Solaris binary emulation (EXPERIMENTAL)" |
411 | depends on SPARC32_COMPAT && NET && EXPERIMENTAL | 416 | depends on SPARC32_COMPAT && NET && EXPERIMENTAL |
412 | help | 417 | help |
413 | This is experimental code which will enable you to run (many) | 418 | This is experimental code which will enable you to run (many) |
414 | Solaris binaries on your SPARC Linux machine. | 419 | Solaris binaries on your SPARC Linux machine. |
415 | 420 | ||
416 | To compile this code as a module, choose M here: the | 421 | To compile this code as a module, choose M here: the |
417 | module will be called solaris. | 422 | module will be called solaris. |
418 | 423 | ||
419 | endmenu | 424 | endmenu |
420 | 425 | ||
421 | config SCHED_SMT | 426 | config SCHED_SMT |
422 | bool "SMT (Hyperthreading) scheduler support" | 427 | bool "SMT (Hyperthreading) scheduler support" |
423 | depends on SMP | 428 | depends on SMP |
424 | default y | 429 | default y |
425 | help | 430 | help |
426 | SMT scheduler support improves the CPU scheduler's decision making | 431 | SMT scheduler support improves the CPU scheduler's decision making |
427 | when dealing with UltraSPARC cpus at a cost of slightly increased | 432 | when dealing with UltraSPARC cpus at a cost of slightly increased |
428 | overhead in some places. If unsure say N here. | 433 | overhead in some places. If unsure say N here. |
429 | 434 | ||
430 | config SCHED_MC | 435 | config SCHED_MC |
431 | bool "Multi-core scheduler support" | 436 | bool "Multi-core scheduler support" |
432 | depends on SMP | 437 | depends on SMP |
433 | default y | 438 | default y |
434 | help | 439 | help |
435 | Multi-core scheduler support improves the CPU scheduler's decision | 440 | Multi-core scheduler support improves the CPU scheduler's decision |
436 | making when dealing with multi-core CPU chips at a cost of slightly | 441 | making when dealing with multi-core CPU chips at a cost of slightly |
437 | increased overhead in some places. If unsure say N here. | 442 | increased overhead in some places. If unsure say N here. |
438 | 443 | ||
439 | source "kernel/Kconfig.preempt" | 444 | source "kernel/Kconfig.preempt" |
440 | 445 | ||
441 | config CMDLINE_BOOL | 446 | config CMDLINE_BOOL |
442 | bool "Default bootloader kernel arguments" | 447 | bool "Default bootloader kernel arguments" |
443 | 448 | ||
444 | config CMDLINE | 449 | config CMDLINE |
445 | string "Initial kernel command string" | 450 | string "Initial kernel command string" |
446 | depends on CMDLINE_BOOL | 451 | depends on CMDLINE_BOOL |
447 | default "console=ttyS0,9600 root=/dev/sda1" | 452 | default "console=ttyS0,9600 root=/dev/sda1" |
448 | help | 453 | help |
449 | Say Y here if you want to be able to pass default arguments to | 454 | Say Y here if you want to be able to pass default arguments to |
450 | the kernel. This will be overridden by the bootloader, if you | 455 | the kernel. This will be overridden by the bootloader, if you |
451 | use one (such as SILO). This is most useful if you want to boot | 456 | use one (such as SILO). This is most useful if you want to boot |
452 | a kernel from TFTP, and want default options to be available | 457 | a kernel from TFTP, and want default options to be available |
453 | without having them passed on the command line. | 458 | without having them passed on the command line. |
454 | 459 | ||
455 | NOTE: This option WILL override the PROM bootargs setting! | 460 | NOTE: This option WILL override the PROM bootargs setting! |
456 | 461 | ||
457 | source "net/Kconfig" | 462 | source "net/Kconfig" |
458 | 463 | ||
459 | source "drivers/Kconfig" | 464 | source "drivers/Kconfig" |
460 | 465 | ||
461 | source "drivers/sbus/char/Kconfig" | 466 | source "drivers/sbus/char/Kconfig" |
462 | 467 | ||
463 | source "fs/Kconfig" | 468 | source "fs/Kconfig" |
464 | 469 | ||
465 | source "kernel/Kconfig.instrumentation" | 470 | source "kernel/Kconfig.instrumentation" |
466 | 471 | ||
467 | source "arch/sparc64/Kconfig.debug" | 472 | source "arch/sparc64/Kconfig.debug" |
468 | 473 | ||
469 | source "security/Kconfig" | 474 | source "security/Kconfig" |
470 | 475 | ||
471 | source "crypto/Kconfig" | 476 | source "crypto/Kconfig" |
472 | 477 | ||
473 | source "lib/Kconfig" | 478 | source "lib/Kconfig" |
474 | 479 |
arch/x86/Kconfig
1 | # x86 configuration | 1 | # x86 configuration |
2 | mainmenu "Linux Kernel Configuration for x86" | 2 | mainmenu "Linux Kernel Configuration for x86" |
3 | 3 | ||
4 | # Select 32 or 64 bit | 4 | # Select 32 or 64 bit |
5 | config 64BIT | 5 | config 64BIT |
6 | bool "64-bit kernel" if ARCH = "x86" | 6 | bool "64-bit kernel" if ARCH = "x86" |
7 | default ARCH = "x86_64" | 7 | default ARCH = "x86_64" |
8 | help | 8 | help |
9 | Say yes to build a 64-bit kernel - formerly known as x86_64 | 9 | Say yes to build a 64-bit kernel - formerly known as x86_64 |
10 | Say no to build a 32-bit kernel - formerly known as i386 | 10 | Say no to build a 32-bit kernel - formerly known as i386 |
11 | 11 | ||
12 | config X86_32 | 12 | config X86_32 |
13 | def_bool !64BIT | 13 | def_bool !64BIT |
14 | 14 | ||
15 | config X86_64 | 15 | config X86_64 |
16 | def_bool 64BIT | 16 | def_bool 64BIT |
17 | 17 | ||
18 | ### Arch settings | 18 | ### Arch settings |
19 | config X86 | 19 | config X86 |
20 | def_bool y | 20 | def_bool y |
21 | 21 | ||
22 | config GENERIC_LOCKBREAK | ||
23 | def_bool y | ||
24 | depends on SMP && PREEMPT | ||
25 | |||
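As on sparc64, x86 enables GENERIC_LOCKBREAK only when SMP && PREEMPT, which is where lock breaking can matter: another CPU may spin on the lock while the holder can be asked to reschedule. For a sense of the kind of code pattern this supports, here is a hedged C sketch, not part of this diff; spin_needbreak(), need_resched() and cond_resched() are existing kernel helpers, while the list type and per-entry work are invented for the example, and whether any given walk should use this pattern is an assumption.

	#include <linux/list.h>
	#include <linux/sched.h>
	#include <linux/spinlock.h>

	struct demo_entry {
		struct list_head node;
	};

	/* Illustrative sketch: periodically give up the lock during a long scan. */
	static void demo_walk(struct list_head *head, spinlock_t *lock)
	{
		struct demo_entry *e;

		spin_lock(lock);
		list_for_each_entry(e, head, node) {
			/* ... per-entry work would go here ... */
			if (spin_needbreak(lock) || need_resched()) {
				spin_unlock(lock);
				cond_resched();		/* let waiters and the scheduler in */
				spin_lock(lock);
				break;			/* the list may have changed while unlocked */
			}
		}
		spin_unlock(lock);
	}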
22 | config GENERIC_TIME | 26 | config GENERIC_TIME |
23 | def_bool y | 27 | def_bool y |
24 | 28 | ||
25 | config GENERIC_CMOS_UPDATE | 29 | config GENERIC_CMOS_UPDATE |
26 | def_bool y | 30 | def_bool y |
27 | 31 | ||
28 | config CLOCKSOURCE_WATCHDOG | 32 | config CLOCKSOURCE_WATCHDOG |
29 | def_bool y | 33 | def_bool y |
30 | 34 | ||
31 | config GENERIC_CLOCKEVENTS | 35 | config GENERIC_CLOCKEVENTS |
32 | def_bool y | 36 | def_bool y |
33 | 37 | ||
34 | config GENERIC_CLOCKEVENTS_BROADCAST | 38 | config GENERIC_CLOCKEVENTS_BROADCAST |
35 | def_bool y | 39 | def_bool y |
36 | depends on X86_64 || (X86_32 && X86_LOCAL_APIC) | 40 | depends on X86_64 || (X86_32 && X86_LOCAL_APIC) |
37 | 41 | ||
38 | config LOCKDEP_SUPPORT | 42 | config LOCKDEP_SUPPORT |
39 | def_bool y | 43 | def_bool y |
40 | 44 | ||
41 | config STACKTRACE_SUPPORT | 45 | config STACKTRACE_SUPPORT |
42 | def_bool y | 46 | def_bool y |
43 | 47 | ||
44 | config SEMAPHORE_SLEEPERS | 48 | config SEMAPHORE_SLEEPERS |
45 | def_bool y | 49 | def_bool y |
46 | 50 | ||
47 | config MMU | 51 | config MMU |
48 | def_bool y | 52 | def_bool y |
49 | 53 | ||
50 | config ZONE_DMA | 54 | config ZONE_DMA |
51 | def_bool y | 55 | def_bool y |
52 | 56 | ||
53 | config QUICKLIST | 57 | config QUICKLIST |
54 | def_bool X86_32 | 58 | def_bool X86_32 |
55 | 59 | ||
56 | config SBUS | 60 | config SBUS |
57 | bool | 61 | bool |
58 | 62 | ||
59 | config GENERIC_ISA_DMA | 63 | config GENERIC_ISA_DMA |
60 | def_bool y | 64 | def_bool y |
61 | 65 | ||
62 | config GENERIC_IOMAP | 66 | config GENERIC_IOMAP |
63 | def_bool y | 67 | def_bool y |
64 | 68 | ||
65 | config GENERIC_BUG | 69 | config GENERIC_BUG |
66 | def_bool y | 70 | def_bool y |
67 | depends on BUG | 71 | depends on BUG |
68 | 72 | ||
69 | config GENERIC_HWEIGHT | 73 | config GENERIC_HWEIGHT |
70 | def_bool y | 74 | def_bool y |
71 | 75 | ||
72 | config ARCH_MAY_HAVE_PC_FDC | 76 | config ARCH_MAY_HAVE_PC_FDC |
73 | def_bool y | 77 | def_bool y |
74 | 78 | ||
75 | config DMI | 79 | config DMI |
76 | def_bool y | 80 | def_bool y |
77 | 81 | ||
78 | config RWSEM_GENERIC_SPINLOCK | 82 | config RWSEM_GENERIC_SPINLOCK |
79 | def_bool !X86_XADD | 83 | def_bool !X86_XADD |
80 | 84 | ||
81 | config RWSEM_XCHGADD_ALGORITHM | 85 | config RWSEM_XCHGADD_ALGORITHM |
82 | def_bool X86_XADD | 86 | def_bool X86_XADD |
83 | 87 | ||
84 | config ARCH_HAS_ILOG2_U32 | 88 | config ARCH_HAS_ILOG2_U32 |
85 | def_bool n | 89 | def_bool n |
86 | 90 | ||
87 | config ARCH_HAS_ILOG2_U64 | 91 | config ARCH_HAS_ILOG2_U64 |
88 | def_bool n | 92 | def_bool n |
89 | 93 | ||
90 | config GENERIC_CALIBRATE_DELAY | 94 | config GENERIC_CALIBRATE_DELAY |
91 | def_bool y | 95 | def_bool y |
92 | 96 | ||
93 | config GENERIC_TIME_VSYSCALL | 97 | config GENERIC_TIME_VSYSCALL |
94 | bool | 98 | bool |
95 | default X86_64 | 99 | default X86_64 |
96 | 100 | ||
97 | config ARCH_SUPPORTS_OPROFILE | 101 | config ARCH_SUPPORTS_OPROFILE |
98 | bool | 102 | bool |
99 | default y | 103 | default y |
100 | 104 | ||
101 | 105 | ||
102 | config ZONE_DMA32 | 106 | config ZONE_DMA32 |
103 | bool | 107 | bool |
104 | default X86_64 | 108 | default X86_64 |
105 | 109 | ||
106 | config ARCH_POPULATES_NODE_MAP | 110 | config ARCH_POPULATES_NODE_MAP |
107 | def_bool y | 111 | def_bool y |
108 | 112 | ||
109 | config AUDIT_ARCH | 113 | config AUDIT_ARCH |
110 | bool | 114 | bool |
111 | default X86_64 | 115 | default X86_64 |
112 | 116 | ||
113 | # Use the generic interrupt handling code in kernel/irq/: | 117 | # Use the generic interrupt handling code in kernel/irq/: |
114 | config GENERIC_HARDIRQS | 118 | config GENERIC_HARDIRQS |
115 | bool | 119 | bool |
116 | default y | 120 | default y |
117 | 121 | ||
118 | config GENERIC_IRQ_PROBE | 122 | config GENERIC_IRQ_PROBE |
119 | bool | 123 | bool |
120 | default y | 124 | default y |
121 | 125 | ||
122 | config GENERIC_PENDING_IRQ | 126 | config GENERIC_PENDING_IRQ |
123 | bool | 127 | bool |
124 | depends on GENERIC_HARDIRQS && SMP | 128 | depends on GENERIC_HARDIRQS && SMP |
125 | default y | 129 | default y |
126 | 130 | ||
127 | config X86_SMP | 131 | config X86_SMP |
128 | bool | 132 | bool |
129 | depends on X86_32 && SMP && !X86_VOYAGER | 133 | depends on X86_32 && SMP && !X86_VOYAGER |
130 | default y | 134 | default y |
131 | 135 | ||
132 | config X86_HT | 136 | config X86_HT |
133 | bool | 137 | bool |
134 | depends on SMP | 138 | depends on SMP |
135 | depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || (X86_64 && !MK8) | 139 | depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || (X86_64 && !MK8) |
136 | default y | 140 | default y |
137 | 141 | ||
138 | config X86_BIOS_REBOOT | 142 | config X86_BIOS_REBOOT |
139 | bool | 143 | bool |
140 | depends on X86_32 && !(X86_VISWS || X86_VOYAGER) | 144 | depends on X86_32 && !(X86_VISWS || X86_VOYAGER) |
141 | default y | 145 | default y |
142 | 146 | ||
143 | config X86_TRAMPOLINE | 147 | config X86_TRAMPOLINE |
144 | bool | 148 | bool |
145 | depends on X86_SMP || (X86_VOYAGER && SMP) | 149 | depends on X86_SMP || (X86_VOYAGER && SMP) |
146 | default y | 150 | default y |
147 | 151 | ||
148 | config KTIME_SCALAR | 152 | config KTIME_SCALAR |
149 | def_bool X86_32 | 153 | def_bool X86_32 |
150 | source "init/Kconfig" | 154 | source "init/Kconfig" |
151 | 155 | ||
152 | menu "Processor type and features" | 156 | menu "Processor type and features" |
153 | 157 | ||
154 | source "kernel/time/Kconfig" | 158 | source "kernel/time/Kconfig" |
155 | 159 | ||
156 | config SMP | 160 | config SMP |
157 | bool "Symmetric multi-processing support" | 161 | bool "Symmetric multi-processing support" |
158 | ---help--- | 162 | ---help--- |
159 | This enables support for systems with more than one CPU. If you have | 163 | This enables support for systems with more than one CPU. If you have |
160 | a system with only one CPU, like most personal computers, say N. If | 164 | a system with only one CPU, like most personal computers, say N. If |
161 | you have a system with more than one CPU, say Y. | 165 | you have a system with more than one CPU, say Y. |
162 | 166 | ||
163 | If you say N here, the kernel will run on single and multiprocessor | 167 | If you say N here, the kernel will run on single and multiprocessor |
164 | machines, but will use only one CPU of a multiprocessor machine. If | 168 | machines, but will use only one CPU of a multiprocessor machine. If |
165 | you say Y here, the kernel will run on many, but not all, | 169 | you say Y here, the kernel will run on many, but not all, |
166 | singleprocessor machines. On a singleprocessor machine, the kernel | 170 | singleprocessor machines. On a singleprocessor machine, the kernel |
167 | will run faster if you say N here. | 171 | will run faster if you say N here. |
168 | 172 | ||
169 | Note that if you say Y here and choose architecture "586" or | 173 | Note that if you say Y here and choose architecture "586" or |
170 | "Pentium" under "Processor family", the kernel will not work on 486 | 174 | "Pentium" under "Processor family", the kernel will not work on 486 |
171 | architectures. Similarly, multiprocessor kernels for the "PPro" | 175 | architectures. Similarly, multiprocessor kernels for the "PPro" |
172 | architecture may not work on all Pentium based boards. | 176 | architecture may not work on all Pentium based boards. |
173 | 177 | ||
174 | People using multiprocessor machines who say Y here should also say | 178 | People using multiprocessor machines who say Y here should also say |
175 | Y to "Enhanced Real Time Clock Support", below. The "Advanced Power | 179 | Y to "Enhanced Real Time Clock Support", below. The "Advanced Power |
176 | Management" code will be disabled if you say Y here. | 180 | Management" code will be disabled if you say Y here. |
177 | 181 | ||
178 | See also the <file:Documentation/smp.txt>, | 182 | See also the <file:Documentation/smp.txt>, |
179 | <file:Documentation/i386/IO-APIC.txt>, | 183 | <file:Documentation/i386/IO-APIC.txt>, |
180 | <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at | 184 | <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at |
181 | <http://www.tldp.org/docs.html#howto>. | 185 | <http://www.tldp.org/docs.html#howto>. |
182 | 186 | ||
183 | If you don't know what to do here, say N. | 187 | If you don't know what to do here, say N. |
184 | 188 | ||
185 | choice | 189 | choice |
186 | prompt "Subarchitecture Type" | 190 | prompt "Subarchitecture Type" |
187 | default X86_PC | 191 | default X86_PC |
188 | 192 | ||
189 | config X86_PC | 193 | config X86_PC |
190 | bool "PC-compatible" | 194 | bool "PC-compatible" |
191 | help | 195 | help |
192 | Choose this option if your computer is a standard PC or compatible. | 196 | Choose this option if your computer is a standard PC or compatible. |
193 | 197 | ||
194 | config X86_ELAN | 198 | config X86_ELAN |
195 | bool "AMD Elan" | 199 | bool "AMD Elan" |
196 | depends on X86_32 | 200 | depends on X86_32 |
197 | help | 201 | help |
198 | Select this for an AMD Elan processor. | 202 | Select this for an AMD Elan processor. |
199 | 203 | ||
200 | Do not use this option for K6/Athlon/Opteron processors! | 204 | Do not use this option for K6/Athlon/Opteron processors! |
201 | 205 | ||
202 | If unsure, choose "PC-compatible" instead. | 206 | If unsure, choose "PC-compatible" instead. |
203 | 207 | ||
204 | config X86_VOYAGER | 208 | config X86_VOYAGER |
205 | bool "Voyager (NCR)" | 209 | bool "Voyager (NCR)" |
206 | depends on X86_32 | 210 | depends on X86_32 |
207 | select SMP if !BROKEN | 211 | select SMP if !BROKEN |
208 | help | 212 | help |
209 | Voyager is an MCA-based 32-way capable SMP architecture proprietary | 213 | Voyager is an MCA-based 32-way capable SMP architecture proprietary |
210 | to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based. | 214 | to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based. |
211 | 215 | ||
212 | *** WARNING *** | 216 | *** WARNING *** |
213 | 217 | ||
214 | If you do not specifically know you have a Voyager based machine, | 218 | If you do not specifically know you have a Voyager based machine, |
215 | say N here, otherwise the kernel you build will not be bootable. | 219 | say N here, otherwise the kernel you build will not be bootable. |
216 | 220 | ||
217 | config X86_NUMAQ | 221 | config X86_NUMAQ |
218 | bool "NUMAQ (IBM/Sequent)" | 222 | bool "NUMAQ (IBM/Sequent)" |
219 | select SMP | 223 | select SMP |
220 | select NUMA | 224 | select NUMA |
221 | depends on X86_32 | 225 | depends on X86_32 |
222 | help | 226 | help |
223 | This option is used for getting Linux to run on a (IBM/Sequent) NUMA | 227 | This option is used for getting Linux to run on a (IBM/Sequent) NUMA |
224 | multiquad box. This changes the way that processors are bootstrapped, | 228 | multiquad box. This changes the way that processors are bootstrapped, |
225 | and uses Clustered Logical APIC addressing mode instead of Flat Logical. | 229 | and uses Clustered Logical APIC addressing mode instead of Flat Logical. |
226 | You will need a new lynxer.elf file to flash your firmware with - send | 230 | You will need a new lynxer.elf file to flash your firmware with - send |
227 | email to <Martin.Bligh@us.ibm.com>. | 231 | email to <Martin.Bligh@us.ibm.com>. |
228 | 232 | ||
229 | config X86_SUMMIT | 233 | config X86_SUMMIT |
230 | bool "Summit/EXA (IBM x440)" | 234 | bool "Summit/EXA (IBM x440)" |
231 | depends on X86_32 && SMP | 235 | depends on X86_32 && SMP |
232 | help | 236 | help |
233 | This option is needed for IBM systems that use the Summit/EXA chipset. | 237 | This option is needed for IBM systems that use the Summit/EXA chipset. |
234 | In particular, it is needed for the x440. | 238 | In particular, it is needed for the x440. |
235 | 239 | ||
236 | If you don't have one of these computers, you should say N here. | 240 | If you don't have one of these computers, you should say N here. |
237 | If you want to build a NUMA kernel, you must select ACPI. | 241 | If you want to build a NUMA kernel, you must select ACPI. |
238 | 242 | ||
239 | config X86_BIGSMP | 243 | config X86_BIGSMP |
240 | bool "Support for other sub-arch SMP systems with more than 8 CPUs" | 244 | bool "Support for other sub-arch SMP systems with more than 8 CPUs" |
241 | depends on X86_32 && SMP | 245 | depends on X86_32 && SMP |
242 | help | 246 | help |
243 | This option is needed for the systems that have more than 8 CPUs | 247 | This option is needed for the systems that have more than 8 CPUs |
244 | and if the system is not of any sub-arch type above. | 248 | and if the system is not of any sub-arch type above. |
245 | 249 | ||
246 | If you don't have such a system, you should say N here. | 250 | If you don't have such a system, you should say N here. |
247 | 251 | ||
248 | config X86_VISWS | 252 | config X86_VISWS |
249 | bool "SGI 320/540 (Visual Workstation)" | 253 | bool "SGI 320/540 (Visual Workstation)" |
250 | depends on X86_32 | 254 | depends on X86_32 |
251 | help | 255 | help |
252 | The SGI Visual Workstation series is an IA32-based workstation | 256 | The SGI Visual Workstation series is an IA32-based workstation |
253 | based on SGI systems chips with some legacy PC hardware attached. | 257 | based on SGI systems chips with some legacy PC hardware attached. |
254 | 258 | ||
255 | Say Y here to create a kernel to run on the SGI 320 or 540. | 259 | Say Y here to create a kernel to run on the SGI 320 or 540. |
256 | 260 | ||
257 | A kernel compiled for the Visual Workstation will not run on PCs | 261 | A kernel compiled for the Visual Workstation will not run on PCs |
258 | and vice versa. See <file:Documentation/sgi-visws.txt> for details. | 262 | and vice versa. See <file:Documentation/sgi-visws.txt> for details. |
259 | 263 | ||
260 | config X86_GENERICARCH | 264 | config X86_GENERICARCH |
261 | bool "Generic architecture (Summit, bigsmp, ES7000, default)" | 265 | bool "Generic architecture (Summit, bigsmp, ES7000, default)" |
262 | depends on X86_32 | 266 | depends on X86_32 |
263 | help | 267 | help |
264 | This option compiles in the Summit, bigsmp, ES7000, default subarchitectures. | 268 | This option compiles in the Summit, bigsmp, ES7000, default subarchitectures. |
265 | It is intended for a generic binary kernel. | 269 | It is intended for a generic binary kernel. |
266 | If you want a NUMA kernel, select ACPI. We need SRAT for NUMA. | 270 | If you want a NUMA kernel, select ACPI. We need SRAT for NUMA. |
267 | 271 | ||
268 | config X86_ES7000 | 272 | config X86_ES7000 |
269 | bool "Support for Unisys ES7000 IA32 series" | 273 | bool "Support for Unisys ES7000 IA32 series" |
270 | depends on X86_32 && SMP | 274 | depends on X86_32 && SMP |
271 | help | 275 | help |
272 | Support for Unisys ES7000 systems. Say 'Y' here if this kernel is | 276 | Support for Unisys ES7000 systems. Say 'Y' here if this kernel is |
273 | supposed to run on an IA32-based Unisys ES7000 system. | 277 | supposed to run on an IA32-based Unisys ES7000 system. |
274 | Only choose this option if you have such a system, otherwise you | 278 | Only choose this option if you have such a system, otherwise you |
275 | should say N here. | 279 | should say N here. |
276 | 280 | ||
277 | config X86_VSMP | 281 | config X86_VSMP |
278 | bool "Support for ScaleMP vSMP" | 282 | bool "Support for ScaleMP vSMP" |
279 | depends on X86_64 && PCI | 283 | depends on X86_64 && PCI |
280 | help | 284 | help |
281 | Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is | 285 | Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is |
282 | supposed to run on these EM64T-based machines. Only choose this option | 286 | supposed to run on these EM64T-based machines. Only choose this option |
283 | if you have one of these machines. | 287 | if you have one of these machines. |
284 | 288 | ||
285 | endchoice | 289 | endchoice |
286 | 290 | ||
287 | config SCHED_NO_NO_OMIT_FRAME_POINTER | 291 | config SCHED_NO_NO_OMIT_FRAME_POINTER |
288 | def_bool y | 292 | def_bool y |
289 | prompt "Single-depth WCHAN output" | 293 | prompt "Single-depth WCHAN output" |
290 | depends on X86_32 | 294 | depends on X86_32 |
291 | help | 295 | help |
292 | Calculate simpler /proc/<PID>/wchan values. If this option | 296 | Calculate simpler /proc/<PID>/wchan values. If this option |
293 | is disabled then wchan values will recurse back to the | 297 | is disabled then wchan values will recurse back to the |
294 | caller function. This provides more accurate wchan values, | 298 | caller function. This provides more accurate wchan values, |
295 | at the expense of slightly more scheduling overhead. | 299 | at the expense of slightly more scheduling overhead. |
296 | 300 | ||
297 | If in doubt, say "Y". | 301 | If in doubt, say "Y". |
298 | 302 | ||
299 | config PARAVIRT | 303 | config PARAVIRT |
300 | bool | 304 | bool |
301 | depends on X86_32 && !(X86_VISWS || X86_VOYAGER) | 305 | depends on X86_32 && !(X86_VISWS || X86_VOYAGER) |
302 | help | 306 | help |
303 | This changes the kernel so it can modify itself when it is run | 307 | This changes the kernel so it can modify itself when it is run |
304 | under a hypervisor, potentially improving performance significantly | 308 | under a hypervisor, potentially improving performance significantly |
305 | over full virtualization. However, when run without a hypervisor | 309 | over full virtualization. However, when run without a hypervisor |
306 | the kernel is theoretically slower and slightly larger. | 310 | the kernel is theoretically slower and slightly larger. |
307 | 311 | ||
308 | menuconfig PARAVIRT_GUEST | 312 | menuconfig PARAVIRT_GUEST |
309 | bool "Paravirtualized guest support" | 313 | bool "Paravirtualized guest support" |
310 | depends on X86_32 | 314 | depends on X86_32 |
311 | help | 315 | help |
312 | Say Y here to get to see options related to running Linux under | 316 | Say Y here to get to see options related to running Linux under |
313 | various hypervisors. This option alone does not add any kernel code. | 317 | various hypervisors. This option alone does not add any kernel code. |
314 | 318 | ||
315 | If you say N, all options in this submenu will be skipped and disabled. | 319 | If you say N, all options in this submenu will be skipped and disabled. |
316 | 320 | ||
317 | if PARAVIRT_GUEST | 321 | if PARAVIRT_GUEST |
318 | 322 | ||
319 | source "arch/x86/xen/Kconfig" | 323 | source "arch/x86/xen/Kconfig" |
320 | 324 | ||
321 | config VMI | 325 | config VMI |
322 | bool "VMI Guest support" | 326 | bool "VMI Guest support" |
323 | select PARAVIRT | 327 | select PARAVIRT |
324 | depends on !(X86_VISWS || X86_VOYAGER) | 328 | depends on !(X86_VISWS || X86_VOYAGER) |
325 | help | 329 | help |
326 | VMI provides a paravirtualized interface to the VMware ESX server | 330 | VMI provides a paravirtualized interface to the VMware ESX server |
327 | (it could be used by other hypervisors in theory too, but is not | 331 | (it could be used by other hypervisors in theory too, but is not |
328 | at the moment), by linking the kernel to a GPL-ed ROM module | 332 | at the moment), by linking the kernel to a GPL-ed ROM module |
329 | provided by the hypervisor. | 333 | provided by the hypervisor. |
330 | 334 | ||
331 | source "arch/x86/lguest/Kconfig" | 335 | source "arch/x86/lguest/Kconfig" |
332 | 336 | ||
333 | endif | 337 | endif |
334 | 338 | ||
335 | config ACPI_SRAT | 339 | config ACPI_SRAT |
336 | def_bool y | 340 | def_bool y |
337 | depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) | 341 | depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) |
338 | select ACPI_NUMA | 342 | select ACPI_NUMA |
339 | 343 | ||
340 | config HAVE_ARCH_PARSE_SRAT | 344 | config HAVE_ARCH_PARSE_SRAT |
341 | def_bool y | 345 | def_bool y |
342 | depends on ACPI_SRAT | 346 | depends on ACPI_SRAT |
343 | 347 | ||
344 | config X86_SUMMIT_NUMA | 348 | config X86_SUMMIT_NUMA |
345 | def_bool y | 349 | def_bool y |
346 | depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH) | 350 | depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH) |
347 | 351 | ||
348 | config X86_CYCLONE_TIMER | 352 | config X86_CYCLONE_TIMER |
349 | def_bool y | 353 | def_bool y |
350 | depends on X86_32 && X86_SUMMIT || X86_GENERICARCH | 354 | depends on X86_32 && X86_SUMMIT || X86_GENERICARCH |
351 | 355 | ||
352 | config ES7000_CLUSTERED_APIC | 356 | config ES7000_CLUSTERED_APIC |
353 | def_bool y | 357 | def_bool y |
354 | depends on SMP && X86_ES7000 && MPENTIUMIII | 358 | depends on SMP && X86_ES7000 && MPENTIUMIII |
355 | 359 | ||
356 | source "arch/x86/Kconfig.cpu" | 360 | source "arch/x86/Kconfig.cpu" |
357 | 361 | ||
358 | config HPET_TIMER | 362 | config HPET_TIMER |
359 | def_bool X86_64 | 363 | def_bool X86_64 |
360 | prompt "HPET Timer Support" if X86_32 | 364 | prompt "HPET Timer Support" if X86_32 |
361 | help | 365 | help |
362 | Use the IA-PC HPET (High Precision Event Timer) to manage | 366 | Use the IA-PC HPET (High Precision Event Timer) to manage |
363 | time in preference to the PIT and RTC, if a HPET is | 367 | time in preference to the PIT and RTC, if a HPET is |
364 | present. | 368 | present. |
365 | HPET is the next generation timer replacing legacy 8254s. | 369 | HPET is the next generation timer replacing legacy 8254s. |
366 | The HPET provides a stable time base on SMP | 370 | The HPET provides a stable time base on SMP |
367 | systems, unlike the TSC, but it is more expensive to access, | 371 | systems, unlike the TSC, but it is more expensive to access, |
368 | as it is off-chip. You can find the HPET spec at | 372 | as it is off-chip. You can find the HPET spec at |
369 | <http://www.intel.com/hardwaredesign/hpetspec.htm>. | 373 | <http://www.intel.com/hardwaredesign/hpetspec.htm>. |
370 | 374 | ||
371 | You can safely choose Y here. However, HPET will only be | 375 | You can safely choose Y here. However, HPET will only be |
372 | activated if the platform and the BIOS support this feature. | 376 | activated if the platform and the BIOS support this feature. |
373 | Otherwise the 8254 will be used for timing services. | 377 | Otherwise the 8254 will be used for timing services. |
374 | 378 | ||
375 | Choose N to continue using the legacy 8254 timer. | 379 | Choose N to continue using the legacy 8254 timer. |
376 | 380 | ||
377 | config HPET_EMULATE_RTC | 381 | config HPET_EMULATE_RTC |
378 | def_bool y | 382 | def_bool y |
379 | depends on HPET_TIMER && RTC=y | 383 | depends on HPET_TIMER && RTC=y |
380 | 384 | ||
381 | # Mark as embedded because too many people got it wrong. | 385 | # Mark as embedded because too many people got it wrong. |
382 | # The code disables itself when not needed. | 386 | # The code disables itself when not needed. |
383 | config GART_IOMMU | 387 | config GART_IOMMU |
384 | bool "GART IOMMU support" if EMBEDDED | 388 | bool "GART IOMMU support" if EMBEDDED |
385 | default y | 389 | default y |
386 | select SWIOTLB | 390 | select SWIOTLB |
387 | select AGP | 391 | select AGP |
388 | depends on X86_64 && PCI | 392 | depends on X86_64 && PCI |
389 | help | 393 | help |
390 | Support for full DMA access of devices with 32bit memory access only | 394 | Support for full DMA access of devices with 32bit memory access only |
391 | on systems with more than 3GB. This is usually needed for USB, | 395 | on systems with more than 3GB. This is usually needed for USB, |
392 | sound, many IDE/SATA chipsets and some other devices. | 396 | sound, many IDE/SATA chipsets and some other devices. |
393 | Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART | 397 | Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART |
394 | based hardware IOMMU and a software bounce buffer based IOMMU used | 398 | based hardware IOMMU and a software bounce buffer based IOMMU used |
395 | on Intel systems and as fallback. | 399 | on Intel systems and as fallback. |
396 | The code is only active when needed (enough memory and limited | 400 | The code is only active when needed (enough memory and limited |
397 | device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified | 401 | device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified |
398 | too. | 402 | too. |
399 | 403 | ||
400 | config CALGARY_IOMMU | 404 | config CALGARY_IOMMU |
401 | bool "IBM Calgary IOMMU support" | 405 | bool "IBM Calgary IOMMU support" |
402 | select SWIOTLB | 406 | select SWIOTLB |
403 | depends on X86_64 && PCI && EXPERIMENTAL | 407 | depends on X86_64 && PCI && EXPERIMENTAL |
404 | help | 408 | help |
405 | Support for hardware IOMMUs in IBM's xSeries x366 and x460 | 409 | Support for hardware IOMMUs in IBM's xSeries x366 and x460 |
406 | systems. Needed to run systems with more than 3GB of memory | 410 | systems. Needed to run systems with more than 3GB of memory |
407 | properly with 32-bit PCI devices that do not support DAC | 411 | properly with 32-bit PCI devices that do not support DAC |
408 | (Double Address Cycle). Calgary also supports bus level | 412 | (Double Address Cycle). Calgary also supports bus level |
409 | isolation, where all DMAs pass through the IOMMU. This | 413 | isolation, where all DMAs pass through the IOMMU. This |
410 | prevents them from going anywhere except their intended | 414 | prevents them from going anywhere except their intended |
411 | destination. This catches hard-to-find kernel bugs and | 415 | destination. This catches hard-to-find kernel bugs and |
412 | mis-behaving drivers and devices that do not use the DMA-API | 416 | mis-behaving drivers and devices that do not use the DMA-API |
413 | properly to set up their DMA buffers. The IOMMU can be | 417 | properly to set up their DMA buffers. The IOMMU can be |
414 | turned off at boot time with the iommu=off parameter. | 418 | turned off at boot time with the iommu=off parameter. |
415 | Normally the kernel will make the right choice by itself. | 419 | Normally the kernel will make the right choice by itself. |
416 | If unsure, say Y. | 420 | If unsure, say Y. |
417 | 421 | ||
418 | config CALGARY_IOMMU_ENABLED_BY_DEFAULT | 422 | config CALGARY_IOMMU_ENABLED_BY_DEFAULT |
419 | def_bool y | 423 | def_bool y |
420 | prompt "Should Calgary be enabled by default?" | 424 | prompt "Should Calgary be enabled by default?" |
421 | depends on CALGARY_IOMMU | 425 | depends on CALGARY_IOMMU |
422 | help | 426 | help |
423 | Should Calgary be enabled by default? If you choose 'y', Calgary | 427 | Should Calgary be enabled by default? If you choose 'y', Calgary |
424 | will be used (if it exists). If you choose 'n', Calgary will not be | 428 | will be used (if it exists). If you choose 'n', Calgary will not be |
425 | used even if it exists. If you choose 'n' and would like to use | 429 | used even if it exists. If you choose 'n' and would like to use |
426 | Calgary anyway, pass 'iommu=calgary' on the kernel command line. | 430 | Calgary anyway, pass 'iommu=calgary' on the kernel command line. |
427 | If unsure, say Y. | 431 | If unsure, say Y. |
428 | 432 | ||
429 | # need this always selected by IOMMU for the VIA workaround | 433 | # need this always selected by IOMMU for the VIA workaround |
430 | config SWIOTLB | 434 | config SWIOTLB |
431 | bool | 435 | bool |
432 | help | 436 | help |
433 | Support for software bounce buffers used on x86-64 systems | 437 | Support for software bounce buffers used on x86-64 systems |
434 | which don't have a hardware IOMMU (e.g. the current generation | 438 | which don't have a hardware IOMMU (e.g. the current generation |
435 | of Intel's x86-64 CPUs). Using this, PCI devices which can only | 439 | of Intel's x86-64 CPUs). Using this, PCI devices which can only |
436 | access 32 bits of memory can be used on systems with more than | 440 | access 32 bits of memory can be used on systems with more than |
437 | 3 GB of memory. If unsure, say Y. | 441 | 3 GB of memory. If unsure, say Y. |
438 | 442 | ||
439 | 443 | ||
440 | config NR_CPUS | 444 | config NR_CPUS |
441 | int "Maximum number of CPUs (2-255)" | 445 | int "Maximum number of CPUs (2-255)" |
442 | range 2 255 | 446 | range 2 255 |
443 | depends on SMP | 447 | depends on SMP |
444 | default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 | 448 | default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 |
445 | default "8" | 449 | default "8" |
446 | help | 450 | help |
447 | This allows you to specify the maximum number of CPUs which this | 451 | This allows you to specify the maximum number of CPUs which this |
448 | kernel will support. The maximum supported value is 255 and the | 452 | kernel will support. The maximum supported value is 255 and the |
449 | minimum value which makes sense is 2. | 453 | minimum value which makes sense is 2. |
450 | 454 | ||
451 | This is purely to save memory - each supported CPU adds | 455 | This is purely to save memory - each supported CPU adds |
452 | approximately eight kilobytes to the kernel image. | 456 | approximately eight kilobytes to the kernel image. |
453 | 457 | ||
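As a worked example of that estimate (an approximation, per the help text above): raising NR_CPUS from the default of 8 to the maximum of 255 would add roughly (255 - 8) × 8 KB, i.e. about 1.9 MB, to the kernel image.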
454 | config SCHED_SMT | 458 | config SCHED_SMT |
455 | bool "SMT (Hyperthreading) scheduler support" | 459 | bool "SMT (Hyperthreading) scheduler support" |
456 | depends on (X86_64 && SMP) || (X86_32 && X86_HT) | 460 | depends on (X86_64 && SMP) || (X86_32 && X86_HT) |
457 | help | 461 | help |
458 | SMT scheduler support improves the CPU scheduler's decision making | 462 | SMT scheduler support improves the CPU scheduler's decision making |
459 | when dealing with Intel Pentium 4 chips with HyperThreading at a | 463 | when dealing with Intel Pentium 4 chips with HyperThreading at a |
460 | cost of slightly increased overhead in some places. If unsure say | 464 | cost of slightly increased overhead in some places. If unsure say |
461 | N here. | 465 | N here. |
462 | 466 | ||
463 | config SCHED_MC | 467 | config SCHED_MC |
464 | def_bool y | 468 | def_bool y |
465 | prompt "Multi-core scheduler support" | 469 | prompt "Multi-core scheduler support" |
466 | depends on (X86_64 && SMP) || (X86_32 && X86_HT) | 470 | depends on (X86_64 && SMP) || (X86_32 && X86_HT) |
467 | help | 471 | help |
468 | Multi-core scheduler support improves the CPU scheduler's decision | 472 | Multi-core scheduler support improves the CPU scheduler's decision |
469 | making when dealing with multi-core CPU chips at a cost of slightly | 473 | making when dealing with multi-core CPU chips at a cost of slightly |
470 | increased overhead in some places. If unsure say N here. | 474 | increased overhead in some places. If unsure say N here. |
471 | 475 | ||
472 | source "kernel/Kconfig.preempt" | 476 | source "kernel/Kconfig.preempt" |
473 | 477 | ||
474 | config X86_UP_APIC | 478 | config X86_UP_APIC |
475 | bool "Local APIC support on uniprocessors" | 479 | bool "Local APIC support on uniprocessors" |
476 | depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) | 480 | depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) |
477 | help | 481 | help |
478 | A local APIC (Advanced Programmable Interrupt Controller) is an | 482 | A local APIC (Advanced Programmable Interrupt Controller) is an |
479 | integrated interrupt controller in the CPU. If you have a single-CPU | 483 | integrated interrupt controller in the CPU. If you have a single-CPU |
480 | system which has a processor with a local APIC, you can say Y here to | 484 | system which has a processor with a local APIC, you can say Y here to |
481 | enable and use it. If you say Y here even though your machine doesn't | 485 | enable and use it. If you say Y here even though your machine doesn't |
482 | have a local APIC, then the kernel will still run with no slowdown at | 486 | have a local APIC, then the kernel will still run with no slowdown at |
483 | all. The local APIC supports CPU-generated self-interrupts (timer, | 487 | all. The local APIC supports CPU-generated self-interrupts (timer, |
484 | performance counters), and the NMI watchdog which detects hard | 488 | performance counters), and the NMI watchdog which detects hard |
485 | lockups. | 489 | lockups. |
486 | 490 | ||
487 | config X86_UP_IOAPIC | 491 | config X86_UP_IOAPIC |
488 | bool "IO-APIC support on uniprocessors" | 492 | bool "IO-APIC support on uniprocessors" |
489 | depends on X86_UP_APIC | 493 | depends on X86_UP_APIC |
490 | help | 494 | help |
491 | An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an | 495 | An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an |
492 | SMP-capable replacement for PC-style interrupt controllers. Most | 496 | SMP-capable replacement for PC-style interrupt controllers. Most |
493 | SMP systems and many recent uniprocessor systems have one. | 497 | SMP systems and many recent uniprocessor systems have one. |
494 | 498 | ||
495 | If you have a single-CPU system with an IO-APIC, you can say Y here | 499 | If you have a single-CPU system with an IO-APIC, you can say Y here |
496 | to use it. If you say Y here even though your machine doesn't have | 500 | to use it. If you say Y here even though your machine doesn't have |
497 | an IO-APIC, then the kernel will still run with no slowdown at all. | 501 | an IO-APIC, then the kernel will still run with no slowdown at all. |
498 | 502 | ||
499 | config X86_LOCAL_APIC | 503 | config X86_LOCAL_APIC |
500 | def_bool y | 504 | def_bool y |
501 | depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) | 505 | depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) |
502 | 506 | ||
503 | config X86_IO_APIC | 507 | config X86_IO_APIC |
504 | def_bool y | 508 | def_bool y |
505 | depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) | 509 | depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) |
506 | 510 | ||
507 | config X86_VISWS_APIC | 511 | config X86_VISWS_APIC |
508 | def_bool y | 512 | def_bool y |
509 | depends on X86_32 && X86_VISWS | 513 | depends on X86_32 && X86_VISWS |
510 | 514 | ||
511 | config X86_MCE | 515 | config X86_MCE |
512 | bool "Machine Check Exception" | 516 | bool "Machine Check Exception" |
513 | depends on !X86_VOYAGER | 517 | depends on !X86_VOYAGER |
514 | ---help--- | 518 | ---help--- |
515 | Machine Check Exception support allows the processor to notify the | 519 | Machine Check Exception support allows the processor to notify the |
516 | kernel if it detects a problem (e.g. overheating, component failure). | 520 | kernel if it detects a problem (e.g. overheating, component failure). |
517 | The action the kernel takes depends on the severity of the problem, | 521 | The action the kernel takes depends on the severity of the problem, |
518 | ranging from a warning message on the console, to halting the machine. | 522 | ranging from a warning message on the console, to halting the machine. |
519 | Your processor must be a Pentium or newer to support this - check the | 523 | Your processor must be a Pentium or newer to support this - check the |
520 | flags in /proc/cpuinfo for mce. Note that some older Pentium systems | 524 | flags in /proc/cpuinfo for mce. Note that some older Pentium systems |
521 | have a design flaw which leads to false MCE events - hence MCE is | 525 | have a design flaw which leads to false MCE events - hence MCE is |
522 | disabled on all P5 processors, unless explicitly enabled with "mce" | 526 | disabled on all P5 processors, unless explicitly enabled with "mce" |
523 | as a boot argument. Similarly, if MCE is built in and creates a | 527 | as a boot argument. Similarly, if MCE is built in and creates a |
524 | problem on some new non-standard machine, you can boot with "nomce" | 528 | problem on some new non-standard machine, you can boot with "nomce" |
525 | to disable it. MCE support simply ignores non-MCE processors like | 529 | to disable it. MCE support simply ignores non-MCE processors like |
526 | the 386 and 486, so nearly everyone can say Y here. | 530 | the 386 and 486, so nearly everyone can say Y here. |
527 | 531 | ||
528 | config X86_MCE_INTEL | 532 | config X86_MCE_INTEL |
529 | def_bool y | 533 | def_bool y |
530 | prompt "Intel MCE features" | 534 | prompt "Intel MCE features" |
531 | depends on X86_64 && X86_MCE && X86_LOCAL_APIC | 535 | depends on X86_64 && X86_MCE && X86_LOCAL_APIC |
532 | help | 536 | help |
533 | Additional support for intel specific MCE features such as | 537 | Additional support for intel specific MCE features such as |
534 | the thermal monitor. | 538 | the thermal monitor. |
535 | 539 | ||
536 | config X86_MCE_AMD | 540 | config X86_MCE_AMD |
537 | def_bool y | 541 | def_bool y |
538 | prompt "AMD MCE features" | 542 | prompt "AMD MCE features" |
539 | depends on X86_64 && X86_MCE && X86_LOCAL_APIC | 543 | depends on X86_64 && X86_MCE && X86_LOCAL_APIC |
540 | help | 544 | help |
541 | Additional support for AMD specific MCE features such as | 545 | Additional support for AMD specific MCE features such as |
542 | the DRAM Error Threshold. | 546 | the DRAM Error Threshold. |
543 | 547 | ||
544 | config X86_MCE_NONFATAL | 548 | config X86_MCE_NONFATAL |
545 | tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" | 549 | tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" |
546 | depends on X86_32 && X86_MCE | 550 | depends on X86_32 && X86_MCE |
547 | help | 551 | help |
548 | Enabling this feature starts a timer that triggers every 5 seconds which | 552 | Enabling this feature starts a timer that triggers every 5 seconds which |
549 | will look at the machine check registers to see if anything happened. | 553 | will look at the machine check registers to see if anything happened. |
550 | Non-fatal problems automatically get corrected (but still logged). | 554 | Non-fatal problems automatically get corrected (but still logged). |
551 | Disable this if you don't want to see these messages. | 555 | Disable this if you don't want to see these messages. |
552 | Seeing the messages this option prints out may be indicative of dying | 556 | Seeing the messages this option prints out may be indicative of dying |
553 | or out-of-spec (i.e., overclocked) hardware. | 557 | or out-of-spec (i.e., overclocked) hardware. |
554 | This option only does something on certain CPUs. | 558 | This option only does something on certain CPUs. |
555 | (AMD Athlon/Duron and Intel Pentium 4) | 559 | (AMD Athlon/Duron and Intel Pentium 4) |
556 | 560 | ||
557 | config X86_MCE_P4THERMAL | 561 | config X86_MCE_P4THERMAL |
558 | bool "Check for P4 thermal throttling interrupt" | 562 | bool "Check for P4 thermal throttling interrupt" |
559 | depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS | 563 | depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS |
560 | help | 564 | help |
561 | Enabling this feature will cause a message to be printed when the P4 | 565 | Enabling this feature will cause a message to be printed when the P4 |
562 | enters thermal throttling. | 566 | enters thermal throttling. |
563 | 567 | ||
564 | config VM86 | 568 | config VM86 |
565 | bool "Enable VM86 support" if EMBEDDED | 569 | bool "Enable VM86 support" if EMBEDDED |
566 | default y | 570 | default y |
567 | depends on X86_32 | 571 | depends on X86_32 |
568 | help | 572 | help |
569 | This option is required by programs like DOSEMU to run 16-bit legacy | 573 | This option is required by programs like DOSEMU to run 16-bit legacy |
570 | code on X86 processors. It also may be needed by software like | 574 | code on X86 processors. It also may be needed by software like |
571 | XFree86 to initialize some video cards via BIOS. Disabling this | 575 | XFree86 to initialize some video cards via BIOS. Disabling this |
572 | option saves about 6k. | 576 | option saves about 6k. |
573 | 577 | ||
574 | config TOSHIBA | 578 | config TOSHIBA |
575 | tristate "Toshiba Laptop support" | 579 | tristate "Toshiba Laptop support" |
576 | depends on X86_32 | 580 | depends on X86_32 |
577 | ---help--- | 581 | ---help--- |
578 | This adds a driver to safely access the System Management Mode of | 582 | This adds a driver to safely access the System Management Mode of |
579 | the CPU on Toshiba portables with a genuine Toshiba BIOS. It does | 583 | the CPU on Toshiba portables with a genuine Toshiba BIOS. It does |
580 | not work on models with a Phoenix BIOS. The System Management Mode | 584 | not work on models with a Phoenix BIOS. The System Management Mode |
581 | is used to set the BIOS and power saving options on Toshiba portables. | 585 | is used to set the BIOS and power saving options on Toshiba portables. |
582 | 586 | ||
583 | For information on utilities to make use of this driver see the | 587 | For information on utilities to make use of this driver see the |
584 | Toshiba Linux utilities web site at: | 588 | Toshiba Linux utilities web site at: |
585 | <http://www.buzzard.org.uk/toshiba/>. | 589 | <http://www.buzzard.org.uk/toshiba/>. |
586 | 590 | ||
587 | Say Y if you intend to run this kernel on a Toshiba portable. | 591 | Say Y if you intend to run this kernel on a Toshiba portable. |
588 | Say N otherwise. | 592 | Say N otherwise. |
589 | 593 | ||
590 | config I8K | 594 | config I8K |
591 | tristate "Dell laptop support" | 595 | tristate "Dell laptop support" |
592 | depends on X86_32 | 596 | depends on X86_32 |
593 | ---help--- | 597 | ---help--- |
594 | This adds a driver to safely access the System Management Mode | 598 | This adds a driver to safely access the System Management Mode |
595 | of the CPU on the Dell Inspiron 8000. The System Management Mode | 599 | of the CPU on the Dell Inspiron 8000. The System Management Mode |
596 | is used to read cpu temperature and cooling fan status and to | 600 | is used to read cpu temperature and cooling fan status and to |
597 | control the fans on the I8K portables. | 601 | control the fans on the I8K portables. |
598 | 602 | ||
599 | This driver has been tested only on the Inspiron 8000 but it may | 603 | This driver has been tested only on the Inspiron 8000 but it may |
600 | also work with other Dell laptops. You can force loading on other | 604 | also work with other Dell laptops. You can force loading on other |
601 | models by passing the parameter `force=1' to the module. Use at | 605 | models by passing the parameter `force=1' to the module. Use at |
602 | your own risk. | 606 | your own risk. |
603 | 607 | ||
604 | For information on utilities to make use of this driver see the | 608 | For information on utilities to make use of this driver see the |
605 | I8K Linux utilities web site at: | 609 | I8K Linux utilities web site at: |
606 | <http://people.debian.org/~dz/i8k/> | 610 | <http://people.debian.org/~dz/i8k/> |
607 | 611 | ||
608 | Say Y if you intend to run this kernel on a Dell Inspiron 8000. | 612 | Say Y if you intend to run this kernel on a Dell Inspiron 8000. |
609 | Say N otherwise. | 613 | Say N otherwise. |
610 | 614 | ||
611 | config X86_REBOOTFIXUPS | 615 | config X86_REBOOTFIXUPS |
612 | def_bool n | 616 | def_bool n |
613 | prompt "Enable X86 board specific fixups for reboot" | 617 | prompt "Enable X86 board specific fixups for reboot" |
614 | depends on X86_32 && X86 | 618 | depends on X86_32 && X86 |
615 | ---help--- | 619 | ---help--- |
616 | This enables chipset and/or board specific fixups to be done | 620 | This enables chipset and/or board specific fixups to be done |
617 | in order to get reboot to work correctly. This is only needed on | 621 | in order to get reboot to work correctly. This is only needed on |
618 | some combinations of hardware and BIOS. The symptom, for which | 622 | some combinations of hardware and BIOS. The symptom, for which |
619 | this config is intended, is when reboot ends with a stalled/hung | 623 | this config is intended, is when reboot ends with a stalled/hung |
620 | system. | 624 | system. |
621 | 625 | ||
622 | Currently, the only fixup is for the Geode machines using | 626 | Currently, the only fixup is for the Geode machines using |
623 | CS5530A and CS5536 chipsets. | 627 | CS5530A and CS5536 chipsets. |
624 | 628 | ||
625 | Say Y if you want to enable the fixup. Currently, it's safe to | 629 | Say Y if you want to enable the fixup. Currently, it's safe to |
626 | enable this option even if you don't need it. | 630 | enable this option even if you don't need it. |
627 | Say N otherwise. | 631 | Say N otherwise. |
628 | 632 | ||
629 | config MICROCODE | 633 | config MICROCODE |
630 | tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" | 634 | tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" |
631 | select FW_LOADER | 635 | select FW_LOADER |
632 | ---help--- | 636 | ---help--- |
633 | If you say Y here, you will be able to update the microcode on | 637 | If you say Y here, you will be able to update the microcode on |
634 | Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, | 638 | Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, |
635 | Pentium III, Pentium 4, Xeon etc. You will obviously need the | 639 | Pentium III, Pentium 4, Xeon etc. You will obviously need the |
636 | actual microcode binary data itself which is not shipped with the | 640 | actual microcode binary data itself which is not shipped with the |
637 | Linux kernel. | 641 | Linux kernel. |
638 | 642 | ||
639 | For latest news and information on obtaining all the required | 643 | For latest news and information on obtaining all the required |
640 | ingredients for this driver, check: | 644 | ingredients for this driver, check: |
641 | <http://www.urbanmyth.org/microcode/>. | 645 | <http://www.urbanmyth.org/microcode/>. |
642 | 646 | ||
643 | To compile this driver as a module, choose M here: the | 647 | To compile this driver as a module, choose M here: the |
644 | module will be called microcode. | 648 | module will be called microcode. |
645 | 649 | ||
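As a hedged sketch of how userspace feeds this driver (this mirrors what tools such as microcode_ctl do with the old /dev/cpu/microcode interface; only the device path comes from the prompt above, the rest is illustrative and not part of this commit):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	long size;
	char *buf;
	FILE *in;
	int dev;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <microcode image>\n", argv[0]);
		return 1;
	}
	in = fopen(argv[1], "rb");
	if (!in) {
		perror(argv[1]);
		return 1;
	}
	fseek(in, 0, SEEK_END);
	size = ftell(in);
	rewind(in);
	buf = malloc(size);
	if (!buf || fread(buf, 1, size, in) != (size_t)size) {
		fprintf(stderr, "could not read %s\n", argv[1]);
		return 1;
	}
	fclose(in);

	/* The old interface takes the whole image in a single write(). */
	dev = open("/dev/cpu/microcode", O_WRONLY);
	if (dev < 0) {
		perror("/dev/cpu/microcode");
		return 1;
	}
	if (write(dev, buf, size) != size)
		perror("write");
	close(dev);
	free(buf);
	return 0;
}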
646 | config MICROCODE_OLD_INTERFACE | 650 | config MICROCODE_OLD_INTERFACE |
647 | def_bool y | 651 | def_bool y |
648 | depends on MICROCODE | 652 | depends on MICROCODE |
649 | 653 | ||
650 | config X86_MSR | 654 | config X86_MSR |
651 | tristate "/dev/cpu/*/msr - Model-specific register support" | 655 | tristate "/dev/cpu/*/msr - Model-specific register support" |
652 | help | 656 | help |
653 | This device gives privileged processes access to the x86 | 657 | This device gives privileged processes access to the x86 |
654 | Model-Specific Registers (MSRs). It is a character device with | 658 | Model-Specific Registers (MSRs). It is a character device with |
655 | major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. | 659 | major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. |
656 | MSR accesses are directed to a specific CPU on multi-processor | 660 | MSR accesses are directed to a specific CPU on multi-processor |
657 | systems. | 661 | systems. |
658 | 662 | ||
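A hedged userspace sketch of the interface described above: the file offset selects the MSR number and each read returns 8 bytes. The specific register (0x10, the TSC) is just an example and not part of this commit.

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("/dev/cpu/0/msr");
		return 1;
	}
	/* File offset selects the MSR; each read returns 8 bytes. */
	if (pread(fd, &val, sizeof(val), 0x10) != sizeof(val)) {
		perror("pread");
		return 1;
	}
	printf("MSR 0x10 (TSC) on cpu0: 0x%016" PRIx64 "\n", val);
	close(fd);
	return 0;
}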
659 | config X86_CPUID | 663 | config X86_CPUID |
660 | tristate "/dev/cpu/*/cpuid - CPU information support" | 664 | tristate "/dev/cpu/*/cpuid - CPU information support" |
661 | help | 665 | help |
662 | This device gives processes access to the x86 CPUID instruction to | 666 | This device gives processes access to the x86 CPUID instruction to |
663 | be executed on a specific processor. It is a character device | 667 | be executed on a specific processor. It is a character device |
664 | with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to | 668 | with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to |
665 | /dev/cpu/31/cpuid. | 669 | /dev/cpu/31/cpuid. |
666 | 670 | ||
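A similar hedged sketch for the cpuid device described above: the file offset selects the CPUID level and each read returns the four registers (eax, ebx, ecx, edx) as 16 bytes. Leaf 0 (the vendor string) is just an example and not part of this commit.

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];	/* eax, ebx, ecx, edx */
	char vendor[13];
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0) {
		perror("/dev/cpu/0/cpuid");
		return 1;
	}
	/* File offset selects the CPUID level; reads come in 16-byte chunks. */
	if (pread(fd, regs, sizeof(regs), 0) != sizeof(regs)) {
		perror("pread");
		return 1;
	}
	/* Leaf 0: the vendor string lives in ebx, edx, ecx (in that order). */
	memcpy(vendor, &regs[1], 4);
	memcpy(vendor + 4, &regs[3], 4);
	memcpy(vendor + 8, &regs[2], 4);
	vendor[12] = '\0';
	printf("cpu0 vendor: %s, max standard leaf: %" PRIu32 "\n",
	       vendor, regs[0]);
	close(fd);
	return 0;
}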
667 | choice | 671 | choice |
668 | prompt "High Memory Support" | 672 | prompt "High Memory Support" |
669 | default HIGHMEM4G if !X86_NUMAQ | 673 | default HIGHMEM4G if !X86_NUMAQ |
670 | default HIGHMEM64G if X86_NUMAQ | 674 | default HIGHMEM64G if X86_NUMAQ |
671 | depends on X86_32 | 675 | depends on X86_32 |
672 | 676 | ||
673 | config NOHIGHMEM | 677 | config NOHIGHMEM |
674 | bool "off" | 678 | bool "off" |
675 | depends on !X86_NUMAQ | 679 | depends on !X86_NUMAQ |
676 | ---help--- | 680 | ---help--- |
677 | Linux can use up to 64 Gigabytes of physical memory on x86 systems. | 681 | Linux can use up to 64 Gigabytes of physical memory on x86 systems. |
678 | However, the address space of 32-bit x86 processors is only 4 | 682 | However, the address space of 32-bit x86 processors is only 4 |
679 | Gigabytes large. That means that, if you have a large amount of | 683 | Gigabytes large. That means that, if you have a large amount of |
680 | physical memory, not all of it can be "permanently mapped" by the | 684 | physical memory, not all of it can be "permanently mapped" by the |
681 | kernel. The physical memory that's not permanently mapped is called | 685 | kernel. The physical memory that's not permanently mapped is called |
682 | "high memory". | 686 | "high memory". |
683 | 687 | ||
684 | If you are compiling a kernel which will never run on a machine with | 688 | If you are compiling a kernel which will never run on a machine with |
685 | more than 1 Gigabyte total physical RAM, answer "off" here (default | 689 | more than 1 Gigabyte total physical RAM, answer "off" here (default |
686 | choice and suitable for most users). This will result in a "3GB/1GB" | 690 | choice and suitable for most users). This will result in a "3GB/1GB" |
687 | split: 3GB are mapped so that each process sees a 3GB virtual memory | 691 | split: 3GB are mapped so that each process sees a 3GB virtual memory |
688 | space and the remaining part of the 4GB virtual memory space is used | 692 | space and the remaining part of the 4GB virtual memory space is used |
689 | by the kernel to permanently map as much physical memory as | 693 | by the kernel to permanently map as much physical memory as |
690 | possible. | 694 | possible. |
691 | 695 | ||
692 | If the machine has between 1 and 4 Gigabytes physical RAM, then | 696 | If the machine has between 1 and 4 Gigabytes physical RAM, then |
693 | answer "4GB" here. | 697 | answer "4GB" here. |
694 | 698 | ||
695 | If more than 4 Gigabytes is used then answer "64GB" here. This | 699 | If more than 4 Gigabytes is used then answer "64GB" here. This |
696 | selection turns Intel PAE (Physical Address Extension) mode on. | 700 | selection turns Intel PAE (Physical Address Extension) mode on. |
697 | PAE implements 3-level paging on IA32 processors. PAE is fully | 701 | PAE implements 3-level paging on IA32 processors. PAE is fully |
698 | supported by Linux, PAE mode is implemented on all recent Intel | 702 | supported by Linux, PAE mode is implemented on all recent Intel |
699 | processors (Pentium Pro and better). NOTE: If you say "64GB" here, | 703 | processors (Pentium Pro and better). NOTE: If you say "64GB" here, |
700 | then the kernel will not boot on CPUs that don't support PAE! | 704 | then the kernel will not boot on CPUs that don't support PAE! |
701 | 705 | ||
702 | The actual amount of total physical memory will either be | 706 | The actual amount of total physical memory will either be |
703 | auto detected or can be forced by using a kernel command line option | 707 | auto detected or can be forced by using a kernel command line option |
704 | such as "mem=256M". (Try "man bootparam" or see the documentation of | 708 | such as "mem=256M". (Try "man bootparam" or see the documentation of |
705 | your boot loader (lilo or loadlin) about how to pass options to the | 709 | your boot loader (lilo or loadlin) about how to pass options to the |
706 | kernel at boot time.) | 710 | kernel at boot time.) |
707 | 711 | ||
708 | If unsure, say "off". | 712 | If unsure, say "off". |
709 | 713 | ||
710 | config HIGHMEM4G | 714 | config HIGHMEM4G |
711 | bool "4GB" | 715 | bool "4GB" |
712 | depends on !X86_NUMAQ | 716 | depends on !X86_NUMAQ |
713 | help | 717 | help |
714 | Select this if you have a 32-bit processor and between 1 and 4 | 718 | Select this if you have a 32-bit processor and between 1 and 4 |
715 | gigabytes of physical RAM. | 719 | gigabytes of physical RAM. |
716 | 720 | ||
717 | config HIGHMEM64G | 721 | config HIGHMEM64G |
718 | bool "64GB" | 722 | bool "64GB" |
719 | depends on !M386 && !M486 | 723 | depends on !M386 && !M486 |
720 | select X86_PAE | 724 | select X86_PAE |
721 | help | 725 | help |
722 | Select this if you have a 32-bit processor and more than 4 | 726 | Select this if you have a 32-bit processor and more than 4 |
723 | gigabytes of physical RAM. | 727 | gigabytes of physical RAM. |
724 | 728 | ||
725 | endchoice | 729 | endchoice |
726 | 730 | ||
727 | choice | 731 | choice |
728 | depends on EXPERIMENTAL | 732 | depends on EXPERIMENTAL |
729 | prompt "Memory split" if EMBEDDED | 733 | prompt "Memory split" if EMBEDDED |
730 | default VMSPLIT_3G | 734 | default VMSPLIT_3G |
731 | depends on X86_32 | 735 | depends on X86_32 |
732 | help | 736 | help |
733 | Select the desired split between kernel and user memory. | 737 | Select the desired split between kernel and user memory. |
734 | 738 | ||
735 | If the address range available to the kernel is less than the | 739 | If the address range available to the kernel is less than the |
736 | physical memory installed, the remaining memory will be available | 740 | physical memory installed, the remaining memory will be available |
737 | as "high memory". Accessing high memory is a little more costly | 741 | as "high memory". Accessing high memory is a little more costly |
738 | than low memory, as it needs to be mapped into the kernel first. | 742 | than low memory, as it needs to be mapped into the kernel first. |
739 | Note that increasing the kernel address space limits the range | 743 | Note that increasing the kernel address space limits the range |
740 | available to user programs, making the address space there | 744 | available to user programs, making the address space there |
741 | tighter. Selecting anything other than the default 3G/1G split | 745 | tighter. Selecting anything other than the default 3G/1G split |
742 | will also likely make your kernel incompatible with binary-only | 746 | will also likely make your kernel incompatible with binary-only |
743 | kernel modules. | 747 | kernel modules. |
744 | 748 | ||
745 | If you are not absolutely sure what you are doing, leave this | 749 | If you are not absolutely sure what you are doing, leave this |
746 | option alone! | 750 | option alone! |
747 | 751 | ||
748 | config VMSPLIT_3G | 752 | config VMSPLIT_3G |
749 | bool "3G/1G user/kernel split" | 753 | bool "3G/1G user/kernel split" |
750 | config VMSPLIT_3G_OPT | 754 | config VMSPLIT_3G_OPT |
751 | depends on !X86_PAE | 755 | depends on !X86_PAE |
752 | bool "3G/1G user/kernel split (for full 1G low memory)" | 756 | bool "3G/1G user/kernel split (for full 1G low memory)" |
753 | config VMSPLIT_2G | 757 | config VMSPLIT_2G |
754 | bool "2G/2G user/kernel split" | 758 | bool "2G/2G user/kernel split" |
755 | config VMSPLIT_2G_OPT | 759 | config VMSPLIT_2G_OPT |
756 | depends on !X86_PAE | 760 | depends on !X86_PAE |
757 | bool "2G/2G user/kernel split (for full 2G low memory)" | 761 | bool "2G/2G user/kernel split (for full 2G low memory)" |
758 | config VMSPLIT_1G | 762 | config VMSPLIT_1G |
759 | bool "1G/3G user/kernel split" | 763 | bool "1G/3G user/kernel split" |
760 | endchoice | 764 | endchoice |
761 | 765 | ||
762 | config PAGE_OFFSET | 766 | config PAGE_OFFSET |
763 | hex | 767 | hex |
764 | default 0xB0000000 if VMSPLIT_3G_OPT | 768 | default 0xB0000000 if VMSPLIT_3G_OPT |
765 | default 0x80000000 if VMSPLIT_2G | 769 | default 0x80000000 if VMSPLIT_2G |
766 | default 0x78000000 if VMSPLIT_2G_OPT | 770 | default 0x78000000 if VMSPLIT_2G_OPT |
767 | default 0x40000000 if VMSPLIT_1G | 771 | default 0x40000000 if VMSPLIT_1G |
768 | default 0xC0000000 | 772 | default 0xC0000000 |
769 | depends on X86_32 | 773 | depends on X86_32 |
770 | 774 | ||
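To make the defaults above concrete (an illustration that mirrors the kernel's lowmem __va()/__pa() arithmetic, not code from this commit): lowmem is mapped linearly at PAGE_OFFSET, so with the default 3G/1G split a physical address appears at physical + 0xC0000000 in kernel virtual space.

#include <stdio.h>

#define PAGE_OFFSET 0xC0000000UL	/* default "3G/1G" split above */

static unsigned long lowmem_virt(unsigned long phys)
{
	return phys + PAGE_OFFSET;	/* linear ("direct") mapping of lowmem */
}

int main(void)
{
	/* Physical 1 MB (the default load address of a non-relocatable
	 * kernel) shows up at virtual 0xC0100000 in the direct mapping. */
	printf("phys 0x%08lx -> virt 0x%08lx\n",
	       0x00100000UL, lowmem_virt(0x00100000UL));
	return 0;
}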
771 | config HIGHMEM | 775 | config HIGHMEM |
772 | def_bool y | 776 | def_bool y |
773 | depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) | 777 | depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) |
774 | 778 | ||
775 | config X86_PAE | 779 | config X86_PAE |
776 | def_bool n | 780 | def_bool n |
777 | prompt "PAE (Physical Address Extension) Support" | 781 | prompt "PAE (Physical Address Extension) Support" |
778 | depends on X86_32 && !HIGHMEM4G | 782 | depends on X86_32 && !HIGHMEM4G |
779 | select RESOURCES_64BIT | 783 | select RESOURCES_64BIT |
780 | help | 784 | help |
781 | PAE is required for NX support, and furthermore enables | 785 | PAE is required for NX support, and furthermore enables |
782 | larger swapspace support for non-overcommit purposes. It | 786 | larger swapspace support for non-overcommit purposes. It |
783 | has the cost of more pagetable lookup overhead, and also | 787 | has the cost of more pagetable lookup overhead, and also |
784 | consumes more pagetable space per process. | 788 | consumes more pagetable space per process. |
785 | 789 | ||
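Where the 64 Gigabyte figure in the high-memory help above comes from (simple arithmetic, not part of this commit): PAE widens physical addresses from 32 to 36 bits.

    2^32 bytes = 4 GB   (classic 2-level paging)
    2^36 bytes = 64 GB  (PAE, 3-level paging)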
786 | # Common NUMA Features | 790 | # Common NUMA Features |
787 | config NUMA | 791 | config NUMA |
788 | bool "NUMA Memory Allocation and Scheduler Support (EXPERIMENTAL)" | 792 | bool "NUMA Memory Allocation and Scheduler Support (EXPERIMENTAL)" |
789 | depends on SMP | 793 | depends on SMP |
790 | depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL) | 794 | depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL) |
791 | default n if X86_PC | 795 | default n if X86_PC |
792 | default y if (X86_NUMAQ || X86_SUMMIT) | 796 | default y if (X86_NUMAQ || X86_SUMMIT) |
793 | help | 797 | help |
794 | Enable NUMA (Non-Uniform Memory Access) support. | 798 | Enable NUMA (Non-Uniform Memory Access) support. |
795 | The kernel will try to allocate memory used by a CPU on the | 799 | The kernel will try to allocate memory used by a CPU on the |
796 | local memory controller of the CPU and add some more | 800 | local memory controller of the CPU and add some more |
797 | NUMA awareness to the kernel. | 801 | NUMA awareness to the kernel. |
798 | 802 | ||
799 | For i386 this is currently highly experimental and should only be | 803 | For i386 this is currently highly experimental and should only be |
800 | used for kernel development. It might also cause boot failures. | 804 | used for kernel development. It might also cause boot failures. |
801 | For x86_64 this is recommended on all multiprocessor Opteron systems. | 805 | For x86_64 this is recommended on all multiprocessor Opteron systems. |
802 | If the system is EM64T, you should say N unless your system is | 806 | If the system is EM64T, you should say N unless your system is |
803 | EM64T NUMA. | 807 | EM64T NUMA. |
804 | 808 | ||
805 | comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" | 809 | comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" |
806 | depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) | 810 | depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) |
807 | 811 | ||
808 | config K8_NUMA | 812 | config K8_NUMA |
809 | def_bool y | 813 | def_bool y |
810 | prompt "Old style AMD Opteron NUMA detection" | 814 | prompt "Old style AMD Opteron NUMA detection" |
811 | depends on X86_64 && NUMA && PCI | 815 | depends on X86_64 && NUMA && PCI |
812 | help | 816 | help |
813 | Enable K8 NUMA node topology detection. You should say Y here if | 817 | Enable K8 NUMA node topology detection. You should say Y here if |
814 | you have a multiprocessor AMD K8 system. This uses an old | 818 | you have a multiprocessor AMD K8 system. This uses an old |
815 | method to read the NUMA configuration directly from the built-in | 819 | method to read the NUMA configuration directly from the built-in |
816 | Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA | 820 | Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA |
817 | instead, which also takes priority if both are compiled in. | 821 | instead, which also takes priority if both are compiled in. |
818 | 822 | ||
819 | config X86_64_ACPI_NUMA | 823 | config X86_64_ACPI_NUMA |
820 | def_bool y | 824 | def_bool y |
821 | prompt "ACPI NUMA detection" | 825 | prompt "ACPI NUMA detection" |
822 | depends on X86_64 && NUMA && ACPI && PCI | 826 | depends on X86_64 && NUMA && ACPI && PCI |
823 | select ACPI_NUMA | 827 | select ACPI_NUMA |
824 | help | 828 | help |
825 | Enable ACPI SRAT based node topology detection. | 829 | Enable ACPI SRAT based node topology detection. |
826 | 830 | ||
827 | config NUMA_EMU | 831 | config NUMA_EMU |
828 | bool "NUMA emulation" | 832 | bool "NUMA emulation" |
829 | depends on X86_64 && NUMA | 833 | depends on X86_64 && NUMA |
830 | help | 834 | help |
831 | Enable NUMA emulation. A flat machine will be split | 835 | Enable NUMA emulation. A flat machine will be split |
832 | into virtual nodes when booted with "numa=fake=N", where N is the | 836 | into virtual nodes when booted with "numa=fake=N", where N is the |
833 | number of nodes. This is only useful for debugging. | 837 | number of nodes. This is only useful for debugging. |
834 | 838 | ||
835 | config NODES_SHIFT | 839 | config NODES_SHIFT |
836 | int | 840 | int |
837 | default "6" if X86_64 | 841 | default "6" if X86_64 |
838 | default "4" if X86_NUMAQ | 842 | default "4" if X86_NUMAQ |
839 | default "3" | 843 | default "3" |
840 | depends on NEED_MULTIPLE_NODES | 844 | depends on NEED_MULTIPLE_NODES |
841 | 845 | ||
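For reference (this is how the value is consumed elsewhere in the tree, not in this hunk): the maximum node count is 2^NODES_SHIFT, so the defaults above translate to:

    1 << 6 = 64 nodes on X86_64
    1 << 4 = 16 nodes on X86_NUMAQ
    1 << 3 =  8 nodes otherwise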
842 | config HAVE_ARCH_BOOTMEM_NODE | 846 | config HAVE_ARCH_BOOTMEM_NODE |
843 | def_bool y | 847 | def_bool y |
844 | depends on X86_32 && NUMA | 848 | depends on X86_32 && NUMA |
845 | 849 | ||
846 | config ARCH_HAVE_MEMORY_PRESENT | 850 | config ARCH_HAVE_MEMORY_PRESENT |
847 | def_bool y | 851 | def_bool y |
848 | depends on X86_32 && DISCONTIGMEM | 852 | depends on X86_32 && DISCONTIGMEM |
849 | 853 | ||
850 | config NEED_NODE_MEMMAP_SIZE | 854 | config NEED_NODE_MEMMAP_SIZE |
851 | def_bool y | 855 | def_bool y |
852 | depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) | 856 | depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) |
853 | 857 | ||
854 | config HAVE_ARCH_ALLOC_REMAP | 858 | config HAVE_ARCH_ALLOC_REMAP |
855 | def_bool y | 859 | def_bool y |
856 | depends on X86_32 && NUMA | 860 | depends on X86_32 && NUMA |
857 | 861 | ||
858 | config ARCH_FLATMEM_ENABLE | 862 | config ARCH_FLATMEM_ENABLE |
859 | def_bool y | 863 | def_bool y |
860 | depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC | 864 | depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC |
861 | 865 | ||
862 | config ARCH_DISCONTIGMEM_ENABLE | 866 | config ARCH_DISCONTIGMEM_ENABLE |
863 | def_bool y | 867 | def_bool y |
864 | depends on NUMA && X86_32 | 868 | depends on NUMA && X86_32 |
865 | 869 | ||
866 | config ARCH_DISCONTIGMEM_DEFAULT | 870 | config ARCH_DISCONTIGMEM_DEFAULT |
867 | def_bool y | 871 | def_bool y |
868 | depends on NUMA && X86_32 | 872 | depends on NUMA && X86_32 |
869 | 873 | ||
870 | config ARCH_SPARSEMEM_DEFAULT | 874 | config ARCH_SPARSEMEM_DEFAULT |
871 | def_bool y | 875 | def_bool y |
872 | depends on X86_64 | 876 | depends on X86_64 |
873 | 877 | ||
874 | config ARCH_SPARSEMEM_ENABLE | 878 | config ARCH_SPARSEMEM_ENABLE |
875 | def_bool y | 879 | def_bool y |
876 | depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) | 880 | depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) |
877 | select SPARSEMEM_STATIC if X86_32 | 881 | select SPARSEMEM_STATIC if X86_32 |
878 | select SPARSEMEM_VMEMMAP_ENABLE if X86_64 | 882 | select SPARSEMEM_VMEMMAP_ENABLE if X86_64 |
879 | 883 | ||
880 | config ARCH_SELECT_MEMORY_MODEL | 884 | config ARCH_SELECT_MEMORY_MODEL |
881 | def_bool y | 885 | def_bool y |
882 | depends on ARCH_SPARSEMEM_ENABLE | 886 | depends on ARCH_SPARSEMEM_ENABLE |
883 | 887 | ||
884 | config ARCH_MEMORY_PROBE | 888 | config ARCH_MEMORY_PROBE |
885 | def_bool X86_64 | 889 | def_bool X86_64 |
886 | depends on MEMORY_HOTPLUG | 890 | depends on MEMORY_HOTPLUG |
887 | 891 | ||
888 | source "mm/Kconfig" | 892 | source "mm/Kconfig" |
889 | 893 | ||
890 | config HIGHPTE | 894 | config HIGHPTE |
891 | bool "Allocate 3rd-level pagetables from highmem" | 895 | bool "Allocate 3rd-level pagetables from highmem" |
892 | depends on X86_32 && (HIGHMEM4G || HIGHMEM64G) | 896 | depends on X86_32 && (HIGHMEM4G || HIGHMEM64G) |
893 | help | 897 | help |
894 | The VM uses one page table entry for each page of physical memory. | 898 | The VM uses one page table entry for each page of physical memory. |
895 | For systems with a lot of RAM, this can be wasteful of precious | 899 | For systems with a lot of RAM, this can be wasteful of precious |
896 | low memory. Setting this option will put user-space page table | 900 | low memory. Setting this option will put user-space page table |
897 | entries in high memory. | 901 | entries in high memory. |
898 | 902 | ||
899 | config MATH_EMULATION | 903 | config MATH_EMULATION |
900 | bool | 904 | bool |
901 | prompt "Math emulation" if X86_32 | 905 | prompt "Math emulation" if X86_32 |
902 | ---help--- | 906 | ---help--- |
903 | Linux can emulate a math coprocessor (used for floating point | 907 | Linux can emulate a math coprocessor (used for floating point |
904 | operations) if you don't have one. 486DX and Pentium processors have | 908 | operations) if you don't have one. 486DX and Pentium processors have |
905 | a math coprocessor built in, 486SX and 386 do not, unless you added | 909 | a math coprocessor built in, 486SX and 386 do not, unless you added |
906 | a 487DX or 387, respectively. (The messages during boot time can | 910 | a 487DX or 387, respectively. (The messages during boot time can |
907 | give you some hints here ["man dmesg"].) Everyone needs either a | 911 | give you some hints here ["man dmesg"].) Everyone needs either a |
908 | coprocessor or this emulation. | 912 | coprocessor or this emulation. |
909 | 913 | ||
910 | If you don't have a math coprocessor, you need to say Y here; if you | 914 | If you don't have a math coprocessor, you need to say Y here; if you |
911 | say Y here even though you have a coprocessor, the coprocessor will | 915 | say Y here even though you have a coprocessor, the coprocessor will |
912 | be used nevertheless. (This behavior can be changed with the kernel | 916 | be used nevertheless. (This behavior can be changed with the kernel |
913 | command line option "no387", which comes in handy if your coprocessor | 917 | command line option "no387", which comes in handy if your coprocessor |
914 | is broken. Try "man bootparam" or see the documentation of your boot | 918 | is broken. Try "man bootparam" or see the documentation of your boot |
915 | loader (lilo or loadlin) about how to pass options to the kernel at | 919 | loader (lilo or loadlin) about how to pass options to the kernel at |
916 | boot time.) This means that it is a good idea to say Y here if you | 920 | boot time.) This means that it is a good idea to say Y here if you |
917 | intend to use this kernel on different machines. | 921 | intend to use this kernel on different machines. |
918 | 922 | ||
919 | More information about the internals of the Linux math coprocessor | 923 | More information about the internals of the Linux math coprocessor |
920 | emulation can be found in <file:arch/x86/math-emu/README>. | 924 | emulation can be found in <file:arch/x86/math-emu/README>. |
921 | 925 | ||
922 | If you are not sure, say Y; apart from resulting in a 66 KB bigger | 926 | If you are not sure, say Y; apart from resulting in a 66 KB bigger |
923 | kernel, it won't hurt. | 927 | kernel, it won't hurt. |
924 | 928 | ||
925 | config MTRR | 929 | config MTRR |
926 | bool "MTRR (Memory Type Range Register) support" | 930 | bool "MTRR (Memory Type Range Register) support" |
927 | ---help--- | 931 | ---help--- |
928 | On Intel P6 family processors (Pentium Pro, Pentium II and later) | 932 | On Intel P6 family processors (Pentium Pro, Pentium II and later) |
929 | the Memory Type Range Registers (MTRRs) may be used to control | 933 | the Memory Type Range Registers (MTRRs) may be used to control |
930 | processor access to memory ranges. This is most useful if you have | 934 | processor access to memory ranges. This is most useful if you have |
931 | a video (VGA) card on a PCI or AGP bus. Enabling write-combining | 935 | a video (VGA) card on a PCI or AGP bus. Enabling write-combining |
932 | allows bus write transfers to be combined into a larger transfer | 936 | allows bus write transfers to be combined into a larger transfer |
933 | before bursting over the PCI/AGP bus. This can increase performance | 937 | before bursting over the PCI/AGP bus. This can increase performance |
934 | of image write operations 2.5 times or more. Saying Y here creates a | 938 | of image write operations 2.5 times or more. Saying Y here creates a |
935 | /proc/mtrr file which may be used to manipulate your processor's | 939 | /proc/mtrr file which may be used to manipulate your processor's |
936 | MTRRs. Typically the X server should use this. | 940 | MTRRs. Typically the X server should use this. |
937 | 941 | ||
938 | This code has a reasonably generic interface so that similar | 942 | This code has a reasonably generic interface so that similar |
939 | control registers on other processors can be easily supported | 943 | control registers on other processors can be easily supported |
940 | as well: | 944 | as well: |
941 | 945 | ||
942 | The Cyrix 6x86, 6x86MX and M II processors have Address Range | 946 | The Cyrix 6x86, 6x86MX and M II processors have Address Range |
943 | Registers (ARRs) which provide a similar functionality to MTRRs. For | 947 | Registers (ARRs) which provide a similar functionality to MTRRs. For |
944 | these, the ARRs are used to emulate the MTRRs. | 948 | these, the ARRs are used to emulate the MTRRs. |
945 | The AMD K6-2 (stepping 8 and above) and K6-3 processors have two | 949 | The AMD K6-2 (stepping 8 and above) and K6-3 processors have two |
946 | MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing | 950 | MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing |
947 | write-combining. All of these processors are supported by this code | 951 | write-combining. All of these processors are supported by this code |
948 | and it makes sense to say Y here if you have one of them. | 952 | and it makes sense to say Y here if you have one of them. |
949 | 953 | ||
950 | Saying Y here also fixes a problem with buggy SMP BIOSes which only | 954 | Saying Y here also fixes a problem with buggy SMP BIOSes which only |
951 | set the MTRRs for the boot CPU and not for the secondary CPUs. This | 955 | set the MTRRs for the boot CPU and not for the secondary CPUs. This |
952 | can lead to all sorts of problems, so it's good to say Y here. | 956 | can lead to all sorts of problems, so it's good to say Y here. |
953 | 957 | ||
954 | You can safely say Y even if your machine doesn't have MTRRs, you'll | 958 | You can safely say Y even if your machine doesn't have MTRRs, you'll |
955 | just add about 9 KB to your kernel. | 959 | just add about 9 KB to your kernel. |
956 | 960 | ||
957 | See <file:Documentation/mtrr.txt> for more information. | 961 | See <file:Documentation/mtrr.txt> for more information. |
958 | 962 | ||
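A hedged sketch of the /proc/mtrr interface mentioned above (the request format follows Documentation/mtrr.txt; the base/size values are a made-up example for a 4 MB framebuffer and are not part of this commit):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical 4 MB framebuffer at 0xf8000000. */
	const char *req = "base=0xf8000000 size=0x400000 type=write-combining\n";
	int fd = open("/proc/mtrr", O_WRONLY);

	if (fd < 0) {
		perror("/proc/mtrr");
		return 1;
	}
	if (write(fd, req, strlen(req)) < 0)
		perror("write");
	close(fd);
	return 0;
}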
959 | config EFI | 963 | config EFI |
960 | def_bool n | 964 | def_bool n |
961 | prompt "Boot from EFI support" | 965 | prompt "Boot from EFI support" |
962 | depends on ACPI | 966 | depends on ACPI |
963 | ---help--- | 967 | ---help--- |
964 | This enables the kernel to boot on EFI platforms using | 968 | This enables the kernel to boot on EFI platforms using |
965 | system configuration information passed to it from the firmware. | 969 | system configuration information passed to it from the firmware. |
966 | This also enables the kernel to use any EFI runtime services that are | 970 | This also enables the kernel to use any EFI runtime services that are |
967 | available (such as the EFI variable services). | 971 | available (such as the EFI variable services). |
968 | 972 | ||
969 | This option is only useful on systems that have EFI firmware | 973 | This option is only useful on systems that have EFI firmware |
970 | and will result in a kernel image that is ~8k larger. In addition, | 974 | and will result in a kernel image that is ~8k larger. In addition, |
971 | you must use the latest ELILO loader available at | 975 | you must use the latest ELILO loader available at |
972 | <http://elilo.sourceforge.net> in order to take advantage of | 976 | <http://elilo.sourceforge.net> in order to take advantage of |
973 | kernel initialization using EFI information (neither GRUB nor LILO know | 977 | kernel initialization using EFI information (neither GRUB nor LILO know |
974 | anything about EFI). However, even with this option, the resultant | 978 | anything about EFI). However, even with this option, the resultant |
975 | kernel should continue to boot on existing non-EFI platforms. | 979 | kernel should continue to boot on existing non-EFI platforms. |
976 | 980 | ||
977 | config IRQBALANCE | 981 | config IRQBALANCE |
978 | def_bool y | 982 | def_bool y |
979 | prompt "Enable kernel irq balancing" | 983 | prompt "Enable kernel irq balancing" |
980 | depends on X86_32 && SMP && X86_IO_APIC | 984 | depends on X86_32 && SMP && X86_IO_APIC |
981 | help | 985 | help |
982 | The default yes will allow the kernel to do irq load balancing. | 986 | The default yes will allow the kernel to do irq load balancing. |
983 | Saying no will keep the kernel from doing irq load balancing. | 987 | Saying no will keep the kernel from doing irq load balancing. |
984 | 988 | ||
985 | # turning this on wastes a bunch of space. | 989 | # turning this on wastes a bunch of space. |
986 | # Summit needs it only when NUMA is on | 990 | # Summit needs it only when NUMA is on |
987 | config BOOT_IOREMAP | 991 | config BOOT_IOREMAP |
988 | def_bool y | 992 | def_bool y |
989 | depends on X86_32 && (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) | 993 | depends on X86_32 && (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) |
990 | 994 | ||
991 | config SECCOMP | 995 | config SECCOMP |
992 | def_bool y | 996 | def_bool y |
993 | prompt "Enable seccomp to safely compute untrusted bytecode" | 997 | prompt "Enable seccomp to safely compute untrusted bytecode" |
994 | depends on PROC_FS | 998 | depends on PROC_FS |
995 | help | 999 | help |
996 | This kernel feature is useful for number crunching applications | 1000 | This kernel feature is useful for number crunching applications |
997 | that may need to compute untrusted bytecode during their | 1001 | that may need to compute untrusted bytecode during their |
998 | execution. By using pipes or other transports made available to | 1002 | execution. By using pipes or other transports made available to |
999 | the process as file descriptors supporting the read/write | 1003 | the process as file descriptors supporting the read/write |
1000 | syscalls, it's possible to isolate those applications in | 1004 | syscalls, it's possible to isolate those applications in |
1001 | their own address space using seccomp. Once seccomp is | 1005 | their own address space using seccomp. Once seccomp is |
1002 | enabled via /proc/<pid>/seccomp, it cannot be disabled | 1006 | enabled via /proc/<pid>/seccomp, it cannot be disabled |
1003 | and the task is only allowed to execute a few safe syscalls | 1007 | and the task is only allowed to execute a few safe syscalls |
1004 | defined by each seccomp mode. | 1008 | defined by each seccomp mode. |
1005 | 1009 | ||
1006 | If unsure, say Y. Only embedded should say N here. | 1010 | If unsure, say Y. Only embedded should say N here. |
1007 | 1011 | ||
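A minimal sketch of the /proc/<pid>/seccomp interface described above (illustrative, not part of this commit): a task writes "1" to its own seccomp file, after which only read, write, exit and sigreturn are allowed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/self/seccomp", O_WRONLY);

	if (fd < 0) {
		perror("/proc/self/seccomp");
		return 1;
	}
	/* From the moment this write succeeds, only read(), write(),
	 * exit() and sigreturn() are allowed; anything else is fatal. */
	if (write(fd, "1", 1) != 1) {
		perror("write");
		return 1;
	}
	/* ... untrusted number crunching over already-open pipe fds ... */
	write(STDOUT_FILENO, "done\n", 5);
	syscall(SYS_exit, 0);	/* even close(fd) would kill the task now */
	return 0;		/* not reached */
}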
1008 | config CC_STACKPROTECTOR | 1012 | config CC_STACKPROTECTOR |
1009 | bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" | 1013 | bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" |
1010 | depends on X86_64 && EXPERIMENTAL | 1014 | depends on X86_64 && EXPERIMENTAL |
1011 | help | 1015 | help |
1012 | This option turns on the -fstack-protector GCC feature. This | 1016 | This option turns on the -fstack-protector GCC feature. This |
1013 | feature puts, at the beginning of critical functions, a canary | 1017 | feature puts, at the beginning of critical functions, a canary |
1014 | value on the stack just before the return address, and validates | 1018 | value on the stack just before the return address, and validates |
1015 | the value just before actually returning. Stack based buffer | 1019 | the value just before actually returning. Stack based buffer |
1016 | overflows (that need to overwrite this return address) now also | 1020 | overflows (that need to overwrite this return address) now also |
1017 | overwrite the canary, which gets detected and the attack is then | 1021 | overwrite the canary, which gets detected and the attack is then |
1018 | neutralized via a kernel panic. | 1022 | neutralized via a kernel panic. |
1019 | 1023 | ||
1020 | This feature requires gcc version 4.2 or above, or a distribution | 1024 | This feature requires gcc version 4.2 or above, or a distribution |
1021 | gcc with the feature backported. Older versions are automatically | 1025 | gcc with the feature backported. Older versions are automatically |
1022 | detected and for those versions, this configuration option is ignored. | 1026 | detected and for those versions, this configuration option is ignored. |
1023 | 1027 | ||
1024 | config CC_STACKPROTECTOR_ALL | 1028 | config CC_STACKPROTECTOR_ALL |
1025 | bool "Use stack-protector for all functions" | 1029 | bool "Use stack-protector for all functions" |
1026 | depends on CC_STACKPROTECTOR | 1030 | depends on CC_STACKPROTECTOR |
1027 | help | 1031 | help |
1028 | Normally, GCC only inserts the canary value protection for | 1032 | Normally, GCC only inserts the canary value protection for |
1029 | functions that use large-ish on-stack buffers. By enabling | 1033 | functions that use large-ish on-stack buffers. By enabling |
1030 | this option, GCC will be asked to do this for ALL functions. | 1034 | this option, GCC will be asked to do this for ALL functions. |
1031 | 1035 | ||
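A toy example of the pattern -fstack-protector instruments (illustrative userspace code, not from the kernel; built with something like "gcc -c -fstack-protector demo.c"). The compiler places a canary between the buffer and the saved return address and verifies it before returning; in the kernel the failure path is the panic described above.

#include <string.h>

/* A function with a "large-ish" on-stack buffer: exactly the shape of
 * function -fstack-protector instruments with a canary check. */
void greet(char *out, const char *name)
{
	char buf[64];

	strcpy(buf, "hello ");
	strcat(buf, name);	/* unchecked copy: can overflow buf */
	strcpy(out, buf);
}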
1032 | source kernel/Kconfig.hz | 1036 | source kernel/Kconfig.hz |
1033 | 1037 | ||
1034 | config KEXEC | 1038 | config KEXEC |
1035 | bool "kexec system call" | 1039 | bool "kexec system call" |
1036 | help | 1040 | help |
1037 | kexec is a system call that implements the ability to shut down your | 1041 | kexec is a system call that implements the ability to shut down your |
1038 | current kernel, and to start another kernel. It is like a reboot | 1042 | current kernel, and to start another kernel. It is like a reboot |
1039 | but it is independent of the system firmware. And like a reboot | 1043 | but it is independent of the system firmware. And like a reboot |
1040 | you can start any kernel with it, not just Linux. | 1044 | you can start any kernel with it, not just Linux. |
1041 | 1045 | ||
1042 | The name comes from the similarity to the exec system call. | 1046 | The name comes from the similarity to the exec system call. |
1043 | 1047 | ||
1044 | It is an ongoing process to be certain the hardware in a machine | 1048 | It is an ongoing process to be certain the hardware in a machine |
1045 | is properly shut down, so do not be surprised if this code does not | 1049 | is properly shut down, so do not be surprised if this code does not |
1046 | initially work for you. It may help to enable device hotplugging | 1050 | initially work for you. It may help to enable device hotplugging |
1047 | support. As of this writing the exact hardware interface is | 1051 | support. As of this writing the exact hardware interface is |
1048 | strongly in flux, so no good recommendation can be made. | 1052 | strongly in flux, so no good recommendation can be made. |
1049 | 1053 | ||
1050 | config CRASH_DUMP | 1054 | config CRASH_DUMP |
1051 | bool "kernel crash dumps (EXPERIMENTAL)" | 1055 | bool "kernel crash dumps (EXPERIMENTAL)" |
1052 | depends on EXPERIMENTAL | 1056 | depends on EXPERIMENTAL |
1053 | depends on X86_64 || (X86_32 && HIGHMEM) | 1057 | depends on X86_64 || (X86_32 && HIGHMEM) |
1054 | help | 1058 | help |
1055 | Generate crash dump after being started by kexec. | 1059 | Generate crash dump after being started by kexec. |
1056 | This should normally only be set in special crash dump kernels | 1060 | This should normally only be set in special crash dump kernels |
1057 | which are loaded in the main kernel with kexec-tools into | 1061 | which are loaded in the main kernel with kexec-tools into |
1058 | a specially reserved region and then later executed after | 1062 | a specially reserved region and then later executed after |
1059 | a crash by kdump/kexec. The crash dump kernel must be compiled | 1063 | a crash by kdump/kexec. The crash dump kernel must be compiled |
1060 | to a memory address not used by the main kernel or BIOS using | 1064 | to a memory address not used by the main kernel or BIOS using |
1061 | PHYSICAL_START, or it must be built as a relocatable image | 1065 | PHYSICAL_START, or it must be built as a relocatable image |
1062 | (CONFIG_RELOCATABLE=y). | 1066 | (CONFIG_RELOCATABLE=y). |
1063 | For more details see Documentation/kdump/kdump.txt | 1067 | For more details see Documentation/kdump/kdump.txt |
1064 | 1068 | ||
1065 | config PHYSICAL_START | 1069 | config PHYSICAL_START |
1066 | hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) | 1070 | hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) |
1067 | default "0x1000000" if X86_NUMAQ | 1071 | default "0x1000000" if X86_NUMAQ |
1068 | default "0x200000" if X86_64 | 1072 | default "0x200000" if X86_64 |
1069 | default "0x100000" | 1073 | default "0x100000" |
1070 | help | 1074 | help |
1071 | This gives the physical address where the kernel is loaded. | 1075 | This gives the physical address where the kernel is loaded. |
1072 | 1076 | ||
1073 | If the kernel is not relocatable (CONFIG_RELOCATABLE=n), then | 1077 | If the kernel is not relocatable (CONFIG_RELOCATABLE=n), then |
1074 | bzImage will decompress itself to the above physical address and | 1078 | bzImage will decompress itself to the above physical address and |
1075 | run from there. Otherwise, bzImage will run from the address where | 1079 | run from there. Otherwise, bzImage will run from the address where |
1076 | it has been loaded by the boot loader and will ignore the above physical | 1080 | it has been loaded by the boot loader and will ignore the above physical |
1077 | address. | 1081 | address. |
1078 | 1082 | ||
1079 | In normal kdump cases one does not have to set/change this option | 1083 | In normal kdump cases one does not have to set/change this option |
1080 | as now bzImage can be compiled as a completely relocatable image | 1084 | as now bzImage can be compiled as a completely relocatable image |
1081 | (CONFIG_RELOCATABLE=y) and be used to load and run from a different | 1085 | (CONFIG_RELOCATABLE=y) and be used to load and run from a different |
1082 | address. This option is mainly useful for the folks who don't want | 1086 | address. This option is mainly useful for the folks who don't want |
1083 | to use a bzImage for capturing the crash dump and want to use a | 1087 | to use a bzImage for capturing the crash dump and want to use a |
1084 | vmlinux instead. vmlinux is not relocatable, hence a kernel needs | 1088 | vmlinux instead. vmlinux is not relocatable, hence a kernel needs |
1085 | to be compiled specifically to run from a particular memory area | 1089 | to be compiled specifically to run from a particular memory area |
1086 | (normally a reserved region), and this is where this option comes in handy. | 1090 | (normally a reserved region), and this is where this option comes in handy. |
1087 | 1091 | ||
1088 | So if you are using bzImage for capturing the crash dump, leave | 1092 | So if you are using bzImage for capturing the crash dump, leave |
1089 | the value here unchanged at 0x100000 and set CONFIG_RELOCATABLE=y. | 1093 | the value here unchanged at 0x100000 and set CONFIG_RELOCATABLE=y. |
1090 | Otherwise, if you plan to use vmlinux for capturing the crash dump, | 1094 | Otherwise, if you plan to use vmlinux for capturing the crash dump, |
1091 | change this value to the start of the reserved region (typically 16MB, | 1095 | change this value to the start of the reserved region (typically 16MB, |
1092 | 0x1000000). In other words, it can be set based on the "X" value as | 1096 | 0x1000000). In other words, it can be set based on the "X" value as |
1093 | specified in the "crashkernel=YM@XM" command line boot parameter | 1097 | specified in the "crashkernel=YM@XM" command line boot parameter |
1094 | passed to the panicked kernel. Typically this parameter is set as | 1098 | passed to the panicked kernel. Typically this parameter is set as |
1095 | crashkernel=64M@16M. Please take a look at | 1099 | crashkernel=64M@16M. Please take a look at |
1096 | Documentation/kdump/kdump.txt for more details about crash dumps. | 1100 | Documentation/kdump/kdump.txt for more details about crash dumps. |
1097 | 1101 | ||
1098 | Using bzImage for capturing the crash dump is recommended, as | 1102 | Using bzImage for capturing the crash dump is recommended, as |
1099 | one does not have to build two kernels. The same kernel can be used | 1103 | one does not have to build two kernels. The same kernel can be used |
1100 | as both production kernel and capture kernel. The option above should have | 1104 | as both production kernel and capture kernel. The option above should have |
1101 | gone away once relocatable bzImage support was introduced. But it | 1105 | gone away once relocatable bzImage support was introduced. But it |
1102 | is present because there are users out there who continue to use | 1106 | is present because there are users out there who continue to use |
1103 | vmlinux for dump capture. This option should go away down the | 1107 | vmlinux for dump capture. This option should go away down the |
1104 | line. | 1108 | line. |
1105 | 1109 | ||
1106 | Don't change this unless you know what you are doing. | 1110 | Don't change this unless you know what you are doing. |
1107 | 1111 | ||
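Spelling out the arithmetic in the crashkernel example above (nothing new, just the numbers):

    crashkernel=64M@16M  ->  64 MB reserved starting at the 16 MB mark
    16 MB = 16 * 0x100000 = 0x1000000  ->  PHYSICAL_START = 0x1000000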
1108 | config RELOCATABLE | 1112 | config RELOCATABLE |
1109 | bool "Build a relocatable kernel (EXPERIMENTAL)" | 1113 | bool "Build a relocatable kernel (EXPERIMENTAL)" |
1110 | depends on EXPERIMENTAL | 1114 | depends on EXPERIMENTAL |
1111 | help | 1115 | help |
1112 | This builds a kernel image that retains relocation information | 1116 | This builds a kernel image that retains relocation information |
1113 | so it can be loaded someplace besides the default 1MB. | 1117 | so it can be loaded someplace besides the default 1MB. |
1114 | The relocations tend to make the kernel binary about 10% larger, | 1118 | The relocations tend to make the kernel binary about 10% larger, |
1115 | but are discarded at runtime. | 1119 | but are discarded at runtime. |
1116 | 1120 | ||
1117 | One use is for the kexec on panic case where the recovery kernel | 1121 | One use is for the kexec on panic case where the recovery kernel |
1118 | must live at a different physical address than the primary | 1122 | must live at a different physical address than the primary |
1119 | kernel. | 1123 | kernel. |
1120 | 1124 | ||
1121 | Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address | 1125 | Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address |
1122 | it has been loaded at and the compile time physical address | 1126 | it has been loaded at and the compile time physical address |
1123 | (CONFIG_PHYSICAL_START) is ignored. | 1127 | (CONFIG_PHYSICAL_START) is ignored. |
1124 | 1128 | ||
1125 | config PHYSICAL_ALIGN | 1129 | config PHYSICAL_ALIGN |
1126 | hex | 1130 | hex |
1127 | prompt "Alignment value to which kernel should be aligned" if X86_32 | 1131 | prompt "Alignment value to which kernel should be aligned" if X86_32 |
1128 | default "0x100000" if X86_32 | 1132 | default "0x100000" if X86_32 |
1129 | default "0x200000" if X86_64 | 1133 | default "0x200000" if X86_64 |
1130 | range 0x2000 0x400000 | 1134 | range 0x2000 0x400000 |
1131 | help | 1135 | help |
1132 | This value puts alignment restrictions on the physical address | 1136 | This value puts alignment restrictions on the physical address |
1133 | where the kernel is loaded and run from. The kernel is compiled for an | 1137 | where the kernel is loaded and run from. The kernel is compiled for an |
1134 | address which meets the above alignment restriction. | 1138 | address which meets the above alignment restriction. |
1135 | 1139 | ||
1136 | If the bootloader loads the kernel at a non-aligned address and | 1140 | If the bootloader loads the kernel at a non-aligned address and |
1137 | CONFIG_RELOCATABLE is set, the kernel will move itself to the nearest | 1141 | CONFIG_RELOCATABLE is set, the kernel will move itself to the nearest |
1138 | address aligned to the above value and run from there. | 1142 | address aligned to the above value and run from there. |
1139 | 1143 | ||
1140 | If the bootloader loads the kernel at a non-aligned address and | 1144 | If the bootloader loads the kernel at a non-aligned address and |
1141 | CONFIG_RELOCATABLE is not set, the kernel will ignore the run time | 1145 | CONFIG_RELOCATABLE is not set, the kernel will ignore the run time |
1142 | load address and decompress itself to the address it has been | 1146 | load address and decompress itself to the address it has been |
1143 | compiled for and run from there. The address for which the kernel is | 1147 | compiled for and run from there. The address for which the kernel is |
1144 | compiled already meets the above alignment restrictions. Hence the | 1148 | compiled already meets the above alignment restrictions. Hence the |
1145 | end result is that the kernel runs from a physical address meeting | 1149 | end result is that the kernel runs from a physical address meeting |
1146 | the above alignment restrictions. | 1150 | the above alignment restrictions. |
1147 | 1151 | ||
1148 | Don't change this unless you know what you are doing. | 1152 | Don't change this unless you know what you are doing. |
1149 | 1153 | ||
1150 | config HOTPLUG_CPU | 1154 | config HOTPLUG_CPU |
1151 | bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)" | 1155 | bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)" |
1152 | depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER | 1156 | depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER |
1153 | ---help--- | 1157 | ---help--- |
1154 | Say Y here to experiment with turning CPUs off and on, and to | 1158 | Say Y here to experiment with turning CPUs off and on, and to |
1155 | enable suspend on SMP systems. CPUs can be controlled through | 1159 | enable suspend on SMP systems. CPUs can be controlled through |
1156 | /sys/devices/system/cpu. | 1160 | /sys/devices/system/cpu. |
1157 | Say N if you want to disable CPU hotplug and don't need to | 1161 | Say N if you want to disable CPU hotplug and don't need to |
1158 | suspend. | 1162 | suspend. |
1159 | 1163 | ||
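A hedged sketch of the sysfs control path mentioned above (the per-CPU "online" attribute is the usual way to do this; the CPU number is an arbitrary example and this code is not part of this commit):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/devices/system/cpu/cpu1/online", O_WRONLY);

	if (fd < 0) {
		perror("cpu1/online");
		return 1;
	}
	if (write(fd, "0", 1) != 1)	/* write "1" to bring it back online */
		perror("write");
	close(fd);
	return 0;
}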
1160 | config COMPAT_VDSO | 1164 | config COMPAT_VDSO |
1161 | def_bool y | 1165 | def_bool y |
1162 | prompt "Compat VDSO support" | 1166 | prompt "Compat VDSO support" |
1163 | depends on X86_32 || IA32_EMULATION | 1167 | depends on X86_32 || IA32_EMULATION |
1164 | help | 1168 | help |
1165 | Map the 32-bit VDSO to the predictable old-style address too. | 1169 | Map the 32-bit VDSO to the predictable old-style address too. |
1166 | 1170 ||
1167 | Say N here if you are running a sufficiently recent glibc | 1171 | Say N here if you are running a sufficiently recent glibc |
1168 | version (2.3.3 or later), to remove the high-mapped | 1172 | version (2.3.3 or later), to remove the high-mapped |
1169 | VDSO mapping and to exclusively use the randomized VDSO. | 1173 | VDSO mapping and to exclusively use the randomized VDSO. |
1170 | 1174 | ||
1171 | If unsure, say Y. | 1175 | If unsure, say Y. |
1172 | 1176 | ||
1173 | endmenu | 1177 | endmenu |
1174 | 1178 | ||
1175 | config ARCH_ENABLE_MEMORY_HOTPLUG | 1179 | config ARCH_ENABLE_MEMORY_HOTPLUG |
1176 | def_bool y | 1180 | def_bool y |
1177 | depends on X86_64 || (X86_32 && HIGHMEM) | 1181 | depends on X86_64 || (X86_32 && HIGHMEM) |
1178 | 1182 | ||
1179 | config HAVE_ARCH_EARLY_PFN_TO_NID | 1183 | config HAVE_ARCH_EARLY_PFN_TO_NID |
1180 | def_bool X86_64 | 1184 | def_bool X86_64 |
1181 | depends on NUMA | 1185 | depends on NUMA |
1182 | 1186 | ||
1183 | menu "Power management options" | 1187 | menu "Power management options" |
1184 | depends on !X86_VOYAGER | 1188 | depends on !X86_VOYAGER |
1185 | 1189 | ||
1186 | config ARCH_HIBERNATION_HEADER | 1190 | config ARCH_HIBERNATION_HEADER |
1187 | def_bool y | 1191 | def_bool y |
1188 | depends on X86_64 && HIBERNATION | 1192 | depends on X86_64 && HIBERNATION |
1189 | 1193 | ||
1190 | source "kernel/power/Kconfig" | 1194 | source "kernel/power/Kconfig" |
1191 | 1195 | ||
1192 | source "drivers/acpi/Kconfig" | 1196 | source "drivers/acpi/Kconfig" |
1193 | 1197 | ||
1194 | menuconfig APM | 1198 | menuconfig APM |
1195 | tristate "APM (Advanced Power Management) BIOS support" | 1199 | tristate "APM (Advanced Power Management) BIOS support" |
1196 | depends on X86_32 && PM_SLEEP && !X86_VISWS | 1200 | depends on X86_32 && PM_SLEEP && !X86_VISWS |
1197 | ---help--- | 1201 | ---help--- |
1198 | APM is a BIOS specification for saving power using several different | 1202 | APM is a BIOS specification for saving power using several different |
1199 | techniques. This is mostly useful for battery powered laptops with | 1203 | techniques. This is mostly useful for battery powered laptops with |
1200 | APM compliant BIOSes. If you say Y here, the system time will be | 1204 | APM compliant BIOSes. If you say Y here, the system time will be |
1201 | reset after a RESUME operation, the /proc/apm device will provide | 1205 | reset after a RESUME operation, the /proc/apm device will provide |
1202 | battery status information, and user-space programs will receive | 1206 | battery status information, and user-space programs will receive |
1203 | notification of APM "events" (e.g. battery status change). | 1207 | notification of APM "events" (e.g. battery status change). |
1204 | 1208 | ||
1205 | If you select "Y" here, you can disable actual use of the APM | 1209 | If you select "Y" here, you can disable actual use of the APM |
1206 | BIOS by passing the "apm=off" option to the kernel at boot time. | 1210 | BIOS by passing the "apm=off" option to the kernel at boot time. |
1207 | 1211 | ||
1208 | Note that the APM support is almost completely disabled for | 1212 | Note that the APM support is almost completely disabled for |
1209 | machines with more than one CPU. | 1213 | machines with more than one CPU. |
1210 | 1214 | ||
1211 | In order to use APM, you will need supporting software. For location | 1215 | In order to use APM, you will need supporting software. For location |
1212 | and more information, read <file:Documentation/pm.txt> and the | 1216 | and more information, read <file:Documentation/pm.txt> and the |
1213 | Battery Powered Linux mini-HOWTO, available from | 1217 | Battery Powered Linux mini-HOWTO, available from |
1214 | <http://www.tldp.org/docs.html#howto>. | 1218 | <http://www.tldp.org/docs.html#howto>. |
1215 | 1219 | ||
1216 | This driver does not spin down disk drives (see the hdparm(8) | 1220 | This driver does not spin down disk drives (see the hdparm(8) |
1217 | manpage ("man 8 hdparm") for that), and it doesn't turn off | 1221 | manpage ("man 8 hdparm") for that), and it doesn't turn off |
1218 | VESA-compliant "green" monitors. | 1222 | VESA-compliant "green" monitors. |
1219 | 1223 | ||
1220 | This driver does not support the TI 4000M TravelMate and the ACER | 1224 | This driver does not support the TI 4000M TravelMate and the ACER |
1221 | 486/DX4/75 because they don't have compliant BIOSes. Many "green" | 1225 | 486/DX4/75 because they don't have compliant BIOSes. Many "green" |
1222 | desktop machines also don't have compliant BIOSes, and this driver | 1226 | desktop machines also don't have compliant BIOSes, and this driver |
1223 | may cause those machines to panic during the boot phase. | 1227 | may cause those machines to panic during the boot phase. |
1224 | 1228 | ||
1225 | Generally, if you don't have a battery in your machine, there isn't | 1229 | Generally, if you don't have a battery in your machine, there isn't |
1226 | much point in using this driver and you should say N. If you get | 1230 | much point in using this driver and you should say N. If you get |
1227 | random kernel OOPSes or reboots that don't seem to be related to | 1231 | random kernel OOPSes or reboots that don't seem to be related to |
1228 | anything, try disabling/enabling this option (or disabling/enabling | 1232 | anything, try disabling/enabling this option (or disabling/enabling |
1229 | APM in your BIOS). | 1233 | APM in your BIOS). |
1230 | 1234 | ||
1231 | Some other things you should try when experiencing seemingly random, | 1235 | Some other things you should try when experiencing seemingly random, |
1232 | "weird" problems: | 1236 | "weird" problems: |
1233 | 1237 | ||
1234 | 1) make sure that you have enough swap space and that it is | 1238 | 1) make sure that you have enough swap space and that it is |
1235 | enabled. | 1239 | enabled. |
1236 | 2) pass the "no-hlt" option to the kernel | 1240 | 2) pass the "no-hlt" option to the kernel |
1237 | 3) switch on floating point emulation in the kernel and pass | 1241 | 3) switch on floating point emulation in the kernel and pass |
1238 | the "no387" option to the kernel | 1242 | the "no387" option to the kernel |
1239 | 4) pass the "floppy=nodma" option to the kernel | 1243 | 4) pass the "floppy=nodma" option to the kernel |
1240 | 5) pass the "mem=4M" option to the kernel (thereby disabling | 1244 | 5) pass the "mem=4M" option to the kernel (thereby disabling |
1241 | all but the first 4 MB of RAM) | 1245 | all but the first 4 MB of RAM) |
1242 | 6) make sure that the CPU is not overclocked. | 1246 | 6) make sure that the CPU is not overclocked. |
1243 | 7) read the sig11 FAQ at <http://www.bitwizard.nl/sig11/> | 1247 | 7) read the sig11 FAQ at <http://www.bitwizard.nl/sig11/> |
1244 | 8) disable the cache from your BIOS settings | 1248 | 8) disable the cache from your BIOS settings |
1245 | 9) install a fan for the video card or exchange video RAM | 1249 | 9) install a fan for the video card or exchange video RAM |
1246 | 10) install a better fan for the CPU | 1250 | 10) install a better fan for the CPU |
1247 | 11) exchange RAM chips | 1251 | 11) exchange RAM chips |
1248 | 12) exchange the motherboard. | 1252 | 12) exchange the motherboard. |
1249 | 1253 | ||
1250 | To compile this driver as a module, choose M here: the | 1254 | To compile this driver as a module, choose M here: the |
1251 | module will be called apm. | 1255 | module will be called apm. |
1252 | 1256 | ||
1253 | if APM | 1257 | if APM |
1254 | 1258 | ||
1255 | config APM_IGNORE_USER_SUSPEND | 1259 | config APM_IGNORE_USER_SUSPEND |
1256 | bool "Ignore USER SUSPEND" | 1260 | bool "Ignore USER SUSPEND" |
1257 | help | 1261 | help |
1258 | This option will ignore USER SUSPEND requests. On machines with a | 1262 | This option will ignore USER SUSPEND requests. On machines with a |
1259 | compliant APM BIOS, you want to say N. However, on the NEC Versa M | 1263 | compliant APM BIOS, you want to say N. However, on the NEC Versa M |
1260 | series notebooks, it is necessary to say Y because of a BIOS bug. | 1264 | series notebooks, it is necessary to say Y because of a BIOS bug. |
1261 | 1265 | ||
1262 | config APM_DO_ENABLE | 1266 | config APM_DO_ENABLE |
1263 | bool "Enable PM at boot time" | 1267 | bool "Enable PM at boot time" |
1264 | ---help--- | 1268 | ---help--- |
1265 | Enable APM features at boot time. From page 36 of the APM BIOS | 1269 | Enable APM features at boot time. From page 36 of the APM BIOS |
1266 | specification: "When disabled, the APM BIOS does not automatically | 1270 | specification: "When disabled, the APM BIOS does not automatically |
1267 | power manage devices, enter the Standby State, enter the Suspend | 1271 | power manage devices, enter the Standby State, enter the Suspend |
1268 | State, or take power saving steps in response to CPU Idle calls." | 1272 | State, or take power saving steps in response to CPU Idle calls." |
1269 | This driver will make CPU Idle calls when Linux is idle (unless this | 1273 | This driver will make CPU Idle calls when Linux is idle (unless this |
1270 | feature is turned off -- see "Do CPU IDLE calls", below). This | 1274 | feature is turned off -- see "Do CPU IDLE calls", below). This |
1271 | should always save battery power, but more complicated APM features | 1275 | should always save battery power, but more complicated APM features |
1272 | will be dependent on your BIOS implementation. You may need to turn | 1276 | will be dependent on your BIOS implementation. You may need to turn |
1273 | this option off if your computer hangs at boot time when using APM | 1277 | this option off if your computer hangs at boot time when using APM |
1274 | support, or if it beeps continuously instead of suspending. Turn | 1278 | support, or if it beeps continuously instead of suspending. Turn |
1275 | this off if you have a NEC UltraLite Versa 33/C or a Toshiba | 1279 | this off if you have a NEC UltraLite Versa 33/C or a Toshiba |
1276 | T400CDT. This is off by default since most machines do fine without | 1280 | T400CDT. This is off by default since most machines do fine without |
1277 | this feature. | 1281 | this feature. |
1278 | 1282 | ||
1279 | config APM_CPU_IDLE | 1283 | config APM_CPU_IDLE |
1280 | bool "Make CPU Idle calls when idle" | 1284 | bool "Make CPU Idle calls when idle" |
1281 | help | 1285 | help |
1282 | Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. | 1286 | Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. |
1283 | On some machines, this can activate improved power savings, such as | 1287 | On some machines, this can activate improved power savings, such as |
1284 | a slowed CPU clock rate, when the machine is idle. These idle calls | 1288 | a slowed CPU clock rate, when the machine is idle. These idle calls |
1285 | are made after the idle loop has run for some length of time (e.g., | 1289 | are made after the idle loop has run for some length of time (e.g., |
1286 | 333 ms). On some machines, this will cause a hang at boot time or | 1290 | 333 ms). On some machines, this will cause a hang at boot time or |
1287 | whenever the CPU becomes idle. (On machines with more than one CPU, | 1291 | whenever the CPU becomes idle. (On machines with more than one CPU, |
1288 | this option does nothing.) | 1292 | this option does nothing.) |
1289 | 1293 | ||
1290 | config APM_DISPLAY_BLANK | 1294 | config APM_DISPLAY_BLANK |
1291 | bool "Enable console blanking using APM" | 1295 | bool "Enable console blanking using APM" |
1292 | help | 1296 | help |
1293 | Enable console blanking using the APM. Some laptops can use this to | 1297 | Enable console blanking using the APM. Some laptops can use this to |
1294 | turn off the LCD backlight when the screen blanker of the Linux | 1298 | turn off the LCD backlight when the screen blanker of the Linux |
1295 | virtual console blanks the screen. Note that this is only used by | 1299 | virtual console blanks the screen. Note that this is only used by |
1296 | the virtual console screen blanker, and won't turn off the backlight | 1300 | the virtual console screen blanker, and won't turn off the backlight |
1297 | when using the X Window system. This also doesn't have anything to | 1301 | when using the X Window system. This also doesn't have anything to |
1298 | do with your VESA-compliant power-saving monitor. Further, this | 1302 | do with your VESA-compliant power-saving monitor. Further, this |
1299 | option doesn't work for all laptops -- it might not turn off your | 1303 | option doesn't work for all laptops -- it might not turn off your |
1300 | backlight at all, or it might print a lot of errors to the console, | 1304 | backlight at all, or it might print a lot of errors to the console, |
1301 | especially if you are using gpm. | 1305 | especially if you are using gpm. |
1302 | 1306 | ||
1303 | config APM_ALLOW_INTS | 1307 | config APM_ALLOW_INTS |
1304 | bool "Allow interrupts during APM BIOS calls" | 1308 | bool "Allow interrupts during APM BIOS calls" |
1305 | help | 1309 | help |
1306 | Normally we disable external interrupts while we are making calls to | 1310 | Normally we disable external interrupts while we are making calls to |
1307 | the APM BIOS as a measure to lessen the effects of a badly behaving | 1311 | the APM BIOS as a measure to lessen the effects of a badly behaving |
1308 | BIOS implementation. The BIOS should reenable interrupts if it | 1312 | BIOS implementation. The BIOS should reenable interrupts if it |
1309 | needs to. Unfortunately, some BIOSes do not -- especially those in | 1313 | needs to. Unfortunately, some BIOSes do not -- especially those in |
1310 | many of the newer IBM Thinkpads. If you experience hangs when you | 1314 | many of the newer IBM Thinkpads. If you experience hangs when you |
1311 | suspend, try setting this to Y. Otherwise, say N. | 1315 | suspend, try setting this to Y. Otherwise, say N. |
1312 | 1316 | ||
1313 | config APM_REAL_MODE_POWER_OFF | 1317 | config APM_REAL_MODE_POWER_OFF |
1314 | bool "Use real mode APM BIOS call to power off" | 1318 | bool "Use real mode APM BIOS call to power off" |
1315 | help | 1319 | help |
1316 | Use real mode APM BIOS calls to switch off the computer. This is | 1320 | Use real mode APM BIOS calls to switch off the computer. This is |
1317 | a work-around for a number of buggy BIOSes. Switch this option on if | 1321 | a work-around for a number of buggy BIOSes. Switch this option on if |
1318 | your computer crashes instead of powering off properly. | 1322 | your computer crashes instead of powering off properly. |
1319 | 1323 | ||
1320 | endif # APM | 1324 | endif # APM |
1321 | 1325 | ||
1322 | source "arch/x86/kernel/cpu/cpufreq/Kconfig" | 1326 | source "arch/x86/kernel/cpu/cpufreq/Kconfig" |
1323 | 1327 | ||
1324 | source "drivers/cpuidle/Kconfig" | 1328 | source "drivers/cpuidle/Kconfig" |
1325 | 1329 | ||
1326 | endmenu | 1330 | endmenu |
1327 | 1331 | ||
1328 | 1332 | ||
1329 | menu "Bus options (PCI etc.)" | 1333 | menu "Bus options (PCI etc.)" |
1330 | 1334 | ||
1331 | config PCI | 1335 | config PCI |
1332 | bool "PCI support" if !X86_VISWS | 1336 | bool "PCI support" if !X86_VISWS |
1333 | depends on !X86_VOYAGER | 1337 | depends on !X86_VOYAGER |
1334 | default y if X86_VISWS | 1338 | default y if X86_VISWS |
1335 | select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) | 1339 | select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) |
1336 | help | 1340 | help |
1337 | Find out whether you have a PCI motherboard. PCI is the name of a | 1341 | Find out whether you have a PCI motherboard. PCI is the name of a |
1338 | bus system, i.e. the way the CPU talks to the other stuff inside | 1342 | bus system, i.e. the way the CPU talks to the other stuff inside |
1339 | your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or | 1343 | your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or |
1340 | VESA. If you have PCI, say Y, otherwise N. | 1344 | VESA. If you have PCI, say Y, otherwise N. |
1341 | 1345 | ||
1342 | The PCI-HOWTO, available from | 1346 | The PCI-HOWTO, available from |
1343 | <http://www.tldp.org/docs.html#howto>, contains valuable | 1347 | <http://www.tldp.org/docs.html#howto>, contains valuable |
1344 | information about which PCI hardware does work under Linux and which | 1348 | information about which PCI hardware does work under Linux and which |
1345 | doesn't. | 1349 | doesn't. |
1346 | 1350 | ||
1347 | choice | 1351 | choice |
1348 | prompt "PCI access mode" | 1352 | prompt "PCI access mode" |
1349 | depends on X86_32 && PCI && !X86_VISWS | 1353 | depends on X86_32 && PCI && !X86_VISWS |
1350 | default PCI_GOANY | 1354 | default PCI_GOANY |
1351 | ---help--- | 1355 | ---help--- |
1352 | On PCI systems, the BIOS can be used to detect the PCI devices and | 1356 | On PCI systems, the BIOS can be used to detect the PCI devices and |
1353 | determine their configuration. However, some old PCI motherboards | 1357 | determine their configuration. However, some old PCI motherboards |
1354 | have BIOS bugs and may crash if this is done. Also, some embedded | 1358 | have BIOS bugs and may crash if this is done. Also, some embedded |
1355 | PCI-based systems don't have any BIOS at all. Linux can also try to | 1359 | PCI-based systems don't have any BIOS at all. Linux can also try to |
1356 | detect the PCI hardware directly without using the BIOS. | 1360 | detect the PCI hardware directly without using the BIOS. |
1357 | 1361 | ||
1358 | With this option, you can specify how Linux should detect the | 1362 | With this option, you can specify how Linux should detect the |
1359 | PCI devices. If you choose "BIOS", the BIOS will be used, | 1363 | PCI devices. If you choose "BIOS", the BIOS will be used, |
1360 | if you choose "Direct", the BIOS won't be used, and if you | 1364 | if you choose "Direct", the BIOS won't be used, and if you |
1361 | choose "MMConfig", then PCI Express MMCONFIG will be used. | 1365 | choose "MMConfig", then PCI Express MMCONFIG will be used. |
1362 | If you choose "Any", the kernel will try MMCONFIG, then the | 1366 | If you choose "Any", the kernel will try MMCONFIG, then the |
1363 | direct access method and falls back to the BIOS if that doesn't | 1367 | direct access method and falls back to the BIOS if that doesn't |
1364 | work. If unsure, go with the default, which is "Any". | 1368 | work. If unsure, go with the default, which is "Any". |
1365 | 1369 | ||
1366 | config PCI_GOBIOS | 1370 | config PCI_GOBIOS |
1367 | bool "BIOS" | 1371 | bool "BIOS" |
1368 | 1372 | ||
1369 | config PCI_GOMMCONFIG | 1373 | config PCI_GOMMCONFIG |
1370 | bool "MMConfig" | 1374 | bool "MMConfig" |
1371 | 1375 | ||
1372 | config PCI_GODIRECT | 1376 | config PCI_GODIRECT |
1373 | bool "Direct" | 1377 | bool "Direct" |
1374 | 1378 | ||
1375 | config PCI_GOANY | 1379 | config PCI_GOANY |
1376 | bool "Any" | 1380 | bool "Any" |
1377 | 1381 | ||
1378 | endchoice | 1382 | endchoice |
1379 | 1383 | ||
1380 | config PCI_BIOS | 1384 | config PCI_BIOS |
1381 | def_bool y | 1385 | def_bool y |
1382 | depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) | 1386 | depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) |
1383 | 1387 | ||
1384 | # x86-64 doesn't support PCI BIOS access from long mode so always go direct. | 1388 | # x86-64 doesn't support PCI BIOS access from long mode so always go direct. |
1385 | config PCI_DIRECT | 1389 | config PCI_DIRECT |
1386 | def_bool y | 1390 | def_bool y |
1387 | depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS) | 1391 | depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS) |
1388 | 1392 | ||
1389 | config PCI_MMCONFIG | 1393 | config PCI_MMCONFIG |
1390 | def_bool y | 1394 | def_bool y |
1391 | depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) | 1395 | depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) |
1392 | 1396 | ||
1393 | config PCI_DOMAINS | 1397 | config PCI_DOMAINS |
1394 | def_bool y | 1398 | def_bool y |
1395 | depends on PCI | 1399 | depends on PCI |
1396 | 1400 | ||
1397 | config PCI_MMCONFIG | 1401 | config PCI_MMCONFIG |
1398 | bool "Support mmconfig PCI config space access" | 1402 | bool "Support mmconfig PCI config space access" |
1399 | depends on X86_64 && PCI && ACPI | 1403 | depends on X86_64 && PCI && ACPI |
1400 | 1404 | ||
1401 | config DMAR | 1405 | config DMAR |
1402 | bool "Support for DMA Remapping Devices (EXPERIMENTAL)" | 1406 | bool "Support for DMA Remapping Devices (EXPERIMENTAL)" |
1403 | depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL | 1407 | depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL |
1404 | help | 1408 | help |
1405 | DMA remapping (DMAR) device support enables independent address | 1409 | DMA remapping (DMAR) device support enables independent address |
1406 | translations for Direct Memory Access (DMA) from devices. | 1410 | translations for Direct Memory Access (DMA) from devices. |
1407 | These DMA remapping devices are reported via ACPI tables | 1411 | These DMA remapping devices are reported via ACPI tables |
1408 | and include PCI device scope covered by these DMA | 1412 | and include PCI device scope covered by these DMA |
1409 | remapping devices. | 1413 | remapping devices. |
1410 | 1414 | ||
1411 | config DMAR_GFX_WA | 1415 | config DMAR_GFX_WA |
1412 | def_bool y | 1416 | def_bool y |
1413 | prompt "Support for Graphics workaround" | 1417 | prompt "Support for Graphics workaround" |
1414 | depends on DMAR | 1418 | depends on DMAR |
1415 | help | 1419 | help |
1416 | Current graphics drivers tend to use physical addresses | 1420 | Current graphics drivers tend to use physical addresses |
1417 | for DMA and avoid using DMA APIs. Setting this config | 1421 | for DMA and avoid using DMA APIs. Setting this config |
1418 | option permits the IOMMU driver to set a unity map for | 1422 | option permits the IOMMU driver to set a unity map for |
1419 | all the OS-visible memory. Hence the driver can continue | 1423 | all the OS-visible memory. Hence the driver can continue |
1420 | to use physical addresses for DMA. | 1424 | to use physical addresses for DMA. |
1421 | 1425 | ||
1422 | config DMAR_FLOPPY_WA | 1426 | config DMAR_FLOPPY_WA |
1423 | def_bool y | 1427 | def_bool y |
1424 | depends on DMAR | 1428 | depends on DMAR |
1425 | help | 1429 | help |
1426 | Floppy disk drivers are known to bypass DMA API calls | 1430 | Floppy disk drivers are known to bypass DMA API calls |
1427 | thereby failing to work when IOMMU is enabled. This | 1431 | thereby failing to work when IOMMU is enabled. This |
1428 | workaround will set up a 1:1 mapping for the first | 1432 | workaround will set up a 1:1 mapping for the first |
1429 | 16M to make floppy (an ISA device) work. | 1433 | 16M to make floppy (an ISA device) work. |
1430 | 1434 | ||
1431 | source "drivers/pci/pcie/Kconfig" | 1435 | source "drivers/pci/pcie/Kconfig" |
1432 | 1436 | ||
1433 | source "drivers/pci/Kconfig" | 1437 | source "drivers/pci/Kconfig" |
1434 | 1438 | ||
1435 | # x86_64 has no ISA slots, but does have ISA-style DMA. | 1439 | # x86_64 has no ISA slots, but does have ISA-style DMA. |
1436 | config ISA_DMA_API | 1440 | config ISA_DMA_API |
1437 | def_bool y | 1441 | def_bool y |
1438 | 1442 | ||
1439 | if X86_32 | 1443 | if X86_32 |
1440 | 1444 | ||
1441 | config ISA | 1445 | config ISA |
1442 | bool "ISA support" | 1446 | bool "ISA support" |
1443 | depends on !(X86_VOYAGER || X86_VISWS) | 1447 | depends on !(X86_VOYAGER || X86_VISWS) |
1444 | help | 1448 | help |
1445 | Find out whether you have ISA slots on your motherboard. ISA is the | 1449 | Find out whether you have ISA slots on your motherboard. ISA is the |
1446 | name of a bus system, i.e. the way the CPU talks to the other stuff | 1450 | name of a bus system, i.e. the way the CPU talks to the other stuff |
1447 | inside your box. Other bus systems are PCI, EISA, MicroChannel | 1451 | inside your box. Other bus systems are PCI, EISA, MicroChannel |
1448 | (MCA) or VESA. ISA is an older system, now being displaced by PCI; | 1452 | (MCA) or VESA. ISA is an older system, now being displaced by PCI; |
1449 | newer boards don't support it. If you have ISA, say Y, otherwise N. | 1453 | newer boards don't support it. If you have ISA, say Y, otherwise N. |
1450 | 1454 | ||
1451 | config EISA | 1455 | config EISA |
1452 | bool "EISA support" | 1456 | bool "EISA support" |
1453 | depends on ISA | 1457 | depends on ISA |
1454 | ---help--- | 1458 | ---help--- |
1455 | The Extended Industry Standard Architecture (EISA) bus was | 1459 | The Extended Industry Standard Architecture (EISA) bus was |
1456 | developed as an open alternative to the IBM MicroChannel bus. | 1460 | developed as an open alternative to the IBM MicroChannel bus. |
1457 | 1461 | ||
1458 | The EISA bus provided some of the features of the IBM MicroChannel | 1462 | The EISA bus provided some of the features of the IBM MicroChannel |
1459 | bus while maintaining backward compatibility with cards made for | 1463 | bus while maintaining backward compatibility with cards made for |
1460 | the older ISA bus. The EISA bus saw limited use between 1988 and | 1464 | the older ISA bus. The EISA bus saw limited use between 1988 and |
1461 | 1995 when it was made obsolete by the PCI bus. | 1465 | 1995 when it was made obsolete by the PCI bus. |
1462 | 1466 | ||
1463 | Say Y here if you are building a kernel for an EISA-based machine. | 1467 | Say Y here if you are building a kernel for an EISA-based machine. |
1464 | 1468 | ||
1465 | Otherwise, say N. | 1469 | Otherwise, say N. |
1466 | 1470 | ||
1467 | source "drivers/eisa/Kconfig" | 1471 | source "drivers/eisa/Kconfig" |
1468 | 1472 | ||
1469 | config MCA | 1473 | config MCA |
1470 | bool "MCA support" if !(X86_VISWS || X86_VOYAGER) | 1474 | bool "MCA support" if !(X86_VISWS || X86_VOYAGER) |
1471 | default y if X86_VOYAGER | 1475 | default y if X86_VOYAGER |
1472 | help | 1476 | help |
1473 | MicroChannel Architecture is found in some IBM PS/2 machines and | 1477 | MicroChannel Architecture is found in some IBM PS/2 machines and |
1474 | laptops. It is a bus system similar to PCI or ISA. See | 1478 | laptops. It is a bus system similar to PCI or ISA. See |
1475 | <file:Documentation/mca.txt> (and especially the web page given | 1479 | <file:Documentation/mca.txt> (and especially the web page given |
1476 | there) before attempting to build an MCA bus kernel. | 1480 | there) before attempting to build an MCA bus kernel. |
1477 | 1481 | ||
1478 | source "drivers/mca/Kconfig" | 1482 | source "drivers/mca/Kconfig" |
1479 | 1483 | ||
1480 | config SCx200 | 1484 | config SCx200 |
1481 | tristate "NatSemi SCx200 support" | 1485 | tristate "NatSemi SCx200 support" |
1482 | depends on !X86_VOYAGER | 1486 | depends on !X86_VOYAGER |
1483 | help | 1487 | help |
1484 | This provides basic support for National Semiconductor's | 1488 | This provides basic support for National Semiconductor's |
1485 | (now AMD's) Geode processors. The driver probes for the | 1489 | (now AMD's) Geode processors. The driver probes for the |
1486 | PCI-IDs of several on-chip devices, so it's a good dependency | 1490 | PCI-IDs of several on-chip devices, so it's a good dependency |
1487 | for other scx200_* drivers. | 1491 | for other scx200_* drivers. |
1488 | 1492 | ||
1489 | If compiled as a module, the driver is named scx200. | 1493 | If compiled as a module, the driver is named scx200. |
1490 | 1494 | ||
1491 | config SCx200HR_TIMER | 1495 | config SCx200HR_TIMER |
1492 | tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" | 1496 | tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" |
1493 | depends on SCx200 && GENERIC_TIME | 1497 | depends on SCx200 && GENERIC_TIME |
1494 | default y | 1498 | default y |
1495 | help | 1499 | help |
1496 | This driver provides a clocksource built upon the on-chip | 1500 | This driver provides a clocksource built upon the on-chip |
1497 | 27MHz high-resolution timer. It's also a workaround for | 1501 | 27MHz high-resolution timer. It's also a workaround for |
1498 | NSC Geode SC-1100's buggy TSC, which loses time when the | 1502 | NSC Geode SC-1100's buggy TSC, which loses time when the |
1499 | processor goes idle (as is done by the scheduler). The | 1503 | processor goes idle (as is done by the scheduler). The |
1500 | other workaround is the idle=poll boot option. | 1504 | other workaround is the idle=poll boot option. |
1501 | 1505 | ||
1502 | config GEODE_MFGPT_TIMER | 1506 | config GEODE_MFGPT_TIMER |
1503 | def_bool y | 1507 | def_bool y |
1504 | prompt "Geode Multi-Function General Purpose Timer (MFGPT) events" | 1508 | prompt "Geode Multi-Function General Purpose Timer (MFGPT) events" |
1505 | depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS | 1509 | depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS |
1506 | help | 1510 | help |
1507 | This driver provides a clock event source based on the MFGPT | 1511 | This driver provides a clock event source based on the MFGPT |
1508 | timer(s) in the CS5535 and CS5536 companion chip for the Geode. | 1512 | timer(s) in the CS5535 and CS5536 companion chip for the Geode. |
1509 | MFGPTs have a better resolution and max interval than the | 1513 | MFGPTs have a better resolution and max interval than the |
1510 | generic PIT, and are suitable for use as high-res timers. | 1514 | generic PIT, and are suitable for use as high-res timers. |
1511 | 1515 | ||
1512 | endif # X86_32 | 1516 | endif # X86_32 |
1513 | 1517 | ||
1514 | config K8_NB | 1518 | config K8_NB |
1515 | def_bool y | 1519 | def_bool y |
1516 | depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) | 1520 | depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) |
1517 | 1521 | ||
1518 | source "drivers/pcmcia/Kconfig" | 1522 | source "drivers/pcmcia/Kconfig" |
1519 | 1523 | ||
1520 | source "drivers/pci/hotplug/Kconfig" | 1524 | source "drivers/pci/hotplug/Kconfig" |
1521 | 1525 | ||
1522 | endmenu | 1526 | endmenu |
1523 | 1527 | ||
1524 | 1528 | ||
1525 | menu "Executable file formats / Emulations" | 1529 | menu "Executable file formats / Emulations" |
1526 | 1530 | ||
1527 | source "fs/Kconfig.binfmt" | 1531 | source "fs/Kconfig.binfmt" |
1528 | 1532 | ||
1529 | config IA32_EMULATION | 1533 | config IA32_EMULATION |
1530 | bool "IA32 Emulation" | 1534 | bool "IA32 Emulation" |
1531 | depends on X86_64 | 1535 | depends on X86_64 |
1532 | help | 1536 | help |
1533 | Include code to run 32-bit programs under a 64-bit kernel. You should | 1537 | Include code to run 32-bit programs under a 64-bit kernel. You should |
1534 | likely turn this on, unless you're 100% sure that you don't have any | 1538 | likely turn this on, unless you're 100% sure that you don't have any |
1535 | 32-bit programs left. | 1539 | 32-bit programs left. |
1536 | 1540 | ||
1537 | config IA32_AOUT | 1541 | config IA32_AOUT |
1538 | tristate "IA32 a.out support" | 1542 | tristate "IA32 a.out support" |
1539 | depends on IA32_EMULATION | 1543 | depends on IA32_EMULATION |
1540 | help | 1544 | help |
1541 | Support old a.out binaries in the 32-bit emulation. | 1545 | Support old a.out binaries in the 32-bit emulation. |
1542 | 1546 | ||
1543 | config COMPAT | 1547 | config COMPAT |
1544 | def_bool y | 1548 | def_bool y |
1545 | depends on IA32_EMULATION | 1549 | depends on IA32_EMULATION |
1546 | 1550 | ||
1547 | config COMPAT_FOR_U64_ALIGNMENT | 1551 | config COMPAT_FOR_U64_ALIGNMENT |
1548 | def_bool COMPAT | 1552 | def_bool COMPAT |
1549 | depends on X86_64 | 1553 | depends on X86_64 |
1550 | 1554 | ||
1551 | config SYSVIPC_COMPAT | 1555 | config SYSVIPC_COMPAT |
1552 | def_bool y | 1556 | def_bool y |
1553 | depends on X86_64 && COMPAT && SYSVIPC | 1557 | depends on X86_64 && COMPAT && SYSVIPC |
1554 | 1558 | ||
1555 | endmenu | 1559 | endmenu |
1556 | 1560 | ||
1557 | 1561 | ||
1558 | source "net/Kconfig" | 1562 | source "net/Kconfig" |
1559 | 1563 | ||
1560 | source "drivers/Kconfig" | 1564 | source "drivers/Kconfig" |
1561 | 1565 | ||
1562 | source "drivers/firmware/Kconfig" | 1566 | source "drivers/firmware/Kconfig" |
1563 | 1567 | ||
1564 | source "fs/Kconfig" | 1568 | source "fs/Kconfig" |
1565 | 1569 | ||
1566 | source "kernel/Kconfig.instrumentation" | 1570 | source "kernel/Kconfig.instrumentation" |
1567 | 1571 | ||
1568 | source "arch/x86/Kconfig.debug" | 1572 | source "arch/x86/Kconfig.debug" |
1569 | 1573 | ||
1570 | source "security/Kconfig" | 1574 | source "security/Kconfig" |
1571 | 1575 | ||
1572 | source "crypto/Kconfig" | 1576 | source "crypto/Kconfig" |
1573 | 1577 | ||
1574 | source "lib/Kconfig" | 1578 | source "lib/Kconfig" |
1575 | 1579 |
fs/jbd/checkpoint.c
1 | /* | 1 | /* |
2 | * linux/fs/jbd/checkpoint.c | 2 | * linux/fs/jbd/checkpoint.c |
3 | * | 3 | * |
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 | 4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 |
5 | * | 5 | * |
6 | * Copyright 1999 Red Hat Software --- All Rights Reserved | 6 | * Copyright 1999 Red Hat Software --- All Rights Reserved |
7 | * | 7 | * |
8 | * This file is part of the Linux kernel and is made available under | 8 | * This file is part of the Linux kernel and is made available under |
9 | * the terms of the GNU General Public License, version 2, or at your | 9 | * the terms of the GNU General Public License, version 2, or at your |
10 | * option, any later version, incorporated herein by reference. | 10 | * option, any later version, incorporated herein by reference. |
11 | * | 11 | * |
12 | * Checkpoint routines for the generic filesystem journaling code. | 12 | * Checkpoint routines for the generic filesystem journaling code. |
13 | * Part of the ext2fs journaling system. | 13 | * Part of the ext2fs journaling system. |
14 | * | 14 | * |
15 | * Checkpointing is the process of ensuring that a section of the log is | 15 | * Checkpointing is the process of ensuring that a section of the log is |
16 | * committed fully to disk, so that that portion of the log can be | 16 | * committed fully to disk, so that that portion of the log can be |
17 | * reused. | 17 | * reused. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include <linux/time.h> | 20 | #include <linux/time.h> |
21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
22 | #include <linux/jbd.h> | 22 | #include <linux/jbd.h> |
23 | #include <linux/errno.h> | 23 | #include <linux/errno.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | 25 | ||
26 | /* | 26 | /* |
27 | * Unlink a buffer from a transaction checkpoint list. | 27 | * Unlink a buffer from a transaction checkpoint list. |
28 | * | 28 | * |
29 | * Called with j_list_lock held. | 29 | * Called with j_list_lock held. |
30 | */ | 30 | */ |
31 | static inline void __buffer_unlink_first(struct journal_head *jh) | 31 | static inline void __buffer_unlink_first(struct journal_head *jh) |
32 | { | 32 | { |
33 | transaction_t *transaction = jh->b_cp_transaction; | 33 | transaction_t *transaction = jh->b_cp_transaction; |
34 | 34 | ||
35 | jh->b_cpnext->b_cpprev = jh->b_cpprev; | 35 | jh->b_cpnext->b_cpprev = jh->b_cpprev; |
36 | jh->b_cpprev->b_cpnext = jh->b_cpnext; | 36 | jh->b_cpprev->b_cpnext = jh->b_cpnext; |
37 | if (transaction->t_checkpoint_list == jh) { | 37 | if (transaction->t_checkpoint_list == jh) { |
38 | transaction->t_checkpoint_list = jh->b_cpnext; | 38 | transaction->t_checkpoint_list = jh->b_cpnext; |
39 | if (transaction->t_checkpoint_list == jh) | 39 | if (transaction->t_checkpoint_list == jh) |
40 | transaction->t_checkpoint_list = NULL; | 40 | transaction->t_checkpoint_list = NULL; |
41 | } | 41 | } |
42 | } | 42 | } |
43 | 43 | ||
44 | /* | 44 | /* |
45 | * Unlink a buffer from a transaction checkpoint(io) list. | 45 | * Unlink a buffer from a transaction checkpoint(io) list. |
46 | * | 46 | * |
47 | * Called with j_list_lock held. | 47 | * Called with j_list_lock held. |
48 | */ | 48 | */ |
49 | static inline void __buffer_unlink(struct journal_head *jh) | 49 | static inline void __buffer_unlink(struct journal_head *jh) |
50 | { | 50 | { |
51 | transaction_t *transaction = jh->b_cp_transaction; | 51 | transaction_t *transaction = jh->b_cp_transaction; |
52 | 52 | ||
53 | __buffer_unlink_first(jh); | 53 | __buffer_unlink_first(jh); |
54 | if (transaction->t_checkpoint_io_list == jh) { | 54 | if (transaction->t_checkpoint_io_list == jh) { |
55 | transaction->t_checkpoint_io_list = jh->b_cpnext; | 55 | transaction->t_checkpoint_io_list = jh->b_cpnext; |
56 | if (transaction->t_checkpoint_io_list == jh) | 56 | if (transaction->t_checkpoint_io_list == jh) |
57 | transaction->t_checkpoint_io_list = NULL; | 57 | transaction->t_checkpoint_io_list = NULL; |
58 | } | 58 | } |
59 | } | 59 | } |
60 | 60 | ||
61 | /* | 61 | /* |
62 | * Move a buffer from the checkpoint list to the checkpoint io list | 62 | * Move a buffer from the checkpoint list to the checkpoint io list |
63 | * | 63 | * |
64 | * Called with j_list_lock held | 64 | * Called with j_list_lock held |
65 | */ | 65 | */ |
66 | static inline void __buffer_relink_io(struct journal_head *jh) | 66 | static inline void __buffer_relink_io(struct journal_head *jh) |
67 | { | 67 | { |
68 | transaction_t *transaction = jh->b_cp_transaction; | 68 | transaction_t *transaction = jh->b_cp_transaction; |
69 | 69 | ||
70 | __buffer_unlink_first(jh); | 70 | __buffer_unlink_first(jh); |
71 | 71 | ||
72 | if (!transaction->t_checkpoint_io_list) { | 72 | if (!transaction->t_checkpoint_io_list) { |
73 | jh->b_cpnext = jh->b_cpprev = jh; | 73 | jh->b_cpnext = jh->b_cpprev = jh; |
74 | } else { | 74 | } else { |
75 | jh->b_cpnext = transaction->t_checkpoint_io_list; | 75 | jh->b_cpnext = transaction->t_checkpoint_io_list; |
76 | jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; | 76 | jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; |
77 | jh->b_cpprev->b_cpnext = jh; | 77 | jh->b_cpprev->b_cpnext = jh; |
78 | jh->b_cpnext->b_cpprev = jh; | 78 | jh->b_cpnext->b_cpprev = jh; |
79 | } | 79 | } |
80 | transaction->t_checkpoint_io_list = jh; | 80 | transaction->t_checkpoint_io_list = jh; |
81 | } | 81 | } |
82 | 82 | ||
83 | /* | 83 | /* |
84 | * Try to release a checkpointed buffer from its transaction. | 84 | * Try to release a checkpointed buffer from its transaction. |
85 | * Returns 1 if we released it and 2 if we also released the | 85 | * Returns 1 if we released it and 2 if we also released the |
86 | * whole transaction. | 86 | * whole transaction. |
87 | * | 87 | * |
88 | * Requires j_list_lock | 88 | * Requires j_list_lock |
89 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | 89 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it |
90 | */ | 90 | */ |
91 | static int __try_to_free_cp_buf(struct journal_head *jh) | 91 | static int __try_to_free_cp_buf(struct journal_head *jh) |
92 | { | 92 | { |
93 | int ret = 0; | 93 | int ret = 0; |
94 | struct buffer_head *bh = jh2bh(jh); | 94 | struct buffer_head *bh = jh2bh(jh); |
95 | 95 | ||
96 | if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { | 96 | if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { |
97 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | 97 | JBUFFER_TRACE(jh, "remove from checkpoint list"); |
98 | ret = __journal_remove_checkpoint(jh) + 1; | 98 | ret = __journal_remove_checkpoint(jh) + 1; |
99 | jbd_unlock_bh_state(bh); | 99 | jbd_unlock_bh_state(bh); |
100 | journal_remove_journal_head(bh); | 100 | journal_remove_journal_head(bh); |
101 | BUFFER_TRACE(bh, "release"); | 101 | BUFFER_TRACE(bh, "release"); |
102 | __brelse(bh); | 102 | __brelse(bh); |
103 | } else { | 103 | } else { |
104 | jbd_unlock_bh_state(bh); | 104 | jbd_unlock_bh_state(bh); |
105 | } | 105 | } |
106 | return ret; | 106 | return ret; |
107 | } | 107 | } |
108 | 108 | ||
109 | /* | 109 | /* |
110 | * __log_wait_for_space: wait until there is space in the journal. | 110 | * __log_wait_for_space: wait until there is space in the journal. |
111 | * | 111 | * |
112 | * Called under j_state_lock *only*. It will be unlocked if we have to wait | 112 | * Called under j_state_lock *only*. It will be unlocked if we have to wait |
113 | * for a checkpoint to free up some space in the log. | 113 | * for a checkpoint to free up some space in the log. |
114 | */ | 114 | */ |
115 | void __log_wait_for_space(journal_t *journal) | 115 | void __log_wait_for_space(journal_t *journal) |
116 | { | 116 | { |
117 | int nblocks; | 117 | int nblocks; |
118 | assert_spin_locked(&journal->j_state_lock); | 118 | assert_spin_locked(&journal->j_state_lock); |
119 | 119 | ||
120 | nblocks = jbd_space_needed(journal); | 120 | nblocks = jbd_space_needed(journal); |
121 | while (__log_space_left(journal) < nblocks) { | 121 | while (__log_space_left(journal) < nblocks) { |
122 | if (journal->j_flags & JFS_ABORT) | 122 | if (journal->j_flags & JFS_ABORT) |
123 | return; | 123 | return; |
124 | spin_unlock(&journal->j_state_lock); | 124 | spin_unlock(&journal->j_state_lock); |
125 | mutex_lock(&journal->j_checkpoint_mutex); | 125 | mutex_lock(&journal->j_checkpoint_mutex); |
126 | 126 | ||
127 | /* | 127 | /* |
128 | * Test again, another process may have checkpointed while we | 128 | * Test again, another process may have checkpointed while we |
129 | * were waiting for the checkpoint lock | 129 | * were waiting for the checkpoint lock |
130 | */ | 130 | */ |
131 | spin_lock(&journal->j_state_lock); | 131 | spin_lock(&journal->j_state_lock); |
132 | nblocks = jbd_space_needed(journal); | 132 | nblocks = jbd_space_needed(journal); |
133 | if (__log_space_left(journal) < nblocks) { | 133 | if (__log_space_left(journal) < nblocks) { |
134 | spin_unlock(&journal->j_state_lock); | 134 | spin_unlock(&journal->j_state_lock); |
135 | log_do_checkpoint(journal); | 135 | log_do_checkpoint(journal); |
136 | spin_lock(&journal->j_state_lock); | 136 | spin_lock(&journal->j_state_lock); |
137 | } | 137 | } |
138 | mutex_unlock(&journal->j_checkpoint_mutex); | 138 | mutex_unlock(&journal->j_checkpoint_mutex); |
139 | } | 139 | } |
140 | } | 140 | } |
141 | 141 | ||
142 | /* | 142 | /* |
143 | * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. | 143 | * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. |
144 | * The caller must restart a list walk. Wait for someone else to run | 144 | * The caller must restart a list walk. Wait for someone else to run |
145 | * jbd_unlock_bh_state(). | 145 | * jbd_unlock_bh_state(). |
146 | */ | 146 | */ |
147 | static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) | 147 | static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) |
148 | __releases(journal->j_list_lock) | 148 | __releases(journal->j_list_lock) |
149 | { | 149 | { |
150 | get_bh(bh); | 150 | get_bh(bh); |
151 | spin_unlock(&journal->j_list_lock); | 151 | spin_unlock(&journal->j_list_lock); |
152 | jbd_lock_bh_state(bh); | 152 | jbd_lock_bh_state(bh); |
153 | jbd_unlock_bh_state(bh); | 153 | jbd_unlock_bh_state(bh); |
154 | put_bh(bh); | 154 | put_bh(bh); |
155 | } | 155 | } |
156 | 156 | ||
157 | /* | 157 | /* |
158 | * Clean up transaction's list of buffers submitted for io. | 158 | * Clean up transaction's list of buffers submitted for io. |
159 | * We wait for any pending IO to complete and remove any clean | 159 | * We wait for any pending IO to complete and remove any clean |
160 | * buffers. Note that we take the buffers in the opposite ordering | 160 | * buffers. Note that we take the buffers in the opposite ordering |
161 | * from the one in which they were submitted for IO. | 161 | * from the one in which they were submitted for IO. |
162 | * | 162 | * |
163 | * Called with j_list_lock held. | 163 | * Called with j_list_lock held. |
164 | */ | 164 | */ |
165 | static void __wait_cp_io(journal_t *journal, transaction_t *transaction) | 165 | static void __wait_cp_io(journal_t *journal, transaction_t *transaction) |
166 | { | 166 | { |
167 | struct journal_head *jh; | 167 | struct journal_head *jh; |
168 | struct buffer_head *bh; | 168 | struct buffer_head *bh; |
169 | tid_t this_tid; | 169 | tid_t this_tid; |
170 | int released = 0; | 170 | int released = 0; |
171 | 171 | ||
172 | this_tid = transaction->t_tid; | 172 | this_tid = transaction->t_tid; |
173 | restart: | 173 | restart: |
174 | /* Did somebody clean up the transaction in the meanwhile? */ | 174 | /* Did somebody clean up the transaction in the meanwhile? */ |
175 | if (journal->j_checkpoint_transactions != transaction || | 175 | if (journal->j_checkpoint_transactions != transaction || |
176 | transaction->t_tid != this_tid) | 176 | transaction->t_tid != this_tid) |
177 | return; | 177 | return; |
178 | while (!released && transaction->t_checkpoint_io_list) { | 178 | while (!released && transaction->t_checkpoint_io_list) { |
179 | jh = transaction->t_checkpoint_io_list; | 179 | jh = transaction->t_checkpoint_io_list; |
180 | bh = jh2bh(jh); | 180 | bh = jh2bh(jh); |
181 | if (!jbd_trylock_bh_state(bh)) { | 181 | if (!jbd_trylock_bh_state(bh)) { |
182 | jbd_sync_bh(journal, bh); | 182 | jbd_sync_bh(journal, bh); |
183 | spin_lock(&journal->j_list_lock); | 183 | spin_lock(&journal->j_list_lock); |
184 | goto restart; | 184 | goto restart; |
185 | } | 185 | } |
186 | if (buffer_locked(bh)) { | 186 | if (buffer_locked(bh)) { |
187 | atomic_inc(&bh->b_count); | 187 | atomic_inc(&bh->b_count); |
188 | spin_unlock(&journal->j_list_lock); | 188 | spin_unlock(&journal->j_list_lock); |
189 | jbd_unlock_bh_state(bh); | 189 | jbd_unlock_bh_state(bh); |
190 | wait_on_buffer(bh); | 190 | wait_on_buffer(bh); |
191 | /* the journal_head may have gone by now */ | 191 | /* the journal_head may have gone by now */ |
192 | BUFFER_TRACE(bh, "brelse"); | 192 | BUFFER_TRACE(bh, "brelse"); |
193 | __brelse(bh); | 193 | __brelse(bh); |
194 | spin_lock(&journal->j_list_lock); | 194 | spin_lock(&journal->j_list_lock); |
195 | goto restart; | 195 | goto restart; |
196 | } | 196 | } |
197 | /* | 197 | /* |
198 | * Now in whatever state the buffer currently is, we know that | 198 | * Now in whatever state the buffer currently is, we know that |
199 | * it has been written out and so we can drop it from the list | 199 | * it has been written out and so we can drop it from the list |
200 | */ | 200 | */ |
201 | released = __journal_remove_checkpoint(jh); | 201 | released = __journal_remove_checkpoint(jh); |
202 | jbd_unlock_bh_state(bh); | 202 | jbd_unlock_bh_state(bh); |
203 | journal_remove_journal_head(bh); | 203 | journal_remove_journal_head(bh); |
204 | __brelse(bh); | 204 | __brelse(bh); |
205 | } | 205 | } |
206 | } | 206 | } |
207 | 207 | ||
208 | #define NR_BATCH 64 | 208 | #define NR_BATCH 64 |
209 | 209 | ||
210 | static void | 210 | static void |
211 | __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) | 211 | __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) |
212 | { | 212 | { |
213 | int i; | 213 | int i; |
214 | 214 | ||
215 | ll_rw_block(SWRITE, *batch_count, bhs); | 215 | ll_rw_block(SWRITE, *batch_count, bhs); |
216 | for (i = 0; i < *batch_count; i++) { | 216 | for (i = 0; i < *batch_count; i++) { |
217 | struct buffer_head *bh = bhs[i]; | 217 | struct buffer_head *bh = bhs[i]; |
218 | clear_buffer_jwrite(bh); | 218 | clear_buffer_jwrite(bh); |
219 | BUFFER_TRACE(bh, "brelse"); | 219 | BUFFER_TRACE(bh, "brelse"); |
220 | __brelse(bh); | 220 | __brelse(bh); |
221 | } | 221 | } |
222 | *batch_count = 0; | 222 | *batch_count = 0; |
223 | } | 223 | } |
224 | 224 | ||
225 | /* | 225 | /* |
226 | * Try to flush one buffer from the checkpoint list to disk. | 226 | * Try to flush one buffer from the checkpoint list to disk. |
227 | * | 227 | * |
228 | * Return 1 if something happened which requires us to abort the current | 228 | * Return 1 if something happened which requires us to abort the current |
229 | * scan of the checkpoint list. | 229 | * scan of the checkpoint list. |
230 | * | 230 | * |
231 | * Called with j_list_lock held and drops it if 1 is returned | 231 | * Called with j_list_lock held and drops it if 1 is returned |
232 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | 232 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it |
233 | */ | 233 | */ |
234 | static int __process_buffer(journal_t *journal, struct journal_head *jh, | 234 | static int __process_buffer(journal_t *journal, struct journal_head *jh, |
235 | struct buffer_head **bhs, int *batch_count) | 235 | struct buffer_head **bhs, int *batch_count) |
236 | { | 236 | { |
237 | struct buffer_head *bh = jh2bh(jh); | 237 | struct buffer_head *bh = jh2bh(jh); |
238 | int ret = 0; | 238 | int ret = 0; |
239 | 239 | ||
240 | if (buffer_locked(bh)) { | 240 | if (buffer_locked(bh)) { |
241 | atomic_inc(&bh->b_count); | 241 | atomic_inc(&bh->b_count); |
242 | spin_unlock(&journal->j_list_lock); | 242 | spin_unlock(&journal->j_list_lock); |
243 | jbd_unlock_bh_state(bh); | 243 | jbd_unlock_bh_state(bh); |
244 | wait_on_buffer(bh); | 244 | wait_on_buffer(bh); |
245 | /* the journal_head may have gone by now */ | 245 | /* the journal_head may have gone by now */ |
246 | BUFFER_TRACE(bh, "brelse"); | 246 | BUFFER_TRACE(bh, "brelse"); |
247 | __brelse(bh); | 247 | __brelse(bh); |
248 | ret = 1; | 248 | ret = 1; |
249 | } else if (jh->b_transaction != NULL) { | 249 | } else if (jh->b_transaction != NULL) { |
250 | transaction_t *t = jh->b_transaction; | 250 | transaction_t *t = jh->b_transaction; |
251 | tid_t tid = t->t_tid; | 251 | tid_t tid = t->t_tid; |
252 | 252 | ||
253 | spin_unlock(&journal->j_list_lock); | 253 | spin_unlock(&journal->j_list_lock); |
254 | jbd_unlock_bh_state(bh); | 254 | jbd_unlock_bh_state(bh); |
255 | log_start_commit(journal, tid); | 255 | log_start_commit(journal, tid); |
256 | log_wait_commit(journal, tid); | 256 | log_wait_commit(journal, tid); |
257 | ret = 1; | 257 | ret = 1; |
258 | } else if (!buffer_dirty(bh)) { | 258 | } else if (!buffer_dirty(bh)) { |
259 | J_ASSERT_JH(jh, !buffer_jbddirty(bh)); | 259 | J_ASSERT_JH(jh, !buffer_jbddirty(bh)); |
260 | BUFFER_TRACE(bh, "remove from checkpoint"); | 260 | BUFFER_TRACE(bh, "remove from checkpoint"); |
261 | __journal_remove_checkpoint(jh); | 261 | __journal_remove_checkpoint(jh); |
262 | spin_unlock(&journal->j_list_lock); | 262 | spin_unlock(&journal->j_list_lock); |
263 | jbd_unlock_bh_state(bh); | 263 | jbd_unlock_bh_state(bh); |
264 | journal_remove_journal_head(bh); | 264 | journal_remove_journal_head(bh); |
265 | __brelse(bh); | 265 | __brelse(bh); |
266 | ret = 1; | 266 | ret = 1; |
267 | } else { | 267 | } else { |
268 | /* | 268 | /* |
269 | * Important: we are about to write the buffer, and | 269 | * Important: we are about to write the buffer, and |
270 | * possibly block, while still holding the journal lock. | 270 | * possibly block, while still holding the journal lock. |
271 | * We cannot afford to let the transaction logic start | 271 | * We cannot afford to let the transaction logic start |
272 | * messing around with this buffer before we write it to | 272 | * messing around with this buffer before we write it to |
273 | * disk, as that would break recoverability. | 273 | * disk, as that would break recoverability. |
274 | */ | 274 | */ |
275 | BUFFER_TRACE(bh, "queue"); | 275 | BUFFER_TRACE(bh, "queue"); |
276 | get_bh(bh); | 276 | get_bh(bh); |
277 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); | 277 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); |
278 | set_buffer_jwrite(bh); | 278 | set_buffer_jwrite(bh); |
279 | bhs[*batch_count] = bh; | 279 | bhs[*batch_count] = bh; |
280 | __buffer_relink_io(jh); | 280 | __buffer_relink_io(jh); |
281 | jbd_unlock_bh_state(bh); | 281 | jbd_unlock_bh_state(bh); |
282 | (*batch_count)++; | 282 | (*batch_count)++; |
283 | if (*batch_count == NR_BATCH) { | 283 | if (*batch_count == NR_BATCH) { |
284 | spin_unlock(&journal->j_list_lock); | 284 | spin_unlock(&journal->j_list_lock); |
285 | __flush_batch(journal, bhs, batch_count); | 285 | __flush_batch(journal, bhs, batch_count); |
286 | ret = 1; | 286 | ret = 1; |
287 | } | 287 | } |
288 | } | 288 | } |
289 | return ret; | 289 | return ret; |
290 | } | 290 | } |
291 | 291 | ||
292 | /* | 292 | /* |
293 | * Perform an actual checkpoint. We take the first transaction on the | 293 | * Perform an actual checkpoint. We take the first transaction on the |
294 | * list of transactions to be checkpointed and send all its buffers | 294 | * list of transactions to be checkpointed and send all its buffers |
295 | * to disk. We submit larger chunks of data at once. | 295 | * to disk. We submit larger chunks of data at once. |
296 | * | 296 | * |
297 | * The journal should be locked before calling this function. | 297 | * The journal should be locked before calling this function. |
298 | */ | 298 | */ |
299 | int log_do_checkpoint(journal_t *journal) | 299 | int log_do_checkpoint(journal_t *journal) |
300 | { | 300 | { |
301 | transaction_t *transaction; | 301 | transaction_t *transaction; |
302 | tid_t this_tid; | 302 | tid_t this_tid; |
303 | int result; | 303 | int result; |
304 | 304 | ||
305 | jbd_debug(1, "Start checkpoint\n"); | 305 | jbd_debug(1, "Start checkpoint\n"); |
306 | 306 | ||
307 | /* | 307 | /* |
308 | * First thing: if there are any transactions in the log which | 308 | * First thing: if there are any transactions in the log which |
309 | * don't need checkpointing, just eliminate them from the | 309 | * don't need checkpointing, just eliminate them from the |
310 | * journal straight away. | 310 | * journal straight away. |
311 | */ | 311 | */ |
312 | result = cleanup_journal_tail(journal); | 312 | result = cleanup_journal_tail(journal); |
313 | jbd_debug(1, "cleanup_journal_tail returned %d\n", result); | 313 | jbd_debug(1, "cleanup_journal_tail returned %d\n", result); |
314 | if (result <= 0) | 314 | if (result <= 0) |
315 | return result; | 315 | return result; |
316 | 316 | ||
317 | /* | 317 | /* |
318 | * OK, we need to start writing disk blocks. Take one transaction | 318 | * OK, we need to start writing disk blocks. Take one transaction |
319 | * and write it. | 319 | * and write it. |
320 | */ | 320 | */ |
321 | spin_lock(&journal->j_list_lock); | 321 | spin_lock(&journal->j_list_lock); |
322 | if (!journal->j_checkpoint_transactions) | 322 | if (!journal->j_checkpoint_transactions) |
323 | goto out; | 323 | goto out; |
324 | transaction = journal->j_checkpoint_transactions; | 324 | transaction = journal->j_checkpoint_transactions; |
325 | this_tid = transaction->t_tid; | 325 | this_tid = transaction->t_tid; |
326 | restart: | 326 | restart: |
327 | /* | 327 | /* |
328 | * If someone cleaned up this transaction while we slept, we're | 328 | * If someone cleaned up this transaction while we slept, we're |
329 | * done (maybe it's a new transaction, but it fell at the same | 329 | * done (maybe it's a new transaction, but it fell at the same |
330 | * address). | 330 | * address). |
331 | */ | 331 | */ |
332 | if (journal->j_checkpoint_transactions == transaction && | 332 | if (journal->j_checkpoint_transactions == transaction && |
333 | transaction->t_tid == this_tid) { | 333 | transaction->t_tid == this_tid) { |
334 | int batch_count = 0; | 334 | int batch_count = 0; |
335 | struct buffer_head *bhs[NR_BATCH]; | 335 | struct buffer_head *bhs[NR_BATCH]; |
336 | struct journal_head *jh; | 336 | struct journal_head *jh; |
337 | int retry = 0; | 337 | int retry = 0; |
338 | 338 | ||
339 | while (!retry && transaction->t_checkpoint_list) { | 339 | while (!retry && transaction->t_checkpoint_list) { |
340 | struct buffer_head *bh; | 340 | struct buffer_head *bh; |
341 | 341 | ||
342 | jh = transaction->t_checkpoint_list; | 342 | jh = transaction->t_checkpoint_list; |
343 | bh = jh2bh(jh); | 343 | bh = jh2bh(jh); |
344 | if (!jbd_trylock_bh_state(bh)) { | 344 | if (!jbd_trylock_bh_state(bh)) { |
345 | jbd_sync_bh(journal, bh); | 345 | jbd_sync_bh(journal, bh); |
346 | retry = 1; | 346 | retry = 1; |
347 | break; | 347 | break; |
348 | } | 348 | } |
349 | retry = __process_buffer(journal, jh, bhs,&batch_count); | 349 | retry = __process_buffer(journal, jh, bhs,&batch_count); |
350 | if (!retry && lock_need_resched(&journal->j_list_lock)){ | 350 | if (!retry && (need_resched() || |
351 | spin_needbreak(&journal->j_list_lock))) { | ||
351 | spin_unlock(&journal->j_list_lock); | 352 | spin_unlock(&journal->j_list_lock); |
352 | retry = 1; | 353 | retry = 1; |
353 | break; | 354 | break; |
354 | } | 355 | } |
355 | } | 356 | } |
356 | 357 | ||
357 | if (batch_count) { | 358 | if (batch_count) { |
358 | if (!retry) { | 359 | if (!retry) { |
359 | spin_unlock(&journal->j_list_lock); | 360 | spin_unlock(&journal->j_list_lock); |
360 | retry = 1; | 361 | retry = 1; |
361 | } | 362 | } |
362 | __flush_batch(journal, bhs, &batch_count); | 363 | __flush_batch(journal, bhs, &batch_count); |
363 | } | 364 | } |
364 | 365 | ||
365 | if (retry) { | 366 | if (retry) { |
366 | spin_lock(&journal->j_list_lock); | 367 | spin_lock(&journal->j_list_lock); |
367 | goto restart; | 368 | goto restart; |
368 | } | 369 | } |
369 | /* | 370 | /* |
370 | * Now we have cleaned up the first transaction's checkpoint | 371 | * Now we have cleaned up the first transaction's checkpoint |
371 | * list. Let's clean up the second one | 372 | * list. Let's clean up the second one |
372 | */ | 373 | */ |
373 | __wait_cp_io(journal, transaction); | 374 | __wait_cp_io(journal, transaction); |
374 | } | 375 | } |
375 | out: | 376 | out: |
376 | spin_unlock(&journal->j_list_lock); | 377 | spin_unlock(&journal->j_list_lock); |
377 | result = cleanup_journal_tail(journal); | 378 | result = cleanup_journal_tail(journal); |
378 | if (result < 0) | 379 | if (result < 0) |
379 | return result; | 380 | return result; |
380 | return 0; | 381 | return 0; |
381 | } | 382 | } |
382 | 383 | ||
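
The hunk above in log_do_checkpoint() is the functional change to this file: the old lock_need_resched() test is replaced by an explicit need_resched() || spin_needbreak(&journal->j_list_lock) check before dropping j_list_lock and retrying. The fragment below is a minimal sketch of that back-off pattern in isolation, not the jbd logic itself; the lock, counter and function names (my_list_lock, my_items_left, my_scan) are hypothetical, while need_resched(), spin_needbreak(), cond_resched() and the spinlock primitives are the real kernel APIs used in the hunk.

#include <linux/spinlock.h>
#include <linux/sched.h>

static DEFINE_SPINLOCK(my_list_lock);	/* hypothetical example lock */
static int my_items_left = 1024;	/* hypothetical amount of pending work */

static void my_scan(void)		/* hypothetical long scan under a spinlock */
{
	spin_lock(&my_list_lock);
	while (my_items_left > 0) {
		my_items_left--;	/* stands in for real per-item work */

		/*
		 * Back off if we should reschedule or if another CPU is
		 * waiting for the lock, then reacquire it and continue.
		 */
		if (need_resched() || spin_needbreak(&my_list_lock)) {
			spin_unlock(&my_list_lock);
			cond_resched();
			spin_lock(&my_list_lock);
		}
	}
	spin_unlock(&my_list_lock);
}

The point of the pattern is that the lock is only released when need_resched() or spin_needbreak() reports actual pressure, so the common uncontended case remains a plain lock/unlock around the whole scan.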
383 | /* | 384 | /* |
384 | * Check the list of checkpoint transactions for the journal to see if | 385 | * Check the list of checkpoint transactions for the journal to see if |
385 | * we have already got rid of any since the last update of the log tail | 386 | * we have already got rid of any since the last update of the log tail |
386 | * in the journal superblock. If so, we can instantly roll the | 387 | * in the journal superblock. If so, we can instantly roll the |
387 | * superblock forward to remove those transactions from the log. | 388 | * superblock forward to remove those transactions from the log. |
388 | * | 389 | * |
389 | * Return <0 on error, 0 on success, 1 if there was nothing to clean up. | 390 | * Return <0 on error, 0 on success, 1 if there was nothing to clean up. |
390 | * | 391 | * |
391 | * Called with the journal lock held. | 392 | * Called with the journal lock held. |
392 | * | 393 | * |
393 | * This is the only part of the journaling code which really needs to be | 394 | * This is the only part of the journaling code which really needs to be |
394 | * aware of transaction aborts. Checkpointing involves writing to the | 395 | * aware of transaction aborts. Checkpointing involves writing to the |
395 | * main filesystem area rather than to the journal, so it can proceed | 396 | * main filesystem area rather than to the journal, so it can proceed |
396 | * even in abort state, but we must not update the journal superblock if | 397 | * even in abort state, but we must not update the journal superblock if |
397 | * we have an abort error outstanding. | 398 | * we have an abort error outstanding. |
398 | */ | 399 | */ |
399 | 400 | ||
400 | int cleanup_journal_tail(journal_t *journal) | 401 | int cleanup_journal_tail(journal_t *journal) |
401 | { | 402 | { |
402 | transaction_t * transaction; | 403 | transaction_t * transaction; |
403 | tid_t first_tid; | 404 | tid_t first_tid; |
404 | unsigned long blocknr, freed; | 405 | unsigned long blocknr, freed; |
405 | 406 | ||
406 | /* OK, work out the oldest transaction remaining in the log, and | 407 | /* OK, work out the oldest transaction remaining in the log, and |
407 | * the log block it starts at. | 408 | * the log block it starts at. |
408 | * | 409 | * |
409 | * If the log is now empty, we need to work out which is the | 410 | * If the log is now empty, we need to work out which is the |
410 | * next transaction ID we will write, and where it will | 411 | * next transaction ID we will write, and where it will |
411 | * start. */ | 412 | * start. */ |
412 | 413 | ||
413 | spin_lock(&journal->j_state_lock); | 414 | spin_lock(&journal->j_state_lock); |
414 | spin_lock(&journal->j_list_lock); | 415 | spin_lock(&journal->j_list_lock); |
415 | transaction = journal->j_checkpoint_transactions; | 416 | transaction = journal->j_checkpoint_transactions; |
416 | if (transaction) { | 417 | if (transaction) { |
417 | first_tid = transaction->t_tid; | 418 | first_tid = transaction->t_tid; |
418 | blocknr = transaction->t_log_start; | 419 | blocknr = transaction->t_log_start; |
419 | } else if ((transaction = journal->j_committing_transaction) != NULL) { | 420 | } else if ((transaction = journal->j_committing_transaction) != NULL) { |
420 | first_tid = transaction->t_tid; | 421 | first_tid = transaction->t_tid; |
421 | blocknr = transaction->t_log_start; | 422 | blocknr = transaction->t_log_start; |
422 | } else if ((transaction = journal->j_running_transaction) != NULL) { | 423 | } else if ((transaction = journal->j_running_transaction) != NULL) { |
423 | first_tid = transaction->t_tid; | 424 | first_tid = transaction->t_tid; |
424 | blocknr = journal->j_head; | 425 | blocknr = journal->j_head; |
425 | } else { | 426 | } else { |
426 | first_tid = journal->j_transaction_sequence; | 427 | first_tid = journal->j_transaction_sequence; |
427 | blocknr = journal->j_head; | 428 | blocknr = journal->j_head; |
428 | } | 429 | } |
429 | spin_unlock(&journal->j_list_lock); | 430 | spin_unlock(&journal->j_list_lock); |
430 | J_ASSERT(blocknr != 0); | 431 | J_ASSERT(blocknr != 0); |
431 | 432 | ||
432 | /* If the oldest pinned transaction is at the tail of the log | 433 | /* If the oldest pinned transaction is at the tail of the log |
433 | already then there's not much we can do right now. */ | 434 | already then there's not much we can do right now. */ |
434 | if (journal->j_tail_sequence == first_tid) { | 435 | if (journal->j_tail_sequence == first_tid) { |
435 | spin_unlock(&journal->j_state_lock); | 436 | spin_unlock(&journal->j_state_lock); |
436 | return 1; | 437 | return 1; |
437 | } | 438 | } |
438 | 439 | ||
439 | /* OK, update the superblock to recover the freed space. | 440 | /* OK, update the superblock to recover the freed space. |
440 | * Physical blocks come first: have we wrapped beyond the end of | 441 | * Physical blocks come first: have we wrapped beyond the end of |
441 | * the log? */ | 442 | * the log? */ |
442 | freed = blocknr - journal->j_tail; | 443 | freed = blocknr - journal->j_tail; |
443 | if (blocknr < journal->j_tail) | 444 | if (blocknr < journal->j_tail) |
444 | freed = freed + journal->j_last - journal->j_first; | 445 | freed = freed + journal->j_last - journal->j_first; |
445 | 446 | ||
446 | jbd_debug(1, | 447 | jbd_debug(1, |
447 | "Cleaning journal tail from %d to %d (offset %lu), " | 448 | "Cleaning journal tail from %d to %d (offset %lu), " |
448 | "freeing %lu\n", | 449 | "freeing %lu\n", |
449 | journal->j_tail_sequence, first_tid, blocknr, freed); | 450 | journal->j_tail_sequence, first_tid, blocknr, freed); |
450 | 451 | ||
451 | journal->j_free += freed; | 452 | journal->j_free += freed; |
452 | journal->j_tail_sequence = first_tid; | 453 | journal->j_tail_sequence = first_tid; |
453 | journal->j_tail = blocknr; | 454 | journal->j_tail = blocknr; |
454 | spin_unlock(&journal->j_state_lock); | 455 | spin_unlock(&journal->j_state_lock); |
455 | if (!(journal->j_flags & JFS_ABORT)) | 456 | if (!(journal->j_flags & JFS_ABORT)) |
456 | journal_update_superblock(journal, 1); | 457 | journal_update_superblock(journal, 1); |
457 | return 0; | 458 | return 0; |
458 | } | 459 | } |
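
cleanup_journal_tail() computes how many log blocks were freed, compensating for the circular log wrapping past j_last back to j_first. The following stand-alone example mirrors that unsigned arithmetic; the helper name and the block numbers are made up for illustration.

#include <stdio.h>

/* Blocks freed when the tail moves from old_tail to new_tail in a
 * circular log spanning [j_first, j_last). */
static unsigned long freed_blocks(unsigned long old_tail,
				  unsigned long new_tail,
				  unsigned long j_first,
				  unsigned long j_last)
{
	unsigned long freed = new_tail - old_tail;

	if (new_tail < old_tail)	/* wrapped past the end of the log */
		freed += j_last - j_first;
	return freed;
}

int main(void)
{
	/* log occupies blocks [10, 110); tail moves from 100 to 20 */
	printf("%lu\n", freed_blocks(100, 20, 10, 110));	/* prints 20 */
	return 0;
}
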
459 | 460 | ||
460 | 461 | ||
461 | /* Checkpoint list management */ | 462 | /* Checkpoint list management */ |
462 | 463 | ||
463 | /* | 464 | /* |
464 | * journal_clean_one_cp_list | 465 | * journal_clean_one_cp_list |
465 | * | 466 | * |
466 | * Find all the written-back checkpoint buffers in the given list and release them. | 467 | * Find all the written-back checkpoint buffers in the given list and release them. |
467 | * | 468 | * |
468 | * Called with the journal locked. | 469 | * Called with the journal locked. |
469 | * Called with j_list_lock held. | 470 | * Called with j_list_lock held. |
470 | * Returns number of buffers reaped (for debug) | 471 | * Returns number of buffers reaped (for debug) |
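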
471 | */ | 472 | */ |
472 | 473 | ||
473 | static int journal_clean_one_cp_list(struct journal_head *jh, int *released) | 474 | static int journal_clean_one_cp_list(struct journal_head *jh, int *released) |
474 | { | 475 | { |
475 | struct journal_head *last_jh; | 476 | struct journal_head *last_jh; |
476 | struct journal_head *next_jh = jh; | 477 | struct journal_head *next_jh = jh; |
477 | int ret, freed = 0; | 478 | int ret, freed = 0; |
478 | 479 | ||
479 | *released = 0; | 480 | *released = 0; |
480 | if (!jh) | 481 | if (!jh) |
481 | return 0; | 482 | return 0; |
482 | 483 | ||
483 | last_jh = jh->b_cpprev; | 484 | last_jh = jh->b_cpprev; |
484 | do { | 485 | do { |
485 | jh = next_jh; | 486 | jh = next_jh; |
486 | next_jh = jh->b_cpnext; | 487 | next_jh = jh->b_cpnext; |
487 | /* Use trylock because of the ranking */ | 488 | /* Use trylock because of the ranking */ |
488 | if (jbd_trylock_bh_state(jh2bh(jh))) { | 489 | if (jbd_trylock_bh_state(jh2bh(jh))) { |
489 | ret = __try_to_free_cp_buf(jh); | 490 | ret = __try_to_free_cp_buf(jh); |
490 | if (ret) { | 491 | if (ret) { |
491 | freed++; | 492 | freed++; |
492 | if (ret == 2) { | 493 | if (ret == 2) { |
493 | *released = 1; | 494 | *released = 1; |
494 | return freed; | 495 | return freed; |
495 | } | 496 | } |
496 | } | 497 | } |
497 | } | 498 | } |
498 | /* | 499 | /* |
499 | * This function only frees up some memory | 500 | * This function only frees up some memory |
500 | * if possible so we don't have an obligation | 501 | * if possible so we don't have an obligation |
501 | * to finish processing. Bail out if preemption | 502 | * to finish processing. Bail out if preemption |
502 | * requested: | 503 | * requested: |
503 | */ | 504 | */ |
504 | if (need_resched()) | 505 | if (need_resched()) |
505 | return freed; | 506 | return freed; |
506 | } while (jh != last_jh); | 507 | } while (jh != last_jh); |
507 | 508 | ||
508 | return freed; | 509 | return freed; |
509 | } | 510 | } |
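
journal_clean_one_cp_list() walks a circular, doubly linked list by noting the head's predecessor up front and stopping once it has been visited, bailing out early when a reschedule is pending. Here is a self-contained sketch of that traversal idiom; struct node and the break predicate are illustrative stand-ins, not JBD types.

#include <stdio.h>

struct node {
	int val;
	struct node *next, *prev;
};

static int never_break(void)
{
	return 0;
}

/* Walk head..head->prev once: remember the last node up front, fetch
 * ->next before visiting, and bail out early if the break predicate
 * fires -- a later call simply rescans whatever is left. */
static int scan_once(struct node *head, int (*want_break)(void))
{
	struct node *last, *p, *next = head;
	int visited = 0;

	if (!head)
		return 0;
	last = head->prev;
	do {
		p = next;
		next = p->next;
		printf("visiting %d\n", p->val);
		visited++;
		if (want_break())
			return visited;
	} while (p != last);
	return visited;
}

int main(void)
{
	struct node nodes[4];
	int i;

	for (i = 0; i < 4; i++) {		/* build a four-node ring */
		nodes[i].val = i;
		nodes[i].next = &nodes[(i + 1) % 4];
		nodes[i].prev = &nodes[(i + 3) % 4];
	}
	printf("visited %d node(s)\n", scan_once(&nodes[0], never_break));
	return 0;
}
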
510 | 511 | ||
511 | /* | 512 | /* |
512 | * journal_clean_checkpoint_list | 513 | * journal_clean_checkpoint_list |
513 | * | 514 | * |
514 | * Find all the written-back checkpoint buffers in the journal and release them. | 515 | * Find all the written-back checkpoint buffers in the journal and release them. |
515 | * | 516 | * |
516 | * Called with the journal locked. | 517 | * Called with the journal locked. |
517 | * Called with j_list_lock held. | 518 | * Called with j_list_lock held. |
518 | * Returns number of buffers reaped (for debug) | 519 | * Returns number of buffers reaped (for debug) |
519 | */ | 520 | */ |
520 | 521 | ||
521 | int __journal_clean_checkpoint_list(journal_t *journal) | 522 | int __journal_clean_checkpoint_list(journal_t *journal) |
522 | { | 523 | { |
523 | transaction_t *transaction, *last_transaction, *next_transaction; | 524 | transaction_t *transaction, *last_transaction, *next_transaction; |
524 | int ret = 0; | 525 | int ret = 0; |
525 | int released; | 526 | int released; |
526 | 527 | ||
527 | transaction = journal->j_checkpoint_transactions; | 528 | transaction = journal->j_checkpoint_transactions; |
528 | if (!transaction) | 529 | if (!transaction) |
529 | goto out; | 530 | goto out; |
530 | 531 | ||
531 | last_transaction = transaction->t_cpprev; | 532 | last_transaction = transaction->t_cpprev; |
532 | next_transaction = transaction; | 533 | next_transaction = transaction; |
533 | do { | 534 | do { |
534 | transaction = next_transaction; | 535 | transaction = next_transaction; |
535 | next_transaction = transaction->t_cpnext; | 536 | next_transaction = transaction->t_cpnext; |
536 | ret += journal_clean_one_cp_list(transaction-> | 537 | ret += journal_clean_one_cp_list(transaction-> |
537 | t_checkpoint_list, &released); | 538 | t_checkpoint_list, &released); |
538 | /* | 539 | /* |
539 | * This function only frees up some memory if possible so we | 540 | * This function only frees up some memory if possible so we |
540 | * don't have an obligation to finish processing. Bail out if | 541 | * don't have an obligation to finish processing. Bail out if |
541 | * preemption requested: | 542 | * preemption requested: |
542 | */ | 543 | */ |
543 | if (need_resched()) | 544 | if (need_resched()) |
544 | goto out; | 545 | goto out; |
545 | if (released) | 546 | if (released) |
546 | continue; | 547 | continue; |
547 | /* | 548 | /* |
548 | * It is essential that we are as careful as in the case of | 549 | * It is essential that we are as careful as in the case of |
549 | * t_checkpoint_list with removing the buffer from the list as | 550 | * t_checkpoint_list with removing the buffer from the list as |
550 | * we can possibly see not yet submitted buffers on io_list | 551 | * we can possibly see not yet submitted buffers on io_list |
551 | */ | 552 | */ |
552 | ret += journal_clean_one_cp_list(transaction-> | 553 | ret += journal_clean_one_cp_list(transaction-> |
553 | t_checkpoint_io_list, &released); | 554 | t_checkpoint_io_list, &released); |
554 | if (need_resched()) | 555 | if (need_resched()) |
555 | goto out; | 556 | goto out; |
556 | } while (transaction != last_transaction); | 557 | } while (transaction != last_transaction); |
557 | out: | 558 | out: |
558 | return ret; | 559 | return ret; |
559 | } | 560 | } |
560 | 561 | ||
561 | /* | 562 | /* |
562 | * journal_remove_checkpoint: called after a buffer has been committed | 563 | * journal_remove_checkpoint: called after a buffer has been committed |
563 | * to disk (either by being write-back flushed to disk, or being | 564 | * to disk (either by being write-back flushed to disk, or being |
564 | * committed to the log). | 565 | * committed to the log). |
565 | * | 566 | * |
566 | * We cannot safely clean a transaction out of the log until all of the | 567 | * We cannot safely clean a transaction out of the log until all of the |
567 | * buffer updates committed in that transaction have safely been stored | 568 | * buffer updates committed in that transaction have safely been stored |
568 | * elsewhere on disk. To achieve this, all of the buffers in a | 569 | * elsewhere on disk. To achieve this, all of the buffers in a |
569 | * transaction need to be maintained on the transaction's checkpoint | 570 | * transaction need to be maintained on the transaction's checkpoint |
570 | * lists until they have been rewritten, at which point this function is | 571 | * lists until they have been rewritten, at which point this function is |
571 | * called to remove the buffer from the existing transaction's | 572 | * called to remove the buffer from the existing transaction's |
572 | * checkpoint lists. | 573 | * checkpoint lists. |
573 | * | 574 | * |
574 | * The function returns 1 if it frees the transaction, 0 otherwise. | 575 | * The function returns 1 if it frees the transaction, 0 otherwise. |
575 | * | 576 | * |
576 | * This function is called with the journal locked. | 577 | * This function is called with the journal locked. |
577 | * This function is called with j_list_lock held. | 578 | * This function is called with j_list_lock held. |
578 | * This function is called with jbd_lock_bh_state(jh2bh(jh)) | 579 | * This function is called with jbd_lock_bh_state(jh2bh(jh)) |
579 | */ | 580 | */ |
580 | 581 | ||
581 | int __journal_remove_checkpoint(struct journal_head *jh) | 582 | int __journal_remove_checkpoint(struct journal_head *jh) |
582 | { | 583 | { |
583 | transaction_t *transaction; | 584 | transaction_t *transaction; |
584 | journal_t *journal; | 585 | journal_t *journal; |
585 | int ret = 0; | 586 | int ret = 0; |
586 | 587 | ||
587 | JBUFFER_TRACE(jh, "entry"); | 588 | JBUFFER_TRACE(jh, "entry"); |
588 | 589 | ||
589 | if ((transaction = jh->b_cp_transaction) == NULL) { | 590 | if ((transaction = jh->b_cp_transaction) == NULL) { |
590 | JBUFFER_TRACE(jh, "not on transaction"); | 591 | JBUFFER_TRACE(jh, "not on transaction"); |
591 | goto out; | 592 | goto out; |
592 | } | 593 | } |
593 | journal = transaction->t_journal; | 594 | journal = transaction->t_journal; |
594 | 595 | ||
595 | __buffer_unlink(jh); | 596 | __buffer_unlink(jh); |
596 | jh->b_cp_transaction = NULL; | 597 | jh->b_cp_transaction = NULL; |
597 | 598 | ||
598 | if (transaction->t_checkpoint_list != NULL || | 599 | if (transaction->t_checkpoint_list != NULL || |
599 | transaction->t_checkpoint_io_list != NULL) | 600 | transaction->t_checkpoint_io_list != NULL) |
600 | goto out; | 601 | goto out; |
601 | JBUFFER_TRACE(jh, "transaction has no more buffers"); | 602 | JBUFFER_TRACE(jh, "transaction has no more buffers"); |
602 | 603 | ||
603 | /* | 604 | /* |
604 | * There is one special case to worry about: if we have just pulled the | 605 | * There is one special case to worry about: if we have just pulled the |
605 | * buffer off a running or committing transaction's checkpoint list, | 606 | * buffer off a running or committing transaction's checkpoint list, |
606 | * then even if the checkpoint list is empty, the transaction obviously | 607 | * then even if the checkpoint list is empty, the transaction obviously |
607 | * cannot be dropped! | 608 | * cannot be dropped! |
608 | * | 609 | * |
609 | * The locking here around t_state is a bit sleazy. | 610 | * The locking here around t_state is a bit sleazy. |
610 | * See the comment at the end of journal_commit_transaction(). | 611 | * See the comment at the end of journal_commit_transaction(). |
611 | */ | 612 | */ |
612 | if (transaction->t_state != T_FINISHED) { | 613 | if (transaction->t_state != T_FINISHED) { |
613 | JBUFFER_TRACE(jh, "belongs to running/committing transaction"); | 614 | JBUFFER_TRACE(jh, "belongs to running/committing transaction"); |
614 | goto out; | 615 | goto out; |
615 | } | 616 | } |
616 | 617 | ||
617 | /* OK, that was the last buffer for the transaction: we can now | 618 | /* OK, that was the last buffer for the transaction: we can now |
618 | safely remove this transaction from the log */ | 619 | safely remove this transaction from the log */ |
619 | 620 | ||
620 | __journal_drop_transaction(journal, transaction); | 621 | __journal_drop_transaction(journal, transaction); |
621 | 622 | ||
622 | /* Just in case anybody was waiting for more transactions to be | 623 | /* Just in case anybody was waiting for more transactions to be |
623 | checkpointed... */ | 624 | checkpointed... */ |
624 | wake_up(&journal->j_wait_logspace); | 625 | wake_up(&journal->j_wait_logspace); |
625 | ret = 1; | 626 | ret = 1; |
626 | out: | 627 | out: |
627 | JBUFFER_TRACE(jh, "exit"); | 628 | JBUFFER_TRACE(jh, "exit"); |
628 | return ret; | 629 | return ret; |
629 | } | 630 | } |
630 | 631 | ||
631 | /* | 632 | /* |
632 | * journal_insert_checkpoint: put a committed buffer onto a checkpoint | 633 | * journal_insert_checkpoint: put a committed buffer onto a checkpoint |
633 | * list so that we know when it is safe to clean the transaction out of | 634 | * list so that we know when it is safe to clean the transaction out of |
634 | * the log. | 635 | * the log. |
635 | * | 636 | * |
636 | * Called with the journal locked. | 637 | * Called with the journal locked. |
637 | * Called with j_list_lock held. | 638 | * Called with j_list_lock held. |
638 | */ | 639 | */ |
639 | void __journal_insert_checkpoint(struct journal_head *jh, | 640 | void __journal_insert_checkpoint(struct journal_head *jh, |
640 | transaction_t *transaction) | 641 | transaction_t *transaction) |
641 | { | 642 | { |
642 | JBUFFER_TRACE(jh, "entry"); | 643 | JBUFFER_TRACE(jh, "entry"); |
643 | J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); | 644 | J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); |
644 | J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); | 645 | J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); |
645 | 646 | ||
646 | jh->b_cp_transaction = transaction; | 647 | jh->b_cp_transaction = transaction; |
647 | 648 | ||
648 | if (!transaction->t_checkpoint_list) { | 649 | if (!transaction->t_checkpoint_list) { |
649 | jh->b_cpnext = jh->b_cpprev = jh; | 650 | jh->b_cpnext = jh->b_cpprev = jh; |
650 | } else { | 651 | } else { |
651 | jh->b_cpnext = transaction->t_checkpoint_list; | 652 | jh->b_cpnext = transaction->t_checkpoint_list; |
652 | jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; | 653 | jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; |
653 | jh->b_cpprev->b_cpnext = jh; | 654 | jh->b_cpprev->b_cpnext = jh; |
654 | jh->b_cpnext->b_cpprev = jh; | 655 | jh->b_cpnext->b_cpprev = jh; |
655 | } | 656 | } |
656 | transaction->t_checkpoint_list = jh; | 657 | transaction->t_checkpoint_list = jh; |
657 | } | 658 | } |
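
The pointer splice in __journal_insert_checkpoint() is the usual "link a new element in front of the current head of a circular doubly linked list" operation, with a NULL head meaning an empty list. A stand-alone sketch of the same splice, using an illustrative struct rather than journal_head:

#include <stdio.h>

struct cnode {
	int val;
	struct cnode *next, *prev;
};

/* Link n immediately before *head and make it the new head; an empty
 * list is represented by *head == NULL, as with the checkpoint lists. */
static void push_head(struct cnode **head, struct cnode *n)
{
	if (!*head) {
		n->next = n->prev = n;
	} else {
		n->next = *head;
		n->prev = (*head)->prev;
		n->prev->next = n;
		n->next->prev = n;
	}
	*head = n;
}

int main(void)
{
	struct cnode a = { .val = 1 }, b = { .val = 2 }, c = { .val = 3 };
	struct cnode *head = NULL, *p;

	push_head(&head, &a);
	push_head(&head, &b);
	push_head(&head, &c);
	p = head;
	do {
		printf("%d ", p->val);	/* prints: 3 2 1 */
		p = p->next;
	} while (p != head);
	printf("\n");
	return 0;
}
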
658 | 659 | ||
659 | /* | 660 | /* |
660 | * We've finished with this transaction structure: adios... | 661 | * We've finished with this transaction structure: adios... |
661 | * | 662 | * |
662 | * The transaction must have no links except for the checkpoint by this | 663 | * The transaction must have no links except for the checkpoint by this |
663 | * point. | 664 | * point. |
664 | * | 665 | * |
665 | * Called with the journal locked. | 666 | * Called with the journal locked. |
666 | * Called with j_list_lock held. | 667 | * Called with j_list_lock held. |
667 | */ | 668 | */ |
668 | 669 | ||
669 | void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) | 670 | void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) |
670 | { | 671 | { |
671 | assert_spin_locked(&journal->j_list_lock); | 672 | assert_spin_locked(&journal->j_list_lock); |
672 | if (transaction->t_cpnext) { | 673 | if (transaction->t_cpnext) { |
673 | transaction->t_cpnext->t_cpprev = transaction->t_cpprev; | 674 | transaction->t_cpnext->t_cpprev = transaction->t_cpprev; |
674 | transaction->t_cpprev->t_cpnext = transaction->t_cpnext; | 675 | transaction->t_cpprev->t_cpnext = transaction->t_cpnext; |
675 | if (journal->j_checkpoint_transactions == transaction) | 676 | if (journal->j_checkpoint_transactions == transaction) |
676 | journal->j_checkpoint_transactions = | 677 | journal->j_checkpoint_transactions = |
677 | transaction->t_cpnext; | 678 | transaction->t_cpnext; |
678 | if (journal->j_checkpoint_transactions == transaction) | 679 | if (journal->j_checkpoint_transactions == transaction) |
679 | journal->j_checkpoint_transactions = NULL; | 680 | journal->j_checkpoint_transactions = NULL; |
680 | } | 681 | } |
681 | 682 | ||
682 | J_ASSERT(transaction->t_state == T_FINISHED); | 683 | J_ASSERT(transaction->t_state == T_FINISHED); |
683 | J_ASSERT(transaction->t_buffers == NULL); | 684 | J_ASSERT(transaction->t_buffers == NULL); |
684 | J_ASSERT(transaction->t_sync_datalist == NULL); | 685 | J_ASSERT(transaction->t_sync_datalist == NULL); |
685 | J_ASSERT(transaction->t_forget == NULL); | 686 | J_ASSERT(transaction->t_forget == NULL); |
686 | J_ASSERT(transaction->t_iobuf_list == NULL); | 687 | J_ASSERT(transaction->t_iobuf_list == NULL); |
687 | J_ASSERT(transaction->t_shadow_list == NULL); | 688 | J_ASSERT(transaction->t_shadow_list == NULL); |
688 | J_ASSERT(transaction->t_log_list == NULL); | 689 | J_ASSERT(transaction->t_log_list == NULL); |
689 | J_ASSERT(transaction->t_checkpoint_list == NULL); | 690 | J_ASSERT(transaction->t_checkpoint_list == NULL); |
690 | J_ASSERT(transaction->t_checkpoint_io_list == NULL); | 691 | J_ASSERT(transaction->t_checkpoint_io_list == NULL); |
691 | J_ASSERT(transaction->t_updates == 0); | 692 | J_ASSERT(transaction->t_updates == 0); |
692 | J_ASSERT(journal->j_committing_transaction != transaction); | 693 | J_ASSERT(journal->j_committing_transaction != transaction); |
693 | J_ASSERT(journal->j_running_transaction != transaction); | 694 | J_ASSERT(journal->j_running_transaction != transaction); |
694 | 695 | ||
695 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); | 696 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); |
696 | kfree(transaction); | 697 | kfree(transaction); |
697 | } | 698 | } |
698 | 699 |
fs/jbd/commit.c
1 | /* | 1 | /* |
2 | * linux/fs/jbd/commit.c | 2 | * linux/fs/jbd/commit.c |
3 | * | 3 | * |
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | 4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 |
5 | * | 5 | * |
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | 6 | * Copyright 1998 Red Hat corp --- All Rights Reserved |
7 | * | 7 | * |
8 | * This file is part of the Linux kernel and is made available under | 8 | * This file is part of the Linux kernel and is made available under |
9 | * the terms of the GNU General Public License, version 2, or at your | 9 | * the terms of the GNU General Public License, version 2, or at your |
10 | * option, any later version, incorporated herein by reference. | 10 | * option, any later version, incorporated herein by reference. |
11 | * | 11 | * |
12 | * Journal commit routines for the generic filesystem journaling code; | 12 | * Journal commit routines for the generic filesystem journaling code; |
13 | * part of the ext2fs journaling system. | 13 | * part of the ext2fs journaling system. |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/time.h> | 16 | #include <linux/time.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/jbd.h> | 18 | #include <linux/jbd.h> |
19 | #include <linux/errno.h> | 19 | #include <linux/errno.h> |
20 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * Default IO end handler for temporary BJ_IO buffer_heads. | 25 | * Default IO end handler for temporary BJ_IO buffer_heads. |
26 | */ | 26 | */ |
27 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | 27 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) |
28 | { | 28 | { |
29 | BUFFER_TRACE(bh, ""); | 29 | BUFFER_TRACE(bh, ""); |
30 | if (uptodate) | 30 | if (uptodate) |
31 | set_buffer_uptodate(bh); | 31 | set_buffer_uptodate(bh); |
32 | else | 32 | else |
33 | clear_buffer_uptodate(bh); | 33 | clear_buffer_uptodate(bh); |
34 | unlock_buffer(bh); | 34 | unlock_buffer(bh); |
35 | } | 35 | } |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * When an ext3-ordered file is truncated, it is possible that many pages are | 38 | * When an ext3-ordered file is truncated, it is possible that many pages are |
39 | * not successfully freed, because they are attached to a committing transaction. | 39 | * not successfully freed, because they are attached to a committing transaction. |
40 | * After the transaction commits, these pages are left on the LRU, with no | 40 | * After the transaction commits, these pages are left on the LRU, with no |
41 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | 41 | * ->mapping, and with attached buffers. These pages are trivially reclaimable |
42 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | 42 | * by the VM, but their apparent absence upsets the VM accounting, and it makes |
43 | * the numbers in /proc/meminfo look odd. | 43 | * the numbers in /proc/meminfo look odd. |
44 | * | 44 | * |
45 | * So here, we have a buffer which has just come off the forget list. Look to | 45 | * So here, we have a buffer which has just come off the forget list. Look to |
46 | * see if we can strip all buffers from the backing page. | 46 | * see if we can strip all buffers from the backing page. |
47 | * | 47 | * |
48 | * Called under lock_journal(), and possibly under journal_datalist_lock. The | 48 | * Called under lock_journal(), and possibly under journal_datalist_lock. The |
49 | * caller provided us with a ref against the buffer, and we drop that here. | 49 | * caller provided us with a ref against the buffer, and we drop that here. |
50 | */ | 50 | */ |
51 | static void release_buffer_page(struct buffer_head *bh) | 51 | static void release_buffer_page(struct buffer_head *bh) |
52 | { | 52 | { |
53 | struct page *page; | 53 | struct page *page; |
54 | 54 | ||
55 | if (buffer_dirty(bh)) | 55 | if (buffer_dirty(bh)) |
56 | goto nope; | 56 | goto nope; |
57 | if (atomic_read(&bh->b_count) != 1) | 57 | if (atomic_read(&bh->b_count) != 1) |
58 | goto nope; | 58 | goto nope; |
59 | page = bh->b_page; | 59 | page = bh->b_page; |
60 | if (!page) | 60 | if (!page) |
61 | goto nope; | 61 | goto nope; |
62 | if (page->mapping) | 62 | if (page->mapping) |
63 | goto nope; | 63 | goto nope; |
64 | 64 | ||
65 | /* OK, it's a truncated page */ | 65 | /* OK, it's a truncated page */ |
66 | if (TestSetPageLocked(page)) | 66 | if (TestSetPageLocked(page)) |
67 | goto nope; | 67 | goto nope; |
68 | 68 | ||
69 | page_cache_get(page); | 69 | page_cache_get(page); |
70 | __brelse(bh); | 70 | __brelse(bh); |
71 | try_to_free_buffers(page); | 71 | try_to_free_buffers(page); |
72 | unlock_page(page); | 72 | unlock_page(page); |
73 | page_cache_release(page); | 73 | page_cache_release(page); |
74 | return; | 74 | return; |
75 | 75 | ||
76 | nope: | 76 | nope: |
77 | __brelse(bh); | 77 | __brelse(bh); |
78 | } | 78 | } |
79 | 79 | ||
80 | /* | 80 | /* |
81 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | 81 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is |
82 | * held. For ranking reasons we must trylock. If we lose, schedule away and | 82 | * held. For ranking reasons we must trylock. If we lose, schedule away and |
83 | * return 0. j_list_lock is dropped in this case. | 83 | * return 0. j_list_lock is dropped in this case. |
84 | */ | 84 | */ |
85 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) | 85 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) |
86 | { | 86 | { |
87 | if (!jbd_trylock_bh_state(bh)) { | 87 | if (!jbd_trylock_bh_state(bh)) { |
88 | spin_unlock(&journal->j_list_lock); | 88 | spin_unlock(&journal->j_list_lock); |
89 | schedule(); | 89 | schedule(); |
90 | return 0; | 90 | return 0; |
91 | } | 91 | } |
92 | return 1; | 92 | return 1; |
93 | } | 93 | } |
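
inverted_lock() above takes a lock out of its normal ranking order by trylocking it and, if that fails, dropping the lock it already holds so the caller can start over. A user-space sketch of that idea with two pthread mutexes follows; the lock names are invented for the example and do not correspond to the kernel locks.

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with list_lock held, needing state_lock (which normally ranks
 * first): trylock it, and on failure drop list_lock and tell the
 * caller to retry from scratch instead of risking a deadlock. */
static int inverted_lock(void)
{
	if (pthread_mutex_trylock(&state_lock) != 0) {
		pthread_mutex_unlock(&list_lock);
		sched_yield();		/* rough analogue of schedule() */
		return 0;
	}
	return 1;
}

int main(void)
{
	pthread_mutex_lock(&list_lock);
	if (inverted_lock()) {
		/* both locks held without violating the ordering rules */
		pthread_mutex_unlock(&state_lock);
		pthread_mutex_unlock(&list_lock);
		printf("took both locks\n");
	} else {
		printf("lost the race, caller retries\n");
	}
	return 0;
}
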
94 | 94 | ||
95 | /* Done it all: now write the commit record. We should have | 95 | /* Done it all: now write the commit record. We should have |
96 | * cleaned up our previous buffers by now, so if we are in abort | 96 | * cleaned up our previous buffers by now, so if we are in abort |
97 | * mode we can now just skip the rest of the journal write | 97 | * mode we can now just skip the rest of the journal write |
98 | * entirely. | 98 | * entirely. |
99 | * | 99 | * |
100 | * Returns 1 if the journal needs to be aborted or 0 on success | 100 | * Returns 1 if the journal needs to be aborted or 0 on success |
101 | */ | 101 | */ |
102 | static int journal_write_commit_record(journal_t *journal, | 102 | static int journal_write_commit_record(journal_t *journal, |
103 | transaction_t *commit_transaction) | 103 | transaction_t *commit_transaction) |
104 | { | 104 | { |
105 | struct journal_head *descriptor; | 105 | struct journal_head *descriptor; |
106 | struct buffer_head *bh; | 106 | struct buffer_head *bh; |
107 | int i, ret; | 107 | int i, ret; |
108 | int barrier_done = 0; | 108 | int barrier_done = 0; |
109 | 109 | ||
110 | if (is_journal_aborted(journal)) | 110 | if (is_journal_aborted(journal)) |
111 | return 0; | 111 | return 0; |
112 | 112 | ||
113 | descriptor = journal_get_descriptor_buffer(journal); | 113 | descriptor = journal_get_descriptor_buffer(journal); |
114 | if (!descriptor) | 114 | if (!descriptor) |
115 | return 1; | 115 | return 1; |
116 | 116 | ||
117 | bh = jh2bh(descriptor); | 117 | bh = jh2bh(descriptor); |
118 | 118 | ||
119 | /* AKPM: buglet - add `i' to tmp! */ | 119 | /* AKPM: buglet - add `i' to tmp! */ |
120 | for (i = 0; i < bh->b_size; i += 512) { | 120 | for (i = 0; i < bh->b_size; i += 512) { |
121 | journal_header_t *tmp = (journal_header_t*)bh->b_data; | 121 | journal_header_t *tmp = (journal_header_t*)bh->b_data; |
122 | tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | 122 | tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); |
123 | tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); | 123 | tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); |
124 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 124 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
125 | } | 125 | } |
126 | 126 | ||
127 | JBUFFER_TRACE(descriptor, "write commit block"); | 127 | JBUFFER_TRACE(descriptor, "write commit block"); |
128 | set_buffer_dirty(bh); | 128 | set_buffer_dirty(bh); |
129 | if (journal->j_flags & JFS_BARRIER) { | 129 | if (journal->j_flags & JFS_BARRIER) { |
130 | set_buffer_ordered(bh); | 130 | set_buffer_ordered(bh); |
131 | barrier_done = 1; | 131 | barrier_done = 1; |
132 | } | 132 | } |
133 | ret = sync_dirty_buffer(bh); | 133 | ret = sync_dirty_buffer(bh); |
134 | /* is it possible for another commit to fail at roughly | 134 | /* is it possible for another commit to fail at roughly |
135 | * the same time as this one? If so, we don't want to | 135 | * the same time as this one? If so, we don't want to |
136 | * trust the barrier flag in the super, but instead want | 136 | * trust the barrier flag in the super, but instead want |
137 | * to remember if we sent a barrier request | 137 | * to remember if we sent a barrier request |
138 | */ | 138 | */ |
139 | if (ret == -EOPNOTSUPP && barrier_done) { | 139 | if (ret == -EOPNOTSUPP && barrier_done) { |
140 | char b[BDEVNAME_SIZE]; | 140 | char b[BDEVNAME_SIZE]; |
141 | 141 | ||
142 | printk(KERN_WARNING | 142 | printk(KERN_WARNING |
143 | "JBD: barrier-based sync failed on %s - " | 143 | "JBD: barrier-based sync failed on %s - " |
144 | "disabling barriers\n", | 144 | "disabling barriers\n", |
145 | bdevname(journal->j_dev, b)); | 145 | bdevname(journal->j_dev, b)); |
146 | spin_lock(&journal->j_state_lock); | 146 | spin_lock(&journal->j_state_lock); |
147 | journal->j_flags &= ~JFS_BARRIER; | 147 | journal->j_flags &= ~JFS_BARRIER; |
148 | spin_unlock(&journal->j_state_lock); | 148 | spin_unlock(&journal->j_state_lock); |
149 | 149 | ||
150 | /* And try again, without the barrier */ | 150 | /* And try again, without the barrier */ |
151 | clear_buffer_ordered(bh); | 151 | clear_buffer_ordered(bh); |
152 | set_buffer_uptodate(bh); | 152 | set_buffer_uptodate(bh); |
153 | set_buffer_dirty(bh); | 153 | set_buffer_dirty(bh); |
154 | ret = sync_dirty_buffer(bh); | 154 | ret = sync_dirty_buffer(bh); |
155 | } | 155 | } |
156 | put_bh(bh); /* One for getblk() */ | 156 | put_bh(bh); /* One for getblk() */ |
157 | journal_put_journal_head(descriptor); | 157 | journal_put_journal_head(descriptor); |
158 | 158 | ||
159 | return (ret == -EIO); | 159 | return (ret == -EIO); |
160 | } | 160 | } |
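
journal_write_commit_record() above retries the commit-block write without the barrier when the device reports -EOPNOTSUPP, and remembers the decision so later commits skip the barrier entirely. The sketch below shows only the shape of that fallback; write_block(), FLAG_BARRIER and barriers_enabled are illustrative stand-ins, not the block-layer interface.

#include <errno.h>
#include <stdio.h>

#define FLAG_BARRIER 0x1

static int barriers_enabled = 1;	/* analogue of JFS_BARRIER in j_flags */

/* Stub device: rejects barrier writes, accepts plain ones. */
static int write_block(int flags)
{
	if (flags & FLAG_BARRIER)
		return -EOPNOTSUPP;
	return 0;
}

static int write_commit_block(void)
{
	int ret = write_block(barriers_enabled ? FLAG_BARRIER : 0);

	if (ret == -EOPNOTSUPP && barriers_enabled) {
		fprintf(stderr, "barrier write failed - disabling barriers\n");
		barriers_enabled = 0;		/* remember for later commits */
		ret = write_block(0);		/* retry this one without the barrier */
	}
	return ret;
}

int main(void)
{
	printf("first commit:  %d\n", write_commit_block());	/* falls back, returns 0 */
	printf("second commit: %d\n", write_commit_block());	/* plain write, returns 0 */
	return 0;
}
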
161 | 161 | ||
162 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | 162 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) |
163 | { | 163 | { |
164 | int i; | 164 | int i; |
165 | 165 | ||
166 | for (i = 0; i < bufs; i++) { | 166 | for (i = 0; i < bufs; i++) { |
167 | wbuf[i]->b_end_io = end_buffer_write_sync; | 167 | wbuf[i]->b_end_io = end_buffer_write_sync; |
168 | /* We use up our safety reference in submit_bh() */ | 168 | /* We use up our safety reference in submit_bh() */ |
169 | submit_bh(WRITE, wbuf[i]); | 169 | submit_bh(WRITE, wbuf[i]); |
170 | } | 170 | } |
171 | } | 171 | } |
172 | 172 | ||
173 | /* | 173 | /* |
174 | * Submit all the data buffers to disk | 174 | * Submit all the data buffers to disk |
175 | */ | 175 | */ |
176 | static void journal_submit_data_buffers(journal_t *journal, | 176 | static void journal_submit_data_buffers(journal_t *journal, |
177 | transaction_t *commit_transaction) | 177 | transaction_t *commit_transaction) |
178 | { | 178 | { |
179 | struct journal_head *jh; | 179 | struct journal_head *jh; |
180 | struct buffer_head *bh; | 180 | struct buffer_head *bh; |
181 | int locked; | 181 | int locked; |
182 | int bufs = 0; | 182 | int bufs = 0; |
183 | struct buffer_head **wbuf = journal->j_wbuf; | 183 | struct buffer_head **wbuf = journal->j_wbuf; |
184 | 184 | ||
185 | /* | 185 | /* |
186 | * Whenever we unlock the journal and sleep, things can get added | 186 | * Whenever we unlock the journal and sleep, things can get added |
187 | * onto ->t_sync_datalist, so we have to keep looping back to | 187 | * onto ->t_sync_datalist, so we have to keep looping back to |
188 | * write_out_data until we *know* that the list is empty. | 188 | * write_out_data until we *know* that the list is empty. |
189 | * | 189 | * |
190 | * Cleanup any flushed data buffers from the data list. Even in | 190 | * Cleanup any flushed data buffers from the data list. Even in |
191 | * abort mode, we want to flush this out as soon as possible. | 191 | * abort mode, we want to flush this out as soon as possible. |
192 | */ | 192 | */ |
193 | write_out_data: | 193 | write_out_data: |
194 | cond_resched(); | 194 | cond_resched(); |
195 | spin_lock(&journal->j_list_lock); | 195 | spin_lock(&journal->j_list_lock); |
196 | 196 | ||
197 | while (commit_transaction->t_sync_datalist) { | 197 | while (commit_transaction->t_sync_datalist) { |
198 | jh = commit_transaction->t_sync_datalist; | 198 | jh = commit_transaction->t_sync_datalist; |
199 | bh = jh2bh(jh); | 199 | bh = jh2bh(jh); |
200 | locked = 0; | 200 | locked = 0; |
201 | 201 | ||
202 | /* Get reference just to make sure buffer does not disappear | 202 | /* Get reference just to make sure buffer does not disappear |
203 | * when we are forced to drop various locks */ | 203 | * when we are forced to drop various locks */ |
204 | get_bh(bh); | 204 | get_bh(bh); |
205 | /* If the buffer is dirty, we need to submit IO and hence | 205 | /* If the buffer is dirty, we need to submit IO and hence |
206 | * we need the buffer lock. We try to lock the buffer without | 206 | * we need the buffer lock. We try to lock the buffer without |
207 | * blocking. If we fail, we need to drop j_list_lock and do | 207 | * blocking. If we fail, we need to drop j_list_lock and do |
208 | * blocking lock_buffer(). | 208 | * blocking lock_buffer(). |
209 | */ | 209 | */ |
210 | if (buffer_dirty(bh)) { | 210 | if (buffer_dirty(bh)) { |
211 | if (test_set_buffer_locked(bh)) { | 211 | if (test_set_buffer_locked(bh)) { |
212 | BUFFER_TRACE(bh, "needs blocking lock"); | 212 | BUFFER_TRACE(bh, "needs blocking lock"); |
213 | spin_unlock(&journal->j_list_lock); | 213 | spin_unlock(&journal->j_list_lock); |
214 | /* Write out all data to prevent deadlocks */ | 214 | /* Write out all data to prevent deadlocks */ |
215 | journal_do_submit_data(wbuf, bufs); | 215 | journal_do_submit_data(wbuf, bufs); |
216 | bufs = 0; | 216 | bufs = 0; |
217 | lock_buffer(bh); | 217 | lock_buffer(bh); |
218 | spin_lock(&journal->j_list_lock); | 218 | spin_lock(&journal->j_list_lock); |
219 | } | 219 | } |
220 | locked = 1; | 220 | locked = 1; |
221 | } | 221 | } |
222 | /* We have to get bh_state lock. Again out of order, sigh. */ | 222 | /* We have to get bh_state lock. Again out of order, sigh. */ |
223 | if (!inverted_lock(journal, bh)) { | 223 | if (!inverted_lock(journal, bh)) { |
224 | jbd_lock_bh_state(bh); | 224 | jbd_lock_bh_state(bh); |
225 | spin_lock(&journal->j_list_lock); | 225 | spin_lock(&journal->j_list_lock); |
226 | } | 226 | } |
227 | /* Someone already cleaned up the buffer? */ | 227 | /* Someone already cleaned up the buffer? */ |
228 | if (!buffer_jbd(bh) | 228 | if (!buffer_jbd(bh) |
229 | || jh->b_transaction != commit_transaction | 229 | || jh->b_transaction != commit_transaction |
230 | || jh->b_jlist != BJ_SyncData) { | 230 | || jh->b_jlist != BJ_SyncData) { |
231 | jbd_unlock_bh_state(bh); | 231 | jbd_unlock_bh_state(bh); |
232 | if (locked) | 232 | if (locked) |
233 | unlock_buffer(bh); | 233 | unlock_buffer(bh); |
234 | BUFFER_TRACE(bh, "already cleaned up"); | 234 | BUFFER_TRACE(bh, "already cleaned up"); |
235 | put_bh(bh); | 235 | put_bh(bh); |
236 | continue; | 236 | continue; |
237 | } | 237 | } |
238 | if (locked && test_clear_buffer_dirty(bh)) { | 238 | if (locked && test_clear_buffer_dirty(bh)) { |
239 | BUFFER_TRACE(bh, "needs writeout, adding to array"); | 239 | BUFFER_TRACE(bh, "needs writeout, adding to array"); |
240 | wbuf[bufs++] = bh; | 240 | wbuf[bufs++] = bh; |
241 | __journal_file_buffer(jh, commit_transaction, | 241 | __journal_file_buffer(jh, commit_transaction, |
242 | BJ_Locked); | 242 | BJ_Locked); |
243 | jbd_unlock_bh_state(bh); | 243 | jbd_unlock_bh_state(bh); |
244 | if (bufs == journal->j_wbufsize) { | 244 | if (bufs == journal->j_wbufsize) { |
245 | spin_unlock(&journal->j_list_lock); | 245 | spin_unlock(&journal->j_list_lock); |
246 | journal_do_submit_data(wbuf, bufs); | 246 | journal_do_submit_data(wbuf, bufs); |
247 | bufs = 0; | 247 | bufs = 0; |
248 | goto write_out_data; | 248 | goto write_out_data; |
249 | } | 249 | } |
250 | } else if (!locked && buffer_locked(bh)) { | 250 | } else if (!locked && buffer_locked(bh)) { |
251 | __journal_file_buffer(jh, commit_transaction, | 251 | __journal_file_buffer(jh, commit_transaction, |
252 | BJ_Locked); | 252 | BJ_Locked); |
253 | jbd_unlock_bh_state(bh); | 253 | jbd_unlock_bh_state(bh); |
254 | put_bh(bh); | 254 | put_bh(bh); |
255 | } else { | 255 | } else { |
256 | BUFFER_TRACE(bh, "writeout complete: unfile"); | 256 | BUFFER_TRACE(bh, "writeout complete: unfile"); |
257 | __journal_unfile_buffer(jh); | 257 | __journal_unfile_buffer(jh); |
258 | jbd_unlock_bh_state(bh); | 258 | jbd_unlock_bh_state(bh); |
259 | if (locked) | 259 | if (locked) |
260 | unlock_buffer(bh); | 260 | unlock_buffer(bh); |
261 | journal_remove_journal_head(bh); | 261 | journal_remove_journal_head(bh); |
262 | /* Once for our safety reference, once for | 262 | /* Once for our safety reference, once for |
263 | * journal_remove_journal_head() */ | 263 | * journal_remove_journal_head() */ |
264 | put_bh(bh); | 264 | put_bh(bh); |
265 | put_bh(bh); | 265 | put_bh(bh); |
266 | } | 266 | } |
267 | 267 | ||
268 | if (lock_need_resched(&journal->j_list_lock)) { | 268 | if (need_resched() || spin_needbreak(&journal->j_list_lock)) { |
269 | spin_unlock(&journal->j_list_lock); | 269 | spin_unlock(&journal->j_list_lock); |
270 | goto write_out_data; | 270 | goto write_out_data; |
271 | } | 271 | } |
272 | } | 272 | } |
273 | spin_unlock(&journal->j_list_lock); | 273 | spin_unlock(&journal->j_list_lock); |
274 | journal_do_submit_data(wbuf, bufs); | 274 | journal_do_submit_data(wbuf, bufs); |
275 | } | 275 | } |
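
journal_submit_data_buffers() collects dirty buffers into the fixed-size wbuf[] array and pushes out the whole batch once the array fills up or the lock has to be released. The batch-and-flush pattern in isolation looks like the sketch below; submit_batch() just prints where the real code calls journal_do_submit_data(), and the batch size is arbitrary.

#include <stdio.h>

#define BATCH_SIZE 4

static void submit_batch(int *batch, int n)
{
	int i;

	printf("submitting %d item(s):", n);
	for (i = 0; i < n; i++)
		printf(" %d", batch[i]);
	printf("\n");
}

int main(void)
{
	int batch[BATCH_SIZE];
	int nbatched = 0;
	int item;

	for (item = 0; item < 10; item++) {
		batch[nbatched++] = item;	/* queue one "dirty buffer" */
		if (nbatched == BATCH_SIZE) {	/* array full: flush it now */
			submit_batch(batch, nbatched);
			nbatched = 0;
		}
	}
	if (nbatched)				/* leftovers at the end */
		submit_batch(batch, nbatched);
	return 0;
}
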
276 | 276 | ||
277 | /* | 277 | /* |
278 | * journal_commit_transaction | 278 | * journal_commit_transaction |
279 | * | 279 | * |
280 | * The primary function for committing a transaction to the log. This | 280 | * The primary function for committing a transaction to the log. This |
281 | * function is called by the journal thread to begin a complete commit. | 281 | * function is called by the journal thread to begin a complete commit. |
282 | */ | 282 | */ |
283 | void journal_commit_transaction(journal_t *journal) | 283 | void journal_commit_transaction(journal_t *journal) |
284 | { | 284 | { |
285 | transaction_t *commit_transaction; | 285 | transaction_t *commit_transaction; |
286 | struct journal_head *jh, *new_jh, *descriptor; | 286 | struct journal_head *jh, *new_jh, *descriptor; |
287 | struct buffer_head **wbuf = journal->j_wbuf; | 287 | struct buffer_head **wbuf = journal->j_wbuf; |
288 | int bufs; | 288 | int bufs; |
289 | int flags; | 289 | int flags; |
290 | int err; | 290 | int err; |
291 | unsigned long blocknr; | 291 | unsigned long blocknr; |
292 | char *tagp = NULL; | 292 | char *tagp = NULL; |
293 | journal_header_t *header; | 293 | journal_header_t *header; |
294 | journal_block_tag_t *tag = NULL; | 294 | journal_block_tag_t *tag = NULL; |
295 | int space_left = 0; | 295 | int space_left = 0; |
296 | int first_tag = 0; | 296 | int first_tag = 0; |
297 | int tag_flag; | 297 | int tag_flag; |
298 | int i; | 298 | int i; |
299 | 299 | ||
300 | /* | 300 | /* |
301 | * First job: lock down the current transaction and wait for | 301 | * First job: lock down the current transaction and wait for |
302 | * all outstanding updates to complete. | 302 | * all outstanding updates to complete. |
303 | */ | 303 | */ |
304 | 304 | ||
305 | #ifdef COMMIT_STATS | 305 | #ifdef COMMIT_STATS |
306 | spin_lock(&journal->j_list_lock); | 306 | spin_lock(&journal->j_list_lock); |
307 | summarise_journal_usage(journal); | 307 | summarise_journal_usage(journal); |
308 | spin_unlock(&journal->j_list_lock); | 308 | spin_unlock(&journal->j_list_lock); |
309 | #endif | 309 | #endif |
310 | 310 | ||
311 | /* Do we need to erase the effects of a prior journal_flush? */ | 311 | /* Do we need to erase the effects of a prior journal_flush? */ |
312 | if (journal->j_flags & JFS_FLUSHED) { | 312 | if (journal->j_flags & JFS_FLUSHED) { |
313 | jbd_debug(3, "super block updated\n"); | 313 | jbd_debug(3, "super block updated\n"); |
314 | journal_update_superblock(journal, 1); | 314 | journal_update_superblock(journal, 1); |
315 | } else { | 315 | } else { |
316 | jbd_debug(3, "superblock not updated\n"); | 316 | jbd_debug(3, "superblock not updated\n"); |
317 | } | 317 | } |
318 | 318 | ||
319 | J_ASSERT(journal->j_running_transaction != NULL); | 319 | J_ASSERT(journal->j_running_transaction != NULL); |
320 | J_ASSERT(journal->j_committing_transaction == NULL); | 320 | J_ASSERT(journal->j_committing_transaction == NULL); |
321 | 321 | ||
322 | commit_transaction = journal->j_running_transaction; | 322 | commit_transaction = journal->j_running_transaction; |
323 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | 323 | J_ASSERT(commit_transaction->t_state == T_RUNNING); |
324 | 324 | ||
325 | jbd_debug(1, "JBD: starting commit of transaction %d\n", | 325 | jbd_debug(1, "JBD: starting commit of transaction %d\n", |
326 | commit_transaction->t_tid); | 326 | commit_transaction->t_tid); |
327 | 327 | ||
328 | spin_lock(&journal->j_state_lock); | 328 | spin_lock(&journal->j_state_lock); |
329 | commit_transaction->t_state = T_LOCKED; | 329 | commit_transaction->t_state = T_LOCKED; |
330 | 330 | ||
331 | spin_lock(&commit_transaction->t_handle_lock); | 331 | spin_lock(&commit_transaction->t_handle_lock); |
332 | while (commit_transaction->t_updates) { | 332 | while (commit_transaction->t_updates) { |
333 | DEFINE_WAIT(wait); | 333 | DEFINE_WAIT(wait); |
334 | 334 | ||
335 | prepare_to_wait(&journal->j_wait_updates, &wait, | 335 | prepare_to_wait(&journal->j_wait_updates, &wait, |
336 | TASK_UNINTERRUPTIBLE); | 336 | TASK_UNINTERRUPTIBLE); |
337 | if (commit_transaction->t_updates) { | 337 | if (commit_transaction->t_updates) { |
338 | spin_unlock(&commit_transaction->t_handle_lock); | 338 | spin_unlock(&commit_transaction->t_handle_lock); |
339 | spin_unlock(&journal->j_state_lock); | 339 | spin_unlock(&journal->j_state_lock); |
340 | schedule(); | 340 | schedule(); |
341 | spin_lock(&journal->j_state_lock); | 341 | spin_lock(&journal->j_state_lock); |
342 | spin_lock(&commit_transaction->t_handle_lock); | 342 | spin_lock(&commit_transaction->t_handle_lock); |
343 | } | 343 | } |
344 | finish_wait(&journal->j_wait_updates, &wait); | 344 | finish_wait(&journal->j_wait_updates, &wait); |
345 | } | 345 | } |
346 | spin_unlock(&commit_transaction->t_handle_lock); | 346 | spin_unlock(&commit_transaction->t_handle_lock); |
347 | 347 | ||
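
The wait loop above parks the committing thread until t_updates reaches zero, re-checking the counter under the handle lock after every wakeup. A portable analogue of that wait uses a condition variable; the following is only a sketch of the shape, not the kernel waitqueue API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t handle_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t updates_done = PTHREAD_COND_INITIALIZER;
static int updates = 3;			/* outstanding handles on the transaction */

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&handle_lock);
	while (updates > 0) {
		updates--;			/* one handle completes */
		pthread_cond_broadcast(&updates_done);
	}
	pthread_mutex_unlock(&handle_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	pthread_mutex_lock(&handle_lock);
	while (updates)				/* re-check after every wakeup */
		pthread_cond_wait(&updates_done, &handle_lock);
	pthread_mutex_unlock(&handle_lock);
	pthread_join(t, NULL);
	printf("all updates drained, commit can proceed\n");
	return 0;
}
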
348 | J_ASSERT (commit_transaction->t_outstanding_credits <= | 348 | J_ASSERT (commit_transaction->t_outstanding_credits <= |
349 | journal->j_max_transaction_buffers); | 349 | journal->j_max_transaction_buffers); |
350 | 350 | ||
351 | /* | 351 | /* |
352 | * First thing we are allowed to do is to discard any remaining | 352 | * First thing we are allowed to do is to discard any remaining |
353 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume | 353 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume |
354 | * that there are no such buffers: if a large filesystem | 354 | * that there are no such buffers: if a large filesystem |
355 | * operation like a truncate needs to split itself over multiple | 355 | * operation like a truncate needs to split itself over multiple |
356 | * transactions, then it may try to do a journal_restart() while | 356 | * transactions, then it may try to do a journal_restart() while |
357 | * there are still BJ_Reserved buffers outstanding. These must | 357 | * there are still BJ_Reserved buffers outstanding. These must |
358 | * be released cleanly from the current transaction. | 358 | * be released cleanly from the current transaction. |
359 | * | 359 | * |
360 | * In this case, the filesystem must still reserve write access | 360 | * In this case, the filesystem must still reserve write access |
361 | * again before modifying the buffer in the new transaction, but | 361 | * again before modifying the buffer in the new transaction, but |
362 | * we do not require it to remember exactly which old buffers it | 362 | * we do not require it to remember exactly which old buffers it |
363 | * has reserved. This is consistent with the existing behaviour | 363 | * has reserved. This is consistent with the existing behaviour |
364 | * that multiple journal_get_write_access() calls to the same | 364 | * that multiple journal_get_write_access() calls to the same |
365 | * buffer are perfectly permissible. | 365 | * buffer are perfectly permissible. |
366 | */ | 366 | */ |
367 | while (commit_transaction->t_reserved_list) { | 367 | while (commit_transaction->t_reserved_list) { |
368 | jh = commit_transaction->t_reserved_list; | 368 | jh = commit_transaction->t_reserved_list; |
369 | JBUFFER_TRACE(jh, "reserved, unused: refile"); | 369 | JBUFFER_TRACE(jh, "reserved, unused: refile"); |
370 | /* | 370 | /* |
371 | * A journal_get_undo_access()+journal_release_buffer() may | 371 | * A journal_get_undo_access()+journal_release_buffer() may |
372 | * leave undo-committed data. | 372 | * leave undo-committed data. |
373 | */ | 373 | */ |
374 | if (jh->b_committed_data) { | 374 | if (jh->b_committed_data) { |
375 | struct buffer_head *bh = jh2bh(jh); | 375 | struct buffer_head *bh = jh2bh(jh); |
376 | 376 | ||
377 | jbd_lock_bh_state(bh); | 377 | jbd_lock_bh_state(bh); |
378 | jbd_free(jh->b_committed_data, bh->b_size); | 378 | jbd_free(jh->b_committed_data, bh->b_size); |
379 | jh->b_committed_data = NULL; | 379 | jh->b_committed_data = NULL; |
380 | jbd_unlock_bh_state(bh); | 380 | jbd_unlock_bh_state(bh); |
381 | } | 381 | } |
382 | journal_refile_buffer(journal, jh); | 382 | journal_refile_buffer(journal, jh); |
383 | } | 383 | } |
384 | 384 | ||
385 | /* | 385 | /* |
386 | * Now try to drop any written-back buffers from the journal's | 386 | * Now try to drop any written-back buffers from the journal's |
387 | * checkpoint lists. We do this *before* commit because it potentially | 387 | * checkpoint lists. We do this *before* commit because it potentially |
388 | * frees some memory | 388 | * frees some memory |
389 | */ | 389 | */ |
390 | spin_lock(&journal->j_list_lock); | 390 | spin_lock(&journal->j_list_lock); |
391 | __journal_clean_checkpoint_list(journal); | 391 | __journal_clean_checkpoint_list(journal); |
392 | spin_unlock(&journal->j_list_lock); | 392 | spin_unlock(&journal->j_list_lock); |
393 | 393 | ||
394 | jbd_debug (3, "JBD: commit phase 1\n"); | 394 | jbd_debug (3, "JBD: commit phase 1\n"); |
395 | 395 | ||
396 | /* | 396 | /* |
397 | * Switch to a new revoke table. | 397 | * Switch to a new revoke table. |
398 | */ | 398 | */ |
399 | journal_switch_revoke_table(journal); | 399 | journal_switch_revoke_table(journal); |
400 | 400 | ||
401 | commit_transaction->t_state = T_FLUSH; | 401 | commit_transaction->t_state = T_FLUSH; |
402 | journal->j_committing_transaction = commit_transaction; | 402 | journal->j_committing_transaction = commit_transaction; |
403 | journal->j_running_transaction = NULL; | 403 | journal->j_running_transaction = NULL; |
404 | commit_transaction->t_log_start = journal->j_head; | 404 | commit_transaction->t_log_start = journal->j_head; |
405 | wake_up(&journal->j_wait_transaction_locked); | 405 | wake_up(&journal->j_wait_transaction_locked); |
406 | spin_unlock(&journal->j_state_lock); | 406 | spin_unlock(&journal->j_state_lock); |
407 | 407 | ||
408 | jbd_debug (3, "JBD: commit phase 2\n"); | 408 | jbd_debug (3, "JBD: commit phase 2\n"); |
409 | 409 | ||
410 | /* | 410 | /* |
411 | * First, drop modified flag: all accesses to the buffers | 411 | * First, drop modified flag: all accesses to the buffers |
412 | * will be tracked for a new transaction only -bzzz | 412 | * will be tracked for a new transaction only -bzzz |
413 | */ | 413 | */ |
414 | spin_lock(&journal->j_list_lock); | 414 | spin_lock(&journal->j_list_lock); |
415 | if (commit_transaction->t_buffers) { | 415 | if (commit_transaction->t_buffers) { |
416 | new_jh = jh = commit_transaction->t_buffers->b_tnext; | 416 | new_jh = jh = commit_transaction->t_buffers->b_tnext; |
417 | do { | 417 | do { |
418 | J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || | 418 | J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || |
419 | new_jh->b_modified == 0); | 419 | new_jh->b_modified == 0); |
420 | new_jh->b_modified = 0; | 420 | new_jh->b_modified = 0; |
421 | new_jh = new_jh->b_tnext; | 421 | new_jh = new_jh->b_tnext; |
422 | } while (new_jh != jh); | 422 | } while (new_jh != jh); |
423 | } | 423 | } |
424 | spin_unlock(&journal->j_list_lock); | 424 | spin_unlock(&journal->j_list_lock); |
425 | 425 | ||
426 | /* | 426 | /* |
427 | * Now start flushing things to disk, in the order they appear | 427 | * Now start flushing things to disk, in the order they appear |
428 | * on the transaction lists. Data blocks go first. | 428 | * on the transaction lists. Data blocks go first. |
429 | */ | 429 | */ |
430 | err = 0; | 430 | err = 0; |
431 | journal_submit_data_buffers(journal, commit_transaction); | 431 | journal_submit_data_buffers(journal, commit_transaction); |
432 | 432 | ||
433 | /* | 433 | /* |
434 | * Wait for all previously submitted IO to complete. | 434 | * Wait for all previously submitted IO to complete. |
435 | */ | 435 | */ |
436 | spin_lock(&journal->j_list_lock); | 436 | spin_lock(&journal->j_list_lock); |
437 | while (commit_transaction->t_locked_list) { | 437 | while (commit_transaction->t_locked_list) { |
438 | struct buffer_head *bh; | 438 | struct buffer_head *bh; |
439 | 439 | ||
440 | jh = commit_transaction->t_locked_list->b_tprev; | 440 | jh = commit_transaction->t_locked_list->b_tprev; |
441 | bh = jh2bh(jh); | 441 | bh = jh2bh(jh); |
442 | get_bh(bh); | 442 | get_bh(bh); |
443 | if (buffer_locked(bh)) { | 443 | if (buffer_locked(bh)) { |
444 | spin_unlock(&journal->j_list_lock); | 444 | spin_unlock(&journal->j_list_lock); |
445 | wait_on_buffer(bh); | 445 | wait_on_buffer(bh); |
446 | if (unlikely(!buffer_uptodate(bh))) | 446 | if (unlikely(!buffer_uptodate(bh))) |
447 | err = -EIO; | 447 | err = -EIO; |
448 | spin_lock(&journal->j_list_lock); | 448 | spin_lock(&journal->j_list_lock); |
449 | } | 449 | } |
450 | if (!inverted_lock(journal, bh)) { | 450 | if (!inverted_lock(journal, bh)) { |
451 | put_bh(bh); | 451 | put_bh(bh); |
452 | spin_lock(&journal->j_list_lock); | 452 | spin_lock(&journal->j_list_lock); |
453 | continue; | 453 | continue; |
454 | } | 454 | } |
455 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | 455 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { |
456 | __journal_unfile_buffer(jh); | 456 | __journal_unfile_buffer(jh); |
457 | jbd_unlock_bh_state(bh); | 457 | jbd_unlock_bh_state(bh); |
458 | journal_remove_journal_head(bh); | 458 | journal_remove_journal_head(bh); |
459 | put_bh(bh); | 459 | put_bh(bh); |
460 | } else { | 460 | } else { |
461 | jbd_unlock_bh_state(bh); | 461 | jbd_unlock_bh_state(bh); |
462 | } | 462 | } |
463 | put_bh(bh); | 463 | put_bh(bh); |
464 | cond_resched_lock(&journal->j_list_lock); | 464 | cond_resched_lock(&journal->j_list_lock); |
465 | } | 465 | } |
466 | spin_unlock(&journal->j_list_lock); | 466 | spin_unlock(&journal->j_list_lock); |
467 | 467 | ||
468 | if (err) | 468 | if (err) |
469 | journal_abort(journal, err); | 469 | journal_abort(journal, err); |
470 | 470 | ||
471 | journal_write_revoke_records(journal, commit_transaction); | 471 | journal_write_revoke_records(journal, commit_transaction); |
472 | 472 | ||
473 | jbd_debug(3, "JBD: commit phase 2\n"); | 473 | jbd_debug(3, "JBD: commit phase 2\n"); |
474 | 474 | ||
475 | /* | 475 | /* |
476 | * If we found any dirty or locked buffers, then we should have | 476 | * If we found any dirty or locked buffers, then we should have |
477 | * looped back up to the write_out_data label. If there weren't | 477 | * looped back up to the write_out_data label. If there weren't |
478 | * any then journal_clean_data_list should have wiped the list | 478 | * any then journal_clean_data_list should have wiped the list |
479 | * clean by now, so check that it is in fact empty. | 479 | * clean by now, so check that it is in fact empty. |
480 | */ | 480 | */ |
481 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); | 481 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); |
482 | 482 | ||
483 | jbd_debug (3, "JBD: commit phase 3\n"); | 483 | jbd_debug (3, "JBD: commit phase 3\n"); |
484 | 484 | ||
485 | /* | 485 | /* |
486 | * Way to go: we have now written out all of the data for a | 486 | * Way to go: we have now written out all of the data for a |
487 | * transaction! Now comes the tricky part: we need to write out | 487 | * transaction! Now comes the tricky part: we need to write out |
488 | * metadata. Loop over the transaction's entire buffer list: | 488 | * metadata. Loop over the transaction's entire buffer list: |
489 | */ | 489 | */ |
490 | commit_transaction->t_state = T_COMMIT; | 490 | commit_transaction->t_state = T_COMMIT; |
491 | 491 | ||
492 | descriptor = NULL; | 492 | descriptor = NULL; |
493 | bufs = 0; | 493 | bufs = 0; |
494 | while (commit_transaction->t_buffers) { | 494 | while (commit_transaction->t_buffers) { |
495 | 495 | ||
496 | /* Find the next buffer to be journaled... */ | 496 | /* Find the next buffer to be journaled... */ |
497 | 497 | ||
498 | jh = commit_transaction->t_buffers; | 498 | jh = commit_transaction->t_buffers; |
499 | 499 | ||
500 | /* If we're in abort mode, we just un-journal the buffer and | 500 | /* If we're in abort mode, we just un-journal the buffer and |
501 | release it for background writing. */ | 501 | release it for background writing. */ |
502 | 502 | ||
503 | if (is_journal_aborted(journal)) { | 503 | if (is_journal_aborted(journal)) { |
504 | JBUFFER_TRACE(jh, "journal is aborting: refile"); | 504 | JBUFFER_TRACE(jh, "journal is aborting: refile"); |
505 | journal_refile_buffer(journal, jh); | 505 | journal_refile_buffer(journal, jh); |
506 | /* If that was the last one, we need to clean up | 506 | /* If that was the last one, we need to clean up |
507 | * any descriptor buffers which may have been | 507 | * any descriptor buffers which may have been |
508 | * already allocated, even if we are now | 508 | * already allocated, even if we are now |
509 | * aborting. */ | 509 | * aborting. */ |
510 | if (!commit_transaction->t_buffers) | 510 | if (!commit_transaction->t_buffers) |
511 | goto start_journal_io; | 511 | goto start_journal_io; |
512 | continue; | 512 | continue; |
513 | } | 513 | } |
514 | 514 | ||
515 | /* Make sure we have a descriptor block in which to | 515 | /* Make sure we have a descriptor block in which to |
516 | record the metadata buffer. */ | 516 | record the metadata buffer. */ |
517 | 517 | ||
518 | if (!descriptor) { | 518 | if (!descriptor) { |
519 | struct buffer_head *bh; | 519 | struct buffer_head *bh; |
520 | 520 | ||
521 | J_ASSERT (bufs == 0); | 521 | J_ASSERT (bufs == 0); |
522 | 522 | ||
523 | jbd_debug(4, "JBD: get descriptor\n"); | 523 | jbd_debug(4, "JBD: get descriptor\n"); |
524 | 524 | ||
525 | descriptor = journal_get_descriptor_buffer(journal); | 525 | descriptor = journal_get_descriptor_buffer(journal); |
526 | if (!descriptor) { | 526 | if (!descriptor) { |
527 | journal_abort(journal, -EIO); | 527 | journal_abort(journal, -EIO); |
528 | continue; | 528 | continue; |
529 | } | 529 | } |
530 | 530 | ||
531 | bh = jh2bh(descriptor); | 531 | bh = jh2bh(descriptor); |
532 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", | 532 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", |
533 | (unsigned long long)bh->b_blocknr, bh->b_data); | 533 | (unsigned long long)bh->b_blocknr, bh->b_data); |
534 | header = (journal_header_t *)&bh->b_data[0]; | 534 | header = (journal_header_t *)&bh->b_data[0]; |
535 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); | 535 | header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); |
536 | header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); | 536 | header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); |
537 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 537 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
538 | 538 | ||
539 | tagp = &bh->b_data[sizeof(journal_header_t)]; | 539 | tagp = &bh->b_data[sizeof(journal_header_t)]; |
540 | space_left = bh->b_size - sizeof(journal_header_t); | 540 | space_left = bh->b_size - sizeof(journal_header_t); |
541 | first_tag = 1; | 541 | first_tag = 1; |
542 | set_buffer_jwrite(bh); | 542 | set_buffer_jwrite(bh); |
543 | set_buffer_dirty(bh); | 543 | set_buffer_dirty(bh); |
544 | wbuf[bufs++] = bh; | 544 | wbuf[bufs++] = bh; |
545 | 545 | ||
546 | /* Record it so that we can wait for IO | 546 | /* Record it so that we can wait for IO |
547 | completion later */ | 547 | completion later */ |
548 | BUFFER_TRACE(bh, "ph3: file as descriptor"); | 548 | BUFFER_TRACE(bh, "ph3: file as descriptor"); |
549 | journal_file_buffer(descriptor, commit_transaction, | 549 | journal_file_buffer(descriptor, commit_transaction, |
550 | BJ_LogCtl); | 550 | BJ_LogCtl); |
551 | } | 551 | } |
552 | 552 | ||
553 | /* Where is the buffer to be written? */ | 553 | /* Where is the buffer to be written? */ |
554 | 554 | ||
555 | err = journal_next_log_block(journal, &blocknr); | 555 | err = journal_next_log_block(journal, &blocknr); |
556 | /* If the block mapping failed, just abandon the buffer | 556 | /* If the block mapping failed, just abandon the buffer |
557 | and repeat this loop: we'll fall into the | 557 | and repeat this loop: we'll fall into the |
558 | refile-on-abort condition above. */ | 558 | refile-on-abort condition above. */ |
559 | if (err) { | 559 | if (err) { |
560 | journal_abort(journal, err); | 560 | journal_abort(journal, err); |
561 | continue; | 561 | continue; |
562 | } | 562 | } |
563 | 563 | ||
564 | /* | 564 | /* |
565 | * start_this_handle() uses t_outstanding_credits to determine | 565 | * start_this_handle() uses t_outstanding_credits to determine |
566 | * the free space in the log, but this counter is changed | 566 | * the free space in the log, but this counter is changed |
567 | * by journal_next_log_block() also. | 567 | * by journal_next_log_block() also. |
568 | */ | 568 | */ |
569 | commit_transaction->t_outstanding_credits--; | 569 | commit_transaction->t_outstanding_credits--; |
570 | 570 | ||
571 | /* Bump b_count to prevent truncate from stumbling over | 571 | /* Bump b_count to prevent truncate from stumbling over |
572 | the shadowed buffer! @@@ This can go if we ever get | 572 | the shadowed buffer! @@@ This can go if we ever get |
573 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ | 573 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ |
574 | atomic_inc(&jh2bh(jh)->b_count); | 574 | atomic_inc(&jh2bh(jh)->b_count); |
575 | 575 | ||
576 | /* Make a temporary IO buffer with which to write it out | 576 | /* Make a temporary IO buffer with which to write it out |
577 | (this will requeue both the metadata buffer and the | 577 | (this will requeue both the metadata buffer and the |
578 | temporary IO buffer). new_bh goes on BJ_IO*/ | 578 | temporary IO buffer). new_bh goes on BJ_IO*/ |
579 | 579 | ||
580 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); | 580 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); |
581 | /* | 581 | /* |
582 | * akpm: journal_write_metadata_buffer() sets | 582 | * akpm: journal_write_metadata_buffer() sets |
583 | * new_bh->b_transaction to commit_transaction. | 583 | * new_bh->b_transaction to commit_transaction. |
584 | * We need to clean this up before we release new_bh | 584 | * We need to clean this up before we release new_bh |
585 | * (which is of type BJ_IO) | 585 | * (which is of type BJ_IO) |
586 | */ | 586 | */ |
587 | JBUFFER_TRACE(jh, "ph3: write metadata"); | 587 | JBUFFER_TRACE(jh, "ph3: write metadata"); |
588 | flags = journal_write_metadata_buffer(commit_transaction, | 588 | flags = journal_write_metadata_buffer(commit_transaction, |
589 | jh, &new_jh, blocknr); | 589 | jh, &new_jh, blocknr); |
590 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); | 590 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); |
591 | wbuf[bufs++] = jh2bh(new_jh); | 591 | wbuf[bufs++] = jh2bh(new_jh); |
592 | 592 | ||
593 | /* Record the new block's tag in the current descriptor | 593 | /* Record the new block's tag in the current descriptor |
594 | buffer */ | 594 | buffer */ |
595 | 595 | ||
596 | tag_flag = 0; | 596 | tag_flag = 0; |
597 | if (flags & 1) | 597 | if (flags & 1) |
598 | tag_flag |= JFS_FLAG_ESCAPE; | 598 | tag_flag |= JFS_FLAG_ESCAPE; |
599 | if (!first_tag) | 599 | if (!first_tag) |
600 | tag_flag |= JFS_FLAG_SAME_UUID; | 600 | tag_flag |= JFS_FLAG_SAME_UUID; |
601 | 601 | ||
602 | tag = (journal_block_tag_t *) tagp; | 602 | tag = (journal_block_tag_t *) tagp; |
603 | tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); | 603 | tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); |
604 | tag->t_flags = cpu_to_be32(tag_flag); | 604 | tag->t_flags = cpu_to_be32(tag_flag); |
605 | tagp += sizeof(journal_block_tag_t); | 605 | tagp += sizeof(journal_block_tag_t); |
606 | space_left -= sizeof(journal_block_tag_t); | 606 | space_left -= sizeof(journal_block_tag_t); |
607 | 607 | ||
608 | if (first_tag) { | 608 | if (first_tag) { |
609 | memcpy (tagp, journal->j_uuid, 16); | 609 | memcpy (tagp, journal->j_uuid, 16); |
610 | tagp += 16; | 610 | tagp += 16; |
611 | space_left -= 16; | 611 | space_left -= 16; |
612 | first_tag = 0; | 612 | first_tag = 0; |
613 | } | 613 | } |
614 | 614 | ||
615 | /* If there's no more to do, or if the descriptor is full, | 615 | /* If there's no more to do, or if the descriptor is full, |
616 | let the IO rip! */ | 616 | let the IO rip! */ |
617 | 617 | ||
618 | if (bufs == journal->j_wbufsize || | 618 | if (bufs == journal->j_wbufsize || |
619 | commit_transaction->t_buffers == NULL || | 619 | commit_transaction->t_buffers == NULL || |
620 | space_left < sizeof(journal_block_tag_t) + 16) { | 620 | space_left < sizeof(journal_block_tag_t) + 16) { |
621 | 621 | ||
622 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); | 622 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); |
623 | 623 | ||
624 | /* Write an end-of-descriptor marker before | 624 | /* Write an end-of-descriptor marker before |
625 | submitting the IOs. "tag" still points to | 625 | submitting the IOs. "tag" still points to |
626 | the last tag we set up. */ | 626 | the last tag we set up. */ |
627 | 627 | ||
628 | tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); | 628 | tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); |
629 | 629 | ||
630 | start_journal_io: | 630 | start_journal_io: |
631 | for (i = 0; i < bufs; i++) { | 631 | for (i = 0; i < bufs; i++) { |
632 | struct buffer_head *bh = wbuf[i]; | 632 | struct buffer_head *bh = wbuf[i]; |
633 | lock_buffer(bh); | 633 | lock_buffer(bh); |
634 | clear_buffer_dirty(bh); | 634 | clear_buffer_dirty(bh); |
635 | set_buffer_uptodate(bh); | 635 | set_buffer_uptodate(bh); |
636 | bh->b_end_io = journal_end_buffer_io_sync; | 636 | bh->b_end_io = journal_end_buffer_io_sync; |
637 | submit_bh(WRITE, bh); | 637 | submit_bh(WRITE, bh); |
638 | } | 638 | } |
639 | cond_resched(); | 639 | cond_resched(); |
640 | 640 | ||
641 | /* Force a new descriptor to be generated next | 641 | /* Force a new descriptor to be generated next |
642 | time round the loop. */ | 642 | time round the loop. */ |
643 | descriptor = NULL; | 643 | descriptor = NULL; |
644 | bufs = 0; | 644 | bufs = 0; |
645 | } | 645 | } |
646 | } | 646 | } |
647 | 647 | ||
648 | /* Lo and behold: we have just managed to send a transaction to | 648 | /* Lo and behold: we have just managed to send a transaction to |
649 | the log. Before we can commit it, wait for the IO so far to | 649 | the log. Before we can commit it, wait for the IO so far to |
650 | complete. Control buffers being written are on the | 650 | complete. Control buffers being written are on the |
651 | transaction's t_log_list queue, and metadata buffers are on | 651 | transaction's t_log_list queue, and metadata buffers are on |
652 | the t_iobuf_list queue. | 652 | the t_iobuf_list queue. |
653 | 653 | ||
654 | Wait for the buffers in reverse order. That way we are | 654 | Wait for the buffers in reverse order. That way we are |
655 | less likely to be woken up until all IOs have completed, and | 655 | less likely to be woken up until all IOs have completed, and |
656 | so we incur less scheduling load. | 656 | so we incur less scheduling load. |
657 | */ | 657 | */ |
658 | 658 | ||
659 | jbd_debug(3, "JBD: commit phase 4\n"); | 659 | jbd_debug(3, "JBD: commit phase 4\n"); |
660 | 660 | ||
661 | /* | 661 | /* |
662 | * akpm: these are BJ_IO, and j_list_lock is not needed. | 662 | * akpm: these are BJ_IO, and j_list_lock is not needed. |
663 | * See __journal_try_to_free_buffer. | 663 | * See __journal_try_to_free_buffer. |
664 | */ | 664 | */ |
665 | wait_for_iobuf: | 665 | wait_for_iobuf: |
666 | while (commit_transaction->t_iobuf_list != NULL) { | 666 | while (commit_transaction->t_iobuf_list != NULL) { |
667 | struct buffer_head *bh; | 667 | struct buffer_head *bh; |
668 | 668 | ||
669 | jh = commit_transaction->t_iobuf_list->b_tprev; | 669 | jh = commit_transaction->t_iobuf_list->b_tprev; |
670 | bh = jh2bh(jh); | 670 | bh = jh2bh(jh); |
671 | if (buffer_locked(bh)) { | 671 | if (buffer_locked(bh)) { |
672 | wait_on_buffer(bh); | 672 | wait_on_buffer(bh); |
673 | goto wait_for_iobuf; | 673 | goto wait_for_iobuf; |
674 | } | 674 | } |
675 | if (cond_resched()) | 675 | if (cond_resched()) |
676 | goto wait_for_iobuf; | 676 | goto wait_for_iobuf; |
677 | 677 | ||
678 | if (unlikely(!buffer_uptodate(bh))) | 678 | if (unlikely(!buffer_uptodate(bh))) |
679 | err = -EIO; | 679 | err = -EIO; |
680 | 680 | ||
681 | clear_buffer_jwrite(bh); | 681 | clear_buffer_jwrite(bh); |
682 | 682 | ||
683 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); | 683 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); |
684 | journal_unfile_buffer(journal, jh); | 684 | journal_unfile_buffer(journal, jh); |
685 | 685 | ||
686 | /* | 686 | /* |
687 | * ->t_iobuf_list should contain only dummy buffer_heads | 687 | * ->t_iobuf_list should contain only dummy buffer_heads |
688 | * which were created by journal_write_metadata_buffer(). | 688 | * which were created by journal_write_metadata_buffer(). |
689 | */ | 689 | */ |
690 | BUFFER_TRACE(bh, "dumping temporary bh"); | 690 | BUFFER_TRACE(bh, "dumping temporary bh"); |
691 | journal_put_journal_head(jh); | 691 | journal_put_journal_head(jh); |
692 | __brelse(bh); | 692 | __brelse(bh); |
693 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); | 693 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); |
694 | free_buffer_head(bh); | 694 | free_buffer_head(bh); |
695 | 695 | ||
696 | /* We also have to unlock and free the corresponding | 696 | /* We also have to unlock and free the corresponding |
697 | shadowed buffer */ | 697 | shadowed buffer */ |
698 | jh = commit_transaction->t_shadow_list->b_tprev; | 698 | jh = commit_transaction->t_shadow_list->b_tprev; |
699 | bh = jh2bh(jh); | 699 | bh = jh2bh(jh); |
700 | clear_bit(BH_JWrite, &bh->b_state); | 700 | clear_bit(BH_JWrite, &bh->b_state); |
701 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | 701 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); |
702 | 702 | ||
703 | /* The metadata is now released for reuse, but we need | 703 | /* The metadata is now released for reuse, but we need |
704 | to remember it against this transaction so that when | 704 | to remember it against this transaction so that when |
705 | we finally commit, we can do any checkpointing | 705 | we finally commit, we can do any checkpointing |
706 | required. */ | 706 | required. */ |
707 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | 707 | JBUFFER_TRACE(jh, "file as BJ_Forget"); |
708 | journal_file_buffer(jh, commit_transaction, BJ_Forget); | 708 | journal_file_buffer(jh, commit_transaction, BJ_Forget); |
709 | /* Wake up any transactions which were waiting for this | 709 | /* Wake up any transactions which were waiting for this |
710 | IO to complete */ | 710 | IO to complete */ |
711 | wake_up_bit(&bh->b_state, BH_Unshadow); | 711 | wake_up_bit(&bh->b_state, BH_Unshadow); |
712 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | 712 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); |
713 | __brelse(bh); | 713 | __brelse(bh); |
714 | } | 714 | } |
715 | 715 | ||
716 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | 716 | J_ASSERT (commit_transaction->t_shadow_list == NULL); |
717 | 717 | ||
718 | jbd_debug(3, "JBD: commit phase 5\n"); | 718 | jbd_debug(3, "JBD: commit phase 5\n"); |
719 | 719 | ||
720 | /* Here we wait for the revoke record and descriptor record buffers */ | 720 | /* Here we wait for the revoke record and descriptor record buffers */ |
721 | wait_for_ctlbuf: | 721 | wait_for_ctlbuf: |
722 | while (commit_transaction->t_log_list != NULL) { | 722 | while (commit_transaction->t_log_list != NULL) { |
723 | struct buffer_head *bh; | 723 | struct buffer_head *bh; |
724 | 724 | ||
725 | jh = commit_transaction->t_log_list->b_tprev; | 725 | jh = commit_transaction->t_log_list->b_tprev; |
726 | bh = jh2bh(jh); | 726 | bh = jh2bh(jh); |
727 | if (buffer_locked(bh)) { | 727 | if (buffer_locked(bh)) { |
728 | wait_on_buffer(bh); | 728 | wait_on_buffer(bh); |
729 | goto wait_for_ctlbuf; | 729 | goto wait_for_ctlbuf; |
730 | } | 730 | } |
731 | if (cond_resched()) | 731 | if (cond_resched()) |
732 | goto wait_for_ctlbuf; | 732 | goto wait_for_ctlbuf; |
733 | 733 | ||
734 | if (unlikely(!buffer_uptodate(bh))) | 734 | if (unlikely(!buffer_uptodate(bh))) |
735 | err = -EIO; | 735 | err = -EIO; |
736 | 736 | ||
737 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); | 737 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); |
738 | clear_buffer_jwrite(bh); | 738 | clear_buffer_jwrite(bh); |
739 | journal_unfile_buffer(journal, jh); | 739 | journal_unfile_buffer(journal, jh); |
740 | journal_put_journal_head(jh); | 740 | journal_put_journal_head(jh); |
741 | __brelse(bh); /* One for getblk */ | 741 | __brelse(bh); /* One for getblk */ |
742 | /* AKPM: bforget here */ | 742 | /* AKPM: bforget here */ |
743 | } | 743 | } |
744 | 744 | ||
745 | jbd_debug(3, "JBD: commit phase 6\n"); | 745 | jbd_debug(3, "JBD: commit phase 6\n"); |
746 | 746 | ||
747 | if (journal_write_commit_record(journal, commit_transaction)) | 747 | if (journal_write_commit_record(journal, commit_transaction)) |
748 | err = -EIO; | 748 | err = -EIO; |
749 | 749 | ||
750 | if (err) | 750 | if (err) |
751 | journal_abort(journal, err); | 751 | journal_abort(journal, err); |
752 | 752 | ||
753 | /* End of a transaction! Finally, we can do checkpoint | 753 | /* End of a transaction! Finally, we can do checkpoint |
754 | processing: any buffers committed as a result of this | 754 | processing: any buffers committed as a result of this |
755 | transaction can be removed from any checkpoint list it was on | 755 | transaction can be removed from any checkpoint list it was on |
756 | before. */ | 756 | before. */ |
757 | 757 | ||
758 | jbd_debug(3, "JBD: commit phase 7\n"); | 758 | jbd_debug(3, "JBD: commit phase 7\n"); |
759 | 759 | ||
760 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); | 760 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); |
761 | J_ASSERT(commit_transaction->t_buffers == NULL); | 761 | J_ASSERT(commit_transaction->t_buffers == NULL); |
762 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | 762 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); |
763 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | 763 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); |
764 | J_ASSERT(commit_transaction->t_shadow_list == NULL); | 764 | J_ASSERT(commit_transaction->t_shadow_list == NULL); |
765 | J_ASSERT(commit_transaction->t_log_list == NULL); | 765 | J_ASSERT(commit_transaction->t_log_list == NULL); |
766 | 766 | ||
767 | restart_loop: | 767 | restart_loop: |
768 | /* | 768 | /* |
769 | * As there are other places (journal_unmap_buffer()) adding buffers | 769 | * As there are other places (journal_unmap_buffer()) adding buffers |
770 | * to this list we have to be careful and hold the j_list_lock. | 770 | * to this list we have to be careful and hold the j_list_lock. |
771 | */ | 771 | */ |
772 | spin_lock(&journal->j_list_lock); | 772 | spin_lock(&journal->j_list_lock); |
773 | while (commit_transaction->t_forget) { | 773 | while (commit_transaction->t_forget) { |
774 | transaction_t *cp_transaction; | 774 | transaction_t *cp_transaction; |
775 | struct buffer_head *bh; | 775 | struct buffer_head *bh; |
776 | 776 | ||
777 | jh = commit_transaction->t_forget; | 777 | jh = commit_transaction->t_forget; |
778 | spin_unlock(&journal->j_list_lock); | 778 | spin_unlock(&journal->j_list_lock); |
779 | bh = jh2bh(jh); | 779 | bh = jh2bh(jh); |
780 | jbd_lock_bh_state(bh); | 780 | jbd_lock_bh_state(bh); |
781 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || | 781 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || |
782 | jh->b_transaction == journal->j_running_transaction); | 782 | jh->b_transaction == journal->j_running_transaction); |
783 | 783 | ||
784 | /* | 784 | /* |
785 | * If there is undo-protected committed data against | 785 | * If there is undo-protected committed data against |
786 | * this buffer, then we can remove it now. If it is a | 786 | * this buffer, then we can remove it now. If it is a |
787 | * buffer needing such protection, the old frozen_data | 787 | * buffer needing such protection, the old frozen_data |
788 | * field now points to a committed version of the | 788 | * field now points to a committed version of the |
789 | * buffer, so rotate that field to the new committed | 789 | * buffer, so rotate that field to the new committed |
790 | * data. | 790 | * data. |
791 | * | 791 | * |
792 | * Otherwise, we can just throw away the frozen data now. | 792 | * Otherwise, we can just throw away the frozen data now. |
793 | */ | 793 | */ |
794 | if (jh->b_committed_data) { | 794 | if (jh->b_committed_data) { |
795 | jbd_free(jh->b_committed_data, bh->b_size); | 795 | jbd_free(jh->b_committed_data, bh->b_size); |
796 | jh->b_committed_data = NULL; | 796 | jh->b_committed_data = NULL; |
797 | if (jh->b_frozen_data) { | 797 | if (jh->b_frozen_data) { |
798 | jh->b_committed_data = jh->b_frozen_data; | 798 | jh->b_committed_data = jh->b_frozen_data; |
799 | jh->b_frozen_data = NULL; | 799 | jh->b_frozen_data = NULL; |
800 | } | 800 | } |
801 | } else if (jh->b_frozen_data) { | 801 | } else if (jh->b_frozen_data) { |
802 | jbd_free(jh->b_frozen_data, bh->b_size); | 802 | jbd_free(jh->b_frozen_data, bh->b_size); |
803 | jh->b_frozen_data = NULL; | 803 | jh->b_frozen_data = NULL; |
804 | } | 804 | } |
805 | 805 | ||
806 | spin_lock(&journal->j_list_lock); | 806 | spin_lock(&journal->j_list_lock); |
807 | cp_transaction = jh->b_cp_transaction; | 807 | cp_transaction = jh->b_cp_transaction; |
808 | if (cp_transaction) { | 808 | if (cp_transaction) { |
809 | JBUFFER_TRACE(jh, "remove from old cp transaction"); | 809 | JBUFFER_TRACE(jh, "remove from old cp transaction"); |
810 | __journal_remove_checkpoint(jh); | 810 | __journal_remove_checkpoint(jh); |
811 | } | 811 | } |
812 | 812 | ||
813 | /* Only re-checkpoint the buffer_head if it is marked | 813 | /* Only re-checkpoint the buffer_head if it is marked |
814 | * dirty. If the buffer was added to the BJ_Forget list | 814 | * dirty. If the buffer was added to the BJ_Forget list |
815 | * by journal_forget, it may no longer be dirty and | 815 | * by journal_forget, it may no longer be dirty and |
816 | * there's no point in keeping a checkpoint record for | 816 | * there's no point in keeping a checkpoint record for |
817 | * it. */ | 817 | * it. */ |
818 | 818 | ||
819 | /* A buffer which has been freed while still being | 819 | /* A buffer which has been freed while still being |
820 | * journaled by a previous transaction may end up still | 820 | * journaled by a previous transaction may end up still |
821 | * being dirty here, but we want to avoid writing back | 821 | * being dirty here, but we want to avoid writing back |
822 | * that buffer in the future now that the last use has | 822 | * that buffer in the future now that the last use has |
823 | * been committed. That's not only a performance gain, | 823 | * been committed. That's not only a performance gain, |
824 | * it also stops aliasing problems if the buffer is left | 824 | * it also stops aliasing problems if the buffer is left |
825 | * behind for writeback and gets reallocated for another | 825 | * behind for writeback and gets reallocated for another |
826 | * use in a different page. */ | 826 | * use in a different page. */ |
827 | if (buffer_freed(bh)) { | 827 | if (buffer_freed(bh)) { |
828 | clear_buffer_freed(bh); | 828 | clear_buffer_freed(bh); |
829 | clear_buffer_jbddirty(bh); | 829 | clear_buffer_jbddirty(bh); |
830 | } | 830 | } |
831 | 831 | ||
832 | if (buffer_jbddirty(bh)) { | 832 | if (buffer_jbddirty(bh)) { |
833 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); | 833 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); |
834 | __journal_insert_checkpoint(jh, commit_transaction); | 834 | __journal_insert_checkpoint(jh, commit_transaction); |
835 | JBUFFER_TRACE(jh, "refile for checkpoint writeback"); | 835 | JBUFFER_TRACE(jh, "refile for checkpoint writeback"); |
836 | __journal_refile_buffer(jh); | 836 | __journal_refile_buffer(jh); |
837 | jbd_unlock_bh_state(bh); | 837 | jbd_unlock_bh_state(bh); |
838 | } else { | 838 | } else { |
839 | J_ASSERT_BH(bh, !buffer_dirty(bh)); | 839 | J_ASSERT_BH(bh, !buffer_dirty(bh)); |
840 | /* The buffer on BJ_Forget list and not jbddirty means | 840 | /* The buffer on BJ_Forget list and not jbddirty means |
841 | * it has been freed by this transaction and hence it | 841 | * it has been freed by this transaction and hence it |
842 | * could not have been reallocated until this | 842 | * could not have been reallocated until this |
843 | * transaction has committed. *BUT* it could be | 843 | * transaction has committed. *BUT* it could be |
844 | * reallocated once we have written all the data to | 844 | * reallocated once we have written all the data to |
845 | * disk and before we process the buffer on BJ_Forget | 845 | * disk and before we process the buffer on BJ_Forget |
846 | * list. */ | 846 | * list. */ |
847 | JBUFFER_TRACE(jh, "refile or unfile freed buffer"); | 847 | JBUFFER_TRACE(jh, "refile or unfile freed buffer"); |
848 | __journal_refile_buffer(jh); | 848 | __journal_refile_buffer(jh); |
849 | if (!jh->b_transaction) { | 849 | if (!jh->b_transaction) { |
850 | jbd_unlock_bh_state(bh); | 850 | jbd_unlock_bh_state(bh); |
851 | /* needs a brelse */ | 851 | /* needs a brelse */ |
852 | journal_remove_journal_head(bh); | 852 | journal_remove_journal_head(bh); |
853 | release_buffer_page(bh); | 853 | release_buffer_page(bh); |
854 | } else | 854 | } else |
855 | jbd_unlock_bh_state(bh); | 855 | jbd_unlock_bh_state(bh); |
856 | } | 856 | } |
857 | cond_resched_lock(&journal->j_list_lock); | 857 | cond_resched_lock(&journal->j_list_lock); |
858 | } | 858 | } |
859 | spin_unlock(&journal->j_list_lock); | 859 | spin_unlock(&journal->j_list_lock); |
860 | /* | 860 | /* |
861 | * This is a bit sleazy. We use j_list_lock to protect transition | 861 | * This is a bit sleazy. We use j_list_lock to protect transition |
862 | * of a transaction into T_FINISHED state and calling | 862 | * of a transaction into T_FINISHED state and calling |
863 | * __journal_drop_transaction(). Otherwise we could race with | 863 | * __journal_drop_transaction(). Otherwise we could race with |
864 | * other checkpointing code processing the transaction... | 864 | * other checkpointing code processing the transaction... |
865 | */ | 865 | */ |
866 | spin_lock(&journal->j_state_lock); | 866 | spin_lock(&journal->j_state_lock); |
867 | spin_lock(&journal->j_list_lock); | 867 | spin_lock(&journal->j_list_lock); |
868 | /* | 868 | /* |
869 | * Now recheck if some buffers did not get attached to the transaction | 869 | * Now recheck if some buffers did not get attached to the transaction |
870 | * while the lock was dropped... | 870 | * while the lock was dropped... |
871 | */ | 871 | */ |
872 | if (commit_transaction->t_forget) { | 872 | if (commit_transaction->t_forget) { |
873 | spin_unlock(&journal->j_list_lock); | 873 | spin_unlock(&journal->j_list_lock); |
874 | spin_unlock(&journal->j_state_lock); | 874 | spin_unlock(&journal->j_state_lock); |
875 | goto restart_loop; | 875 | goto restart_loop; |
876 | } | 876 | } |
877 | 877 | ||
878 | /* Done with this transaction! */ | 878 | /* Done with this transaction! */ |
879 | 879 | ||
880 | jbd_debug(3, "JBD: commit phase 8\n"); | 880 | jbd_debug(3, "JBD: commit phase 8\n"); |
881 | 881 | ||
882 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | 882 | J_ASSERT(commit_transaction->t_state == T_COMMIT); |
883 | 883 | ||
884 | commit_transaction->t_state = T_FINISHED; | 884 | commit_transaction->t_state = T_FINISHED; |
885 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 885 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
886 | journal->j_commit_sequence = commit_transaction->t_tid; | 886 | journal->j_commit_sequence = commit_transaction->t_tid; |
887 | journal->j_committing_transaction = NULL; | 887 | journal->j_committing_transaction = NULL; |
888 | spin_unlock(&journal->j_state_lock); | 888 | spin_unlock(&journal->j_state_lock); |
889 | 889 | ||
890 | if (commit_transaction->t_checkpoint_list == NULL && | 890 | if (commit_transaction->t_checkpoint_list == NULL && |
891 | commit_transaction->t_checkpoint_io_list == NULL) { | 891 | commit_transaction->t_checkpoint_io_list == NULL) { |
892 | __journal_drop_transaction(journal, commit_transaction); | 892 | __journal_drop_transaction(journal, commit_transaction); |
893 | } else { | 893 | } else { |
894 | if (journal->j_checkpoint_transactions == NULL) { | 894 | if (journal->j_checkpoint_transactions == NULL) { |
895 | journal->j_checkpoint_transactions = commit_transaction; | 895 | journal->j_checkpoint_transactions = commit_transaction; |
896 | commit_transaction->t_cpnext = commit_transaction; | 896 | commit_transaction->t_cpnext = commit_transaction; |
897 | commit_transaction->t_cpprev = commit_transaction; | 897 | commit_transaction->t_cpprev = commit_transaction; |
898 | } else { | 898 | } else { |
899 | commit_transaction->t_cpnext = | 899 | commit_transaction->t_cpnext = |
900 | journal->j_checkpoint_transactions; | 900 | journal->j_checkpoint_transactions; |
901 | commit_transaction->t_cpprev = | 901 | commit_transaction->t_cpprev = |
902 | commit_transaction->t_cpnext->t_cpprev; | 902 | commit_transaction->t_cpnext->t_cpprev; |
903 | commit_transaction->t_cpnext->t_cpprev = | 903 | commit_transaction->t_cpnext->t_cpprev = |
904 | commit_transaction; | 904 | commit_transaction; |
905 | commit_transaction->t_cpprev->t_cpnext = | 905 | commit_transaction->t_cpprev->t_cpnext = |
906 | commit_transaction; | 906 | commit_transaction; |
907 | } | 907 | } |
908 | } | 908 | } |
909 | spin_unlock(&journal->j_list_lock); | 909 | spin_unlock(&journal->j_list_lock); |
910 | 910 | ||
911 | jbd_debug(1, "JBD: commit %d complete, head %d\n", | 911 | jbd_debug(1, "JBD: commit %d complete, head %d\n", |
912 | journal->j_commit_sequence, journal->j_tail_sequence); | 912 | journal->j_commit_sequence, journal->j_tail_sequence); |
913 | 913 | ||
914 | wake_up(&journal->j_wait_done_commit); | 914 | wake_up(&journal->j_wait_done_commit); |
915 | } | 915 | } |
916 | 916 |
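The descriptor handling in commit phase 3 above packs one journal_block_tag_t per metadata buffer into each descriptor block, appends the 16-byte journal UUID after the first tag, and submits the batch once space_left falls below sizeof(journal_block_tag_t) + 16 (or wbuf fills up, or the transaction runs out of buffers). A rough sketch of that space accounting follows; max_tags_per_descriptor() is a hypothetical helper, not a JBD function, and the j_wbufsize limit is ignored:

	/*
	 * Sketch only: how many tags fit behind one descriptor block,
	 * mirroring the space_left bookkeeping in the loop above.
	 * journal_header_t and journal_block_tag_t are the on-disk
	 * structures from include/linux/jbd.h.
	 */
	static int max_tags_per_descriptor(size_t blocksize)
	{
		/* space after the descriptor block's journal_header_t */
		size_t space = blocksize - sizeof(journal_header_t);
		int tags = 0;

		/* the first tag is followed by the 16-byte journal UUID */
		if (space < sizeof(journal_block_tag_t) + 16)
			return 0;
		space -= sizeof(journal_block_tag_t) + 16;
		tags++;

		/* further tags are added while at least tag + UUID bytes
		 * remain, matching the "space_left < sizeof(...) + 16"
		 * cutoff that triggers start_journal_io above */
		while (space >= sizeof(journal_block_tag_t) + 16) {
			space -= sizeof(journal_block_tag_t);
			tags++;
		}
		return tags;
	}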
fs/jbd2/checkpoint.c
1 | /* | 1 | /* |
2 | * linux/fs/jbd2/checkpoint.c | 2 | * linux/fs/jbd2/checkpoint.c |
3 | * | 3 | * |
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 | 4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 |
5 | * | 5 | * |
6 | * Copyright 1999 Red Hat Software --- All Rights Reserved | 6 | * Copyright 1999 Red Hat Software --- All Rights Reserved |
7 | * | 7 | * |
8 | * This file is part of the Linux kernel and is made available under | 8 | * This file is part of the Linux kernel and is made available under |
9 | * the terms of the GNU General Public License, version 2, or at your | 9 | * the terms of the GNU General Public License, version 2, or at your |
10 | * option, any later version, incorporated herein by reference. | 10 | * option, any later version, incorporated herein by reference. |
11 | * | 11 | * |
12 | * Checkpoint routines for the generic filesystem journaling code. | 12 | * Checkpoint routines for the generic filesystem journaling code. |
13 | * Part of the ext2fs journaling system. | 13 | * Part of the ext2fs journaling system. |
14 | * | 14 | * |
15 | * Checkpointing is the process of ensuring that a section of the log is | 15 | * Checkpointing is the process of ensuring that a section of the log is |
16 | * committed fully to disk, so that that portion of the log can be | 16 | * committed fully to disk, so that that portion of the log can be |
17 | * reused. | 17 | * reused. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include <linux/time.h> | 20 | #include <linux/time.h> |
21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
22 | #include <linux/jbd2.h> | 22 | #include <linux/jbd2.h> |
23 | #include <linux/errno.h> | 23 | #include <linux/errno.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | 25 | ||
26 | /* | 26 | /* |
27 | * Unlink a buffer from a transaction checkpoint list. | 27 | * Unlink a buffer from a transaction checkpoint list. |
28 | * | 28 | * |
29 | * Called with j_list_lock held. | 29 | * Called with j_list_lock held. |
30 | */ | 30 | */ |
31 | static inline void __buffer_unlink_first(struct journal_head *jh) | 31 | static inline void __buffer_unlink_first(struct journal_head *jh) |
32 | { | 32 | { |
33 | transaction_t *transaction = jh->b_cp_transaction; | 33 | transaction_t *transaction = jh->b_cp_transaction; |
34 | 34 | ||
35 | jh->b_cpnext->b_cpprev = jh->b_cpprev; | 35 | jh->b_cpnext->b_cpprev = jh->b_cpprev; |
36 | jh->b_cpprev->b_cpnext = jh->b_cpnext; | 36 | jh->b_cpprev->b_cpnext = jh->b_cpnext; |
37 | if (transaction->t_checkpoint_list == jh) { | 37 | if (transaction->t_checkpoint_list == jh) { |
38 | transaction->t_checkpoint_list = jh->b_cpnext; | 38 | transaction->t_checkpoint_list = jh->b_cpnext; |
39 | if (transaction->t_checkpoint_list == jh) | 39 | if (transaction->t_checkpoint_list == jh) |
40 | transaction->t_checkpoint_list = NULL; | 40 | transaction->t_checkpoint_list = NULL; |
41 | } | 41 | } |
42 | } | 42 | } |
43 | 43 | ||
44 | /* | 44 | /* |
45 | * Unlink a buffer from a transaction checkpoint(io) list. | 45 | * Unlink a buffer from a transaction checkpoint(io) list. |
46 | * | 46 | * |
47 | * Called with j_list_lock held. | 47 | * Called with j_list_lock held. |
48 | */ | 48 | */ |
49 | static inline void __buffer_unlink(struct journal_head *jh) | 49 | static inline void __buffer_unlink(struct journal_head *jh) |
50 | { | 50 | { |
51 | transaction_t *transaction = jh->b_cp_transaction; | 51 | transaction_t *transaction = jh->b_cp_transaction; |
52 | 52 | ||
53 | __buffer_unlink_first(jh); | 53 | __buffer_unlink_first(jh); |
54 | if (transaction->t_checkpoint_io_list == jh) { | 54 | if (transaction->t_checkpoint_io_list == jh) { |
55 | transaction->t_checkpoint_io_list = jh->b_cpnext; | 55 | transaction->t_checkpoint_io_list = jh->b_cpnext; |
56 | if (transaction->t_checkpoint_io_list == jh) | 56 | if (transaction->t_checkpoint_io_list == jh) |
57 | transaction->t_checkpoint_io_list = NULL; | 57 | transaction->t_checkpoint_io_list = NULL; |
58 | } | 58 | } |
59 | } | 59 | } |
60 | 60 | ||
61 | /* | 61 | /* |
62 | * Move a buffer from the checkpoint list to the checkpoint io list | 62 | * Move a buffer from the checkpoint list to the checkpoint io list |
63 | * | 63 | * |
64 | * Called with j_list_lock held | 64 | * Called with j_list_lock held |
65 | */ | 65 | */ |
66 | static inline void __buffer_relink_io(struct journal_head *jh) | 66 | static inline void __buffer_relink_io(struct journal_head *jh) |
67 | { | 67 | { |
68 | transaction_t *transaction = jh->b_cp_transaction; | 68 | transaction_t *transaction = jh->b_cp_transaction; |
69 | 69 | ||
70 | __buffer_unlink_first(jh); | 70 | __buffer_unlink_first(jh); |
71 | 71 | ||
72 | if (!transaction->t_checkpoint_io_list) { | 72 | if (!transaction->t_checkpoint_io_list) { |
73 | jh->b_cpnext = jh->b_cpprev = jh; | 73 | jh->b_cpnext = jh->b_cpprev = jh; |
74 | } else { | 74 | } else { |
75 | jh->b_cpnext = transaction->t_checkpoint_io_list; | 75 | jh->b_cpnext = transaction->t_checkpoint_io_list; |
76 | jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; | 76 | jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; |
77 | jh->b_cpprev->b_cpnext = jh; | 77 | jh->b_cpprev->b_cpnext = jh; |
78 | jh->b_cpnext->b_cpprev = jh; | 78 | jh->b_cpnext->b_cpprev = jh; |
79 | } | 79 | } |
80 | transaction->t_checkpoint_io_list = jh; | 80 | transaction->t_checkpoint_io_list = jh; |
81 | } | 81 | } |
82 | 82 | ||
83 | /* | 83 | /* |
84 | * Try to release a checkpointed buffer from its transaction. | 84 | * Try to release a checkpointed buffer from its transaction. |
85 | * Returns 1 if we released it and 2 if we also released the | 85 | * Returns 1 if we released it and 2 if we also released the |
86 | * whole transaction. | 86 | * whole transaction. |
87 | * | 87 | * |
88 | * Requires j_list_lock | 88 | * Requires j_list_lock |
89 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | 89 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it |
90 | */ | 90 | */ |
91 | static int __try_to_free_cp_buf(struct journal_head *jh) | 91 | static int __try_to_free_cp_buf(struct journal_head *jh) |
92 | { | 92 | { |
93 | int ret = 0; | 93 | int ret = 0; |
94 | struct buffer_head *bh = jh2bh(jh); | 94 | struct buffer_head *bh = jh2bh(jh); |
95 | 95 | ||
96 | if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { | 96 | if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { |
97 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | 97 | JBUFFER_TRACE(jh, "remove from checkpoint list"); |
98 | ret = __jbd2_journal_remove_checkpoint(jh) + 1; | 98 | ret = __jbd2_journal_remove_checkpoint(jh) + 1; |
99 | jbd_unlock_bh_state(bh); | 99 | jbd_unlock_bh_state(bh); |
100 | jbd2_journal_remove_journal_head(bh); | 100 | jbd2_journal_remove_journal_head(bh); |
101 | BUFFER_TRACE(bh, "release"); | 101 | BUFFER_TRACE(bh, "release"); |
102 | __brelse(bh); | 102 | __brelse(bh); |
103 | } else { | 103 | } else { |
104 | jbd_unlock_bh_state(bh); | 104 | jbd_unlock_bh_state(bh); |
105 | } | 105 | } |
106 | return ret; | 106 | return ret; |
107 | } | 107 | } |
108 | 108 | ||
109 | /* | 109 | /* |
110 | * __jbd2_log_wait_for_space: wait until there is space in the journal. | 110 | * __jbd2_log_wait_for_space: wait until there is space in the journal. |
111 | * | 111 | * |
112 | * Called under j_state_lock *only*. It will be unlocked if we have to wait | 112 | * Called under j_state_lock *only*. It will be unlocked if we have to wait |
113 | * for a checkpoint to free up some space in the log. | 113 | * for a checkpoint to free up some space in the log. |
114 | */ | 114 | */ |
115 | void __jbd2_log_wait_for_space(journal_t *journal) | 115 | void __jbd2_log_wait_for_space(journal_t *journal) |
116 | { | 116 | { |
117 | int nblocks; | 117 | int nblocks; |
118 | assert_spin_locked(&journal->j_state_lock); | 118 | assert_spin_locked(&journal->j_state_lock); |
119 | 119 | ||
120 | nblocks = jbd_space_needed(journal); | 120 | nblocks = jbd_space_needed(journal); |
121 | while (__jbd2_log_space_left(journal) < nblocks) { | 121 | while (__jbd2_log_space_left(journal) < nblocks) { |
122 | if (journal->j_flags & JBD2_ABORT) | 122 | if (journal->j_flags & JBD2_ABORT) |
123 | return; | 123 | return; |
124 | spin_unlock(&journal->j_state_lock); | 124 | spin_unlock(&journal->j_state_lock); |
125 | mutex_lock(&journal->j_checkpoint_mutex); | 125 | mutex_lock(&journal->j_checkpoint_mutex); |
126 | 126 | ||
127 | /* | 127 | /* |
128 | * Test again, another process may have checkpointed while we | 128 | * Test again, another process may have checkpointed while we |
129 | * were waiting for the checkpoint lock | 129 | * were waiting for the checkpoint lock |
130 | */ | 130 | */ |
131 | spin_lock(&journal->j_state_lock); | 131 | spin_lock(&journal->j_state_lock); |
132 | nblocks = jbd_space_needed(journal); | 132 | nblocks = jbd_space_needed(journal); |
133 | if (__jbd2_log_space_left(journal) < nblocks) { | 133 | if (__jbd2_log_space_left(journal) < nblocks) { |
134 | spin_unlock(&journal->j_state_lock); | 134 | spin_unlock(&journal->j_state_lock); |
135 | jbd2_log_do_checkpoint(journal); | 135 | jbd2_log_do_checkpoint(journal); |
136 | spin_lock(&journal->j_state_lock); | 136 | spin_lock(&journal->j_state_lock); |
137 | } | 137 | } |
138 | mutex_unlock(&journal->j_checkpoint_mutex); | 138 | mutex_unlock(&journal->j_checkpoint_mutex); |
139 | } | 139 | } |
140 | } | 140 | } |
141 | 141 | ||
142 | /* | 142 | /* |
143 | * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. | 143 | * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. |
144 | * The caller must restart a list walk. Wait for someone else to run | 144 | * The caller must restart a list walk. Wait for someone else to run |
145 | * jbd_unlock_bh_state(). | 145 | * jbd_unlock_bh_state(). |
146 | */ | 146 | */ |
147 | static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) | 147 | static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) |
148 | __releases(journal->j_list_lock) | 148 | __releases(journal->j_list_lock) |
149 | { | 149 | { |
150 | get_bh(bh); | 150 | get_bh(bh); |
151 | spin_unlock(&journal->j_list_lock); | 151 | spin_unlock(&journal->j_list_lock); |
152 | jbd_lock_bh_state(bh); | 152 | jbd_lock_bh_state(bh); |
153 | jbd_unlock_bh_state(bh); | 153 | jbd_unlock_bh_state(bh); |
154 | put_bh(bh); | 154 | put_bh(bh); |
155 | } | 155 | } |
156 | 156 | ||
157 | /* | 157 | /* |
158 | * Clean up transaction's list of buffers submitted for io. | 158 | * Clean up transaction's list of buffers submitted for io. |
159 | * We wait for any pending IO to complete and remove any clean | 159 | * We wait for any pending IO to complete and remove any clean |
160 | * buffers. Note that we take the buffers in the opposite ordering | 160 | * buffers. Note that we take the buffers in the opposite ordering |
161 | * from the one in which they were submitted for IO. | 161 | * from the one in which they were submitted for IO. |
162 | * | 162 | * |
163 | * Called with j_list_lock held. | 163 | * Called with j_list_lock held. |
164 | */ | 164 | */ |
165 | static void __wait_cp_io(journal_t *journal, transaction_t *transaction) | 165 | static void __wait_cp_io(journal_t *journal, transaction_t *transaction) |
166 | { | 166 | { |
167 | struct journal_head *jh; | 167 | struct journal_head *jh; |
168 | struct buffer_head *bh; | 168 | struct buffer_head *bh; |
169 | tid_t this_tid; | 169 | tid_t this_tid; |
170 | int released = 0; | 170 | int released = 0; |
171 | 171 | ||
172 | this_tid = transaction->t_tid; | 172 | this_tid = transaction->t_tid; |
173 | restart: | 173 | restart: |
174 | /* Did somebody clean up the transaction in the meanwhile? */ | 174 | /* Did somebody clean up the transaction in the meanwhile? */ |
175 | if (journal->j_checkpoint_transactions != transaction || | 175 | if (journal->j_checkpoint_transactions != transaction || |
176 | transaction->t_tid != this_tid) | 176 | transaction->t_tid != this_tid) |
177 | return; | 177 | return; |
178 | while (!released && transaction->t_checkpoint_io_list) { | 178 | while (!released && transaction->t_checkpoint_io_list) { |
179 | jh = transaction->t_checkpoint_io_list; | 179 | jh = transaction->t_checkpoint_io_list; |
180 | bh = jh2bh(jh); | 180 | bh = jh2bh(jh); |
181 | if (!jbd_trylock_bh_state(bh)) { | 181 | if (!jbd_trylock_bh_state(bh)) { |
182 | jbd_sync_bh(journal, bh); | 182 | jbd_sync_bh(journal, bh); |
183 | spin_lock(&journal->j_list_lock); | 183 | spin_lock(&journal->j_list_lock); |
184 | goto restart; | 184 | goto restart; |
185 | } | 185 | } |
186 | if (buffer_locked(bh)) { | 186 | if (buffer_locked(bh)) { |
187 | atomic_inc(&bh->b_count); | 187 | atomic_inc(&bh->b_count); |
188 | spin_unlock(&journal->j_list_lock); | 188 | spin_unlock(&journal->j_list_lock); |
189 | jbd_unlock_bh_state(bh); | 189 | jbd_unlock_bh_state(bh); |
190 | wait_on_buffer(bh); | 190 | wait_on_buffer(bh); |
191 | /* the journal_head may have gone by now */ | 191 | /* the journal_head may have gone by now */ |
192 | BUFFER_TRACE(bh, "brelse"); | 192 | BUFFER_TRACE(bh, "brelse"); |
193 | __brelse(bh); | 193 | __brelse(bh); |
194 | spin_lock(&journal->j_list_lock); | 194 | spin_lock(&journal->j_list_lock); |
195 | goto restart; | 195 | goto restart; |
196 | } | 196 | } |
197 | /* | 197 | /* |
198 | * Now in whatever state the buffer currently is, we know that | 198 | * Now in whatever state the buffer currently is, we know that |
199 | * it has been written out and so we can drop it from the list | 199 | * it has been written out and so we can drop it from the list |
200 | */ | 200 | */ |
201 | released = __jbd2_journal_remove_checkpoint(jh); | 201 | released = __jbd2_journal_remove_checkpoint(jh); |
202 | jbd_unlock_bh_state(bh); | 202 | jbd_unlock_bh_state(bh); |
203 | jbd2_journal_remove_journal_head(bh); | 203 | jbd2_journal_remove_journal_head(bh); |
204 | __brelse(bh); | 204 | __brelse(bh); |
205 | } | 205 | } |
206 | } | 206 | } |
207 | 207 | ||
208 | #define NR_BATCH 64 | 208 | #define NR_BATCH 64 |
209 | 209 | ||
210 | static void | 210 | static void |
211 | __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) | 211 | __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) |
212 | { | 212 | { |
213 | int i; | 213 | int i; |
214 | 214 | ||
215 | ll_rw_block(SWRITE, *batch_count, bhs); | 215 | ll_rw_block(SWRITE, *batch_count, bhs); |
216 | for (i = 0; i < *batch_count; i++) { | 216 | for (i = 0; i < *batch_count; i++) { |
217 | struct buffer_head *bh = bhs[i]; | 217 | struct buffer_head *bh = bhs[i]; |
218 | clear_buffer_jwrite(bh); | 218 | clear_buffer_jwrite(bh); |
219 | BUFFER_TRACE(bh, "brelse"); | 219 | BUFFER_TRACE(bh, "brelse"); |
220 | __brelse(bh); | 220 | __brelse(bh); |
221 | } | 221 | } |
222 | *batch_count = 0; | 222 | *batch_count = 0; |
223 | } | 223 | } |
224 | 224 | ||
225 | /* | 225 | /* |
226 | * Try to flush one buffer from the checkpoint list to disk. | 226 | * Try to flush one buffer from the checkpoint list to disk. |
227 | * | 227 | * |
228 | * Return 1 if something happened which requires us to abort the current | 228 | * Return 1 if something happened which requires us to abort the current |
229 | * scan of the checkpoint list. | 229 | * scan of the checkpoint list. |
230 | * | 230 | * |
231 | * Called with j_list_lock held and drops it if 1 is returned | 231 | * Called with j_list_lock held and drops it if 1 is returned |
232 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it | 232 | * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it |
233 | */ | 233 | */ |
234 | static int __process_buffer(journal_t *journal, struct journal_head *jh, | 234 | static int __process_buffer(journal_t *journal, struct journal_head *jh, |
235 | struct buffer_head **bhs, int *batch_count, | 235 | struct buffer_head **bhs, int *batch_count, |
236 | transaction_t *transaction) | 236 | transaction_t *transaction) |
237 | { | 237 | { |
238 | struct buffer_head *bh = jh2bh(jh); | 238 | struct buffer_head *bh = jh2bh(jh); |
239 | int ret = 0; | 239 | int ret = 0; |
240 | 240 | ||
241 | if (buffer_locked(bh)) { | 241 | if (buffer_locked(bh)) { |
242 | atomic_inc(&bh->b_count); | 242 | atomic_inc(&bh->b_count); |
243 | spin_unlock(&journal->j_list_lock); | 243 | spin_unlock(&journal->j_list_lock); |
244 | jbd_unlock_bh_state(bh); | 244 | jbd_unlock_bh_state(bh); |
245 | wait_on_buffer(bh); | 245 | wait_on_buffer(bh); |
246 | /* the journal_head may have gone by now */ | 246 | /* the journal_head may have gone by now */ |
247 | BUFFER_TRACE(bh, "brelse"); | 247 | BUFFER_TRACE(bh, "brelse"); |
248 | __brelse(bh); | 248 | __brelse(bh); |
249 | ret = 1; | 249 | ret = 1; |
250 | } else if (jh->b_transaction != NULL) { | 250 | } else if (jh->b_transaction != NULL) { |
251 | transaction_t *t = jh->b_transaction; | 251 | transaction_t *t = jh->b_transaction; |
252 | tid_t tid = t->t_tid; | 252 | tid_t tid = t->t_tid; |
253 | 253 | ||
254 | transaction->t_chp_stats.cs_forced_to_close++; | 254 | transaction->t_chp_stats.cs_forced_to_close++; |
255 | spin_unlock(&journal->j_list_lock); | 255 | spin_unlock(&journal->j_list_lock); |
256 | jbd_unlock_bh_state(bh); | 256 | jbd_unlock_bh_state(bh); |
257 | jbd2_log_start_commit(journal, tid); | 257 | jbd2_log_start_commit(journal, tid); |
258 | jbd2_log_wait_commit(journal, tid); | 258 | jbd2_log_wait_commit(journal, tid); |
259 | ret = 1; | 259 | ret = 1; |
260 | } else if (!buffer_dirty(bh)) { | 260 | } else if (!buffer_dirty(bh)) { |
261 | J_ASSERT_JH(jh, !buffer_jbddirty(bh)); | 261 | J_ASSERT_JH(jh, !buffer_jbddirty(bh)); |
262 | BUFFER_TRACE(bh, "remove from checkpoint"); | 262 | BUFFER_TRACE(bh, "remove from checkpoint"); |
263 | __jbd2_journal_remove_checkpoint(jh); | 263 | __jbd2_journal_remove_checkpoint(jh); |
264 | spin_unlock(&journal->j_list_lock); | 264 | spin_unlock(&journal->j_list_lock); |
265 | jbd_unlock_bh_state(bh); | 265 | jbd_unlock_bh_state(bh); |
266 | jbd2_journal_remove_journal_head(bh); | 266 | jbd2_journal_remove_journal_head(bh); |
267 | __brelse(bh); | 267 | __brelse(bh); |
268 | ret = 1; | 268 | ret = 1; |
269 | } else { | 269 | } else { |
270 | /* | 270 | /* |
271 | * Important: we are about to write the buffer, and | 271 | * Important: we are about to write the buffer, and |
272 | * possibly block, while still holding the journal lock. | 272 | * possibly block, while still holding the journal lock. |
273 | * We cannot afford to let the transaction logic start | 273 | * We cannot afford to let the transaction logic start |
274 | * messing around with this buffer before we write it to | 274 | * messing around with this buffer before we write it to |
275 | * disk, as that would break recoverability. | 275 | * disk, as that would break recoverability. |
276 | */ | 276 | */ |
277 | BUFFER_TRACE(bh, "queue"); | 277 | BUFFER_TRACE(bh, "queue"); |
278 | get_bh(bh); | 278 | get_bh(bh); |
279 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); | 279 | J_ASSERT_BH(bh, !buffer_jwrite(bh)); |
280 | set_buffer_jwrite(bh); | 280 | set_buffer_jwrite(bh); |
281 | bhs[*batch_count] = bh; | 281 | bhs[*batch_count] = bh; |
282 | __buffer_relink_io(jh); | 282 | __buffer_relink_io(jh); |
283 | jbd_unlock_bh_state(bh); | 283 | jbd_unlock_bh_state(bh); |
284 | transaction->t_chp_stats.cs_written++; | 284 | transaction->t_chp_stats.cs_written++; |
285 | (*batch_count)++; | 285 | (*batch_count)++; |
286 | if (*batch_count == NR_BATCH) { | 286 | if (*batch_count == NR_BATCH) { |
287 | spin_unlock(&journal->j_list_lock); | 287 | spin_unlock(&journal->j_list_lock); |
288 | __flush_batch(journal, bhs, batch_count); | 288 | __flush_batch(journal, bhs, batch_count); |
289 | ret = 1; | 289 | ret = 1; |
290 | } | 290 | } |
291 | } | 291 | } |
292 | return ret; | 292 | return ret; |
293 | } | 293 | } |
294 | 294 | ||
295 | /* | 295 | /* |
296 | * Perform an actual checkpoint. We take the first transaction on the | 296 | * Perform an actual checkpoint. We take the first transaction on the |
297 | * list of transactions to be checkpointed and send all its buffers | 297 | * list of transactions to be checkpointed and send all its buffers |
298 | * to disk. We submit larger chunks of data at once. | 298 | * to disk. We submit larger chunks of data at once. |
299 | * | 299 | * |
300 | * The journal should be locked before calling this function. | 300 | * The journal should be locked before calling this function. |
301 | */ | 301 | */ |
302 | int jbd2_log_do_checkpoint(journal_t *journal) | 302 | int jbd2_log_do_checkpoint(journal_t *journal) |
303 | { | 303 | { |
304 | transaction_t *transaction; | 304 | transaction_t *transaction; |
305 | tid_t this_tid; | 305 | tid_t this_tid; |
306 | int result; | 306 | int result; |
307 | 307 | ||
308 | jbd_debug(1, "Start checkpoint\n"); | 308 | jbd_debug(1, "Start checkpoint\n"); |
309 | 309 | ||
310 | /* | 310 | /* |
311 | * First thing: if there are any transactions in the log which | 311 | * First thing: if there are any transactions in the log which |
312 | * don't need checkpointing, just eliminate them from the | 312 | * don't need checkpointing, just eliminate them from the |
313 | * journal straight away. | 313 | * journal straight away. |
314 | */ | 314 | */ |
315 | result = jbd2_cleanup_journal_tail(journal); | 315 | result = jbd2_cleanup_journal_tail(journal); |
316 | jbd_debug(1, "cleanup_journal_tail returned %d\n", result); | 316 | jbd_debug(1, "cleanup_journal_tail returned %d\n", result); |
317 | if (result <= 0) | 317 | if (result <= 0) |
318 | return result; | 318 | return result; |
319 | 319 | ||
320 | /* | 320 | /* |
321 | * OK, we need to start writing disk blocks. Take one transaction | 321 | * OK, we need to start writing disk blocks. Take one transaction |
322 | * and write it. | 322 | * and write it. |
323 | */ | 323 | */ |
324 | spin_lock(&journal->j_list_lock); | 324 | spin_lock(&journal->j_list_lock); |
325 | if (!journal->j_checkpoint_transactions) | 325 | if (!journal->j_checkpoint_transactions) |
326 | goto out; | 326 | goto out; |
327 | transaction = journal->j_checkpoint_transactions; | 327 | transaction = journal->j_checkpoint_transactions; |
328 | if (transaction->t_chp_stats.cs_chp_time == 0) | 328 | if (transaction->t_chp_stats.cs_chp_time == 0) |
329 | transaction->t_chp_stats.cs_chp_time = jiffies; | 329 | transaction->t_chp_stats.cs_chp_time = jiffies; |
330 | this_tid = transaction->t_tid; | 330 | this_tid = transaction->t_tid; |
331 | restart: | 331 | restart: |
332 | /* | 332 | /* |
333 | * If someone cleaned up this transaction while we slept, we're | 333 | * If someone cleaned up this transaction while we slept, we're |
334 | * done (maybe it's a new transaction, but it fell at the same | 334 | * done (maybe it's a new transaction, but it fell at the same |
335 | * address). | 335 | * address). |
336 | */ | 336 | */ |
337 | if (journal->j_checkpoint_transactions == transaction && | 337 | if (journal->j_checkpoint_transactions == transaction && |
338 | transaction->t_tid == this_tid) { | 338 | transaction->t_tid == this_tid) { |
339 | int batch_count = 0; | 339 | int batch_count = 0; |
340 | struct buffer_head *bhs[NR_BATCH]; | 340 | struct buffer_head *bhs[NR_BATCH]; |
341 | struct journal_head *jh; | 341 | struct journal_head *jh; |
342 | int retry = 0; | 342 | int retry = 0; |
343 | 343 | ||
344 | while (!retry && transaction->t_checkpoint_list) { | 344 | while (!retry && transaction->t_checkpoint_list) { |
345 | struct buffer_head *bh; | 345 | struct buffer_head *bh; |
346 | 346 | ||
347 | jh = transaction->t_checkpoint_list; | 347 | jh = transaction->t_checkpoint_list; |
348 | bh = jh2bh(jh); | 348 | bh = jh2bh(jh); |
349 | if (!jbd_trylock_bh_state(bh)) { | 349 | if (!jbd_trylock_bh_state(bh)) { |
350 | jbd_sync_bh(journal, bh); | 350 | jbd_sync_bh(journal, bh); |
351 | retry = 1; | 351 | retry = 1; |
352 | break; | 352 | break; |
353 | } | 353 | } |
354 | retry = __process_buffer(journal, jh, bhs, &batch_count, | 354 | retry = __process_buffer(journal, jh, bhs, &batch_count, |
355 | transaction); | 355 | transaction); |
356 | if (!retry && lock_need_resched(&journal->j_list_lock)){ | 356 | if (!retry && (need_resched() || |
357 | spin_needbreak(&journal->j_list_lock))) { | ||
357 | spin_unlock(&journal->j_list_lock); | 358 | spin_unlock(&journal->j_list_lock); |
358 | retry = 1; | 359 | retry = 1; |
359 | break; | 360 | break; |
360 | } | 361 | } |
361 | } | 362 | } |
362 | 363 | ||
363 | if (batch_count) { | 364 | if (batch_count) { |
364 | if (!retry) { | 365 | if (!retry) { |
365 | spin_unlock(&journal->j_list_lock); | 366 | spin_unlock(&journal->j_list_lock); |
366 | retry = 1; | 367 | retry = 1; |
367 | } | 368 | } |
368 | __flush_batch(journal, bhs, &batch_count); | 369 | __flush_batch(journal, bhs, &batch_count); |
369 | } | 370 | } |
370 | 371 | ||
371 | if (retry) { | 372 | if (retry) { |
372 | spin_lock(&journal->j_list_lock); | 373 | spin_lock(&journal->j_list_lock); |
373 | goto restart; | 374 | goto restart; |
374 | } | 375 | } |
375 | /* | 376 | /* |
376 | * Now we have cleaned up the first transaction's checkpoint | 377 | * Now we have cleaned up the first transaction's checkpoint |
377 | * list. Let's clean up the second one | 378 | * list. Let's clean up the second one |
378 | */ | 379 | */ |
379 | __wait_cp_io(journal, transaction); | 380 | __wait_cp_io(journal, transaction); |
380 | } | 381 | } |
381 | out: | 382 | out: |
382 | spin_unlock(&journal->j_list_lock); | 383 | spin_unlock(&journal->j_list_lock); |
383 | result = jbd2_cleanup_journal_tail(journal); | 384 | result = jbd2_cleanup_journal_tail(journal); |
384 | if (result < 0) | 385 | if (result < 0) |
385 | return result; | 386 | return result; |
386 | return 0; | 387 | return 0; |
387 | } | 388 | } |
388 | 389 | ||
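The two-line hunk above is the jbd2 side of this patch: the old lock_need_resched(&journal->j_list_lock) test becomes need_resched() || spin_needbreak(&journal->j_list_lock), so "is anybody waiting on this lock?" is answered by the spinlock implementation itself instead of the generic break_lock field. A minimal sketch of the resulting drop-and-retry pattern, assuming only the APIs visible in this patch (spin_needbreak(), need_resched(), cond_resched()); more_buffers() and walk_one_buffer() are placeholder names, not JBD2 functions:

	/*
	 * Sketch only, not the real checkpoint walker: more_buffers() and
	 * walk_one_buffer() stand in for the list traversal done above.
	 */
	static void checkpoint_walk_sketch(journal_t *journal,
					   transaction_t *transaction)
	{
		spin_lock(&journal->j_list_lock);
	restart:
		while (more_buffers(transaction)) {
			walk_one_buffer(journal, transaction);

			/*
			 * Lockbreak: drop j_list_lock if we are due for a
			 * reschedule or another CPU is spinning on the lock,
			 * then start the walk over.
			 */
			if (need_resched() ||
			    spin_needbreak(&journal->j_list_lock)) {
				spin_unlock(&journal->j_list_lock);
				cond_resched();
				spin_lock(&journal->j_list_lock);
				goto restart;
			}
		}
		spin_unlock(&journal->j_list_lock);
	}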
389 | /* | 390 | /* |
390 | * Check the list of checkpoint transactions for the journal to see if | 391 | * Check the list of checkpoint transactions for the journal to see if |
391 | * we have already got rid of any since the last update of the log tail | 392 | * we have already got rid of any since the last update of the log tail |
392 | * in the journal superblock. If so, we can instantly roll the | 393 | * in the journal superblock. If so, we can instantly roll the |
393 | * superblock forward to remove those transactions from the log. | 394 | * superblock forward to remove those transactions from the log. |
394 | * | 395 | * |
395 | * Return <0 on error, 0 on success, 1 if there was nothing to clean up. | 396 | * Return <0 on error, 0 on success, 1 if there was nothing to clean up. |
396 | * | 397 | * |
397 | * Called with the journal lock held. | 398 | * Called with the journal lock held. |
398 | * | 399 | * |
399 | * This is the only part of the journaling code which really needs to be | 400 | * This is the only part of the journaling code which really needs to be |
400 | * aware of transaction aborts. Checkpointing involves writing to the | 401 | * aware of transaction aborts. Checkpointing involves writing to the |
401 | * main filesystem area rather than to the journal, so it can proceed | 402 | * main filesystem area rather than to the journal, so it can proceed |
402 | * even in abort state, but we must not update the journal superblock if | 403 | * even in abort state, but we must not update the journal superblock if |
403 | * we have an abort error outstanding. | 404 | * we have an abort error outstanding. |
404 | */ | 405 | */ |
405 | 406 | ||
406 | int jbd2_cleanup_journal_tail(journal_t *journal) | 407 | int jbd2_cleanup_journal_tail(journal_t *journal) |
407 | { | 408 | { |
408 | transaction_t * transaction; | 409 | transaction_t * transaction; |
409 | tid_t first_tid; | 410 | tid_t first_tid; |
410 | unsigned long blocknr, freed; | 411 | unsigned long blocknr, freed; |
411 | 412 | ||
412 | /* OK, work out the oldest transaction remaining in the log, and | 413 | /* OK, work out the oldest transaction remaining in the log, and |
413 | * the log block it starts at. | 414 | * the log block it starts at. |
414 | * | 415 | * |
415 | * If the log is now empty, we need to work out which is the | 416 | * If the log is now empty, we need to work out which is the |
416 | * next transaction ID we will write, and where it will | 417 | * next transaction ID we will write, and where it will |
417 | * start. */ | 418 | * start. */ |
418 | 419 | ||
419 | spin_lock(&journal->j_state_lock); | 420 | spin_lock(&journal->j_state_lock); |
420 | spin_lock(&journal->j_list_lock); | 421 | spin_lock(&journal->j_list_lock); |
421 | transaction = journal->j_checkpoint_transactions; | 422 | transaction = journal->j_checkpoint_transactions; |
422 | if (transaction) { | 423 | if (transaction) { |
423 | first_tid = transaction->t_tid; | 424 | first_tid = transaction->t_tid; |
424 | blocknr = transaction->t_log_start; | 425 | blocknr = transaction->t_log_start; |
425 | } else if ((transaction = journal->j_committing_transaction) != NULL) { | 426 | } else if ((transaction = journal->j_committing_transaction) != NULL) { |
426 | first_tid = transaction->t_tid; | 427 | first_tid = transaction->t_tid; |
427 | blocknr = transaction->t_log_start; | 428 | blocknr = transaction->t_log_start; |
428 | } else if ((transaction = journal->j_running_transaction) != NULL) { | 429 | } else if ((transaction = journal->j_running_transaction) != NULL) { |
429 | first_tid = transaction->t_tid; | 430 | first_tid = transaction->t_tid; |
430 | blocknr = journal->j_head; | 431 | blocknr = journal->j_head; |
431 | } else { | 432 | } else { |
432 | first_tid = journal->j_transaction_sequence; | 433 | first_tid = journal->j_transaction_sequence; |
433 | blocknr = journal->j_head; | 434 | blocknr = journal->j_head; |
434 | } | 435 | } |
435 | spin_unlock(&journal->j_list_lock); | 436 | spin_unlock(&journal->j_list_lock); |
436 | J_ASSERT(blocknr != 0); | 437 | J_ASSERT(blocknr != 0); |
437 | 438 | ||
438 | /* If the oldest pinned transaction is at the tail of the log | 439 | /* If the oldest pinned transaction is at the tail of the log |
439 | already then there's not much we can do right now. */ | 440 | already then there's not much we can do right now. */ |
440 | if (journal->j_tail_sequence == first_tid) { | 441 | if (journal->j_tail_sequence == first_tid) { |
441 | spin_unlock(&journal->j_state_lock); | 442 | spin_unlock(&journal->j_state_lock); |
442 | return 1; | 443 | return 1; |
443 | } | 444 | } |
444 | 445 | ||
445 | /* OK, update the superblock to recover the freed space. | 446 | /* OK, update the superblock to recover the freed space. |
446 | * Physical blocks come first: have we wrapped beyond the end of | 447 | * Physical blocks come first: have we wrapped beyond the end of |
447 | * the log? */ | 448 | * the log? */ |
448 | freed = blocknr - journal->j_tail; | 449 | freed = blocknr - journal->j_tail; |
449 | if (blocknr < journal->j_tail) | 450 | if (blocknr < journal->j_tail) |
450 | freed = freed + journal->j_last - journal->j_first; | 451 | freed = freed + journal->j_last - journal->j_first; |
451 | 452 | ||
452 | jbd_debug(1, | 453 | jbd_debug(1, |
453 | "Cleaning journal tail from %d to %d (offset %lu), " | 454 | "Cleaning journal tail from %d to %d (offset %lu), " |
454 | "freeing %lu\n", | 455 | "freeing %lu\n", |
455 | journal->j_tail_sequence, first_tid, blocknr, freed); | 456 | journal->j_tail_sequence, first_tid, blocknr, freed); |
456 | 457 | ||
457 | journal->j_free += freed; | 458 | journal->j_free += freed; |
458 | journal->j_tail_sequence = first_tid; | 459 | journal->j_tail_sequence = first_tid; |
459 | journal->j_tail = blocknr; | 460 | journal->j_tail = blocknr; |
460 | spin_unlock(&journal->j_state_lock); | 461 | spin_unlock(&journal->j_state_lock); |
461 | if (!(journal->j_flags & JBD2_ABORT)) | 462 | if (!(journal->j_flags & JBD2_ABORT)) |
462 | jbd2_journal_update_superblock(journal, 1); | 463 | jbd2_journal_update_superblock(journal, 1); |
463 | return 0; | 464 | return 0; |
464 | } | 465 | } |
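The freed computation in jbd2_cleanup_journal_tail() has to cope with the log being circular: if the new tail block is numerically below the old one, the distance wraps through j_last back to j_first. A minimal userspace sketch of just that arithmetic; the parameter names mirror the journal fields, but blocks_freed() itself is made up.

#include <assert.h>
#include <stdio.h>

/* Toy model of the arithmetic above: the journal occupies blocks
 * [first, last); "tail" is the current oldest in-use block and "blocknr"
 * is where the oldest remaining transaction now starts. */
static unsigned long blocks_freed(unsigned long first, unsigned long last,
                                  unsigned long tail, unsigned long blocknr)
{
        unsigned long freed = blocknr - tail;

        if (blocknr < tail)            /* the new tail wrapped past "last" */
                freed = freed + last - first;
        return freed;
}

int main(void)
{
        assert(blocks_freed(10, 1000, 100, 150) == 50);   /* no wrap       */
        assert(blocks_freed(10, 1000, 990, 20) == 20);    /* wrapped round */
        printf("wrap case frees %lu blocks\n", blocks_freed(10, 1000, 990, 20));
        return 0;
}

Because everything is unsigned, the initial blocknr - tail may wrap; adding last - first in the wrap case brings the count back to the number of blocks actually released.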
465 | 466 | ||
466 | 467 | ||
467 | /* Checkpoint list management */ | 468 | /* Checkpoint list management */ |
468 | 469 | ||
469 | /* | 470 | /* |
470 | * journal_clean_one_cp_list | 471 | * journal_clean_one_cp_list |
471 | * | 472 | * |
472 | * Find all the written-back checkpoint buffers in the given list and release them. | 473 | * Find all the written-back checkpoint buffers in the given list and release them. |
473 | * | 474 | * |
474 | * Called with the journal locked. | 475 | * Called with the journal locked. |
475 | * Called with j_list_lock held. | 476 | * Called with j_list_lock held. |
476 | * Returns number of buffers reaped (for debug) | 477 | * Returns number of buffers reaped (for debug) |
477 | */ | 478 | */ |
478 | 479 | ||
479 | static int journal_clean_one_cp_list(struct journal_head *jh, int *released) | 480 | static int journal_clean_one_cp_list(struct journal_head *jh, int *released) |
480 | { | 481 | { |
481 | struct journal_head *last_jh; | 482 | struct journal_head *last_jh; |
482 | struct journal_head *next_jh = jh; | 483 | struct journal_head *next_jh = jh; |
483 | int ret, freed = 0; | 484 | int ret, freed = 0; |
484 | 485 | ||
485 | *released = 0; | 486 | *released = 0; |
486 | if (!jh) | 487 | if (!jh) |
487 | return 0; | 488 | return 0; |
488 | 489 | ||
489 | last_jh = jh->b_cpprev; | 490 | last_jh = jh->b_cpprev; |
490 | do { | 491 | do { |
491 | jh = next_jh; | 492 | jh = next_jh; |
492 | next_jh = jh->b_cpnext; | 493 | next_jh = jh->b_cpnext; |
493 | /* Use trylock because of the ranking */ | 494 | /* Use trylock because of the ranking */ |
494 | if (jbd_trylock_bh_state(jh2bh(jh))) { | 495 | if (jbd_trylock_bh_state(jh2bh(jh))) { |
495 | ret = __try_to_free_cp_buf(jh); | 496 | ret = __try_to_free_cp_buf(jh); |
496 | if (ret) { | 497 | if (ret) { |
497 | freed++; | 498 | freed++; |
498 | if (ret == 2) { | 499 | if (ret == 2) { |
499 | *released = 1; | 500 | *released = 1; |
500 | return freed; | 501 | return freed; |
501 | } | 502 | } |
502 | } | 503 | } |
503 | } | 504 | } |
504 | /* | 505 | /* |
505 | * This function only frees up some memory | 506 | * This function only frees up some memory |
506 | * if possible so we don't have an obligation | 507 | * if possible so we don't have an obligation |
507 | * to finish processing. Bail out if preemption | 508 | * to finish processing. Bail out if preemption |
508 | * requested: | 509 | * requested: |
509 | */ | 510 | */ |
510 | if (need_resched()) | 511 | if (need_resched()) |
511 | return freed; | 512 | return freed; |
512 | } while (jh != last_jh); | 513 | } while (jh != last_jh); |
513 | 514 | ||
514 | return freed; | 515 | return freed; |
515 | } | 516 | } |
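journal_clean_one_cp_list() walks the circular b_cpnext/b_cpprev ring by remembering the predecessor of the starting element and fetching each successor before the current entry can go away. A small userspace model of that traversal pattern; struct node and the ids are illustrative, not the journal_head layout.

#include <stdio.h>

struct node {
        struct node *next, *prev;
        int id;
};

static void visit_all(struct node *head)
{
        struct node *last = head->prev;        /* final entry to process */
        struct node *next = head;              /* where the walk starts  */
        struct node *node;

        do {
                node = next;
                next = node->next;             /* grab the successor first,
                                                * so "node" could be unlinked */
                printf("visiting %d\n", node->id);
        } while (node != last);
}

int main(void)
{
        struct node n[3];
        int i;

        for (i = 0; i < 3; i++) {              /* build a 3-entry ring */
                n[i].id = i;
                n[i].next = &n[(i + 1) % 3];
                n[i].prev = &n[(i + 2) % 3];
        }
        visit_all(&n[0]);                      /* prints 0, 1, 2 */
        return 0;
}

Grabbing next up front is what lets the real code release or unlink the current entry without losing its place in the ring.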
516 | 517 | ||
517 | /* | 518 | /* |
518 | * journal_clean_checkpoint_list | 519 | * journal_clean_checkpoint_list |
519 | * | 520 | * |
520 | * Find all the written-back checkpoint buffers in the journal and release them. | 521 | * Find all the written-back checkpoint buffers in the journal and release them. |
521 | * | 522 | * |
522 | * Called with the journal locked. | 523 | * Called with the journal locked. |
523 | * Called with j_list_lock held. | 524 | * Called with j_list_lock held. |
524 | * Returns number of buffers reaped (for debug) | 525 | * Returns number of buffers reaped (for debug) |
525 | */ | 526 | */ |
526 | 527 | ||
527 | int __jbd2_journal_clean_checkpoint_list(journal_t *journal) | 528 | int __jbd2_journal_clean_checkpoint_list(journal_t *journal) |
528 | { | 529 | { |
529 | transaction_t *transaction, *last_transaction, *next_transaction; | 530 | transaction_t *transaction, *last_transaction, *next_transaction; |
530 | int ret = 0; | 531 | int ret = 0; |
531 | int released; | 532 | int released; |
532 | 533 | ||
533 | transaction = journal->j_checkpoint_transactions; | 534 | transaction = journal->j_checkpoint_transactions; |
534 | if (!transaction) | 535 | if (!transaction) |
535 | goto out; | 536 | goto out; |
536 | 537 | ||
537 | last_transaction = transaction->t_cpprev; | 538 | last_transaction = transaction->t_cpprev; |
538 | next_transaction = transaction; | 539 | next_transaction = transaction; |
539 | do { | 540 | do { |
540 | transaction = next_transaction; | 541 | transaction = next_transaction; |
541 | next_transaction = transaction->t_cpnext; | 542 | next_transaction = transaction->t_cpnext; |
542 | ret += journal_clean_one_cp_list(transaction-> | 543 | ret += journal_clean_one_cp_list(transaction-> |
543 | t_checkpoint_list, &released); | 544 | t_checkpoint_list, &released); |
544 | /* | 545 | /* |
545 | * This function only frees up some memory if possible so we | 546 | * This function only frees up some memory if possible so we |
546 | * don't have an obligation to finish processing. Bail out if | 547 | * don't have an obligation to finish processing. Bail out if |
547 | * preemption requested: | 548 | * preemption requested: |
548 | */ | 549 | */ |
549 | if (need_resched()) | 550 | if (need_resched()) |
550 | goto out; | 551 | goto out; |
551 | if (released) | 552 | if (released) |
552 | continue; | 553 | continue; |
553 | /* | 554 | /* |
554 | * It is essential that we are as careful as in the case of | 555 | * It is essential that we are as careful as in the case of |
555 | * t_checkpoint_list with removing the buffer from the list as | 556 | * t_checkpoint_list with removing the buffer from the list as |
556 | * we can possibly see not yet submitted buffers on io_list | 557 | * we can possibly see not yet submitted buffers on io_list |
557 | */ | 558 | */ |
558 | ret += journal_clean_one_cp_list(transaction-> | 559 | ret += journal_clean_one_cp_list(transaction-> |
559 | t_checkpoint_io_list, &released); | 560 | t_checkpoint_io_list, &released); |
560 | if (need_resched()) | 561 | if (need_resched()) |
561 | goto out; | 562 | goto out; |
562 | } while (transaction != last_transaction); | 563 | } while (transaction != last_transaction); |
563 | out: | 564 | out: |
564 | return ret; | 565 | return ret; |
565 | } | 566 | } |
566 | 567 | ||
567 | /* | 568 | /* |
568 | * journal_remove_checkpoint: called after a buffer has been committed | 569 | * journal_remove_checkpoint: called after a buffer has been committed |
569 | * to disk (either by being write-back flushed to disk, or being | 570 | * to disk (either by being write-back flushed to disk, or being |
570 | * committed to the log). | 571 | * committed to the log). |
571 | * | 572 | * |
572 | * We cannot safely clean a transaction out of the log until all of the | 573 | * We cannot safely clean a transaction out of the log until all of the |
573 | * buffer updates committed in that transaction have safely been stored | 574 | * buffer updates committed in that transaction have safely been stored |
574 | * elsewhere on disk. To achieve this, all of the buffers in a | 575 | * elsewhere on disk. To achieve this, all of the buffers in a |
575 | * transaction need to be maintained on the transaction's checkpoint | 576 | * transaction need to be maintained on the transaction's checkpoint |
576 | * lists until they have been rewritten, at which point this function is | 577 | * lists until they have been rewritten, at which point this function is |
577 | * called to remove the buffer from the existing transaction's | 578 | * called to remove the buffer from the existing transaction's |
578 | * checkpoint lists. | 579 | * checkpoint lists. |
579 | * | 580 | * |
580 | * The function returns 1 if it frees the transaction, 0 otherwise. | 581 | * The function returns 1 if it frees the transaction, 0 otherwise. |
581 | * | 582 | * |
582 | * This function is called with the journal locked. | 583 | * This function is called with the journal locked. |
583 | * This function is called with j_list_lock held. | 584 | * This function is called with j_list_lock held. |
584 | * This function is called with jbd_lock_bh_state(jh2bh(jh)) | 585 | * This function is called with jbd_lock_bh_state(jh2bh(jh)) |
585 | */ | 586 | */ |
586 | 587 | ||
587 | int __jbd2_journal_remove_checkpoint(struct journal_head *jh) | 588 | int __jbd2_journal_remove_checkpoint(struct journal_head *jh) |
588 | { | 589 | { |
589 | transaction_t *transaction; | 590 | transaction_t *transaction; |
590 | journal_t *journal; | 591 | journal_t *journal; |
591 | int ret = 0; | 592 | int ret = 0; |
592 | 593 | ||
593 | JBUFFER_TRACE(jh, "entry"); | 594 | JBUFFER_TRACE(jh, "entry"); |
594 | 595 | ||
595 | if ((transaction = jh->b_cp_transaction) == NULL) { | 596 | if ((transaction = jh->b_cp_transaction) == NULL) { |
596 | JBUFFER_TRACE(jh, "not on transaction"); | 597 | JBUFFER_TRACE(jh, "not on transaction"); |
597 | goto out; | 598 | goto out; |
598 | } | 599 | } |
599 | journal = transaction->t_journal; | 600 | journal = transaction->t_journal; |
600 | 601 | ||
601 | __buffer_unlink(jh); | 602 | __buffer_unlink(jh); |
602 | jh->b_cp_transaction = NULL; | 603 | jh->b_cp_transaction = NULL; |
603 | 604 | ||
604 | if (transaction->t_checkpoint_list != NULL || | 605 | if (transaction->t_checkpoint_list != NULL || |
605 | transaction->t_checkpoint_io_list != NULL) | 606 | transaction->t_checkpoint_io_list != NULL) |
606 | goto out; | 607 | goto out; |
607 | JBUFFER_TRACE(jh, "transaction has no more buffers"); | 608 | JBUFFER_TRACE(jh, "transaction has no more buffers"); |
608 | 609 | ||
609 | /* | 610 | /* |
610 | * There is one special case to worry about: if we have just pulled the | 611 | * There is one special case to worry about: if we have just pulled the |
611 | * buffer off a running or committing transaction's checkpoint list, | 612 | * buffer off a running or committing transaction's checkpoint list, |
612 | * then even if the checkpoint list is empty, the transaction obviously | 613 | * then even if the checkpoint list is empty, the transaction obviously |
613 | * cannot be dropped! | 614 | * cannot be dropped! |
614 | * | 615 | * |
615 | * The locking here around t_state is a bit sleazy. | 616 | * The locking here around t_state is a bit sleazy. |
616 | * See the comment at the end of jbd2_journal_commit_transaction(). | 617 | * See the comment at the end of jbd2_journal_commit_transaction(). |
617 | */ | 618 | */ |
618 | if (transaction->t_state != T_FINISHED) { | 619 | if (transaction->t_state != T_FINISHED) { |
619 | JBUFFER_TRACE(jh, "belongs to running/committing transaction"); | 620 | JBUFFER_TRACE(jh, "belongs to running/committing transaction"); |
620 | goto out; | 621 | goto out; |
621 | } | 622 | } |
622 | 623 | ||
623 | /* OK, that was the last buffer for the transaction: we can now | 624 | /* OK, that was the last buffer for the transaction: we can now |
624 | safely remove this transaction from the log */ | 625 | safely remove this transaction from the log */ |
625 | 626 | ||
626 | __jbd2_journal_drop_transaction(journal, transaction); | 627 | __jbd2_journal_drop_transaction(journal, transaction); |
627 | 628 | ||
628 | /* Just in case anybody was waiting for more transactions to be | 629 | /* Just in case anybody was waiting for more transactions to be |
629 | checkpointed... */ | 630 | checkpointed... */ |
630 | wake_up(&journal->j_wait_logspace); | 631 | wake_up(&journal->j_wait_logspace); |
631 | ret = 1; | 632 | ret = 1; |
632 | out: | 633 | out: |
633 | JBUFFER_TRACE(jh, "exit"); | 634 | JBUFFER_TRACE(jh, "exit"); |
634 | return ret; | 635 | return ret; |
635 | } | 636 | } |
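__jbd2_journal_remove_checkpoint() is the unlink side of the checkpoint ring: take one entry out and, only if both checkpoint lists end up empty on a finished transaction, drop the transaction itself. A toy userspace version of the unlink-and-report-empty step; the names and the single-ring simplification are mine, not jbd2's.

#include <stdio.h>

struct node {
        struct node *next, *prev;
        int id;
};

static int unlink_node(struct node **headp, struct node *n)
{
        if (n->next == n) {                    /* last entry on the ring */
                *headp = NULL;
                return 1;                      /* caller may drop the container */
        }
        n->prev->next = n->next;
        n->next->prev = n->prev;
        if (*headp == n)
                *headp = n->next;
        return 0;
}

int main(void)
{
        struct node a = { .id = 1 }, b = { .id = 2 };
        struct node *head = &a;

        a.next = &b; a.prev = &b;              /* two-entry ring */
        b.next = &a; b.prev = &a;

        printf("empty after unlinking b: %d\n", unlink_node(&head, &b));
        printf("empty after unlinking a: %d\n", unlink_node(&head, &a));
        return 0;
}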
636 | 637 | ||
637 | /* | 638 | /* |
638 | * journal_insert_checkpoint: put a committed buffer onto a checkpoint | 639 | * journal_insert_checkpoint: put a committed buffer onto a checkpoint |
639 | * list so that we know when it is safe to clean the transaction out of | 640 | * list so that we know when it is safe to clean the transaction out of |
640 | * the log. | 641 | * the log. |
641 | * | 642 | * |
642 | * Called with the journal locked. | 643 | * Called with the journal locked. |
643 | * Called with j_list_lock held. | 644 | * Called with j_list_lock held. |
644 | */ | 645 | */ |
645 | void __jbd2_journal_insert_checkpoint(struct journal_head *jh, | 646 | void __jbd2_journal_insert_checkpoint(struct journal_head *jh, |
646 | transaction_t *transaction) | 647 | transaction_t *transaction) |
647 | { | 648 | { |
648 | JBUFFER_TRACE(jh, "entry"); | 649 | JBUFFER_TRACE(jh, "entry"); |
649 | J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); | 650 | J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); |
650 | J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); | 651 | J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); |
651 | 652 | ||
652 | jh->b_cp_transaction = transaction; | 653 | jh->b_cp_transaction = transaction; |
653 | 654 | ||
654 | if (!transaction->t_checkpoint_list) { | 655 | if (!transaction->t_checkpoint_list) { |
655 | jh->b_cpnext = jh->b_cpprev = jh; | 656 | jh->b_cpnext = jh->b_cpprev = jh; |
656 | } else { | 657 | } else { |
657 | jh->b_cpnext = transaction->t_checkpoint_list; | 658 | jh->b_cpnext = transaction->t_checkpoint_list; |
658 | jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; | 659 | jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; |
659 | jh->b_cpprev->b_cpnext = jh; | 660 | jh->b_cpprev->b_cpnext = jh; |
660 | jh->b_cpnext->b_cpprev = jh; | 661 | jh->b_cpnext->b_cpprev = jh; |
661 | } | 662 | } |
662 | transaction->t_checkpoint_list = jh; | 663 | transaction->t_checkpoint_list = jh; |
663 | } | 664 | } |
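__jbd2_journal_insert_checkpoint() links the new buffer in just before the current list head and then makes it the new head, so the ring stays circular with head->b_cpprev pointing at the oldest entry. The same pointer dance in a self-contained userspace sketch; struct node and insert_head() are illustrative stand-ins, not the journal_head API.

#include <stdio.h>

struct node {
        struct node *next, *prev;
        int id;
};

static void insert_head(struct node **headp, struct node *n)
{
        struct node *head = *headp;

        if (!head) {
                n->next = n->prev = n;         /* single-entry ring */
        } else {
                n->next = head;
                n->prev = head->prev;
                n->prev->next = n;
                n->next->prev = n;
        }
        *headp = n;
}

int main(void)
{
        struct node a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
        struct node *head = NULL, *p;

        insert_head(&head, &a);
        insert_head(&head, &b);
        insert_head(&head, &c);

        p = head;
        do {                                   /* prints 3 2 1 */
                printf("%d ", p->id);
                p = p->next;
        } while (p != head);
        printf("\n");
        return 0;
}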
664 | 665 | ||
665 | /* | 666 | /* |
666 | * We've finished with this transaction structure: adios... | 667 | * We've finished with this transaction structure: adios... |
667 | * | 668 | * |
668 | * The transaction must have no links except for the checkpoint by this | 669 | * The transaction must have no links except for the checkpoint by this |
669 | * point. | 670 | * point. |
670 | * | 671 | * |
671 | * Called with the journal locked. | 672 | * Called with the journal locked. |
672 | * Called with j_list_lock held. | 673 | * Called with j_list_lock held. |
673 | */ | 674 | */ |
674 | 675 | ||
675 | void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) | 676 | void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) |
676 | { | 677 | { |
677 | assert_spin_locked(&journal->j_list_lock); | 678 | assert_spin_locked(&journal->j_list_lock); |
678 | if (transaction->t_cpnext) { | 679 | if (transaction->t_cpnext) { |
679 | transaction->t_cpnext->t_cpprev = transaction->t_cpprev; | 680 | transaction->t_cpnext->t_cpprev = transaction->t_cpprev; |
680 | transaction->t_cpprev->t_cpnext = transaction->t_cpnext; | 681 | transaction->t_cpprev->t_cpnext = transaction->t_cpnext; |
681 | if (journal->j_checkpoint_transactions == transaction) | 682 | if (journal->j_checkpoint_transactions == transaction) |
682 | journal->j_checkpoint_transactions = | 683 | journal->j_checkpoint_transactions = |
683 | transaction->t_cpnext; | 684 | transaction->t_cpnext; |
684 | if (journal->j_checkpoint_transactions == transaction) | 685 | if (journal->j_checkpoint_transactions == transaction) |
685 | journal->j_checkpoint_transactions = NULL; | 686 | journal->j_checkpoint_transactions = NULL; |
686 | } | 687 | } |
687 | 688 | ||
688 | J_ASSERT(transaction->t_state == T_FINISHED); | 689 | J_ASSERT(transaction->t_state == T_FINISHED); |
689 | J_ASSERT(transaction->t_buffers == NULL); | 690 | J_ASSERT(transaction->t_buffers == NULL); |
690 | J_ASSERT(transaction->t_sync_datalist == NULL); | 691 | J_ASSERT(transaction->t_sync_datalist == NULL); |
691 | J_ASSERT(transaction->t_forget == NULL); | 692 | J_ASSERT(transaction->t_forget == NULL); |
692 | J_ASSERT(transaction->t_iobuf_list == NULL); | 693 | J_ASSERT(transaction->t_iobuf_list == NULL); |
693 | J_ASSERT(transaction->t_shadow_list == NULL); | 694 | J_ASSERT(transaction->t_shadow_list == NULL); |
694 | J_ASSERT(transaction->t_log_list == NULL); | 695 | J_ASSERT(transaction->t_log_list == NULL); |
695 | J_ASSERT(transaction->t_checkpoint_list == NULL); | 696 | J_ASSERT(transaction->t_checkpoint_list == NULL); |
696 | J_ASSERT(transaction->t_checkpoint_io_list == NULL); | 697 | J_ASSERT(transaction->t_checkpoint_io_list == NULL); |
697 | J_ASSERT(transaction->t_updates == 0); | 698 | J_ASSERT(transaction->t_updates == 0); |
698 | J_ASSERT(journal->j_committing_transaction != transaction); | 699 | J_ASSERT(journal->j_committing_transaction != transaction); |
699 | J_ASSERT(journal->j_running_transaction != transaction); | 700 | J_ASSERT(journal->j_running_transaction != transaction); |
700 | 701 | ||
701 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); | 702 | jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); |
702 | kfree(transaction); | 703 | kfree(transaction); |
703 | } | 704 | } |
704 | 705 |
fs/jbd2/commit.c
1 | /* | 1 | /* |
2 | * linux/fs/jbd2/commit.c | 2 | * linux/fs/jbd2/commit.c |
3 | * | 3 | * |
4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 | 4 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 |
5 | * | 5 | * |
6 | * Copyright 1998 Red Hat corp --- All Rights Reserved | 6 | * Copyright 1998 Red Hat corp --- All Rights Reserved |
7 | * | 7 | * |
8 | * This file is part of the Linux kernel and is made available under | 8 | * This file is part of the Linux kernel and is made available under |
9 | * the terms of the GNU General Public License, version 2, or at your | 9 | * the terms of the GNU General Public License, version 2, or at your |
10 | * option, any later version, incorporated herein by reference. | 10 | * option, any later version, incorporated herein by reference. |
11 | * | 11 | * |
12 | * Journal commit routines for the generic filesystem journaling code; | 12 | * Journal commit routines for the generic filesystem journaling code; |
13 | * part of the ext2fs journaling system. | 13 | * part of the ext2fs journaling system. |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/time.h> | 16 | #include <linux/time.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/jbd2.h> | 18 | #include <linux/jbd2.h> |
19 | #include <linux/errno.h> | 19 | #include <linux/errno.h> |
20 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | #include <linux/crc32.h> | 24 | #include <linux/crc32.h> |
25 | 25 | ||
26 | /* | 26 | /* |
27 | * Default IO end handler for temporary BJ_IO buffer_heads. | 27 | * Default IO end handler for temporary BJ_IO buffer_heads. |
28 | */ | 28 | */ |
29 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | 29 | static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) |
30 | { | 30 | { |
31 | BUFFER_TRACE(bh, ""); | 31 | BUFFER_TRACE(bh, ""); |
32 | if (uptodate) | 32 | if (uptodate) |
33 | set_buffer_uptodate(bh); | 33 | set_buffer_uptodate(bh); |
34 | else | 34 | else |
35 | clear_buffer_uptodate(bh); | 35 | clear_buffer_uptodate(bh); |
36 | unlock_buffer(bh); | 36 | unlock_buffer(bh); |
37 | } | 37 | } |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * When an ext3-ordered file is truncated, it is possible that many pages are | 40 | * When an ext3-ordered file is truncated, it is possible that many pages are |
41 | * not successfully freed, because they are attached to a committing transaction. | 41 | * not successfully freed, because they are attached to a committing transaction. |
42 | * After the transaction commits, these pages are left on the LRU, with no | 42 | * After the transaction commits, these pages are left on the LRU, with no |
43 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | 43 | * ->mapping, and with attached buffers. These pages are trivially reclaimable |
44 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | 44 | * by the VM, but their apparent absence upsets the VM accounting, and it makes |
45 | * the numbers in /proc/meminfo look odd. | 45 | * the numbers in /proc/meminfo look odd. |
46 | * | 46 | * |
47 | * So here, we have a buffer which has just come off the forget list. Look to | 47 | * So here, we have a buffer which has just come off the forget list. Look to |
48 | * see if we can strip all buffers from the backing page. | 48 | * see if we can strip all buffers from the backing page. |
49 | * | 49 | * |
50 | * Called under lock_journal(), and possibly under journal_datalist_lock. The | 50 | * Called under lock_journal(), and possibly under journal_datalist_lock. The |
51 | * caller provided us with a ref against the buffer, and we drop that here. | 51 | * caller provided us with a ref against the buffer, and we drop that here. |
52 | */ | 52 | */ |
53 | static void release_buffer_page(struct buffer_head *bh) | 53 | static void release_buffer_page(struct buffer_head *bh) |
54 | { | 54 | { |
55 | struct page *page; | 55 | struct page *page; |
56 | 56 | ||
57 | if (buffer_dirty(bh)) | 57 | if (buffer_dirty(bh)) |
58 | goto nope; | 58 | goto nope; |
59 | if (atomic_read(&bh->b_count) != 1) | 59 | if (atomic_read(&bh->b_count) != 1) |
60 | goto nope; | 60 | goto nope; |
61 | page = bh->b_page; | 61 | page = bh->b_page; |
62 | if (!page) | 62 | if (!page) |
63 | goto nope; | 63 | goto nope; |
64 | if (page->mapping) | 64 | if (page->mapping) |
65 | goto nope; | 65 | goto nope; |
66 | 66 | ||
67 | /* OK, it's a truncated page */ | 67 | /* OK, it's a truncated page */ |
68 | if (TestSetPageLocked(page)) | 68 | if (TestSetPageLocked(page)) |
69 | goto nope; | 69 | goto nope; |
70 | 70 | ||
71 | page_cache_get(page); | 71 | page_cache_get(page); |
72 | __brelse(bh); | 72 | __brelse(bh); |
73 | try_to_free_buffers(page); | 73 | try_to_free_buffers(page); |
74 | unlock_page(page); | 74 | unlock_page(page); |
75 | page_cache_release(page); | 75 | page_cache_release(page); |
76 | return; | 76 | return; |
77 | 77 | ||
78 | nope: | 78 | nope: |
79 | __brelse(bh); | 79 | __brelse(bh); |
80 | } | 80 | } |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | 83 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is |
84 | * held. For ranking reasons we must trylock. If we lose, schedule away and | 84 | * held. For ranking reasons we must trylock. If we lose, schedule away and |
85 | * return 0. j_list_lock is dropped in this case. | 85 | * return 0. j_list_lock is dropped in this case. |
86 | */ | 86 | */ |
87 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) | 87 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) |
88 | { | 88 | { |
89 | if (!jbd_trylock_bh_state(bh)) { | 89 | if (!jbd_trylock_bh_state(bh)) { |
90 | spin_unlock(&journal->j_list_lock); | 90 | spin_unlock(&journal->j_list_lock); |
91 | schedule(); | 91 | schedule(); |
92 | return 0; | 92 | return 0; |
93 | } | 93 | } |
94 | return 1; | 94 | return 1; |
95 | } | 95 | } |
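The comment above describes the lock-ranking problem: with j_list_lock held, the bh_state lock may only be taken with a trylock, and on failure everything is dropped and the caller restarts. A pthread-based sketch of that back-off pattern, where lock_a and lock_b are made-up stand-ins for j_list_lock and the bh state bit lock.

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

static int inverted_lock_demo(void)
{
        if (pthread_mutex_trylock(&lock_b) != 0) {
                pthread_mutex_unlock(&lock_a); /* back off completely...   */
                sched_yield();                 /* ...and let the owner run */
                return 0;                      /* caller must restart      */
        }
        return 1;
}

int main(void)
{
        pthread_mutex_lock(&lock_a);
        while (!inverted_lock_demo())
                pthread_mutex_lock(&lock_a);   /* retake the outer lock, retry */

        puts("holding both locks without violating the ranking");
        pthread_mutex_unlock(&lock_b);
        pthread_mutex_unlock(&lock_a);
        return 0;
}

In this single-threaded demo the trylock always succeeds; the point is the shape of the failure path, which releases the outer lock before retrying so the ordering rule is never violated.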
96 | 96 | ||
97 | /* | 97 | /* |
98 | * Done it all: now submit the commit record. We should have | 98 | * Done it all: now submit the commit record. We should have |
99 | * cleaned up our previous buffers by now, so if we are in abort | 99 | * cleaned up our previous buffers by now, so if we are in abort |
100 | * mode we can now just skip the rest of the journal write | 100 | * mode we can now just skip the rest of the journal write |
101 | * entirely. | 101 | * entirely. |
102 | * | 102 | * |
103 | * Returns 1 if the journal needs to be aborted or 0 on success | 103 | * Returns 1 if the journal needs to be aborted or 0 on success |
104 | */ | 104 | */ |
105 | static int journal_submit_commit_record(journal_t *journal, | 105 | static int journal_submit_commit_record(journal_t *journal, |
106 | transaction_t *commit_transaction, | 106 | transaction_t *commit_transaction, |
107 | struct buffer_head **cbh, | 107 | struct buffer_head **cbh, |
108 | __u32 crc32_sum) | 108 | __u32 crc32_sum) |
109 | { | 109 | { |
110 | struct journal_head *descriptor; | 110 | struct journal_head *descriptor; |
111 | struct commit_header *tmp; | 111 | struct commit_header *tmp; |
112 | struct buffer_head *bh; | 112 | struct buffer_head *bh; |
113 | int ret; | 113 | int ret; |
114 | int barrier_done = 0; | 114 | int barrier_done = 0; |
115 | 115 | ||
116 | if (is_journal_aborted(journal)) | 116 | if (is_journal_aborted(journal)) |
117 | return 0; | 117 | return 0; |
118 | 118 | ||
119 | descriptor = jbd2_journal_get_descriptor_buffer(journal); | 119 | descriptor = jbd2_journal_get_descriptor_buffer(journal); |
120 | if (!descriptor) | 120 | if (!descriptor) |
121 | return 1; | 121 | return 1; |
122 | 122 | ||
123 | bh = jh2bh(descriptor); | 123 | bh = jh2bh(descriptor); |
124 | 124 | ||
125 | tmp = (struct commit_header *)bh->b_data; | 125 | tmp = (struct commit_header *)bh->b_data; |
126 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 126 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
127 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | 127 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); |
128 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 128 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
129 | 129 | ||
130 | if (JBD2_HAS_COMPAT_FEATURE(journal, | 130 | if (JBD2_HAS_COMPAT_FEATURE(journal, |
131 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | 131 | JBD2_FEATURE_COMPAT_CHECKSUM)) { |
132 | tmp->h_chksum_type = JBD2_CRC32_CHKSUM; | 132 | tmp->h_chksum_type = JBD2_CRC32_CHKSUM; |
133 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; | 133 | tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; |
134 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); | 134 | tmp->h_chksum[0] = cpu_to_be32(crc32_sum); |
135 | } | 135 | } |
136 | 136 | ||
137 | JBUFFER_TRACE(descriptor, "submit commit block"); | 137 | JBUFFER_TRACE(descriptor, "submit commit block"); |
138 | lock_buffer(bh); | 138 | lock_buffer(bh); |
139 | 139 | ||
140 | set_buffer_dirty(bh); | 140 | set_buffer_dirty(bh); |
141 | set_buffer_uptodate(bh); | 141 | set_buffer_uptodate(bh); |
142 | bh->b_end_io = journal_end_buffer_io_sync; | 142 | bh->b_end_io = journal_end_buffer_io_sync; |
143 | 143 | ||
144 | if (journal->j_flags & JBD2_BARRIER && | 144 | if (journal->j_flags & JBD2_BARRIER && |
145 | !JBD2_HAS_COMPAT_FEATURE(journal, | 145 | !JBD2_HAS_COMPAT_FEATURE(journal, |
146 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 146 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
147 | set_buffer_ordered(bh); | 147 | set_buffer_ordered(bh); |
148 | barrier_done = 1; | 148 | barrier_done = 1; |
149 | } | 149 | } |
150 | ret = submit_bh(WRITE, bh); | 150 | ret = submit_bh(WRITE, bh); |
151 | 151 | ||
152 | /* is it possible for another commit to fail at roughly | 152 | /* is it possible for another commit to fail at roughly |
153 | * the same time as this one? If so, we don't want to | 153 | * the same time as this one? If so, we don't want to |
154 | * trust the barrier flag in the super, but instead want | 154 | * trust the barrier flag in the super, but instead want |
155 | * to remember if we sent a barrier request | 155 | * to remember if we sent a barrier request |
156 | */ | 156 | */ |
157 | if (ret == -EOPNOTSUPP && barrier_done) { | 157 | if (ret == -EOPNOTSUPP && barrier_done) { |
158 | char b[BDEVNAME_SIZE]; | 158 | char b[BDEVNAME_SIZE]; |
159 | 159 | ||
160 | printk(KERN_WARNING | 160 | printk(KERN_WARNING |
161 | "JBD: barrier-based sync failed on %s - " | 161 | "JBD: barrier-based sync failed on %s - " |
162 | "disabling barriers\n", | 162 | "disabling barriers\n", |
163 | bdevname(journal->j_dev, b)); | 163 | bdevname(journal->j_dev, b)); |
164 | spin_lock(&journal->j_state_lock); | 164 | spin_lock(&journal->j_state_lock); |
165 | journal->j_flags &= ~JBD2_BARRIER; | 165 | journal->j_flags &= ~JBD2_BARRIER; |
166 | spin_unlock(&journal->j_state_lock); | 166 | spin_unlock(&journal->j_state_lock); |
167 | 167 | ||
168 | /* And try again, without the barrier */ | 168 | /* And try again, without the barrier */ |
169 | clear_buffer_ordered(bh); | 169 | clear_buffer_ordered(bh); |
170 | set_buffer_uptodate(bh); | 170 | set_buffer_uptodate(bh); |
171 | set_buffer_dirty(bh); | 171 | set_buffer_dirty(bh); |
172 | ret = submit_bh(WRITE, bh); | 172 | ret = submit_bh(WRITE, bh); |
173 | } | 173 | } |
174 | *cbh = bh; | 174 | *cbh = bh; |
175 | return ret; | 175 | return ret; |
176 | } | 176 | } |
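journal_submit_commit_record() first submits the commit block with a barrier request and, if the device answers -EOPNOTSUPP, clears JBD2_BARRIER journal-wide and resubmits plainly. A compact userspace sketch of that try-then-degrade flow; submit_write(), barriers_supported and barrier_flag are invented stand-ins for submit_bh(), the device capability and the JBD2_BARRIER flag.

#include <errno.h>
#include <stdio.h>

static int barriers_supported;                 /* pretend the device has none */

static int submit_write(int use_barrier)
{
        if (use_barrier && !barriers_supported)
                return -EOPNOTSUPP;
        return 0;
}

static int commit_record(int *barrier_flag)
{
        int ret = submit_write(*barrier_flag);

        if (ret == -EOPNOTSUPP && *barrier_flag) {
                fprintf(stderr, "barrier failed - disabling barriers\n");
                *barrier_flag = 0;             /* never ask this device again */
                ret = submit_write(0);         /* and try again, without it   */
        }
        return ret;
}

int main(void)
{
        int barrier_flag = 1;
        int ret;

        ret = commit_record(&barrier_flag);
        printf("first commit:  %d (barrier flag now %d)\n", ret, barrier_flag);
        ret = commit_record(&barrier_flag);
        printf("second commit: %d (barrier flag now %d)\n", ret, barrier_flag);
        return 0;
}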
177 | 177 | ||
178 | /* | 178 | /* |
179 | * This function along with journal_submit_commit_record | 179 | * This function along with journal_submit_commit_record |
180 | * allows to write the commit record asynchronously. | 180 | * allows to write the commit record asynchronously. |
181 | */ | 181 | */ |
182 | static int journal_wait_on_commit_record(struct buffer_head *bh) | 182 | static int journal_wait_on_commit_record(struct buffer_head *bh) |
183 | { | 183 | { |
184 | int ret = 0; | 184 | int ret = 0; |
185 | 185 | ||
186 | clear_buffer_dirty(bh); | 186 | clear_buffer_dirty(bh); |
187 | wait_on_buffer(bh); | 187 | wait_on_buffer(bh); |
188 | 188 | ||
189 | if (unlikely(!buffer_uptodate(bh))) | 189 | if (unlikely(!buffer_uptodate(bh))) |
190 | ret = -EIO; | 190 | ret = -EIO; |
191 | put_bh(bh); /* One for getblk() */ | 191 | put_bh(bh); /* One for getblk() */ |
192 | jbd2_journal_put_journal_head(bh2jh(bh)); | 192 | jbd2_journal_put_journal_head(bh2jh(bh)); |
193 | 193 | ||
194 | return ret; | 194 | return ret; |
195 | } | 195 | } |
196 | 196 | ||
197 | /* | 197 | /* |
198 | * Wait for all submitted IO to complete. | 198 | * Wait for all submitted IO to complete. |
199 | */ | 199 | */ |
200 | static int journal_wait_on_locked_list(journal_t *journal, | 200 | static int journal_wait_on_locked_list(journal_t *journal, |
201 | transaction_t *commit_transaction) | 201 | transaction_t *commit_transaction) |
202 | { | 202 | { |
203 | int ret = 0; | 203 | int ret = 0; |
204 | struct journal_head *jh; | 204 | struct journal_head *jh; |
205 | 205 | ||
206 | while (commit_transaction->t_locked_list) { | 206 | while (commit_transaction->t_locked_list) { |
207 | struct buffer_head *bh; | 207 | struct buffer_head *bh; |
208 | 208 | ||
209 | jh = commit_transaction->t_locked_list->b_tprev; | 209 | jh = commit_transaction->t_locked_list->b_tprev; |
210 | bh = jh2bh(jh); | 210 | bh = jh2bh(jh); |
211 | get_bh(bh); | 211 | get_bh(bh); |
212 | if (buffer_locked(bh)) { | 212 | if (buffer_locked(bh)) { |
213 | spin_unlock(&journal->j_list_lock); | 213 | spin_unlock(&journal->j_list_lock); |
214 | wait_on_buffer(bh); | 214 | wait_on_buffer(bh); |
215 | if (unlikely(!buffer_uptodate(bh))) | 215 | if (unlikely(!buffer_uptodate(bh))) |
216 | ret = -EIO; | 216 | ret = -EIO; |
217 | spin_lock(&journal->j_list_lock); | 217 | spin_lock(&journal->j_list_lock); |
218 | } | 218 | } |
219 | if (!inverted_lock(journal, bh)) { | 219 | if (!inverted_lock(journal, bh)) { |
220 | put_bh(bh); | 220 | put_bh(bh); |
221 | spin_lock(&journal->j_list_lock); | 221 | spin_lock(&journal->j_list_lock); |
222 | continue; | 222 | continue; |
223 | } | 223 | } |
224 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | 224 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { |
225 | __jbd2_journal_unfile_buffer(jh); | 225 | __jbd2_journal_unfile_buffer(jh); |
226 | jbd_unlock_bh_state(bh); | 226 | jbd_unlock_bh_state(bh); |
227 | jbd2_journal_remove_journal_head(bh); | 227 | jbd2_journal_remove_journal_head(bh); |
228 | put_bh(bh); | 228 | put_bh(bh); |
229 | } else { | 229 | } else { |
230 | jbd_unlock_bh_state(bh); | 230 | jbd_unlock_bh_state(bh); |
231 | } | 231 | } |
232 | put_bh(bh); | 232 | put_bh(bh); |
233 | cond_resched_lock(&journal->j_list_lock); | 233 | cond_resched_lock(&journal->j_list_lock); |
234 | } | 234 | } |
235 | return ret; | 235 | return ret; |
236 | } | 236 | } |
237 | 237 | ||
238 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | 238 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) |
239 | { | 239 | { |
240 | int i; | 240 | int i; |
241 | 241 | ||
242 | for (i = 0; i < bufs; i++) { | 242 | for (i = 0; i < bufs; i++) { |
243 | wbuf[i]->b_end_io = end_buffer_write_sync; | 243 | wbuf[i]->b_end_io = end_buffer_write_sync; |
244 | /* We use up our safety reference in submit_bh() */ | 244 | /* We use up our safety reference in submit_bh() */ |
245 | submit_bh(WRITE, wbuf[i]); | 245 | submit_bh(WRITE, wbuf[i]); |
246 | } | 246 | } |
247 | } | 247 | } |
248 | 248 | ||
249 | /* | 249 | /* |
250 | * Submit all the data buffers to disk | 250 | * Submit all the data buffers to disk |
251 | */ | 251 | */ |
252 | static void journal_submit_data_buffers(journal_t *journal, | 252 | static void journal_submit_data_buffers(journal_t *journal, |
253 | transaction_t *commit_transaction) | 253 | transaction_t *commit_transaction) |
254 | { | 254 | { |
255 | struct journal_head *jh; | 255 | struct journal_head *jh; |
256 | struct buffer_head *bh; | 256 | struct buffer_head *bh; |
257 | int locked; | 257 | int locked; |
258 | int bufs = 0; | 258 | int bufs = 0; |
259 | struct buffer_head **wbuf = journal->j_wbuf; | 259 | struct buffer_head **wbuf = journal->j_wbuf; |
260 | 260 | ||
261 | /* | 261 | /* |
262 | * Whenever we unlock the journal and sleep, things can get added | 262 | * Whenever we unlock the journal and sleep, things can get added |
263 | * onto ->t_sync_datalist, so we have to keep looping back to | 263 | * onto ->t_sync_datalist, so we have to keep looping back to |
264 | * write_out_data until we *know* that the list is empty. | 264 | * write_out_data until we *know* that the list is empty. |
265 | * | 265 | * |
266 | * Cleanup any flushed data buffers from the data list. Even in | 266 | * Cleanup any flushed data buffers from the data list. Even in |
267 | * abort mode, we want to flush this out as soon as possible. | 267 | * abort mode, we want to flush this out as soon as possible. |
268 | */ | 268 | */ |
269 | write_out_data: | 269 | write_out_data: |
270 | cond_resched(); | 270 | cond_resched(); |
271 | spin_lock(&journal->j_list_lock); | 271 | spin_lock(&journal->j_list_lock); |
272 | 272 | ||
273 | while (commit_transaction->t_sync_datalist) { | 273 | while (commit_transaction->t_sync_datalist) { |
274 | jh = commit_transaction->t_sync_datalist; | 274 | jh = commit_transaction->t_sync_datalist; |
275 | bh = jh2bh(jh); | 275 | bh = jh2bh(jh); |
276 | locked = 0; | 276 | locked = 0; |
277 | 277 | ||
278 | /* Get reference just to make sure buffer does not disappear | 278 | /* Get reference just to make sure buffer does not disappear |
279 | * when we are forced to drop various locks */ | 279 | * when we are forced to drop various locks */ |
280 | get_bh(bh); | 280 | get_bh(bh); |
281 | /* If the buffer is dirty, we need to submit IO and hence | 281 | /* If the buffer is dirty, we need to submit IO and hence |
282 | * we need the buffer lock. We try to lock the buffer without | 282 | * we need the buffer lock. We try to lock the buffer without |
283 | * blocking. If we fail, we need to drop j_list_lock and do | 283 | * blocking. If we fail, we need to drop j_list_lock and do |
284 | * blocking lock_buffer(). | 284 | * blocking lock_buffer(). |
285 | */ | 285 | */ |
286 | if (buffer_dirty(bh)) { | 286 | if (buffer_dirty(bh)) { |
287 | if (test_set_buffer_locked(bh)) { | 287 | if (test_set_buffer_locked(bh)) { |
288 | BUFFER_TRACE(bh, "needs blocking lock"); | 288 | BUFFER_TRACE(bh, "needs blocking lock"); |
289 | spin_unlock(&journal->j_list_lock); | 289 | spin_unlock(&journal->j_list_lock); |
290 | /* Write out all data to prevent deadlocks */ | 290 | /* Write out all data to prevent deadlocks */ |
291 | journal_do_submit_data(wbuf, bufs); | 291 | journal_do_submit_data(wbuf, bufs); |
292 | bufs = 0; | 292 | bufs = 0; |
293 | lock_buffer(bh); | 293 | lock_buffer(bh); |
294 | spin_lock(&journal->j_list_lock); | 294 | spin_lock(&journal->j_list_lock); |
295 | } | 295 | } |
296 | locked = 1; | 296 | locked = 1; |
297 | } | 297 | } |
298 | /* We have to get bh_state lock. Again out of order, sigh. */ | 298 | /* We have to get bh_state lock. Again out of order, sigh. */ |
299 | if (!inverted_lock(journal, bh)) { | 299 | if (!inverted_lock(journal, bh)) { |
300 | jbd_lock_bh_state(bh); | 300 | jbd_lock_bh_state(bh); |
301 | spin_lock(&journal->j_list_lock); | 301 | spin_lock(&journal->j_list_lock); |
302 | } | 302 | } |
303 | /* Someone already cleaned up the buffer? */ | 303 | /* Someone already cleaned up the buffer? */ |
304 | if (!buffer_jbd(bh) | 304 | if (!buffer_jbd(bh) |
305 | || jh->b_transaction != commit_transaction | 305 | || jh->b_transaction != commit_transaction |
306 | || jh->b_jlist != BJ_SyncData) { | 306 | || jh->b_jlist != BJ_SyncData) { |
307 | jbd_unlock_bh_state(bh); | 307 | jbd_unlock_bh_state(bh); |
308 | if (locked) | 308 | if (locked) |
309 | unlock_buffer(bh); | 309 | unlock_buffer(bh); |
310 | BUFFER_TRACE(bh, "already cleaned up"); | 310 | BUFFER_TRACE(bh, "already cleaned up"); |
311 | put_bh(bh); | 311 | put_bh(bh); |
312 | continue; | 312 | continue; |
313 | } | 313 | } |
314 | if (locked && test_clear_buffer_dirty(bh)) { | 314 | if (locked && test_clear_buffer_dirty(bh)) { |
315 | BUFFER_TRACE(bh, "needs writeout, adding to array"); | 315 | BUFFER_TRACE(bh, "needs writeout, adding to array"); |
316 | wbuf[bufs++] = bh; | 316 | wbuf[bufs++] = bh; |
317 | __jbd2_journal_file_buffer(jh, commit_transaction, | 317 | __jbd2_journal_file_buffer(jh, commit_transaction, |
318 | BJ_Locked); | 318 | BJ_Locked); |
319 | jbd_unlock_bh_state(bh); | 319 | jbd_unlock_bh_state(bh); |
320 | if (bufs == journal->j_wbufsize) { | 320 | if (bufs == journal->j_wbufsize) { |
321 | spin_unlock(&journal->j_list_lock); | 321 | spin_unlock(&journal->j_list_lock); |
322 | journal_do_submit_data(wbuf, bufs); | 322 | journal_do_submit_data(wbuf, bufs); |
323 | bufs = 0; | 323 | bufs = 0; |
324 | goto write_out_data; | 324 | goto write_out_data; |
325 | } | 325 | } |
326 | } else if (!locked && buffer_locked(bh)) { | 326 | } else if (!locked && buffer_locked(bh)) { |
327 | __jbd2_journal_file_buffer(jh, commit_transaction, | 327 | __jbd2_journal_file_buffer(jh, commit_transaction, |
328 | BJ_Locked); | 328 | BJ_Locked); |
329 | jbd_unlock_bh_state(bh); | 329 | jbd_unlock_bh_state(bh); |
330 | put_bh(bh); | 330 | put_bh(bh); |
331 | } else { | 331 | } else { |
332 | BUFFER_TRACE(bh, "writeout complete: unfile"); | 332 | BUFFER_TRACE(bh, "writeout complete: unfile"); |
333 | __jbd2_journal_unfile_buffer(jh); | 333 | __jbd2_journal_unfile_buffer(jh); |
334 | jbd_unlock_bh_state(bh); | 334 | jbd_unlock_bh_state(bh); |
335 | if (locked) | 335 | if (locked) |
336 | unlock_buffer(bh); | 336 | unlock_buffer(bh); |
337 | jbd2_journal_remove_journal_head(bh); | 337 | jbd2_journal_remove_journal_head(bh); |
338 | /* Once for our safety reference, once for | 338 | /* Once for our safety reference, once for |
339 | * jbd2_journal_remove_journal_head() */ | 339 | * jbd2_journal_remove_journal_head() */ |
340 | put_bh(bh); | 340 | put_bh(bh); |
341 | put_bh(bh); | 341 | put_bh(bh); |
342 | } | 342 | } |
343 | 343 | ||
344 | if (lock_need_resched(&journal->j_list_lock)) { | 344 | if (need_resched() || spin_needbreak(&journal->j_list_lock)) { |
345 | spin_unlock(&journal->j_list_lock); | 345 | spin_unlock(&journal->j_list_lock); |
346 | goto write_out_data; | 346 | goto write_out_data; |
347 | } | 347 | } |
348 | } | 348 | } |
349 | spin_unlock(&journal->j_list_lock); | 349 | spin_unlock(&journal->j_list_lock); |
350 | journal_do_submit_data(wbuf, bufs); | 350 | journal_do_submit_data(wbuf, bufs); |
351 | } | 351 | } |
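The replacement of lock_need_resched() with need_resched() || spin_needbreak(&journal->j_list_lock) a few lines above is the functional change this patch makes to this loop: j_list_lock is dropped either when a reschedule is pending or when spin_needbreak() signals that the lock is contended, and the scan then jumps back to write_out_data. A userspace model of that lock-break loop; list_lock, waiters and need_break() are illustrative stand-ins for the spinlock, spin_needbreak() and need_resched(), and the item counter stands in for the t_sync_datalist walk.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int waiters;                     /* "someone wants the lock" */

static int need_break(int iteration)
{
        return (iteration % 64) == 0;          /* pretend a resched is due */
}

static void process_items(int nr_items)
{
        int i = 0;

restart:
        pthread_mutex_lock(&list_lock);
        while (i < nr_items) {
                /* ...handle item i under the lock... */
                i++;
                if (need_break(i) || atomic_load(&waiters)) {
                        pthread_mutex_unlock(&list_lock);
                        sched_yield();         /* let the waiter or scheduler in */
                        goto restart;          /* mirrors "goto write_out_data"  */
                }
        }
        pthread_mutex_unlock(&list_lock);
}

int main(void)
{
        process_items(1000);
        puts("walked the whole list, dropping the lock every 64 items");
        return 0;
}

Dropping and retaking the lock like this bounds how long other CPUs can be kept waiting on j_list_lock while a long data list is walked.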
352 | 352 | ||
353 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) | 353 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) |
354 | { | 354 | { |
355 | struct page *page = bh->b_page; | 355 | struct page *page = bh->b_page; |
356 | char *addr; | 356 | char *addr; |
357 | __u32 checksum; | 357 | __u32 checksum; |
358 | 358 | ||
359 | addr = kmap_atomic(page, KM_USER0); | 359 | addr = kmap_atomic(page, KM_USER0); |
360 | checksum = crc32_be(crc32_sum, | 360 | checksum = crc32_be(crc32_sum, |
361 | (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); | 361 | (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); |
362 | kunmap_atomic(addr, KM_USER0); | 362 | kunmap_atomic(addr, KM_USER0); |
363 | 363 | ||
364 | return checksum; | 364 | return checksum; |
365 | } | 365 | } |
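jbd2_checksum_data() folds each data buffer into a running crc32_be() value that later goes into the commit record. As an illustration only, here is a bitwise big-endian CRC-32 over the 0x04c11db7 polynomial (the one the kernel's crc32_be() is defined around), seeded with ~0 the way the commit path seeds crc32_sum; crc32_be_bitwise() and the sample strings are mine, and the kernel's table-driven implementation is what actually runs.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_be_bitwise(uint32_t crc, const unsigned char *p,
                                 size_t len)
{
        int i;

        while (len--) {
                crc ^= (uint32_t)*p++ << 24;   /* feed the next byte, MSB first */
                for (i = 0; i < 8; i++)
                        crc = (crc & 0x80000000u)
                                ? (crc << 1) ^ 0x04c11db7u
                                : (crc << 1);
        }
        return crc;
}

int main(void)
{
        const char *blocks[] = { "first data block", "second data block" };
        uint32_t crc = ~0u;                    /* seeded with ~0, as above */
        size_t i;

        for (i = 0; i < 2; i++)                /* fold each block into the sum */
                crc = crc32_be_bitwise(crc, (const unsigned char *)blocks[i],
                                       strlen(blocks[i]));
        printf("running commit checksum: %#010x\n", crc);
        return 0;
}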
366 | 366 | ||
367 | static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, | 367 | static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, |
368 | unsigned long long block) | 368 | unsigned long long block) |
369 | { | 369 | { |
370 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); | 370 | tag->t_blocknr = cpu_to_be32(block & (u32)~0); |
371 | if (tag_bytes > JBD2_TAG_SIZE32) | 371 | if (tag_bytes > JBD2_TAG_SIZE32) |
372 | tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); | 372 | tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); |
373 | } | 373 | } |
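write_tag_block() stores the low 32 bits of the block number in t_blocknr and, when the journal uses tags larger than JBD2_TAG_SIZE32, the upper bits in t_blocknr_high. The (block >> 31) >> 1 idiom gives the same result as a single >> 32 on a 64-bit value while staying well defined if the expression were ever evaluated on a 32-bit type, where shifting by the full width would be undefined; that reading of the idiom is mine, not stated in the source. A quick self-contained check of the split:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned long long block = 0x123456789ULL;        /* needs > 32 bits */
        uint32_t lo = (uint32_t)(block & (uint32_t)~0);   /* t_blocknr       */
        uint32_t hi = (uint32_t)((block >> 31) >> 1);     /* t_blocknr_high  */

        /* recombining the two halves gives the original block number back */
        assert((((unsigned long long)hi << 32) | lo) == block);
        printf("block %#llx -> lo %#x, hi %#x\n", block, lo, hi);
        return 0;
}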
374 | 374 | ||
375 | /* | 375 | /* |
376 | * jbd2_journal_commit_transaction | 376 | * jbd2_journal_commit_transaction |
377 | * | 377 | * |
378 | * The primary function for committing a transaction to the log. This | 378 | * The primary function for committing a transaction to the log. This |
379 | * function is called by the journal thread to begin a complete commit. | 379 | * function is called by the journal thread to begin a complete commit. |
380 | */ | 380 | */ |
381 | void jbd2_journal_commit_transaction(journal_t *journal) | 381 | void jbd2_journal_commit_transaction(journal_t *journal) |
382 | { | 382 | { |
383 | struct transaction_stats_s stats; | 383 | struct transaction_stats_s stats; |
384 | transaction_t *commit_transaction; | 384 | transaction_t *commit_transaction; |
385 | struct journal_head *jh, *new_jh, *descriptor; | 385 | struct journal_head *jh, *new_jh, *descriptor; |
386 | struct buffer_head **wbuf = journal->j_wbuf; | 386 | struct buffer_head **wbuf = journal->j_wbuf; |
387 | int bufs; | 387 | int bufs; |
388 | int flags; | 388 | int flags; |
389 | int err; | 389 | int err; |
390 | unsigned long long blocknr; | 390 | unsigned long long blocknr; |
391 | char *tagp = NULL; | 391 | char *tagp = NULL; |
392 | journal_header_t *header; | 392 | journal_header_t *header; |
393 | journal_block_tag_t *tag = NULL; | 393 | journal_block_tag_t *tag = NULL; |
394 | int space_left = 0; | 394 | int space_left = 0; |
395 | int first_tag = 0; | 395 | int first_tag = 0; |
396 | int tag_flag; | 396 | int tag_flag; |
397 | int i; | 397 | int i; |
398 | int tag_bytes = journal_tag_bytes(journal); | 398 | int tag_bytes = journal_tag_bytes(journal); |
399 | struct buffer_head *cbh = NULL; /* For transactional checksums */ | 399 | struct buffer_head *cbh = NULL; /* For transactional checksums */ |
400 | __u32 crc32_sum = ~0; | 400 | __u32 crc32_sum = ~0; |
401 | 401 | ||
402 | /* | 402 | /* |
403 | * First job: lock down the current transaction and wait for | 403 | * First job: lock down the current transaction and wait for |
404 | * all outstanding updates to complete. | 404 | * all outstanding updates to complete. |
405 | */ | 405 | */ |
406 | 406 | ||
407 | #ifdef COMMIT_STATS | 407 | #ifdef COMMIT_STATS |
408 | spin_lock(&journal->j_list_lock); | 408 | spin_lock(&journal->j_list_lock); |
409 | summarise_journal_usage(journal); | 409 | summarise_journal_usage(journal); |
410 | spin_unlock(&journal->j_list_lock); | 410 | spin_unlock(&journal->j_list_lock); |
411 | #endif | 411 | #endif |
412 | 412 | ||
413 | /* Do we need to erase the effects of a prior jbd2_journal_flush? */ | 413 | /* Do we need to erase the effects of a prior jbd2_journal_flush? */ |
414 | if (journal->j_flags & JBD2_FLUSHED) { | 414 | if (journal->j_flags & JBD2_FLUSHED) { |
415 | jbd_debug(3, "super block updated\n"); | 415 | jbd_debug(3, "super block updated\n"); |
416 | jbd2_journal_update_superblock(journal, 1); | 416 | jbd2_journal_update_superblock(journal, 1); |
417 | } else { | 417 | } else { |
418 | jbd_debug(3, "superblock not updated\n"); | 418 | jbd_debug(3, "superblock not updated\n"); |
419 | } | 419 | } |
420 | 420 | ||
421 | J_ASSERT(journal->j_running_transaction != NULL); | 421 | J_ASSERT(journal->j_running_transaction != NULL); |
422 | J_ASSERT(journal->j_committing_transaction == NULL); | 422 | J_ASSERT(journal->j_committing_transaction == NULL); |
423 | 423 | ||
424 | commit_transaction = journal->j_running_transaction; | 424 | commit_transaction = journal->j_running_transaction; |
425 | J_ASSERT(commit_transaction->t_state == T_RUNNING); | 425 | J_ASSERT(commit_transaction->t_state == T_RUNNING); |
426 | 426 | ||
427 | jbd_debug(1, "JBD: starting commit of transaction %d\n", | 427 | jbd_debug(1, "JBD: starting commit of transaction %d\n", |
428 | commit_transaction->t_tid); | 428 | commit_transaction->t_tid); |
429 | 429 | ||
430 | spin_lock(&journal->j_state_lock); | 430 | spin_lock(&journal->j_state_lock); |
431 | commit_transaction->t_state = T_LOCKED; | 431 | commit_transaction->t_state = T_LOCKED; |
432 | 432 | ||
433 | stats.u.run.rs_wait = commit_transaction->t_max_wait; | 433 | stats.u.run.rs_wait = commit_transaction->t_max_wait; |
434 | stats.u.run.rs_locked = jiffies; | 434 | stats.u.run.rs_locked = jiffies; |
435 | stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, | 435 | stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, |
436 | stats.u.run.rs_locked); | 436 | stats.u.run.rs_locked); |
437 | 437 | ||
438 | spin_lock(&commit_transaction->t_handle_lock); | 438 | spin_lock(&commit_transaction->t_handle_lock); |
439 | while (commit_transaction->t_updates) { | 439 | while (commit_transaction->t_updates) { |
440 | DEFINE_WAIT(wait); | 440 | DEFINE_WAIT(wait); |
441 | 441 | ||
442 | prepare_to_wait(&journal->j_wait_updates, &wait, | 442 | prepare_to_wait(&journal->j_wait_updates, &wait, |
443 | TASK_UNINTERRUPTIBLE); | 443 | TASK_UNINTERRUPTIBLE); |
444 | if (commit_transaction->t_updates) { | 444 | if (commit_transaction->t_updates) { |
445 | spin_unlock(&commit_transaction->t_handle_lock); | 445 | spin_unlock(&commit_transaction->t_handle_lock); |
446 | spin_unlock(&journal->j_state_lock); | 446 | spin_unlock(&journal->j_state_lock); |
447 | schedule(); | 447 | schedule(); |
448 | spin_lock(&journal->j_state_lock); | 448 | spin_lock(&journal->j_state_lock); |
449 | spin_lock(&commit_transaction->t_handle_lock); | 449 | spin_lock(&commit_transaction->t_handle_lock); |
450 | } | 450 | } |
451 | finish_wait(&journal->j_wait_updates, &wait); | 451 | finish_wait(&journal->j_wait_updates, &wait); |
452 | } | 452 | } |
453 | spin_unlock(&commit_transaction->t_handle_lock); | 453 | spin_unlock(&commit_transaction->t_handle_lock); |
454 | 454 | ||
455 | J_ASSERT (commit_transaction->t_outstanding_credits <= | 455 | J_ASSERT (commit_transaction->t_outstanding_credits <= |
456 | journal->j_max_transaction_buffers); | 456 | journal->j_max_transaction_buffers); |
457 | 457 | ||
458 | /* | 458 | /* |
459 | * First thing we are allowed to do is to discard any remaining | 459 | * First thing we are allowed to do is to discard any remaining |
460 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume | 460 | * BJ_Reserved buffers. Note, it is _not_ permissible to assume |
461 | * that there are no such buffers: if a large filesystem | 461 | * that there are no such buffers: if a large filesystem |
462 | * operation like a truncate needs to split itself over multiple | 462 | * operation like a truncate needs to split itself over multiple |
463 | * transactions, then it may try to do a jbd2_journal_restart() while | 463 | * transactions, then it may try to do a jbd2_journal_restart() while |
464 | * there are still BJ_Reserved buffers outstanding. These must | 464 | * there are still BJ_Reserved buffers outstanding. These must |
465 | * be released cleanly from the current transaction. | 465 | * be released cleanly from the current transaction. |
466 | * | 466 | * |
467 | * In this case, the filesystem must still reserve write access | 467 | * In this case, the filesystem must still reserve write access |
468 | * again before modifying the buffer in the new transaction, but | 468 | * again before modifying the buffer in the new transaction, but |
469 | * we do not require it to remember exactly which old buffers it | 469 | * we do not require it to remember exactly which old buffers it |
470 | * has reserved. This is consistent with the existing behaviour | 470 | * has reserved. This is consistent with the existing behaviour |
471 | * that multiple jbd2_journal_get_write_access() calls to the same | 471 | * that multiple jbd2_journal_get_write_access() calls to the same |
472 | * buffer are perfectly permissible. | 472 | * buffer are perfectly permissible. |
473 | */ | 473 | */ |
474 | while (commit_transaction->t_reserved_list) { | 474 | while (commit_transaction->t_reserved_list) { |
475 | jh = commit_transaction->t_reserved_list; | 475 | jh = commit_transaction->t_reserved_list; |
476 | JBUFFER_TRACE(jh, "reserved, unused: refile"); | 476 | JBUFFER_TRACE(jh, "reserved, unused: refile"); |
477 | /* | 477 | /* |
478 | * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may | 478 | * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may |
479 | * leave undo-committed data. | 479 | * leave undo-committed data. |
480 | */ | 480 | */ |
481 | if (jh->b_committed_data) { | 481 | if (jh->b_committed_data) { |
482 | struct buffer_head *bh = jh2bh(jh); | 482 | struct buffer_head *bh = jh2bh(jh); |
483 | 483 | ||
484 | jbd_lock_bh_state(bh); | 484 | jbd_lock_bh_state(bh); |
485 | jbd2_free(jh->b_committed_data, bh->b_size); | 485 | jbd2_free(jh->b_committed_data, bh->b_size); |
486 | jh->b_committed_data = NULL; | 486 | jh->b_committed_data = NULL; |
487 | jbd_unlock_bh_state(bh); | 487 | jbd_unlock_bh_state(bh); |
488 | } | 488 | } |
489 | jbd2_journal_refile_buffer(journal, jh); | 489 | jbd2_journal_refile_buffer(journal, jh); |
490 | } | 490 | } |
491 | 491 | ||
492 | /* | 492 | /* |
493 | * Now try to drop any written-back buffers from the journal's | 493 | * Now try to drop any written-back buffers from the journal's |
494 | * checkpoint lists. We do this *before* commit because it potentially | 494 | * checkpoint lists. We do this *before* commit because it potentially |
495 | * frees some memory | 495 | * frees some memory |
496 | */ | 496 | */ |
497 | spin_lock(&journal->j_list_lock); | 497 | spin_lock(&journal->j_list_lock); |
498 | __jbd2_journal_clean_checkpoint_list(journal); | 498 | __jbd2_journal_clean_checkpoint_list(journal); |
499 | spin_unlock(&journal->j_list_lock); | 499 | spin_unlock(&journal->j_list_lock); |
500 | 500 | ||
501 | jbd_debug (3, "JBD: commit phase 1\n"); | 501 | jbd_debug (3, "JBD: commit phase 1\n"); |
502 | 502 | ||
503 | /* | 503 | /* |
504 | * Switch to a new revoke table. | 504 | * Switch to a new revoke table. |
505 | */ | 505 | */ |
506 | jbd2_journal_switch_revoke_table(journal); | 506 | jbd2_journal_switch_revoke_table(journal); |
507 | 507 | ||
508 | stats.u.run.rs_flushing = jiffies; | 508 | stats.u.run.rs_flushing = jiffies; |
509 | stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, | 509 | stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, |
510 | stats.u.run.rs_flushing); | 510 | stats.u.run.rs_flushing); |
511 | 511 | ||
512 | commit_transaction->t_state = T_FLUSH; | 512 | commit_transaction->t_state = T_FLUSH; |
513 | journal->j_committing_transaction = commit_transaction; | 513 | journal->j_committing_transaction = commit_transaction; |
514 | journal->j_running_transaction = NULL; | 514 | journal->j_running_transaction = NULL; |
515 | commit_transaction->t_log_start = journal->j_head; | 515 | commit_transaction->t_log_start = journal->j_head; |
516 | wake_up(&journal->j_wait_transaction_locked); | 516 | wake_up(&journal->j_wait_transaction_locked); |
517 | spin_unlock(&journal->j_state_lock); | 517 | spin_unlock(&journal->j_state_lock); |
518 | 518 | ||
519 | jbd_debug (3, "JBD: commit phase 2\n"); | 519 | jbd_debug (3, "JBD: commit phase 2\n"); |
520 | 520 | ||
521 | /* | 521 | /* |
522 | * First, drop modified flag: all accesses to the buffers | 522 | * First, drop modified flag: all accesses to the buffers |
523 | * will be tracked for a new transaction only -bzzz | 523 | * will be tracked for a new transaction only -bzzz |
524 | */ | 524 | */ |
525 | spin_lock(&journal->j_list_lock); | 525 | spin_lock(&journal->j_list_lock); |
526 | if (commit_transaction->t_buffers) { | 526 | if (commit_transaction->t_buffers) { |
527 | new_jh = jh = commit_transaction->t_buffers->b_tnext; | 527 | new_jh = jh = commit_transaction->t_buffers->b_tnext; |
528 | do { | 528 | do { |
529 | J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || | 529 | J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || |
530 | new_jh->b_modified == 0); | 530 | new_jh->b_modified == 0); |
531 | new_jh->b_modified = 0; | 531 | new_jh->b_modified = 0; |
532 | new_jh = new_jh->b_tnext; | 532 | new_jh = new_jh->b_tnext; |
533 | } while (new_jh != jh); | 533 | } while (new_jh != jh); |
534 | } | 534 | } |
535 | spin_unlock(&journal->j_list_lock); | 535 | spin_unlock(&journal->j_list_lock); |
536 | 536 | ||
537 | /* | 537 | /* |
538 | * Now start flushing things to disk, in the order they appear | 538 | * Now start flushing things to disk, in the order they appear |
539 | * on the transaction lists. Data blocks go first. | 539 | * on the transaction lists. Data blocks go first. |
540 | */ | 540 | */ |
541 | err = 0; | 541 | err = 0; |
542 | journal_submit_data_buffers(journal, commit_transaction); | 542 | journal_submit_data_buffers(journal, commit_transaction); |
543 | 543 | ||
544 | /* | 544 | /* |
545 | * Wait for all previously submitted IO to complete if commit | 545 | * Wait for all previously submitted IO to complete if commit |
546 | * record is to be written synchronously. | 546 | * record is to be written synchronously. |
547 | */ | 547 | */ |
548 | spin_lock(&journal->j_list_lock); | 548 | spin_lock(&journal->j_list_lock); |
549 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | 549 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
550 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) | 550 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) |
551 | err = journal_wait_on_locked_list(journal, | 551 | err = journal_wait_on_locked_list(journal, |
552 | commit_transaction); | 552 | commit_transaction); |
553 | 553 | ||
554 | spin_unlock(&journal->j_list_lock); | 554 | spin_unlock(&journal->j_list_lock); |
555 | 555 | ||
556 | if (err) | 556 | if (err) |
557 | jbd2_journal_abort(journal, err); | 557 | jbd2_journal_abort(journal, err); |
558 | 558 | ||
559 | jbd2_journal_write_revoke_records(journal, commit_transaction); | 559 | jbd2_journal_write_revoke_records(journal, commit_transaction); |
560 | 560 | ||
561 | jbd_debug(3, "JBD: commit phase 2\n"); | 561 | jbd_debug(3, "JBD: commit phase 2\n"); |
562 | 562 | ||
563 | /* | 563 | /* |
564 | * If we found any dirty or locked buffers, then we should have | 564 | * If we found any dirty or locked buffers, then we should have |
565 | * looped back up to the write_out_data label. If there weren't | 565 | * looped back up to the write_out_data label. If there weren't |
566 | * any then journal_clean_data_list should have wiped the list | 566 | * any then journal_clean_data_list should have wiped the list |
567 | * clean by now, so check that it is in fact empty. | 567 | * clean by now, so check that it is in fact empty. |
568 | */ | 568 | */ |
569 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); | 569 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); |
570 | 570 | ||
571 | jbd_debug (3, "JBD: commit phase 3\n"); | 571 | jbd_debug (3, "JBD: commit phase 3\n"); |
572 | 572 | ||
573 | /* | 573 | /* |
574 | * Way to go: we have now written out all of the data for a | 574 | * Way to go: we have now written out all of the data for a |
575 | * transaction! Now comes the tricky part: we need to write out | 575 | * transaction! Now comes the tricky part: we need to write out |
576 | * metadata. Loop over the transaction's entire buffer list: | 576 | * metadata. Loop over the transaction's entire buffer list: |
577 | */ | 577 | */ |
578 | commit_transaction->t_state = T_COMMIT; | 578 | commit_transaction->t_state = T_COMMIT; |
579 | 579 | ||
580 | stats.u.run.rs_logging = jiffies; | 580 | stats.u.run.rs_logging = jiffies; |
581 | stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, | 581 | stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, |
582 | stats.u.run.rs_logging); | 582 | stats.u.run.rs_logging); |
583 | stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; | 583 | stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; |
584 | stats.u.run.rs_blocks_logged = 0; | 584 | stats.u.run.rs_blocks_logged = 0; |
585 | 585 | ||
586 | descriptor = NULL; | 586 | descriptor = NULL; |
587 | bufs = 0; | 587 | bufs = 0; |
588 | while (commit_transaction->t_buffers) { | 588 | while (commit_transaction->t_buffers) { |
589 | 589 | ||
590 | /* Find the next buffer to be journaled... */ | 590 | /* Find the next buffer to be journaled... */ |
591 | 591 | ||
592 | jh = commit_transaction->t_buffers; | 592 | jh = commit_transaction->t_buffers; |
593 | 593 | ||
594 | /* If we're in abort mode, we just un-journal the buffer and | 594 | /* If we're in abort mode, we just un-journal the buffer and |
595 | release it for background writing. */ | 595 | release it for background writing. */ |
596 | 596 | ||
597 | if (is_journal_aborted(journal)) { | 597 | if (is_journal_aborted(journal)) { |
598 | JBUFFER_TRACE(jh, "journal is aborting: refile"); | 598 | JBUFFER_TRACE(jh, "journal is aborting: refile"); |
599 | jbd2_journal_refile_buffer(journal, jh); | 599 | jbd2_journal_refile_buffer(journal, jh); |
600 | /* If that was the last one, we need to clean up | 600 | /* If that was the last one, we need to clean up |
601 | * any descriptor buffers which may have been | 601 | * any descriptor buffers which may have been |
602 | * already allocated, even if we are now | 602 | * already allocated, even if we are now |
603 | * aborting. */ | 603 | * aborting. */ |
604 | if (!commit_transaction->t_buffers) | 604 | if (!commit_transaction->t_buffers) |
605 | goto start_journal_io; | 605 | goto start_journal_io; |
606 | continue; | 606 | continue; |
607 | } | 607 | } |
608 | 608 | ||
609 | /* Make sure we have a descriptor block in which to | 609 | /* Make sure we have a descriptor block in which to |
610 | record the metadata buffer. */ | 610 | record the metadata buffer. */ |
611 | 611 | ||
612 | if (!descriptor) { | 612 | if (!descriptor) { |
613 | struct buffer_head *bh; | 613 | struct buffer_head *bh; |
614 | 614 | ||
615 | J_ASSERT (bufs == 0); | 615 | J_ASSERT (bufs == 0); |
616 | 616 | ||
617 | jbd_debug(4, "JBD: get descriptor\n"); | 617 | jbd_debug(4, "JBD: get descriptor\n"); |
618 | 618 | ||
619 | descriptor = jbd2_journal_get_descriptor_buffer(journal); | 619 | descriptor = jbd2_journal_get_descriptor_buffer(journal); |
620 | if (!descriptor) { | 620 | if (!descriptor) { |
621 | jbd2_journal_abort(journal, -EIO); | 621 | jbd2_journal_abort(journal, -EIO); |
622 | continue; | 622 | continue; |
623 | } | 623 | } |
624 | 624 | ||
625 | bh = jh2bh(descriptor); | 625 | bh = jh2bh(descriptor); |
626 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", | 626 | jbd_debug(4, "JBD: got buffer %llu (%p)\n", |
627 | (unsigned long long)bh->b_blocknr, bh->b_data); | 627 | (unsigned long long)bh->b_blocknr, bh->b_data); |
628 | header = (journal_header_t *)&bh->b_data[0]; | 628 | header = (journal_header_t *)&bh->b_data[0]; |
629 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 629 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
630 | header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); | 630 | header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); |
631 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 631 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
632 | 632 | ||
633 | tagp = &bh->b_data[sizeof(journal_header_t)]; | 633 | tagp = &bh->b_data[sizeof(journal_header_t)]; |
634 | space_left = bh->b_size - sizeof(journal_header_t); | 634 | space_left = bh->b_size - sizeof(journal_header_t); |
635 | first_tag = 1; | 635 | first_tag = 1; |
636 | set_buffer_jwrite(bh); | 636 | set_buffer_jwrite(bh); |
637 | set_buffer_dirty(bh); | 637 | set_buffer_dirty(bh); |
638 | wbuf[bufs++] = bh; | 638 | wbuf[bufs++] = bh; |
639 | 639 | ||
640 | /* Record it so that we can wait for IO | 640 | /* Record it so that we can wait for IO |
641 | completion later */ | 641 | completion later */ |
642 | BUFFER_TRACE(bh, "ph3: file as descriptor"); | 642 | BUFFER_TRACE(bh, "ph3: file as descriptor"); |
643 | jbd2_journal_file_buffer(descriptor, commit_transaction, | 643 | jbd2_journal_file_buffer(descriptor, commit_transaction, |
644 | BJ_LogCtl); | 644 | BJ_LogCtl); |
645 | } | 645 | } |
646 | 646 | ||
647 | /* Where is the buffer to be written? */ | 647 | /* Where is the buffer to be written? */ |
648 | 648 | ||
649 | err = jbd2_journal_next_log_block(journal, &blocknr); | 649 | err = jbd2_journal_next_log_block(journal, &blocknr); |
650 | /* If the block mapping failed, just abandon the buffer | 650 | /* If the block mapping failed, just abandon the buffer |
651 | and repeat this loop: we'll fall into the | 651 | and repeat this loop: we'll fall into the |
652 | refile-on-abort condition above. */ | 652 | refile-on-abort condition above. */ |
653 | if (err) { | 653 | if (err) { |
654 | jbd2_journal_abort(journal, err); | 654 | jbd2_journal_abort(journal, err); |
655 | continue; | 655 | continue; |
656 | } | 656 | } |
657 | 657 | ||
658 | /* | 658 | /* |
659 | * start_this_handle() uses t_outstanding_credits to determine | 659 | * start_this_handle() uses t_outstanding_credits to determine |
660 | * the free space in the log, but this counter is changed | 660 | * the free space in the log, but this counter is changed |
661 | * by jbd2_journal_next_log_block() also. | 661 | * by jbd2_journal_next_log_block() also. |
662 | */ | 662 | */ |
663 | commit_transaction->t_outstanding_credits--; | 663 | commit_transaction->t_outstanding_credits--; |
664 | 664 | ||
665 | /* Bump b_count to prevent truncate from stumbling over | 665 | /* Bump b_count to prevent truncate from stumbling over |
666 | the shadowed buffer! @@@ This can go if we ever get | 666 | the shadowed buffer! @@@ This can go if we ever get |
667 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ | 667 | rid of the BJ_IO/BJ_Shadow pairing of buffers. */ |
668 | atomic_inc(&jh2bh(jh)->b_count); | 668 | atomic_inc(&jh2bh(jh)->b_count); |
669 | 669 | ||
670 | /* Make a temporary IO buffer with which to write it out | 670 | /* Make a temporary IO buffer with which to write it out |
671 | (this will requeue both the metadata buffer and the | 671 | (this will requeue both the metadata buffer and the |
672 | temporary IO buffer). new_bh goes on BJ_IO*/ | 672 | temporary IO buffer). new_bh goes on BJ_IO*/ |
673 | 673 | ||
674 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); | 674 | set_bit(BH_JWrite, &jh2bh(jh)->b_state); |
675 | /* | 675 | /* |
676 | * akpm: jbd2_journal_write_metadata_buffer() sets | 676 | * akpm: jbd2_journal_write_metadata_buffer() sets |
677 | * new_bh->b_transaction to commit_transaction. | 677 | * new_bh->b_transaction to commit_transaction. |
678 | * We need to clean this up before we release new_bh | 678 | * We need to clean this up before we release new_bh |
679 | * (which is of type BJ_IO) | 679 | * (which is of type BJ_IO) |
680 | */ | 680 | */ |
681 | JBUFFER_TRACE(jh, "ph3: write metadata"); | 681 | JBUFFER_TRACE(jh, "ph3: write metadata"); |
682 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, | 682 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, |
683 | jh, &new_jh, blocknr); | 683 | jh, &new_jh, blocknr); |
684 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); | 684 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); |
685 | wbuf[bufs++] = jh2bh(new_jh); | 685 | wbuf[bufs++] = jh2bh(new_jh); |
686 | 686 | ||
687 | /* Record the new block's tag in the current descriptor | 687 | /* Record the new block's tag in the current descriptor |
688 | buffer */ | 688 | buffer */ |
689 | 689 | ||
690 | tag_flag = 0; | 690 | tag_flag = 0; |
691 | if (flags & 1) | 691 | if (flags & 1) |
692 | tag_flag |= JBD2_FLAG_ESCAPE; | 692 | tag_flag |= JBD2_FLAG_ESCAPE; |
693 | if (!first_tag) | 693 | if (!first_tag) |
694 | tag_flag |= JBD2_FLAG_SAME_UUID; | 694 | tag_flag |= JBD2_FLAG_SAME_UUID; |
695 | 695 | ||
696 | tag = (journal_block_tag_t *) tagp; | 696 | tag = (journal_block_tag_t *) tagp; |
697 | write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); | 697 | write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); |
698 | tag->t_flags = cpu_to_be32(tag_flag); | 698 | tag->t_flags = cpu_to_be32(tag_flag); |
699 | tagp += tag_bytes; | 699 | tagp += tag_bytes; |
700 | space_left -= tag_bytes; | 700 | space_left -= tag_bytes; |
701 | 701 | ||
702 | if (first_tag) { | 702 | if (first_tag) { |
703 | memcpy (tagp, journal->j_uuid, 16); | 703 | memcpy (tagp, journal->j_uuid, 16); |
704 | tagp += 16; | 704 | tagp += 16; |
705 | space_left -= 16; | 705 | space_left -= 16; |
706 | first_tag = 0; | 706 | first_tag = 0; |
707 | } | 707 | } |
708 | 708 | ||
709 | /* If there's no more to do, or if the descriptor is full, | 709 | /* If there's no more to do, or if the descriptor is full, |
710 | let the IO rip! */ | 710 | let the IO rip! */ |
711 | 711 | ||
712 | if (bufs == journal->j_wbufsize || | 712 | if (bufs == journal->j_wbufsize || |
713 | commit_transaction->t_buffers == NULL || | 713 | commit_transaction->t_buffers == NULL || |
714 | space_left < tag_bytes + 16) { | 714 | space_left < tag_bytes + 16) { |
715 | 715 | ||
716 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); | 716 | jbd_debug(4, "JBD: Submit %d IOs\n", bufs); |
717 | 717 | ||
718 | /* Write an end-of-descriptor marker before | 718 | /* Write an end-of-descriptor marker before |
719 | submitting the IOs. "tag" still points to | 719 | submitting the IOs. "tag" still points to |
720 | the last tag we set up. */ | 720 | the last tag we set up. */ |
721 | 721 | ||
722 | tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); | 722 | tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); |
723 | 723 | ||
724 | start_journal_io: | 724 | start_journal_io: |
725 | for (i = 0; i < bufs; i++) { | 725 | for (i = 0; i < bufs; i++) { |
726 | struct buffer_head *bh = wbuf[i]; | 726 | struct buffer_head *bh = wbuf[i]; |
727 | /* | 727 | /* |
728 | * Compute checksum. | 728 | * Compute checksum. |
729 | */ | 729 | */ |
730 | if (JBD2_HAS_COMPAT_FEATURE(journal, | 730 | if (JBD2_HAS_COMPAT_FEATURE(journal, |
731 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | 731 | JBD2_FEATURE_COMPAT_CHECKSUM)) { |
732 | crc32_sum = | 732 | crc32_sum = |
733 | jbd2_checksum_data(crc32_sum, bh); | 733 | jbd2_checksum_data(crc32_sum, bh); |
734 | } | 734 | } |
735 | 735 | ||
736 | lock_buffer(bh); | 736 | lock_buffer(bh); |
737 | clear_buffer_dirty(bh); | 737 | clear_buffer_dirty(bh); |
738 | set_buffer_uptodate(bh); | 738 | set_buffer_uptodate(bh); |
739 | bh->b_end_io = journal_end_buffer_io_sync; | 739 | bh->b_end_io = journal_end_buffer_io_sync; |
740 | submit_bh(WRITE, bh); | 740 | submit_bh(WRITE, bh); |
741 | } | 741 | } |
742 | cond_resched(); | 742 | cond_resched(); |
743 | stats.u.run.rs_blocks_logged += bufs; | 743 | stats.u.run.rs_blocks_logged += bufs; |
744 | 744 | ||
745 | /* Force a new descriptor to be generated next | 745 | /* Force a new descriptor to be generated next |
746 | time round the loop. */ | 746 | time round the loop. */ |
747 | descriptor = NULL; | 747 | descriptor = NULL; |
748 | bufs = 0; | 748 | bufs = 0; |
749 | } | 749 | } |
750 | } | 750 | } |
751 | 751 | ||
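The loop above fills each descriptor block with journal_block_tag_t entries until either the wbuf array is full or there is no longer room for another tag plus the 16-byte UUID (the "space_left < tag_bytes + 16" test), then submits the batch. A minimal user-space sketch of that space accounting follows; demo_header, demo_tag and their sizes are simplified stand-ins, not jbd2's on-disk layout.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-ins for the kernel structures (hypothetical sizes). */
struct demo_header { uint32_t magic, blocktype, sequence; };
struct demo_tag    { uint32_t blocknr, flags; };

#define DEMO_BLOCK_SIZE     4096
#define DEMO_UUID_LEN       16
#define DEMO_FLAG_SAME_UUID 0x2
#define DEMO_FLAG_LAST_TAG  0x8

/* Pack as many tags as fit into one descriptor block; return how many. */
static int pack_descriptor(char *block, const uint32_t *blocks, int nblocks,
                           const unsigned char *uuid)
{
	char *tagp = block + sizeof(struct demo_header);
	size_t space_left = DEMO_BLOCK_SIZE - sizeof(struct demo_header);
	struct demo_tag *tag = NULL;
	int first_tag = 1, i;

	for (i = 0; i < nblocks; i++) {
		/* Stop while there is still room for one more tag plus the
		 * UUID, mirroring the "space_left < tag_bytes + 16" check. */
		if (space_left < sizeof(struct demo_tag) + DEMO_UUID_LEN)
			break;

		tag = (struct demo_tag *)tagp;
		tag->blocknr = blocks[i];
		tag->flags = first_tag ? 0 : DEMO_FLAG_SAME_UUID;
		tagp += sizeof(struct demo_tag);
		space_left -= sizeof(struct demo_tag);

		if (first_tag) {		/* UUID follows only the first tag */
			memcpy(tagp, uuid, DEMO_UUID_LEN);
			tagp += DEMO_UUID_LEN;
			space_left -= DEMO_UUID_LEN;
			first_tag = 0;
		}
	}
	if (tag)				/* mark the final tag, as above */
		tag->flags |= DEMO_FLAG_LAST_TAG;
	return i;
}

int main(void)
{
	static char block[DEMO_BLOCK_SIZE];
	uint32_t blocks[600];
	unsigned char uuid[DEMO_UUID_LEN] = { 0 };
	int i;

	for (i = 0; i < 600; i++)
		blocks[i] = 1000 + i;
	printf("packed %d tags\n", pack_descriptor(block, blocks, 600, uuid));
	return 0;
}

When the demo block fills before the input runs out, the caller would submit it and start a fresh descriptor, which is exactly what the "descriptor = NULL; bufs = 0;" reset in the loop above arranges.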
752 | /* Done it all: now write the commit record asynchronously. */ | 752 | /* Done it all: now write the commit record asynchronously. */ |
753 | 753 | ||
754 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, | 754 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, |
755 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 755 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
756 | err = journal_submit_commit_record(journal, commit_transaction, | 756 | err = journal_submit_commit_record(journal, commit_transaction, |
757 | &cbh, crc32_sum); | 757 | &cbh, crc32_sum); |
758 | if (err) | 758 | if (err) |
759 | __jbd2_journal_abort_hard(journal); | 759 | __jbd2_journal_abort_hard(journal); |
760 | 760 | ||
761 | spin_lock(&journal->j_list_lock); | 761 | spin_lock(&journal->j_list_lock); |
762 | err = journal_wait_on_locked_list(journal, | 762 | err = journal_wait_on_locked_list(journal, |
763 | commit_transaction); | 763 | commit_transaction); |
764 | spin_unlock(&journal->j_list_lock); | 764 | spin_unlock(&journal->j_list_lock); |
765 | if (err) | 765 | if (err) |
766 | __jbd2_journal_abort_hard(journal); | 766 | __jbd2_journal_abort_hard(journal); |
767 | } | 767 | } |
768 | 768 | ||
769 | /* Lo and behold: we have just managed to send a transaction to | 769 | /* Lo and behold: we have just managed to send a transaction to |
770 | the log. Before we can commit it, wait for the IO so far to | 770 | the log. Before we can commit it, wait for the IO so far to |
771 | complete. Control buffers being written are on the | 771 | complete. Control buffers being written are on the |
772 | transaction's t_log_list queue, and metadata buffers are on | 772 | transaction's t_log_list queue, and metadata buffers are on |
773 | the t_iobuf_list queue. | 773 | the t_iobuf_list queue. |
774 | 774 | ||
775 | Wait for the buffers in reverse order. That way we are | 775 | Wait for the buffers in reverse order. That way we are |
776 | less likely to be woken up until all IOs have completed, and | 776 | less likely to be woken up until all IOs have completed, and |
777 | so we incur less scheduling load. | 777 | so we incur less scheduling load. |
778 | */ | 778 | */ |
779 | 779 | ||
780 | jbd_debug(3, "JBD: commit phase 4\n"); | 780 | jbd_debug(3, "JBD: commit phase 4\n"); |
781 | 781 | ||
782 | /* | 782 | /* |
783 | * akpm: these are BJ_IO, and j_list_lock is not needed. | 783 | * akpm: these are BJ_IO, and j_list_lock is not needed. |
784 | * See __journal_try_to_free_buffer. | 784 | * See __journal_try_to_free_buffer. |
785 | */ | 785 | */ |
786 | wait_for_iobuf: | 786 | wait_for_iobuf: |
787 | while (commit_transaction->t_iobuf_list != NULL) { | 787 | while (commit_transaction->t_iobuf_list != NULL) { |
788 | struct buffer_head *bh; | 788 | struct buffer_head *bh; |
789 | 789 | ||
790 | jh = commit_transaction->t_iobuf_list->b_tprev; | 790 | jh = commit_transaction->t_iobuf_list->b_tprev; |
791 | bh = jh2bh(jh); | 791 | bh = jh2bh(jh); |
792 | if (buffer_locked(bh)) { | 792 | if (buffer_locked(bh)) { |
793 | wait_on_buffer(bh); | 793 | wait_on_buffer(bh); |
794 | goto wait_for_iobuf; | 794 | goto wait_for_iobuf; |
795 | } | 795 | } |
796 | if (cond_resched()) | 796 | if (cond_resched()) |
797 | goto wait_for_iobuf; | 797 | goto wait_for_iobuf; |
798 | 798 | ||
799 | if (unlikely(!buffer_uptodate(bh))) | 799 | if (unlikely(!buffer_uptodate(bh))) |
800 | err = -EIO; | 800 | err = -EIO; |
801 | 801 | ||
802 | clear_buffer_jwrite(bh); | 802 | clear_buffer_jwrite(bh); |
803 | 803 | ||
804 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); | 804 | JBUFFER_TRACE(jh, "ph4: unfile after journal write"); |
805 | jbd2_journal_unfile_buffer(journal, jh); | 805 | jbd2_journal_unfile_buffer(journal, jh); |
806 | 806 | ||
807 | /* | 807 | /* |
808 | * ->t_iobuf_list should contain only dummy buffer_heads | 808 | * ->t_iobuf_list should contain only dummy buffer_heads |
809 | * which were created by jbd2_journal_write_metadata_buffer(). | 809 | * which were created by jbd2_journal_write_metadata_buffer(). |
810 | */ | 810 | */ |
811 | BUFFER_TRACE(bh, "dumping temporary bh"); | 811 | BUFFER_TRACE(bh, "dumping temporary bh"); |
812 | jbd2_journal_put_journal_head(jh); | 812 | jbd2_journal_put_journal_head(jh); |
813 | __brelse(bh); | 813 | __brelse(bh); |
814 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); | 814 | J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); |
815 | free_buffer_head(bh); | 815 | free_buffer_head(bh); |
816 | 816 | ||
817 | /* We also have to unlock and free the corresponding | 817 | /* We also have to unlock and free the corresponding |
818 | shadowed buffer */ | 818 | shadowed buffer */ |
819 | jh = commit_transaction->t_shadow_list->b_tprev; | 819 | jh = commit_transaction->t_shadow_list->b_tprev; |
820 | bh = jh2bh(jh); | 820 | bh = jh2bh(jh); |
821 | clear_bit(BH_JWrite, &bh->b_state); | 821 | clear_bit(BH_JWrite, &bh->b_state); |
822 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); | 822 | J_ASSERT_BH(bh, buffer_jbddirty(bh)); |
823 | 823 | ||
824 | /* The metadata is now released for reuse, but we need | 824 | /* The metadata is now released for reuse, but we need |
825 | to remember it against this transaction so that when | 825 | to remember it against this transaction so that when |
826 | we finally commit, we can do any checkpointing | 826 | we finally commit, we can do any checkpointing |
827 | required. */ | 827 | required. */ |
828 | JBUFFER_TRACE(jh, "file as BJ_Forget"); | 828 | JBUFFER_TRACE(jh, "file as BJ_Forget"); |
829 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); | 829 | jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); |
830 | /* Wake up any transactions which were waiting for this | 830 | /* Wake up any transactions which were waiting for this |
831 | IO to complete */ | 831 | IO to complete */ |
832 | wake_up_bit(&bh->b_state, BH_Unshadow); | 832 | wake_up_bit(&bh->b_state, BH_Unshadow); |
833 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); | 833 | JBUFFER_TRACE(jh, "brelse shadowed buffer"); |
834 | __brelse(bh); | 834 | __brelse(bh); |
835 | } | 835 | } |
836 | 836 | ||
837 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | 837 | J_ASSERT (commit_transaction->t_shadow_list == NULL); |
838 | 838 | ||
839 | jbd_debug(3, "JBD: commit phase 5\n"); | 839 | jbd_debug(3, "JBD: commit phase 5\n"); |
840 | 840 | ||
841 | /* Here we wait for the revoke record and descriptor record buffers */ | 841 | /* Here we wait for the revoke record and descriptor record buffers */ |
842 | wait_for_ctlbuf: | 842 | wait_for_ctlbuf: |
843 | while (commit_transaction->t_log_list != NULL) { | 843 | while (commit_transaction->t_log_list != NULL) { |
844 | struct buffer_head *bh; | 844 | struct buffer_head *bh; |
845 | 845 | ||
846 | jh = commit_transaction->t_log_list->b_tprev; | 846 | jh = commit_transaction->t_log_list->b_tprev; |
847 | bh = jh2bh(jh); | 847 | bh = jh2bh(jh); |
848 | if (buffer_locked(bh)) { | 848 | if (buffer_locked(bh)) { |
849 | wait_on_buffer(bh); | 849 | wait_on_buffer(bh); |
850 | goto wait_for_ctlbuf; | 850 | goto wait_for_ctlbuf; |
851 | } | 851 | } |
852 | if (cond_resched()) | 852 | if (cond_resched()) |
853 | goto wait_for_ctlbuf; | 853 | goto wait_for_ctlbuf; |
854 | 854 | ||
855 | if (unlikely(!buffer_uptodate(bh))) | 855 | if (unlikely(!buffer_uptodate(bh))) |
856 | err = -EIO; | 856 | err = -EIO; |
857 | 857 | ||
858 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); | 858 | BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); |
859 | clear_buffer_jwrite(bh); | 859 | clear_buffer_jwrite(bh); |
860 | jbd2_journal_unfile_buffer(journal, jh); | 860 | jbd2_journal_unfile_buffer(journal, jh); |
861 | jbd2_journal_put_journal_head(jh); | 861 | jbd2_journal_put_journal_head(jh); |
862 | __brelse(bh); /* One for getblk */ | 862 | __brelse(bh); /* One for getblk */ |
863 | /* AKPM: bforget here */ | 863 | /* AKPM: bforget here */ |
864 | } | 864 | } |
865 | 865 | ||
866 | jbd_debug(3, "JBD: commit phase 6\n"); | 866 | jbd_debug(3, "JBD: commit phase 6\n"); |
867 | 867 | ||
868 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | 868 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
869 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 869 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
870 | err = journal_submit_commit_record(journal, commit_transaction, | 870 | err = journal_submit_commit_record(journal, commit_transaction, |
871 | &cbh, crc32_sum); | 871 | &cbh, crc32_sum); |
872 | if (err) | 872 | if (err) |
873 | __jbd2_journal_abort_hard(journal); | 873 | __jbd2_journal_abort_hard(journal); |
874 | } | 874 | } |
875 | err = journal_wait_on_commit_record(cbh); | 875 | err = journal_wait_on_commit_record(cbh); |
876 | 876 | ||
877 | if (err) | 877 | if (err) |
878 | jbd2_journal_abort(journal, err); | 878 | jbd2_journal_abort(journal, err); |
879 | 879 | ||
880 | /* End of a transaction! Finally, we can do checkpoint | 880 | /* End of a transaction! Finally, we can do checkpoint |
881 | processing: any buffers committed as a result of this | 881 | processing: any buffers committed as a result of this |
882 | transaction can be removed from any checkpoint list it was on | 882 | transaction can be removed from any checkpoint list it was on |
883 | before. */ | 883 | before. */ |
884 | 884 | ||
885 | jbd_debug(3, "JBD: commit phase 7\n"); | 885 | jbd_debug(3, "JBD: commit phase 7\n"); |
886 | 886 | ||
887 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); | 887 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); |
888 | J_ASSERT(commit_transaction->t_buffers == NULL); | 888 | J_ASSERT(commit_transaction->t_buffers == NULL); |
889 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | 889 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); |
890 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | 890 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); |
891 | J_ASSERT(commit_transaction->t_shadow_list == NULL); | 891 | J_ASSERT(commit_transaction->t_shadow_list == NULL); |
892 | J_ASSERT(commit_transaction->t_log_list == NULL); | 892 | J_ASSERT(commit_transaction->t_log_list == NULL); |
893 | 893 | ||
894 | restart_loop: | 894 | restart_loop: |
895 | /* | 895 | /* |
896 | * As there are other places (journal_unmap_buffer()) adding buffers | 896 | * As there are other places (journal_unmap_buffer()) adding buffers |
897 | * to this list we have to be careful and hold the j_list_lock. | 897 | * to this list we have to be careful and hold the j_list_lock. |
898 | */ | 898 | */ |
899 | spin_lock(&journal->j_list_lock); | 899 | spin_lock(&journal->j_list_lock); |
900 | while (commit_transaction->t_forget) { | 900 | while (commit_transaction->t_forget) { |
901 | transaction_t *cp_transaction; | 901 | transaction_t *cp_transaction; |
902 | struct buffer_head *bh; | 902 | struct buffer_head *bh; |
903 | 903 | ||
904 | jh = commit_transaction->t_forget; | 904 | jh = commit_transaction->t_forget; |
905 | spin_unlock(&journal->j_list_lock); | 905 | spin_unlock(&journal->j_list_lock); |
906 | bh = jh2bh(jh); | 906 | bh = jh2bh(jh); |
907 | jbd_lock_bh_state(bh); | 907 | jbd_lock_bh_state(bh); |
908 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || | 908 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || |
909 | jh->b_transaction == journal->j_running_transaction); | 909 | jh->b_transaction == journal->j_running_transaction); |
910 | 910 | ||
911 | /* | 911 | /* |
912 | * If there is undo-protected committed data against | 912 | * If there is undo-protected committed data against |
913 | * this buffer, then we can remove it now. If it is a | 913 | * this buffer, then we can remove it now. If it is a |
914 | * buffer needing such protection, the old frozen_data | 914 | * buffer needing such protection, the old frozen_data |
915 | * field now points to a committed version of the | 915 | * field now points to a committed version of the |
916 | * buffer, so rotate that field to the new committed | 916 | * buffer, so rotate that field to the new committed |
917 | * data. | 917 | * data. |
918 | * | 918 | * |
919 | * Otherwise, we can just throw away the frozen data now. | 919 | * Otherwise, we can just throw away the frozen data now. |
920 | */ | 920 | */ |
921 | if (jh->b_committed_data) { | 921 | if (jh->b_committed_data) { |
922 | jbd2_free(jh->b_committed_data, bh->b_size); | 922 | jbd2_free(jh->b_committed_data, bh->b_size); |
923 | jh->b_committed_data = NULL; | 923 | jh->b_committed_data = NULL; |
924 | if (jh->b_frozen_data) { | 924 | if (jh->b_frozen_data) { |
925 | jh->b_committed_data = jh->b_frozen_data; | 925 | jh->b_committed_data = jh->b_frozen_data; |
926 | jh->b_frozen_data = NULL; | 926 | jh->b_frozen_data = NULL; |
927 | } | 927 | } |
928 | } else if (jh->b_frozen_data) { | 928 | } else if (jh->b_frozen_data) { |
929 | jbd2_free(jh->b_frozen_data, bh->b_size); | 929 | jbd2_free(jh->b_frozen_data, bh->b_size); |
930 | jh->b_frozen_data = NULL; | 930 | jh->b_frozen_data = NULL; |
931 | } | 931 | } |
932 | 932 | ||
933 | spin_lock(&journal->j_list_lock); | 933 | spin_lock(&journal->j_list_lock); |
934 | cp_transaction = jh->b_cp_transaction; | 934 | cp_transaction = jh->b_cp_transaction; |
935 | if (cp_transaction) { | 935 | if (cp_transaction) { |
936 | JBUFFER_TRACE(jh, "remove from old cp transaction"); | 936 | JBUFFER_TRACE(jh, "remove from old cp transaction"); |
937 | cp_transaction->t_chp_stats.cs_dropped++; | 937 | cp_transaction->t_chp_stats.cs_dropped++; |
938 | __jbd2_journal_remove_checkpoint(jh); | 938 | __jbd2_journal_remove_checkpoint(jh); |
939 | } | 939 | } |
940 | 940 | ||
941 | /* Only re-checkpoint the buffer_head if it is marked | 941 | /* Only re-checkpoint the buffer_head if it is marked |
942 | * dirty. If the buffer was added to the BJ_Forget list | 942 | * dirty. If the buffer was added to the BJ_Forget list |
943 | * by jbd2_journal_forget, it may no longer be dirty and | 943 | * by jbd2_journal_forget, it may no longer be dirty and |
944 | * there's no point in keeping a checkpoint record for | 944 | * there's no point in keeping a checkpoint record for |
945 | * it. */ | 945 | * it. */ |
946 | 946 | ||
947 | /* A buffer which has been freed while still being | 947 | /* A buffer which has been freed while still being |
948 | * journaled by a previous transaction may end up still | 948 | * journaled by a previous transaction may end up still |
949 | * being dirty here, but we want to avoid writing back | 949 | * being dirty here, but we want to avoid writing back |
950 | * that buffer in the future now that the last use has | 950 | * that buffer in the future now that the last use has |
951 | * been committed. That's not only a performance gain, | 951 | * been committed. That's not only a performance gain, |
952 | * it also stops aliasing problems if the buffer is left | 952 | * it also stops aliasing problems if the buffer is left |
953 | * behind for writeback and gets reallocated for another | 953 | * behind for writeback and gets reallocated for another |
954 | * use in a different page. */ | 954 | * use in a different page. */ |
955 | if (buffer_freed(bh)) { | 955 | if (buffer_freed(bh)) { |
956 | clear_buffer_freed(bh); | 956 | clear_buffer_freed(bh); |
957 | clear_buffer_jbddirty(bh); | 957 | clear_buffer_jbddirty(bh); |
958 | } | 958 | } |
959 | 959 | ||
960 | if (buffer_jbddirty(bh)) { | 960 | if (buffer_jbddirty(bh)) { |
961 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); | 961 | JBUFFER_TRACE(jh, "add to new checkpointing trans"); |
962 | __jbd2_journal_insert_checkpoint(jh, commit_transaction); | 962 | __jbd2_journal_insert_checkpoint(jh, commit_transaction); |
963 | JBUFFER_TRACE(jh, "refile for checkpoint writeback"); | 963 | JBUFFER_TRACE(jh, "refile for checkpoint writeback"); |
964 | __jbd2_journal_refile_buffer(jh); | 964 | __jbd2_journal_refile_buffer(jh); |
965 | jbd_unlock_bh_state(bh); | 965 | jbd_unlock_bh_state(bh); |
966 | } else { | 966 | } else { |
967 | J_ASSERT_BH(bh, !buffer_dirty(bh)); | 967 | J_ASSERT_BH(bh, !buffer_dirty(bh)); |
968 | /* The buffer on BJ_Forget list and not jbddirty means | 968 | /* The buffer on BJ_Forget list and not jbddirty means |
969 | * it has been freed by this transaction and hence it | 969 | * it has been freed by this transaction and hence it |
970 | * could not have been reallocated until this | 970 | * could not have been reallocated until this |
971 | * transaction has committed. *BUT* it could be | 971 | * transaction has committed. *BUT* it could be |
972 | * reallocated once we have written all the data to | 972 | * reallocated once we have written all the data to |
973 | * disk and before we process the buffer on BJ_Forget | 973 | * disk and before we process the buffer on BJ_Forget |
974 | * list. */ | 974 | * list. */ |
975 | JBUFFER_TRACE(jh, "refile or unfile freed buffer"); | 975 | JBUFFER_TRACE(jh, "refile or unfile freed buffer"); |
976 | __jbd2_journal_refile_buffer(jh); | 976 | __jbd2_journal_refile_buffer(jh); |
977 | if (!jh->b_transaction) { | 977 | if (!jh->b_transaction) { |
978 | jbd_unlock_bh_state(bh); | 978 | jbd_unlock_bh_state(bh); |
979 | /* needs a brelse */ | 979 | /* needs a brelse */ |
980 | jbd2_journal_remove_journal_head(bh); | 980 | jbd2_journal_remove_journal_head(bh); |
981 | release_buffer_page(bh); | 981 | release_buffer_page(bh); |
982 | } else | 982 | } else |
983 | jbd_unlock_bh_state(bh); | 983 | jbd_unlock_bh_state(bh); |
984 | } | 984 | } |
985 | cond_resched_lock(&journal->j_list_lock); | 985 | cond_resched_lock(&journal->j_list_lock); |
986 | } | 986 | } |
987 | spin_unlock(&journal->j_list_lock); | 987 | spin_unlock(&journal->j_list_lock); |
988 | /* | 988 | /* |
989 | * This is a bit sleazy. We use j_list_lock to protect transition | 989 | * This is a bit sleazy. We use j_list_lock to protect transition |
990 | * of a transaction into T_FINISHED state and calling | 990 | * of a transaction into T_FINISHED state and calling |
991 | * __jbd2_journal_drop_transaction(). Otherwise we could race with | 991 | * __jbd2_journal_drop_transaction(). Otherwise we could race with |
992 | * other checkpointing code processing the transaction... | 992 | * other checkpointing code processing the transaction... |
993 | */ | 993 | */ |
994 | spin_lock(&journal->j_state_lock); | 994 | spin_lock(&journal->j_state_lock); |
995 | spin_lock(&journal->j_list_lock); | 995 | spin_lock(&journal->j_list_lock); |
996 | /* | 996 | /* |
997 | * Now recheck if some buffers did not get attached to the transaction | 997 | * Now recheck if some buffers did not get attached to the transaction |
998 | * while the lock was dropped... | 998 | * while the lock was dropped... |
999 | */ | 999 | */ |
1000 | if (commit_transaction->t_forget) { | 1000 | if (commit_transaction->t_forget) { |
1001 | spin_unlock(&journal->j_list_lock); | 1001 | spin_unlock(&journal->j_list_lock); |
1002 | spin_unlock(&journal->j_state_lock); | 1002 | spin_unlock(&journal->j_state_lock); |
1003 | goto restart_loop; | 1003 | goto restart_loop; |
1004 | } | 1004 | } |
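The forget-list processing above drops j_list_lock around the per-buffer work and, once the loop drains, re-takes the locks and re-checks t_forget, jumping back to restart_loop if journal_unmap_buffer() queued more buffers in the meantime. A generic user-space sketch of this drop-the-lock-and-recheck pattern is below; the pthread mutex is only an illustration, not the kernel's spinlock API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

struct node { struct node *next; };
static struct node *forget_list;	/* producers may add nodes concurrently */

static void process(struct node *n) { (void)n; /* slow work, done unlocked */ }

static void drain_forget_list(void)
{
restart:
	pthread_mutex_lock(&list_lock);
	while (forget_list) {
		struct node *n = forget_list;
		forget_list = n->next;

		/* Drop the lock for the slow per-node work, as the commit
		 * code drops j_list_lock around the per-buffer processing. */
		pthread_mutex_unlock(&list_lock);
		process(n);
		pthread_mutex_lock(&list_lock);
	}
	pthread_mutex_unlock(&list_lock);

	/* Re-check under the lock: something may have been queued while we
	 * worked unlocked; if so, start over (the restart_loop goto). */
	pthread_mutex_lock(&list_lock);
	if (forget_list) {
		pthread_mutex_unlock(&list_lock);
		goto restart;
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	drain_forget_list();
	puts("forget list drained");
	return 0;
}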
1005 | 1005 | ||
1006 | /* Done with this transaction! */ | 1006 | /* Done with this transaction! */ |
1007 | 1007 | ||
1008 | jbd_debug(3, "JBD: commit phase 8\n"); | 1008 | jbd_debug(3, "JBD: commit phase 8\n"); |
1009 | 1009 | ||
1010 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | 1010 | J_ASSERT(commit_transaction->t_state == T_COMMIT); |
1011 | 1011 | ||
1012 | commit_transaction->t_start = jiffies; | 1012 | commit_transaction->t_start = jiffies; |
1013 | stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, | 1013 | stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, |
1014 | commit_transaction->t_start); | 1014 | commit_transaction->t_start); |
1015 | 1015 | ||
1016 | /* | 1016 | /* |
1017 | * File the transaction for history | 1017 | * File the transaction for history |
1018 | */ | 1018 | */ |
1019 | stats.ts_type = JBD2_STATS_RUN; | 1019 | stats.ts_type = JBD2_STATS_RUN; |
1020 | stats.ts_tid = commit_transaction->t_tid; | 1020 | stats.ts_tid = commit_transaction->t_tid; |
1021 | stats.u.run.rs_handle_count = commit_transaction->t_handle_count; | 1021 | stats.u.run.rs_handle_count = commit_transaction->t_handle_count; |
1022 | spin_lock(&journal->j_history_lock); | 1022 | spin_lock(&journal->j_history_lock); |
1023 | memcpy(journal->j_history + journal->j_history_cur, &stats, | 1023 | memcpy(journal->j_history + journal->j_history_cur, &stats, |
1024 | sizeof(stats)); | 1024 | sizeof(stats)); |
1025 | if (++journal->j_history_cur == journal->j_history_max) | 1025 | if (++journal->j_history_cur == journal->j_history_max) |
1026 | journal->j_history_cur = 0; | 1026 | journal->j_history_cur = 0; |
1027 | 1027 | ||
1028 | /* | 1028 | /* |
1029 | * Calculate overall stats | 1029 | * Calculate overall stats |
1030 | */ | 1030 | */ |
1031 | journal->j_stats.ts_tid++; | 1031 | journal->j_stats.ts_tid++; |
1032 | journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; | 1032 | journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; |
1033 | journal->j_stats.u.run.rs_running += stats.u.run.rs_running; | 1033 | journal->j_stats.u.run.rs_running += stats.u.run.rs_running; |
1034 | journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; | 1034 | journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; |
1035 | journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; | 1035 | journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; |
1036 | journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; | 1036 | journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; |
1037 | journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; | 1037 | journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; |
1038 | journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; | 1038 | journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; |
1039 | journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; | 1039 | journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; |
1040 | spin_unlock(&journal->j_history_lock); | 1040 | spin_unlock(&journal->j_history_lock); |
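The statistics block above copies the per-commit stats into journal->j_history, a fixed-size array used as a ring: the cursor advances and wraps to zero when it reaches j_history_max, so the newest records overwrite the oldest. A minimal sketch of the same wrap-around bookkeeping, with a made-up record type rather than the jbd2 structures:

#include <stdio.h>

#define HISTORY_MAX 4

struct run_stats { unsigned int tid, blocks_logged; };

static struct run_stats history[HISTORY_MAX];
static unsigned int history_cur;

/* Store one record, overwriting the oldest once the ring is full. */
static void history_add(struct run_stats s)
{
	history[history_cur] = s;
	if (++history_cur == HISTORY_MAX)	/* wrap, as the commit code does */
		history_cur = 0;
}

int main(void)
{
	unsigned int tid;
	int i;

	for (tid = 1; tid <= 6; tid++)
		history_add((struct run_stats){ .tid = tid, .blocks_logged = tid * 10 });

	/* Slots 0 and 1 now hold tids 5 and 6; slots 2 and 3 still hold 3 and 4. */
	for (i = 0; i < HISTORY_MAX; i++)
		printf("slot %d: tid %u\n", i, history[i].tid);
	return 0;
}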
1041 | 1041 | ||
1042 | commit_transaction->t_state = T_FINISHED; | 1042 | commit_transaction->t_state = T_FINISHED; |
1043 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 1043 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
1044 | journal->j_commit_sequence = commit_transaction->t_tid; | 1044 | journal->j_commit_sequence = commit_transaction->t_tid; |
1045 | journal->j_committing_transaction = NULL; | 1045 | journal->j_committing_transaction = NULL; |
1046 | spin_unlock(&journal->j_state_lock); | 1046 | spin_unlock(&journal->j_state_lock); |
1047 | 1047 | ||
1048 | if (commit_transaction->t_checkpoint_list == NULL && | 1048 | if (commit_transaction->t_checkpoint_list == NULL && |
1049 | commit_transaction->t_checkpoint_io_list == NULL) { | 1049 | commit_transaction->t_checkpoint_io_list == NULL) { |
1050 | __jbd2_journal_drop_transaction(journal, commit_transaction); | 1050 | __jbd2_journal_drop_transaction(journal, commit_transaction); |
1051 | } else { | 1051 | } else { |
1052 | if (journal->j_checkpoint_transactions == NULL) { | 1052 | if (journal->j_checkpoint_transactions == NULL) { |
1053 | journal->j_checkpoint_transactions = commit_transaction; | 1053 | journal->j_checkpoint_transactions = commit_transaction; |
1054 | commit_transaction->t_cpnext = commit_transaction; | 1054 | commit_transaction->t_cpnext = commit_transaction; |
1055 | commit_transaction->t_cpprev = commit_transaction; | 1055 | commit_transaction->t_cpprev = commit_transaction; |
1056 | } else { | 1056 | } else { |
1057 | commit_transaction->t_cpnext = | 1057 | commit_transaction->t_cpnext = |
1058 | journal->j_checkpoint_transactions; | 1058 | journal->j_checkpoint_transactions; |
1059 | commit_transaction->t_cpprev = | 1059 | commit_transaction->t_cpprev = |
1060 | commit_transaction->t_cpnext->t_cpprev; | 1060 | commit_transaction->t_cpnext->t_cpprev; |
1061 | commit_transaction->t_cpnext->t_cpprev = | 1061 | commit_transaction->t_cpnext->t_cpprev = |
1062 | commit_transaction; | 1062 | commit_transaction; |
1063 | commit_transaction->t_cpprev->t_cpnext = | 1063 | commit_transaction->t_cpprev->t_cpnext = |
1064 | commit_transaction; | 1064 | commit_transaction; |
1065 | } | 1065 | } |
1066 | } | 1066 | } |
1067 | spin_unlock(&journal->j_list_lock); | 1067 | spin_unlock(&journal->j_list_lock); |
1068 | 1068 | ||
1069 | jbd_debug(1, "JBD: commit %d complete, head %d\n", | 1069 | jbd_debug(1, "JBD: commit %d complete, head %d\n", |
1070 | journal->j_commit_sequence, journal->j_tail_sequence); | 1070 | journal->j_commit_sequence, journal->j_tail_sequence); |
1071 | 1071 | ||
1072 | wake_up(&journal->j_wait_done_commit); | 1072 | wake_up(&journal->j_wait_done_commit); |
1073 | } | 1073 | } |
1074 | 1074 |
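The tail of jbd2_journal_commit_transaction() above files the finished transaction on journal->j_checkpoint_transactions, a circular doubly-linked list: an empty list becomes a single self-linked node, otherwise the transaction is spliced in just before the current head (i.e. at the tail). A compact sketch of that t_cpnext/t_cpprev splicing with a stand-in node type (hypothetical, not the kernel's transaction_t):

#include <stdio.h>

struct cp_node {
	int tid;
	struct cp_node *cpnext, *cpprev;
};

/* Insert @t at the tail of the circular list whose head is *@headp,
 * mirroring the checkpoint-list linking in the commit code above. */
static void checkpoint_list_add(struct cp_node **headp, struct cp_node *t)
{
	if (*headp == NULL) {
		*headp = t;
		t->cpnext = t;
		t->cpprev = t;
	} else {
		t->cpnext = *headp;
		t->cpprev = (*headp)->cpprev;
		t->cpnext->cpprev = t;
		t->cpprev->cpnext = t;
	}
}

int main(void)
{
	struct cp_node a = { .tid = 1 }, b = { .tid = 2 }, c = { .tid = 3 };
	struct cp_node *head = NULL, *p;

	checkpoint_list_add(&head, &a);
	checkpoint_list_add(&head, &b);
	checkpoint_list_add(&head, &c);

	p = head;
	do {				/* walk the ring once: prints 1, 2, 3 */
		printf("tid %d\n", p->tid);
		p = p->cpnext;
	} while (p != head);
	return 0;
}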
include/linux/sched.h
1 | #ifndef _LINUX_SCHED_H | 1 | #ifndef _LINUX_SCHED_H |
2 | #define _LINUX_SCHED_H | 2 | #define _LINUX_SCHED_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * cloning flags: | 5 | * cloning flags: |
6 | */ | 6 | */ |
7 | #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ | 7 | #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ |
8 | #define CLONE_VM 0x00000100 /* set if VM shared between processes */ | 8 | #define CLONE_VM 0x00000100 /* set if VM shared between processes */ |
9 | #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ | 9 | #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ |
10 | #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ | 10 | #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ |
11 | #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ | 11 | #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ |
12 | #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ | 12 | #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ |
13 | #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ | 13 | #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ |
14 | #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ | 14 | #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ |
15 | #define CLONE_THREAD 0x00010000 /* Same thread group? */ | 15 | #define CLONE_THREAD 0x00010000 /* Same thread group? */ |
16 | #define CLONE_NEWNS 0x00020000 /* New namespace group? */ | 16 | #define CLONE_NEWNS 0x00020000 /* New namespace group? */ |
17 | #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ | 17 | #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ |
18 | #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ | 18 | #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ |
19 | #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ | 19 | #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ |
20 | #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ | 20 | #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ |
21 | #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ | 21 | #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ |
22 | #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ | 22 | #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ |
23 | #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ | 23 | #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ |
24 | #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ | 24 | #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ |
25 | #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ | 25 | #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ |
26 | #define CLONE_NEWIPC 0x08000000 /* New ipcs */ | 26 | #define CLONE_NEWIPC 0x08000000 /* New ipcs */ |
27 | #define CLONE_NEWUSER 0x10000000 /* New user namespace */ | 27 | #define CLONE_NEWUSER 0x10000000 /* New user namespace */ |
28 | #define CLONE_NEWPID 0x20000000 /* New pid namespace */ | 28 | #define CLONE_NEWPID 0x20000000 /* New pid namespace */ |
29 | #define CLONE_NEWNET 0x40000000 /* New network namespace */ | 29 | #define CLONE_NEWNET 0x40000000 /* New network namespace */ |
30 | #define CLONE_IO 0x80000000 /* Clone io context */ | 30 | #define CLONE_IO 0x80000000 /* Clone io context */ |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * Scheduling policies | 33 | * Scheduling policies |
34 | */ | 34 | */ |
35 | #define SCHED_NORMAL 0 | 35 | #define SCHED_NORMAL 0 |
36 | #define SCHED_FIFO 1 | 36 | #define SCHED_FIFO 1 |
37 | #define SCHED_RR 2 | 37 | #define SCHED_RR 2 |
38 | #define SCHED_BATCH 3 | 38 | #define SCHED_BATCH 3 |
39 | /* SCHED_ISO: reserved but not implemented yet */ | 39 | /* SCHED_ISO: reserved but not implemented yet */ |
40 | #define SCHED_IDLE 5 | 40 | #define SCHED_IDLE 5 |
41 | 41 | ||
42 | #ifdef __KERNEL__ | 42 | #ifdef __KERNEL__ |
43 | 43 | ||
44 | struct sched_param { | 44 | struct sched_param { |
45 | int sched_priority; | 45 | int sched_priority; |
46 | }; | 46 | }; |
47 | 47 | ||
48 | #include <asm/param.h> /* for HZ */ | 48 | #include <asm/param.h> /* for HZ */ |
49 | 49 | ||
50 | #include <linux/capability.h> | 50 | #include <linux/capability.h> |
51 | #include <linux/threads.h> | 51 | #include <linux/threads.h> |
52 | #include <linux/kernel.h> | 52 | #include <linux/kernel.h> |
53 | #include <linux/types.h> | 53 | #include <linux/types.h> |
54 | #include <linux/timex.h> | 54 | #include <linux/timex.h> |
55 | #include <linux/jiffies.h> | 55 | #include <linux/jiffies.h> |
56 | #include <linux/rbtree.h> | 56 | #include <linux/rbtree.h> |
57 | #include <linux/thread_info.h> | 57 | #include <linux/thread_info.h> |
58 | #include <linux/cpumask.h> | 58 | #include <linux/cpumask.h> |
59 | #include <linux/errno.h> | 59 | #include <linux/errno.h> |
60 | #include <linux/nodemask.h> | 60 | #include <linux/nodemask.h> |
61 | #include <linux/mm_types.h> | 61 | #include <linux/mm_types.h> |
62 | 62 | ||
63 | #include <asm/system.h> | 63 | #include <asm/system.h> |
64 | #include <asm/semaphore.h> | 64 | #include <asm/semaphore.h> |
65 | #include <asm/page.h> | 65 | #include <asm/page.h> |
66 | #include <asm/ptrace.h> | 66 | #include <asm/ptrace.h> |
67 | #include <asm/cputime.h> | 67 | #include <asm/cputime.h> |
68 | 68 | ||
69 | #include <linux/smp.h> | 69 | #include <linux/smp.h> |
70 | #include <linux/sem.h> | 70 | #include <linux/sem.h> |
71 | #include <linux/signal.h> | 71 | #include <linux/signal.h> |
72 | #include <linux/securebits.h> | 72 | #include <linux/securebits.h> |
73 | #include <linux/fs_struct.h> | 73 | #include <linux/fs_struct.h> |
74 | #include <linux/compiler.h> | 74 | #include <linux/compiler.h> |
75 | #include <linux/completion.h> | 75 | #include <linux/completion.h> |
76 | #include <linux/pid.h> | 76 | #include <linux/pid.h> |
77 | #include <linux/percpu.h> | 77 | #include <linux/percpu.h> |
78 | #include <linux/topology.h> | 78 | #include <linux/topology.h> |
79 | #include <linux/proportions.h> | 79 | #include <linux/proportions.h> |
80 | #include <linux/seccomp.h> | 80 | #include <linux/seccomp.h> |
81 | #include <linux/rcupdate.h> | 81 | #include <linux/rcupdate.h> |
82 | #include <linux/rtmutex.h> | 82 | #include <linux/rtmutex.h> |
83 | 83 | ||
84 | #include <linux/time.h> | 84 | #include <linux/time.h> |
85 | #include <linux/param.h> | 85 | #include <linux/param.h> |
86 | #include <linux/resource.h> | 86 | #include <linux/resource.h> |
87 | #include <linux/timer.h> | 87 | #include <linux/timer.h> |
88 | #include <linux/hrtimer.h> | 88 | #include <linux/hrtimer.h> |
89 | #include <linux/task_io_accounting.h> | 89 | #include <linux/task_io_accounting.h> |
90 | #include <linux/kobject.h> | 90 | #include <linux/kobject.h> |
91 | #include <linux/latencytop.h> | 91 | #include <linux/latencytop.h> |
92 | 92 | ||
93 | #include <asm/processor.h> | 93 | #include <asm/processor.h> |
94 | 94 | ||
95 | struct exec_domain; | 95 | struct exec_domain; |
96 | struct futex_pi_state; | 96 | struct futex_pi_state; |
97 | struct robust_list_head; | 97 | struct robust_list_head; |
98 | struct bio; | 98 | struct bio; |
99 | 99 | ||
100 | /* | 100 | /* |
101 | * List of flags we want to share for kernel threads, | 101 | * List of flags we want to share for kernel threads, |
102 | * if only because they are not used by them anyway. | 102 | * if only because they are not used by them anyway. |
103 | */ | 103 | */ |
104 | #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) | 104 | #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) |
105 | 105 | ||
106 | /* | 106 | /* |
107 | * These are the constants used to fake the fixed-point load-average | 107 | * These are the constants used to fake the fixed-point load-average |
108 | * counting. Some notes: | 108 | * counting. Some notes: |
109 | * - 11 bit fractions expand to 22 bits by the multiplies: this gives | 109 | * - 11 bit fractions expand to 22 bits by the multiplies: this gives |
110 | * a load-average precision of 10 bits integer + 11 bits fractional | 110 | * a load-average precision of 10 bits integer + 11 bits fractional |
111 | * - if you want to count load-averages more often, you need more | 111 | * - if you want to count load-averages more often, you need more |
112 | * precision, or rounding will get you. With 2-second counting freq, | 112 | * precision, or rounding will get you. With 2-second counting freq, |
113 | * the EXP_n values would be 1981, 2034 and 2043 if still using only | 113 | * the EXP_n values would be 1981, 2034 and 2043 if still using only |
114 | * 11 bit fractions. | 114 | * 11 bit fractions. |
115 | */ | 115 | */ |
116 | extern unsigned long avenrun[]; /* Load averages */ | 116 | extern unsigned long avenrun[]; /* Load averages */ |
117 | 117 | ||
118 | #define FSHIFT 11 /* nr of bits of precision */ | 118 | #define FSHIFT 11 /* nr of bits of precision */ |
119 | #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ | 119 | #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ |
120 | #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ | 120 | #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ |
121 | #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ | 121 | #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ |
122 | #define EXP_5 2014 /* 1/exp(5sec/5min) */ | 122 | #define EXP_5 2014 /* 1/exp(5sec/5min) */ |
123 | #define EXP_15 2037 /* 1/exp(5sec/15min) */ | 123 | #define EXP_15 2037 /* 1/exp(5sec/15min) */ |
124 | 124 | ||
125 | #define CALC_LOAD(load,exp,n) \ | 125 | #define CALC_LOAD(load,exp,n) \ |
126 | load *= exp; \ | 126 | load *= exp; \ |
127 | load += n*(FIXED_1-exp); \ | 127 | load += n*(FIXED_1-exp); \ |
128 | load >>= FSHIFT; | 128 | load >>= FSHIFT; |
129 | 129 | ||
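The comment and macro above implement the exponentially-damped load averages in 11-bit fixed point: FIXED_1 represents 1.0, EXP_1/EXP_5/EXP_15 are 1/exp(5sec/Nmin) scaled by FIXED_1, and CALC_LOAD blends the previous average with the current number of active tasks. A small user-space illustration of the same arithmetic is below; the task count and number of iterations are made up for the demo.

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1 << FSHIFT)		/* 1.0 as fixed-point */
#define EXP_1	1884			/* 1/exp(5sec/1min) * FIXED_1 */

#define CALC_LOAD(load, exp, n) \
	load *= exp; \
	load += n*(FIXED_1 - exp); \
	load >>= FSHIFT;

int main(void)
{
	unsigned long avenrun = 0;		/* 1-minute average, fixed point */
	unsigned long active = 3 * FIXED_1;	/* pretend 3 runnable tasks */
	int tick;

	/* Apply the update repeatedly, as the scheduler does every LOAD_FREQ;
	 * the average climbs asymptotically toward 3.00. */
	for (tick = 0; tick < 12; tick++) {
		CALC_LOAD(avenrun, EXP_1, active);
		printf("tick %2d: load = %lu.%02lu\n", tick,
		       avenrun >> FSHIFT,
		       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}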
130 | extern unsigned long total_forks; | 130 | extern unsigned long total_forks; |
131 | extern int nr_threads; | 131 | extern int nr_threads; |
132 | DECLARE_PER_CPU(unsigned long, process_counts); | 132 | DECLARE_PER_CPU(unsigned long, process_counts); |
133 | extern int nr_processes(void); | 133 | extern int nr_processes(void); |
134 | extern unsigned long nr_running(void); | 134 | extern unsigned long nr_running(void); |
135 | extern unsigned long nr_uninterruptible(void); | 135 | extern unsigned long nr_uninterruptible(void); |
136 | extern unsigned long nr_active(void); | 136 | extern unsigned long nr_active(void); |
137 | extern unsigned long nr_iowait(void); | 137 | extern unsigned long nr_iowait(void); |
138 | extern unsigned long weighted_cpuload(const int cpu); | 138 | extern unsigned long weighted_cpuload(const int cpu); |
139 | 139 | ||
140 | struct seq_file; | 140 | struct seq_file; |
141 | struct cfs_rq; | 141 | struct cfs_rq; |
142 | struct task_group; | 142 | struct task_group; |
143 | #ifdef CONFIG_SCHED_DEBUG | 143 | #ifdef CONFIG_SCHED_DEBUG |
144 | extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); | 144 | extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); |
145 | extern void proc_sched_set_task(struct task_struct *p); | 145 | extern void proc_sched_set_task(struct task_struct *p); |
146 | extern void | 146 | extern void |
147 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); | 147 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); |
148 | #else | 148 | #else |
149 | static inline void | 149 | static inline void |
150 | proc_sched_show_task(struct task_struct *p, struct seq_file *m) | 150 | proc_sched_show_task(struct task_struct *p, struct seq_file *m) |
151 | { | 151 | { |
152 | } | 152 | } |
153 | static inline void proc_sched_set_task(struct task_struct *p) | 153 | static inline void proc_sched_set_task(struct task_struct *p) |
154 | { | 154 | { |
155 | } | 155 | } |
156 | static inline void | 156 | static inline void |
157 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 157 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
158 | { | 158 | { |
159 | } | 159 | } |
160 | #endif | 160 | #endif |
161 | 161 | ||
162 | /* | 162 | /* |
163 | * Task state bitmask. NOTE! These bits are also | 163 | * Task state bitmask. NOTE! These bits are also |
164 | * encoded in fs/proc/array.c: get_task_state(). | 164 | * encoded in fs/proc/array.c: get_task_state(). |
165 | * | 165 | * |
166 | * We have two separate sets of flags: task->state | 166 | * We have two separate sets of flags: task->state |
167 | * is about runnability, while task->exit_state are | 167 | * is about runnability, while task->exit_state are |
168 | * about the task exiting. Confusing, but this way | 168 | * about the task exiting. Confusing, but this way |
169 | * modifying one set can't modify the other one by | 169 | * modifying one set can't modify the other one by |
170 | * mistake. | 170 | * mistake. |
171 | */ | 171 | */ |
172 | #define TASK_RUNNING 0 | 172 | #define TASK_RUNNING 0 |
173 | #define TASK_INTERRUPTIBLE 1 | 173 | #define TASK_INTERRUPTIBLE 1 |
174 | #define TASK_UNINTERRUPTIBLE 2 | 174 | #define TASK_UNINTERRUPTIBLE 2 |
175 | #define TASK_STOPPED 4 | 175 | #define TASK_STOPPED 4 |
176 | #define TASK_TRACED 8 | 176 | #define TASK_TRACED 8 |
177 | /* in tsk->exit_state */ | 177 | /* in tsk->exit_state */ |
178 | #define EXIT_ZOMBIE 16 | 178 | #define EXIT_ZOMBIE 16 |
179 | #define EXIT_DEAD 32 | 179 | #define EXIT_DEAD 32 |
180 | /* in tsk->state again */ | 180 | /* in tsk->state again */ |
181 | #define TASK_DEAD 64 | 181 | #define TASK_DEAD 64 |
182 | 182 | ||
183 | #define __set_task_state(tsk, state_value) \ | 183 | #define __set_task_state(tsk, state_value) \ |
184 | do { (tsk)->state = (state_value); } while (0) | 184 | do { (tsk)->state = (state_value); } while (0) |
185 | #define set_task_state(tsk, state_value) \ | 185 | #define set_task_state(tsk, state_value) \ |
186 | set_mb((tsk)->state, (state_value)) | 186 | set_mb((tsk)->state, (state_value)) |
187 | 187 | ||
188 | /* | 188 | /* |
189 | * set_current_state() includes a barrier so that the write of current->state | 189 | * set_current_state() includes a barrier so that the write of current->state |
190 | * is correctly serialised wrt the caller's subsequent test of whether to | 190 | * is correctly serialised wrt the caller's subsequent test of whether to |
191 | * actually sleep: | 191 | * actually sleep: |
192 | * | 192 | * |
193 | * set_current_state(TASK_UNINTERRUPTIBLE); | 193 | * set_current_state(TASK_UNINTERRUPTIBLE); |
194 | * if (do_i_need_to_sleep()) | 194 | * if (do_i_need_to_sleep()) |
195 | * schedule(); | 195 | * schedule(); |
196 | * | 196 | * |
197 | * If the caller does not need such serialisation then use __set_current_state() | 197 | * If the caller does not need such serialisation then use __set_current_state() |
198 | */ | 198 | */ |
199 | #define __set_current_state(state_value) \ | 199 | #define __set_current_state(state_value) \ |
200 | do { current->state = (state_value); } while (0) | 200 | do { current->state = (state_value); } while (0) |
201 | #define set_current_state(state_value) \ | 201 | #define set_current_state(state_value) \ |
202 | set_mb(current->state, (state_value)) | 202 | set_mb(current->state, (state_value)) |
203 | 203 | ||
204 | /* Task command name length */ | 204 | /* Task command name length */ |
205 | #define TASK_COMM_LEN 16 | 205 | #define TASK_COMM_LEN 16 |
206 | 206 | ||
207 | #include <linux/spinlock.h> | 207 | #include <linux/spinlock.h> |
208 | 208 | ||
209 | /* | 209 | /* |
210 | * This serializes "schedule()" and also protects | 210 | * This serializes "schedule()" and also protects |
211 | * the run-queue from deletions/modifications (but | 211 | * the run-queue from deletions/modifications (but |
212 | * _adding_ to the beginning of the run-queue has | 212 | * _adding_ to the beginning of the run-queue has |
213 | * a separate lock). | 213 | * a separate lock). |
214 | */ | 214 | */ |
215 | extern rwlock_t tasklist_lock; | 215 | extern rwlock_t tasklist_lock; |
216 | extern spinlock_t mmlist_lock; | 216 | extern spinlock_t mmlist_lock; |
217 | 217 | ||
218 | struct task_struct; | 218 | struct task_struct; |
219 | 219 | ||
220 | extern void sched_init(void); | 220 | extern void sched_init(void); |
221 | extern void sched_init_smp(void); | 221 | extern void sched_init_smp(void); |
222 | extern void init_idle(struct task_struct *idle, int cpu); | 222 | extern void init_idle(struct task_struct *idle, int cpu); |
223 | extern void init_idle_bootup_task(struct task_struct *idle); | 223 | extern void init_idle_bootup_task(struct task_struct *idle); |
224 | 224 | ||
225 | extern cpumask_t nohz_cpu_mask; | 225 | extern cpumask_t nohz_cpu_mask; |
226 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) | 226 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) |
227 | extern int select_nohz_load_balancer(int cpu); | 227 | extern int select_nohz_load_balancer(int cpu); |
228 | #else | 228 | #else |
229 | static inline int select_nohz_load_balancer(int cpu) | 229 | static inline int select_nohz_load_balancer(int cpu) |
230 | { | 230 | { |
231 | return 0; | 231 | return 0; |
232 | } | 232 | } |
233 | #endif | 233 | #endif |
234 | 234 | ||
235 | extern unsigned long rt_needs_cpu(int cpu); | 235 | extern unsigned long rt_needs_cpu(int cpu); |
236 | 236 | ||
237 | /* | 237 | /* |
238 | * Only dump TASK_* tasks. (0 for all tasks) | 238 | * Only dump TASK_* tasks. (0 for all tasks) |
239 | */ | 239 | */ |
240 | extern void show_state_filter(unsigned long state_filter); | 240 | extern void show_state_filter(unsigned long state_filter); |
241 | 241 | ||
242 | static inline void show_state(void) | 242 | static inline void show_state(void) |
243 | { | 243 | { |
244 | show_state_filter(0); | 244 | show_state_filter(0); |
245 | } | 245 | } |
246 | 246 | ||
247 | extern void show_regs(struct pt_regs *); | 247 | extern void show_regs(struct pt_regs *); |
248 | 248 | ||
249 | /* | 249 | /* |
250 | * TASK is a pointer to the task whose backtrace we want to see (or NULL for current | 250 | * TASK is a pointer to the task whose backtrace we want to see (or NULL for current |
251 | * task), SP is the stack pointer of the first frame that should be shown in the back | 251 | * task), SP is the stack pointer of the first frame that should be shown in the back |
252 | * trace (or NULL if the entire call-chain of the task should be shown). | 252 | * trace (or NULL if the entire call-chain of the task should be shown). |
253 | */ | 253 | */ |
254 | extern void show_stack(struct task_struct *task, unsigned long *sp); | 254 | extern void show_stack(struct task_struct *task, unsigned long *sp); |
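Aside: per the comment above, passing NULL for both arguments dumps the current task's entire call chain.

	show_stack(NULL, NULL);   /* backtrace of the current task, full call chain */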
255 | 255 | ||
256 | void io_schedule(void); | 256 | void io_schedule(void); |
257 | long io_schedule_timeout(long timeout); | 257 | long io_schedule_timeout(long timeout); |
258 | 258 | ||
259 | extern void cpu_init (void); | 259 | extern void cpu_init (void); |
260 | extern void trap_init(void); | 260 | extern void trap_init(void); |
261 | extern void account_process_tick(struct task_struct *task, int user); | 261 | extern void account_process_tick(struct task_struct *task, int user); |
262 | extern void update_process_times(int user); | 262 | extern void update_process_times(int user); |
263 | extern void scheduler_tick(void); | 263 | extern void scheduler_tick(void); |
264 | extern void hrtick_resched(void); | 264 | extern void hrtick_resched(void); |
265 | 265 | ||
266 | extern void sched_show_task(struct task_struct *p); | 266 | extern void sched_show_task(struct task_struct *p); |
267 | 267 | ||
268 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 268 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
269 | extern void softlockup_tick(void); | 269 | extern void softlockup_tick(void); |
270 | extern void spawn_softlockup_task(void); | 270 | extern void spawn_softlockup_task(void); |
271 | extern void touch_softlockup_watchdog(void); | 271 | extern void touch_softlockup_watchdog(void); |
272 | extern void touch_all_softlockup_watchdogs(void); | 272 | extern void touch_all_softlockup_watchdogs(void); |
273 | extern unsigned long softlockup_thresh; | 273 | extern unsigned long softlockup_thresh; |
274 | extern unsigned long sysctl_hung_task_check_count; | 274 | extern unsigned long sysctl_hung_task_check_count; |
275 | extern unsigned long sysctl_hung_task_timeout_secs; | 275 | extern unsigned long sysctl_hung_task_timeout_secs; |
276 | extern unsigned long sysctl_hung_task_warnings; | 276 | extern unsigned long sysctl_hung_task_warnings; |
277 | #else | 277 | #else |
278 | static inline void softlockup_tick(void) | 278 | static inline void softlockup_tick(void) |
279 | { | 279 | { |
280 | } | 280 | } |
281 | static inline void spawn_softlockup_task(void) | 281 | static inline void spawn_softlockup_task(void) |
282 | { | 282 | { |
283 | } | 283 | } |
284 | static inline void touch_softlockup_watchdog(void) | 284 | static inline void touch_softlockup_watchdog(void) |
285 | { | 285 | { |
286 | } | 286 | } |
287 | static inline void touch_all_softlockup_watchdogs(void) | 287 | static inline void touch_all_softlockup_watchdogs(void) |
288 | { | 288 | { |
289 | } | 289 | } |
290 | #endif | 290 | #endif |
291 | 291 | ||
292 | 292 | ||
293 | /* Attach to any functions which should be ignored in wchan output. */ | 293 | /* Attach to any functions which should be ignored in wchan output. */ |
294 | #define __sched __attribute__((__section__(".sched.text"))) | 294 | #define __sched __attribute__((__section__(".sched.text"))) |
295 | 295 | ||
296 | /* Linker adds these: start and end of __sched functions */ | 296 | /* Linker adds these: start and end of __sched functions */ |
297 | extern char __sched_text_start[], __sched_text_end[]; | 297 | extern char __sched_text_start[], __sched_text_end[]; |
298 | 298 | ||
299 | /* Is this address in the __sched functions? */ | 299 | /* Is this address in the __sched functions? */ |
300 | extern int in_sched_functions(unsigned long addr); | 300 | extern int in_sched_functions(unsigned long addr); |
301 | 301 | ||
302 | #define MAX_SCHEDULE_TIMEOUT LONG_MAX | 302 | #define MAX_SCHEDULE_TIMEOUT LONG_MAX |
303 | extern signed long FASTCALL(schedule_timeout(signed long timeout)); | 303 | extern signed long FASTCALL(schedule_timeout(signed long timeout)); |
304 | extern signed long schedule_timeout_interruptible(signed long timeout); | 304 | extern signed long schedule_timeout_interruptible(signed long timeout); |
305 | extern signed long schedule_timeout_uninterruptible(signed long timeout); | 305 | extern signed long schedule_timeout_uninterruptible(signed long timeout); |
306 | asmlinkage void schedule(void); | 306 | asmlinkage void schedule(void); |
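Aside: schedule_timeout() takes a timeout in jiffies and treats MAX_SCHEDULE_TIMEOUT as "no timeout". A minimal sketch of sleeping for roughly one second, assuming kernel context:

	signed long left;

	set_current_state(TASK_INTERRUPTIBLE);
	left = schedule_timeout(HZ);              /* HZ jiffies ~= one second; returns jiffies left if woken early */

	/* or let the wrapper set the task state for us: */
	schedule_timeout_interruptible(HZ);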
307 | 307 | ||
308 | struct nsproxy; | 308 | struct nsproxy; |
309 | struct user_namespace; | 309 | struct user_namespace; |
310 | 310 | ||
311 | /* Maximum number of active map areas.. This is a random (large) number */ | 311 | /* Maximum number of active map areas.. This is a random (large) number */ |
312 | #define DEFAULT_MAX_MAP_COUNT 65536 | 312 | #define DEFAULT_MAX_MAP_COUNT 65536 |
313 | 313 | ||
314 | extern int sysctl_max_map_count; | 314 | extern int sysctl_max_map_count; |
315 | 315 | ||
316 | #include <linux/aio.h> | 316 | #include <linux/aio.h> |
317 | 317 | ||
318 | extern unsigned long | 318 | extern unsigned long |
319 | arch_get_unmapped_area(struct file *, unsigned long, unsigned long, | 319 | arch_get_unmapped_area(struct file *, unsigned long, unsigned long, |
320 | unsigned long, unsigned long); | 320 | unsigned long, unsigned long); |
321 | extern unsigned long | 321 | extern unsigned long |
322 | arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, | 322 | arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, |
323 | unsigned long len, unsigned long pgoff, | 323 | unsigned long len, unsigned long pgoff, |
324 | unsigned long flags); | 324 | unsigned long flags); |
325 | extern void arch_unmap_area(struct mm_struct *, unsigned long); | 325 | extern void arch_unmap_area(struct mm_struct *, unsigned long); |
326 | extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); | 326 | extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); |
327 | 327 | ||
328 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | 328 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS |
329 | /* | 329 | /* |
330 | * The mm counters are not protected by its page_table_lock, | 330 | * The mm counters are not protected by its page_table_lock, |
331 | * so must be incremented atomically. | 331 | * so must be incremented atomically. |
332 | */ | 332 | */ |
333 | #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) | 333 | #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) |
334 | #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) | 334 | #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) |
335 | #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) | 335 | #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) |
336 | #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) | 336 | #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) |
337 | #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) | 337 | #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) |
338 | 338 | ||
339 | #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ | 339 | #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ |
340 | /* | 340 | /* |
341 | * The mm counters are protected by its page_table_lock, | 341 | * The mm counters are protected by its page_table_lock, |
342 | * so can be incremented directly. | 342 | * so can be incremented directly. |
343 | */ | 343 | */ |
344 | #define set_mm_counter(mm, member, value) (mm)->_##member = (value) | 344 | #define set_mm_counter(mm, member, value) (mm)->_##member = (value) |
345 | #define get_mm_counter(mm, member) ((mm)->_##member) | 345 | #define get_mm_counter(mm, member) ((mm)->_##member) |
346 | #define add_mm_counter(mm, member, value) (mm)->_##member += (value) | 346 | #define add_mm_counter(mm, member, value) (mm)->_##member += (value) |
347 | #define inc_mm_counter(mm, member) (mm)->_##member++ | 347 | #define inc_mm_counter(mm, member) (mm)->_##member++ |
348 | #define dec_mm_counter(mm, member) (mm)->_##member-- | 348 | #define dec_mm_counter(mm, member) (mm)->_##member-- |
349 | 349 | ||
350 | #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ | 350 | #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ |
351 | 351 | ||
352 | #define get_mm_rss(mm) \ | 352 | #define get_mm_rss(mm) \ |
353 | (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) | 353 | (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) |
354 | #define update_hiwater_rss(mm) do { \ | 354 | #define update_hiwater_rss(mm) do { \ |
355 | unsigned long _rss = get_mm_rss(mm); \ | 355 | unsigned long _rss = get_mm_rss(mm); \ |
356 | if ((mm)->hiwater_rss < _rss) \ | 356 | if ((mm)->hiwater_rss < _rss) \ |
357 | (mm)->hiwater_rss = _rss; \ | 357 | (mm)->hiwater_rss = _rss; \ |
358 | } while (0) | 358 | } while (0) |
359 | #define update_hiwater_vm(mm) do { \ | 359 | #define update_hiwater_vm(mm) do { \ |
360 | if ((mm)->hiwater_vm < (mm)->total_vm) \ | 360 | if ((mm)->hiwater_vm < (mm)->total_vm) \ |
361 | (mm)->hiwater_vm = (mm)->total_vm; \ | 361 | (mm)->hiwater_vm = (mm)->total_vm; \ |
362 | } while (0) | 362 | } while (0) |
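Aside: a minimal sketch (hypothetical helper, not from the diff) of how the counter and high-water macros compose; whether the update is atomic or a plain increment is decided entirely by the CONFIG_SPLIT_PTLOCK_CPUS branch above.

	/* Charge one anonymous page to an mm (simplified illustration). */
	static void charge_anon_page(struct mm_struct *mm)
	{
		inc_mm_counter(mm, anon_rss);   /* atomic or plain, per the #if above */
		update_hiwater_rss(mm);         /* remember the new RSS high-water mark */
	}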
363 | 363 | ||
364 | extern void set_dumpable(struct mm_struct *mm, int value); | 364 | extern void set_dumpable(struct mm_struct *mm, int value); |
365 | extern int get_dumpable(struct mm_struct *mm); | 365 | extern int get_dumpable(struct mm_struct *mm); |
366 | 366 | ||
367 | /* mm flags */ | 367 | /* mm flags */ |
368 | /* dumpable bits */ | 368 | /* dumpable bits */ |
369 | #define MMF_DUMPABLE 0 /* core dump is permitted */ | 369 | #define MMF_DUMPABLE 0 /* core dump is permitted */ |
370 | #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ | 370 | #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ |
371 | #define MMF_DUMPABLE_BITS 2 | 371 | #define MMF_DUMPABLE_BITS 2 |
372 | 372 | ||
373 | /* coredump filter bits */ | 373 | /* coredump filter bits */ |
374 | #define MMF_DUMP_ANON_PRIVATE 2 | 374 | #define MMF_DUMP_ANON_PRIVATE 2 |
375 | #define MMF_DUMP_ANON_SHARED 3 | 375 | #define MMF_DUMP_ANON_SHARED 3 |
376 | #define MMF_DUMP_MAPPED_PRIVATE 4 | 376 | #define MMF_DUMP_MAPPED_PRIVATE 4 |
377 | #define MMF_DUMP_MAPPED_SHARED 5 | 377 | #define MMF_DUMP_MAPPED_SHARED 5 |
378 | #define MMF_DUMP_ELF_HEADERS 6 | 378 | #define MMF_DUMP_ELF_HEADERS 6 |
379 | #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS | 379 | #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS |
380 | #define MMF_DUMP_FILTER_BITS 5 | 380 | #define MMF_DUMP_FILTER_BITS 5 |
381 | #define MMF_DUMP_FILTER_MASK \ | 381 | #define MMF_DUMP_FILTER_MASK \ |
382 | (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) | 382 | (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) |
383 | #define MMF_DUMP_FILTER_DEFAULT \ | 383 | #define MMF_DUMP_FILTER_DEFAULT \ |
384 | ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED)) | 384 | ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED)) |
385 | 385 | ||
386 | struct sighand_struct { | 386 | struct sighand_struct { |
387 | atomic_t count; | 387 | atomic_t count; |
388 | struct k_sigaction action[_NSIG]; | 388 | struct k_sigaction action[_NSIG]; |
389 | spinlock_t siglock; | 389 | spinlock_t siglock; |
390 | wait_queue_head_t signalfd_wqh; | 390 | wait_queue_head_t signalfd_wqh; |
391 | }; | 391 | }; |
392 | 392 | ||
393 | struct pacct_struct { | 393 | struct pacct_struct { |
394 | int ac_flag; | 394 | int ac_flag; |
395 | long ac_exitcode; | 395 | long ac_exitcode; |
396 | unsigned long ac_mem; | 396 | unsigned long ac_mem; |
397 | cputime_t ac_utime, ac_stime; | 397 | cputime_t ac_utime, ac_stime; |
398 | unsigned long ac_minflt, ac_majflt; | 398 | unsigned long ac_minflt, ac_majflt; |
399 | }; | 399 | }; |
400 | 400 | ||
401 | /* | 401 | /* |
402 | * NOTE! "signal_struct" does not have its own | 402 | * NOTE! "signal_struct" does not have its own |
403 | * locking, because a shared signal_struct always | 403 | * locking, because a shared signal_struct always |
404 | * implies a shared sighand_struct, so locking | 404 | * implies a shared sighand_struct, so locking |
405 | * sighand_struct is always a proper superset of | 405 | * sighand_struct is always a proper superset of |
406 | * the locking of signal_struct. | 406 | * the locking of signal_struct. |
407 | */ | 407 | */ |
408 | struct signal_struct { | 408 | struct signal_struct { |
409 | atomic_t count; | 409 | atomic_t count; |
410 | atomic_t live; | 410 | atomic_t live; |
411 | 411 | ||
412 | wait_queue_head_t wait_chldexit; /* for wait4() */ | 412 | wait_queue_head_t wait_chldexit; /* for wait4() */ |
413 | 413 | ||
414 | /* current thread group signal load-balancing target: */ | 414 | /* current thread group signal load-balancing target: */ |
415 | struct task_struct *curr_target; | 415 | struct task_struct *curr_target; |
416 | 416 | ||
417 | /* shared signal handling: */ | 417 | /* shared signal handling: */ |
418 | struct sigpending shared_pending; | 418 | struct sigpending shared_pending; |
419 | 419 | ||
420 | /* thread group exit support */ | 420 | /* thread group exit support */ |
421 | int group_exit_code; | 421 | int group_exit_code; |
422 | /* overloaded: | 422 | /* overloaded: |
423 | * - notify group_exit_task when ->count is equal to notify_count | 423 | * - notify group_exit_task when ->count is equal to notify_count |
424 | * - everyone except group_exit_task is stopped during signal delivery | 424 | * - everyone except group_exit_task is stopped during signal delivery |
425 | * of fatal signals, group_exit_task processes the signal. | 425 | * of fatal signals, group_exit_task processes the signal. |
426 | */ | 426 | */ |
427 | struct task_struct *group_exit_task; | 427 | struct task_struct *group_exit_task; |
428 | int notify_count; | 428 | int notify_count; |
429 | 429 | ||
430 | /* thread group stop support, overloads group_exit_code too */ | 430 | /* thread group stop support, overloads group_exit_code too */ |
431 | int group_stop_count; | 431 | int group_stop_count; |
432 | unsigned int flags; /* see SIGNAL_* flags below */ | 432 | unsigned int flags; /* see SIGNAL_* flags below */ |
433 | 433 | ||
434 | /* POSIX.1b Interval Timers */ | 434 | /* POSIX.1b Interval Timers */ |
435 | struct list_head posix_timers; | 435 | struct list_head posix_timers; |
436 | 436 | ||
437 | /* ITIMER_REAL timer for the process */ | 437 | /* ITIMER_REAL timer for the process */ |
438 | struct hrtimer real_timer; | 438 | struct hrtimer real_timer; |
439 | struct task_struct *tsk; | 439 | struct task_struct *tsk; |
440 | ktime_t it_real_incr; | 440 | ktime_t it_real_incr; |
441 | 441 | ||
442 | /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ | 442 | /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ |
443 | cputime_t it_prof_expires, it_virt_expires; | 443 | cputime_t it_prof_expires, it_virt_expires; |
444 | cputime_t it_prof_incr, it_virt_incr; | 444 | cputime_t it_prof_incr, it_virt_incr; |
445 | 445 | ||
446 | /* job control IDs */ | 446 | /* job control IDs */ |
447 | 447 | ||
448 | /* | 448 | /* |
449 | * pgrp and session fields are deprecated. | 449 | * pgrp and session fields are deprecated. |
450 | * use the task_session_Xnr and task_pgrp_Xnr routines below | 450 | * use the task_session_Xnr and task_pgrp_Xnr routines below |
451 | */ | 451 | */ |
452 | 452 | ||
453 | union { | 453 | union { |
454 | pid_t pgrp __deprecated; | 454 | pid_t pgrp __deprecated; |
455 | pid_t __pgrp; | 455 | pid_t __pgrp; |
456 | }; | 456 | }; |
457 | 457 | ||
458 | struct pid *tty_old_pgrp; | 458 | struct pid *tty_old_pgrp; |
459 | 459 | ||
460 | union { | 460 | union { |
461 | pid_t session __deprecated; | 461 | pid_t session __deprecated; |
462 | pid_t __session; | 462 | pid_t __session; |
463 | }; | 463 | }; |
464 | 464 | ||
465 | /* boolean value for session group leader */ | 465 | /* boolean value for session group leader */ |
466 | int leader; | 466 | int leader; |
467 | 467 | ||
468 | struct tty_struct *tty; /* NULL if no tty */ | 468 | struct tty_struct *tty; /* NULL if no tty */ |
469 | 469 | ||
470 | /* | 470 | /* |
471 | * Cumulative resource counters for dead threads in the group, | 471 | * Cumulative resource counters for dead threads in the group, |
472 | * and for reaped dead child processes forked by this group. | 472 | * and for reaped dead child processes forked by this group. |
473 | * Live threads maintain their own counters and add to these | 473 | * Live threads maintain their own counters and add to these |
474 | * in __exit_signal, except for the group leader. | 474 | * in __exit_signal, except for the group leader. |
475 | */ | 475 | */ |
476 | cputime_t utime, stime, cutime, cstime; | 476 | cputime_t utime, stime, cutime, cstime; |
477 | cputime_t gtime; | 477 | cputime_t gtime; |
478 | cputime_t cgtime; | 478 | cputime_t cgtime; |
479 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; | 479 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; |
480 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; | 480 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; |
481 | unsigned long inblock, oublock, cinblock, coublock; | 481 | unsigned long inblock, oublock, cinblock, coublock; |
482 | 482 | ||
483 | /* | 483 | /* |
484 | * Cumulative ns of scheduled CPU time for dead threads in the | 484 | * Cumulative ns of scheduled CPU time for dead threads in the |
485 | * group, not including a zombie group leader. (This only differs | 485 | * group, not including a zombie group leader. (This only differs |
486 | * from jiffies_to_ns(utime + stime) if sched_clock uses something | 486 | * from jiffies_to_ns(utime + stime) if sched_clock uses something |
487 | * other than jiffies.) | 487 | * other than jiffies.) |
488 | */ | 488 | */ |
489 | unsigned long long sum_sched_runtime; | 489 | unsigned long long sum_sched_runtime; |
490 | 490 | ||
491 | /* | 491 | /* |
492 | * We don't bother to synchronize most readers of this at all, | 492 | * We don't bother to synchronize most readers of this at all, |
493 | * because there is no reader checking a limit that actually needs | 493 | * because there is no reader checking a limit that actually needs |
494 | * to get both rlim_cur and rlim_max atomically, and either one | 494 | * to get both rlim_cur and rlim_max atomically, and either one |
495 | * alone is a single word that can safely be read normally. | 495 | * alone is a single word that can safely be read normally. |
496 | * getrlimit/setrlimit use task_lock(current->group_leader) to | 496 | * getrlimit/setrlimit use task_lock(current->group_leader) to |
497 | * protect this instead of the siglock, because they really | 497 | * protect this instead of the siglock, because they really |
498 | * have no need to disable irqs. | 498 | * have no need to disable irqs. |
499 | */ | 499 | */ |
500 | struct rlimit rlim[RLIM_NLIMITS]; | 500 | struct rlimit rlim[RLIM_NLIMITS]; |
501 | 501 | ||
502 | struct list_head cpu_timers[3]; | 502 | struct list_head cpu_timers[3]; |
503 | 503 | ||
504 | /* keep the process-shared keyrings here so that they do the right | 504 | /* keep the process-shared keyrings here so that they do the right |
505 | * thing in threads created with CLONE_THREAD */ | 505 | * thing in threads created with CLONE_THREAD */ |
506 | #ifdef CONFIG_KEYS | 506 | #ifdef CONFIG_KEYS |
507 | struct key *session_keyring; /* keyring inherited over fork */ | 507 | struct key *session_keyring; /* keyring inherited over fork */ |
508 | struct key *process_keyring; /* keyring private to this process */ | 508 | struct key *process_keyring; /* keyring private to this process */ |
509 | #endif | 509 | #endif |
510 | #ifdef CONFIG_BSD_PROCESS_ACCT | 510 | #ifdef CONFIG_BSD_PROCESS_ACCT |
511 | struct pacct_struct pacct; /* per-process accounting information */ | 511 | struct pacct_struct pacct; /* per-process accounting information */ |
512 | #endif | 512 | #endif |
513 | #ifdef CONFIG_TASKSTATS | 513 | #ifdef CONFIG_TASKSTATS |
514 | struct taskstats *stats; | 514 | struct taskstats *stats; |
515 | #endif | 515 | #endif |
516 | #ifdef CONFIG_AUDIT | 516 | #ifdef CONFIG_AUDIT |
517 | unsigned audit_tty; | 517 | unsigned audit_tty; |
518 | struct tty_audit_buf *tty_audit_buf; | 518 | struct tty_audit_buf *tty_audit_buf; |
519 | #endif | 519 | #endif |
520 | }; | 520 | }; |
521 | 521 | ||
522 | /* Context switch must be unlocked if interrupts are to be enabled */ | 522 | /* Context switch must be unlocked if interrupts are to be enabled */ |
523 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 523 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
524 | # define __ARCH_WANT_UNLOCKED_CTXSW | 524 | # define __ARCH_WANT_UNLOCKED_CTXSW |
525 | #endif | 525 | #endif |
526 | 526 | ||
527 | /* | 527 | /* |
528 | * Bits in flags field of signal_struct. | 528 | * Bits in flags field of signal_struct. |
529 | */ | 529 | */ |
530 | #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ | 530 | #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ |
531 | #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ | 531 | #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ |
532 | #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ | 532 | #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ |
533 | #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ | 533 | #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ |
534 | 534 | ||
535 | /* | 535 | /* |
536 | * Some day this will be a full-fledged user tracking system.. | 536 | * Some day this will be a full-fledged user tracking system.. |
537 | */ | 537 | */ |
538 | struct user_struct { | 538 | struct user_struct { |
539 | atomic_t __count; /* reference count */ | 539 | atomic_t __count; /* reference count */ |
540 | atomic_t processes; /* How many processes does this user have? */ | 540 | atomic_t processes; /* How many processes does this user have? */ |
541 | atomic_t files; /* How many open files does this user have? */ | 541 | atomic_t files; /* How many open files does this user have? */ |
542 | atomic_t sigpending; /* How many pending signals does this user have? */ | 542 | atomic_t sigpending; /* How many pending signals does this user have? */ |
543 | #ifdef CONFIG_INOTIFY_USER | 543 | #ifdef CONFIG_INOTIFY_USER |
544 | atomic_t inotify_watches; /* How many inotify watches does this user have? */ | 544 | atomic_t inotify_watches; /* How many inotify watches does this user have? */ |
545 | atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ | 545 | atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ |
546 | #endif | 546 | #endif |
547 | #ifdef CONFIG_POSIX_MQUEUE | 547 | #ifdef CONFIG_POSIX_MQUEUE |
548 | /* protected by mq_lock */ | 548 | /* protected by mq_lock */ |
549 | unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ | 549 | unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ |
550 | #endif | 550 | #endif |
551 | unsigned long locked_shm; /* How many pages of mlocked shm ? */ | 551 | unsigned long locked_shm; /* How many pages of mlocked shm ? */ |
552 | 552 | ||
553 | #ifdef CONFIG_KEYS | 553 | #ifdef CONFIG_KEYS |
554 | struct key *uid_keyring; /* UID specific keyring */ | 554 | struct key *uid_keyring; /* UID specific keyring */ |
555 | struct key *session_keyring; /* UID's default session keyring */ | 555 | struct key *session_keyring; /* UID's default session keyring */ |
556 | #endif | 556 | #endif |
557 | 557 | ||
558 | /* Hash table maintenance information */ | 558 | /* Hash table maintenance information */ |
559 | struct hlist_node uidhash_node; | 559 | struct hlist_node uidhash_node; |
560 | uid_t uid; | 560 | uid_t uid; |
561 | 561 | ||
562 | #ifdef CONFIG_FAIR_USER_SCHED | 562 | #ifdef CONFIG_FAIR_USER_SCHED |
563 | struct task_group *tg; | 563 | struct task_group *tg; |
564 | #ifdef CONFIG_SYSFS | 564 | #ifdef CONFIG_SYSFS |
565 | struct kobject kobj; | 565 | struct kobject kobj; |
566 | struct work_struct work; | 566 | struct work_struct work; |
567 | #endif | 567 | #endif |
568 | #endif | 568 | #endif |
569 | }; | 569 | }; |
570 | 570 | ||
571 | extern int uids_sysfs_init(void); | 571 | extern int uids_sysfs_init(void); |
572 | 572 | ||
573 | extern struct user_struct *find_user(uid_t); | 573 | extern struct user_struct *find_user(uid_t); |
574 | 574 | ||
575 | extern struct user_struct root_user; | 575 | extern struct user_struct root_user; |
576 | #define INIT_USER (&root_user) | 576 | #define INIT_USER (&root_user) |
577 | 577 | ||
578 | struct backing_dev_info; | 578 | struct backing_dev_info; |
579 | struct reclaim_state; | 579 | struct reclaim_state; |
580 | 580 | ||
581 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 581 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
582 | struct sched_info { | 582 | struct sched_info { |
583 | /* cumulative counters */ | 583 | /* cumulative counters */ |
584 | unsigned long pcount; /* # of times run on this cpu */ | 584 | unsigned long pcount; /* # of times run on this cpu */ |
585 | unsigned long long cpu_time, /* time spent on the cpu */ | 585 | unsigned long long cpu_time, /* time spent on the cpu */ |
586 | run_delay; /* time spent waiting on a runqueue */ | 586 | run_delay; /* time spent waiting on a runqueue */ |
587 | 587 | ||
588 | /* timestamps */ | 588 | /* timestamps */ |
589 | unsigned long long last_arrival,/* when we last ran on a cpu */ | 589 | unsigned long long last_arrival,/* when we last ran on a cpu */ |
590 | last_queued; /* when we were last queued to run */ | 590 | last_queued; /* when we were last queued to run */ |
591 | #ifdef CONFIG_SCHEDSTATS | 591 | #ifdef CONFIG_SCHEDSTATS |
592 | /* BKL stats */ | 592 | /* BKL stats */ |
593 | unsigned int bkl_count; | 593 | unsigned int bkl_count; |
594 | #endif | 594 | #endif |
595 | }; | 595 | }; |
596 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ | 596 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ |
597 | 597 | ||
598 | #ifdef CONFIG_SCHEDSTATS | 598 | #ifdef CONFIG_SCHEDSTATS |
599 | extern const struct file_operations proc_schedstat_operations; | 599 | extern const struct file_operations proc_schedstat_operations; |
600 | #endif /* CONFIG_SCHEDSTATS */ | 600 | #endif /* CONFIG_SCHEDSTATS */ |
601 | 601 | ||
602 | #ifdef CONFIG_TASK_DELAY_ACCT | 602 | #ifdef CONFIG_TASK_DELAY_ACCT |
603 | struct task_delay_info { | 603 | struct task_delay_info { |
604 | spinlock_t lock; | 604 | spinlock_t lock; |
605 | unsigned int flags; /* Private per-task flags */ | 605 | unsigned int flags; /* Private per-task flags */ |
606 | 606 | ||
607 | /* For each stat XXX, add following, aligned appropriately | 607 | /* For each stat XXX, add following, aligned appropriately |
608 | * | 608 | * |
609 | * struct timespec XXX_start, XXX_end; | 609 | * struct timespec XXX_start, XXX_end; |
610 | * u64 XXX_delay; | 610 | * u64 XXX_delay; |
611 | * u32 XXX_count; | 611 | * u32 XXX_count; |
612 | * | 612 | * |
613 | * Atomicity of updates to XXX_delay, XXX_count protected by | 613 | * Atomicity of updates to XXX_delay, XXX_count protected by |
614 | * single lock above (split into XXX_lock if contention is an issue). | 614 | * single lock above (split into XXX_lock if contention is an issue). |
615 | */ | 615 | */ |
616 | 616 | ||
617 | /* | 617 | /* |
618 | * XXX_count is incremented on every XXX operation, the delay | 618 | * XXX_count is incremented on every XXX operation, the delay |
619 | * associated with the operation is added to XXX_delay. | 619 | * associated with the operation is added to XXX_delay. |
620 | * XXX_delay contains the accumulated delay time in nanoseconds. | 620 | * XXX_delay contains the accumulated delay time in nanoseconds. |
621 | */ | 621 | */ |
622 | struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ | 622 | struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ |
623 | u64 blkio_delay; /* wait for sync block io completion */ | 623 | u64 blkio_delay; /* wait for sync block io completion */ |
624 | u64 swapin_delay; /* wait for swapin block io completion */ | 624 | u64 swapin_delay; /* wait for swapin block io completion */ |
625 | u32 blkio_count; /* total count of the number of sync block */ | 625 | u32 blkio_count; /* total count of the number of sync block */ |
626 | /* io operations performed */ | 626 | /* io operations performed */ |
627 | u32 swapin_count; /* total count of the number of swapin block */ | 627 | u32 swapin_count; /* total count of the number of swapin block */ |
628 | /* io operations performed */ | 628 | /* io operations performed */ |
629 | }; | 629 | }; |
630 | #endif /* CONFIG_TASK_DELAY_ACCT */ | 630 | #endif /* CONFIG_TASK_DELAY_ACCT */ |
631 | 631 | ||
632 | static inline int sched_info_on(void) | 632 | static inline int sched_info_on(void) |
633 | { | 633 | { |
634 | #ifdef CONFIG_SCHEDSTATS | 634 | #ifdef CONFIG_SCHEDSTATS |
635 | return 1; | 635 | return 1; |
636 | #elif defined(CONFIG_TASK_DELAY_ACCT) | 636 | #elif defined(CONFIG_TASK_DELAY_ACCT) |
637 | extern int delayacct_on; | 637 | extern int delayacct_on; |
638 | return delayacct_on; | 638 | return delayacct_on; |
639 | #else | 639 | #else |
640 | return 0; | 640 | return 0; |
641 | #endif | 641 | #endif |
642 | } | 642 | } |
643 | 643 | ||
644 | enum cpu_idle_type { | 644 | enum cpu_idle_type { |
645 | CPU_IDLE, | 645 | CPU_IDLE, |
646 | CPU_NOT_IDLE, | 646 | CPU_NOT_IDLE, |
647 | CPU_NEWLY_IDLE, | 647 | CPU_NEWLY_IDLE, |
648 | CPU_MAX_IDLE_TYPES | 648 | CPU_MAX_IDLE_TYPES |
649 | }; | 649 | }; |
650 | 650 | ||
651 | /* | 651 | /* |
652 | * sched-domains (multiprocessor balancing) declarations: | 652 | * sched-domains (multiprocessor balancing) declarations: |
653 | */ | 653 | */ |
654 | 654 | ||
655 | /* | 655 | /* |
656 | * Increase resolution of nice-level calculations: | 656 | * Increase resolution of nice-level calculations: |
657 | */ | 657 | */ |
658 | #define SCHED_LOAD_SHIFT 10 | 658 | #define SCHED_LOAD_SHIFT 10 |
659 | #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) | 659 | #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) |
660 | 660 | ||
661 | #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE | 661 | #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE |
662 | 662 | ||
663 | #ifdef CONFIG_SMP | 663 | #ifdef CONFIG_SMP |
664 | #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ | 664 | #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ |
665 | #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ | 665 | #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ |
666 | #define SD_BALANCE_EXEC 4 /* Balance on exec */ | 666 | #define SD_BALANCE_EXEC 4 /* Balance on exec */ |
667 | #define SD_BALANCE_FORK 8 /* Balance on fork, clone */ | 667 | #define SD_BALANCE_FORK 8 /* Balance on fork, clone */ |
668 | #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ | 668 | #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ |
669 | #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ | 669 | #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ |
670 | #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ | 670 | #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ |
671 | #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ | 671 | #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ |
672 | #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ | 672 | #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ |
673 | #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ | 673 | #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ |
674 | #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ | 674 | #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ |
675 | 675 | ||
676 | #define BALANCE_FOR_MC_POWER \ | 676 | #define BALANCE_FOR_MC_POWER \ |
677 | (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) | 677 | (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) |
678 | 678 | ||
679 | #define BALANCE_FOR_PKG_POWER \ | 679 | #define BALANCE_FOR_PKG_POWER \ |
680 | ((sched_mc_power_savings || sched_smt_power_savings) ? \ | 680 | ((sched_mc_power_savings || sched_smt_power_savings) ? \ |
681 | SD_POWERSAVINGS_BALANCE : 0) | 681 | SD_POWERSAVINGS_BALANCE : 0) |
682 | 682 | ||
683 | #define test_sd_parent(sd, flag) ((sd->parent && \ | 683 | #define test_sd_parent(sd, flag) ((sd->parent && \ |
684 | (sd->parent->flags & flag)) ? 1 : 0) | 684 | (sd->parent->flags & flag)) ? 1 : 0) |
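Aside: a hedged sketch of how these bits are consumed. Domain setup code typically ORs a handful of SD_* flags into sd->flags (the exact mix below is illustrative, not taken from this diff), and balancing code can ask whether a parent domain opted into power-savings balancing via test_sd_parent().

	sd->flags = SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_EXEC
		  | SD_WAKE_AFFINE | BALANCE_FOR_PKG_POWER;   /* illustrative combination */

	if (test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) {
		/* a parent domain wants load consolidated for power savings */
	}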
685 | 685 | ||
686 | 686 | ||
687 | struct sched_group { | 687 | struct sched_group { |
688 | struct sched_group *next; /* Must be a circular list */ | 688 | struct sched_group *next; /* Must be a circular list */ |
689 | cpumask_t cpumask; | 689 | cpumask_t cpumask; |
690 | 690 | ||
691 | /* | 691 | /* |
692 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | 692 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a |
693 | * single CPU. This is read only (except for setup, hotplug CPU). | 693 | * single CPU. This is read only (except for setup, hotplug CPU). |
694 | * Note: Never change cpu_power without recomputing its reciprocal | 694 | * Note: Never change cpu_power without recomputing its reciprocal |
695 | */ | 695 | */ |
696 | unsigned int __cpu_power; | 696 | unsigned int __cpu_power; |
697 | /* | 697 | /* |
698 | * reciprocal value of cpu_power to avoid expensive divides | 698 | * reciprocal value of cpu_power to avoid expensive divides |
699 | * (see include/linux/reciprocal_div.h) | 699 | * (see include/linux/reciprocal_div.h) |
700 | */ | 700 | */ |
701 | u32 reciprocal_cpu_power; | 701 | u32 reciprocal_cpu_power; |
702 | }; | 702 | }; |
703 | 703 | ||
704 | struct sched_domain { | 704 | struct sched_domain { |
705 | /* These fields must be setup */ | 705 | /* These fields must be setup */ |
706 | struct sched_domain *parent; /* top domain must be null terminated */ | 706 | struct sched_domain *parent; /* top domain must be null terminated */ |
707 | struct sched_domain *child; /* bottom domain must be null terminated */ | 707 | struct sched_domain *child; /* bottom domain must be null terminated */ |
708 | struct sched_group *groups; /* the balancing groups of the domain */ | 708 | struct sched_group *groups; /* the balancing groups of the domain */ |
709 | cpumask_t span; /* span of all CPUs in this domain */ | 709 | cpumask_t span; /* span of all CPUs in this domain */ |
710 | unsigned long min_interval; /* Minimum balance interval ms */ | 710 | unsigned long min_interval; /* Minimum balance interval ms */ |
711 | unsigned long max_interval; /* Maximum balance interval ms */ | 711 | unsigned long max_interval; /* Maximum balance interval ms */ |
712 | unsigned int busy_factor; /* less balancing by factor if busy */ | 712 | unsigned int busy_factor; /* less balancing by factor if busy */ |
713 | unsigned int imbalance_pct; /* No balance until over watermark */ | 713 | unsigned int imbalance_pct; /* No balance until over watermark */ |
714 | unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ | 714 | unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ |
715 | unsigned int busy_idx; | 715 | unsigned int busy_idx; |
716 | unsigned int idle_idx; | 716 | unsigned int idle_idx; |
717 | unsigned int newidle_idx; | 717 | unsigned int newidle_idx; |
718 | unsigned int wake_idx; | 718 | unsigned int wake_idx; |
719 | unsigned int forkexec_idx; | 719 | unsigned int forkexec_idx; |
720 | int flags; /* See SD_* */ | 720 | int flags; /* See SD_* */ |
721 | 721 | ||
722 | /* Runtime fields. */ | 722 | /* Runtime fields. */ |
723 | unsigned long last_balance; /* init to jiffies. units in jiffies */ | 723 | unsigned long last_balance; /* init to jiffies. units in jiffies */ |
724 | unsigned int balance_interval; /* initialise to 1. units in ms. */ | 724 | unsigned int balance_interval; /* initialise to 1. units in ms. */ |
725 | unsigned int nr_balance_failed; /* initialise to 0 */ | 725 | unsigned int nr_balance_failed; /* initialise to 0 */ |
726 | 726 | ||
727 | #ifdef CONFIG_SCHEDSTATS | 727 | #ifdef CONFIG_SCHEDSTATS |
728 | /* load_balance() stats */ | 728 | /* load_balance() stats */ |
729 | unsigned int lb_count[CPU_MAX_IDLE_TYPES]; | 729 | unsigned int lb_count[CPU_MAX_IDLE_TYPES]; |
730 | unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; | 730 | unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; |
731 | unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; | 731 | unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; |
732 | unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; | 732 | unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; |
733 | unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; | 733 | unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; |
734 | unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; | 734 | unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; |
735 | unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; | 735 | unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; |
736 | unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; | 736 | unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; |
737 | 737 | ||
738 | /* Active load balancing */ | 738 | /* Active load balancing */ |
739 | unsigned int alb_count; | 739 | unsigned int alb_count; |
740 | unsigned int alb_failed; | 740 | unsigned int alb_failed; |
741 | unsigned int alb_pushed; | 741 | unsigned int alb_pushed; |
742 | 742 | ||
743 | /* SD_BALANCE_EXEC stats */ | 743 | /* SD_BALANCE_EXEC stats */ |
744 | unsigned int sbe_count; | 744 | unsigned int sbe_count; |
745 | unsigned int sbe_balanced; | 745 | unsigned int sbe_balanced; |
746 | unsigned int sbe_pushed; | 746 | unsigned int sbe_pushed; |
747 | 747 | ||
748 | /* SD_BALANCE_FORK stats */ | 748 | /* SD_BALANCE_FORK stats */ |
749 | unsigned int sbf_count; | 749 | unsigned int sbf_count; |
750 | unsigned int sbf_balanced; | 750 | unsigned int sbf_balanced; |
751 | unsigned int sbf_pushed; | 751 | unsigned int sbf_pushed; |
752 | 752 | ||
753 | /* try_to_wake_up() stats */ | 753 | /* try_to_wake_up() stats */ |
754 | unsigned int ttwu_wake_remote; | 754 | unsigned int ttwu_wake_remote; |
755 | unsigned int ttwu_move_affine; | 755 | unsigned int ttwu_move_affine; |
756 | unsigned int ttwu_move_balance; | 756 | unsigned int ttwu_move_balance; |
757 | #endif | 757 | #endif |
758 | }; | 758 | }; |
759 | 759 | ||
760 | extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); | 760 | extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); |
761 | 761 | ||
762 | #endif /* CONFIG_SMP */ | 762 | #endif /* CONFIG_SMP */ |
763 | 763 | ||
764 | /* | 764 | /* |
765 | * A runqueue laden with a single nice 0 task scores a weighted_cpuload of | 765 | * A runqueue laden with a single nice 0 task scores a weighted_cpuload of |
766 | * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a | 766 | * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a |
767 | * task of nice 0 or enough lower priority tasks to bring up the | 767 | * task of nice 0 or enough lower priority tasks to bring up the |
768 | * weighted_cpuload | 768 | * weighted_cpuload |
769 | */ | 769 | */ |
770 | static inline int above_background_load(void) | 770 | static inline int above_background_load(void) |
771 | { | 771 | { |
772 | unsigned long cpu; | 772 | unsigned long cpu; |
773 | 773 | ||
774 | for_each_online_cpu(cpu) { | 774 | for_each_online_cpu(cpu) { |
775 | if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) | 775 | if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) |
776 | return 1; | 776 | return 1; |
777 | } | 777 | } |
778 | return 0; | 778 | return 0; |
779 | } | 779 | } |
780 | 780 | ||
781 | struct io_context; /* See blkdev.h */ | 781 | struct io_context; /* See blkdev.h */ |
782 | #define NGROUPS_SMALL 32 | 782 | #define NGROUPS_SMALL 32 |
783 | #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) | 783 | #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) |
784 | struct group_info { | 784 | struct group_info { |
785 | int ngroups; | 785 | int ngroups; |
786 | atomic_t usage; | 786 | atomic_t usage; |
787 | gid_t small_block[NGROUPS_SMALL]; | 787 | gid_t small_block[NGROUPS_SMALL]; |
788 | int nblocks; | 788 | int nblocks; |
789 | gid_t *blocks[0]; | 789 | gid_t *blocks[0]; |
790 | }; | 790 | }; |
791 | 791 | ||
792 | /* | 792 | /* |
793 | * get_group_info() must be called with the owning task locked (via task_lock()) | 793 | * get_group_info() must be called with the owning task locked (via task_lock()) |
794 | * when task != current. The reason being that the vast majority of callers are | 794 | * when task != current. The reason being that the vast majority of callers are |
795 | * looking at current->group_info, which can not be changed except by the | 795 | * looking at current->group_info, which can not be changed except by the |
796 | * current task. Changing current->group_info requires the task lock, too. | 796 | * current task. Changing current->group_info requires the task lock, too. |
797 | */ | 797 | */ |
798 | #define get_group_info(group_info) do { \ | 798 | #define get_group_info(group_info) do { \ |
799 | atomic_inc(&(group_info)->usage); \ | 799 | atomic_inc(&(group_info)->usage); \ |
800 | } while (0) | 800 | } while (0) |
801 | 801 | ||
802 | #define put_group_info(group_info) do { \ | 802 | #define put_group_info(group_info) do { \ |
803 | if (atomic_dec_and_test(&(group_info)->usage)) \ | 803 | if (atomic_dec_and_test(&(group_info)->usage)) \ |
804 | groups_free(group_info); \ | 804 | groups_free(group_info); \ |
805 | } while (0) | 805 | } while (0) |
806 | 806 | ||
807 | extern struct group_info *groups_alloc(int gidsetsize); | 807 | extern struct group_info *groups_alloc(int gidsetsize); |
808 | extern void groups_free(struct group_info *group_info); | 808 | extern void groups_free(struct group_info *group_info); |
809 | extern int set_current_groups(struct group_info *group_info); | 809 | extern int set_current_groups(struct group_info *group_info); |
810 | extern int groups_search(struct group_info *group_info, gid_t grp); | 810 | extern int groups_search(struct group_info *group_info, gid_t grp); |
811 | /* access the groups "array" with this macro */ | 811 | /* access the groups "array" with this macro */ |
812 | #define GROUP_AT(gi, i) \ | 812 | #define GROUP_AT(gi, i) \ |
813 | ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) | 813 | ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) |
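Aside: a sketch of the access pattern the comment above prescribes, pinning another task's group_info under task_lock() before searching it. The helper is hypothetical; the field layout assumed is that of this kernel, before the later credentials rework.

	static int task_in_group(struct task_struct *p, gid_t gid)
	{
		struct group_info *gi;
		int ret;

		task_lock(p);            /* required because p may not be current */
		gi = p->group_info;
		get_group_info(gi);      /* keep it alive across the unlock */
		task_unlock(p);

		ret = groups_search(gi, gid);   /* or walk entries with GROUP_AT() */
		put_group_info(gi);
		return ret;
	}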
814 | 814 | ||
815 | #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK | 815 | #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK |
816 | extern void prefetch_stack(struct task_struct *t); | 816 | extern void prefetch_stack(struct task_struct *t); |
817 | #else | 817 | #else |
818 | static inline void prefetch_stack(struct task_struct *t) { } | 818 | static inline void prefetch_stack(struct task_struct *t) { } |
819 | #endif | 819 | #endif |
820 | 820 | ||
821 | struct audit_context; /* See audit.c */ | 821 | struct audit_context; /* See audit.c */ |
822 | struct mempolicy; | 822 | struct mempolicy; |
823 | struct pipe_inode_info; | 823 | struct pipe_inode_info; |
824 | struct uts_namespace; | 824 | struct uts_namespace; |
825 | 825 | ||
826 | struct rq; | 826 | struct rq; |
827 | struct sched_domain; | 827 | struct sched_domain; |
828 | 828 | ||
829 | struct sched_class { | 829 | struct sched_class { |
830 | const struct sched_class *next; | 830 | const struct sched_class *next; |
831 | 831 | ||
832 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); | 832 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); |
833 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); | 833 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); |
834 | void (*yield_task) (struct rq *rq); | 834 | void (*yield_task) (struct rq *rq); |
835 | int (*select_task_rq)(struct task_struct *p, int sync); | 835 | int (*select_task_rq)(struct task_struct *p, int sync); |
836 | 836 | ||
837 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); | 837 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); |
838 | 838 | ||
839 | struct task_struct * (*pick_next_task) (struct rq *rq); | 839 | struct task_struct * (*pick_next_task) (struct rq *rq); |
840 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 840 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
841 | 841 | ||
842 | #ifdef CONFIG_SMP | 842 | #ifdef CONFIG_SMP |
843 | unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, | 843 | unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, |
844 | struct rq *busiest, unsigned long max_load_move, | 844 | struct rq *busiest, unsigned long max_load_move, |
845 | struct sched_domain *sd, enum cpu_idle_type idle, | 845 | struct sched_domain *sd, enum cpu_idle_type idle, |
846 | int *all_pinned, int *this_best_prio); | 846 | int *all_pinned, int *this_best_prio); |
847 | 847 | ||
848 | int (*move_one_task) (struct rq *this_rq, int this_cpu, | 848 | int (*move_one_task) (struct rq *this_rq, int this_cpu, |
849 | struct rq *busiest, struct sched_domain *sd, | 849 | struct rq *busiest, struct sched_domain *sd, |
850 | enum cpu_idle_type idle); | 850 | enum cpu_idle_type idle); |
851 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | 851 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); |
852 | void (*post_schedule) (struct rq *this_rq); | 852 | void (*post_schedule) (struct rq *this_rq); |
853 | void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); | 853 | void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); |
854 | #endif | 854 | #endif |
855 | 855 | ||
856 | void (*set_curr_task) (struct rq *rq); | 856 | void (*set_curr_task) (struct rq *rq); |
857 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 857 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); |
858 | void (*task_new) (struct rq *rq, struct task_struct *p); | 858 | void (*task_new) (struct rq *rq, struct task_struct *p); |
859 | void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); | 859 | void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); |
860 | 860 | ||
861 | void (*join_domain)(struct rq *rq); | 861 | void (*join_domain)(struct rq *rq); |
862 | void (*leave_domain)(struct rq *rq); | 862 | void (*leave_domain)(struct rq *rq); |
863 | 863 | ||
864 | void (*switched_from) (struct rq *this_rq, struct task_struct *task, | 864 | void (*switched_from) (struct rq *this_rq, struct task_struct *task, |
865 | int running); | 865 | int running); |
866 | void (*switched_to) (struct rq *this_rq, struct task_struct *task, | 866 | void (*switched_to) (struct rq *this_rq, struct task_struct *task, |
867 | int running); | 867 | int running); |
868 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | 868 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, |
869 | int oldprio, int running); | 869 | int oldprio, int running); |
870 | }; | 870 | }; |
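Aside: the classes form a priority-ordered singly linked list through ->next (real-time before fair before idle). A simplified sketch of how the core scheduler might walk that list when picking the next task; names such as sched_class_highest and pick_next are assumptions of the sketch, not definitions from this diff.

	static struct task_struct *pick_next(struct rq *rq)
	{
		const struct sched_class *class;
		struct task_struct *p;

		for (class = sched_class_highest; class; class = class->next) {
			p = class->pick_next_task(rq);
			if (p)
				return p;   /* highest-priority class with a runnable task wins */
		}
		return NULL;                /* not reached in practice: the idle class always has a task */
	}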
871 | 871 | ||
872 | struct load_weight { | 872 | struct load_weight { |
873 | unsigned long weight, inv_weight; | 873 | unsigned long weight, inv_weight; |
874 | }; | 874 | }; |
875 | 875 | ||
876 | /* | 876 | /* |
877 | * CFS stats for a schedulable entity (task, task-group etc) | 877 | * CFS stats for a schedulable entity (task, task-group etc) |
878 | * | 878 | * |
879 | * Current field usage histogram: | 879 | * Current field usage histogram: |
880 | * | 880 | * |
881 | * 4 se->block_start | 881 | * 4 se->block_start |
882 | * 4 se->run_node | 882 | * 4 se->run_node |
883 | * 4 se->sleep_start | 883 | * 4 se->sleep_start |
884 | * 6 se->load.weight | 884 | * 6 se->load.weight |
885 | */ | 885 | */ |
886 | struct sched_entity { | 886 | struct sched_entity { |
887 | struct load_weight load; /* for load-balancing */ | 887 | struct load_weight load; /* for load-balancing */ |
888 | struct rb_node run_node; | 888 | struct rb_node run_node; |
889 | unsigned int on_rq; | 889 | unsigned int on_rq; |
890 | 890 | ||
891 | u64 exec_start; | 891 | u64 exec_start; |
892 | u64 sum_exec_runtime; | 892 | u64 sum_exec_runtime; |
893 | u64 vruntime; | 893 | u64 vruntime; |
894 | u64 prev_sum_exec_runtime; | 894 | u64 prev_sum_exec_runtime; |
895 | 895 | ||
896 | #ifdef CONFIG_SCHEDSTATS | 896 | #ifdef CONFIG_SCHEDSTATS |
897 | u64 wait_start; | 897 | u64 wait_start; |
898 | u64 wait_max; | 898 | u64 wait_max; |
899 | u64 wait_count; | 899 | u64 wait_count; |
900 | u64 wait_sum; | 900 | u64 wait_sum; |
901 | 901 | ||
902 | u64 sleep_start; | 902 | u64 sleep_start; |
903 | u64 sleep_max; | 903 | u64 sleep_max; |
904 | s64 sum_sleep_runtime; | 904 | s64 sum_sleep_runtime; |
905 | 905 | ||
906 | u64 block_start; | 906 | u64 block_start; |
907 | u64 block_max; | 907 | u64 block_max; |
908 | u64 exec_max; | 908 | u64 exec_max; |
909 | u64 slice_max; | 909 | u64 slice_max; |
910 | 910 | ||
911 | u64 nr_migrations; | 911 | u64 nr_migrations; |
912 | u64 nr_migrations_cold; | 912 | u64 nr_migrations_cold; |
913 | u64 nr_failed_migrations_affine; | 913 | u64 nr_failed_migrations_affine; |
914 | u64 nr_failed_migrations_running; | 914 | u64 nr_failed_migrations_running; |
915 | u64 nr_failed_migrations_hot; | 915 | u64 nr_failed_migrations_hot; |
916 | u64 nr_forced_migrations; | 916 | u64 nr_forced_migrations; |
917 | u64 nr_forced2_migrations; | 917 | u64 nr_forced2_migrations; |
918 | 918 | ||
919 | u64 nr_wakeups; | 919 | u64 nr_wakeups; |
920 | u64 nr_wakeups_sync; | 920 | u64 nr_wakeups_sync; |
921 | u64 nr_wakeups_migrate; | 921 | u64 nr_wakeups_migrate; |
922 | u64 nr_wakeups_local; | 922 | u64 nr_wakeups_local; |
923 | u64 nr_wakeups_remote; | 923 | u64 nr_wakeups_remote; |
924 | u64 nr_wakeups_affine; | 924 | u64 nr_wakeups_affine; |
925 | u64 nr_wakeups_affine_attempts; | 925 | u64 nr_wakeups_affine_attempts; |
926 | u64 nr_wakeups_passive; | 926 | u64 nr_wakeups_passive; |
927 | u64 nr_wakeups_idle; | 927 | u64 nr_wakeups_idle; |
928 | #endif | 928 | #endif |
929 | 929 | ||
930 | #ifdef CONFIG_FAIR_GROUP_SCHED | 930 | #ifdef CONFIG_FAIR_GROUP_SCHED |
931 | struct sched_entity *parent; | 931 | struct sched_entity *parent; |
932 | /* rq on which this entity is (to be) queued: */ | 932 | /* rq on which this entity is (to be) queued: */ |
933 | struct cfs_rq *cfs_rq; | 933 | struct cfs_rq *cfs_rq; |
934 | /* rq "owned" by this entity/group: */ | 934 | /* rq "owned" by this entity/group: */ |
935 | struct cfs_rq *my_q; | 935 | struct cfs_rq *my_q; |
936 | #endif | 936 | #endif |
937 | }; | 937 | }; |
938 | 938 | ||
939 | struct sched_rt_entity { | 939 | struct sched_rt_entity { |
940 | struct list_head run_list; | 940 | struct list_head run_list; |
941 | unsigned int time_slice; | 941 | unsigned int time_slice; |
942 | unsigned long timeout; | 942 | unsigned long timeout; |
943 | int nr_cpus_allowed; | 943 | int nr_cpus_allowed; |
944 | 944 | ||
945 | #ifdef CONFIG_FAIR_GROUP_SCHED | 945 | #ifdef CONFIG_FAIR_GROUP_SCHED |
946 | struct sched_rt_entity *parent; | 946 | struct sched_rt_entity *parent; |
947 | /* rq on which this entity is (to be) queued: */ | 947 | /* rq on which this entity is (to be) queued: */ |
948 | struct rt_rq *rt_rq; | 948 | struct rt_rq *rt_rq; |
949 | /* rq "owned" by this entity/group: */ | 949 | /* rq "owned" by this entity/group: */ |
950 | struct rt_rq *my_q; | 950 | struct rt_rq *my_q; |
951 | #endif | 951 | #endif |
952 | }; | 952 | }; |
953 | 953 | ||
954 | struct task_struct { | 954 | struct task_struct { |
955 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ | 955 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ |
956 | void *stack; | 956 | void *stack; |
957 | atomic_t usage; | 957 | atomic_t usage; |
958 | unsigned int flags; /* per process flags, defined below */ | 958 | unsigned int flags; /* per process flags, defined below */ |
959 | unsigned int ptrace; | 959 | unsigned int ptrace; |
960 | 960 | ||
961 | int lock_depth; /* BKL lock depth */ | 961 | int lock_depth; /* BKL lock depth */ |
962 | 962 | ||
963 | #ifdef CONFIG_SMP | 963 | #ifdef CONFIG_SMP |
964 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 964 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
965 | int oncpu; | 965 | int oncpu; |
966 | #endif | 966 | #endif |
967 | #endif | 967 | #endif |
968 | 968 | ||
969 | int prio, static_prio, normal_prio; | 969 | int prio, static_prio, normal_prio; |
970 | const struct sched_class *sched_class; | 970 | const struct sched_class *sched_class; |
971 | struct sched_entity se; | 971 | struct sched_entity se; |
972 | struct sched_rt_entity rt; | 972 | struct sched_rt_entity rt; |
973 | 973 | ||
974 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 974 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
975 | /* list of struct preempt_notifier: */ | 975 | /* list of struct preempt_notifier: */ |
976 | struct hlist_head preempt_notifiers; | 976 | struct hlist_head preempt_notifiers; |
977 | #endif | 977 | #endif |
978 | 978 | ||
979 | /* | 979 | /* |
980 | * fpu_counter contains the number of consecutive context switches | 980 | * fpu_counter contains the number of consecutive context switches |
981 | * that the FPU is used. If this is over a threshold, the lazy fpu | 981 | * that the FPU is used. If this is over a threshold, the lazy fpu |
982 | * saving becomes unlazy to save the trap. This is an unsigned char | 982 | * saving becomes unlazy to save the trap. This is an unsigned char |
983 | * so that after 256 times the counter wraps and the behavior turns | 983 | * so that after 256 times the counter wraps and the behavior turns |
984 | * lazy again; this is to deal with bursty apps that only use FPU for | 984 | * lazy again; this is to deal with bursty apps that only use FPU for |
985 | * a short time | 985 | * a short time |
986 | */ | 986 | */ |
987 | unsigned char fpu_counter; | 987 | unsigned char fpu_counter; |
988 | s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ | 988 | s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ |
989 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 989 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
990 | unsigned int btrace_seq; | 990 | unsigned int btrace_seq; |
991 | #endif | 991 | #endif |
992 | 992 | ||
993 | unsigned int policy; | 993 | unsigned int policy; |
994 | cpumask_t cpus_allowed; | 994 | cpumask_t cpus_allowed; |
995 | 995 | ||
996 | #ifdef CONFIG_PREEMPT_RCU | 996 | #ifdef CONFIG_PREEMPT_RCU |
997 | int rcu_read_lock_nesting; | 997 | int rcu_read_lock_nesting; |
998 | int rcu_flipctr_idx; | 998 | int rcu_flipctr_idx; |
999 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | 999 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ |
1000 | 1000 | ||
1001 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1001 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1002 | struct sched_info sched_info; | 1002 | struct sched_info sched_info; |
1003 | #endif | 1003 | #endif |
1004 | 1004 | ||
1005 | struct list_head tasks; | 1005 | struct list_head tasks; |
1006 | /* | 1006 | /* |
1007 | * ptrace_list/ptrace_children forms the list of my children | 1007 | * ptrace_list/ptrace_children forms the list of my children |
1008 | * that were stolen by a ptracer. | 1008 | * that were stolen by a ptracer. |
1009 | */ | 1009 | */ |
1010 | struct list_head ptrace_children; | 1010 | struct list_head ptrace_children; |
1011 | struct list_head ptrace_list; | 1011 | struct list_head ptrace_list; |
1012 | 1012 | ||
1013 | struct mm_struct *mm, *active_mm; | 1013 | struct mm_struct *mm, *active_mm; |
1014 | 1014 | ||
1015 | /* task state */ | 1015 | /* task state */ |
1016 | struct linux_binfmt *binfmt; | 1016 | struct linux_binfmt *binfmt; |
1017 | int exit_state; | 1017 | int exit_state; |
1018 | int exit_code, exit_signal; | 1018 | int exit_code, exit_signal; |
1019 | int pdeath_signal; /* The signal sent when the parent dies */ | 1019 | int pdeath_signal; /* The signal sent when the parent dies */ |
1020 | /* ??? */ | 1020 | /* ??? */ |
1021 | unsigned int personality; | 1021 | unsigned int personality; |
1022 | unsigned did_exec:1; | 1022 | unsigned did_exec:1; |
1023 | pid_t pid; | 1023 | pid_t pid; |
1024 | pid_t tgid; | 1024 | pid_t tgid; |
1025 | 1025 | ||
1026 | #ifdef CONFIG_CC_STACKPROTECTOR | 1026 | #ifdef CONFIG_CC_STACKPROTECTOR |
1027 | /* Canary value for the -fstack-protector gcc feature */ | 1027 | /* Canary value for the -fstack-protector gcc feature */ |
1028 | unsigned long stack_canary; | 1028 | unsigned long stack_canary; |
1029 | #endif | 1029 | #endif |
1030 | /* | 1030 | /* |
1031 | * pointers to (original) parent process, youngest child, younger sibling, | 1031 | * pointers to (original) parent process, youngest child, younger sibling, |
1032 | * older sibling, respectively. (p->father can be replaced with | 1032 | * older sibling, respectively. (p->father can be replaced with |
1033 | * p->parent->pid) | 1033 | * p->parent->pid) |
1034 | */ | 1034 | */ |
1035 | struct task_struct *real_parent; /* real parent process (when being debugged) */ | 1035 | struct task_struct *real_parent; /* real parent process (when being debugged) */ |
1036 | struct task_struct *parent; /* parent process */ | 1036 | struct task_struct *parent; /* parent process */ |
1037 | /* | 1037 | /* |
1038 | * children/sibling forms the list of my children plus the | 1038 | * children/sibling forms the list of my children plus the |
1039 | * tasks I'm ptracing. | 1039 | * tasks I'm ptracing. |
1040 | */ | 1040 | */ |
1041 | struct list_head children; /* list of my children */ | 1041 | struct list_head children; /* list of my children */ |
1042 | struct list_head sibling; /* linkage in my parent's children list */ | 1042 | struct list_head sibling; /* linkage in my parent's children list */ |
1043 | struct task_struct *group_leader; /* threadgroup leader */ | 1043 | struct task_struct *group_leader; /* threadgroup leader */ |
1044 | 1044 | ||
1045 | /* PID/PID hash table linkage. */ | 1045 | /* PID/PID hash table linkage. */ |
1046 | struct pid_link pids[PIDTYPE_MAX]; | 1046 | struct pid_link pids[PIDTYPE_MAX]; |
1047 | struct list_head thread_group; | 1047 | struct list_head thread_group; |
1048 | 1048 | ||
1049 | struct completion *vfork_done; /* for vfork() */ | 1049 | struct completion *vfork_done; /* for vfork() */ |
1050 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ | 1050 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ |
1051 | int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ | 1051 | int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ |
1052 | 1052 | ||
1053 | unsigned int rt_priority; | 1053 | unsigned int rt_priority; |
1054 | cputime_t utime, stime, utimescaled, stimescaled; | 1054 | cputime_t utime, stime, utimescaled, stimescaled; |
1055 | cputime_t gtime; | 1055 | cputime_t gtime; |
1056 | cputime_t prev_utime, prev_stime; | 1056 | cputime_t prev_utime, prev_stime; |
1057 | unsigned long nvcsw, nivcsw; /* context switch counts */ | 1057 | unsigned long nvcsw, nivcsw; /* context switch counts */ |
1058 | struct timespec start_time; /* monotonic time */ | 1058 | struct timespec start_time; /* monotonic time */ |
1059 | struct timespec real_start_time; /* boot based time */ | 1059 | struct timespec real_start_time; /* boot based time */ |
1060 | /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ | 1060 | /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ |
1061 | unsigned long min_flt, maj_flt; | 1061 | unsigned long min_flt, maj_flt; |
1062 | 1062 | ||
1063 | cputime_t it_prof_expires, it_virt_expires; | 1063 | cputime_t it_prof_expires, it_virt_expires; |
1064 | unsigned long long it_sched_expires; | 1064 | unsigned long long it_sched_expires; |
1065 | struct list_head cpu_timers[3]; | 1065 | struct list_head cpu_timers[3]; |
1066 | 1066 | ||
1067 | /* process credentials */ | 1067 | /* process credentials */ |
1068 | uid_t uid,euid,suid,fsuid; | 1068 | uid_t uid,euid,suid,fsuid; |
1069 | gid_t gid,egid,sgid,fsgid; | 1069 | gid_t gid,egid,sgid,fsgid; |
1070 | struct group_info *group_info; | 1070 | struct group_info *group_info; |
1071 | kernel_cap_t cap_effective, cap_inheritable, cap_permitted; | 1071 | kernel_cap_t cap_effective, cap_inheritable, cap_permitted; |
1072 | unsigned keep_capabilities:1; | 1072 | unsigned keep_capabilities:1; |
1073 | struct user_struct *user; | 1073 | struct user_struct *user; |
1074 | #ifdef CONFIG_KEYS | 1074 | #ifdef CONFIG_KEYS |
1075 | struct key *request_key_auth; /* assumed request_key authority */ | 1075 | struct key *request_key_auth; /* assumed request_key authority */ |
1076 | struct key *thread_keyring; /* keyring private to this thread */ | 1076 | struct key *thread_keyring; /* keyring private to this thread */ |
1077 | unsigned char jit_keyring; /* default keyring to attach requested keys to */ | 1077 | unsigned char jit_keyring; /* default keyring to attach requested keys to */ |
1078 | #endif | 1078 | #endif |
1079 | char comm[TASK_COMM_LEN]; /* executable name excluding path | 1079 | char comm[TASK_COMM_LEN]; /* executable name excluding path |
1080 | - access with [gs]et_task_comm (which lock | 1080 | - access with [gs]et_task_comm (which lock |
1081 | it with task_lock()) | 1081 | it with task_lock()) |
1082 | - initialized normally by flush_old_exec */ | 1082 | - initialized normally by flush_old_exec */ |
1083 | /* file system info */ | 1083 | /* file system info */ |
1084 | int link_count, total_link_count; | 1084 | int link_count, total_link_count; |
1085 | #ifdef CONFIG_SYSVIPC | 1085 | #ifdef CONFIG_SYSVIPC |
1086 | /* ipc stuff */ | 1086 | /* ipc stuff */ |
1087 | struct sysv_sem sysvsem; | 1087 | struct sysv_sem sysvsem; |
1088 | #endif | 1088 | #endif |
1089 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 1089 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
1090 | /* hung task detection */ | 1090 | /* hung task detection */ |
1091 | unsigned long last_switch_timestamp; | 1091 | unsigned long last_switch_timestamp; |
1092 | unsigned long last_switch_count; | 1092 | unsigned long last_switch_count; |
1093 | #endif | 1093 | #endif |
1094 | /* CPU-specific state of this task */ | 1094 | /* CPU-specific state of this task */ |
1095 | struct thread_struct thread; | 1095 | struct thread_struct thread; |
1096 | /* filesystem information */ | 1096 | /* filesystem information */ |
1097 | struct fs_struct *fs; | 1097 | struct fs_struct *fs; |
1098 | /* open file information */ | 1098 | /* open file information */ |
1099 | struct files_struct *files; | 1099 | struct files_struct *files; |
1100 | /* namespaces */ | 1100 | /* namespaces */ |
1101 | struct nsproxy *nsproxy; | 1101 | struct nsproxy *nsproxy; |
1102 | /* signal handlers */ | 1102 | /* signal handlers */ |
1103 | struct signal_struct *signal; | 1103 | struct signal_struct *signal; |
1104 | struct sighand_struct *sighand; | 1104 | struct sighand_struct *sighand; |
1105 | 1105 | ||
1106 | sigset_t blocked, real_blocked; | 1106 | sigset_t blocked, real_blocked; |
1107 | sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ | 1107 | sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ |
1108 | struct sigpending pending; | 1108 | struct sigpending pending; |
1109 | 1109 | ||
1110 | unsigned long sas_ss_sp; | 1110 | unsigned long sas_ss_sp; |
1111 | size_t sas_ss_size; | 1111 | size_t sas_ss_size; |
1112 | int (*notifier)(void *priv); | 1112 | int (*notifier)(void *priv); |
1113 | void *notifier_data; | 1113 | void *notifier_data; |
1114 | sigset_t *notifier_mask; | 1114 | sigset_t *notifier_mask; |
1115 | #ifdef CONFIG_SECURITY | 1115 | #ifdef CONFIG_SECURITY |
1116 | void *security; | 1116 | void *security; |
1117 | #endif | 1117 | #endif |
1118 | struct audit_context *audit_context; | 1118 | struct audit_context *audit_context; |
1119 | seccomp_t seccomp; | 1119 | seccomp_t seccomp; |
1120 | 1120 | ||
1121 | /* Thread group tracking */ | 1121 | /* Thread group tracking */ |
1122 | u32 parent_exec_id; | 1122 | u32 parent_exec_id; |
1123 | u32 self_exec_id; | 1123 | u32 self_exec_id; |
1124 | /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ | 1124 | /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ |
1125 | spinlock_t alloc_lock; | 1125 | spinlock_t alloc_lock; |
1126 | 1126 | ||
1127 | /* Protection of the PI data structures: */ | 1127 | /* Protection of the PI data structures: */ |
1128 | spinlock_t pi_lock; | 1128 | spinlock_t pi_lock; |
1129 | 1129 | ||
1130 | #ifdef CONFIG_RT_MUTEXES | 1130 | #ifdef CONFIG_RT_MUTEXES |
1131 | /* PI waiters blocked on a rt_mutex held by this task */ | 1131 | /* PI waiters blocked on a rt_mutex held by this task */ |
1132 | struct plist_head pi_waiters; | 1132 | struct plist_head pi_waiters; |
1133 | /* Deadlock detection and priority inheritance handling */ | 1133 | /* Deadlock detection and priority inheritance handling */ |
1134 | struct rt_mutex_waiter *pi_blocked_on; | 1134 | struct rt_mutex_waiter *pi_blocked_on; |
1135 | #endif | 1135 | #endif |
1136 | 1136 | ||
1137 | #ifdef CONFIG_DEBUG_MUTEXES | 1137 | #ifdef CONFIG_DEBUG_MUTEXES |
1138 | /* mutex deadlock detection */ | 1138 | /* mutex deadlock detection */ |
1139 | struct mutex_waiter *blocked_on; | 1139 | struct mutex_waiter *blocked_on; |
1140 | #endif | 1140 | #endif |
1141 | #ifdef CONFIG_TRACE_IRQFLAGS | 1141 | #ifdef CONFIG_TRACE_IRQFLAGS |
1142 | unsigned int irq_events; | 1142 | unsigned int irq_events; |
1143 | int hardirqs_enabled; | 1143 | int hardirqs_enabled; |
1144 | unsigned long hardirq_enable_ip; | 1144 | unsigned long hardirq_enable_ip; |
1145 | unsigned int hardirq_enable_event; | 1145 | unsigned int hardirq_enable_event; |
1146 | unsigned long hardirq_disable_ip; | 1146 | unsigned long hardirq_disable_ip; |
1147 | unsigned int hardirq_disable_event; | 1147 | unsigned int hardirq_disable_event; |
1148 | int softirqs_enabled; | 1148 | int softirqs_enabled; |
1149 | unsigned long softirq_disable_ip; | 1149 | unsigned long softirq_disable_ip; |
1150 | unsigned int softirq_disable_event; | 1150 | unsigned int softirq_disable_event; |
1151 | unsigned long softirq_enable_ip; | 1151 | unsigned long softirq_enable_ip; |
1152 | unsigned int softirq_enable_event; | 1152 | unsigned int softirq_enable_event; |
1153 | int hardirq_context; | 1153 | int hardirq_context; |
1154 | int softirq_context; | 1154 | int softirq_context; |
1155 | #endif | 1155 | #endif |
1156 | #ifdef CONFIG_LOCKDEP | 1156 | #ifdef CONFIG_LOCKDEP |
1157 | # define MAX_LOCK_DEPTH 30UL | 1157 | # define MAX_LOCK_DEPTH 30UL |
1158 | u64 curr_chain_key; | 1158 | u64 curr_chain_key; |
1159 | int lockdep_depth; | 1159 | int lockdep_depth; |
1160 | struct held_lock held_locks[MAX_LOCK_DEPTH]; | 1160 | struct held_lock held_locks[MAX_LOCK_DEPTH]; |
1161 | unsigned int lockdep_recursion; | 1161 | unsigned int lockdep_recursion; |
1162 | #endif | 1162 | #endif |
1163 | 1163 | ||
1164 | /* journalling filesystem info */ | 1164 | /* journalling filesystem info */ |
1165 | void *journal_info; | 1165 | void *journal_info; |
1166 | 1166 | ||
1167 | /* stacked block device info */ | 1167 | /* stacked block device info */ |
1168 | struct bio *bio_list, **bio_tail; | 1168 | struct bio *bio_list, **bio_tail; |
1169 | 1169 | ||
1170 | /* VM state */ | 1170 | /* VM state */ |
1171 | struct reclaim_state *reclaim_state; | 1171 | struct reclaim_state *reclaim_state; |
1172 | 1172 | ||
1173 | struct backing_dev_info *backing_dev_info; | 1173 | struct backing_dev_info *backing_dev_info; |
1174 | 1174 | ||
1175 | struct io_context *io_context; | 1175 | struct io_context *io_context; |
1176 | 1176 | ||
1177 | unsigned long ptrace_message; | 1177 | unsigned long ptrace_message; |
1178 | siginfo_t *last_siginfo; /* For ptrace use. */ | 1178 | siginfo_t *last_siginfo; /* For ptrace use. */ |
1179 | #ifdef CONFIG_TASK_XACCT | 1179 | #ifdef CONFIG_TASK_XACCT |
1180 | /* i/o counters (bytes read/written, #syscalls) */ | 1180 | /* i/o counters (bytes read/written, #syscalls) */ |
1181 | u64 rchar, wchar, syscr, syscw; | 1181 | u64 rchar, wchar, syscr, syscw; |
1182 | #endif | 1182 | #endif |
1183 | struct task_io_accounting ioac; | 1183 | struct task_io_accounting ioac; |
1184 | #if defined(CONFIG_TASK_XACCT) | 1184 | #if defined(CONFIG_TASK_XACCT) |
1185 | u64 acct_rss_mem1; /* accumulated rss usage */ | 1185 | u64 acct_rss_mem1; /* accumulated rss usage */ |
1186 | u64 acct_vm_mem1; /* accumulated virtual memory usage */ | 1186 | u64 acct_vm_mem1; /* accumulated virtual memory usage */ |
1187 | cputime_t acct_stimexpd;/* stime since last update */ | 1187 | cputime_t acct_stimexpd;/* stime since last update */ |
1188 | #endif | 1188 | #endif |
1189 | #ifdef CONFIG_NUMA | 1189 | #ifdef CONFIG_NUMA |
1190 | struct mempolicy *mempolicy; | 1190 | struct mempolicy *mempolicy; |
1191 | short il_next; | 1191 | short il_next; |
1192 | #endif | 1192 | #endif |
1193 | #ifdef CONFIG_CPUSETS | 1193 | #ifdef CONFIG_CPUSETS |
1194 | nodemask_t mems_allowed; | 1194 | nodemask_t mems_allowed; |
1195 | int cpuset_mems_generation; | 1195 | int cpuset_mems_generation; |
1196 | int cpuset_mem_spread_rotor; | 1196 | int cpuset_mem_spread_rotor; |
1197 | #endif | 1197 | #endif |
1198 | #ifdef CONFIG_CGROUPS | 1198 | #ifdef CONFIG_CGROUPS |
1199 | /* Control Group info protected by css_set_lock */ | 1199 | /* Control Group info protected by css_set_lock */ |
1200 | struct css_set *cgroups; | 1200 | struct css_set *cgroups; |
1201 | /* cg_list protected by css_set_lock and tsk->alloc_lock */ | 1201 | /* cg_list protected by css_set_lock and tsk->alloc_lock */ |
1202 | struct list_head cg_list; | 1202 | struct list_head cg_list; |
1203 | #endif | 1203 | #endif |
1204 | #ifdef CONFIG_FUTEX | 1204 | #ifdef CONFIG_FUTEX |
1205 | struct robust_list_head __user *robust_list; | 1205 | struct robust_list_head __user *robust_list; |
1206 | #ifdef CONFIG_COMPAT | 1206 | #ifdef CONFIG_COMPAT |
1207 | struct compat_robust_list_head __user *compat_robust_list; | 1207 | struct compat_robust_list_head __user *compat_robust_list; |
1208 | #endif | 1208 | #endif |
1209 | struct list_head pi_state_list; | 1209 | struct list_head pi_state_list; |
1210 | struct futex_pi_state *pi_state_cache; | 1210 | struct futex_pi_state *pi_state_cache; |
1211 | #endif | 1211 | #endif |
1212 | atomic_t fs_excl; /* holding fs exclusive resources */ | 1212 | atomic_t fs_excl; /* holding fs exclusive resources */ |
1213 | struct rcu_head rcu; | 1213 | struct rcu_head rcu; |
1214 | 1214 | ||
1215 | /* | 1215 | /* |
1216 | * cache last used pipe for splice | 1216 | * cache last used pipe for splice |
1217 | */ | 1217 | */ |
1218 | struct pipe_inode_info *splice_pipe; | 1218 | struct pipe_inode_info *splice_pipe; |
1219 | #ifdef CONFIG_TASK_DELAY_ACCT | 1219 | #ifdef CONFIG_TASK_DELAY_ACCT |
1220 | struct task_delay_info *delays; | 1220 | struct task_delay_info *delays; |
1221 | #endif | 1221 | #endif |
1222 | #ifdef CONFIG_FAULT_INJECTION | 1222 | #ifdef CONFIG_FAULT_INJECTION |
1223 | int make_it_fail; | 1223 | int make_it_fail; |
1224 | #endif | 1224 | #endif |
1225 | struct prop_local_single dirties; | 1225 | struct prop_local_single dirties; |
1226 | #ifdef CONFIG_LATENCYTOP | 1226 | #ifdef CONFIG_LATENCYTOP |
1227 | int latency_record_count; | 1227 | int latency_record_count; |
1228 | struct latency_record latency_record[LT_SAVECOUNT]; | 1228 | struct latency_record latency_record[LT_SAVECOUNT]; |
1229 | #endif | 1229 | #endif |
1230 | }; | 1230 | }; |
1231 | 1231 | ||
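
A brief aside on the fpu_counter field documented inside the structure above: below is a minimal, hypothetical sketch of the lazy/eager decision such a counter can drive. The helper name and the threshold of 5 are illustrative assumptions; the real logic lives in arch-specific FPU switch/trap code and differs in detail.

/* Hypothetical sketch only -- not part of sched.h. */
static inline int fpu_should_preload(struct task_struct *tsk)
{
	/*
	 * The FPU was used on several consecutive context switches,
	 * so restore it eagerly instead of waiting for the trap.
	 */
	return tsk->fpu_counter > 5;
}
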
1232 | /* | 1232 | /* |
1233 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT | 1233 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT |
1234 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH | 1234 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH |
1235 | * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority | 1235 | * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority |
1236 | * values are inverted: lower p->prio value means higher priority. | 1236 | * values are inverted: lower p->prio value means higher priority. |
1237 | * | 1237 | * |
1238 | * The MAX_USER_RT_PRIO value allows the actual maximum | 1238 | * The MAX_USER_RT_PRIO value allows the actual maximum |
1239 | * RT priority to be separate from the value exported to | 1239 | * RT priority to be separate from the value exported to |
1240 | * user-space. This allows kernel threads to set their | 1240 | * user-space. This allows kernel threads to set their |
1241 | * priority to a value higher than any user task. Note: | 1241 | * priority to a value higher than any user task. Note: |
1242 | * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. | 1242 | * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. |
1243 | */ | 1243 | */ |
1244 | 1244 | ||
1245 | #define MAX_USER_RT_PRIO 100 | 1245 | #define MAX_USER_RT_PRIO 100 |
1246 | #define MAX_RT_PRIO MAX_USER_RT_PRIO | 1246 | #define MAX_RT_PRIO MAX_USER_RT_PRIO |
1247 | 1247 | ||
1248 | #define MAX_PRIO (MAX_RT_PRIO + 40) | 1248 | #define MAX_PRIO (MAX_RT_PRIO + 40) |
1249 | #define DEFAULT_PRIO (MAX_RT_PRIO + 20) | 1249 | #define DEFAULT_PRIO (MAX_RT_PRIO + 20) |
1250 | 1250 | ||
1251 | static inline int rt_prio(int prio) | 1251 | static inline int rt_prio(int prio) |
1252 | { | 1252 | { |
1253 | if (unlikely(prio < MAX_RT_PRIO)) | 1253 | if (unlikely(prio < MAX_RT_PRIO)) |
1254 | return 1; | 1254 | return 1; |
1255 | return 0; | 1255 | return 0; |
1256 | } | 1256 | } |
1257 | 1257 | ||
1258 | static inline int rt_task(struct task_struct *p) | 1258 | static inline int rt_task(struct task_struct *p) |
1259 | { | 1259 | { |
1260 | return rt_prio(p->prio); | 1260 | return rt_prio(p->prio); |
1261 | } | 1261 | } |
1262 | 1262 | ||
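
To make the ranges described in the comment above concrete: with MAX_USER_RT_PRIO at 100, MAX_PRIO is 140 and DEFAULT_PRIO (nice 0) is 120. The helper below is a hypothetical sketch that only classifies a prio value under that inverted-priority convention.

/* Illustrative sketch only -- not part of sched.h. */
static inline const char *prio_class(int prio)
{
	if (rt_prio(prio))		/* 0..99: realtime */
		return "realtime";
	if (prio < DEFAULT_PRIO)	/* 100..119: nice < 0 */
		return "normal, above default";
	return "normal";		/* 120..139: nice >= 0 */
}
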
1263 | static inline void set_task_session(struct task_struct *tsk, pid_t session) | 1263 | static inline void set_task_session(struct task_struct *tsk, pid_t session) |
1264 | { | 1264 | { |
1265 | tsk->signal->__session = session; | 1265 | tsk->signal->__session = session; |
1266 | } | 1266 | } |
1267 | 1267 | ||
1268 | static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) | 1268 | static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) |
1269 | { | 1269 | { |
1270 | tsk->signal->__pgrp = pgrp; | 1270 | tsk->signal->__pgrp = pgrp; |
1271 | } | 1271 | } |
1272 | 1272 | ||
1273 | static inline struct pid *task_pid(struct task_struct *task) | 1273 | static inline struct pid *task_pid(struct task_struct *task) |
1274 | { | 1274 | { |
1275 | return task->pids[PIDTYPE_PID].pid; | 1275 | return task->pids[PIDTYPE_PID].pid; |
1276 | } | 1276 | } |
1277 | 1277 | ||
1278 | static inline struct pid *task_tgid(struct task_struct *task) | 1278 | static inline struct pid *task_tgid(struct task_struct *task) |
1279 | { | 1279 | { |
1280 | return task->group_leader->pids[PIDTYPE_PID].pid; | 1280 | return task->group_leader->pids[PIDTYPE_PID].pid; |
1281 | } | 1281 | } |
1282 | 1282 | ||
1283 | static inline struct pid *task_pgrp(struct task_struct *task) | 1283 | static inline struct pid *task_pgrp(struct task_struct *task) |
1284 | { | 1284 | { |
1285 | return task->group_leader->pids[PIDTYPE_PGID].pid; | 1285 | return task->group_leader->pids[PIDTYPE_PGID].pid; |
1286 | } | 1286 | } |
1287 | 1287 | ||
1288 | static inline struct pid *task_session(struct task_struct *task) | 1288 | static inline struct pid *task_session(struct task_struct *task) |
1289 | { | 1289 | { |
1290 | return task->group_leader->pids[PIDTYPE_SID].pid; | 1290 | return task->group_leader->pids[PIDTYPE_SID].pid; |
1291 | } | 1291 | } |
1292 | 1292 | ||
1293 | struct pid_namespace; | 1293 | struct pid_namespace; |
1294 | 1294 | ||
1295 | /* | 1295 | /* |
1296 | * the helpers to get the task's different pids as they are seen | 1296 | * the helpers to get the task's different pids as they are seen |
1297 | * from various namespaces | 1297 | * from various namespaces |
1298 | * | 1298 | * |
1299 | * task_xid_nr() : global id, i.e. the id seen from the init namespace; | 1299 | * task_xid_nr() : global id, i.e. the id seen from the init namespace; |
1300 | * task_xid_vnr() : virtual id, i.e. the id seen from the namespace the task | 1300 | * task_xid_vnr() : virtual id, i.e. the id seen from the namespace the task |
1301 | * belongs to. this only makes sense when called in the | 1301 | * belongs to. this only makes sense when called in the |
1302 | * context of the task that belongs to the same namespace; | 1302 | * context of the task that belongs to the same namespace; |
1303 | * task_xid_nr_ns() : id seen from the ns specified; | 1303 | * task_xid_nr_ns() : id seen from the ns specified; |
1304 | * | 1304 | * |
1305 | * set_task_vxid() : assigns a virtual id to a task; | 1305 | * set_task_vxid() : assigns a virtual id to a task; |
1306 | * | 1306 | * |
1307 | * see also pid_nr() etc in include/linux/pid.h | 1307 | * see also pid_nr() etc in include/linux/pid.h |
1308 | */ | 1308 | */ |
1309 | 1309 | ||
1310 | static inline pid_t task_pid_nr(struct task_struct *tsk) | 1310 | static inline pid_t task_pid_nr(struct task_struct *tsk) |
1311 | { | 1311 | { |
1312 | return tsk->pid; | 1312 | return tsk->pid; |
1313 | } | 1313 | } |
1314 | 1314 | ||
1315 | pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); | 1315 | pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); |
1316 | 1316 | ||
1317 | static inline pid_t task_pid_vnr(struct task_struct *tsk) | 1317 | static inline pid_t task_pid_vnr(struct task_struct *tsk) |
1318 | { | 1318 | { |
1319 | return pid_vnr(task_pid(tsk)); | 1319 | return pid_vnr(task_pid(tsk)); |
1320 | } | 1320 | } |
1321 | 1321 | ||
1322 | 1322 | ||
1323 | static inline pid_t task_tgid_nr(struct task_struct *tsk) | 1323 | static inline pid_t task_tgid_nr(struct task_struct *tsk) |
1324 | { | 1324 | { |
1325 | return tsk->tgid; | 1325 | return tsk->tgid; |
1326 | } | 1326 | } |
1327 | 1327 | ||
1328 | pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); | 1328 | pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); |
1329 | 1329 | ||
1330 | static inline pid_t task_tgid_vnr(struct task_struct *tsk) | 1330 | static inline pid_t task_tgid_vnr(struct task_struct *tsk) |
1331 | { | 1331 | { |
1332 | return pid_vnr(task_tgid(tsk)); | 1332 | return pid_vnr(task_tgid(tsk)); |
1333 | } | 1333 | } |
1334 | 1334 | ||
1335 | 1335 | ||
1336 | static inline pid_t task_pgrp_nr(struct task_struct *tsk) | 1336 | static inline pid_t task_pgrp_nr(struct task_struct *tsk) |
1337 | { | 1337 | { |
1338 | return tsk->signal->__pgrp; | 1338 | return tsk->signal->__pgrp; |
1339 | } | 1339 | } |
1340 | 1340 | ||
1341 | pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); | 1341 | pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); |
1342 | 1342 | ||
1343 | static inline pid_t task_pgrp_vnr(struct task_struct *tsk) | 1343 | static inline pid_t task_pgrp_vnr(struct task_struct *tsk) |
1344 | { | 1344 | { |
1345 | return pid_vnr(task_pgrp(tsk)); | 1345 | return pid_vnr(task_pgrp(tsk)); |
1346 | } | 1346 | } |
1347 | 1347 | ||
1348 | 1348 | ||
1349 | static inline pid_t task_session_nr(struct task_struct *tsk) | 1349 | static inline pid_t task_session_nr(struct task_struct *tsk) |
1350 | { | 1350 | { |
1351 | return tsk->signal->__session; | 1351 | return tsk->signal->__session; |
1352 | } | 1352 | } |
1353 | 1353 | ||
1354 | pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); | 1354 | pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); |
1355 | 1355 | ||
1356 | static inline pid_t task_session_vnr(struct task_struct *tsk) | 1356 | static inline pid_t task_session_vnr(struct task_struct *tsk) |
1357 | { | 1357 | { |
1358 | return pid_vnr(task_session(tsk)); | 1358 | return pid_vnr(task_session(tsk)); |
1359 | } | 1359 | } |
1360 | 1360 | ||
1361 | 1361 | ||
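
A small usage sketch contrasting the global and virtual views described in the comment block above. The helper is hypothetical; printk() and the comm field are used purely for illustration, and the virtual id is only meaningful to callers in the same pid namespace as the task.

/* Illustrative sketch only -- not part of sched.h. */
static inline void report_pids(struct task_struct *tsk)
{
	pid_t global_pid = task_pid_nr(tsk);	/* id in the init namespace */
	pid_t local_pid  = task_pid_vnr(tsk);	/* id in tsk's own namespace */

	printk(KERN_DEBUG "task %s: global pid %d, virtual pid %d\n",
	       tsk->comm, global_pid, local_pid);
}
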
1362 | /** | 1362 | /** |
1363 | * pid_alive - check that a task structure is not stale | 1363 | * pid_alive - check that a task structure is not stale |
1364 | * @p: Task structure to be checked. | 1364 | * @p: Task structure to be checked. |
1365 | * | 1365 | * |
1366 | * Test if a process is not yet dead (at most zombie state). | 1366 | * Test if a process is not yet dead (at most zombie state). |
1367 | * If pid_alive fails, then pointers within the task structure | 1367 | * If pid_alive fails, then pointers within the task structure |
1368 | * can be stale and must not be dereferenced. | 1368 | * can be stale and must not be dereferenced. |
1369 | */ | 1369 | */ |
1370 | static inline int pid_alive(struct task_struct *p) | 1370 | static inline int pid_alive(struct task_struct *p) |
1371 | { | 1371 | { |
1372 | return p->pids[PIDTYPE_PID].pid != NULL; | 1372 | return p->pids[PIDTYPE_PID].pid != NULL; |
1373 | } | 1373 | } |
1374 | 1374 | ||
1375 | /** | 1375 | /** |
1376 | * is_global_init - check if a task structure is init | 1376 | * is_global_init - check if a task structure is init |
1377 | * @tsk: Task structure to be checked. | 1377 | * @tsk: Task structure to be checked. |
1378 | * | 1378 | * |
1379 | * Check if a task structure is the first user space task the kernel created. | 1379 | * Check if a task structure is the first user space task the kernel created. |
1380 | */ | 1380 | */ |
1381 | static inline int is_global_init(struct task_struct *tsk) | 1381 | static inline int is_global_init(struct task_struct *tsk) |
1382 | { | 1382 | { |
1383 | return tsk->pid == 1; | 1383 | return tsk->pid == 1; |
1384 | } | 1384 | } |
1385 | 1385 | ||
1386 | /* | 1386 | /* |
1387 | * is_container_init: | 1387 | * is_container_init: |
1388 | * check whether the task is init in its own pid namespace. | 1388 | * check whether the task is init in its own pid namespace. |
1389 | */ | 1389 | */ |
1390 | extern int is_container_init(struct task_struct *tsk); | 1390 | extern int is_container_init(struct task_struct *tsk); |
1391 | 1391 | ||
1392 | extern struct pid *cad_pid; | 1392 | extern struct pid *cad_pid; |
1393 | 1393 | ||
1394 | extern void free_task(struct task_struct *tsk); | 1394 | extern void free_task(struct task_struct *tsk); |
1395 | #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) | 1395 | #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) |
1396 | 1396 | ||
1397 | extern void __put_task_struct(struct task_struct *t); | 1397 | extern void __put_task_struct(struct task_struct *t); |
1398 | 1398 | ||
1399 | static inline void put_task_struct(struct task_struct *t) | 1399 | static inline void put_task_struct(struct task_struct *t) |
1400 | { | 1400 | { |
1401 | if (atomic_dec_and_test(&t->usage)) | 1401 | if (atomic_dec_and_test(&t->usage)) |
1402 | __put_task_struct(t); | 1402 | __put_task_struct(t); |
1403 | } | 1403 | } |
1404 | 1404 | ||
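
A hedged sketch of how the get_task_struct()/put_task_struct() pair above is typically balanced; the helper and the work it stands in for are hypothetical.

/* Illustrative sketch only -- not part of sched.h. */
static inline void inspect_task_pinned(struct task_struct *tsk)
{
	get_task_struct(tsk);	/* bump tsk->usage so the struct stays valid */

	/* ... even if the task exits here, the task_struct is not freed ... */

	put_task_struct(tsk);	/* may free it via __put_task_struct() */
}
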
1405 | /* | 1405 | /* |
1406 | * Per process flags | 1406 | * Per process flags |
1407 | */ | 1407 | */ |
1408 | #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ | 1408 | #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ |
1409 | /* Not implemented yet, only for 486 */ | 1409 | /* Not implemented yet, only for 486 */ |
1410 | #define PF_STARTING 0x00000002 /* being created */ | 1410 | #define PF_STARTING 0x00000002 /* being created */ |
1411 | #define PF_EXITING 0x00000004 /* getting shut down */ | 1411 | #define PF_EXITING 0x00000004 /* getting shut down */ |
1412 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ | 1412 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ |
1413 | #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ | 1413 | #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ |
1414 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ | 1414 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ |
1415 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ | 1415 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ |
1416 | #define PF_DUMPCORE 0x00000200 /* dumped core */ | 1416 | #define PF_DUMPCORE 0x00000200 /* dumped core */ |
1417 | #define PF_SIGNALED 0x00000400 /* killed by a signal */ | 1417 | #define PF_SIGNALED 0x00000400 /* killed by a signal */ |
1418 | #define PF_MEMALLOC 0x00000800 /* Allocating memory */ | 1418 | #define PF_MEMALLOC 0x00000800 /* Allocating memory */ |
1419 | #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ | 1419 | #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ |
1420 | #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ | 1420 | #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ |
1421 | #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ | 1421 | #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ |
1422 | #define PF_FROZEN 0x00010000 /* frozen for system suspend */ | 1422 | #define PF_FROZEN 0x00010000 /* frozen for system suspend */ |
1423 | #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ | 1423 | #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ |
1424 | #define PF_KSWAPD 0x00040000 /* I am kswapd */ | 1424 | #define PF_KSWAPD 0x00040000 /* I am kswapd */ |
1425 | #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ | 1425 | #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ |
1426 | #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ | 1426 | #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ |
1427 | #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */ | 1427 | #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */ |
1428 | #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ | 1428 | #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ |
1429 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ | 1429 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ |
1430 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ | 1430 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ |
1431 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ | 1431 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ |
1432 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ | 1432 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ |
1433 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ | 1433 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ |
1434 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ | 1434 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ |
1435 | 1435 | ||
1436 | /* | 1436 | /* |
1437 | * Only the _current_ task can read/write to tsk->flags, but other | 1437 | * Only the _current_ task can read/write to tsk->flags, but other |
1438 | * tasks can access tsk->flags in readonly mode for example | 1438 | * tasks can access tsk->flags in readonly mode for example |
1439 | * with tsk_used_math (like during threaded core dumping). | 1439 | * with tsk_used_math (like during threaded core dumping). |
1440 | * There is however an exception to this rule during ptrace | 1440 | * There is however an exception to this rule during ptrace |
1441 | * or during fork: the ptracer task is allowed to write to the | 1441 | * or during fork: the ptracer task is allowed to write to the |
1442 | * child->flags of its traced child (same goes for fork, the parent | 1442 | * child->flags of its traced child (same goes for fork, the parent |
1443 | * can write to the child->flags), because we're guaranteed the | 1443 | * can write to the child->flags), because we're guaranteed the |
1444 | * child is not running and in turn not changing child->flags | 1444 | * child is not running and in turn not changing child->flags |
1445 | * at the same time the parent does it. | 1445 | * at the same time the parent does it. |
1446 | */ | 1446 | */ |
1447 | #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) | 1447 | #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) |
1448 | #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) | 1448 | #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) |
1449 | #define clear_used_math() clear_stopped_child_used_math(current) | 1449 | #define clear_used_math() clear_stopped_child_used_math(current) |
1450 | #define set_used_math() set_stopped_child_used_math(current) | 1450 | #define set_used_math() set_stopped_child_used_math(current) |
1451 | #define conditional_stopped_child_used_math(condition, child) \ | 1451 | #define conditional_stopped_child_used_math(condition, child) \ |
1452 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) | 1452 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) |
1453 | #define conditional_used_math(condition) \ | 1453 | #define conditional_used_math(condition) \ |
1454 | conditional_stopped_child_used_math(condition, current) | 1454 | conditional_stopped_child_used_math(condition, current) |
1455 | #define copy_to_stopped_child_used_math(child) \ | 1455 | #define copy_to_stopped_child_used_math(child) \ |
1456 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) | 1456 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) |
1457 | /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ | 1457 | /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ |
1458 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) | 1458 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) |
1459 | #define used_math() tsk_used_math(current) | 1459 | #define used_math() tsk_used_math(current) |
1460 | 1460 | ||
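
Two hypothetical helpers illustrating the access rule spelled out in the comment above: any task may read another task's flags, but writes are reserved for current (ptrace and fork being the noted exceptions).

/* Illustrative sketches only -- not part of sched.h. */
static inline int task_in_reclaim(struct task_struct *tsk)
{
	/* Read-only access to another task's flags is fine. */
	return (tsk->flags & (PF_KSWAPD | PF_MEMALLOC)) != 0;
}

static inline void mark_self_nofreeze(void)
{
	/* Writes go through 'current' only. */
	current->flags |= PF_NOFREEZE;
}
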
1461 | #ifdef CONFIG_SMP | 1461 | #ifdef CONFIG_SMP |
1462 | extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); | 1462 | extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); |
1463 | #else | 1463 | #else |
1464 | static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 1464 | static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
1465 | { | 1465 | { |
1466 | if (!cpu_isset(0, new_mask)) | 1466 | if (!cpu_isset(0, new_mask)) |
1467 | return -EINVAL; | 1467 | return -EINVAL; |
1468 | return 0; | 1468 | return 0; |
1469 | } | 1469 | } |
1470 | #endif | 1470 | #endif |
1471 | 1471 | ||
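
A minimal sketch of calling set_cpus_allowed() as declared above, assuming the cpumask_of_cpu() helper from the cpumask API of this era; the wrapper name is hypothetical.

/* Illustrative sketch only -- not part of sched.h. */
static inline int pin_task_to_cpu0(struct task_struct *p)
{
	cpumask_t mask = cpumask_of_cpu(0);

	/* Returns 0 on success, a negative errno otherwise. */
	return set_cpus_allowed(p, mask);
}
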
1472 | extern unsigned long long sched_clock(void); | 1472 | extern unsigned long long sched_clock(void); |
1473 | 1473 | ||
1474 | /* | 1474 | /* |
1475 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 1475 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
1476 | * clock constructed from sched_clock(): | 1476 | * clock constructed from sched_clock(): |
1477 | */ | 1477 | */ |
1478 | extern unsigned long long cpu_clock(int cpu); | 1478 | extern unsigned long long cpu_clock(int cpu); |
1479 | 1479 | ||
1480 | extern unsigned long long | 1480 | extern unsigned long long |
1481 | task_sched_runtime(struct task_struct *task); | 1481 | task_sched_runtime(struct task_struct *task); |
1482 | 1482 | ||
1483 | /* sched_exec is called by processes performing an exec */ | 1483 | /* sched_exec is called by processes performing an exec */ |
1484 | #ifdef CONFIG_SMP | 1484 | #ifdef CONFIG_SMP |
1485 | extern void sched_exec(void); | 1485 | extern void sched_exec(void); |
1486 | #else | 1486 | #else |
1487 | #define sched_exec() {} | 1487 | #define sched_exec() {} |
1488 | #endif | 1488 | #endif |
1489 | 1489 | ||
1490 | extern void sched_clock_idle_sleep_event(void); | 1490 | extern void sched_clock_idle_sleep_event(void); |
1491 | extern void sched_clock_idle_wakeup_event(u64 delta_ns); | 1491 | extern void sched_clock_idle_wakeup_event(u64 delta_ns); |
1492 | 1492 | ||
1493 | #ifdef CONFIG_HOTPLUG_CPU | 1493 | #ifdef CONFIG_HOTPLUG_CPU |
1494 | extern void idle_task_exit(void); | 1494 | extern void idle_task_exit(void); |
1495 | #else | 1495 | #else |
1496 | static inline void idle_task_exit(void) {} | 1496 | static inline void idle_task_exit(void) {} |
1497 | #endif | 1497 | #endif |
1498 | 1498 | ||
1499 | extern void sched_idle_next(void); | 1499 | extern void sched_idle_next(void); |
1500 | 1500 | ||
1501 | #ifdef CONFIG_SCHED_DEBUG | 1501 | #ifdef CONFIG_SCHED_DEBUG |
1502 | extern unsigned int sysctl_sched_latency; | 1502 | extern unsigned int sysctl_sched_latency; |
1503 | extern unsigned int sysctl_sched_min_granularity; | 1503 | extern unsigned int sysctl_sched_min_granularity; |
1504 | extern unsigned int sysctl_sched_wakeup_granularity; | 1504 | extern unsigned int sysctl_sched_wakeup_granularity; |
1505 | extern unsigned int sysctl_sched_batch_wakeup_granularity; | 1505 | extern unsigned int sysctl_sched_batch_wakeup_granularity; |
1506 | extern unsigned int sysctl_sched_child_runs_first; | 1506 | extern unsigned int sysctl_sched_child_runs_first; |
1507 | extern unsigned int sysctl_sched_features; | 1507 | extern unsigned int sysctl_sched_features; |
1508 | extern unsigned int sysctl_sched_migration_cost; | 1508 | extern unsigned int sysctl_sched_migration_cost; |
1509 | extern unsigned int sysctl_sched_nr_migrate; | 1509 | extern unsigned int sysctl_sched_nr_migrate; |
1510 | extern unsigned int sysctl_sched_rt_period; | 1510 | extern unsigned int sysctl_sched_rt_period; |
1511 | extern unsigned int sysctl_sched_rt_ratio; | 1511 | extern unsigned int sysctl_sched_rt_ratio; |
1512 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | 1512 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) |
1513 | extern unsigned int sysctl_sched_min_bal_int_shares; | 1513 | extern unsigned int sysctl_sched_min_bal_int_shares; |
1514 | extern unsigned int sysctl_sched_max_bal_int_shares; | 1514 | extern unsigned int sysctl_sched_max_bal_int_shares; |
1515 | #endif | 1515 | #endif |
1516 | 1516 | ||
1517 | int sched_nr_latency_handler(struct ctl_table *table, int write, | 1517 | int sched_nr_latency_handler(struct ctl_table *table, int write, |
1518 | struct file *file, void __user *buffer, size_t *length, | 1518 | struct file *file, void __user *buffer, size_t *length, |
1519 | loff_t *ppos); | 1519 | loff_t *ppos); |
1520 | #endif | 1520 | #endif |
1521 | 1521 | ||
1522 | extern unsigned int sysctl_sched_compat_yield; | 1522 | extern unsigned int sysctl_sched_compat_yield; |
1523 | 1523 | ||
1524 | #ifdef CONFIG_RT_MUTEXES | 1524 | #ifdef CONFIG_RT_MUTEXES |
1525 | extern int rt_mutex_getprio(struct task_struct *p); | 1525 | extern int rt_mutex_getprio(struct task_struct *p); |
1526 | extern void rt_mutex_setprio(struct task_struct *p, int prio); | 1526 | extern void rt_mutex_setprio(struct task_struct *p, int prio); |
1527 | extern void rt_mutex_adjust_pi(struct task_struct *p); | 1527 | extern void rt_mutex_adjust_pi(struct task_struct *p); |
1528 | #else | 1528 | #else |
1529 | static inline int rt_mutex_getprio(struct task_struct *p) | 1529 | static inline int rt_mutex_getprio(struct task_struct *p) |
1530 | { | 1530 | { |
1531 | return p->normal_prio; | 1531 | return p->normal_prio; |
1532 | } | 1532 | } |
1533 | # define rt_mutex_adjust_pi(p) do { } while (0) | 1533 | # define rt_mutex_adjust_pi(p) do { } while (0) |
1534 | #endif | 1534 | #endif |
1535 | 1535 | ||
1536 | extern void set_user_nice(struct task_struct *p, long nice); | 1536 | extern void set_user_nice(struct task_struct *p, long nice); |
1537 | extern int task_prio(const struct task_struct *p); | 1537 | extern int task_prio(const struct task_struct *p); |
1538 | extern int task_nice(const struct task_struct *p); | 1538 | extern int task_nice(const struct task_struct *p); |
1539 | extern int can_nice(const struct task_struct *p, const int nice); | 1539 | extern int can_nice(const struct task_struct *p, const int nice); |
1540 | extern int task_curr(const struct task_struct *p); | 1540 | extern int task_curr(const struct task_struct *p); |
1541 | extern int idle_cpu(int cpu); | 1541 | extern int idle_cpu(int cpu); |
1542 | extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); | 1542 | extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); |
1543 | extern struct task_struct *idle_task(int cpu); | 1543 | extern struct task_struct *idle_task(int cpu); |
1544 | extern struct task_struct *curr_task(int cpu); | 1544 | extern struct task_struct *curr_task(int cpu); |
1545 | extern void set_curr_task(int cpu, struct task_struct *p); | 1545 | extern void set_curr_task(int cpu, struct task_struct *p); |
1546 | 1546 | ||
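
A usage sketch for sched_setscheduler() as declared above; SCHED_FIFO and struct sched_param are assumed from the scheduler ABI definitions, and the wrapper itself is hypothetical.

/* Illustrative sketch only -- not part of sched.h. */
static inline int make_task_fifo(struct task_struct *p, int prio)
{
	struct sched_param param = { .sched_priority = prio };

	/* For SCHED_FIFO, valid priorities are 1..MAX_USER_RT_PRIO-1. */
	return sched_setscheduler(p, SCHED_FIFO, &param);
}
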
1547 | void yield(void); | 1547 | void yield(void); |
1548 | 1548 | ||
1549 | /* | 1549 | /* |
1550 | * The default (Linux) execution domain. | 1550 | * The default (Linux) execution domain. |
1551 | */ | 1551 | */ |
1552 | extern struct exec_domain default_exec_domain; | 1552 | extern struct exec_domain default_exec_domain; |
1553 | 1553 | ||
1554 | union thread_union { | 1554 | union thread_union { |
1555 | struct thread_info thread_info; | 1555 | struct thread_info thread_info; |
1556 | unsigned long stack[THREAD_SIZE/sizeof(long)]; | 1556 | unsigned long stack[THREAD_SIZE/sizeof(long)]; |
1557 | }; | 1557 | }; |
1558 | 1558 | ||
1559 | #ifndef __HAVE_ARCH_KSTACK_END | 1559 | #ifndef __HAVE_ARCH_KSTACK_END |
1560 | static inline int kstack_end(void *addr) | 1560 | static inline int kstack_end(void *addr) |
1561 | { | 1561 | { |
1562 | /* Reliable end of stack detection: | 1562 | /* Reliable end of stack detection: |
1563 | * Some APM bios versions misalign the stack | 1563 | * Some APM bios versions misalign the stack |
1564 | */ | 1564 | */ |
1565 | return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); | 1565 | return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); |
1566 | } | 1566 | } |
1567 | #endif | 1567 | #endif |
1568 | 1568 | ||
1569 | extern union thread_union init_thread_union; | 1569 | extern union thread_union init_thread_union; |
1570 | extern struct task_struct init_task; | 1570 | extern struct task_struct init_task; |
1571 | 1571 | ||
1572 | extern struct mm_struct init_mm; | 1572 | extern struct mm_struct init_mm; |
1573 | 1573 | ||
1574 | extern struct pid_namespace init_pid_ns; | 1574 | extern struct pid_namespace init_pid_ns; |
1575 | 1575 | ||
1576 | /* | 1576 | /* |
1577 | * find a task by one of its numerical ids | 1577 | * find a task by one of its numerical ids |
1578 | * | 1578 | * |
1579 | * find_task_by_pid_type_ns(): | 1579 | * find_task_by_pid_type_ns(): |
1580 | * it is the most generic call - it finds a task by id, with the | 1580 | * it is the most generic call - it finds a task by id, with the |
1581 | * type and namespace specified | 1581 | * type and namespace specified |
1582 | * find_task_by_pid_ns(): | 1582 | * find_task_by_pid_ns(): |
1583 | * finds a task by its pid in the specified namespace | 1583 | * finds a task by its pid in the specified namespace |
1584 | * find_task_by_vpid(): | 1584 | * find_task_by_vpid(): |
1585 | * finds a task by its virtual pid | 1585 | * finds a task by its virtual pid |
1586 | * find_task_by_pid(): | 1586 | * find_task_by_pid(): |
1587 | * finds a task by its global pid | 1587 | * finds a task by its global pid |
1588 | * | 1588 | * |
1589 | * see also find_pid() etc in include/linux/pid.h | 1589 | * see also find_pid() etc in include/linux/pid.h |
1590 | */ | 1590 | */ |
1591 | 1591 | ||
1592 | extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, | 1592 | extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, |
1593 | struct pid_namespace *ns); | 1593 | struct pid_namespace *ns); |
1594 | 1594 | ||
1595 | extern struct task_struct *find_task_by_pid(pid_t nr); | 1595 | extern struct task_struct *find_task_by_pid(pid_t nr); |
1596 | extern struct task_struct *find_task_by_vpid(pid_t nr); | 1596 | extern struct task_struct *find_task_by_vpid(pid_t nr); |
1597 | extern struct task_struct *find_task_by_pid_ns(pid_t nr, | 1597 | extern struct task_struct *find_task_by_pid_ns(pid_t nr, |
1598 | struct pid_namespace *ns); | 1598 | struct pid_namespace *ns); |
1599 | 1599 | ||
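
A hedged lookup sketch using find_task_by_vpid() from the declarations above. The usual pattern in this era is to perform the lookup under rcu_read_lock() (or tasklist_lock) and pin the result before releasing it; the helper name is hypothetical.

/* Illustrative sketch only -- not part of sched.h. */
static inline struct task_struct *grab_task_by_vpid(pid_t nr)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(nr);
	if (p)
		get_task_struct(p);	/* caller must put_task_struct() later */
	rcu_read_unlock();

	return p;
}
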
1600 | extern void __set_special_pids(pid_t session, pid_t pgrp); | 1600 | extern void __set_special_pids(pid_t session, pid_t pgrp); |
1601 | 1601 | ||
1602 | /* per-UID process charging. */ | 1602 | /* per-UID process charging. */ |
1603 | extern struct user_struct * alloc_uid(struct user_namespace *, uid_t); | 1603 | extern struct user_struct * alloc_uid(struct user_namespace *, uid_t); |
1604 | static inline struct user_struct *get_uid(struct user_struct *u) | 1604 | static inline struct user_struct *get_uid(struct user_struct *u) |
1605 | { | 1605 | { |
1606 | atomic_inc(&u->__count); | 1606 | atomic_inc(&u->__count); |
1607 | return u; | 1607 | return u; |
1608 | } | 1608 | } |
1609 | extern void free_uid(struct user_struct *); | 1609 | extern void free_uid(struct user_struct *); |
1610 | extern void switch_uid(struct user_struct *); | 1610 | extern void switch_uid(struct user_struct *); |
1611 | extern void release_uids(struct user_namespace *ns); | 1611 | extern void release_uids(struct user_namespace *ns); |
1612 | 1612 | ||
1613 | #include <asm/current.h> | 1613 | #include <asm/current.h> |
1614 | 1614 | ||
1615 | extern void do_timer(unsigned long ticks); | 1615 | extern void do_timer(unsigned long ticks); |
1616 | 1616 | ||
1617 | extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); | 1617 | extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); |
1618 | extern int FASTCALL(wake_up_process(struct task_struct * tsk)); | 1618 | extern int FASTCALL(wake_up_process(struct task_struct * tsk)); |
1619 | extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, | 1619 | extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, |
1620 | unsigned long clone_flags)); | 1620 | unsigned long clone_flags)); |
1621 | #ifdef CONFIG_SMP | 1621 | #ifdef CONFIG_SMP |
1622 | extern void kick_process(struct task_struct *tsk); | 1622 | extern void kick_process(struct task_struct *tsk); |
1623 | #else | 1623 | #else |
1624 | static inline void kick_process(struct task_struct *tsk) { } | 1624 | static inline void kick_process(struct task_struct *tsk) { } |
1625 | #endif | 1625 | #endif |
1626 | extern void sched_fork(struct task_struct *p, int clone_flags); | 1626 | extern void sched_fork(struct task_struct *p, int clone_flags); |
1627 | extern void sched_dead(struct task_struct *p); | 1627 | extern void sched_dead(struct task_struct *p); |
1628 | 1628 | ||
1629 | extern int in_group_p(gid_t); | 1629 | extern int in_group_p(gid_t); |
1630 | extern int in_egroup_p(gid_t); | 1630 | extern int in_egroup_p(gid_t); |
1631 | 1631 | ||
1632 | extern void proc_caches_init(void); | 1632 | extern void proc_caches_init(void); |
1633 | extern void flush_signals(struct task_struct *); | 1633 | extern void flush_signals(struct task_struct *); |
1634 | extern void ignore_signals(struct task_struct *); | 1634 | extern void ignore_signals(struct task_struct *); |
1635 | extern void flush_signal_handlers(struct task_struct *, int force_default); | 1635 | extern void flush_signal_handlers(struct task_struct *, int force_default); |
1636 | extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); | 1636 | extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); |
1637 | 1637 | ||
1638 | static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | 1638 | static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) |
1639 | { | 1639 | { |
1640 | unsigned long flags; | 1640 | unsigned long flags; |
1641 | int ret; | 1641 | int ret; |
1642 | 1642 | ||
1643 | spin_lock_irqsave(&tsk->sighand->siglock, flags); | 1643 | spin_lock_irqsave(&tsk->sighand->siglock, flags); |
1644 | ret = dequeue_signal(tsk, mask, info); | 1644 | ret = dequeue_signal(tsk, mask, info); |
1645 | spin_unlock_irqrestore(&tsk->sighand->siglock, flags); | 1645 | spin_unlock_irqrestore(&tsk->sighand->siglock, flags); |
1646 | 1646 | ||
1647 | return ret; | 1647 | return ret; |
1648 | } | 1648 | } |
1649 | 1649 | ||
1650 | extern void block_all_signals(int (*notifier)(void *priv), void *priv, | 1650 | extern void block_all_signals(int (*notifier)(void *priv), void *priv, |
1651 | sigset_t *mask); | 1651 | sigset_t *mask); |
1652 | extern void unblock_all_signals(void); | 1652 | extern void unblock_all_signals(void); |
1653 | extern void release_task(struct task_struct * p); | 1653 | extern void release_task(struct task_struct * p); |
1654 | extern int send_sig_info(int, struct siginfo *, struct task_struct *); | 1654 | extern int send_sig_info(int, struct siginfo *, struct task_struct *); |
1655 | extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); | 1655 | extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); |
1656 | extern int force_sigsegv(int, struct task_struct *); | 1656 | extern int force_sigsegv(int, struct task_struct *); |
1657 | extern int force_sig_info(int, struct siginfo *, struct task_struct *); | 1657 | extern int force_sig_info(int, struct siginfo *, struct task_struct *); |
1658 | extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); | 1658 | extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); |
1659 | extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); | 1659 | extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); |
1660 | extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); | 1660 | extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); |
1661 | extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32); | 1661 | extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32); |
1662 | extern int kill_pgrp(struct pid *pid, int sig, int priv); | 1662 | extern int kill_pgrp(struct pid *pid, int sig, int priv); |
1663 | extern int kill_pid(struct pid *pid, int sig, int priv); | 1663 | extern int kill_pid(struct pid *pid, int sig, int priv); |
1664 | extern int kill_proc_info(int, struct siginfo *, pid_t); | 1664 | extern int kill_proc_info(int, struct siginfo *, pid_t); |
1665 | extern void do_notify_parent(struct task_struct *, int); | 1665 | extern void do_notify_parent(struct task_struct *, int); |
1666 | extern void force_sig(int, struct task_struct *); | 1666 | extern void force_sig(int, struct task_struct *); |
1667 | extern void force_sig_specific(int, struct task_struct *); | 1667 | extern void force_sig_specific(int, struct task_struct *); |
1668 | extern int send_sig(int, struct task_struct *, int); | 1668 | extern int send_sig(int, struct task_struct *, int); |
1669 | extern void zap_other_threads(struct task_struct *p); | 1669 | extern void zap_other_threads(struct task_struct *p); |
1670 | extern int kill_proc(pid_t, int, int); | 1670 | extern int kill_proc(pid_t, int, int); |
1671 | extern struct sigqueue *sigqueue_alloc(void); | 1671 | extern struct sigqueue *sigqueue_alloc(void); |
1672 | extern void sigqueue_free(struct sigqueue *); | 1672 | extern void sigqueue_free(struct sigqueue *); |
1673 | extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); | 1673 | extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); |
1674 | extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); | 1674 | extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); |
1675 | extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); | 1675 | extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); |
1676 | extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); | 1676 | extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); |
1677 | 1677 | ||
1678 | static inline int kill_cad_pid(int sig, int priv) | 1678 | static inline int kill_cad_pid(int sig, int priv) |
1679 | { | 1679 | { |
1680 | return kill_pid(cad_pid, sig, priv); | 1680 | return kill_pid(cad_pid, sig, priv); |
1681 | } | 1681 | } |
1682 | 1682 | ||
1683 | /* These can be the second arg to send_sig_info/send_group_sig_info. */ | 1683 | /* These can be the second arg to send_sig_info/send_group_sig_info. */ |
1684 | #define SEND_SIG_NOINFO ((struct siginfo *) 0) | 1684 | #define SEND_SIG_NOINFO ((struct siginfo *) 0) |
1685 | #define SEND_SIG_PRIV ((struct siginfo *) 1) | 1685 | #define SEND_SIG_PRIV ((struct siginfo *) 1) |
1686 | #define SEND_SIG_FORCED ((struct siginfo *) 2) | 1686 | #define SEND_SIG_FORCED ((struct siginfo *) 2) |
1687 | 1687 | ||
1688 | static inline int is_si_special(const struct siginfo *info) | 1688 | static inline int is_si_special(const struct siginfo *info) |
1689 | { | 1689 | { |
1690 | return info <= SEND_SIG_FORCED; | 1690 | return info <= SEND_SIG_FORCED; |
1691 | } | 1691 | } |
1692 | 1692 | ||
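
An illustration of passing one of the special siginfo markers above where a struct siginfo pointer is expected; the wrapper is hypothetical.

/* Illustrative sketch only -- not part of sched.h. */
static inline int send_kernel_sig(int sig, struct task_struct *p)
{
	/* SEND_SIG_PRIV marks the signal as originating from the kernel. */
	return send_sig_info(sig, SEND_SIG_PRIV, p);
}
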
1693 | /* True if we are on the alternate signal stack. */ | 1693 | /* True if we are on the alternate signal stack. */ |
1694 | 1694 | ||
1695 | static inline int on_sig_stack(unsigned long sp) | 1695 | static inline int on_sig_stack(unsigned long sp) |
1696 | { | 1696 | { |
1697 | return (sp - current->sas_ss_sp < current->sas_ss_size); | 1697 | return (sp - current->sas_ss_sp < current->sas_ss_size); |
1698 | } | 1698 | } |
1699 | 1699 | ||
1700 | static inline int sas_ss_flags(unsigned long sp) | 1700 | static inline int sas_ss_flags(unsigned long sp) |
1701 | { | 1701 | { |
1702 | return (current->sas_ss_size == 0 ? SS_DISABLE | 1702 | return (current->sas_ss_size == 0 ? SS_DISABLE |
1703 | : on_sig_stack(sp) ? SS_ONSTACK : 0); | 1703 | : on_sig_stack(sp) ? SS_ONSTACK : 0); |
1704 | } | 1704 | } |
1705 | 1705 | ||
1706 | /* | 1706 | /* |
1707 | * Routines for handling mm_structs | 1707 | * Routines for handling mm_structs |
1708 | */ | 1708 | */ |
1709 | extern struct mm_struct * mm_alloc(void); | 1709 | extern struct mm_struct * mm_alloc(void); |
1710 | 1710 | ||
1711 | /* mmdrop drops the mm and the page tables */ | 1711 | /* mmdrop drops the mm and the page tables */ |
1712 | extern void FASTCALL(__mmdrop(struct mm_struct *)); | 1712 | extern void FASTCALL(__mmdrop(struct mm_struct *)); |
1713 | static inline void mmdrop(struct mm_struct * mm) | 1713 | static inline void mmdrop(struct mm_struct * mm) |
1714 | { | 1714 | { |
1715 | if (unlikely(atomic_dec_and_test(&mm->mm_count))) | 1715 | if (unlikely(atomic_dec_and_test(&mm->mm_count))) |
1716 | __mmdrop(mm); | 1716 | __mmdrop(mm); |
1717 | } | 1717 | } |
1718 | 1718 | ||
1719 | /* mmput gets rid of the mappings and all user-space */ | 1719 | /* mmput gets rid of the mappings and all user-space */ |
1720 | extern void mmput(struct mm_struct *); | 1720 | extern void mmput(struct mm_struct *); |
1721 | /* Grab a reference to a task's mm, if it is not already going away */ | 1721 | /* Grab a reference to a task's mm, if it is not already going away */ |
1722 | extern struct mm_struct *get_task_mm(struct task_struct *task); | 1722 | extern struct mm_struct *get_task_mm(struct task_struct *task); |
1723 | /* Remove the current tasks stale references to the old mm_struct */ | 1723 | /* Remove the current tasks stale references to the old mm_struct */ |
1724 | extern void mm_release(struct task_struct *, struct mm_struct *); | 1724 | extern void mm_release(struct task_struct *, struct mm_struct *); |
1725 | 1725 | ||
1726 | extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); | 1726 | extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); |
1727 | extern void flush_thread(void); | 1727 | extern void flush_thread(void); |
1728 | extern void exit_thread(void); | 1728 | extern void exit_thread(void); |
1729 | 1729 | ||
1730 | extern void exit_files(struct task_struct *); | 1730 | extern void exit_files(struct task_struct *); |
1731 | extern void __cleanup_signal(struct signal_struct *); | 1731 | extern void __cleanup_signal(struct signal_struct *); |
1732 | extern void __cleanup_sighand(struct sighand_struct *); | 1732 | extern void __cleanup_sighand(struct sighand_struct *); |
1733 | extern void exit_itimers(struct signal_struct *); | 1733 | extern void exit_itimers(struct signal_struct *); |
1734 | 1734 | ||
1735 | extern NORET_TYPE void do_group_exit(int); | 1735 | extern NORET_TYPE void do_group_exit(int); |
1736 | 1736 | ||
1737 | extern void daemonize(const char *, ...); | 1737 | extern void daemonize(const char *, ...); |
1738 | extern int allow_signal(int); | 1738 | extern int allow_signal(int); |
1739 | extern int disallow_signal(int); | 1739 | extern int disallow_signal(int); |
1740 | 1740 | ||
1741 | extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); | 1741 | extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); |
1742 | extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); | 1742 | extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); |
1743 | struct task_struct *fork_idle(int); | 1743 | struct task_struct *fork_idle(int); |
1744 | 1744 | ||
1745 | extern void set_task_comm(struct task_struct *tsk, char *from); | 1745 | extern void set_task_comm(struct task_struct *tsk, char *from); |
1746 | extern void get_task_comm(char *to, struct task_struct *tsk); | 1746 | extern void get_task_comm(char *to, struct task_struct *tsk); |
1747 | 1747 | ||
1748 | #ifdef CONFIG_SMP | 1748 | #ifdef CONFIG_SMP |
1749 | extern void wait_task_inactive(struct task_struct * p); | 1749 | extern void wait_task_inactive(struct task_struct * p); |
1750 | #else | 1750 | #else |
1751 | #define wait_task_inactive(p) do { } while (0) | 1751 | #define wait_task_inactive(p) do { } while (0) |
1752 | #endif | 1752 | #endif |
1753 | 1753 | ||
1754 | #define remove_parent(p) list_del_init(&(p)->sibling) | 1754 | #define remove_parent(p) list_del_init(&(p)->sibling) |
1755 | #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) | 1755 | #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) |
1756 | 1756 | ||
1757 | #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) | 1757 | #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) |
1758 | 1758 | ||
1759 | #define for_each_process(p) \ | 1759 | #define for_each_process(p) \ |
1760 | for (p = &init_task ; (p = next_task(p)) != &init_task ; ) | 1760 | for (p = &init_task ; (p = next_task(p)) != &init_task ; ) |
1761 | 1761 | ||
1762 | /* | 1762 | /* |
1763 | * Careful: do_each_thread/while_each_thread is a double loop so | 1763 | * Careful: do_each_thread/while_each_thread is a double loop so |
1764 | * 'break' will not work as expected - use goto instead. | 1764 | * 'break' will not work as expected - use goto instead. |
1765 | */ | 1765 | */ |
1766 | #define do_each_thread(g, t) \ | 1766 | #define do_each_thread(g, t) \ |
1767 | for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do | 1767 | for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do |
1768 | 1768 | ||
1769 | #define while_each_thread(g, t) \ | 1769 | #define while_each_thread(g, t) \ |
1770 | while ((t = next_thread(t)) != g) | 1770 | while ((t = next_thread(t)) != g) |
1771 | 1771 | ||
1772 | /* de_thread depends on thread_group_leader not being a pid based check */ | 1772 | /* de_thread depends on thread_group_leader not being a pid based check */ |
1773 | #define thread_group_leader(p) (p == p->group_leader) | 1773 | #define thread_group_leader(p) (p == p->group_leader) |
1774 | 1774 | ||
1775 | /* Due to the insanities of de_thread it is possible for a process | 1775 | /* Due to the insanities of de_thread it is possible for a process |
1776 | * to have the pid of the thread group leader without actually being | 1776 | * to have the pid of the thread group leader without actually being |
1777 | * the thread group leader. For iteration through the pids in proc | 1777 | * the thread group leader. For iteration through the pids in proc |
1778 | * all we care about is that we have a task with the appropriate | 1778 | * all we care about is that we have a task with the appropriate |
1779 | * pid, we don't actually care if we have the right task. | 1779 | * pid, we don't actually care if we have the right task. |
1780 | */ | 1780 | */ |
1781 | static inline int has_group_leader_pid(struct task_struct *p) | 1781 | static inline int has_group_leader_pid(struct task_struct *p) |
1782 | { | 1782 | { |
1783 | return p->pid == p->tgid; | 1783 | return p->pid == p->tgid; |
1784 | } | 1784 | } |
1785 | 1785 | ||
1786 | static inline | 1786 | static inline |
1787 | int same_thread_group(struct task_struct *p1, struct task_struct *p2) | 1787 | int same_thread_group(struct task_struct *p1, struct task_struct *p2) |
1788 | { | 1788 | { |
1789 | return p1->tgid == p2->tgid; | 1789 | return p1->tgid == p2->tgid; |
1790 | } | 1790 | } |
1791 | 1791 | ||
1792 | static inline struct task_struct *next_thread(const struct task_struct *p) | 1792 | static inline struct task_struct *next_thread(const struct task_struct *p) |
1793 | { | 1793 | { |
1794 | return list_entry(rcu_dereference(p->thread_group.next), | 1794 | return list_entry(rcu_dereference(p->thread_group.next), |
1795 | struct task_struct, thread_group); | 1795 | struct task_struct, thread_group); |
1796 | } | 1796 | } |
1797 | 1797 | ||
1798 | static inline int thread_group_empty(struct task_struct *p) | 1798 | static inline int thread_group_empty(struct task_struct *p) |
1799 | { | 1799 | { |
1800 | return list_empty(&p->thread_group); | 1800 | return list_empty(&p->thread_group); |
1801 | } | 1801 | } |
1802 | 1802 | ||
1803 | #define delay_group_leader(p) \ | 1803 | #define delay_group_leader(p) \ |
1804 | (thread_group_leader(p) && !thread_group_empty(p)) | 1804 | (thread_group_leader(p) && !thread_group_empty(p)) |
1805 | 1805 | ||
1806 | /* | 1806 | /* |
1807 | * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring | 1807 | * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring |
1808 | * subscriptions and synchronises with wait4(). Also used in procfs. Also | 1808 | * subscriptions and synchronises with wait4(). Also used in procfs. Also |
1809 | * pins the final release of task.io_context. Also protects ->cpuset and | 1809 | * pins the final release of task.io_context. Also protects ->cpuset and |
1810 | * ->cgroup.subsys[]. | 1810 | * ->cgroup.subsys[]. |
1811 | * | 1811 | * |
1812 | * Nests both inside and outside of read_lock(&tasklist_lock). | 1812 | * Nests both inside and outside of read_lock(&tasklist_lock). |
1813 | * It must not be nested with write_lock_irq(&tasklist_lock), | 1813 | * It must not be nested with write_lock_irq(&tasklist_lock), |
1814 | * neither inside nor outside. | 1814 | * neither inside nor outside. |
1815 | */ | 1815 | */ |
1816 | static inline void task_lock(struct task_struct *p) | 1816 | static inline void task_lock(struct task_struct *p) |
1817 | { | 1817 | { |
1818 | spin_lock(&p->alloc_lock); | 1818 | spin_lock(&p->alloc_lock); |
1819 | } | 1819 | } |
1820 | 1820 | ||
1821 | static inline void task_unlock(struct task_struct *p) | 1821 | static inline void task_unlock(struct task_struct *p) |
1822 | { | 1822 | { |
1823 | spin_unlock(&p->alloc_lock); | 1823 | spin_unlock(&p->alloc_lock); |
1824 | } | 1824 | } |
1825 | 1825 | ||
1826 | extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk, | 1826 | extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk, |
1827 | unsigned long *flags); | 1827 | unsigned long *flags); |
1828 | 1828 | ||
1829 | static inline void unlock_task_sighand(struct task_struct *tsk, | 1829 | static inline void unlock_task_sighand(struct task_struct *tsk, |
1830 | unsigned long *flags) | 1830 | unsigned long *flags) |
1831 | { | 1831 | { |
1832 | spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); | 1832 | spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); |
1833 | } | 1833 | } |
1834 | 1834 | ||
1835 | #ifndef __HAVE_THREAD_FUNCTIONS | 1835 | #ifndef __HAVE_THREAD_FUNCTIONS |
1836 | 1836 | ||
1837 | #define task_thread_info(task) ((struct thread_info *)(task)->stack) | 1837 | #define task_thread_info(task) ((struct thread_info *)(task)->stack) |
1838 | #define task_stack_page(task) ((task)->stack) | 1838 | #define task_stack_page(task) ((task)->stack) |
1839 | 1839 | ||
1840 | static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) | 1840 | static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) |
1841 | { | 1841 | { |
1842 | *task_thread_info(p) = *task_thread_info(org); | 1842 | *task_thread_info(p) = *task_thread_info(org); |
1843 | task_thread_info(p)->task = p; | 1843 | task_thread_info(p)->task = p; |
1844 | } | 1844 | } |
1845 | 1845 | ||
1846 | static inline unsigned long *end_of_stack(struct task_struct *p) | 1846 | static inline unsigned long *end_of_stack(struct task_struct *p) |
1847 | { | 1847 | { |
1848 | return (unsigned long *)(task_thread_info(p) + 1); | 1848 | return (unsigned long *)(task_thread_info(p) + 1); |
1849 | } | 1849 | } |
1850 | 1850 | ||
1851 | #endif | 1851 | #endif |
1852 | 1852 | ||
1853 | /* set thread flags in other task's structures | 1853 | /* set thread flags in other task's structures |
1854 | * - see asm/thread_info.h for TIF_xxxx flags available | 1854 | * - see asm/thread_info.h for TIF_xxxx flags available |
1855 | */ | 1855 | */ |
1856 | static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) | 1856 | static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) |
1857 | { | 1857 | { |
1858 | set_ti_thread_flag(task_thread_info(tsk), flag); | 1858 | set_ti_thread_flag(task_thread_info(tsk), flag); |
1859 | } | 1859 | } |
1860 | 1860 | ||
1861 | static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) | 1861 | static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) |
1862 | { | 1862 | { |
1863 | clear_ti_thread_flag(task_thread_info(tsk), flag); | 1863 | clear_ti_thread_flag(task_thread_info(tsk), flag); |
1864 | } | 1864 | } |
1865 | 1865 | ||
1866 | static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) | 1866 | static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) |
1867 | { | 1867 | { |
1868 | return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); | 1868 | return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); |
1869 | } | 1869 | } |
1870 | 1870 | ||
1871 | static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) | 1871 | static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) |
1872 | { | 1872 | { |
1873 | return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); | 1873 | return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); |
1874 | } | 1874 | } |
1875 | 1875 | ||
1876 | static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) | 1876 | static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) |
1877 | { | 1877 | { |
1878 | return test_ti_thread_flag(task_thread_info(tsk), flag); | 1878 | return test_ti_thread_flag(task_thread_info(tsk), flag); |
1879 | } | 1879 | } |
1880 | 1880 | ||
1881 | static inline void set_tsk_need_resched(struct task_struct *tsk) | 1881 | static inline void set_tsk_need_resched(struct task_struct *tsk) |
1882 | { | 1882 | { |
1883 | set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); | 1883 | set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); |
1884 | } | 1884 | } |
1885 | 1885 | ||
1886 | static inline void clear_tsk_need_resched(struct task_struct *tsk) | 1886 | static inline void clear_tsk_need_resched(struct task_struct *tsk) |
1887 | { | 1887 | { |
1888 | clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); | 1888 | clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); |
1889 | } | 1889 | } |
1890 | 1890 | ||
1891 | static inline int signal_pending(struct task_struct *p) | 1891 | static inline int signal_pending(struct task_struct *p) |
1892 | { | 1892 | { |
1893 | return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); | 1893 | return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); |
1894 | } | 1894 | } |
1895 | 1895 | ||
1896 | static inline int need_resched(void) | 1896 | static inline int need_resched(void) |
1897 | { | 1897 | { |
1898 | return unlikely(test_thread_flag(TIF_NEED_RESCHED)); | 1898 | return unlikely(test_thread_flag(TIF_NEED_RESCHED)); |
1899 | } | 1899 | } |
1900 | 1900 | ||
1901 | /* | 1901 | /* |
1902 | * cond_resched() and cond_resched_lock(): latency reduction via | 1902 | * cond_resched() and cond_resched_lock(): latency reduction via |
1903 | * explicit rescheduling in places that are safe. The return | 1903 | * explicit rescheduling in places that are safe. The return |
1904 | * value indicates whether a reschedule was done in fact. | 1904 | * value indicates whether a reschedule was done in fact. |
1905 | * cond_resched_lock() will drop the spinlock before scheduling, | 1905 | * cond_resched_lock() will drop the spinlock before scheduling, |
1906 | * cond_resched_softirq() will enable bhs before scheduling. | 1906 | * cond_resched_softirq() will enable bhs before scheduling. |
1907 | */ | 1907 | */ |
1908 | #ifdef CONFIG_PREEMPT | 1908 | #ifdef CONFIG_PREEMPT |
1909 | static inline int cond_resched(void) | 1909 | static inline int cond_resched(void) |
1910 | { | 1910 | { |
1911 | return 0; | 1911 | return 0; |
1912 | } | 1912 | } |
1913 | #else | 1913 | #else |
1914 | extern int _cond_resched(void); | 1914 | extern int _cond_resched(void); |
1915 | static inline int cond_resched(void) | 1915 | static inline int cond_resched(void) |
1916 | { | 1916 | { |
1917 | return _cond_resched(); | 1917 | return _cond_resched(); |
1918 | } | 1918 | } |
1919 | #endif | 1919 | #endif |
1920 | extern int cond_resched_lock(spinlock_t * lock); | 1920 | extern int cond_resched_lock(spinlock_t * lock); |
1921 | extern int cond_resched_softirq(void); | 1921 | extern int cond_resched_softirq(void); |
1922 | 1922 | ||
1923 | /* | 1923 | /* |
1924 | * Does a critical section need to be broken due to another | 1924 | * Does a critical section need to be broken due to another |
1925 | * task waiting?: | 1925 | * task waiting?: (technically does not depend on CONFIG_PREEMPT, |
1926 | * but a general need for low latency) | ||
1926 | */ | 1927 | */ |
1927 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) | 1928 | static inline int spin_needbreak(spinlock_t *lock) |
1928 | # define need_lockbreak(lock) ((lock)->break_lock) | ||
1929 | #else | ||
1930 | # define need_lockbreak(lock) 0 | ||
1931 | #endif | ||
1932 | |||
1933 | /* | ||
1934 | * Does a critical section need to be broken due to another | ||
1935 | * task waiting or preemption being signalled: | ||
1936 | */ | ||
1937 | static inline int lock_need_resched(spinlock_t *lock) | ||
1938 | { | 1929 | { |
1939 | if (need_lockbreak(lock) || need_resched()) | 1930 | #ifdef CONFIG_PREEMPT |
1940 | return 1; | 1931 | return spin_is_contended(lock); |
1932 | #else | ||
1941 | return 0; | 1933 | return 0; |
1934 | #endif | ||
1942 | } | 1935 | } |
1943 | 1936 | ||
1944 | /* | 1937 | /* |
1945 | * Reevaluate whether the task has signals pending delivery. | 1938 | * Reevaluate whether the task has signals pending delivery. |
1946 | * Wake the task if so. | 1939 | * Wake the task if so. |
1947 | * This is required every time the blocked sigset_t changes. | 1940 | * This is required every time the blocked sigset_t changes. |
1948 | * callers must hold sighand->siglock. | 1941 | * callers must hold sighand->siglock. |
1949 | */ | 1942 | */ |
1950 | extern void recalc_sigpending_and_wake(struct task_struct *t); | 1943 | extern void recalc_sigpending_and_wake(struct task_struct *t); |
1951 | extern void recalc_sigpending(void); | 1944 | extern void recalc_sigpending(void); |
1952 | 1945 | ||
1953 | extern void signal_wake_up(struct task_struct *t, int resume_stopped); | 1946 | extern void signal_wake_up(struct task_struct *t, int resume_stopped); |
1954 | 1947 | ||
1955 | /* | 1948 | /* |
1956 | * Wrappers for p->thread_info->cpu access. No-op on UP. | 1949 | * Wrappers for p->thread_info->cpu access. No-op on UP. |
1957 | */ | 1950 | */ |
1958 | #ifdef CONFIG_SMP | 1951 | #ifdef CONFIG_SMP |
1959 | 1952 | ||
1960 | static inline unsigned int task_cpu(const struct task_struct *p) | 1953 | static inline unsigned int task_cpu(const struct task_struct *p) |
1961 | { | 1954 | { |
1962 | return task_thread_info(p)->cpu; | 1955 | return task_thread_info(p)->cpu; |
1963 | } | 1956 | } |
1964 | 1957 | ||
1965 | extern void set_task_cpu(struct task_struct *p, unsigned int cpu); | 1958 | extern void set_task_cpu(struct task_struct *p, unsigned int cpu); |
1966 | 1959 | ||
1967 | #else | 1960 | #else |
1968 | 1961 | ||
1969 | static inline unsigned int task_cpu(const struct task_struct *p) | 1962 | static inline unsigned int task_cpu(const struct task_struct *p) |
1970 | { | 1963 | { |
1971 | return 0; | 1964 | return 0; |
1972 | } | 1965 | } |
1973 | 1966 | ||
1974 | static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) | 1967 | static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) |
1975 | { | 1968 | { |
1976 | } | 1969 | } |
1977 | 1970 | ||
1978 | #endif /* CONFIG_SMP */ | 1971 | #endif /* CONFIG_SMP */ |
1979 | 1972 | ||
1980 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT | 1973 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT |
1981 | extern void arch_pick_mmap_layout(struct mm_struct *mm); | 1974 | extern void arch_pick_mmap_layout(struct mm_struct *mm); |
1982 | #else | 1975 | #else |
1983 | static inline void arch_pick_mmap_layout(struct mm_struct *mm) | 1976 | static inline void arch_pick_mmap_layout(struct mm_struct *mm) |
1984 | { | 1977 | { |
1985 | mm->mmap_base = TASK_UNMAPPED_BASE; | 1978 | mm->mmap_base = TASK_UNMAPPED_BASE; |
1986 | mm->get_unmapped_area = arch_get_unmapped_area; | 1979 | mm->get_unmapped_area = arch_get_unmapped_area; |
1987 | mm->unmap_area = arch_unmap_area; | 1980 | mm->unmap_area = arch_unmap_area; |
1988 | } | 1981 | } |
1989 | #endif | 1982 | #endif |
1990 | 1983 | ||
1991 | extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); | 1984 | extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); |
1992 | extern long sched_getaffinity(pid_t pid, cpumask_t *mask); | 1985 | extern long sched_getaffinity(pid_t pid, cpumask_t *mask); |
1993 | 1986 | ||
1994 | extern int sched_mc_power_savings, sched_smt_power_savings; | 1987 | extern int sched_mc_power_savings, sched_smt_power_savings; |
1995 | 1988 | ||
1996 | extern void normalize_rt_tasks(void); | 1989 | extern void normalize_rt_tasks(void); |
1997 | 1990 | ||
1998 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1991 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1999 | 1992 | ||
2000 | extern struct task_group init_task_group; | 1993 | extern struct task_group init_task_group; |
2001 | 1994 | ||
2002 | extern struct task_group *sched_create_group(void); | 1995 | extern struct task_group *sched_create_group(void); |
2003 | extern void sched_destroy_group(struct task_group *tg); | 1996 | extern void sched_destroy_group(struct task_group *tg); |
2004 | extern void sched_move_task(struct task_struct *tsk); | 1997 | extern void sched_move_task(struct task_struct *tsk); |
2005 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | 1998 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); |
2006 | extern unsigned long sched_group_shares(struct task_group *tg); | 1999 | extern unsigned long sched_group_shares(struct task_group *tg); |
2007 | 2000 | ||
2008 | #endif | 2001 | #endif |
2009 | 2002 | ||
2010 | #ifdef CONFIG_TASK_XACCT | 2003 | #ifdef CONFIG_TASK_XACCT |
2011 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) | 2004 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) |
2012 | { | 2005 | { |
2013 | tsk->rchar += amt; | 2006 | tsk->rchar += amt; |
2014 | } | 2007 | } |
2015 | 2008 | ||
2016 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) | 2009 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) |
2017 | { | 2010 | { |
2018 | tsk->wchar += amt; | 2011 | tsk->wchar += amt; |
2019 | } | 2012 | } |
2020 | 2013 | ||
2021 | static inline void inc_syscr(struct task_struct *tsk) | 2014 | static inline void inc_syscr(struct task_struct *tsk) |
2022 | { | 2015 | { |
2023 | tsk->syscr++; | 2016 | tsk->syscr++; |
2024 | } | 2017 | } |
2025 | 2018 | ||
2026 | static inline void inc_syscw(struct task_struct *tsk) | 2019 | static inline void inc_syscw(struct task_struct *tsk) |
2027 | { | 2020 | { |
2028 | tsk->syscw++; | 2021 | tsk->syscw++; |
2029 | } | 2022 | } |
2030 | #else | 2023 | #else |
2031 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) | 2024 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) |
2032 | { | 2025 | { |
2033 | } | 2026 | } |
2034 | 2027 | ||
2035 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) | 2028 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) |
2036 | { | 2029 | { |
2037 | } | 2030 | } |
2038 | 2031 | ||
2039 | static inline void inc_syscr(struct task_struct *tsk) | 2032 | static inline void inc_syscr(struct task_struct *tsk) |
2040 | { | 2033 | { |
2041 | } | 2034 | } |
2042 | 2035 | ||
2043 | static inline void inc_syscw(struct task_struct *tsk) | 2036 | static inline void inc_syscw(struct task_struct *tsk) |
2044 | { | 2037 | { |
2045 | } | 2038 | } |
2046 | #endif | 2039 | #endif |
2047 | 2040 | ||
2048 | #ifdef CONFIG_SMP | 2041 | #ifdef CONFIG_SMP |
2049 | void migration_init(void); | 2042 | void migration_init(void); |
2050 | #else | 2043 | #else |
2051 | static inline void migration_init(void) | 2044 | static inline void migration_init(void) |
2052 | { | 2045 | { |
2053 | } | 2046 | } |
2054 | #endif | 2047 | #endif |
2055 | 2048 | ||
2056 | #endif /* __KERNEL__ */ | 2049 | #endif /* __KERNEL__ */ |
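
The spin_needbreak() helper introduced above replaces the old need_lockbreak()/lock_need_resched() pair. Below is a minimal sketch of the caller-side pattern it enables: a long-running critical section voluntarily drops the lock when a waiter or a pending reschedule is detected. The worker function and its loop body are hypothetical, used only for illustration.

#include <linux/sched.h>
#include <linux/spinlock.h>

/* Hypothetical batch worker, not part of this commit. */
static void process_many(spinlock_t *lock, int nr)
{
	int i;

	spin_lock(lock);
	for (i = 0; i < nr; i++) {
		/* ... do one unit of work under the lock ... */
		if (need_resched() || spin_needbreak(lock)) {
			/* Let waiters and the scheduler make progress. */
			spin_unlock(lock);
			cond_resched();
			spin_lock(lock);
		}
	}
	spin_unlock(lock);
}

Because spin_needbreak() is a typed inline function rather than a macro, the compiler checks that callers actually pass a spinlock_t *.
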
include/linux/spinlock.h
1 | #ifndef __LINUX_SPINLOCK_H | 1 | #ifndef __LINUX_SPINLOCK_H |
2 | #define __LINUX_SPINLOCK_H | 2 | #define __LINUX_SPINLOCK_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * include/linux/spinlock.h - generic spinlock/rwlock declarations | 5 | * include/linux/spinlock.h - generic spinlock/rwlock declarations |
6 | * | 6 | * |
7 | * here's the role of the various spinlock/rwlock related include files: | 7 | * here's the role of the various spinlock/rwlock related include files: |
8 | * | 8 | * |
9 | * on SMP builds: | 9 | * on SMP builds: |
10 | * | 10 | * |
11 | * asm/spinlock_types.h: contains the raw_spinlock_t/raw_rwlock_t and the | 11 | * asm/spinlock_types.h: contains the raw_spinlock_t/raw_rwlock_t and the |
12 | * initializers | 12 | * initializers |
13 | * | 13 | * |
14 | * linux/spinlock_types.h: | 14 | * linux/spinlock_types.h: |
15 | * defines the generic type and initializers | 15 | * defines the generic type and initializers |
16 | * | 16 | * |
17 | * asm/spinlock.h: contains the __raw_spin_*()/etc. lowlevel | 17 | * asm/spinlock.h: contains the __raw_spin_*()/etc. lowlevel |
18 | * implementations, mostly inline assembly code | 18 | * implementations, mostly inline assembly code |
19 | * | 19 | * |
20 | * (also included on UP-debug builds:) | 20 | * (also included on UP-debug builds:) |
21 | * | 21 | * |
22 | * linux/spinlock_api_smp.h: | 22 | * linux/spinlock_api_smp.h: |
23 | * contains the prototypes for the _spin_*() APIs. | 23 | * contains the prototypes for the _spin_*() APIs. |
24 | * | 24 | * |
25 | * linux/spinlock.h: builds the final spin_*() APIs. | 25 | * linux/spinlock.h: builds the final spin_*() APIs. |
26 | * | 26 | * |
27 | * on UP builds: | 27 | * on UP builds: |
28 | * | 28 | * |
29 | * linux/spinlock_type_up.h: | 29 | * linux/spinlock_type_up.h: |
30 | * contains the generic, simplified UP spinlock type. | 30 | * contains the generic, simplified UP spinlock type. |
31 | * (which is an empty structure on non-debug builds) | 31 | * (which is an empty structure on non-debug builds) |
32 | * | 32 | * |
33 | * linux/spinlock_types.h: | 33 | * linux/spinlock_types.h: |
34 | * defines the generic type and initializers | 34 | * defines the generic type and initializers |
35 | * | 35 | * |
36 | * linux/spinlock_up.h: | 36 | * linux/spinlock_up.h: |
37 | * contains the __raw_spin_*()/etc. version of UP | 37 | * contains the __raw_spin_*()/etc. version of UP |
38 | * builds. (which are NOPs on non-debug, non-preempt | 38 | * builds. (which are NOPs on non-debug, non-preempt |
39 | * builds) | 39 | * builds) |
40 | * | 40 | * |
41 | * (included on UP-non-debug builds:) | 41 | * (included on UP-non-debug builds:) |
42 | * | 42 | * |
43 | * linux/spinlock_api_up.h: | 43 | * linux/spinlock_api_up.h: |
44 | * builds the _spin_*() APIs. | 44 | * builds the _spin_*() APIs. |
45 | * | 45 | * |
46 | * linux/spinlock.h: builds the final spin_*() APIs. | 46 | * linux/spinlock.h: builds the final spin_*() APIs. |
47 | */ | 47 | */ |
48 | 48 | ||
49 | #include <linux/preempt.h> | 49 | #include <linux/preempt.h> |
50 | #include <linux/linkage.h> | 50 | #include <linux/linkage.h> |
51 | #include <linux/compiler.h> | 51 | #include <linux/compiler.h> |
52 | #include <linux/thread_info.h> | 52 | #include <linux/thread_info.h> |
53 | #include <linux/kernel.h> | 53 | #include <linux/kernel.h> |
54 | #include <linux/stringify.h> | 54 | #include <linux/stringify.h> |
55 | #include <linux/bottom_half.h> | 55 | #include <linux/bottom_half.h> |
56 | 56 | ||
57 | #include <asm/system.h> | 57 | #include <asm/system.h> |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * Must define these before including other files, inline functions need them | 60 | * Must define these before including other files, inline functions need them |
61 | */ | 61 | */ |
62 | #define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME | 62 | #define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME |
63 | 63 | ||
64 | #define LOCK_SECTION_START(extra) \ | 64 | #define LOCK_SECTION_START(extra) \ |
65 | ".subsection 1\n\t" \ | 65 | ".subsection 1\n\t" \ |
66 | extra \ | 66 | extra \ |
67 | ".ifndef " LOCK_SECTION_NAME "\n\t" \ | 67 | ".ifndef " LOCK_SECTION_NAME "\n\t" \ |
68 | LOCK_SECTION_NAME ":\n\t" \ | 68 | LOCK_SECTION_NAME ":\n\t" \ |
69 | ".endif\n" | 69 | ".endif\n" |
70 | 70 | ||
71 | #define LOCK_SECTION_END \ | 71 | #define LOCK_SECTION_END \ |
72 | ".previous\n\t" | 72 | ".previous\n\t" |
73 | 73 | ||
74 | #define __lockfunc fastcall __attribute__((section(".spinlock.text"))) | 74 | #define __lockfunc fastcall __attribute__((section(".spinlock.text"))) |
75 | 75 | ||
76 | /* | 76 | /* |
77 | * Pull the raw_spinlock_t and raw_rwlock_t definitions: | 77 | * Pull the raw_spinlock_t and raw_rwlock_t definitions: |
78 | */ | 78 | */ |
79 | #include <linux/spinlock_types.h> | 79 | #include <linux/spinlock_types.h> |
80 | 80 | ||
81 | extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock); | 81 | extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock); |
82 | 82 | ||
83 | /* | 83 | /* |
84 | * Pull the __raw*() functions/declarations (UP-nondebug doesn't need them): | 84 | * Pull the __raw*() functions/declarations (UP-nondebug doesn't need them): |
85 | */ | 85 | */ |
86 | #ifdef CONFIG_SMP | 86 | #ifdef CONFIG_SMP |
87 | # include <asm/spinlock.h> | 87 | # include <asm/spinlock.h> |
88 | #else | 88 | #else |
89 | # include <linux/spinlock_up.h> | 89 | # include <linux/spinlock_up.h> |
90 | #endif | 90 | #endif |
91 | 91 | ||
92 | #ifdef CONFIG_DEBUG_SPINLOCK | 92 | #ifdef CONFIG_DEBUG_SPINLOCK |
93 | extern void __spin_lock_init(spinlock_t *lock, const char *name, | 93 | extern void __spin_lock_init(spinlock_t *lock, const char *name, |
94 | struct lock_class_key *key); | 94 | struct lock_class_key *key); |
95 | # define spin_lock_init(lock) \ | 95 | # define spin_lock_init(lock) \ |
96 | do { \ | 96 | do { \ |
97 | static struct lock_class_key __key; \ | 97 | static struct lock_class_key __key; \ |
98 | \ | 98 | \ |
99 | __spin_lock_init((lock), #lock, &__key); \ | 99 | __spin_lock_init((lock), #lock, &__key); \ |
100 | } while (0) | 100 | } while (0) |
101 | 101 | ||
102 | #else | 102 | #else |
103 | # define spin_lock_init(lock) \ | 103 | # define spin_lock_init(lock) \ |
104 | do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) | 104 | do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) |
105 | #endif | 105 | #endif |
106 | 106 | ||
107 | #ifdef CONFIG_DEBUG_SPINLOCK | 107 | #ifdef CONFIG_DEBUG_SPINLOCK |
108 | extern void __rwlock_init(rwlock_t *lock, const char *name, | 108 | extern void __rwlock_init(rwlock_t *lock, const char *name, |
109 | struct lock_class_key *key); | 109 | struct lock_class_key *key); |
110 | # define rwlock_init(lock) \ | 110 | # define rwlock_init(lock) \ |
111 | do { \ | 111 | do { \ |
112 | static struct lock_class_key __key; \ | 112 | static struct lock_class_key __key; \ |
113 | \ | 113 | \ |
114 | __rwlock_init((lock), #lock, &__key); \ | 114 | __rwlock_init((lock), #lock, &__key); \ |
115 | } while (0) | 115 | } while (0) |
116 | #else | 116 | #else |
117 | # define rwlock_init(lock) \ | 117 | # define rwlock_init(lock) \ |
118 | do { *(lock) = RW_LOCK_UNLOCKED; } while (0) | 118 | do { *(lock) = RW_LOCK_UNLOCKED; } while (0) |
119 | #endif | 119 | #endif |
120 | 120 | ||
121 | #define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) | 121 | #define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) |
122 | 122 | ||
123 | #ifdef CONFIG_GENERIC_LOCKBREAK | ||
124 | #define spin_is_contended(lock) ((lock)->break_lock) | ||
125 | #else | ||
126 | #define spin_is_contended(lock) __raw_spin_is_contended(&(lock)->raw_lock) | ||
127 | #endif | ||
128 | |||
123 | /** | 129 | /** |
124 | * spin_unlock_wait - wait until the spinlock gets unlocked | 130 | * spin_unlock_wait - wait until the spinlock gets unlocked |
125 | * @lock: the spinlock in question. | 131 | * @lock: the spinlock in question. |
126 | */ | 132 | */ |
127 | #define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) | 133 | #define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) |
128 | 134 | ||
129 | /* | 135 | /* |
130 | * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: | 136 | * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: |
131 | */ | 137 | */ |
132 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) | 138 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) |
133 | # include <linux/spinlock_api_smp.h> | 139 | # include <linux/spinlock_api_smp.h> |
134 | #else | 140 | #else |
135 | # include <linux/spinlock_api_up.h> | 141 | # include <linux/spinlock_api_up.h> |
136 | #endif | 142 | #endif |
137 | 143 | ||
138 | #ifdef CONFIG_DEBUG_SPINLOCK | 144 | #ifdef CONFIG_DEBUG_SPINLOCK |
139 | extern void _raw_spin_lock(spinlock_t *lock); | 145 | extern void _raw_spin_lock(spinlock_t *lock); |
140 | #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) | 146 | #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) |
141 | extern int _raw_spin_trylock(spinlock_t *lock); | 147 | extern int _raw_spin_trylock(spinlock_t *lock); |
142 | extern void _raw_spin_unlock(spinlock_t *lock); | 148 | extern void _raw_spin_unlock(spinlock_t *lock); |
143 | extern void _raw_read_lock(rwlock_t *lock); | 149 | extern void _raw_read_lock(rwlock_t *lock); |
144 | extern int _raw_read_trylock(rwlock_t *lock); | 150 | extern int _raw_read_trylock(rwlock_t *lock); |
145 | extern void _raw_read_unlock(rwlock_t *lock); | 151 | extern void _raw_read_unlock(rwlock_t *lock); |
146 | extern void _raw_write_lock(rwlock_t *lock); | 152 | extern void _raw_write_lock(rwlock_t *lock); |
147 | extern int _raw_write_trylock(rwlock_t *lock); | 153 | extern int _raw_write_trylock(rwlock_t *lock); |
148 | extern void _raw_write_unlock(rwlock_t *lock); | 154 | extern void _raw_write_unlock(rwlock_t *lock); |
149 | #else | 155 | #else |
150 | # define _raw_spin_lock(lock) __raw_spin_lock(&(lock)->raw_lock) | 156 | # define _raw_spin_lock(lock) __raw_spin_lock(&(lock)->raw_lock) |
151 | # define _raw_spin_lock_flags(lock, flags) \ | 157 | # define _raw_spin_lock_flags(lock, flags) \ |
152 | __raw_spin_lock_flags(&(lock)->raw_lock, *(flags)) | 158 | __raw_spin_lock_flags(&(lock)->raw_lock, *(flags)) |
153 | # define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock) | 159 | # define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock) |
154 | # define _raw_spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) | 160 | # define _raw_spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) |
155 | # define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock) | 161 | # define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock) |
156 | # define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock) | 162 | # define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock) |
157 | # define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) | 163 | # define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) |
158 | # define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) | 164 | # define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) |
159 | # define _raw_write_trylock(rwlock) __raw_write_trylock(&(rwlock)->raw_lock) | 165 | # define _raw_write_trylock(rwlock) __raw_write_trylock(&(rwlock)->raw_lock) |
160 | # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) | 166 | # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) |
161 | #endif | 167 | #endif |
162 | 168 | ||
163 | #define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock) | 169 | #define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock) |
164 | #define write_can_lock(rwlock) __raw_write_can_lock(&(rwlock)->raw_lock) | 170 | #define write_can_lock(rwlock) __raw_write_can_lock(&(rwlock)->raw_lock) |
165 | 171 | ||
166 | /* | 172 | /* |
167 | * Define the various spin_lock and rw_lock methods. Note we define these | 173 | * Define the various spin_lock and rw_lock methods. Note we define these |
168 | * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various | 174 | * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various |
169 | * methods are defined as nops in the case they are not required. | 175 | * methods are defined as nops in the case they are not required. |
170 | */ | 176 | */ |
171 | #define spin_trylock(lock) __cond_lock(lock, _spin_trylock(lock)) | 177 | #define spin_trylock(lock) __cond_lock(lock, _spin_trylock(lock)) |
172 | #define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) | 178 | #define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) |
173 | #define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) | 179 | #define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) |
174 | 180 | ||
175 | #define spin_lock(lock) _spin_lock(lock) | 181 | #define spin_lock(lock) _spin_lock(lock) |
176 | 182 | ||
177 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 183 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
178 | # define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) | 184 | # define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) |
179 | #else | 185 | #else |
180 | # define spin_lock_nested(lock, subclass) _spin_lock(lock) | 186 | # define spin_lock_nested(lock, subclass) _spin_lock(lock) |
181 | #endif | 187 | #endif |
182 | 188 | ||
183 | #define write_lock(lock) _write_lock(lock) | 189 | #define write_lock(lock) _write_lock(lock) |
184 | #define read_lock(lock) _read_lock(lock) | 190 | #define read_lock(lock) _read_lock(lock) |
185 | 191 | ||
186 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) | 192 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) |
187 | 193 | ||
188 | #define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock) | 194 | #define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock) |
189 | #define read_lock_irqsave(lock, flags) flags = _read_lock_irqsave(lock) | 195 | #define read_lock_irqsave(lock, flags) flags = _read_lock_irqsave(lock) |
190 | #define write_lock_irqsave(lock, flags) flags = _write_lock_irqsave(lock) | 196 | #define write_lock_irqsave(lock, flags) flags = _write_lock_irqsave(lock) |
191 | 197 | ||
192 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 198 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
193 | #define spin_lock_irqsave_nested(lock, flags, subclass) \ | 199 | #define spin_lock_irqsave_nested(lock, flags, subclass) \ |
194 | flags = _spin_lock_irqsave_nested(lock, subclass) | 200 | flags = _spin_lock_irqsave_nested(lock, subclass) |
195 | #else | 201 | #else |
196 | #define spin_lock_irqsave_nested(lock, flags, subclass) \ | 202 | #define spin_lock_irqsave_nested(lock, flags, subclass) \ |
197 | flags = _spin_lock_irqsave(lock) | 203 | flags = _spin_lock_irqsave(lock) |
198 | #endif | 204 | #endif |
199 | 205 | ||
200 | #else | 206 | #else |
201 | 207 | ||
202 | #define spin_lock_irqsave(lock, flags) _spin_lock_irqsave(lock, flags) | 208 | #define spin_lock_irqsave(lock, flags) _spin_lock_irqsave(lock, flags) |
203 | #define read_lock_irqsave(lock, flags) _read_lock_irqsave(lock, flags) | 209 | #define read_lock_irqsave(lock, flags) _read_lock_irqsave(lock, flags) |
204 | #define write_lock_irqsave(lock, flags) _write_lock_irqsave(lock, flags) | 210 | #define write_lock_irqsave(lock, flags) _write_lock_irqsave(lock, flags) |
205 | #define spin_lock_irqsave_nested(lock, flags, subclass) \ | 211 | #define spin_lock_irqsave_nested(lock, flags, subclass) \ |
206 | spin_lock_irqsave(lock, flags) | 212 | spin_lock_irqsave(lock, flags) |
207 | 213 | ||
208 | #endif | 214 | #endif |
209 | 215 | ||
210 | #define spin_lock_irq(lock) _spin_lock_irq(lock) | 216 | #define spin_lock_irq(lock) _spin_lock_irq(lock) |
211 | #define spin_lock_bh(lock) _spin_lock_bh(lock) | 217 | #define spin_lock_bh(lock) _spin_lock_bh(lock) |
212 | 218 | ||
213 | #define read_lock_irq(lock) _read_lock_irq(lock) | 219 | #define read_lock_irq(lock) _read_lock_irq(lock) |
214 | #define read_lock_bh(lock) _read_lock_bh(lock) | 220 | #define read_lock_bh(lock) _read_lock_bh(lock) |
215 | 221 | ||
216 | #define write_lock_irq(lock) _write_lock_irq(lock) | 222 | #define write_lock_irq(lock) _write_lock_irq(lock) |
217 | #define write_lock_bh(lock) _write_lock_bh(lock) | 223 | #define write_lock_bh(lock) _write_lock_bh(lock) |
218 | 224 | ||
219 | /* | 225 | /* |
220 | * We inline the unlock functions in the nondebug case: | 226 | * We inline the unlock functions in the nondebug case: |
221 | */ | 227 | */ |
222 | #if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ | 228 | #if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ |
223 | !defined(CONFIG_SMP) | 229 | !defined(CONFIG_SMP) |
224 | # define spin_unlock(lock) _spin_unlock(lock) | 230 | # define spin_unlock(lock) _spin_unlock(lock) |
225 | # define read_unlock(lock) _read_unlock(lock) | 231 | # define read_unlock(lock) _read_unlock(lock) |
226 | # define write_unlock(lock) _write_unlock(lock) | 232 | # define write_unlock(lock) _write_unlock(lock) |
227 | # define spin_unlock_irq(lock) _spin_unlock_irq(lock) | 233 | # define spin_unlock_irq(lock) _spin_unlock_irq(lock) |
228 | # define read_unlock_irq(lock) _read_unlock_irq(lock) | 234 | # define read_unlock_irq(lock) _read_unlock_irq(lock) |
229 | # define write_unlock_irq(lock) _write_unlock_irq(lock) | 235 | # define write_unlock_irq(lock) _write_unlock_irq(lock) |
230 | #else | 236 | #else |
231 | # define spin_unlock(lock) \ | 237 | # define spin_unlock(lock) \ |
232 | do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0) | 238 | do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0) |
233 | # define read_unlock(lock) \ | 239 | # define read_unlock(lock) \ |
234 | do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0) | 240 | do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0) |
235 | # define write_unlock(lock) \ | 241 | # define write_unlock(lock) \ |
236 | do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0) | 242 | do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0) |
237 | # define spin_unlock_irq(lock) \ | 243 | # define spin_unlock_irq(lock) \ |
238 | do { \ | 244 | do { \ |
239 | __raw_spin_unlock(&(lock)->raw_lock); \ | 245 | __raw_spin_unlock(&(lock)->raw_lock); \ |
240 | __release(lock); \ | 246 | __release(lock); \ |
241 | local_irq_enable(); \ | 247 | local_irq_enable(); \ |
242 | } while (0) | 248 | } while (0) |
243 | # define read_unlock_irq(lock) \ | 249 | # define read_unlock_irq(lock) \ |
244 | do { \ | 250 | do { \ |
245 | __raw_read_unlock(&(lock)->raw_lock); \ | 251 | __raw_read_unlock(&(lock)->raw_lock); \ |
246 | __release(lock); \ | 252 | __release(lock); \ |
247 | local_irq_enable(); \ | 253 | local_irq_enable(); \ |
248 | } while (0) | 254 | } while (0) |
249 | # define write_unlock_irq(lock) \ | 255 | # define write_unlock_irq(lock) \ |
250 | do { \ | 256 | do { \ |
251 | __raw_write_unlock(&(lock)->raw_lock); \ | 257 | __raw_write_unlock(&(lock)->raw_lock); \ |
252 | __release(lock); \ | 258 | __release(lock); \ |
253 | local_irq_enable(); \ | 259 | local_irq_enable(); \ |
254 | } while (0) | 260 | } while (0) |
255 | #endif | 261 | #endif |
256 | 262 | ||
257 | #define spin_unlock_irqrestore(lock, flags) \ | 263 | #define spin_unlock_irqrestore(lock, flags) \ |
258 | _spin_unlock_irqrestore(lock, flags) | 264 | _spin_unlock_irqrestore(lock, flags) |
259 | #define spin_unlock_bh(lock) _spin_unlock_bh(lock) | 265 | #define spin_unlock_bh(lock) _spin_unlock_bh(lock) |
260 | 266 | ||
261 | #define read_unlock_irqrestore(lock, flags) \ | 267 | #define read_unlock_irqrestore(lock, flags) \ |
262 | _read_unlock_irqrestore(lock, flags) | 268 | _read_unlock_irqrestore(lock, flags) |
263 | #define read_unlock_bh(lock) _read_unlock_bh(lock) | 269 | #define read_unlock_bh(lock) _read_unlock_bh(lock) |
264 | 270 | ||
265 | #define write_unlock_irqrestore(lock, flags) \ | 271 | #define write_unlock_irqrestore(lock, flags) \ |
266 | _write_unlock_irqrestore(lock, flags) | 272 | _write_unlock_irqrestore(lock, flags) |
267 | #define write_unlock_bh(lock) _write_unlock_bh(lock) | 273 | #define write_unlock_bh(lock) _write_unlock_bh(lock) |
268 | 274 | ||
269 | #define spin_trylock_bh(lock) __cond_lock(lock, _spin_trylock_bh(lock)) | 275 | #define spin_trylock_bh(lock) __cond_lock(lock, _spin_trylock_bh(lock)) |
270 | 276 | ||
271 | #define spin_trylock_irq(lock) \ | 277 | #define spin_trylock_irq(lock) \ |
272 | ({ \ | 278 | ({ \ |
273 | local_irq_disable(); \ | 279 | local_irq_disable(); \ |
274 | spin_trylock(lock) ? \ | 280 | spin_trylock(lock) ? \ |
275 | 1 : ({ local_irq_enable(); 0; }); \ | 281 | 1 : ({ local_irq_enable(); 0; }); \ |
276 | }) | 282 | }) |
277 | 283 | ||
278 | #define spin_trylock_irqsave(lock, flags) \ | 284 | #define spin_trylock_irqsave(lock, flags) \ |
279 | ({ \ | 285 | ({ \ |
280 | local_irq_save(flags); \ | 286 | local_irq_save(flags); \ |
281 | spin_trylock(lock) ? \ | 287 | spin_trylock(lock) ? \ |
282 | 1 : ({ local_irq_restore(flags); 0; }); \ | 288 | 1 : ({ local_irq_restore(flags); 0; }); \ |
283 | }) | 289 | }) |
284 | 290 | ||
285 | #define write_trylock_irqsave(lock, flags) \ | 291 | #define write_trylock_irqsave(lock, flags) \ |
286 | ({ \ | 292 | ({ \ |
287 | local_irq_save(flags); \ | 293 | local_irq_save(flags); \ |
288 | write_trylock(lock) ? \ | 294 | write_trylock(lock) ? \ |
289 | 1 : ({ local_irq_restore(flags); 0; }); \ | 295 | 1 : ({ local_irq_restore(flags); 0; }); \ |
290 | }) | 296 | }) |
291 | 297 | ||
292 | /* | 298 | /* |
293 | * Locks two spinlocks l1 and l2. | 299 | * Locks two spinlocks l1 and l2. |
294 | * l1_first indicates if spinlock l1 should be taken first. | 300 | * l1_first indicates if spinlock l1 should be taken first. |
295 | */ | 301 | */ |
296 | static inline void double_spin_lock(spinlock_t *l1, spinlock_t *l2, | 302 | static inline void double_spin_lock(spinlock_t *l1, spinlock_t *l2, |
297 | bool l1_first) | 303 | bool l1_first) |
298 | __acquires(l1) | 304 | __acquires(l1) |
299 | __acquires(l2) | 305 | __acquires(l2) |
300 | { | 306 | { |
301 | if (l1_first) { | 307 | if (l1_first) { |
302 | spin_lock(l1); | 308 | spin_lock(l1); |
303 | spin_lock(l2); | 309 | spin_lock(l2); |
304 | } else { | 310 | } else { |
305 | spin_lock(l2); | 311 | spin_lock(l2); |
306 | spin_lock(l1); | 312 | spin_lock(l1); |
307 | } | 313 | } |
308 | } | 314 | } |
309 | 315 | ||
310 | /* | 316 | /* |
311 | * Unlocks two spinlocks l1 and l2. | 317 | * Unlocks two spinlocks l1 and l2. |
312 | * l1_taken_first indicates if spinlock l1 was taken first and therefore | 318 | * l1_taken_first indicates if spinlock l1 was taken first and therefore |
313 | * should be released after spinlock l2. | 319 | * should be released after spinlock l2. |
314 | */ | 320 | */ |
315 | static inline void double_spin_unlock(spinlock_t *l1, spinlock_t *l2, | 321 | static inline void double_spin_unlock(spinlock_t *l1, spinlock_t *l2, |
316 | bool l1_taken_first) | 322 | bool l1_taken_first) |
317 | __releases(l1) | 323 | __releases(l1) |
318 | __releases(l2) | 324 | __releases(l2) |
319 | { | 325 | { |
320 | if (l1_taken_first) { | 326 | if (l1_taken_first) { |
321 | spin_unlock(l2); | 327 | spin_unlock(l2); |
322 | spin_unlock(l1); | 328 | spin_unlock(l1); |
323 | } else { | 329 | } else { |
324 | spin_unlock(l1); | 330 | spin_unlock(l1); |
325 | spin_unlock(l2); | 331 | spin_unlock(l2); |
326 | } | 332 | } |
327 | } | 333 | } |
328 | 334 | ||
329 | /* | 335 | /* |
330 | * Pull the atomic_t declaration: | 336 | * Pull the atomic_t declaration: |
331 | * (asm-mips/atomic.h needs above definitions) | 337 | * (asm-mips/atomic.h needs above definitions) |
332 | */ | 338 | */ |
333 | #include <asm/atomic.h> | 339 | #include <asm/atomic.h> |
334 | /** | 340 | /** |
335 | * atomic_dec_and_lock - lock on reaching reference count zero | 341 | * atomic_dec_and_lock - lock on reaching reference count zero |
336 | * @atomic: the atomic counter | 342 | * @atomic: the atomic counter |
337 | * @lock: the spinlock in question | 343 | * @lock: the spinlock in question |
338 | */ | 344 | */ |
339 | extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); | 345 | extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); |
340 | #define atomic_dec_and_lock(atomic, lock) \ | 346 | #define atomic_dec_and_lock(atomic, lock) \ |
341 | __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) | 347 | __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) |
342 | 348 | ||
343 | /** | 349 | /** |
344 | * spin_can_lock - would spin_trylock() succeed? | 350 | * spin_can_lock - would spin_trylock() succeed? |
345 | * @lock: the spinlock in question. | 351 | * @lock: the spinlock in question. |
346 | */ | 352 | */ |
347 | #define spin_can_lock(lock) (!spin_is_locked(lock)) | 353 | #define spin_can_lock(lock) (!spin_is_locked(lock)) |
348 | 354 | ||
349 | #endif /* __LINUX_SPINLOCK_H */ | 355 | #endif /* __LINUX_SPINLOCK_H */ |
350 | 356 |
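
When CONFIG_GENERIC_LOCKBREAK is set, spin_is_contended() simply reads the break_lock flag, so something has to maintain that flag. That happens in the out-of-line lock slow path generated by BUILD_LOCK_OPS() in kernel/spinlock.c (not shown in this hunk). The following is a simplified, hand-expanded sketch of that slow path, included only to show where break_lock is set and cleared; lockdep annotations and the rwlock variants are omitted.

void __lockfunc _spin_lock(spinlock_t *lock)
{
	for (;;) {
		preempt_disable();
		if (likely(_raw_spin_trylock(lock)))
			break;
		preempt_enable();
		/* Record a waiter so spin_is_contended() returns true. */
		if (!lock->break_lock)
			lock->break_lock = 1;
		while (!spin_can_lock(lock) && lock->break_lock)
			cpu_relax();
	}
	/* We got the lock; a remaining waiter will set the flag again. */
	lock->break_lock = 0;
}

On !CONFIG_GENERIC_LOCKBREAK kernels this trylock loop is not built at all, and spin_is_contended() asks the architecture via __raw_spin_is_contended() instead.
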
include/linux/spinlock_types.h
1 | #ifndef __LINUX_SPINLOCK_TYPES_H | 1 | #ifndef __LINUX_SPINLOCK_TYPES_H |
2 | #define __LINUX_SPINLOCK_TYPES_H | 2 | #define __LINUX_SPINLOCK_TYPES_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * include/linux/spinlock_types.h - generic spinlock type definitions | 5 | * include/linux/spinlock_types.h - generic spinlock type definitions |
6 | * and initializers | 6 | * and initializers |
7 | * | 7 | * |
8 | * portions Copyright 2005, Red Hat, Inc., Ingo Molnar | 8 | * portions Copyright 2005, Red Hat, Inc., Ingo Molnar |
9 | * Released under the General Public License (GPL). | 9 | * Released under the General Public License (GPL). |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #if defined(CONFIG_SMP) | 12 | #if defined(CONFIG_SMP) |
13 | # include <asm/spinlock_types.h> | 13 | # include <asm/spinlock_types.h> |
14 | #else | 14 | #else |
15 | # include <linux/spinlock_types_up.h> | 15 | # include <linux/spinlock_types_up.h> |
16 | #endif | 16 | #endif |
17 | 17 | ||
18 | #include <linux/lockdep.h> | 18 | #include <linux/lockdep.h> |
19 | 19 | ||
20 | typedef struct { | 20 | typedef struct { |
21 | raw_spinlock_t raw_lock; | 21 | raw_spinlock_t raw_lock; |
22 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) | 22 | #ifdef CONFIG_GENERIC_LOCKBREAK |
23 | unsigned int break_lock; | 23 | unsigned int break_lock; |
24 | #endif | 24 | #endif |
25 | #ifdef CONFIG_DEBUG_SPINLOCK | 25 | #ifdef CONFIG_DEBUG_SPINLOCK |
26 | unsigned int magic, owner_cpu; | 26 | unsigned int magic, owner_cpu; |
27 | void *owner; | 27 | void *owner; |
28 | #endif | 28 | #endif |
29 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 29 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
30 | struct lockdep_map dep_map; | 30 | struct lockdep_map dep_map; |
31 | #endif | 31 | #endif |
32 | } spinlock_t; | 32 | } spinlock_t; |
33 | 33 | ||
34 | #define SPINLOCK_MAGIC 0xdead4ead | 34 | #define SPINLOCK_MAGIC 0xdead4ead |
35 | 35 | ||
36 | typedef struct { | 36 | typedef struct { |
37 | raw_rwlock_t raw_lock; | 37 | raw_rwlock_t raw_lock; |
38 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) | 38 | #ifdef CONFIG_GENERIC_LOCKBREAK |
39 | unsigned int break_lock; | 39 | unsigned int break_lock; |
40 | #endif | 40 | #endif |
41 | #ifdef CONFIG_DEBUG_SPINLOCK | 41 | #ifdef CONFIG_DEBUG_SPINLOCK |
42 | unsigned int magic, owner_cpu; | 42 | unsigned int magic, owner_cpu; |
43 | void *owner; | 43 | void *owner; |
44 | #endif | 44 | #endif |
45 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 45 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
46 | struct lockdep_map dep_map; | 46 | struct lockdep_map dep_map; |
47 | #endif | 47 | #endif |
48 | } rwlock_t; | 48 | } rwlock_t; |
49 | 49 | ||
50 | #define RWLOCK_MAGIC 0xdeaf1eed | 50 | #define RWLOCK_MAGIC 0xdeaf1eed |
51 | 51 | ||
52 | #define SPINLOCK_OWNER_INIT ((void *)-1L) | 52 | #define SPINLOCK_OWNER_INIT ((void *)-1L) |
53 | 53 | ||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
55 | # define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } | 55 | # define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } |
56 | #else | 56 | #else |
57 | # define SPIN_DEP_MAP_INIT(lockname) | 57 | # define SPIN_DEP_MAP_INIT(lockname) |
58 | #endif | 58 | #endif |
59 | 59 | ||
60 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 60 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
61 | # define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } | 61 | # define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } |
62 | #else | 62 | #else |
63 | # define RW_DEP_MAP_INIT(lockname) | 63 | # define RW_DEP_MAP_INIT(lockname) |
64 | #endif | 64 | #endif |
65 | 65 | ||
66 | #ifdef CONFIG_DEBUG_SPINLOCK | 66 | #ifdef CONFIG_DEBUG_SPINLOCK |
67 | # define __SPIN_LOCK_UNLOCKED(lockname) \ | 67 | # define __SPIN_LOCK_UNLOCKED(lockname) \ |
68 | (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ | 68 | (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ |
69 | .magic = SPINLOCK_MAGIC, \ | 69 | .magic = SPINLOCK_MAGIC, \ |
70 | .owner = SPINLOCK_OWNER_INIT, \ | 70 | .owner = SPINLOCK_OWNER_INIT, \ |
71 | .owner_cpu = -1, \ | 71 | .owner_cpu = -1, \ |
72 | SPIN_DEP_MAP_INIT(lockname) } | 72 | SPIN_DEP_MAP_INIT(lockname) } |
73 | #define __RW_LOCK_UNLOCKED(lockname) \ | 73 | #define __RW_LOCK_UNLOCKED(lockname) \ |
74 | (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ | 74 | (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ |
75 | .magic = RWLOCK_MAGIC, \ | 75 | .magic = RWLOCK_MAGIC, \ |
76 | .owner = SPINLOCK_OWNER_INIT, \ | 76 | .owner = SPINLOCK_OWNER_INIT, \ |
77 | .owner_cpu = -1, \ | 77 | .owner_cpu = -1, \ |
78 | RW_DEP_MAP_INIT(lockname) } | 78 | RW_DEP_MAP_INIT(lockname) } |
79 | #else | 79 | #else |
80 | # define __SPIN_LOCK_UNLOCKED(lockname) \ | 80 | # define __SPIN_LOCK_UNLOCKED(lockname) \ |
81 | (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ | 81 | (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ |
82 | SPIN_DEP_MAP_INIT(lockname) } | 82 | SPIN_DEP_MAP_INIT(lockname) } |
83 | #define __RW_LOCK_UNLOCKED(lockname) \ | 83 | #define __RW_LOCK_UNLOCKED(lockname) \ |
84 | (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ | 84 | (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ |
85 | RW_DEP_MAP_INIT(lockname) } | 85 | RW_DEP_MAP_INIT(lockname) } |
86 | #endif | 86 | #endif |
87 | 87 | ||
88 | /* | 88 | /* |
89 | * SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED defeat lockdep state tracking and | 89 | * SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED defeat lockdep state tracking and |
90 | * are hence deprecated. | 90 | * are hence deprecated. |
91 | * Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or | 91 | * Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or |
92 | * __SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate. | 92 | * __SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate. |
93 | */ | 93 | */ |
94 | #define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) | 94 | #define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) |
95 | #define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) | 95 | #define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) |
96 | 96 | ||
97 | #define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) | 97 | #define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) |
98 | #define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) | 98 | #define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) |
99 | 99 | ||
100 | #endif /* __LINUX_SPINLOCK_TYPES_H */ | 100 | #endif /* __LINUX_SPINLOCK_TYPES_H */ |
101 | 101 |
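
Since break_lock is now compiled in only when the architecture selects CONFIG_GENERIC_LOCKBREAK, generic code must not dereference the field directly; on other configurations it simply does not exist. A hypothetical helper (not part of this commit) that sticks to the accessors stays portable across both layouts:

#include <linux/spinlock.h>

/* Hypothetical: report whether a lock is worth avoiding right now. */
static inline int lock_is_busy(spinlock_t *lock)
{
	/* Both accessors work with or without GENERIC_LOCKBREAK. */
	return spin_is_locked(lock) || spin_is_contended(lock);
}

spin_needbreak() in sched.h follows the same rule: it is written against spin_is_contended(), not against the break_lock field.
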
include/linux/spinlock_up.h
1 | #ifndef __LINUX_SPINLOCK_UP_H | 1 | #ifndef __LINUX_SPINLOCK_UP_H |
2 | #define __LINUX_SPINLOCK_UP_H | 2 | #define __LINUX_SPINLOCK_UP_H |
3 | 3 | ||
4 | #ifndef __LINUX_SPINLOCK_H | 4 | #ifndef __LINUX_SPINLOCK_H |
5 | # error "please don't include this file directly" | 5 | # error "please don't include this file directly" |
6 | #endif | 6 | #endif |
7 | 7 | ||
8 | /* | 8 | /* |
9 | * include/linux/spinlock_up.h - UP-debug version of spinlocks. | 9 | * include/linux/spinlock_up.h - UP-debug version of spinlocks. |
10 | * | 10 | * |
11 | * portions Copyright 2005, Red Hat, Inc., Ingo Molnar | 11 | * portions Copyright 2005, Red Hat, Inc., Ingo Molnar |
12 | * Released under the General Public License (GPL). | 12 | * Released under the General Public License (GPL). |
13 | * | 13 | * |
14 | * In the debug case, 1 means unlocked, 0 means locked. (the values | 14 | * In the debug case, 1 means unlocked, 0 means locked. (the values |
15 | * are inverted, to catch initialization bugs) | 15 | * are inverted, to catch initialization bugs) |
16 | * | 16 | * |
17 | * No atomicity anywhere, we are on UP. | 17 | * No atomicity anywhere, we are on UP. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #ifdef CONFIG_DEBUG_SPINLOCK | 20 | #ifdef CONFIG_DEBUG_SPINLOCK |
21 | #define __raw_spin_is_locked(x) ((x)->slock == 0) | 21 | #define __raw_spin_is_locked(x) ((x)->slock == 0) |
22 | 22 | ||
23 | static inline void __raw_spin_lock(raw_spinlock_t *lock) | 23 | static inline void __raw_spin_lock(raw_spinlock_t *lock) |
24 | { | 24 | { |
25 | lock->slock = 0; | 25 | lock->slock = 0; |
26 | } | 26 | } |
27 | 27 | ||
28 | static inline void | 28 | static inline void |
29 | __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) | 29 | __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) |
30 | { | 30 | { |
31 | local_irq_save(flags); | 31 | local_irq_save(flags); |
32 | lock->slock = 0; | 32 | lock->slock = 0; |
33 | } | 33 | } |
34 | 34 | ||
35 | static inline int __raw_spin_trylock(raw_spinlock_t *lock) | 35 | static inline int __raw_spin_trylock(raw_spinlock_t *lock) |
36 | { | 36 | { |
37 | char oldval = lock->slock; | 37 | char oldval = lock->slock; |
38 | 38 | ||
39 | lock->slock = 0; | 39 | lock->slock = 0; |
40 | 40 | ||
41 | return oldval > 0; | 41 | return oldval > 0; |
42 | } | 42 | } |
43 | 43 | ||
44 | static inline void __raw_spin_unlock(raw_spinlock_t *lock) | 44 | static inline void __raw_spin_unlock(raw_spinlock_t *lock) |
45 | { | 45 | { |
46 | lock->slock = 1; | 46 | lock->slock = 1; |
47 | } | 47 | } |
48 | 48 | ||
49 | /* | 49 | /* |
50 | * Read-write spinlocks. No debug version. | 50 | * Read-write spinlocks. No debug version. |
51 | */ | 51 | */ |
52 | #define __raw_read_lock(lock) do { (void)(lock); } while (0) | 52 | #define __raw_read_lock(lock) do { (void)(lock); } while (0) |
53 | #define __raw_write_lock(lock) do { (void)(lock); } while (0) | 53 | #define __raw_write_lock(lock) do { (void)(lock); } while (0) |
54 | #define __raw_read_trylock(lock) ({ (void)(lock); 1; }) | 54 | #define __raw_read_trylock(lock) ({ (void)(lock); 1; }) |
55 | #define __raw_write_trylock(lock) ({ (void)(lock); 1; }) | 55 | #define __raw_write_trylock(lock) ({ (void)(lock); 1; }) |
56 | #define __raw_read_unlock(lock) do { (void)(lock); } while (0) | 56 | #define __raw_read_unlock(lock) do { (void)(lock); } while (0) |
57 | #define __raw_write_unlock(lock) do { (void)(lock); } while (0) | 57 | #define __raw_write_unlock(lock) do { (void)(lock); } while (0) |
58 | 58 | ||
59 | #else /* DEBUG_SPINLOCK */ | 59 | #else /* DEBUG_SPINLOCK */ |
60 | #define __raw_spin_is_locked(lock) ((void)(lock), 0) | 60 | #define __raw_spin_is_locked(lock) ((void)(lock), 0) |
61 | /* for sched.c and kernel_lock.c: */ | 61 | /* for sched.c and kernel_lock.c: */ |
62 | # define __raw_spin_lock(lock) do { (void)(lock); } while (0) | 62 | # define __raw_spin_lock(lock) do { (void)(lock); } while (0) |
63 | # define __raw_spin_unlock(lock) do { (void)(lock); } while (0) | 63 | # define __raw_spin_unlock(lock) do { (void)(lock); } while (0) |
64 | # define __raw_spin_trylock(lock) ({ (void)(lock); 1; }) | 64 | # define __raw_spin_trylock(lock) ({ (void)(lock); 1; }) |
65 | #endif /* DEBUG_SPINLOCK */ | 65 | #endif /* DEBUG_SPINLOCK */ |
66 | 66 | ||
67 | #define __raw_spin_is_contended(lock) (((void)(lock), 0)) | ||
68 | |||
67 | #define __raw_read_can_lock(lock) (((void)(lock), 1)) | 69 | #define __raw_read_can_lock(lock) (((void)(lock), 1)) |
68 | #define __raw_write_can_lock(lock) (((void)(lock), 1)) | 70 | #define __raw_write_can_lock(lock) (((void)(lock), 1)) |
69 | 71 | ||
70 | #define __raw_spin_unlock_wait(lock) \ | 72 | #define __raw_spin_unlock_wait(lock) \ |
71 | do { cpu_relax(); } while (__raw_spin_is_locked(lock)) | 73 | do { cpu_relax(); } while (__raw_spin_is_locked(lock)) |
72 | 74 | ||
73 | #endif /* __LINUX_SPINLOCK_UP_H */ | 75 | #endif /* __LINUX_SPINLOCK_UP_H */ |
74 | 76 |
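Since the UP stub added above makes __raw_spin_is_contended() a constant 0, any "drop the lock if somebody is waiting" pattern built on it compiles down to plain straight-line work on uniprocessor kernels. A hedged sketch at the raw-primitive level (drain_queue() is hypothetical, not part of this commit):

#include <linux/spinlock.h>

static void drain_queue(raw_spinlock_t *lock, int nr)
{
	int i;

	__raw_spin_lock(lock);
	for (i = 0; i < nr; i++) {
		/* ... one unit of work under the lock ... */
		if (__raw_spin_is_contended(lock)) {	/* constant 0 on UP */
			__raw_spin_unlock(lock);	/* give a waiter a turn */
			cpu_relax();
			__raw_spin_lock(lock);
		}
	}
	__raw_spin_unlock(lock);
}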
kernel/sched.c
1 | /* | 1 | /* |
2 | * kernel/sched.c | 2 | * kernel/sched.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Kernel scheduler and related syscalls |
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | * | 7 | * |
8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | 8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and |
9 | * make semaphores SMP safe | 9 | * make semaphores SMP safe |
10 | * 1998-11-19 Implemented schedule_timeout() and related stuff | 10 | * 1998-11-19 Implemented schedule_timeout() and related stuff |
11 | * by Andrea Arcangeli | 11 | * by Andrea Arcangeli |
12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: | 12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: |
13 | * hybrid priority-list and round-robin design with | 13 | * hybrid priority-list and round-robin design with |
14 | * an array-switch method of distributing timeslices | 14 | * an array-switch method of distributing timeslices |
15 | * and per-CPU runqueues. Cleanups and useful suggestions | 15 | * and per-CPU runqueues. Cleanups and useful suggestions |
16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | 16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. |
17 | * 2003-09-03 Interactivity tuning by Con Kolivas. | 17 | * 2003-09-03 Interactivity tuning by Con Kolivas. |
18 | * 2004-04-02 Scheduler domains code by Nick Piggin | 18 | * 2004-04-02 Scheduler domains code by Nick Piggin |
19 | * 2007-04-15 Work begun on replacing all interactivity tuning with a | 19 | * 2007-04-15 Work begun on replacing all interactivity tuning with a |
20 | * fair scheduling design by Con Kolivas. | 20 | * fair scheduling design by Con Kolivas. |
21 | * 2007-05-05 Load balancing (smp-nice) and other improvements | 21 | * 2007-05-05 Load balancing (smp-nice) and other improvements |
22 | * by Peter Williams | 22 | * by Peter Williams |
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | 23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith |
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | 24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri |
25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | 25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, |
26 | * Thomas Gleixner, Mike Kravetz | 26 | * Thomas Gleixner, Mike Kravetz |
27 | */ | 27 | */ |
28 | 28 | ||
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/nmi.h> | 31 | #include <linux/nmi.h> |
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/uaccess.h> | 33 | #include <linux/uaccess.h> |
34 | #include <linux/highmem.h> | 34 | #include <linux/highmem.h> |
35 | #include <linux/smp_lock.h> | 35 | #include <linux/smp_lock.h> |
36 | #include <asm/mmu_context.h> | 36 | #include <asm/mmu_context.h> |
37 | #include <linux/interrupt.h> | 37 | #include <linux/interrupt.h> |
38 | #include <linux/capability.h> | 38 | #include <linux/capability.h> |
39 | #include <linux/completion.h> | 39 | #include <linux/completion.h> |
40 | #include <linux/kernel_stat.h> | 40 | #include <linux/kernel_stat.h> |
41 | #include <linux/debug_locks.h> | 41 | #include <linux/debug_locks.h> |
42 | #include <linux/security.h> | 42 | #include <linux/security.h> |
43 | #include <linux/notifier.h> | 43 | #include <linux/notifier.h> |
44 | #include <linux/profile.h> | 44 | #include <linux/profile.h> |
45 | #include <linux/freezer.h> | 45 | #include <linux/freezer.h> |
46 | #include <linux/vmalloc.h> | 46 | #include <linux/vmalloc.h> |
47 | #include <linux/blkdev.h> | 47 | #include <linux/blkdev.h> |
48 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
49 | #include <linux/pid_namespace.h> | 49 | #include <linux/pid_namespace.h> |
50 | #include <linux/smp.h> | 50 | #include <linux/smp.h> |
51 | #include <linux/threads.h> | 51 | #include <linux/threads.h> |
52 | #include <linux/timer.h> | 52 | #include <linux/timer.h> |
53 | #include <linux/rcupdate.h> | 53 | #include <linux/rcupdate.h> |
54 | #include <linux/cpu.h> | 54 | #include <linux/cpu.h> |
55 | #include <linux/cpuset.h> | 55 | #include <linux/cpuset.h> |
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/kthread.h> | 57 | #include <linux/kthread.h> |
58 | #include <linux/seq_file.h> | 58 | #include <linux/seq_file.h> |
59 | #include <linux/sysctl.h> | 59 | #include <linux/sysctl.h> |
60 | #include <linux/syscalls.h> | 60 | #include <linux/syscalls.h> |
61 | #include <linux/times.h> | 61 | #include <linux/times.h> |
62 | #include <linux/tsacct_kern.h> | 62 | #include <linux/tsacct_kern.h> |
63 | #include <linux/kprobes.h> | 63 | #include <linux/kprobes.h> |
64 | #include <linux/delayacct.h> | 64 | #include <linux/delayacct.h> |
65 | #include <linux/reciprocal_div.h> | 65 | #include <linux/reciprocal_div.h> |
66 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
67 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
68 | #include <linux/hrtimer.h> | 68 | #include <linux/hrtimer.h> |
69 | 69 | ||
70 | #include <asm/tlb.h> | 70 | #include <asm/tlb.h> |
71 | #include <asm/irq_regs.h> | 71 | #include <asm/irq_regs.h> |
72 | 72 | ||
73 | /* | 73 | /* |
74 | * Scheduler clock - returns current time in nanosec units. | 74 | * Scheduler clock - returns current time in nanosec units. |
75 | * This is the default implementation. | 75 | * This is the default implementation. |
76 | * Architectures and sub-architectures can override this. | 76 | * Architectures and sub-architectures can override this. |
77 | */ | 77 | */ |
78 | unsigned long long __attribute__((weak)) sched_clock(void) | 78 | unsigned long long __attribute__((weak)) sched_clock(void) |
79 | { | 79 | { |
80 | return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); | 80 | return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); |
81 | } | 81 | } |
82 | 82 | ||
83 | /* | 83 | /* |
84 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 84 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
85 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 85 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
86 | * and back. | 86 | * and back. |
87 | */ | 87 | */ |
88 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | 88 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) |
89 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | 89 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) |
90 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | 90 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) |
91 | 91 | ||
92 | /* | 92 | /* |
93 | * 'User priority' is the nice value converted to something we | 93 | * 'User priority' is the nice value converted to something we |
94 | * can work with better when scaling various scheduler parameters, | 94 | * can work with better when scaling various scheduler parameters, |
95 | * it's a [ 0 ... 39 ] range. | 95 | * it's a [ 0 ... 39 ] range. |
96 | */ | 96 | */ |
97 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | 97 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) |
98 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | 98 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) |
99 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 99 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
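For concreteness, a worked instance of the conversions above (assuming the usual MAX_RT_PRIO == 100 and MAX_PRIO == 140, which are not restated in this hunk):

/*
 * NICE_TO_PRIO(-20) == 100 + (-20) + 20 == 100   (best non-RT priority)
 * NICE_TO_PRIO(  0) == 100 +    0 + 20 == 120    (default)
 * NICE_TO_PRIO( 19) == 100 +   19 + 20 == 139    (worst priority)
 *
 * USER_PRIO(100) == 0 and USER_PRIO(139) == 39, so user priorities span
 * the [ 0 ... 39 ] range mentioned above, and MAX_USER_PRIO == 40.
 */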
100 | 100 | ||
101 | /* | 101 | /* |
102 | * Helpers for converting nanosecond timing to jiffy resolution | 102 | * Helpers for converting nanosecond timing to jiffy resolution |
103 | */ | 103 | */ |
104 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 104 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
105 | 105 | ||
106 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 106 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
107 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 107 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
108 | 108 | ||
109 | /* | 109 | /* |
110 | * These are the 'tuning knobs' of the scheduler: | 110 | * These are the 'tuning knobs' of the scheduler: |
111 | * | 111 | * |
112 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | 112 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). |
113 | * Timeslices get refilled after they expire. | 113 | * Timeslices get refilled after they expire. |
114 | */ | 114 | */ |
115 | #define DEF_TIMESLICE (100 * HZ / 1000) | 115 | #define DEF_TIMESLICE (100 * HZ / 1000) |
116 | 116 | ||
117 | #ifdef CONFIG_SMP | 117 | #ifdef CONFIG_SMP |
118 | /* | 118 | /* |
119 | * Divide a load by a sched group cpu_power: (load / sg->__cpu_power) | 119 | * Divide a load by a sched group cpu_power: (load / sg->__cpu_power) |
120 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | 120 | * Since cpu_power is a 'constant', we can use a reciprocal divide. |
121 | */ | 121 | */ |
122 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | 122 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) |
123 | { | 123 | { |
124 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | 124 | return reciprocal_divide(load, sg->reciprocal_cpu_power); |
125 | } | 125 | } |
126 | 126 | ||
127 | /* | 127 | /* |
128 | * Each time a sched group cpu_power is changed, | 128 | * Each time a sched group cpu_power is changed, |
129 | * we must compute its reciprocal value | 129 | * we must compute its reciprocal value |
130 | */ | 130 | */ |
131 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | 131 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) |
132 | { | 132 | { |
133 | sg->__cpu_power += val; | 133 | sg->__cpu_power += val; |
134 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | 134 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); |
135 | } | 135 | } |
136 | #endif | 136 | #endif |
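The two helpers above rely on the reciprocal-divide trick so the load-balancing fast path avoids a hardware divide. A rough sketch of the idea only (not the in-tree lib code; the example_ names are made up):

#include <linux/types.h>

/* Precompute once, whenever cpu_power changes: r ~= ceil(2^32 / b). */
static inline u32 example_reciprocal_value(u32 b)
{
	return (u32)(((1ULL << 32) + b - 1) / b);
}

/* Hot path: a / b becomes one multiply and one shift. */
static inline u32 example_reciprocal_divide(u32 a, u32 r)
{
	return (u32)(((u64)a * r) >> 32);
}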
137 | 137 | ||
138 | static inline int rt_policy(int policy) | 138 | static inline int rt_policy(int policy) |
139 | { | 139 | { |
140 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) | 140 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) |
141 | return 1; | 141 | return 1; |
142 | return 0; | 142 | return 0; |
143 | } | 143 | } |
144 | 144 | ||
145 | static inline int task_has_rt_policy(struct task_struct *p) | 145 | static inline int task_has_rt_policy(struct task_struct *p) |
146 | { | 146 | { |
147 | return rt_policy(p->policy); | 147 | return rt_policy(p->policy); |
148 | } | 148 | } |
149 | 149 | ||
150 | /* | 150 | /* |
151 | * This is the priority-queue data structure of the RT scheduling class: | 151 | * This is the priority-queue data structure of the RT scheduling class: |
152 | */ | 152 | */ |
153 | struct rt_prio_array { | 153 | struct rt_prio_array { |
154 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | 154 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ |
155 | struct list_head queue[MAX_RT_PRIO]; | 155 | struct list_head queue[MAX_RT_PRIO]; |
156 | }; | 156 | }; |
157 | 157 | ||
158 | #ifdef CONFIG_FAIR_GROUP_SCHED | 158 | #ifdef CONFIG_FAIR_GROUP_SCHED |
159 | 159 | ||
160 | #include <linux/cgroup.h> | 160 | #include <linux/cgroup.h> |
161 | 161 | ||
162 | struct cfs_rq; | 162 | struct cfs_rq; |
163 | 163 | ||
164 | static LIST_HEAD(task_groups); | 164 | static LIST_HEAD(task_groups); |
165 | 165 | ||
166 | /* task group related information */ | 166 | /* task group related information */ |
167 | struct task_group { | 167 | struct task_group { |
168 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 168 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
169 | struct cgroup_subsys_state css; | 169 | struct cgroup_subsys_state css; |
170 | #endif | 170 | #endif |
171 | /* schedulable entities of this group on each cpu */ | 171 | /* schedulable entities of this group on each cpu */ |
172 | struct sched_entity **se; | 172 | struct sched_entity **se; |
173 | /* runqueue "owned" by this group on each cpu */ | 173 | /* runqueue "owned" by this group on each cpu */ |
174 | struct cfs_rq **cfs_rq; | 174 | struct cfs_rq **cfs_rq; |
175 | 175 | ||
176 | struct sched_rt_entity **rt_se; | 176 | struct sched_rt_entity **rt_se; |
177 | struct rt_rq **rt_rq; | 177 | struct rt_rq **rt_rq; |
178 | 178 | ||
179 | unsigned int rt_ratio; | 179 | unsigned int rt_ratio; |
180 | 180 | ||
181 | /* | 181 | /* |
182 | * shares assigned to a task group governs how much of cpu bandwidth | 182 | * shares assigned to a task group governs how much of cpu bandwidth |
183 | * is allocated to the group. The more shares a group has, the more is | 183 | * is allocated to the group. The more shares a group has, the more is |
184 | * the cpu bandwidth allocated to it. | 184 | * the cpu bandwidth allocated to it. |
185 | * | 185 | * |
186 | * For ex, lets say that there are three task groups, A, B and C which | 186 | * For ex, lets say that there are three task groups, A, B and C which |
187 | * have been assigned shares 1000, 2000 and 3000 respectively. Then, | 187 | * have been assigned shares 1000, 2000 and 3000 respectively. Then, |
188 | * cpu bandwidth allocated by the scheduler to task groups A, B and C | 188 | * cpu bandwidth allocated by the scheduler to task groups A, B and C |
189 | * should be: | 189 | * should be: |
190 | * | 190 | * |
191 | * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% | 191 | * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% |
192 | * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% | 192 | * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% |
193 | * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% | 193 | * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% |
194 | * | 194 | * |
195 | * The weight assigned to a task group's schedulable entities on every | 195 | * The weight assigned to a task group's schedulable entities on every |
196 | * cpu (task_group.se[a_cpu]->load.weight) is derived from the task | 196 | * cpu (task_group.se[a_cpu]->load.weight) is derived from the task |
197 | * group's shares. For ex: lets say that task group A has been | 197 | * group's shares. For ex: lets say that task group A has been |
198 | * assigned shares of 1000 and there are two CPUs in a system. Then, | 198 | * assigned shares of 1000 and there are two CPUs in a system. Then, |
199 | * | 199 | * |
200 | * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; | 200 | * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; |
201 | * | 201 | * |
202 | * Note: It's not necessary that each of a task's group schedulable | 202 | * Note: It's not necessary that each of a task's group schedulable |
203 | * entity have the same weight on all CPUs. If the group | 203 | * entity have the same weight on all CPUs. If the group |
204 | * has 2 of its tasks on CPU0 and 1 task on CPU1, then a | 204 | * has 2 of its tasks on CPU0 and 1 task on CPU1, then a |
205 | * better distribution of weight could be: | 205 | * better distribution of weight could be: |
206 | * | 206 | * |
207 | * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 | 207 | * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 |
208 | * tg_A->se[1]->load.weight = 1/3 * 2000 = 667 | 208 | * tg_A->se[1]->load.weight = 1/3 * 2000 = 667 |
209 | * | 209 | * |
210 | * rebalance_shares() is responsible for distributing the shares of a | 210 | * rebalance_shares() is responsible for distributing the shares of a |
211 | * task group like this among the group's schedulable entities across | 211 | * task group like this among the group's schedulable entities across |
212 | * cpus. | 212 | * cpus. |
213 | * | 213 | * |
214 | */ | 214 | */ |
215 | unsigned long shares; | 215 | unsigned long shares; |
216 | 216 | ||
217 | struct rcu_head rcu; | 217 | struct rcu_head rcu; |
218 | struct list_head list; | 218 | struct list_head list; |
219 | }; | 219 | }; |
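As a concrete rendering of the proportional-split arithmetic in the comment above, a hypothetical helper (not part of this commit) that reproduces the Bw() numbers:

/* Hypothetical illustration of the proportional-share arithmetic. */
static unsigned long shares_to_bw_percent(unsigned long shares,
					  unsigned long total_shares)
{
	return shares * 100 / total_shares;
}

/*
 * shares_to_bw_percent(1000, 6000) == 16  (~16.66%)
 * shares_to_bw_percent(2000, 6000) == 33  (~33.33%)
 * shares_to_bw_percent(3000, 6000) == 50
 */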
220 | 220 | ||
221 | /* Default task group's sched entity on each cpu */ | 221 | /* Default task group's sched entity on each cpu */ |
222 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 222 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
223 | /* Default task group's cfs_rq on each cpu */ | 223 | /* Default task group's cfs_rq on each cpu */ |
224 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 224 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
225 | 225 | ||
226 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 226 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
227 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 227 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
228 | 228 | ||
229 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | 229 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; |
230 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | 230 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; |
231 | 231 | ||
232 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | 232 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; |
233 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | 233 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; |
234 | 234 | ||
235 | /* task_group_mutex serializes add/remove of task groups and also changes to | 235 | /* task_group_mutex serializes add/remove of task groups and also changes to |
236 | * a task group's cpu shares. | 236 | * a task group's cpu shares. |
237 | */ | 237 | */ |
238 | static DEFINE_MUTEX(task_group_mutex); | 238 | static DEFINE_MUTEX(task_group_mutex); |
239 | 239 | ||
240 | /* doms_cur_mutex serializes access to doms_cur[] array */ | 240 | /* doms_cur_mutex serializes access to doms_cur[] array */ |
241 | static DEFINE_MUTEX(doms_cur_mutex); | 241 | static DEFINE_MUTEX(doms_cur_mutex); |
242 | 242 | ||
243 | #ifdef CONFIG_SMP | 243 | #ifdef CONFIG_SMP |
244 | /* kernel thread that runs rebalance_shares() periodically */ | 244 | /* kernel thread that runs rebalance_shares() periodically */ |
245 | static struct task_struct *lb_monitor_task; | 245 | static struct task_struct *lb_monitor_task; |
246 | static int load_balance_monitor(void *unused); | 246 | static int load_balance_monitor(void *unused); |
247 | #endif | 247 | #endif |
248 | 248 | ||
249 | static void set_se_shares(struct sched_entity *se, unsigned long shares); | 249 | static void set_se_shares(struct sched_entity *se, unsigned long shares); |
250 | 250 | ||
251 | /* Default task group. | 251 | /* Default task group. |
252 | * Every task in the system belongs to this group at bootup. | 252 | * Every task in the system belongs to this group at bootup. |
253 | */ | 253 | */ |
254 | struct task_group init_task_group = { | 254 | struct task_group init_task_group = { |
255 | .se = init_sched_entity_p, | 255 | .se = init_sched_entity_p, |
256 | .cfs_rq = init_cfs_rq_p, | 256 | .cfs_rq = init_cfs_rq_p, |
257 | 257 | ||
258 | .rt_se = init_sched_rt_entity_p, | 258 | .rt_se = init_sched_rt_entity_p, |
259 | .rt_rq = init_rt_rq_p, | 259 | .rt_rq = init_rt_rq_p, |
260 | }; | 260 | }; |
261 | 261 | ||
262 | #ifdef CONFIG_FAIR_USER_SCHED | 262 | #ifdef CONFIG_FAIR_USER_SCHED |
263 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | 263 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
264 | #else | 264 | #else |
265 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 265 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
266 | #endif | 266 | #endif |
267 | 267 | ||
268 | #define MIN_GROUP_SHARES 2 | 268 | #define MIN_GROUP_SHARES 2 |
269 | 269 | ||
270 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 270 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; |
271 | 271 | ||
272 | /* return group to which a task belongs */ | 272 | /* return group to which a task belongs */ |
273 | static inline struct task_group *task_group(struct task_struct *p) | 273 | static inline struct task_group *task_group(struct task_struct *p) |
274 | { | 274 | { |
275 | struct task_group *tg; | 275 | struct task_group *tg; |
276 | 276 | ||
277 | #ifdef CONFIG_FAIR_USER_SCHED | 277 | #ifdef CONFIG_FAIR_USER_SCHED |
278 | tg = p->user->tg; | 278 | tg = p->user->tg; |
279 | #elif defined(CONFIG_FAIR_CGROUP_SCHED) | 279 | #elif defined(CONFIG_FAIR_CGROUP_SCHED) |
280 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), | 280 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), |
281 | struct task_group, css); | 281 | struct task_group, css); |
282 | #else | 282 | #else |
283 | tg = &init_task_group; | 283 | tg = &init_task_group; |
284 | #endif | 284 | #endif |
285 | return tg; | 285 | return tg; |
286 | } | 286 | } |
287 | 287 | ||
288 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 288 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
289 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | 289 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
290 | { | 290 | { |
291 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | 291 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; |
292 | p->se.parent = task_group(p)->se[cpu]; | 292 | p->se.parent = task_group(p)->se[cpu]; |
293 | 293 | ||
294 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | 294 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; |
295 | p->rt.parent = task_group(p)->rt_se[cpu]; | 295 | p->rt.parent = task_group(p)->rt_se[cpu]; |
296 | } | 296 | } |
297 | 297 | ||
298 | static inline void lock_task_group_list(void) | 298 | static inline void lock_task_group_list(void) |
299 | { | 299 | { |
300 | mutex_lock(&task_group_mutex); | 300 | mutex_lock(&task_group_mutex); |
301 | } | 301 | } |
302 | 302 | ||
303 | static inline void unlock_task_group_list(void) | 303 | static inline void unlock_task_group_list(void) |
304 | { | 304 | { |
305 | mutex_unlock(&task_group_mutex); | 305 | mutex_unlock(&task_group_mutex); |
306 | } | 306 | } |
307 | 307 | ||
308 | static inline void lock_doms_cur(void) | 308 | static inline void lock_doms_cur(void) |
309 | { | 309 | { |
310 | mutex_lock(&doms_cur_mutex); | 310 | mutex_lock(&doms_cur_mutex); |
311 | } | 311 | } |
312 | 312 | ||
313 | static inline void unlock_doms_cur(void) | 313 | static inline void unlock_doms_cur(void) |
314 | { | 314 | { |
315 | mutex_unlock(&doms_cur_mutex); | 315 | mutex_unlock(&doms_cur_mutex); |
316 | } | 316 | } |
317 | 317 | ||
318 | #else | 318 | #else |
319 | 319 | ||
320 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 320 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
321 | static inline void lock_task_group_list(void) { } | 321 | static inline void lock_task_group_list(void) { } |
322 | static inline void unlock_task_group_list(void) { } | 322 | static inline void unlock_task_group_list(void) { } |
323 | static inline void lock_doms_cur(void) { } | 323 | static inline void lock_doms_cur(void) { } |
324 | static inline void unlock_doms_cur(void) { } | 324 | static inline void unlock_doms_cur(void) { } |
325 | 325 | ||
326 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 326 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
327 | 327 | ||
328 | /* CFS-related fields in a runqueue */ | 328 | /* CFS-related fields in a runqueue */ |
329 | struct cfs_rq { | 329 | struct cfs_rq { |
330 | struct load_weight load; | 330 | struct load_weight load; |
331 | unsigned long nr_running; | 331 | unsigned long nr_running; |
332 | 332 | ||
333 | u64 exec_clock; | 333 | u64 exec_clock; |
334 | u64 min_vruntime; | 334 | u64 min_vruntime; |
335 | 335 | ||
336 | struct rb_root tasks_timeline; | 336 | struct rb_root tasks_timeline; |
337 | struct rb_node *rb_leftmost; | 337 | struct rb_node *rb_leftmost; |
338 | struct rb_node *rb_load_balance_curr; | 338 | struct rb_node *rb_load_balance_curr; |
339 | /* 'curr' points to the currently running entity on this cfs_rq. | 339 | /* 'curr' points to the currently running entity on this cfs_rq. |
340 | * It is set to NULL otherwise (i.e. when none are currently running). | 340 | * It is set to NULL otherwise (i.e. when none are currently running). |
341 | */ | 341 | */ |
342 | struct sched_entity *curr; | 342 | struct sched_entity *curr; |
343 | 343 | ||
344 | unsigned long nr_spread_over; | 344 | unsigned long nr_spread_over; |
345 | 345 | ||
346 | #ifdef CONFIG_FAIR_GROUP_SCHED | 346 | #ifdef CONFIG_FAIR_GROUP_SCHED |
347 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 347 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
348 | 348 | ||
349 | /* | 349 | /* |
350 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 350 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
351 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 351 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities |
352 | * (like users, containers etc.) | 352 | * (like users, containers etc.) |
353 | * | 353 | * |
354 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 354 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
355 | * list is used during load balance. | 355 | * list is used during load balance. |
356 | */ | 356 | */ |
357 | struct list_head leaf_cfs_rq_list; | 357 | struct list_head leaf_cfs_rq_list; |
358 | struct task_group *tg; /* group that "owns" this runqueue */ | 358 | struct task_group *tg; /* group that "owns" this runqueue */ |
359 | #endif | 359 | #endif |
360 | }; | 360 | }; |
361 | 361 | ||
362 | /* Real-Time classes' related field in a runqueue: */ | 362 | /* Real-Time classes' related field in a runqueue: */ |
363 | struct rt_rq { | 363 | struct rt_rq { |
364 | struct rt_prio_array active; | 364 | struct rt_prio_array active; |
365 | unsigned long rt_nr_running; | 365 | unsigned long rt_nr_running; |
366 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | 366 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED |
367 | int highest_prio; /* highest queued rt task prio */ | 367 | int highest_prio; /* highest queued rt task prio */ |
368 | #endif | 368 | #endif |
369 | #ifdef CONFIG_SMP | 369 | #ifdef CONFIG_SMP |
370 | unsigned long rt_nr_migratory; | 370 | unsigned long rt_nr_migratory; |
371 | int overloaded; | 371 | int overloaded; |
372 | #endif | 372 | #endif |
373 | int rt_throttled; | 373 | int rt_throttled; |
374 | u64 rt_time; | 374 | u64 rt_time; |
375 | 375 | ||
376 | #ifdef CONFIG_FAIR_GROUP_SCHED | 376 | #ifdef CONFIG_FAIR_GROUP_SCHED |
377 | struct rq *rq; | 377 | struct rq *rq; |
378 | struct list_head leaf_rt_rq_list; | 378 | struct list_head leaf_rt_rq_list; |
379 | struct task_group *tg; | 379 | struct task_group *tg; |
380 | struct sched_rt_entity *rt_se; | 380 | struct sched_rt_entity *rt_se; |
381 | #endif | 381 | #endif |
382 | }; | 382 | }; |
383 | 383 | ||
384 | #ifdef CONFIG_SMP | 384 | #ifdef CONFIG_SMP |
385 | 385 | ||
386 | /* | 386 | /* |
387 | * We add the notion of a root-domain which will be used to define per-domain | 387 | * We add the notion of a root-domain which will be used to define per-domain |
388 | * variables. Each exclusive cpuset essentially defines an island domain by | 388 | * variables. Each exclusive cpuset essentially defines an island domain by |
389 | * fully partitioning the member cpus from any other cpuset. Whenever a new | 389 | * fully partitioning the member cpus from any other cpuset. Whenever a new |
390 | * exclusive cpuset is created, we also create and attach a new root-domain | 390 | * exclusive cpuset is created, we also create and attach a new root-domain |
391 | * object. | 391 | * object. |
392 | * | 392 | * |
393 | */ | 393 | */ |
394 | struct root_domain { | 394 | struct root_domain { |
395 | atomic_t refcount; | 395 | atomic_t refcount; |
396 | cpumask_t span; | 396 | cpumask_t span; |
397 | cpumask_t online; | 397 | cpumask_t online; |
398 | 398 | ||
399 | /* | 399 | /* |
400 | * The "RT overload" flag: it gets set if a CPU has more than | 400 | * The "RT overload" flag: it gets set if a CPU has more than |
401 | * one runnable RT task. | 401 | * one runnable RT task. |
402 | */ | 402 | */ |
403 | cpumask_t rto_mask; | 403 | cpumask_t rto_mask; |
404 | atomic_t rto_count; | 404 | atomic_t rto_count; |
405 | }; | 405 | }; |
406 | 406 | ||
407 | /* | 407 | /* |
408 | * By default the system creates a single root-domain with all cpus as | 408 | * By default the system creates a single root-domain with all cpus as |
409 | * members (mimicking the global state we have today). | 409 | * members (mimicking the global state we have today). |
410 | */ | 410 | */ |
411 | static struct root_domain def_root_domain; | 411 | static struct root_domain def_root_domain; |
412 | 412 | ||
413 | #endif | 413 | #endif |
414 | 414 | ||
415 | /* | 415 | /* |
416 | * This is the main, per-CPU runqueue data structure. | 416 | * This is the main, per-CPU runqueue data structure. |
417 | * | 417 | * |
418 | * Locking rule: those places that want to lock multiple runqueues | 418 | * Locking rule: those places that want to lock multiple runqueues |
419 | * (such as the load balancing or the thread migration code) must | 419 | * (such as the load balancing or the thread migration code) must |
420 | * order lock acquire operations by ascending &runqueue. | 420 | * order lock acquire operations by ascending &runqueue. |
421 | */ | 421 | */ |
422 | struct rq { | 422 | struct rq { |
423 | /* runqueue lock: */ | 423 | /* runqueue lock: */ |
424 | spinlock_t lock; | 424 | spinlock_t lock; |
425 | 425 | ||
426 | /* | 426 | /* |
427 | * nr_running and cpu_load should be in the same cacheline because | 427 | * nr_running and cpu_load should be in the same cacheline because |
428 | * remote CPUs use both these fields when doing load calculation. | 428 | * remote CPUs use both these fields when doing load calculation. |
429 | */ | 429 | */ |
430 | unsigned long nr_running; | 430 | unsigned long nr_running; |
431 | #define CPU_LOAD_IDX_MAX 5 | 431 | #define CPU_LOAD_IDX_MAX 5 |
432 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 432 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
433 | unsigned char idle_at_tick; | 433 | unsigned char idle_at_tick; |
434 | #ifdef CONFIG_NO_HZ | 434 | #ifdef CONFIG_NO_HZ |
435 | unsigned char in_nohz_recently; | 435 | unsigned char in_nohz_recently; |
436 | #endif | 436 | #endif |
437 | /* capture load from *all* tasks on this cpu: */ | 437 | /* capture load from *all* tasks on this cpu: */ |
438 | struct load_weight load; | 438 | struct load_weight load; |
439 | unsigned long nr_load_updates; | 439 | unsigned long nr_load_updates; |
440 | u64 nr_switches; | 440 | u64 nr_switches; |
441 | 441 | ||
442 | struct cfs_rq cfs; | 442 | struct cfs_rq cfs; |
443 | struct rt_rq rt; | 443 | struct rt_rq rt; |
444 | u64 rt_period_expire; | 444 | u64 rt_period_expire; |
445 | int rt_throttled; | 445 | int rt_throttled; |
446 | 446 | ||
447 | #ifdef CONFIG_FAIR_GROUP_SCHED | 447 | #ifdef CONFIG_FAIR_GROUP_SCHED |
448 | /* list of leaf cfs_rq on this cpu: */ | 448 | /* list of leaf cfs_rq on this cpu: */ |
449 | struct list_head leaf_cfs_rq_list; | 449 | struct list_head leaf_cfs_rq_list; |
450 | struct list_head leaf_rt_rq_list; | 450 | struct list_head leaf_rt_rq_list; |
451 | #endif | 451 | #endif |
452 | 452 | ||
453 | /* | 453 | /* |
454 | * This is part of a global counter where only the total sum | 454 | * This is part of a global counter where only the total sum |
455 | * over all CPUs matters. A task can increase this counter on | 455 | * over all CPUs matters. A task can increase this counter on |
456 | * one CPU and if it got migrated afterwards it may decrease | 456 | * one CPU and if it got migrated afterwards it may decrease |
457 | * it on another CPU. Always updated under the runqueue lock: | 457 | * it on another CPU. Always updated under the runqueue lock: |
458 | */ | 458 | */ |
459 | unsigned long nr_uninterruptible; | 459 | unsigned long nr_uninterruptible; |
460 | 460 | ||
461 | struct task_struct *curr, *idle; | 461 | struct task_struct *curr, *idle; |
462 | unsigned long next_balance; | 462 | unsigned long next_balance; |
463 | struct mm_struct *prev_mm; | 463 | struct mm_struct *prev_mm; |
464 | 464 | ||
465 | u64 clock, prev_clock_raw; | 465 | u64 clock, prev_clock_raw; |
466 | s64 clock_max_delta; | 466 | s64 clock_max_delta; |
467 | 467 | ||
468 | unsigned int clock_warps, clock_overflows, clock_underflows; | 468 | unsigned int clock_warps, clock_overflows, clock_underflows; |
469 | u64 idle_clock; | 469 | u64 idle_clock; |
470 | unsigned int clock_deep_idle_events; | 470 | unsigned int clock_deep_idle_events; |
471 | u64 tick_timestamp; | 471 | u64 tick_timestamp; |
472 | 472 | ||
473 | atomic_t nr_iowait; | 473 | atomic_t nr_iowait; |
474 | 474 | ||
475 | #ifdef CONFIG_SMP | 475 | #ifdef CONFIG_SMP |
476 | struct root_domain *rd; | 476 | struct root_domain *rd; |
477 | struct sched_domain *sd; | 477 | struct sched_domain *sd; |
478 | 478 | ||
479 | /* For active balancing */ | 479 | /* For active balancing */ |
480 | int active_balance; | 480 | int active_balance; |
481 | int push_cpu; | 481 | int push_cpu; |
482 | /* cpu of this runqueue: */ | 482 | /* cpu of this runqueue: */ |
483 | int cpu; | 483 | int cpu; |
484 | 484 | ||
485 | struct task_struct *migration_thread; | 485 | struct task_struct *migration_thread; |
486 | struct list_head migration_queue; | 486 | struct list_head migration_queue; |
487 | #endif | 487 | #endif |
488 | 488 | ||
489 | #ifdef CONFIG_SCHED_HRTICK | 489 | #ifdef CONFIG_SCHED_HRTICK |
490 | unsigned long hrtick_flags; | 490 | unsigned long hrtick_flags; |
491 | ktime_t hrtick_expire; | 491 | ktime_t hrtick_expire; |
492 | struct hrtimer hrtick_timer; | 492 | struct hrtimer hrtick_timer; |
493 | #endif | 493 | #endif |
494 | 494 | ||
495 | #ifdef CONFIG_SCHEDSTATS | 495 | #ifdef CONFIG_SCHEDSTATS |
496 | /* latency stats */ | 496 | /* latency stats */ |
497 | struct sched_info rq_sched_info; | 497 | struct sched_info rq_sched_info; |
498 | 498 | ||
499 | /* sys_sched_yield() stats */ | 499 | /* sys_sched_yield() stats */ |
500 | unsigned int yld_exp_empty; | 500 | unsigned int yld_exp_empty; |
501 | unsigned int yld_act_empty; | 501 | unsigned int yld_act_empty; |
502 | unsigned int yld_both_empty; | 502 | unsigned int yld_both_empty; |
503 | unsigned int yld_count; | 503 | unsigned int yld_count; |
504 | 504 | ||
505 | /* schedule() stats */ | 505 | /* schedule() stats */ |
506 | unsigned int sched_switch; | 506 | unsigned int sched_switch; |
507 | unsigned int sched_count; | 507 | unsigned int sched_count; |
508 | unsigned int sched_goidle; | 508 | unsigned int sched_goidle; |
509 | 509 | ||
510 | /* try_to_wake_up() stats */ | 510 | /* try_to_wake_up() stats */ |
511 | unsigned int ttwu_count; | 511 | unsigned int ttwu_count; |
512 | unsigned int ttwu_local; | 512 | unsigned int ttwu_local; |
513 | 513 | ||
514 | /* BKL stats */ | 514 | /* BKL stats */ |
515 | unsigned int bkl_count; | 515 | unsigned int bkl_count; |
516 | #endif | 516 | #endif |
517 | struct lock_class_key rq_lock_key; | 517 | struct lock_class_key rq_lock_key; |
518 | }; | 518 | }; |
519 | 519 | ||
520 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 520 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
521 | 521 | ||
522 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 522 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) |
523 | { | 523 | { |
524 | rq->curr->sched_class->check_preempt_curr(rq, p); | 524 | rq->curr->sched_class->check_preempt_curr(rq, p); |
525 | } | 525 | } |
526 | 526 | ||
527 | static inline int cpu_of(struct rq *rq) | 527 | static inline int cpu_of(struct rq *rq) |
528 | { | 528 | { |
529 | #ifdef CONFIG_SMP | 529 | #ifdef CONFIG_SMP |
530 | return rq->cpu; | 530 | return rq->cpu; |
531 | #else | 531 | #else |
532 | return 0; | 532 | return 0; |
533 | #endif | 533 | #endif |
534 | } | 534 | } |
535 | 535 | ||
536 | /* | 536 | /* |
537 | * Update the per-runqueue clock, as fine-grained as the platform can give | 537 | * Update the per-runqueue clock, as fine-grained as the platform can give |
538 | * us, but without assuming monotonicity, etc.: | 538 | * us, but without assuming monotonicity, etc.: |
539 | */ | 539 | */ |
540 | static void __update_rq_clock(struct rq *rq) | 540 | static void __update_rq_clock(struct rq *rq) |
541 | { | 541 | { |
542 | u64 prev_raw = rq->prev_clock_raw; | 542 | u64 prev_raw = rq->prev_clock_raw; |
543 | u64 now = sched_clock(); | 543 | u64 now = sched_clock(); |
544 | s64 delta = now - prev_raw; | 544 | s64 delta = now - prev_raw; |
545 | u64 clock = rq->clock; | 545 | u64 clock = rq->clock; |
546 | 546 | ||
547 | #ifdef CONFIG_SCHED_DEBUG | 547 | #ifdef CONFIG_SCHED_DEBUG |
548 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | 548 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); |
549 | #endif | 549 | #endif |
550 | /* | 550 | /* |
551 | * Protect against sched_clock() occasionally going backwards: | 551 | * Protect against sched_clock() occasionally going backwards: |
552 | */ | 552 | */ |
553 | if (unlikely(delta < 0)) { | 553 | if (unlikely(delta < 0)) { |
554 | clock++; | 554 | clock++; |
555 | rq->clock_warps++; | 555 | rq->clock_warps++; |
556 | } else { | 556 | } else { |
557 | /* | 557 | /* |
558 | * Catch too large forward jumps too: | 558 | * Catch too large forward jumps too: |
559 | */ | 559 | */ |
560 | if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { | 560 | if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { |
561 | if (clock < rq->tick_timestamp + TICK_NSEC) | 561 | if (clock < rq->tick_timestamp + TICK_NSEC) |
562 | clock = rq->tick_timestamp + TICK_NSEC; | 562 | clock = rq->tick_timestamp + TICK_NSEC; |
563 | else | 563 | else |
564 | clock++; | 564 | clock++; |
565 | rq->clock_overflows++; | 565 | rq->clock_overflows++; |
566 | } else { | 566 | } else { |
567 | if (unlikely(delta > rq->clock_max_delta)) | 567 | if (unlikely(delta > rq->clock_max_delta)) |
568 | rq->clock_max_delta = delta; | 568 | rq->clock_max_delta = delta; |
569 | clock += delta; | 569 | clock += delta; |
570 | } | 570 | } |
571 | } | 571 | } |
572 | 572 | ||
573 | rq->prev_clock_raw = now; | 573 | rq->prev_clock_raw = now; |
574 | rq->clock = clock; | 574 | rq->clock = clock; |
575 | } | 575 | } |
576 | 576 | ||
577 | static void update_rq_clock(struct rq *rq) | 577 | static void update_rq_clock(struct rq *rq) |
578 | { | 578 | { |
579 | if (likely(smp_processor_id() == cpu_of(rq))) | 579 | if (likely(smp_processor_id() == cpu_of(rq))) |
580 | __update_rq_clock(rq); | 580 | __update_rq_clock(rq); |
581 | } | 581 | } |
582 | 582 | ||
583 | /* | 583 | /* |
584 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 584 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
585 | * See detach_destroy_domains: synchronize_sched for details. | 585 | * See detach_destroy_domains: synchronize_sched for details. |
586 | * | 586 | * |
587 | * The domain tree of any CPU may only be accessed from within | 587 | * The domain tree of any CPU may only be accessed from within |
588 | * preempt-disabled sections. | 588 | * preempt-disabled sections. |
589 | */ | 589 | */ |
590 | #define for_each_domain(cpu, __sd) \ | 590 | #define for_each_domain(cpu, __sd) \ |
591 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | 591 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
592 | 592 | ||
593 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 593 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
594 | #define this_rq() (&__get_cpu_var(runqueues)) | 594 | #define this_rq() (&__get_cpu_var(runqueues)) |
595 | #define task_rq(p) cpu_rq(task_cpu(p)) | 595 | #define task_rq(p) cpu_rq(task_cpu(p)) |
596 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 596 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
597 | 597 | ||
598 | unsigned long rt_needs_cpu(int cpu) | 598 | unsigned long rt_needs_cpu(int cpu) |
599 | { | 599 | { |
600 | struct rq *rq = cpu_rq(cpu); | 600 | struct rq *rq = cpu_rq(cpu); |
601 | u64 delta; | 601 | u64 delta; |
602 | 602 | ||
603 | if (!rq->rt_throttled) | 603 | if (!rq->rt_throttled) |
604 | return 0; | 604 | return 0; |
605 | 605 | ||
606 | if (rq->clock > rq->rt_period_expire) | 606 | if (rq->clock > rq->rt_period_expire) |
607 | return 1; | 607 | return 1; |
608 | 608 | ||
609 | delta = rq->rt_period_expire - rq->clock; | 609 | delta = rq->rt_period_expire - rq->clock; |
610 | do_div(delta, NSEC_PER_SEC / HZ); | 610 | do_div(delta, NSEC_PER_SEC / HZ); |
611 | 611 | ||
612 | return (unsigned long)delta; | 612 | return (unsigned long)delta; |
613 | } | 613 | } |
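A quick worked case for the conversion above (assuming HZ == 1000, so NSEC_PER_SEC / HZ == 1,000,000 ns per jiffy):

/*
 * If the throttled runqueue's rt_period expires 2,500,000 ns from now,
 * rt_needs_cpu() returns 2,500,000 / 1,000,000 == 2 jiffies (do_div()
 * truncates); once rq->clock passes rt_period_expire it returns 1.
 */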
614 | 614 | ||
615 | /* | 615 | /* |
616 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 616 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
617 | */ | 617 | */ |
618 | #ifdef CONFIG_SCHED_DEBUG | 618 | #ifdef CONFIG_SCHED_DEBUG |
619 | # define const_debug __read_mostly | 619 | # define const_debug __read_mostly |
620 | #else | 620 | #else |
621 | # define const_debug static const | 621 | # define const_debug static const |
622 | #endif | 622 | #endif |
623 | 623 | ||
624 | /* | 624 | /* |
625 | * Debugging: various feature bits | 625 | * Debugging: various feature bits |
626 | */ | 626 | */ |
627 | enum { | 627 | enum { |
628 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | 628 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, |
629 | SCHED_FEAT_WAKEUP_PREEMPT = 2, | 629 | SCHED_FEAT_WAKEUP_PREEMPT = 2, |
630 | SCHED_FEAT_START_DEBIT = 4, | 630 | SCHED_FEAT_START_DEBIT = 4, |
631 | SCHED_FEAT_TREE_AVG = 8, | 631 | SCHED_FEAT_TREE_AVG = 8, |
632 | SCHED_FEAT_APPROX_AVG = 16, | 632 | SCHED_FEAT_APPROX_AVG = 16, |
633 | SCHED_FEAT_HRTICK = 32, | 633 | SCHED_FEAT_HRTICK = 32, |
634 | SCHED_FEAT_DOUBLE_TICK = 64, | 634 | SCHED_FEAT_DOUBLE_TICK = 64, |
635 | }; | 635 | }; |
636 | 636 | ||
637 | const_debug unsigned int sysctl_sched_features = | 637 | const_debug unsigned int sysctl_sched_features = |
638 | SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | | 638 | SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | |
639 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 639 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | |
640 | SCHED_FEAT_START_DEBIT * 1 | | 640 | SCHED_FEAT_START_DEBIT * 1 | |
641 | SCHED_FEAT_TREE_AVG * 0 | | 641 | SCHED_FEAT_TREE_AVG * 0 | |
642 | SCHED_FEAT_APPROX_AVG * 0 | | 642 | SCHED_FEAT_APPROX_AVG * 0 | |
643 | SCHED_FEAT_HRTICK * 1 | | 643 | SCHED_FEAT_HRTICK * 1 | |
644 | SCHED_FEAT_DOUBLE_TICK * 0; | 644 | SCHED_FEAT_DOUBLE_TICK * 0; |
645 | 645 | ||
646 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 646 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) |
647 | 647 | ||
648 | /* | 648 | /* |
649 | * Number of tasks to iterate in a single balance run. | 649 | * Number of tasks to iterate in a single balance run. |
650 | * Limited because this is done with IRQs disabled. | 650 | * Limited because this is done with IRQs disabled. |
651 | */ | 651 | */ |
652 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 652 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
653 | 653 | ||
654 | /* | 654 | /* |
655 | * period over which we measure -rt task cpu usage in ms. | 655 | * period over which we measure -rt task cpu usage in ms. |
656 | * default: 1s | 656 | * default: 1s |
657 | */ | 657 | */ |
658 | const_debug unsigned int sysctl_sched_rt_period = 1000; | 658 | const_debug unsigned int sysctl_sched_rt_period = 1000; |
659 | 659 | ||
660 | #define SCHED_RT_FRAC_SHIFT 16 | 660 | #define SCHED_RT_FRAC_SHIFT 16 |
661 | #define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) | 661 | #define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) |
662 | 662 | ||
663 | /* | 663 | /* |
664 | * ratio of time -rt tasks may consume. | 664 | * ratio of time -rt tasks may consume. |
665 | * default: 95% | 665 | * default: 95% |
666 | */ | 666 | */ |
667 | const_debug unsigned int sysctl_sched_rt_ratio = 62259; | 667 | const_debug unsigned int sysctl_sched_rt_ratio = 62259; |
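For the default above, the "95%" in the comment maps onto the fixed-point scale defined a few lines earlier:

/*
 * SCHED_RT_FRAC == 1 << SCHED_RT_FRAC_SHIFT == 65536, and
 * (95 * 65536) / 100 == 62259 (rounded down from 62259.2),
 * i.e. 62259 / 65536 ~= 0.95.
 */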
668 | 668 | ||
669 | /* | 669 | /* |
670 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 670 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
671 | * clock constructed from sched_clock(): | 671 | * clock constructed from sched_clock(): |
672 | */ | 672 | */ |
673 | unsigned long long cpu_clock(int cpu) | 673 | unsigned long long cpu_clock(int cpu) |
674 | { | 674 | { |
675 | unsigned long long now; | 675 | unsigned long long now; |
676 | unsigned long flags; | 676 | unsigned long flags; |
677 | struct rq *rq; | 677 | struct rq *rq; |
678 | 678 | ||
679 | local_irq_save(flags); | 679 | local_irq_save(flags); |
680 | rq = cpu_rq(cpu); | 680 | rq = cpu_rq(cpu); |
681 | /* | 681 | /* |
682 | * Only call sched_clock() if the scheduler has already been | 682 | * Only call sched_clock() if the scheduler has already been |
683 | * initialized (some code might call cpu_clock() very early): | 683 | * initialized (some code might call cpu_clock() very early): |
684 | */ | 684 | */ |
685 | if (rq->idle) | 685 | if (rq->idle) |
686 | update_rq_clock(rq); | 686 | update_rq_clock(rq); |
687 | now = rq->clock; | 687 | now = rq->clock; |
688 | local_irq_restore(flags); | 688 | local_irq_restore(flags); |
689 | 689 | ||
690 | return now; | 690 | return now; |
691 | } | 691 | } |
692 | EXPORT_SYMBOL_GPL(cpu_clock); | 692 | EXPORT_SYMBOL_GPL(cpu_clock); |
693 | 693 | ||
694 | #ifndef prepare_arch_switch | 694 | #ifndef prepare_arch_switch |
695 | # define prepare_arch_switch(next) do { } while (0) | 695 | # define prepare_arch_switch(next) do { } while (0) |
696 | #endif | 696 | #endif |
697 | #ifndef finish_arch_switch | 697 | #ifndef finish_arch_switch |
698 | # define finish_arch_switch(prev) do { } while (0) | 698 | # define finish_arch_switch(prev) do { } while (0) |
699 | #endif | 699 | #endif |
700 | 700 | ||
701 | static inline int task_current(struct rq *rq, struct task_struct *p) | 701 | static inline int task_current(struct rq *rq, struct task_struct *p) |
702 | { | 702 | { |
703 | return rq->curr == p; | 703 | return rq->curr == p; |
704 | } | 704 | } |
705 | 705 | ||
706 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 706 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
707 | static inline int task_running(struct rq *rq, struct task_struct *p) | 707 | static inline int task_running(struct rq *rq, struct task_struct *p) |
708 | { | 708 | { |
709 | return task_current(rq, p); | 709 | return task_current(rq, p); |
710 | } | 710 | } |
711 | 711 | ||
712 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 712 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
713 | { | 713 | { |
714 | } | 714 | } |
715 | 715 | ||
716 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 716 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
717 | { | 717 | { |
718 | #ifdef CONFIG_DEBUG_SPINLOCK | 718 | #ifdef CONFIG_DEBUG_SPINLOCK |
719 | /* this is a valid case when another task releases the spinlock */ | 719 | /* this is a valid case when another task releases the spinlock */ |
720 | rq->lock.owner = current; | 720 | rq->lock.owner = current; |
721 | #endif | 721 | #endif |
722 | /* | 722 | /* |
723 | * If we are tracking spinlock dependencies then we have to | 723 | * If we are tracking spinlock dependencies then we have to |
724 | * fix up the runqueue lock - which gets 'carried over' from | 724 | * fix up the runqueue lock - which gets 'carried over' from |
725 | * prev into current: | 725 | * prev into current: |
726 | */ | 726 | */ |
727 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | 727 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); |
728 | 728 | ||
729 | spin_unlock_irq(&rq->lock); | 729 | spin_unlock_irq(&rq->lock); |
730 | } | 730 | } |
731 | 731 | ||
732 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 732 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
733 | static inline int task_running(struct rq *rq, struct task_struct *p) | 733 | static inline int task_running(struct rq *rq, struct task_struct *p) |
734 | { | 734 | { |
735 | #ifdef CONFIG_SMP | 735 | #ifdef CONFIG_SMP |
736 | return p->oncpu; | 736 | return p->oncpu; |
737 | #else | 737 | #else |
738 | return task_current(rq, p); | 738 | return task_current(rq, p); |
739 | #endif | 739 | #endif |
740 | } | 740 | } |
741 | 741 | ||
742 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 742 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
743 | { | 743 | { |
744 | #ifdef CONFIG_SMP | 744 | #ifdef CONFIG_SMP |
745 | /* | 745 | /* |
746 | * We can optimise this out completely for !SMP, because the | 746 | * We can optimise this out completely for !SMP, because the |
747 | * SMP rebalancing from interrupt is the only thing that cares | 747 | * SMP rebalancing from interrupt is the only thing that cares |
748 | * here. | 748 | * here. |
749 | */ | 749 | */ |
750 | next->oncpu = 1; | 750 | next->oncpu = 1; |
751 | #endif | 751 | #endif |
752 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 752 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
753 | spin_unlock_irq(&rq->lock); | 753 | spin_unlock_irq(&rq->lock); |
754 | #else | 754 | #else |
755 | spin_unlock(&rq->lock); | 755 | spin_unlock(&rq->lock); |
756 | #endif | 756 | #endif |
757 | } | 757 | } |
758 | 758 | ||
759 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 759 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
760 | { | 760 | { |
761 | #ifdef CONFIG_SMP | 761 | #ifdef CONFIG_SMP |
762 | /* | 762 | /* |
763 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 763 | * After ->oncpu is cleared, the task can be moved to a different CPU. |
764 | * We must ensure this doesn't happen until the switch is completely | 764 | * We must ensure this doesn't happen until the switch is completely |
765 | * finished. | 765 | * finished. |
766 | */ | 766 | */ |
767 | smp_wmb(); | 767 | smp_wmb(); |
768 | prev->oncpu = 0; | 768 | prev->oncpu = 0; |
769 | #endif | 769 | #endif |
770 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 770 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
771 | local_irq_enable(); | 771 | local_irq_enable(); |
772 | #endif | 772 | #endif |
773 | } | 773 | } |
774 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 774 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
775 | 775 | ||
776 | /* | 776 | /* |
777 | * __task_rq_lock - lock the runqueue a given task resides on. | 777 | * __task_rq_lock - lock the runqueue a given task resides on. |
778 | * Must be called with interrupts disabled. | 778 | * Must be called with interrupts disabled. |
779 | */ | 779 | */ |
780 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 780 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
781 | __acquires(rq->lock) | 781 | __acquires(rq->lock) |
782 | { | 782 | { |
783 | for (;;) { | 783 | for (;;) { |
784 | struct rq *rq = task_rq(p); | 784 | struct rq *rq = task_rq(p); |
785 | spin_lock(&rq->lock); | 785 | spin_lock(&rq->lock); |
786 | if (likely(rq == task_rq(p))) | 786 | if (likely(rq == task_rq(p))) |
787 | return rq; | 787 | return rq; |
788 | spin_unlock(&rq->lock); | 788 | spin_unlock(&rq->lock); |
789 | } | 789 | } |
790 | } | 790 | } |
791 | 791 | ||
792 | /* | 792 | /* |
793 | * task_rq_lock - lock the runqueue a given task resides on and disable | 793 | * task_rq_lock - lock the runqueue a given task resides on and disable |
794 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 794 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
795 | * explicitly disabling preemption. | 795 | * explicitly disabling preemption. |
796 | */ | 796 | */ |
797 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 797 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
798 | __acquires(rq->lock) | 798 | __acquires(rq->lock) |
799 | { | 799 | { |
800 | struct rq *rq; | 800 | struct rq *rq; |
801 | 801 | ||
802 | for (;;) { | 802 | for (;;) { |
803 | local_irq_save(*flags); | 803 | local_irq_save(*flags); |
804 | rq = task_rq(p); | 804 | rq = task_rq(p); |
805 | spin_lock(&rq->lock); | 805 | spin_lock(&rq->lock); |
806 | if (likely(rq == task_rq(p))) | 806 | if (likely(rq == task_rq(p))) |
807 | return rq; | 807 | return rq; |
808 | spin_unlock_irqrestore(&rq->lock, *flags); | 808 | spin_unlock_irqrestore(&rq->lock, *flags); |
809 | } | 809 | } |
810 | } | 810 | } |
811 | 811 | ||
812 | static void __task_rq_unlock(struct rq *rq) | 812 | static void __task_rq_unlock(struct rq *rq) |
813 | __releases(rq->lock) | 813 | __releases(rq->lock) |
814 | { | 814 | { |
815 | spin_unlock(&rq->lock); | 815 | spin_unlock(&rq->lock); |
816 | } | 816 | } |
817 | 817 | ||
818 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 818 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) |
819 | __releases(rq->lock) | 819 | __releases(rq->lock) |
820 | { | 820 | { |
821 | spin_unlock_irqrestore(&rq->lock, *flags); | 821 | spin_unlock_irqrestore(&rq->lock, *flags); |
822 | } | 822 | } |
823 | 823 | ||
824 | /* | 824 | /* |
825 | * this_rq_lock - lock this runqueue and disable interrupts. | 825 | * this_rq_lock - lock this runqueue and disable interrupts. |
826 | */ | 826 | */ |
827 | static struct rq *this_rq_lock(void) | 827 | static struct rq *this_rq_lock(void) |
828 | __acquires(rq->lock) | 828 | __acquires(rq->lock) |
829 | { | 829 | { |
830 | struct rq *rq; | 830 | struct rq *rq; |
831 | 831 | ||
832 | local_irq_disable(); | 832 | local_irq_disable(); |
833 | rq = this_rq(); | 833 | rq = this_rq(); |
834 | spin_lock(&rq->lock); | 834 | spin_lock(&rq->lock); |
835 | 835 | ||
836 | return rq; | 836 | return rq; |
837 | } | 837 | } |
838 | 838 | ||
839 | /* | 839 | /* |
840 | * We are going deep-idle (irqs are disabled): | 840 | * We are going deep-idle (irqs are disabled): |
841 | */ | 841 | */ |
842 | void sched_clock_idle_sleep_event(void) | 842 | void sched_clock_idle_sleep_event(void) |
843 | { | 843 | { |
844 | struct rq *rq = cpu_rq(smp_processor_id()); | 844 | struct rq *rq = cpu_rq(smp_processor_id()); |
845 | 845 | ||
846 | spin_lock(&rq->lock); | 846 | spin_lock(&rq->lock); |
847 | __update_rq_clock(rq); | 847 | __update_rq_clock(rq); |
848 | spin_unlock(&rq->lock); | 848 | spin_unlock(&rq->lock); |
849 | rq->clock_deep_idle_events++; | 849 | rq->clock_deep_idle_events++; |
850 | } | 850 | } |
851 | EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); | 851 | EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); |
852 | 852 | ||
853 | /* | 853 | /* |
854 | * We just idled delta nanoseconds (called with irqs disabled): | 854 | * We just idled delta nanoseconds (called with irqs disabled): |
855 | */ | 855 | */ |
856 | void sched_clock_idle_wakeup_event(u64 delta_ns) | 856 | void sched_clock_idle_wakeup_event(u64 delta_ns) |
857 | { | 857 | { |
858 | struct rq *rq = cpu_rq(smp_processor_id()); | 858 | struct rq *rq = cpu_rq(smp_processor_id()); |
859 | u64 now = sched_clock(); | 859 | u64 now = sched_clock(); |
860 | 860 | ||
861 | rq->idle_clock += delta_ns; | 861 | rq->idle_clock += delta_ns; |
862 | /* | 862 | /* |
863 | * Override the previous timestamp and ignore all | 863 | * Override the previous timestamp and ignore all |
864 | * sched_clock() deltas that occurred while we idled, | 864 | * sched_clock() deltas that occurred while we idled, |
865 | * and use the PM-provided delta_ns to advance the | 865 | * and use the PM-provided delta_ns to advance the |
866 | * rq clock: | 866 | * rq clock: |
867 | */ | 867 | */ |
868 | spin_lock(&rq->lock); | 868 | spin_lock(&rq->lock); |
869 | rq->prev_clock_raw = now; | 869 | rq->prev_clock_raw = now; |
870 | rq->clock += delta_ns; | 870 | rq->clock += delta_ns; |
871 | spin_unlock(&rq->lock); | 871 | spin_unlock(&rq->lock); |
872 | touch_softlockup_watchdog(); | 872 | touch_softlockup_watchdog(); |
873 | } | 873 | } |
874 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 874 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
875 | 875 | ||
876 | static void __resched_task(struct task_struct *p, int tif_bit); | 876 | static void __resched_task(struct task_struct *p, int tif_bit); |
877 | 877 | ||
878 | static inline void resched_task(struct task_struct *p) | 878 | static inline void resched_task(struct task_struct *p) |
879 | { | 879 | { |
880 | __resched_task(p, TIF_NEED_RESCHED); | 880 | __resched_task(p, TIF_NEED_RESCHED); |
881 | } | 881 | } |
882 | 882 | ||
883 | #ifdef CONFIG_SCHED_HRTICK | 883 | #ifdef CONFIG_SCHED_HRTICK |
884 | /* | 884 | /* |
885 | * Use HR-timers to deliver accurate preemption points. | 885 | * Use HR-timers to deliver accurate preemption points. |
886 | * | 886 | * |
887 | * It's all a bit involved since we cannot program an hrtimer while holding the | 887 | * It's all a bit involved since we cannot program an hrtimer while holding the |
888 | * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a | 888 | * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a |
889 | * reschedule event. | 889 | * reschedule event. |
890 | * | 890 | * |
891 | * When we get rescheduled we reprogram the hrtick_timer outside of the | 891 | * When we get rescheduled we reprogram the hrtick_timer outside of the |
892 | * rq->lock. | 892 | * rq->lock. |
893 | */ | 893 | */ |
894 | static inline void resched_hrt(struct task_struct *p) | 894 | static inline void resched_hrt(struct task_struct *p) |
895 | { | 895 | { |
896 | __resched_task(p, TIF_HRTICK_RESCHED); | 896 | __resched_task(p, TIF_HRTICK_RESCHED); |
897 | } | 897 | } |
898 | 898 | ||
899 | static inline void resched_rq(struct rq *rq) | 899 | static inline void resched_rq(struct rq *rq) |
900 | { | 900 | { |
901 | unsigned long flags; | 901 | unsigned long flags; |
902 | 902 | ||
903 | spin_lock_irqsave(&rq->lock, flags); | 903 | spin_lock_irqsave(&rq->lock, flags); |
904 | resched_task(rq->curr); | 904 | resched_task(rq->curr); |
905 | spin_unlock_irqrestore(&rq->lock, flags); | 905 | spin_unlock_irqrestore(&rq->lock, flags); |
906 | } | 906 | } |
907 | 907 | ||
908 | enum { | 908 | enum { |
909 | HRTICK_SET, /* re-program hrtick_timer */ | 909 | HRTICK_SET, /* re-program hrtick_timer */ |
910 | HRTICK_RESET, /* not a new slice */ | 910 | HRTICK_RESET, /* not a new slice */ |
911 | }; | 911 | }; |
912 | 912 | ||
913 | /* | 913 | /* |
914 | * Use hrtick when: | 914 | * Use hrtick when: |
915 | * - enabled by features | 915 | * - enabled by features |
916 | * - hrtimer is actually high res | 916 | * - hrtimer is actually high res |
917 | */ | 917 | */ |
918 | static inline int hrtick_enabled(struct rq *rq) | 918 | static inline int hrtick_enabled(struct rq *rq) |
919 | { | 919 | { |
920 | if (!sched_feat(HRTICK)) | 920 | if (!sched_feat(HRTICK)) |
921 | return 0; | 921 | return 0; |
922 | return hrtimer_is_hres_active(&rq->hrtick_timer); | 922 | return hrtimer_is_hres_active(&rq->hrtick_timer); |
923 | } | 923 | } |
924 | 924 | ||
925 | /* | 925 | /* |
926 | * Called to set the hrtick timer state. | 926 | * Called to set the hrtick timer state. |
927 | * | 927 | * |
928 | * called with rq->lock held and irqs disabled | 928 | * called with rq->lock held and irqs disabled |
929 | */ | 929 | */ |
930 | static void hrtick_start(struct rq *rq, u64 delay, int reset) | 930 | static void hrtick_start(struct rq *rq, u64 delay, int reset) |
931 | { | 931 | { |
932 | assert_spin_locked(&rq->lock); | 932 | assert_spin_locked(&rq->lock); |
933 | 933 | ||
934 | /* | 934 | /* |
935 | * preempt at: now + delay | 935 | * preempt at: now + delay |
936 | */ | 936 | */ |
937 | rq->hrtick_expire = | 937 | rq->hrtick_expire = |
938 | ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); | 938 | ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); |
939 | /* | 939 | /* |
940 | * indicate we need to program the timer | 940 | * indicate we need to program the timer |
941 | */ | 941 | */ |
942 | __set_bit(HRTICK_SET, &rq->hrtick_flags); | 942 | __set_bit(HRTICK_SET, &rq->hrtick_flags); |
943 | if (reset) | 943 | if (reset) |
944 | __set_bit(HRTICK_RESET, &rq->hrtick_flags); | 944 | __set_bit(HRTICK_RESET, &rq->hrtick_flags); |
945 | 945 | ||
946 | /* | 946 | /* |
947 | * New slices are called from the schedule path and don't need a | 947 | * New slices are called from the schedule path and don't need a |
948 | * forced reschedule. | 948 | * forced reschedule. |
949 | */ | 949 | */ |
950 | if (reset) | 950 | if (reset) |
951 | resched_hrt(rq->curr); | 951 | resched_hrt(rq->curr); |
952 | } | 952 | } |
953 | 953 | ||
954 | static void hrtick_clear(struct rq *rq) | 954 | static void hrtick_clear(struct rq *rq) |
955 | { | 955 | { |
956 | if (hrtimer_active(&rq->hrtick_timer)) | 956 | if (hrtimer_active(&rq->hrtick_timer)) |
957 | hrtimer_cancel(&rq->hrtick_timer); | 957 | hrtimer_cancel(&rq->hrtick_timer); |
958 | } | 958 | } |
959 | 959 | ||
960 | /* | 960 | /* |
961 | * Update the timer from the possible pending state. | 961 | * Update the timer from the possible pending state. |
962 | */ | 962 | */ |
963 | static void hrtick_set(struct rq *rq) | 963 | static void hrtick_set(struct rq *rq) |
964 | { | 964 | { |
965 | ktime_t time; | 965 | ktime_t time; |
966 | int set, reset; | 966 | int set, reset; |
967 | unsigned long flags; | 967 | unsigned long flags; |
968 | 968 | ||
969 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | 969 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); |
970 | 970 | ||
971 | spin_lock_irqsave(&rq->lock, flags); | 971 | spin_lock_irqsave(&rq->lock, flags); |
972 | set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); | 972 | set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); |
973 | reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); | 973 | reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); |
974 | time = rq->hrtick_expire; | 974 | time = rq->hrtick_expire; |
975 | clear_thread_flag(TIF_HRTICK_RESCHED); | 975 | clear_thread_flag(TIF_HRTICK_RESCHED); |
976 | spin_unlock_irqrestore(&rq->lock, flags); | 976 | spin_unlock_irqrestore(&rq->lock, flags); |
977 | 977 | ||
978 | if (set) { | 978 | if (set) { |
979 | hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); | 979 | hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); |
980 | if (reset && !hrtimer_active(&rq->hrtick_timer)) | 980 | if (reset && !hrtimer_active(&rq->hrtick_timer)) |
981 | resched_rq(rq); | 981 | resched_rq(rq); |
982 | } else | 982 | } else |
983 | hrtick_clear(rq); | 983 | hrtick_clear(rq); |
984 | } | 984 | } |
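
An illustrative user-space sketch of the two-phase pattern hrtick_start()/hrtick_set() implement (hypothetical ticker type, a pthread mutex instead of rq->lock, and an arm() callback standing in for hrtimer_start()): the expiry is recorded under the lock, and the timer is only armed after the lock has been dropped.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct ticker {
        pthread_mutex_t lock;
        bool            pending;    /* plays the role of HRTICK_SET        */
        uint64_t        expire_ns;  /* plays the role of rq->hrtick_expire */
};

/* caller holds t->lock; mirrors hrtick_start() storing the request */
static void ticker_request(struct ticker *t, uint64_t expire_ns)
{
        t->expire_ns = expire_ns;
        t->pending = true;
}

/* called later without the lock held; mirrors hrtick_set() */
static void ticker_apply(struct ticker *t, void (*arm)(uint64_t))
{
        uint64_t when;
        bool pending;

        pthread_mutex_lock(&t->lock);
        pending = t->pending;
        t->pending = false;
        when = t->expire_ns;
        pthread_mutex_unlock(&t->lock);

        if (pending)
                arm(when);          /* program the timer outside the lock */
}
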
985 | 985 | ||
986 | /* | 986 | /* |
987 | * High-resolution timer tick. | 987 | * High-resolution timer tick. |
988 | * Runs from hardirq context with interrupts disabled. | 988 | * Runs from hardirq context with interrupts disabled. |
989 | */ | 989 | */ |
990 | static enum hrtimer_restart hrtick(struct hrtimer *timer) | 990 | static enum hrtimer_restart hrtick(struct hrtimer *timer) |
991 | { | 991 | { |
992 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); | 992 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); |
993 | 993 | ||
994 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | 994 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); |
995 | 995 | ||
996 | spin_lock(&rq->lock); | 996 | spin_lock(&rq->lock); |
997 | __update_rq_clock(rq); | 997 | __update_rq_clock(rq); |
998 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); | 998 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); |
999 | spin_unlock(&rq->lock); | 999 | spin_unlock(&rq->lock); |
1000 | 1000 | ||
1001 | return HRTIMER_NORESTART; | 1001 | return HRTIMER_NORESTART; |
1002 | } | 1002 | } |
1003 | 1003 | ||
1004 | static inline void init_rq_hrtick(struct rq *rq) | 1004 | static inline void init_rq_hrtick(struct rq *rq) |
1005 | { | 1005 | { |
1006 | rq->hrtick_flags = 0; | 1006 | rq->hrtick_flags = 0; |
1007 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 1007 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
1008 | rq->hrtick_timer.function = hrtick; | 1008 | rq->hrtick_timer.function = hrtick; |
1009 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | 1009 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; |
1010 | } | 1010 | } |
1011 | 1011 | ||
1012 | void hrtick_resched(void) | 1012 | void hrtick_resched(void) |
1013 | { | 1013 | { |
1014 | struct rq *rq; | 1014 | struct rq *rq; |
1015 | unsigned long flags; | 1015 | unsigned long flags; |
1016 | 1016 | ||
1017 | if (!test_thread_flag(TIF_HRTICK_RESCHED)) | 1017 | if (!test_thread_flag(TIF_HRTICK_RESCHED)) |
1018 | return; | 1018 | return; |
1019 | 1019 | ||
1020 | local_irq_save(flags); | 1020 | local_irq_save(flags); |
1021 | rq = cpu_rq(smp_processor_id()); | 1021 | rq = cpu_rq(smp_processor_id()); |
1022 | hrtick_set(rq); | 1022 | hrtick_set(rq); |
1023 | local_irq_restore(flags); | 1023 | local_irq_restore(flags); |
1024 | } | 1024 | } |
1025 | #else | 1025 | #else |
1026 | static inline void hrtick_clear(struct rq *rq) | 1026 | static inline void hrtick_clear(struct rq *rq) |
1027 | { | 1027 | { |
1028 | } | 1028 | } |
1029 | 1029 | ||
1030 | static inline void hrtick_set(struct rq *rq) | 1030 | static inline void hrtick_set(struct rq *rq) |
1031 | { | 1031 | { |
1032 | } | 1032 | } |
1033 | 1033 | ||
1034 | static inline void init_rq_hrtick(struct rq *rq) | 1034 | static inline void init_rq_hrtick(struct rq *rq) |
1035 | { | 1035 | { |
1036 | } | 1036 | } |
1037 | 1037 | ||
1038 | void hrtick_resched(void) | 1038 | void hrtick_resched(void) |
1039 | { | 1039 | { |
1040 | } | 1040 | } |
1041 | #endif | 1041 | #endif |
1042 | 1042 | ||
1043 | /* | 1043 | /* |
1044 | * resched_task - mark a task 'to be rescheduled now'. | 1044 | * resched_task - mark a task 'to be rescheduled now'. |
1045 | * | 1045 | * |
1046 | * On UP this means the setting of the need_resched flag, on SMP it | 1046 | * On UP this means the setting of the need_resched flag, on SMP it |
1047 | * might also involve a cross-CPU call to trigger the scheduler on | 1047 | * might also involve a cross-CPU call to trigger the scheduler on |
1048 | * the target CPU. | 1048 | * the target CPU. |
1049 | */ | 1049 | */ |
1050 | #ifdef CONFIG_SMP | 1050 | #ifdef CONFIG_SMP |
1051 | 1051 | ||
1052 | #ifndef tsk_is_polling | 1052 | #ifndef tsk_is_polling |
1053 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 1053 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
1054 | #endif | 1054 | #endif |
1055 | 1055 | ||
1056 | static void __resched_task(struct task_struct *p, int tif_bit) | 1056 | static void __resched_task(struct task_struct *p, int tif_bit) |
1057 | { | 1057 | { |
1058 | int cpu; | 1058 | int cpu; |
1059 | 1059 | ||
1060 | assert_spin_locked(&task_rq(p)->lock); | 1060 | assert_spin_locked(&task_rq(p)->lock); |
1061 | 1061 | ||
1062 | if (unlikely(test_tsk_thread_flag(p, tif_bit))) | 1062 | if (unlikely(test_tsk_thread_flag(p, tif_bit))) |
1063 | return; | 1063 | return; |
1064 | 1064 | ||
1065 | set_tsk_thread_flag(p, tif_bit); | 1065 | set_tsk_thread_flag(p, tif_bit); |
1066 | 1066 | ||
1067 | cpu = task_cpu(p); | 1067 | cpu = task_cpu(p); |
1068 | if (cpu == smp_processor_id()) | 1068 | if (cpu == smp_processor_id()) |
1069 | return; | 1069 | return; |
1070 | 1070 | ||
1071 | /* NEED_RESCHED must be visible before we test polling */ | 1071 | /* NEED_RESCHED must be visible before we test polling */ |
1072 | smp_mb(); | 1072 | smp_mb(); |
1073 | if (!tsk_is_polling(p)) | 1073 | if (!tsk_is_polling(p)) |
1074 | smp_send_reschedule(cpu); | 1074 | smp_send_reschedule(cpu); |
1075 | } | 1075 | } |
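
A hypothetical user-space rendering of the ordering __resched_task() depends on, using C11 atomics in place of set_tsk_thread_flag()/smp_mb()/tsk_is_polling(): the resched flag must be globally visible before the polling flag is examined, otherwise the notification could be skipped while the remote side also fails to see the flag.

#include <stdatomic.h>
#include <stdbool.h>

struct remote_cpu {
        atomic_bool need_resched;   /* stands in for TIF_NEED_RESCHED   */
        atomic_bool polling;        /* stands in for TIF_POLLING_NRFLAG */
};

static void resched_remote(struct remote_cpu *r, void (*send_ipi)(void))
{
        atomic_store(&r->need_resched, true);
        /* full fence, analogous to smp_mb(): publish the flag first */
        atomic_thread_fence(memory_order_seq_cst);
        if (!atomic_load(&r->polling))
                send_ipi();         /* remote is not polling: kick it */
}
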
1076 | 1076 | ||
1077 | static void resched_cpu(int cpu) | 1077 | static void resched_cpu(int cpu) |
1078 | { | 1078 | { |
1079 | struct rq *rq = cpu_rq(cpu); | 1079 | struct rq *rq = cpu_rq(cpu); |
1080 | unsigned long flags; | 1080 | unsigned long flags; |
1081 | 1081 | ||
1082 | if (!spin_trylock_irqsave(&rq->lock, flags)) | 1082 | if (!spin_trylock_irqsave(&rq->lock, flags)) |
1083 | return; | 1083 | return; |
1084 | resched_task(cpu_curr(cpu)); | 1084 | resched_task(cpu_curr(cpu)); |
1085 | spin_unlock_irqrestore(&rq->lock, flags); | 1085 | spin_unlock_irqrestore(&rq->lock, flags); |
1086 | } | 1086 | } |
1087 | #else | 1087 | #else |
1088 | static void __resched_task(struct task_struct *p, int tif_bit) | 1088 | static void __resched_task(struct task_struct *p, int tif_bit) |
1089 | { | 1089 | { |
1090 | assert_spin_locked(&task_rq(p)->lock); | 1090 | assert_spin_locked(&task_rq(p)->lock); |
1091 | set_tsk_thread_flag(p, tif_bit); | 1091 | set_tsk_thread_flag(p, tif_bit); |
1092 | } | 1092 | } |
1093 | #endif | 1093 | #endif |
1094 | 1094 | ||
1095 | #if BITS_PER_LONG == 32 | 1095 | #if BITS_PER_LONG == 32 |
1096 | # define WMULT_CONST (~0UL) | 1096 | # define WMULT_CONST (~0UL) |
1097 | #else | 1097 | #else |
1098 | # define WMULT_CONST (1UL << 32) | 1098 | # define WMULT_CONST (1UL << 32) |
1099 | #endif | 1099 | #endif |
1100 | 1100 | ||
1101 | #define WMULT_SHIFT 32 | 1101 | #define WMULT_SHIFT 32 |
1102 | 1102 | ||
1103 | /* | 1103 | /* |
1104 | * Shift right and round: | 1104 | * Shift right and round: |
1105 | */ | 1105 | */ |
1106 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | 1106 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) |
1107 | 1107 | ||
1108 | static unsigned long | 1108 | static unsigned long |
1109 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 1109 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
1110 | struct load_weight *lw) | 1110 | struct load_weight *lw) |
1111 | { | 1111 | { |
1112 | u64 tmp; | 1112 | u64 tmp; |
1113 | 1113 | ||
1114 | if (unlikely(!lw->inv_weight)) | 1114 | if (unlikely(!lw->inv_weight)) |
1115 | lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; | 1115 | lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; |
1116 | 1116 | ||
1117 | tmp = (u64)delta_exec * weight; | 1117 | tmp = (u64)delta_exec * weight; |
1118 | /* | 1118 | /* |
1119 | * Check whether we'd overflow the 64-bit multiplication: | 1119 | * Check whether we'd overflow the 64-bit multiplication: |
1120 | */ | 1120 | */ |
1121 | if (unlikely(tmp > WMULT_CONST)) | 1121 | if (unlikely(tmp > WMULT_CONST)) |
1122 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | 1122 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, |
1123 | WMULT_SHIFT/2); | 1123 | WMULT_SHIFT/2); |
1124 | else | 1124 | else |
1125 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | 1125 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); |
1126 | 1126 | ||
1127 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 1127 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
1128 | } | 1128 | } |
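
A rough user-space check of the fixed-point arithmetic above (illustrative only; it assumes NICE_0_LOAD == 1024 and borrows the nice-1 weight of 820 from the prio_to_weight[] table further down): delta * weight / lw->weight is approximated by multiplying with the precomputed inverse 2^32 / lw->weight and shifting right with rounding, matching the non-overflow branch of calc_delta_mine().

#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST (1ULL << 32)
#define WMULT_SHIFT 32
#define SRR(x, y)   (((x) + (1ULL << ((y) - 1))) >> (y))

int main(void)
{
        uint64_t delta = 3000000;           /* 3ms of runtime, in ns       */
        uint64_t weight = 1024;             /* NICE_0_LOAD                 */
        uint64_t lw_weight = 1024 + 820;    /* queue load: nice 0 + nice 1 */
        uint64_t inv = (WMULT_CONST - lw_weight / 2) / lw_weight + 1;

        /* delta * weight (~3.07e9) stays below WMULT_CONST here, so this
         * follows the simple branch of calc_delta_mine() */
        uint64_t approx = SRR(delta * weight * inv, WMULT_SHIFT);
        uint64_t exact  = delta * weight / lw_weight;

        printf("approx=%llu exact=%llu\n",  /* the two differ by at most ~1 ns */
               (unsigned long long)approx, (unsigned long long)exact);
        return 0;
}
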
1129 | 1129 | ||
1130 | static inline unsigned long | 1130 | static inline unsigned long |
1131 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | 1131 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) |
1132 | { | 1132 | { |
1133 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | 1133 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); |
1134 | } | 1134 | } |
1135 | 1135 | ||
1136 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 1136 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
1137 | { | 1137 | { |
1138 | lw->weight += inc; | 1138 | lw->weight += inc; |
1139 | } | 1139 | } |
1140 | 1140 | ||
1141 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | 1141 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) |
1142 | { | 1142 | { |
1143 | lw->weight -= dec; | 1143 | lw->weight -= dec; |
1144 | } | 1144 | } |
1145 | 1145 | ||
1146 | /* | 1146 | /* |
1147 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1147 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
1148 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1148 | * of tasks with abnormal "nice" values across CPUs the contribution that |
1149 | * each task makes to its run queue's load is weighted according to its | 1149 | * each task makes to its run queue's load is weighted according to its |
1150 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | 1150 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a |
1151 | * scaled version of the new time slice allocation that they receive on time | 1151 | * scaled version of the new time slice allocation that they receive on time |
1152 | * slice expiry etc. | 1152 | * slice expiry etc. |
1153 | */ | 1153 | */ |
1154 | 1154 | ||
1155 | #define WEIGHT_IDLEPRIO 2 | 1155 | #define WEIGHT_IDLEPRIO 2 |
1156 | #define WMULT_IDLEPRIO (1 << 31) | 1156 | #define WMULT_IDLEPRIO (1 << 31) |
1157 | 1157 | ||
1158 | /* | 1158 | /* |
1159 | * Nice levels are multiplicative, with a gentle 10% change for every | 1159 | * Nice levels are multiplicative, with a gentle 10% change for every |
1160 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | 1160 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to |
1161 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | 1161 | * nice 1, it will get ~10% less CPU time than another CPU-bound task |
1162 | * that remained on nice 0. | 1162 | * that remained on nice 0. |
1163 | * | 1163 | * |
1164 | * The "10% effect" is relative and cumulative: from _any_ nice level, | 1164 | * The "10% effect" is relative and cumulative: from _any_ nice level, |
1165 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | 1165 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level |
1166 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | 1166 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. |
1167 | * If a task goes up by ~10% and another task goes down by ~10% then | 1167 | * If a task goes up by ~10% and another task goes down by ~10% then |
1168 | * the relative distance between them is ~25%.) | 1168 | * the relative distance between them is ~25%.) |
1169 | */ | 1169 | */ |
1170 | static const int prio_to_weight[40] = { | 1170 | static const int prio_to_weight[40] = { |
1171 | /* -20 */ 88761, 71755, 56483, 46273, 36291, | 1171 | /* -20 */ 88761, 71755, 56483, 46273, 36291, |
1172 | /* -15 */ 29154, 23254, 18705, 14949, 11916, | 1172 | /* -15 */ 29154, 23254, 18705, 14949, 11916, |
1173 | /* -10 */ 9548, 7620, 6100, 4904, 3906, | 1173 | /* -10 */ 9548, 7620, 6100, 4904, 3906, |
1174 | /* -5 */ 3121, 2501, 1991, 1586, 1277, | 1174 | /* -5 */ 3121, 2501, 1991, 1586, 1277, |
1175 | /* 0 */ 1024, 820, 655, 526, 423, | 1175 | /* 0 */ 1024, 820, 655, 526, 423, |
1176 | /* 5 */ 335, 272, 215, 172, 137, | 1176 | /* 5 */ 335, 272, 215, 172, 137, |
1177 | /* 10 */ 110, 87, 70, 56, 45, | 1177 | /* 10 */ 110, 87, 70, 56, 45, |
1178 | /* 15 */ 36, 29, 23, 18, 15, | 1178 | /* 15 */ 36, 29, 23, 18, 15, |
1179 | }; | 1179 | }; |
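
Worked example of the ~10% rule using the table above: nice 0 has weight 1024 and nice 1 has weight 820, a ratio of about 1.25. With one CPU-bound task at each level sharing a CPU, the nice-0 task receives 1024/(1024+820), roughly 55.5%, and the nice-1 task 820/(1024+820), roughly 44.5%, of the CPU time: about ten percentage points apart, with the ~25% relative distance the comment describes.
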
1180 | 1180 | ||
1181 | /* | 1181 | /* |
1182 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | 1182 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. |
1183 | * | 1183 | * |
1184 | * In cases where the weight does not change often, we can use the | 1184 | * In cases where the weight does not change often, we can use the |
1185 | * precalculated inverse to speed up arithmetic by turning divisions | 1185 | * precalculated inverse to speed up arithmetic by turning divisions |
1186 | * into multiplications: | 1186 | * into multiplications: |
1187 | */ | 1187 | */ |
1188 | static const u32 prio_to_wmult[40] = { | 1188 | static const u32 prio_to_wmult[40] = { |
1189 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | 1189 | /* -20 */ 48388, 59856, 76040, 92818, 118348, |
1190 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | 1190 | /* -15 */ 147320, 184698, 229616, 287308, 360437, |
1191 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | 1191 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, |
1192 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | 1192 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, |
1193 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | 1193 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, |
1194 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | 1194 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, |
1195 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | 1195 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, |
1196 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 1196 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
1197 | }; | 1197 | }; |
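
For example, the nice-0 entry above is 4194304 = 2^32 / 1024, the inverse of the nice-0 weight 1024, so calc_delta_mine() can replace a division by the weight with a multiplication by this value followed by a 32-bit right shift. Likewise the nice -20 entry, 48388, is 2^32 / 88761 rounded down.
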
1198 | 1198 | ||
1199 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); | 1199 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); |
1200 | 1200 | ||
1201 | /* | 1201 | /* |
1202 | * runqueue iterator, to support SMP load-balancing between different | 1202 | * runqueue iterator, to support SMP load-balancing between different |
1203 | * scheduling classes, without having to expose their internal data | 1203 | * scheduling classes, without having to expose their internal data |
1204 | * structures to the load-balancing proper: | 1204 | * structures to the load-balancing proper: |
1205 | */ | 1205 | */ |
1206 | struct rq_iterator { | 1206 | struct rq_iterator { |
1207 | void *arg; | 1207 | void *arg; |
1208 | struct task_struct *(*start)(void *); | 1208 | struct task_struct *(*start)(void *); |
1209 | struct task_struct *(*next)(void *); | 1209 | struct task_struct *(*next)(void *); |
1210 | }; | 1210 | }; |
1211 | 1211 | ||
1212 | #ifdef CONFIG_SMP | 1212 | #ifdef CONFIG_SMP |
1213 | static unsigned long | 1213 | static unsigned long |
1214 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1214 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1215 | unsigned long max_load_move, struct sched_domain *sd, | 1215 | unsigned long max_load_move, struct sched_domain *sd, |
1216 | enum cpu_idle_type idle, int *all_pinned, | 1216 | enum cpu_idle_type idle, int *all_pinned, |
1217 | int *this_best_prio, struct rq_iterator *iterator); | 1217 | int *this_best_prio, struct rq_iterator *iterator); |
1218 | 1218 | ||
1219 | static int | 1219 | static int |
1220 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1220 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1221 | struct sched_domain *sd, enum cpu_idle_type idle, | 1221 | struct sched_domain *sd, enum cpu_idle_type idle, |
1222 | struct rq_iterator *iterator); | 1222 | struct rq_iterator *iterator); |
1223 | #endif | 1223 | #endif |
1224 | 1224 | ||
1225 | #ifdef CONFIG_CGROUP_CPUACCT | 1225 | #ifdef CONFIG_CGROUP_CPUACCT |
1226 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | 1226 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime); |
1227 | #else | 1227 | #else |
1228 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1228 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
1229 | #endif | 1229 | #endif |
1230 | 1230 | ||
1231 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | 1231 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) |
1232 | { | 1232 | { |
1233 | update_load_add(&rq->load, load); | 1233 | update_load_add(&rq->load, load); |
1234 | } | 1234 | } |
1235 | 1235 | ||
1236 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | 1236 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) |
1237 | { | 1237 | { |
1238 | update_load_sub(&rq->load, load); | 1238 | update_load_sub(&rq->load, load); |
1239 | } | 1239 | } |
1240 | 1240 | ||
1241 | #ifdef CONFIG_SMP | 1241 | #ifdef CONFIG_SMP |
1242 | static unsigned long source_load(int cpu, int type); | 1242 | static unsigned long source_load(int cpu, int type); |
1243 | static unsigned long target_load(int cpu, int type); | 1243 | static unsigned long target_load(int cpu, int type); |
1244 | static unsigned long cpu_avg_load_per_task(int cpu); | 1244 | static unsigned long cpu_avg_load_per_task(int cpu); |
1245 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1245 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1246 | #endif /* CONFIG_SMP */ | 1246 | #endif /* CONFIG_SMP */ |
1247 | 1247 | ||
1248 | #include "sched_stats.h" | 1248 | #include "sched_stats.h" |
1249 | #include "sched_idletask.c" | 1249 | #include "sched_idletask.c" |
1250 | #include "sched_fair.c" | 1250 | #include "sched_fair.c" |
1251 | #include "sched_rt.c" | 1251 | #include "sched_rt.c" |
1252 | #ifdef CONFIG_SCHED_DEBUG | 1252 | #ifdef CONFIG_SCHED_DEBUG |
1253 | # include "sched_debug.c" | 1253 | # include "sched_debug.c" |
1254 | #endif | 1254 | #endif |
1255 | 1255 | ||
1256 | #define sched_class_highest (&rt_sched_class) | 1256 | #define sched_class_highest (&rt_sched_class) |
1257 | 1257 | ||
1258 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | 1258 | static void inc_nr_running(struct task_struct *p, struct rq *rq) |
1259 | { | 1259 | { |
1260 | rq->nr_running++; | 1260 | rq->nr_running++; |
1261 | } | 1261 | } |
1262 | 1262 | ||
1263 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1263 | static void dec_nr_running(struct task_struct *p, struct rq *rq) |
1264 | { | 1264 | { |
1265 | rq->nr_running--; | 1265 | rq->nr_running--; |
1266 | } | 1266 | } |
1267 | 1267 | ||
1268 | static void set_load_weight(struct task_struct *p) | 1268 | static void set_load_weight(struct task_struct *p) |
1269 | { | 1269 | { |
1270 | if (task_has_rt_policy(p)) { | 1270 | if (task_has_rt_policy(p)) { |
1271 | p->se.load.weight = prio_to_weight[0] * 2; | 1271 | p->se.load.weight = prio_to_weight[0] * 2; |
1272 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; | 1272 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; |
1273 | return; | 1273 | return; |
1274 | } | 1274 | } |
1275 | 1275 | ||
1276 | /* | 1276 | /* |
1277 | * SCHED_IDLE tasks get minimal weight: | 1277 | * SCHED_IDLE tasks get minimal weight: |
1278 | */ | 1278 | */ |
1279 | if (p->policy == SCHED_IDLE) { | 1279 | if (p->policy == SCHED_IDLE) { |
1280 | p->se.load.weight = WEIGHT_IDLEPRIO; | 1280 | p->se.load.weight = WEIGHT_IDLEPRIO; |
1281 | p->se.load.inv_weight = WMULT_IDLEPRIO; | 1281 | p->se.load.inv_weight = WMULT_IDLEPRIO; |
1282 | return; | 1282 | return; |
1283 | } | 1283 | } |
1284 | 1284 | ||
1285 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; | 1285 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; |
1286 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1286 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; |
1287 | } | 1287 | } |
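
As an illustration of the table indexing: static_prio spans MAX_RT_PRIO (nice -20) through MAX_RT_PRIO + 39 (nice +19), so p->static_prio - MAX_RT_PRIO selects entries 0..39 of prio_to_weight[]/prio_to_wmult[]. With the usual MAX_RT_PRIO of 100, a nice-0 task has static_prio 120 and lands on index 20, i.e. weight 1024 and inv_weight 4194304.
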
1288 | 1288 | ||
1289 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) | 1289 | static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) |
1290 | { | 1290 | { |
1291 | sched_info_queued(p); | 1291 | sched_info_queued(p); |
1292 | p->sched_class->enqueue_task(rq, p, wakeup); | 1292 | p->sched_class->enqueue_task(rq, p, wakeup); |
1293 | p->se.on_rq = 1; | 1293 | p->se.on_rq = 1; |
1294 | } | 1294 | } |
1295 | 1295 | ||
1296 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) | 1296 | static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) |
1297 | { | 1297 | { |
1298 | p->sched_class->dequeue_task(rq, p, sleep); | 1298 | p->sched_class->dequeue_task(rq, p, sleep); |
1299 | p->se.on_rq = 0; | 1299 | p->se.on_rq = 0; |
1300 | } | 1300 | } |
1301 | 1301 | ||
1302 | /* | 1302 | /* |
1303 | * __normal_prio - return the priority that is based on the static prio | 1303 | * __normal_prio - return the priority that is based on the static prio |
1304 | */ | 1304 | */ |
1305 | static inline int __normal_prio(struct task_struct *p) | 1305 | static inline int __normal_prio(struct task_struct *p) |
1306 | { | 1306 | { |
1307 | return p->static_prio; | 1307 | return p->static_prio; |
1308 | } | 1308 | } |
1309 | 1309 | ||
1310 | /* | 1310 | /* |
1311 | * Calculate the expected normal priority: i.e. priority | 1311 | * Calculate the expected normal priority: i.e. priority |
1312 | * without taking RT-inheritance into account. Might be | 1312 | * without taking RT-inheritance into account. Might be |
1313 | * boosted by interactivity modifiers. Changes upon fork, | 1313 | * boosted by interactivity modifiers. Changes upon fork, |
1314 | * setprio syscalls, and whenever the interactivity | 1314 | * setprio syscalls, and whenever the interactivity |
1315 | * estimator recalculates. | 1315 | * estimator recalculates. |
1316 | */ | 1316 | */ |
1317 | static inline int normal_prio(struct task_struct *p) | 1317 | static inline int normal_prio(struct task_struct *p) |
1318 | { | 1318 | { |
1319 | int prio; | 1319 | int prio; |
1320 | 1320 | ||
1321 | if (task_has_rt_policy(p)) | 1321 | if (task_has_rt_policy(p)) |
1322 | prio = MAX_RT_PRIO-1 - p->rt_priority; | 1322 | prio = MAX_RT_PRIO-1 - p->rt_priority; |
1323 | else | 1323 | else |
1324 | prio = __normal_prio(p); | 1324 | prio = __normal_prio(p); |
1325 | return prio; | 1325 | return prio; |
1326 | } | 1326 | } |
1327 | 1327 | ||
1328 | /* | 1328 | /* |
1329 | * Calculate the current priority, i.e. the priority | 1329 | * Calculate the current priority, i.e. the priority |
1330 | * taken into account by the scheduler. This value might | 1330 | * taken into account by the scheduler. This value might |
1331 | * be boosted by RT tasks, or might be boosted by | 1331 | * be boosted by RT tasks, or might be boosted by |
1332 | * interactivity modifiers. Will be RT if the task got | 1332 | * interactivity modifiers. Will be RT if the task got |
1333 | * RT-boosted. If not then it returns p->normal_prio. | 1333 | * RT-boosted. If not then it returns p->normal_prio. |
1334 | */ | 1334 | */ |
1335 | static int effective_prio(struct task_struct *p) | 1335 | static int effective_prio(struct task_struct *p) |
1336 | { | 1336 | { |
1337 | p->normal_prio = normal_prio(p); | 1337 | p->normal_prio = normal_prio(p); |
1338 | /* | 1338 | /* |
1339 | * If we are RT tasks or we were boosted to RT priority, | 1339 | * If we are RT tasks or we were boosted to RT priority, |
1340 | * keep the priority unchanged. Otherwise, update priority | 1340 | * keep the priority unchanged. Otherwise, update priority |
1341 | * to the normal priority: | 1341 | * to the normal priority: |
1342 | */ | 1342 | */ |
1343 | if (!rt_prio(p->prio)) | 1343 | if (!rt_prio(p->prio)) |
1344 | return p->normal_prio; | 1344 | return p->normal_prio; |
1345 | return p->prio; | 1345 | return p->prio; |
1346 | } | 1346 | } |
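
A worked example, assuming the usual MAX_RT_PRIO of 100: a SCHED_FIFO task with rt_priority 50 gets normal_prio(p) = 100 - 1 - 50 = 49, while a nice-0 SCHED_NORMAL task keeps its static_prio of 120. If that normal task is later priority-inheritance boosted into the RT range, effective_prio() leaves the boosted p->prio in place rather than dropping it back to 120.
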
1347 | 1347 | ||
1348 | /* | 1348 | /* |
1349 | * activate_task - move a task to the runqueue. | 1349 | * activate_task - move a task to the runqueue. |
1350 | */ | 1350 | */ |
1351 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | 1351 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) |
1352 | { | 1352 | { |
1353 | if (p->state == TASK_UNINTERRUPTIBLE) | 1353 | if (p->state == TASK_UNINTERRUPTIBLE) |
1354 | rq->nr_uninterruptible--; | 1354 | rq->nr_uninterruptible--; |
1355 | 1355 | ||
1356 | enqueue_task(rq, p, wakeup); | 1356 | enqueue_task(rq, p, wakeup); |
1357 | inc_nr_running(p, rq); | 1357 | inc_nr_running(p, rq); |
1358 | } | 1358 | } |
1359 | 1359 | ||
1360 | /* | 1360 | /* |
1361 | * deactivate_task - remove a task from the runqueue. | 1361 | * deactivate_task - remove a task from the runqueue. |
1362 | */ | 1362 | */ |
1363 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | 1363 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
1364 | { | 1364 | { |
1365 | if (p->state == TASK_UNINTERRUPTIBLE) | 1365 | if (p->state == TASK_UNINTERRUPTIBLE) |
1366 | rq->nr_uninterruptible++; | 1366 | rq->nr_uninterruptible++; |
1367 | 1367 | ||
1368 | dequeue_task(rq, p, sleep); | 1368 | dequeue_task(rq, p, sleep); |
1369 | dec_nr_running(p, rq); | 1369 | dec_nr_running(p, rq); |
1370 | } | 1370 | } |
1371 | 1371 | ||
1372 | /** | 1372 | /** |
1373 | * task_curr - is this task currently executing on a CPU? | 1373 | * task_curr - is this task currently executing on a CPU? |
1374 | * @p: the task in question. | 1374 | * @p: the task in question. |
1375 | */ | 1375 | */ |
1376 | inline int task_curr(const struct task_struct *p) | 1376 | inline int task_curr(const struct task_struct *p) |
1377 | { | 1377 | { |
1378 | return cpu_curr(task_cpu(p)) == p; | 1378 | return cpu_curr(task_cpu(p)) == p; |
1379 | } | 1379 | } |
1380 | 1380 | ||
1381 | /* Used instead of source_load when we know the type == 0 */ | 1381 | /* Used instead of source_load when we know the type == 0 */ |
1382 | unsigned long weighted_cpuload(const int cpu) | 1382 | unsigned long weighted_cpuload(const int cpu) |
1383 | { | 1383 | { |
1384 | return cpu_rq(cpu)->load.weight; | 1384 | return cpu_rq(cpu)->load.weight; |
1385 | } | 1385 | } |
1386 | 1386 | ||
1387 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1387 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1388 | { | 1388 | { |
1389 | set_task_rq(p, cpu); | 1389 | set_task_rq(p, cpu); |
1390 | #ifdef CONFIG_SMP | 1390 | #ifdef CONFIG_SMP |
1391 | /* | 1391 | /* |
1392 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | 1392 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
1393 | * successfully executed on another CPU. We must ensure that updates of | 1393 | * successfully executed on another CPU. We must ensure that updates of |
1394 | * per-task data have been completed by this moment. | 1394 | * per-task data have been completed by this moment. |
1395 | */ | 1395 | */ |
1396 | smp_wmb(); | 1396 | smp_wmb(); |
1397 | task_thread_info(p)->cpu = cpu; | 1397 | task_thread_info(p)->cpu = cpu; |
1398 | #endif | 1398 | #endif |
1399 | } | 1399 | } |
1400 | 1400 | ||
1401 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 1401 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
1402 | const struct sched_class *prev_class, | 1402 | const struct sched_class *prev_class, |
1403 | int oldprio, int running) | 1403 | int oldprio, int running) |
1404 | { | 1404 | { |
1405 | if (prev_class != p->sched_class) { | 1405 | if (prev_class != p->sched_class) { |
1406 | if (prev_class->switched_from) | 1406 | if (prev_class->switched_from) |
1407 | prev_class->switched_from(rq, p, running); | 1407 | prev_class->switched_from(rq, p, running); |
1408 | p->sched_class->switched_to(rq, p, running); | 1408 | p->sched_class->switched_to(rq, p, running); |
1409 | } else | 1409 | } else |
1410 | p->sched_class->prio_changed(rq, p, oldprio, running); | 1410 | p->sched_class->prio_changed(rq, p, oldprio, running); |
1411 | } | 1411 | } |
1412 | 1412 | ||
1413 | #ifdef CONFIG_SMP | 1413 | #ifdef CONFIG_SMP |
1414 | 1414 | ||
1415 | /* | 1415 | /* |
1416 | * Is this task likely cache-hot: | 1416 | * Is this task likely cache-hot: |
1417 | */ | 1417 | */ |
1418 | static int | 1418 | static int |
1419 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 1419 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) |
1420 | { | 1420 | { |
1421 | s64 delta; | 1421 | s64 delta; |
1422 | 1422 | ||
1423 | if (p->sched_class != &fair_sched_class) | 1423 | if (p->sched_class != &fair_sched_class) |
1424 | return 0; | 1424 | return 0; |
1425 | 1425 | ||
1426 | if (sysctl_sched_migration_cost == -1) | 1426 | if (sysctl_sched_migration_cost == -1) |
1427 | return 1; | 1427 | return 1; |
1428 | if (sysctl_sched_migration_cost == 0) | 1428 | if (sysctl_sched_migration_cost == 0) |
1429 | return 0; | 1429 | return 0; |
1430 | 1430 | ||
1431 | delta = now - p->se.exec_start; | 1431 | delta = now - p->se.exec_start; |
1432 | 1432 | ||
1433 | return delta < (s64)sysctl_sched_migration_cost; | 1433 | return delta < (s64)sysctl_sched_migration_cost; |
1434 | } | 1434 | } |
1435 | 1435 | ||
1436 | 1436 | ||
1437 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1437 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
1438 | { | 1438 | { |
1439 | int old_cpu = task_cpu(p); | 1439 | int old_cpu = task_cpu(p); |
1440 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | 1440 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); |
1441 | struct cfs_rq *old_cfsrq = task_cfs_rq(p), | 1441 | struct cfs_rq *old_cfsrq = task_cfs_rq(p), |
1442 | *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); | 1442 | *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); |
1443 | u64 clock_offset; | 1443 | u64 clock_offset; |
1444 | 1444 | ||
1445 | clock_offset = old_rq->clock - new_rq->clock; | 1445 | clock_offset = old_rq->clock - new_rq->clock; |
1446 | 1446 | ||
1447 | #ifdef CONFIG_SCHEDSTATS | 1447 | #ifdef CONFIG_SCHEDSTATS |
1448 | if (p->se.wait_start) | 1448 | if (p->se.wait_start) |
1449 | p->se.wait_start -= clock_offset; | 1449 | p->se.wait_start -= clock_offset; |
1450 | if (p->se.sleep_start) | 1450 | if (p->se.sleep_start) |
1451 | p->se.sleep_start -= clock_offset; | 1451 | p->se.sleep_start -= clock_offset; |
1452 | if (p->se.block_start) | 1452 | if (p->se.block_start) |
1453 | p->se.block_start -= clock_offset; | 1453 | p->se.block_start -= clock_offset; |
1454 | if (old_cpu != new_cpu) { | 1454 | if (old_cpu != new_cpu) { |
1455 | schedstat_inc(p, se.nr_migrations); | 1455 | schedstat_inc(p, se.nr_migrations); |
1456 | if (task_hot(p, old_rq->clock, NULL)) | 1456 | if (task_hot(p, old_rq->clock, NULL)) |
1457 | schedstat_inc(p, se.nr_forced2_migrations); | 1457 | schedstat_inc(p, se.nr_forced2_migrations); |
1458 | } | 1458 | } |
1459 | #endif | 1459 | #endif |
1460 | p->se.vruntime -= old_cfsrq->min_vruntime - | 1460 | p->se.vruntime -= old_cfsrq->min_vruntime - |
1461 | new_cfsrq->min_vruntime; | 1461 | new_cfsrq->min_vruntime; |
1462 | 1462 | ||
1463 | __set_task_cpu(p, new_cpu); | 1463 | __set_task_cpu(p, new_cpu); |
1464 | } | 1464 | } |
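
Worked example of the clock_offset rebasing, with made-up numbers: if old_rq->clock is 1,000,000 ns and new_rq->clock is 400,000 ns, then clock_offset is 600,000 ns, and a wait_start stamped at 950,000 ns becomes 350,000 ns, i.e. still 50,000 ns in the past when measured against the destination runqueue's clock. The vruntime line performs the analogous rebasing against the destination cfs_rq's min_vruntime.
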
1465 | 1465 | ||
1466 | struct migration_req { | 1466 | struct migration_req { |
1467 | struct list_head list; | 1467 | struct list_head list; |
1468 | 1468 | ||
1469 | struct task_struct *task; | 1469 | struct task_struct *task; |
1470 | int dest_cpu; | 1470 | int dest_cpu; |
1471 | 1471 | ||
1472 | struct completion done; | 1472 | struct completion done; |
1473 | }; | 1473 | }; |
1474 | 1474 | ||
1475 | /* | 1475 | /* |
1476 | * The task's runqueue lock must be held. | 1476 | * The task's runqueue lock must be held. |
1477 | * Returns true if you have to wait for migration thread. | 1477 | * Returns true if you have to wait for migration thread. |
1478 | */ | 1478 | */ |
1479 | static int | 1479 | static int |
1480 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | 1480 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) |
1481 | { | 1481 | { |
1482 | struct rq *rq = task_rq(p); | 1482 | struct rq *rq = task_rq(p); |
1483 | 1483 | ||
1484 | /* | 1484 | /* |
1485 | * If the task is not on a runqueue (and not running), then | 1485 | * If the task is not on a runqueue (and not running), then |
1486 | * it is sufficient to simply update the task's cpu field. | 1486 | * it is sufficient to simply update the task's cpu field. |
1487 | */ | 1487 | */ |
1488 | if (!p->se.on_rq && !task_running(rq, p)) { | 1488 | if (!p->se.on_rq && !task_running(rq, p)) { |
1489 | set_task_cpu(p, dest_cpu); | 1489 | set_task_cpu(p, dest_cpu); |
1490 | return 0; | 1490 | return 0; |
1491 | } | 1491 | } |
1492 | 1492 | ||
1493 | init_completion(&req->done); | 1493 | init_completion(&req->done); |
1494 | req->task = p; | 1494 | req->task = p; |
1495 | req->dest_cpu = dest_cpu; | 1495 | req->dest_cpu = dest_cpu; |
1496 | list_add(&req->list, &rq->migration_queue); | 1496 | list_add(&req->list, &rq->migration_queue); |
1497 | 1497 | ||
1498 | return 1; | 1498 | return 1; |
1499 | } | 1499 | } |
1500 | 1500 | ||
1501 | /* | 1501 | /* |
1502 | * wait_task_inactive - wait for a thread to unschedule. | 1502 | * wait_task_inactive - wait for a thread to unschedule. |
1503 | * | 1503 | * |
1504 | * The caller must ensure that the task *will* unschedule sometime soon, | 1504 | * The caller must ensure that the task *will* unschedule sometime soon, |
1505 | * else this function might spin for a *long* time. This function can't | 1505 | * else this function might spin for a *long* time. This function can't |
1506 | * be called with interrupts off, or it may introduce deadlock with | 1506 | * be called with interrupts off, or it may introduce deadlock with |
1507 | * smp_call_function() if an IPI is sent by the same process we are | 1507 | * smp_call_function() if an IPI is sent by the same process we are |
1508 | * waiting to become inactive. | 1508 | * waiting to become inactive. |
1509 | */ | 1509 | */ |
1510 | void wait_task_inactive(struct task_struct *p) | 1510 | void wait_task_inactive(struct task_struct *p) |
1511 | { | 1511 | { |
1512 | unsigned long flags; | 1512 | unsigned long flags; |
1513 | int running, on_rq; | 1513 | int running, on_rq; |
1514 | struct rq *rq; | 1514 | struct rq *rq; |
1515 | 1515 | ||
1516 | for (;;) { | 1516 | for (;;) { |
1517 | /* | 1517 | /* |
1518 | * We do the initial early heuristics without holding | 1518 | * We do the initial early heuristics without holding |
1519 | * any task-queue locks at all. We'll only try to get | 1519 | * any task-queue locks at all. We'll only try to get |
1520 | * the runqueue lock when things look like they will | 1520 | * the runqueue lock when things look like they will |
1521 | * work out! | 1521 | * work out! |
1522 | */ | 1522 | */ |
1523 | rq = task_rq(p); | 1523 | rq = task_rq(p); |
1524 | 1524 | ||
1525 | /* | 1525 | /* |
1526 | * If the task is actively running on another CPU | 1526 | * If the task is actively running on another CPU |
1527 | * still, just relax and busy-wait without holding | 1527 | * still, just relax and busy-wait without holding |
1528 | * any locks. | 1528 | * any locks. |
1529 | * | 1529 | * |
1530 | * NOTE! Since we don't hold any locks, it's not | 1530 | * NOTE! Since we don't hold any locks, it's not |
1531 | * even sure that "rq" stays as the right runqueue! | 1531 | * even sure that "rq" stays as the right runqueue! |
1532 | * But we don't care, since "task_running()" will | 1532 | * But we don't care, since "task_running()" will |
1533 | * return false if the runqueue has changed and p | 1533 | * return false if the runqueue has changed and p |
1534 | * is actually now running somewhere else! | 1534 | * is actually now running somewhere else! |
1535 | */ | 1535 | */ |
1536 | while (task_running(rq, p)) | 1536 | while (task_running(rq, p)) |
1537 | cpu_relax(); | 1537 | cpu_relax(); |
1538 | 1538 | ||
1539 | /* | 1539 | /* |
1540 | * Ok, time to look more closely! We need the rq | 1540 | * Ok, time to look more closely! We need the rq |
1541 | * lock now, to be *sure*. If we're wrong, we'll | 1541 | * lock now, to be *sure*. If we're wrong, we'll |
1542 | * just go back and repeat. | 1542 | * just go back and repeat. |
1543 | */ | 1543 | */ |
1544 | rq = task_rq_lock(p, &flags); | 1544 | rq = task_rq_lock(p, &flags); |
1545 | running = task_running(rq, p); | 1545 | running = task_running(rq, p); |
1546 | on_rq = p->se.on_rq; | 1546 | on_rq = p->se.on_rq; |
1547 | task_rq_unlock(rq, &flags); | 1547 | task_rq_unlock(rq, &flags); |
1548 | 1548 | ||
1549 | /* | 1549 | /* |
1550 | * Was it really running after all now that we | 1550 | * Was it really running after all now that we |
1551 | * checked with the proper locks actually held? | 1551 | * checked with the proper locks actually held? |
1552 | * | 1552 | * |
1553 | * Oops. Go back and try again.. | 1553 | * Oops. Go back and try again.. |
1554 | */ | 1554 | */ |
1555 | if (unlikely(running)) { | 1555 | if (unlikely(running)) { |
1556 | cpu_relax(); | 1556 | cpu_relax(); |
1557 | continue; | 1557 | continue; |
1558 | } | 1558 | } |
1559 | 1559 | ||
1560 | /* | 1560 | /* |
1561 | * It's not enough that it's not actively running, | 1561 | * It's not enough that it's not actively running, |
1562 | * it must be off the runqueue _entirely_, and not | 1562 | * it must be off the runqueue _entirely_, and not |
1563 | * preempted! | 1563 | * preempted! |
1564 | * | 1564 | * |
1565 | * So if it was still runnable (but just not actively | 1565 | * So if it was still runnable (but just not actively |
1566 | * running right now), it's preempted, and we should | 1566 | * running right now), it's preempted, and we should |
1567 | * yield - it could be a while. | 1567 | * yield - it could be a while. |
1568 | */ | 1568 | */ |
1569 | if (unlikely(on_rq)) { | 1569 | if (unlikely(on_rq)) { |
1570 | schedule_timeout_uninterruptible(1); | 1570 | schedule_timeout_uninterruptible(1); |
1571 | continue; | 1571 | continue; |
1572 | } | 1572 | } |
1573 | 1573 | ||
1574 | /* | 1574 | /* |
1575 | * Ahh, all good. It wasn't running, and it wasn't | 1575 | * Ahh, all good. It wasn't running, and it wasn't |
1576 | * runnable, which means that it will never become | 1576 | * runnable, which means that it will never become |
1577 | * running in the future either. We're all done! | 1577 | * running in the future either. We're all done! |
1578 | */ | 1578 | */ |
1579 | break; | 1579 | break; |
1580 | } | 1580 | } |
1581 | } | 1581 | } |
1582 | 1582 | ||
1583 | /*** | 1583 | /*** |
1584 | * kick_process - kick a running thread to enter/exit the kernel | 1584 | * kick_process - kick a running thread to enter/exit the kernel |
1585 | * @p: the to-be-kicked thread | 1585 | * @p: the to-be-kicked thread |
1586 | * | 1586 | * |
1587 | * Cause a process which is running on another CPU to enter | 1587 | * Cause a process which is running on another CPU to enter |
1588 | * kernel-mode, without any delay. (to get signals handled.) | 1588 | * kernel-mode, without any delay. (to get signals handled.) |
1589 | * | 1589 | * |
1590 | * NOTE: this function doesn't have to take the runqueue lock, | 1590 | * NOTE: this function doesn't have to take the runqueue lock, |
1591 | * because all it wants to ensure is that the remote task enters | 1591 | * because all it wants to ensure is that the remote task enters |
1592 | * the kernel. If the IPI races and the task has been migrated | 1592 | * the kernel. If the IPI races and the task has been migrated |
1593 | * to another CPU then no harm is done and the purpose has been | 1593 | * to another CPU then no harm is done and the purpose has been |
1594 | * achieved as well. | 1594 | * achieved as well. |
1595 | */ | 1595 | */ |
1596 | void kick_process(struct task_struct *p) | 1596 | void kick_process(struct task_struct *p) |
1597 | { | 1597 | { |
1598 | int cpu; | 1598 | int cpu; |
1599 | 1599 | ||
1600 | preempt_disable(); | 1600 | preempt_disable(); |
1601 | cpu = task_cpu(p); | 1601 | cpu = task_cpu(p); |
1602 | if ((cpu != smp_processor_id()) && task_curr(p)) | 1602 | if ((cpu != smp_processor_id()) && task_curr(p)) |
1603 | smp_send_reschedule(cpu); | 1603 | smp_send_reschedule(cpu); |
1604 | preempt_enable(); | 1604 | preempt_enable(); |
1605 | } | 1605 | } |
1606 | 1606 | ||
1607 | /* | 1607 | /* |
1608 | * Return a low guess at the load of a migration-source cpu weighted | 1608 | * Return a low guess at the load of a migration-source cpu weighted |
1609 | * according to the scheduling class and "nice" value. | 1609 | * according to the scheduling class and "nice" value. |
1610 | * | 1610 | * |
1611 | * We want to under-estimate the load of migration sources, to | 1611 | * We want to under-estimate the load of migration sources, to |
1612 | * balance conservatively. | 1612 | * balance conservatively. |
1613 | */ | 1613 | */ |
1614 | static unsigned long source_load(int cpu, int type) | 1614 | static unsigned long source_load(int cpu, int type) |
1615 | { | 1615 | { |
1616 | struct rq *rq = cpu_rq(cpu); | 1616 | struct rq *rq = cpu_rq(cpu); |
1617 | unsigned long total = weighted_cpuload(cpu); | 1617 | unsigned long total = weighted_cpuload(cpu); |
1618 | 1618 | ||
1619 | if (type == 0) | 1619 | if (type == 0) |
1620 | return total; | 1620 | return total; |
1621 | 1621 | ||
1622 | return min(rq->cpu_load[type-1], total); | 1622 | return min(rq->cpu_load[type-1], total); |
1623 | } | 1623 | } |
1624 | 1624 | ||
1625 | /* | 1625 | /* |
1626 | * Return a high guess at the load of a migration-target cpu weighted | 1626 | * Return a high guess at the load of a migration-target cpu weighted |
1627 | * according to the scheduling class and "nice" value. | 1627 | * according to the scheduling class and "nice" value. |
1628 | */ | 1628 | */ |
1629 | static unsigned long target_load(int cpu, int type) | 1629 | static unsigned long target_load(int cpu, int type) |
1630 | { | 1630 | { |
1631 | struct rq *rq = cpu_rq(cpu); | 1631 | struct rq *rq = cpu_rq(cpu); |
1632 | unsigned long total = weighted_cpuload(cpu); | 1632 | unsigned long total = weighted_cpuload(cpu); |
1633 | 1633 | ||
1634 | if (type == 0) | 1634 | if (type == 0) |
1635 | return total; | 1635 | return total; |
1636 | 1636 | ||
1637 | return max(rq->cpu_load[type-1], total); | 1637 | return max(rq->cpu_load[type-1], total); |
1638 | } | 1638 | } |
1639 | 1639 | ||
1640 | /* | 1640 | /* |
1641 | * Return the average load per task on the cpu's run queue | 1641 | * Return the average load per task on the cpu's run queue |
1642 | */ | 1642 | */ |
1643 | static unsigned long cpu_avg_load_per_task(int cpu) | 1643 | static unsigned long cpu_avg_load_per_task(int cpu) |
1644 | { | 1644 | { |
1645 | struct rq *rq = cpu_rq(cpu); | 1645 | struct rq *rq = cpu_rq(cpu); |
1646 | unsigned long total = weighted_cpuload(cpu); | 1646 | unsigned long total = weighted_cpuload(cpu); |
1647 | unsigned long n = rq->nr_running; | 1647 | unsigned long n = rq->nr_running; |
1648 | 1648 | ||
1649 | return n ? total / n : SCHED_LOAD_SCALE; | 1649 | return n ? total / n : SCHED_LOAD_SCALE; |
1650 | } | 1650 | } |
1651 | 1651 | ||
1652 | /* | 1652 | /* |
1653 | * find_idlest_group finds and returns the least busy CPU group within the | 1653 | * find_idlest_group finds and returns the least busy CPU group within the |
1654 | * domain. | 1654 | * domain. |
1655 | */ | 1655 | */ |
1656 | static struct sched_group * | 1656 | static struct sched_group * |
1657 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | 1657 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) |
1658 | { | 1658 | { |
1659 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | 1659 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; |
1660 | unsigned long min_load = ULONG_MAX, this_load = 0; | 1660 | unsigned long min_load = ULONG_MAX, this_load = 0; |
1661 | int load_idx = sd->forkexec_idx; | 1661 | int load_idx = sd->forkexec_idx; |
1662 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 1662 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
1663 | 1663 | ||
1664 | do { | 1664 | do { |
1665 | unsigned long load, avg_load; | 1665 | unsigned long load, avg_load; |
1666 | int local_group; | 1666 | int local_group; |
1667 | int i; | 1667 | int i; |
1668 | 1668 | ||
1669 | /* Skip over this group if it has no CPUs allowed */ | 1669 | /* Skip over this group if it has no CPUs allowed */ |
1670 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) | 1670 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) |
1671 | continue; | 1671 | continue; |
1672 | 1672 | ||
1673 | local_group = cpu_isset(this_cpu, group->cpumask); | 1673 | local_group = cpu_isset(this_cpu, group->cpumask); |
1674 | 1674 | ||
1675 | /* Tally up the load of all CPUs in the group */ | 1675 | /* Tally up the load of all CPUs in the group */ |
1676 | avg_load = 0; | 1676 | avg_load = 0; |
1677 | 1677 | ||
1678 | for_each_cpu_mask(i, group->cpumask) { | 1678 | for_each_cpu_mask(i, group->cpumask) { |
1679 | /* Bias balancing toward cpus of our domain */ | 1679 | /* Bias balancing toward cpus of our domain */ |
1680 | if (local_group) | 1680 | if (local_group) |
1681 | load = source_load(i, load_idx); | 1681 | load = source_load(i, load_idx); |
1682 | else | 1682 | else |
1683 | load = target_load(i, load_idx); | 1683 | load = target_load(i, load_idx); |
1684 | 1684 | ||
1685 | avg_load += load; | 1685 | avg_load += load; |
1686 | } | 1686 | } |
1687 | 1687 | ||
1688 | /* Adjust by relative CPU power of the group */ | 1688 | /* Adjust by relative CPU power of the group */ |
1689 | avg_load = sg_div_cpu_power(group, | 1689 | avg_load = sg_div_cpu_power(group, |
1690 | avg_load * SCHED_LOAD_SCALE); | 1690 | avg_load * SCHED_LOAD_SCALE); |
1691 | 1691 | ||
1692 | if (local_group) { | 1692 | if (local_group) { |
1693 | this_load = avg_load; | 1693 | this_load = avg_load; |
1694 | this = group; | 1694 | this = group; |
1695 | } else if (avg_load < min_load) { | 1695 | } else if (avg_load < min_load) { |
1696 | min_load = avg_load; | 1696 | min_load = avg_load; |
1697 | idlest = group; | 1697 | idlest = group; |
1698 | } | 1698 | } |
1699 | } while (group = group->next, group != sd->groups); | 1699 | } while (group = group->next, group != sd->groups); |
1700 | 1700 | ||
1701 | if (!idlest || 100*this_load < imbalance*min_load) | 1701 | if (!idlest || 100*this_load < imbalance*min_load) |
1702 | return NULL; | 1702 | return NULL; |
1703 | return idlest; | 1703 | return idlest; |
1704 | } | 1704 | } |
1705 | 1705 | ||
1706 | /* | 1706 | /* |
1707 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | 1707 | * find_idlest_cpu - find the idlest cpu among the cpus in group. |
1708 | */ | 1708 | */ |
1709 | static int | 1709 | static int |
1710 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 1710 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
1711 | { | 1711 | { |
1712 | cpumask_t tmp; | 1712 | cpumask_t tmp; |
1713 | unsigned long load, min_load = ULONG_MAX; | 1713 | unsigned long load, min_load = ULONG_MAX; |
1714 | int idlest = -1; | 1714 | int idlest = -1; |
1715 | int i; | 1715 | int i; |
1716 | 1716 | ||
1717 | /* Traverse only the allowed CPUs */ | 1717 | /* Traverse only the allowed CPUs */ |
1718 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1718 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
1719 | 1719 | ||
1720 | for_each_cpu_mask(i, tmp) { | 1720 | for_each_cpu_mask(i, tmp) { |
1721 | load = weighted_cpuload(i); | 1721 | load = weighted_cpuload(i); |
1722 | 1722 | ||
1723 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1723 | if (load < min_load || (load == min_load && i == this_cpu)) { |
1724 | min_load = load; | 1724 | min_load = load; |
1725 | idlest = i; | 1725 | idlest = i; |
1726 | } | 1726 | } |
1727 | } | 1727 | } |
1728 | 1728 | ||
1729 | return idlest; | 1729 | return idlest; |
1730 | } | 1730 | } |
1731 | 1731 | ||
1732 | /* | 1732 | /* |
1733 | * sched_balance_self: balance the current task (running on cpu) in domains | 1733 | * sched_balance_self: balance the current task (running on cpu) in domains |
1734 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | 1734 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and |
1735 | * SD_BALANCE_EXEC. | 1735 | * SD_BALANCE_EXEC. |
1736 | * | 1736 | * |
1737 | * Balance, ie. select the least loaded group. | 1737 | * Balance, ie. select the least loaded group. |
1738 | * | 1738 | * |
1739 | * Returns the target CPU number, or the same CPU if no balancing is needed. | 1739 | * Returns the target CPU number, or the same CPU if no balancing is needed. |
1740 | * | 1740 | * |
1741 | * preempt must be disabled. | 1741 | * preempt must be disabled. |
1742 | */ | 1742 | */ |
1743 | static int sched_balance_self(int cpu, int flag) | 1743 | static int sched_balance_self(int cpu, int flag) |
1744 | { | 1744 | { |
1745 | struct task_struct *t = current; | 1745 | struct task_struct *t = current; |
1746 | struct sched_domain *tmp, *sd = NULL; | 1746 | struct sched_domain *tmp, *sd = NULL; |
1747 | 1747 | ||
1748 | for_each_domain(cpu, tmp) { | 1748 | for_each_domain(cpu, tmp) { |
1749 | /* | 1749 | /* |
1750 | * If power savings logic is enabled for a domain, stop there. | 1750 | * If power savings logic is enabled for a domain, stop there. |
1751 | */ | 1751 | */ |
1752 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | 1752 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) |
1753 | break; | 1753 | break; |
1754 | if (tmp->flags & flag) | 1754 | if (tmp->flags & flag) |
1755 | sd = tmp; | 1755 | sd = tmp; |
1756 | } | 1756 | } |
1757 | 1757 | ||
1758 | while (sd) { | 1758 | while (sd) { |
1759 | cpumask_t span; | 1759 | cpumask_t span; |
1760 | struct sched_group *group; | 1760 | struct sched_group *group; |
1761 | int new_cpu, weight; | 1761 | int new_cpu, weight; |
1762 | 1762 | ||
1763 | if (!(sd->flags & flag)) { | 1763 | if (!(sd->flags & flag)) { |
1764 | sd = sd->child; | 1764 | sd = sd->child; |
1765 | continue; | 1765 | continue; |
1766 | } | 1766 | } |
1767 | 1767 | ||
1768 | span = sd->span; | 1768 | span = sd->span; |
1769 | group = find_idlest_group(sd, t, cpu); | 1769 | group = find_idlest_group(sd, t, cpu); |
1770 | if (!group) { | 1770 | if (!group) { |
1771 | sd = sd->child; | 1771 | sd = sd->child; |
1772 | continue; | 1772 | continue; |
1773 | } | 1773 | } |
1774 | 1774 | ||
1775 | new_cpu = find_idlest_cpu(group, t, cpu); | 1775 | new_cpu = find_idlest_cpu(group, t, cpu); |
1776 | if (new_cpu == -1 || new_cpu == cpu) { | 1776 | if (new_cpu == -1 || new_cpu == cpu) { |
1777 | /* Now try balancing at a lower domain level of cpu */ | 1777 | /* Now try balancing at a lower domain level of cpu */ |
1778 | sd = sd->child; | 1778 | sd = sd->child; |
1779 | continue; | 1779 | continue; |
1780 | } | 1780 | } |
1781 | 1781 | ||
1782 | /* Now try balancing at a lower domain level of new_cpu */ | 1782 | /* Now try balancing at a lower domain level of new_cpu */ |
1783 | cpu = new_cpu; | 1783 | cpu = new_cpu; |
1784 | sd = NULL; | 1784 | sd = NULL; |
1785 | weight = cpus_weight(span); | 1785 | weight = cpus_weight(span); |
1786 | for_each_domain(cpu, tmp) { | 1786 | for_each_domain(cpu, tmp) { |
1787 | if (weight <= cpus_weight(tmp->span)) | 1787 | if (weight <= cpus_weight(tmp->span)) |
1788 | break; | 1788 | break; |
1789 | if (tmp->flags & flag) | 1789 | if (tmp->flags & flag) |
1790 | sd = tmp; | 1790 | sd = tmp; |
1791 | } | 1791 | } |
1792 | /* while loop will break here if sd == NULL */ | 1792 | /* while loop will break here if sd == NULL */ |
1793 | } | 1793 | } |
1794 | 1794 | ||
1795 | return cpu; | 1795 | return cpu; |
1796 | } | 1796 | } |
1797 | 1797 | ||
1798 | #endif /* CONFIG_SMP */ | 1798 | #endif /* CONFIG_SMP */ |
1799 | 1799 | ||
1800 | /*** | 1800 | /*** |
1801 | * try_to_wake_up - wake up a thread | 1801 | * try_to_wake_up - wake up a thread |
1802 | * @p: the to-be-woken-up thread | 1802 | * @p: the to-be-woken-up thread |
1803 | * @state: the mask of task states that can be woken | 1803 | * @state: the mask of task states that can be woken |
1804 | * @sync: do a synchronous wakeup? | 1804 | * @sync: do a synchronous wakeup? |
1805 | * | 1805 | * |
1806 | * Put it on the run-queue if it's not already there. The "current" | 1806 | * Put it on the run-queue if it's not already there. The "current" |
1807 | * thread is always on the run-queue (except when the actual | 1807 | * thread is always on the run-queue (except when the actual |
1808 | * re-schedule is in progress), and as such you're allowed to do | 1808 | * re-schedule is in progress), and as such you're allowed to do |
1809 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 1809 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
1810 | * runnable without the overhead of this. | 1810 | * runnable without the overhead of this. |
1811 | * | 1811 | * |
1812 | * returns failure only if the task is already active. | 1812 | * returns failure only if the task is already active. |
1813 | */ | 1813 | */ |
1814 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 1814 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
1815 | { | 1815 | { |
1816 | int cpu, orig_cpu, this_cpu, success = 0; | 1816 | int cpu, orig_cpu, this_cpu, success = 0; |
1817 | unsigned long flags; | 1817 | unsigned long flags; |
1818 | long old_state; | 1818 | long old_state; |
1819 | struct rq *rq; | 1819 | struct rq *rq; |
1820 | 1820 | ||
1821 | rq = task_rq_lock(p, &flags); | 1821 | rq = task_rq_lock(p, &flags); |
1822 | old_state = p->state; | 1822 | old_state = p->state; |
1823 | if (!(old_state & state)) | 1823 | if (!(old_state & state)) |
1824 | goto out; | 1824 | goto out; |
1825 | 1825 | ||
1826 | if (p->se.on_rq) | 1826 | if (p->se.on_rq) |
1827 | goto out_running; | 1827 | goto out_running; |
1828 | 1828 | ||
1829 | cpu = task_cpu(p); | 1829 | cpu = task_cpu(p); |
1830 | orig_cpu = cpu; | 1830 | orig_cpu = cpu; |
1831 | this_cpu = smp_processor_id(); | 1831 | this_cpu = smp_processor_id(); |
1832 | 1832 | ||
1833 | #ifdef CONFIG_SMP | 1833 | #ifdef CONFIG_SMP |
1834 | if (unlikely(task_running(rq, p))) | 1834 | if (unlikely(task_running(rq, p))) |
1835 | goto out_activate; | 1835 | goto out_activate; |
1836 | 1836 | ||
1837 | cpu = p->sched_class->select_task_rq(p, sync); | 1837 | cpu = p->sched_class->select_task_rq(p, sync); |
1838 | if (cpu != orig_cpu) { | 1838 | if (cpu != orig_cpu) { |
1839 | set_task_cpu(p, cpu); | 1839 | set_task_cpu(p, cpu); |
1840 | task_rq_unlock(rq, &flags); | 1840 | task_rq_unlock(rq, &flags); |
1841 | /* might preempt at this point */ | 1841 | /* might preempt at this point */ |
1842 | rq = task_rq_lock(p, &flags); | 1842 | rq = task_rq_lock(p, &flags); |
1843 | old_state = p->state; | 1843 | old_state = p->state; |
1844 | if (!(old_state & state)) | 1844 | if (!(old_state & state)) |
1845 | goto out; | 1845 | goto out; |
1846 | if (p->se.on_rq) | 1846 | if (p->se.on_rq) |
1847 | goto out_running; | 1847 | goto out_running; |
1848 | 1848 | ||
1849 | this_cpu = smp_processor_id(); | 1849 | this_cpu = smp_processor_id(); |
1850 | cpu = task_cpu(p); | 1850 | cpu = task_cpu(p); |
1851 | } | 1851 | } |
1852 | 1852 | ||
1853 | #ifdef CONFIG_SCHEDSTATS | 1853 | #ifdef CONFIG_SCHEDSTATS |
1854 | schedstat_inc(rq, ttwu_count); | 1854 | schedstat_inc(rq, ttwu_count); |
1855 | if (cpu == this_cpu) | 1855 | if (cpu == this_cpu) |
1856 | schedstat_inc(rq, ttwu_local); | 1856 | schedstat_inc(rq, ttwu_local); |
1857 | else { | 1857 | else { |
1858 | struct sched_domain *sd; | 1858 | struct sched_domain *sd; |
1859 | for_each_domain(this_cpu, sd) { | 1859 | for_each_domain(this_cpu, sd) { |
1860 | if (cpu_isset(cpu, sd->span)) { | 1860 | if (cpu_isset(cpu, sd->span)) { |
1861 | schedstat_inc(sd, ttwu_wake_remote); | 1861 | schedstat_inc(sd, ttwu_wake_remote); |
1862 | break; | 1862 | break; |
1863 | } | 1863 | } |
1864 | } | 1864 | } |
1865 | } | 1865 | } |
1866 | #endif | 1866 | #endif |
1867 | 1867 | ||
1868 | out_activate: | 1868 | out_activate: |
1869 | #endif /* CONFIG_SMP */ | 1869 | #endif /* CONFIG_SMP */ |
1870 | schedstat_inc(p, se.nr_wakeups); | 1870 | schedstat_inc(p, se.nr_wakeups); |
1871 | if (sync) | 1871 | if (sync) |
1872 | schedstat_inc(p, se.nr_wakeups_sync); | 1872 | schedstat_inc(p, se.nr_wakeups_sync); |
1873 | if (orig_cpu != cpu) | 1873 | if (orig_cpu != cpu) |
1874 | schedstat_inc(p, se.nr_wakeups_migrate); | 1874 | schedstat_inc(p, se.nr_wakeups_migrate); |
1875 | if (cpu == this_cpu) | 1875 | if (cpu == this_cpu) |
1876 | schedstat_inc(p, se.nr_wakeups_local); | 1876 | schedstat_inc(p, se.nr_wakeups_local); |
1877 | else | 1877 | else |
1878 | schedstat_inc(p, se.nr_wakeups_remote); | 1878 | schedstat_inc(p, se.nr_wakeups_remote); |
1879 | update_rq_clock(rq); | 1879 | update_rq_clock(rq); |
1880 | activate_task(rq, p, 1); | 1880 | activate_task(rq, p, 1); |
1881 | check_preempt_curr(rq, p); | 1881 | check_preempt_curr(rq, p); |
1882 | success = 1; | 1882 | success = 1; |
1883 | 1883 | ||
1884 | out_running: | 1884 | out_running: |
1885 | p->state = TASK_RUNNING; | 1885 | p->state = TASK_RUNNING; |
1886 | #ifdef CONFIG_SMP | 1886 | #ifdef CONFIG_SMP |
1887 | if (p->sched_class->task_wake_up) | 1887 | if (p->sched_class->task_wake_up) |
1888 | p->sched_class->task_wake_up(rq, p); | 1888 | p->sched_class->task_wake_up(rq, p); |
1889 | #endif | 1889 | #endif |
1890 | out: | 1890 | out: |
1891 | task_rq_unlock(rq, &flags); | 1891 | task_rq_unlock(rq, &flags); |
1892 | 1892 | ||
1893 | return success; | 1893 | return success; |
1894 | } | 1894 | } |
1895 | 1895 | ||
1896 | int fastcall wake_up_process(struct task_struct *p) | 1896 | int fastcall wake_up_process(struct task_struct *p) |
1897 | { | 1897 | { |
1898 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1898 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | |
1899 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | 1899 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); |
1900 | } | 1900 | } |
1901 | EXPORT_SYMBOL(wake_up_process); | 1901 | EXPORT_SYMBOL(wake_up_process); |
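Since wake_up_process() is the exported front end to try_to_wake_up(), a minimal caller sketch may be useful. It is hypothetical module code, not part of this patch; the kthread helpers are assumed from <linux/kthread.h>:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

/* A worker that sleeps until somebody wakes it with wake_up_process(). */
static int worker_fn(void *unused)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();			/* sleep until a wakeup arrives */
		/* ... handle whatever work was queued ... */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static struct task_struct *worker;

static void start_worker(void)
{
	worker = kthread_create(worker_fn, NULL, "worker");
	if (!IS_ERR(worker))
		wake_up_process(worker);	/* first enqueue of the thread */
}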
1902 | 1902 | ||
1903 | int fastcall wake_up_state(struct task_struct *p, unsigned int state) | 1903 | int fastcall wake_up_state(struct task_struct *p, unsigned int state) |
1904 | { | 1904 | { |
1905 | return try_to_wake_up(p, state, 0); | 1905 | return try_to_wake_up(p, state, 0); |
1906 | } | 1906 | } |
1907 | 1907 | ||
1908 | /* | 1908 | /* |
1909 | * Perform scheduler related setup for a newly forked process p. | 1909 | * Perform scheduler related setup for a newly forked process p. |
1910 | * p is forked by current. | 1910 | * p is forked by current. |
1911 | * | 1911 | * |
1912 | * __sched_fork() is basic setup used by init_idle() too: | 1912 | * __sched_fork() is basic setup used by init_idle() too: |
1913 | */ | 1913 | */ |
1914 | static void __sched_fork(struct task_struct *p) | 1914 | static void __sched_fork(struct task_struct *p) |
1915 | { | 1915 | { |
1916 | p->se.exec_start = 0; | 1916 | p->se.exec_start = 0; |
1917 | p->se.sum_exec_runtime = 0; | 1917 | p->se.sum_exec_runtime = 0; |
1918 | p->se.prev_sum_exec_runtime = 0; | 1918 | p->se.prev_sum_exec_runtime = 0; |
1919 | 1919 | ||
1920 | #ifdef CONFIG_SCHEDSTATS | 1920 | #ifdef CONFIG_SCHEDSTATS |
1921 | p->se.wait_start = 0; | 1921 | p->se.wait_start = 0; |
1922 | p->se.sum_sleep_runtime = 0; | 1922 | p->se.sum_sleep_runtime = 0; |
1923 | p->se.sleep_start = 0; | 1923 | p->se.sleep_start = 0; |
1924 | p->se.block_start = 0; | 1924 | p->se.block_start = 0; |
1925 | p->se.sleep_max = 0; | 1925 | p->se.sleep_max = 0; |
1926 | p->se.block_max = 0; | 1926 | p->se.block_max = 0; |
1927 | p->se.exec_max = 0; | 1927 | p->se.exec_max = 0; |
1928 | p->se.slice_max = 0; | 1928 | p->se.slice_max = 0; |
1929 | p->se.wait_max = 0; | 1929 | p->se.wait_max = 0; |
1930 | #endif | 1930 | #endif |
1931 | 1931 | ||
1932 | INIT_LIST_HEAD(&p->rt.run_list); | 1932 | INIT_LIST_HEAD(&p->rt.run_list); |
1933 | p->se.on_rq = 0; | 1933 | p->se.on_rq = 0; |
1934 | 1934 | ||
1935 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1935 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1936 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 1936 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
1937 | #endif | 1937 | #endif |
1938 | 1938 | ||
1939 | /* | 1939 | /* |
1940 | * We mark the process as running here, but have not actually | 1940 | * We mark the process as running here, but have not actually |
1941 | * inserted it onto the runqueue yet. This guarantees that | 1941 | * inserted it onto the runqueue yet. This guarantees that |
1942 | * nobody will actually run it, and a signal or other external | 1942 | * nobody will actually run it, and a signal or other external |
1943 | * event cannot wake it up and insert it on the runqueue either. | 1943 | * event cannot wake it up and insert it on the runqueue either. |
1944 | */ | 1944 | */ |
1945 | p->state = TASK_RUNNING; | 1945 | p->state = TASK_RUNNING; |
1946 | } | 1946 | } |
1947 | 1947 | ||
1948 | /* | 1948 | /* |
1949 | * fork()/clone()-time setup: | 1949 | * fork()/clone()-time setup: |
1950 | */ | 1950 | */ |
1951 | void sched_fork(struct task_struct *p, int clone_flags) | 1951 | void sched_fork(struct task_struct *p, int clone_flags) |
1952 | { | 1952 | { |
1953 | int cpu = get_cpu(); | 1953 | int cpu = get_cpu(); |
1954 | 1954 | ||
1955 | __sched_fork(p); | 1955 | __sched_fork(p); |
1956 | 1956 | ||
1957 | #ifdef CONFIG_SMP | 1957 | #ifdef CONFIG_SMP |
1958 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1958 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); |
1959 | #endif | 1959 | #endif |
1960 | set_task_cpu(p, cpu); | 1960 | set_task_cpu(p, cpu); |
1961 | 1961 | ||
1962 | /* | 1962 | /* |
1963 | * Make sure we do not leak PI boosting priority to the child: | 1963 | * Make sure we do not leak PI boosting priority to the child: |
1964 | */ | 1964 | */ |
1965 | p->prio = current->normal_prio; | 1965 | p->prio = current->normal_prio; |
1966 | if (!rt_prio(p->prio)) | 1966 | if (!rt_prio(p->prio)) |
1967 | p->sched_class = &fair_sched_class; | 1967 | p->sched_class = &fair_sched_class; |
1968 | 1968 | ||
1969 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1969 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1970 | if (likely(sched_info_on())) | 1970 | if (likely(sched_info_on())) |
1971 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1971 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1972 | #endif | 1972 | #endif |
1973 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1973 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
1974 | p->oncpu = 0; | 1974 | p->oncpu = 0; |
1975 | #endif | 1975 | #endif |
1976 | #ifdef CONFIG_PREEMPT | 1976 | #ifdef CONFIG_PREEMPT |
1977 | /* Want to start with kernel preemption disabled. */ | 1977 | /* Want to start with kernel preemption disabled. */ |
1978 | task_thread_info(p)->preempt_count = 1; | 1978 | task_thread_info(p)->preempt_count = 1; |
1979 | #endif | 1979 | #endif |
1980 | put_cpu(); | 1980 | put_cpu(); |
1981 | } | 1981 | } |
1982 | 1982 | ||
1983 | /* | 1983 | /* |
1984 | * wake_up_new_task - wake up a newly created task for the first time. | 1984 | * wake_up_new_task - wake up a newly created task for the first time. |
1985 | * | 1985 | * |
1986 | * This function will do some initial scheduler statistics housekeeping | 1986 | * This function will do some initial scheduler statistics housekeeping |
1987 | * that must be done for every newly created context, then puts the task | 1987 | * that must be done for every newly created context, then puts the task |
1988 | * on the runqueue and wakes it. | 1988 | * on the runqueue and wakes it. |
1989 | */ | 1989 | */ |
1990 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 1990 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
1991 | { | 1991 | { |
1992 | unsigned long flags; | 1992 | unsigned long flags; |
1993 | struct rq *rq; | 1993 | struct rq *rq; |
1994 | 1994 | ||
1995 | rq = task_rq_lock(p, &flags); | 1995 | rq = task_rq_lock(p, &flags); |
1996 | BUG_ON(p->state != TASK_RUNNING); | 1996 | BUG_ON(p->state != TASK_RUNNING); |
1997 | update_rq_clock(rq); | 1997 | update_rq_clock(rq); |
1998 | 1998 | ||
1999 | p->prio = effective_prio(p); | 1999 | p->prio = effective_prio(p); |
2000 | 2000 | ||
2001 | if (!p->sched_class->task_new || !current->se.on_rq) { | 2001 | if (!p->sched_class->task_new || !current->se.on_rq) { |
2002 | activate_task(rq, p, 0); | 2002 | activate_task(rq, p, 0); |
2003 | } else { | 2003 | } else { |
2004 | /* | 2004 | /* |
2005 | * Let the scheduling class do new task startup | 2005 | * Let the scheduling class do new task startup |
2006 | * management (if any): | 2006 | * management (if any): |
2007 | */ | 2007 | */ |
2008 | p->sched_class->task_new(rq, p); | 2008 | p->sched_class->task_new(rq, p); |
2009 | inc_nr_running(p, rq); | 2009 | inc_nr_running(p, rq); |
2010 | } | 2010 | } |
2011 | check_preempt_curr(rq, p); | 2011 | check_preempt_curr(rq, p); |
2012 | #ifdef CONFIG_SMP | 2012 | #ifdef CONFIG_SMP |
2013 | if (p->sched_class->task_wake_up) | 2013 | if (p->sched_class->task_wake_up) |
2014 | p->sched_class->task_wake_up(rq, p); | 2014 | p->sched_class->task_wake_up(rq, p); |
2015 | #endif | 2015 | #endif |
2016 | task_rq_unlock(rq, &flags); | 2016 | task_rq_unlock(rq, &flags); |
2017 | } | 2017 | } |
2018 | 2018 | ||
2019 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2019 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2020 | 2020 | ||
2021 | /** | 2021 | /** |
2022 | * preempt_notifier_register - tell me when current is being preempted & rescheduled | 2022 | * preempt_notifier_register - tell me when current is being preempted & rescheduled |
2023 | * @notifier: notifier struct to register | 2023 | * @notifier: notifier struct to register |
2024 | */ | 2024 | */ |
2025 | void preempt_notifier_register(struct preempt_notifier *notifier) | 2025 | void preempt_notifier_register(struct preempt_notifier *notifier) |
2026 | { | 2026 | { |
2027 | hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); | 2027 | hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); |
2028 | } | 2028 | } |
2029 | EXPORT_SYMBOL_GPL(preempt_notifier_register); | 2029 | EXPORT_SYMBOL_GPL(preempt_notifier_register); |
2030 | 2030 | ||
2031 | /** | 2031 | /** |
2032 | * preempt_notifier_unregister - no longer interested in preemption notifications | 2032 | * preempt_notifier_unregister - no longer interested in preemption notifications |
2033 | * @notifier: notifier struct to unregister | 2033 | * @notifier: notifier struct to unregister |
2034 | * | 2034 | * |
2035 | * This is safe to call from within a preemption notifier. | 2035 | * This is safe to call from within a preemption notifier. |
2036 | */ | 2036 | */ |
2037 | void preempt_notifier_unregister(struct preempt_notifier *notifier) | 2037 | void preempt_notifier_unregister(struct preempt_notifier *notifier) |
2038 | { | 2038 | { |
2039 | hlist_del(¬ifier->link); | 2039 | hlist_del(¬ifier->link); |
2040 | } | 2040 | } |
2041 | EXPORT_SYMBOL_GPL(preempt_notifier_unregister); | 2041 | EXPORT_SYMBOL_GPL(preempt_notifier_unregister); |
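For context, a hedged usage sketch of this notifier API. The struct preempt_ops layout and the preempt_notifier_init() helper are assumed from <linux/preempt.h> and are not shown in this diff; the callback signatures follow from the fire_sched_*_preempt_notifiers() helpers below:

#include <linux/preempt.h>
#include <linux/sched.h>

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
	/* current is about to run again, on 'cpu' */
}

static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
	/* current is being switched out in favour of 'next' */
}

static struct preempt_ops my_preempt_ops = {
	.sched_in	= my_sched_in,
	.sched_out	= my_sched_out,
};

static struct preempt_notifier my_notifier;

static void my_track_current(void)
{
	preempt_notifier_init(&my_notifier, &my_preempt_ops);
	preempt_notifier_register(&my_notifier);	/* applies to current only */
	/* ... later, from the same task:
	 * preempt_notifier_unregister(&my_notifier);
	 */
}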
2042 | 2042 | ||
2043 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2043 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
2044 | { | 2044 | { |
2045 | struct preempt_notifier *notifier; | 2045 | struct preempt_notifier *notifier; |
2046 | struct hlist_node *node; | 2046 | struct hlist_node *node; |
2047 | 2047 | ||
2048 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | 2048 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) |
2049 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); | 2049 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); |
2050 | } | 2050 | } |
2051 | 2051 | ||
2052 | static void | 2052 | static void |
2053 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | 2053 | fire_sched_out_preempt_notifiers(struct task_struct *curr, |
2054 | struct task_struct *next) | 2054 | struct task_struct *next) |
2055 | { | 2055 | { |
2056 | struct preempt_notifier *notifier; | 2056 | struct preempt_notifier *notifier; |
2057 | struct hlist_node *node; | 2057 | struct hlist_node *node; |
2058 | 2058 | ||
2059 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | 2059 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) |
2060 | notifier->ops->sched_out(notifier, next); | 2060 | notifier->ops->sched_out(notifier, next); |
2061 | } | 2061 | } |
2062 | 2062 | ||
2063 | #else | 2063 | #else |
2064 | 2064 | ||
2065 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2065 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
2066 | { | 2066 | { |
2067 | } | 2067 | } |
2068 | 2068 | ||
2069 | static void | 2069 | static void |
2070 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | 2070 | fire_sched_out_preempt_notifiers(struct task_struct *curr, |
2071 | struct task_struct *next) | 2071 | struct task_struct *next) |
2072 | { | 2072 | { |
2073 | } | 2073 | } |
2074 | 2074 | ||
2075 | #endif | 2075 | #endif |
2076 | 2076 | ||
2077 | /** | 2077 | /** |
2078 | * prepare_task_switch - prepare to switch tasks | 2078 | * prepare_task_switch - prepare to switch tasks |
2079 | * @rq: the runqueue preparing to switch | 2079 | * @rq: the runqueue preparing to switch |
2080 | * @prev: the current task that is being switched out | 2080 | * @prev: the current task that is being switched out |
2081 | * @next: the task we are going to switch to. | 2081 | * @next: the task we are going to switch to. |
2082 | * | 2082 | * |
2083 | * This is called with the rq lock held and interrupts off. It must | 2083 | * This is called with the rq lock held and interrupts off. It must |
2084 | * be paired with a subsequent finish_task_switch after the context | 2084 | * be paired with a subsequent finish_task_switch after the context |
2085 | * switch. | 2085 | * switch. |
2086 | * | 2086 | * |
2087 | * prepare_task_switch sets up locking and calls architecture specific | 2087 | * prepare_task_switch sets up locking and calls architecture specific |
2088 | * hooks. | 2088 | * hooks. |
2089 | */ | 2089 | */ |
2090 | static inline void | 2090 | static inline void |
2091 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 2091 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
2092 | struct task_struct *next) | 2092 | struct task_struct *next) |
2093 | { | 2093 | { |
2094 | fire_sched_out_preempt_notifiers(prev, next); | 2094 | fire_sched_out_preempt_notifiers(prev, next); |
2095 | prepare_lock_switch(rq, next); | 2095 | prepare_lock_switch(rq, next); |
2096 | prepare_arch_switch(next); | 2096 | prepare_arch_switch(next); |
2097 | } | 2097 | } |
2098 | 2098 | ||
2099 | /** | 2099 | /** |
2100 | * finish_task_switch - clean up after a task-switch | 2100 | * finish_task_switch - clean up after a task-switch |
2101 | * @rq: runqueue associated with task-switch | 2101 | * @rq: runqueue associated with task-switch |
2102 | * @prev: the thread we just switched away from. | 2102 | * @prev: the thread we just switched away from. |
2103 | * | 2103 | * |
2104 | * finish_task_switch must be called after the context switch, paired | 2104 | * finish_task_switch must be called after the context switch, paired |
2105 | * with a prepare_task_switch call before the context switch. | 2105 | * with a prepare_task_switch call before the context switch. |
2106 | * finish_task_switch will reconcile locking set up by prepare_task_switch, | 2106 | * finish_task_switch will reconcile locking set up by prepare_task_switch, |
2107 | * and do any other architecture-specific cleanup actions. | 2107 | * and do any other architecture-specific cleanup actions. |
2108 | * | 2108 | * |
2109 | * Note that we may have delayed dropping an mm in context_switch(). If | 2109 | * Note that we may have delayed dropping an mm in context_switch(). If |
2110 | * so, we finish that here outside of the runqueue lock. (Doing it | 2110 | * so, we finish that here outside of the runqueue lock. (Doing it |
2111 | * with the lock held can cause deadlocks; see schedule() for | 2111 | * with the lock held can cause deadlocks; see schedule() for |
2112 | * details.) | 2112 | * details.) |
2113 | */ | 2113 | */ |
2114 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) | 2114 | static void finish_task_switch(struct rq *rq, struct task_struct *prev) |
2115 | __releases(rq->lock) | 2115 | __releases(rq->lock) |
2116 | { | 2116 | { |
2117 | struct mm_struct *mm = rq->prev_mm; | 2117 | struct mm_struct *mm = rq->prev_mm; |
2118 | long prev_state; | 2118 | long prev_state; |
2119 | 2119 | ||
2120 | rq->prev_mm = NULL; | 2120 | rq->prev_mm = NULL; |
2121 | 2121 | ||
2122 | /* | 2122 | /* |
2123 | * A task struct has one reference for the use as "current". | 2123 | * A task struct has one reference for the use as "current". |
2124 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls | 2124 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls |
2125 | * schedule one last time. The schedule call will never return, and | 2125 | * schedule one last time. The schedule call will never return, and |
2126 | * the scheduled task must drop that reference. | 2126 | * the scheduled task must drop that reference. |
2127 | * The test for TASK_DEAD must occur while the runqueue locks are | 2127 | * The test for TASK_DEAD must occur while the runqueue locks are |
2128 | * still held, otherwise prev could be scheduled on another cpu, die | 2128 | * still held, otherwise prev could be scheduled on another cpu, die |
2129 | * there before we look at prev->state, and then the reference would | 2129 | * there before we look at prev->state, and then the reference would |
2130 | * be dropped twice. | 2130 | * be dropped twice. |
2131 | * Manfred Spraul <manfred@colorfullife.com> | 2131 | * Manfred Spraul <manfred@colorfullife.com> |
2132 | */ | 2132 | */ |
2133 | prev_state = prev->state; | 2133 | prev_state = prev->state; |
2134 | finish_arch_switch(prev); | 2134 | finish_arch_switch(prev); |
2135 | finish_lock_switch(rq, prev); | 2135 | finish_lock_switch(rq, prev); |
2136 | #ifdef CONFIG_SMP | 2136 | #ifdef CONFIG_SMP |
2137 | if (current->sched_class->post_schedule) | 2137 | if (current->sched_class->post_schedule) |
2138 | current->sched_class->post_schedule(rq); | 2138 | current->sched_class->post_schedule(rq); |
2139 | #endif | 2139 | #endif |
2140 | 2140 | ||
2141 | fire_sched_in_preempt_notifiers(current); | 2141 | fire_sched_in_preempt_notifiers(current); |
2142 | if (mm) | 2142 | if (mm) |
2143 | mmdrop(mm); | 2143 | mmdrop(mm); |
2144 | if (unlikely(prev_state == TASK_DEAD)) { | 2144 | if (unlikely(prev_state == TASK_DEAD)) { |
2145 | /* | 2145 | /* |
2146 | * Remove function-return probe instances associated with this | 2146 | * Remove function-return probe instances associated with this |
2147 | * task and put them back on the free list. | 2147 | * task and put them back on the free list. |
2148 | */ | 2148 | */ |
2149 | kprobe_flush_task(prev); | 2149 | kprobe_flush_task(prev); |
2150 | put_task_struct(prev); | 2150 | put_task_struct(prev); |
2151 | } | 2151 | } |
2152 | } | 2152 | } |
2153 | 2153 | ||
2154 | /** | 2154 | /** |
2155 | * schedule_tail - first thing a freshly forked thread must call. | 2155 | * schedule_tail - first thing a freshly forked thread must call. |
2156 | * @prev: the thread we just switched away from. | 2156 | * @prev: the thread we just switched away from. |
2157 | */ | 2157 | */ |
2158 | asmlinkage void schedule_tail(struct task_struct *prev) | 2158 | asmlinkage void schedule_tail(struct task_struct *prev) |
2159 | __releases(rq->lock) | 2159 | __releases(rq->lock) |
2160 | { | 2160 | { |
2161 | struct rq *rq = this_rq(); | 2161 | struct rq *rq = this_rq(); |
2162 | 2162 | ||
2163 | finish_task_switch(rq, prev); | 2163 | finish_task_switch(rq, prev); |
2164 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 2164 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
2165 | /* In this case, finish_task_switch does not reenable preemption */ | 2165 | /* In this case, finish_task_switch does not reenable preemption */ |
2166 | preempt_enable(); | 2166 | preempt_enable(); |
2167 | #endif | 2167 | #endif |
2168 | if (current->set_child_tid) | 2168 | if (current->set_child_tid) |
2169 | put_user(task_pid_vnr(current), current->set_child_tid); | 2169 | put_user(task_pid_vnr(current), current->set_child_tid); |
2170 | } | 2170 | } |
2171 | 2171 | ||
2172 | /* | 2172 | /* |
2173 | * context_switch - switch to the new MM and the new | 2173 | * context_switch - switch to the new MM and the new |
2174 | * thread's register state. | 2174 | * thread's register state. |
2175 | */ | 2175 | */ |
2176 | static inline void | 2176 | static inline void |
2177 | context_switch(struct rq *rq, struct task_struct *prev, | 2177 | context_switch(struct rq *rq, struct task_struct *prev, |
2178 | struct task_struct *next) | 2178 | struct task_struct *next) |
2179 | { | 2179 | { |
2180 | struct mm_struct *mm, *oldmm; | 2180 | struct mm_struct *mm, *oldmm; |
2181 | 2181 | ||
2182 | prepare_task_switch(rq, prev, next); | 2182 | prepare_task_switch(rq, prev, next); |
2183 | mm = next->mm; | 2183 | mm = next->mm; |
2184 | oldmm = prev->active_mm; | 2184 | oldmm = prev->active_mm; |
2185 | /* | 2185 | /* |
2186 | * For paravirt, this is coupled with an exit in switch_to to | 2186 | * For paravirt, this is coupled with an exit in switch_to to |
2187 | * combine the page table reload and the switch backend into | 2187 | * combine the page table reload and the switch backend into |
2188 | * one hypercall. | 2188 | * one hypercall. |
2189 | */ | 2189 | */ |
2190 | arch_enter_lazy_cpu_mode(); | 2190 | arch_enter_lazy_cpu_mode(); |
2191 | 2191 | ||
2192 | if (unlikely(!mm)) { | 2192 | if (unlikely(!mm)) { |
2193 | next->active_mm = oldmm; | 2193 | next->active_mm = oldmm; |
2194 | atomic_inc(&oldmm->mm_count); | 2194 | atomic_inc(&oldmm->mm_count); |
2195 | enter_lazy_tlb(oldmm, next); | 2195 | enter_lazy_tlb(oldmm, next); |
2196 | } else | 2196 | } else |
2197 | switch_mm(oldmm, mm, next); | 2197 | switch_mm(oldmm, mm, next); |
2198 | 2198 | ||
2199 | if (unlikely(!prev->mm)) { | 2199 | if (unlikely(!prev->mm)) { |
2200 | prev->active_mm = NULL; | 2200 | prev->active_mm = NULL; |
2201 | rq->prev_mm = oldmm; | 2201 | rq->prev_mm = oldmm; |
2202 | } | 2202 | } |
2203 | /* | 2203 | /* |
2204 | * Since the runqueue lock will be released by the next | 2204 | * Since the runqueue lock will be released by the next |
2205 | * task (which is an invalid locking op but in the case | 2205 | * task (which is an invalid locking op but in the case |
2206 | * of the scheduler it's an obvious special-case), we | 2206 | * of the scheduler it's an obvious special-case), we |
2207 | * do an early lockdep release here: | 2207 | * do an early lockdep release here: |
2208 | */ | 2208 | */ |
2209 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 2209 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
2210 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 2210 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
2211 | #endif | 2211 | #endif |
2212 | 2212 | ||
2213 | /* Here we just switch the register state and the stack. */ | 2213 | /* Here we just switch the register state and the stack. */ |
2214 | switch_to(prev, next, prev); | 2214 | switch_to(prev, next, prev); |
2215 | 2215 | ||
2216 | barrier(); | 2216 | barrier(); |
2217 | /* | 2217 | /* |
2218 | * this_rq must be evaluated again because prev may have moved | 2218 | * this_rq must be evaluated again because prev may have moved |
2219 | * CPUs since it called schedule(), thus the 'rq' on its stack | 2219 | * CPUs since it called schedule(), thus the 'rq' on its stack |
2220 | * frame will be invalid. | 2220 | * frame will be invalid. |
2221 | */ | 2221 | */ |
2222 | finish_task_switch(this_rq(), prev); | 2222 | finish_task_switch(this_rq(), prev); |
2223 | } | 2223 | } |
2224 | 2224 | ||
2225 | /* | 2225 | /* |
2226 | * nr_running, nr_uninterruptible and nr_context_switches: | 2226 | * nr_running, nr_uninterruptible and nr_context_switches: |
2227 | * | 2227 | * |
2228 | * externally visible scheduler statistics: current number of runnable | 2228 | * externally visible scheduler statistics: current number of runnable |
2229 | * threads, current number of uninterruptible-sleeping threads, total | 2229 | * threads, current number of uninterruptible-sleeping threads, total |
2230 | * number of context switches performed since bootup. | 2230 | * number of context switches performed since bootup. |
2231 | */ | 2231 | */ |
2232 | unsigned long nr_running(void) | 2232 | unsigned long nr_running(void) |
2233 | { | 2233 | { |
2234 | unsigned long i, sum = 0; | 2234 | unsigned long i, sum = 0; |
2235 | 2235 | ||
2236 | for_each_online_cpu(i) | 2236 | for_each_online_cpu(i) |
2237 | sum += cpu_rq(i)->nr_running; | 2237 | sum += cpu_rq(i)->nr_running; |
2238 | 2238 | ||
2239 | return sum; | 2239 | return sum; |
2240 | } | 2240 | } |
2241 | 2241 | ||
2242 | unsigned long nr_uninterruptible(void) | 2242 | unsigned long nr_uninterruptible(void) |
2243 | { | 2243 | { |
2244 | unsigned long i, sum = 0; | 2244 | unsigned long i, sum = 0; |
2245 | 2245 | ||
2246 | for_each_possible_cpu(i) | 2246 | for_each_possible_cpu(i) |
2247 | sum += cpu_rq(i)->nr_uninterruptible; | 2247 | sum += cpu_rq(i)->nr_uninterruptible; |
2248 | 2248 | ||
2249 | /* | 2249 | /* |
2250 | * Since we read the counters lockless, it might be slightly | 2250 | * Since we read the counters lockless, it might be slightly |
2251 | * inaccurate. Do not allow it to go below zero though: | 2251 | * inaccurate. Do not allow it to go below zero though: |
2252 | */ | 2252 | */ |
2253 | if (unlikely((long)sum < 0)) | 2253 | if (unlikely((long)sum < 0)) |
2254 | sum = 0; | 2254 | sum = 0; |
2255 | 2255 | ||
2256 | return sum; | 2256 | return sum; |
2257 | } | 2257 | } |
2258 | 2258 | ||
2259 | unsigned long long nr_context_switches(void) | 2259 | unsigned long long nr_context_switches(void) |
2260 | { | 2260 | { |
2261 | int i; | 2261 | int i; |
2262 | unsigned long long sum = 0; | 2262 | unsigned long long sum = 0; |
2263 | 2263 | ||
2264 | for_each_possible_cpu(i) | 2264 | for_each_possible_cpu(i) |
2265 | sum += cpu_rq(i)->nr_switches; | 2265 | sum += cpu_rq(i)->nr_switches; |
2266 | 2266 | ||
2267 | return sum; | 2267 | return sum; |
2268 | } | 2268 | } |
2269 | 2269 | ||
2270 | unsigned long nr_iowait(void) | 2270 | unsigned long nr_iowait(void) |
2271 | { | 2271 | { |
2272 | unsigned long i, sum = 0; | 2272 | unsigned long i, sum = 0; |
2273 | 2273 | ||
2274 | for_each_possible_cpu(i) | 2274 | for_each_possible_cpu(i) |
2275 | sum += atomic_read(&cpu_rq(i)->nr_iowait); | 2275 | sum += atomic_read(&cpu_rq(i)->nr_iowait); |
2276 | 2276 | ||
2277 | return sum; | 2277 | return sum; |
2278 | } | 2278 | } |
2279 | 2279 | ||
2280 | unsigned long nr_active(void) | 2280 | unsigned long nr_active(void) |
2281 | { | 2281 | { |
2282 | unsigned long i, running = 0, uninterruptible = 0; | 2282 | unsigned long i, running = 0, uninterruptible = 0; |
2283 | 2283 | ||
2284 | for_each_online_cpu(i) { | 2284 | for_each_online_cpu(i) { |
2285 | running += cpu_rq(i)->nr_running; | 2285 | running += cpu_rq(i)->nr_running; |
2286 | uninterruptible += cpu_rq(i)->nr_uninterruptible; | 2286 | uninterruptible += cpu_rq(i)->nr_uninterruptible; |
2287 | } | 2287 | } |
2288 | 2288 | ||
2289 | if (unlikely((long)uninterruptible < 0)) | 2289 | if (unlikely((long)uninterruptible < 0)) |
2290 | uninterruptible = 0; | 2290 | uninterruptible = 0; |
2291 | 2291 | ||
2292 | return running + uninterruptible; | 2292 | return running + uninterruptible; |
2293 | } | 2293 | } |
2294 | 2294 | ||
2295 | /* | 2295 | /* |
2296 | * Update rq->cpu_load[] statistics. This function is usually called every | 2296 | * Update rq->cpu_load[] statistics. This function is usually called every |
2297 | * scheduler tick (TICK_NSEC). | 2297 | * scheduler tick (TICK_NSEC). |
2298 | */ | 2298 | */ |
2299 | static void update_cpu_load(struct rq *this_rq) | 2299 | static void update_cpu_load(struct rq *this_rq) |
2300 | { | 2300 | { |
2301 | unsigned long this_load = this_rq->load.weight; | 2301 | unsigned long this_load = this_rq->load.weight; |
2302 | int i, scale; | 2302 | int i, scale; |
2303 | 2303 | ||
2304 | this_rq->nr_load_updates++; | 2304 | this_rq->nr_load_updates++; |
2305 | 2305 | ||
2306 | /* Update our load: */ | 2306 | /* Update our load: */ |
2307 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2307 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
2308 | unsigned long old_load, new_load; | 2308 | unsigned long old_load, new_load; |
2309 | 2309 | ||
2310 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 2310 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
2311 | 2311 | ||
2312 | old_load = this_rq->cpu_load[i]; | 2312 | old_load = this_rq->cpu_load[i]; |
2313 | new_load = this_load; | 2313 | new_load = this_load; |
2314 | /* | 2314 | /* |
2315 | * Round up the averaging division if load is increasing. This | 2315 | * Round up the averaging division if load is increasing. This |
2316 | * prevents us from getting stuck on 9 if the load is 10, for | 2316 | * prevents us from getting stuck on 9 if the load is 10, for |
2317 | * example. | 2317 | * example. |
2318 | */ | 2318 | */ |
2319 | if (new_load > old_load) | 2319 | if (new_load > old_load) |
2320 | new_load += scale-1; | 2320 | new_load += scale-1; |
2321 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 2321 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; |
2322 | } | 2322 | } |
2323 | } | 2323 | } |
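To make the averaging above concrete, here is a worked instance of the cpu_load[i] update, using illustrative load values:

/*
 * Worked example for i == 2 (scale == 4), old_load == 1024, this_load == 2048:
 * the load is rising, so new_load is rounded up to 2048 + (4 - 1) == 2051, and
 *
 *	cpu_load[2] = (1024 * 3 + 2051) >> 2 = 5123 >> 2 = 1280
 *
 * i.e. higher indices decay toward the instantaneous load with progressively
 * longer time constants, while cpu_load[0] tracks it exactly.
 */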
2324 | 2324 | ||
2325 | #ifdef CONFIG_SMP | 2325 | #ifdef CONFIG_SMP |
2326 | 2326 | ||
2327 | /* | 2327 | /* |
2328 | * double_rq_lock - safely lock two runqueues | 2328 | * double_rq_lock - safely lock two runqueues |
2329 | * | 2329 | * |
2330 | * Note this does not disable interrupts like task_rq_lock; | 2330 | * Note this does not disable interrupts like task_rq_lock; |
2331 | * you need to do so manually before calling. | 2331 | * you need to do so manually before calling. |
2332 | */ | 2332 | */ |
2333 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | 2333 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) |
2334 | __acquires(rq1->lock) | 2334 | __acquires(rq1->lock) |
2335 | __acquires(rq2->lock) | 2335 | __acquires(rq2->lock) |
2336 | { | 2336 | { |
2337 | BUG_ON(!irqs_disabled()); | 2337 | BUG_ON(!irqs_disabled()); |
2338 | if (rq1 == rq2) { | 2338 | if (rq1 == rq2) { |
2339 | spin_lock(&rq1->lock); | 2339 | spin_lock(&rq1->lock); |
2340 | __acquire(rq2->lock); /* Fake it out ;) */ | 2340 | __acquire(rq2->lock); /* Fake it out ;) */ |
2341 | } else { | 2341 | } else { |
2342 | if (rq1 < rq2) { | 2342 | if (rq1 < rq2) { |
2343 | spin_lock(&rq1->lock); | 2343 | spin_lock(&rq1->lock); |
2344 | spin_lock(&rq2->lock); | 2344 | spin_lock(&rq2->lock); |
2345 | } else { | 2345 | } else { |
2346 | spin_lock(&rq2->lock); | 2346 | spin_lock(&rq2->lock); |
2347 | spin_lock(&rq1->lock); | 2347 | spin_lock(&rq1->lock); |
2348 | } | 2348 | } |
2349 | } | 2349 | } |
2350 | update_rq_clock(rq1); | 2350 | update_rq_clock(rq1); |
2351 | update_rq_clock(rq2); | 2351 | update_rq_clock(rq2); |
2352 | } | 2352 | } |
2353 | 2353 | ||
2354 | /* | 2354 | /* |
2355 | * double_rq_unlock - safely unlock two runqueues | 2355 | * double_rq_unlock - safely unlock two runqueues |
2356 | * | 2356 | * |
2357 | * Note this does not restore interrupts like task_rq_unlock; | 2357 | * Note this does not restore interrupts like task_rq_unlock; |
2358 | * you need to do so manually after calling. | 2358 | * you need to do so manually after calling. |
2359 | */ | 2359 | */ |
2360 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | 2360 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) |
2361 | __releases(rq1->lock) | 2361 | __releases(rq1->lock) |
2362 | __releases(rq2->lock) | 2362 | __releases(rq2->lock) |
2363 | { | 2363 | { |
2364 | spin_unlock(&rq1->lock); | 2364 | spin_unlock(&rq1->lock); |
2365 | if (rq1 != rq2) | 2365 | if (rq1 != rq2) |
2366 | spin_unlock(&rq2->lock); | 2366 | spin_unlock(&rq2->lock); |
2367 | else | 2367 | else |
2368 | __release(rq2->lock); | 2368 | __release(rq2->lock); |
2369 | } | 2369 | } |
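Both helpers above are static to this file, so the following is only an illustrative caller pattern (a sketch, not an exported API) showing the protocol they expect:

/* Sketch only: lock two runqueues for a migration step. */
static void example_double_lock(struct rq *rq1, struct rq *rq2)
{
	unsigned long flags;

	local_irq_save(flags);		/* the helpers require IRQs already off */
	double_rq_lock(rq1, rq2);	/* locks taken in address order to avoid ABBA */
	/* ... move tasks while both rq->lock spinlocks are held ... */
	double_rq_unlock(rq1, rq2);
	local_irq_restore(flags);
}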
2370 | 2370 | ||
2371 | /* | 2371 | /* |
2372 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 2372 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
2373 | */ | 2373 | */ |
2374 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | 2374 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
2375 | __releases(this_rq->lock) | 2375 | __releases(this_rq->lock) |
2376 | __acquires(busiest->lock) | 2376 | __acquires(busiest->lock) |
2377 | __acquires(this_rq->lock) | 2377 | __acquires(this_rq->lock) |
2378 | { | 2378 | { |
2379 | int ret = 0; | 2379 | int ret = 0; |
2380 | 2380 | ||
2381 | if (unlikely(!irqs_disabled())) { | 2381 | if (unlikely(!irqs_disabled())) { |
2382 | /* printk() doesn't work well under rq->lock */ | 2382 | /* printk() doesn't work well under rq->lock */ |
2383 | spin_unlock(&this_rq->lock); | 2383 | spin_unlock(&this_rq->lock); |
2384 | BUG_ON(1); | 2384 | BUG_ON(1); |
2385 | } | 2385 | } |
2386 | if (unlikely(!spin_trylock(&busiest->lock))) { | 2386 | if (unlikely(!spin_trylock(&busiest->lock))) { |
2387 | if (busiest < this_rq) { | 2387 | if (busiest < this_rq) { |
2388 | spin_unlock(&this_rq->lock); | 2388 | spin_unlock(&this_rq->lock); |
2389 | spin_lock(&busiest->lock); | 2389 | spin_lock(&busiest->lock); |
2390 | spin_lock(&this_rq->lock); | 2390 | spin_lock(&this_rq->lock); |
2391 | ret = 1; | 2391 | ret = 1; |
2392 | } else | 2392 | } else |
2393 | spin_lock(&busiest->lock); | 2393 | spin_lock(&busiest->lock); |
2394 | } | 2394 | } |
2395 | return ret; | 2395 | return ret; |
2396 | } | 2396 | } |
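The subtle part of double_lock_balance() is its return value; a hedched, hypothetical caller sketch (not from the patch) spells it out:

/*
 * Sketch: a return of 1 means this_rq->lock was dropped and re-acquired
 * while waiting for busiest->lock, so any state sampled under this_rq->lock
 * before the call must be revalidated afterwards.
 */
static void example_lock_busiest(struct rq *this_rq, struct rq *busiest)
{
	if (double_lock_balance(this_rq, busiest)) {
		/* this_rq->lock was released: re-check nr_running, curr, ... */
	}
	/* ... pull tasks with both locks held ... */
	spin_unlock(&busiest->lock);
}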
2397 | 2397 | ||
2398 | /* | 2398 | /* |
2399 | * If dest_cpu is allowed for this process, migrate the task to it. | 2399 | * If dest_cpu is allowed for this process, migrate the task to it. |
2400 | * This is done by queueing a migration request on the task's runqueue | 2400 | * This is done by queueing a migration request on the task's runqueue |
2401 | * and waking that CPU's migration thread, which performs the actual | 2401 | * and waking that CPU's migration thread, which performs the actual |
2402 | * move and then completes the request. | 2402 | * move and then completes the request. |
2403 | */ | 2403 | */ |
2404 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) | 2404 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) |
2405 | { | 2405 | { |
2406 | struct migration_req req; | 2406 | struct migration_req req; |
2407 | unsigned long flags; | 2407 | unsigned long flags; |
2408 | struct rq *rq; | 2408 | struct rq *rq; |
2409 | 2409 | ||
2410 | rq = task_rq_lock(p, &flags); | 2410 | rq = task_rq_lock(p, &flags); |
2411 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | 2411 | if (!cpu_isset(dest_cpu, p->cpus_allowed) |
2412 | || unlikely(cpu_is_offline(dest_cpu))) | 2412 | || unlikely(cpu_is_offline(dest_cpu))) |
2413 | goto out; | 2413 | goto out; |
2414 | 2414 | ||
2415 | /* force the process onto the specified CPU */ | 2415 | /* force the process onto the specified CPU */ |
2416 | if (migrate_task(p, dest_cpu, &req)) { | 2416 | if (migrate_task(p, dest_cpu, &req)) { |
2417 | /* Need to wait for migration thread (might exit: take ref). */ | 2417 | /* Need to wait for migration thread (might exit: take ref). */ |
2418 | struct task_struct *mt = rq->migration_thread; | 2418 | struct task_struct *mt = rq->migration_thread; |
2419 | 2419 | ||
2420 | get_task_struct(mt); | 2420 | get_task_struct(mt); |
2421 | task_rq_unlock(rq, &flags); | 2421 | task_rq_unlock(rq, &flags); |
2422 | wake_up_process(mt); | 2422 | wake_up_process(mt); |
2423 | put_task_struct(mt); | 2423 | put_task_struct(mt); |
2424 | wait_for_completion(&req.done); | 2424 | wait_for_completion(&req.done); |
2425 | 2425 | ||
2426 | return; | 2426 | return; |
2427 | } | 2427 | } |
2428 | out: | 2428 | out: |
2429 | task_rq_unlock(rq, &flags); | 2429 | task_rq_unlock(rq, &flags); |
2430 | } | 2430 | } |
2431 | 2431 | ||
2432 | /* | 2432 | /* |
2433 | * sched_exec - execve() is a valuable balancing opportunity, because at | 2433 | * sched_exec - execve() is a valuable balancing opportunity, because at |
2434 | * this point the task has the smallest effective memory and cache footprint. | 2434 | * this point the task has the smallest effective memory and cache footprint. |
2435 | */ | 2435 | */ |
2436 | void sched_exec(void) | 2436 | void sched_exec(void) |
2437 | { | 2437 | { |
2438 | int new_cpu, this_cpu = get_cpu(); | 2438 | int new_cpu, this_cpu = get_cpu(); |
2439 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | 2439 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); |
2440 | put_cpu(); | 2440 | put_cpu(); |
2441 | if (new_cpu != this_cpu) | 2441 | if (new_cpu != this_cpu) |
2442 | sched_migrate_task(current, new_cpu); | 2442 | sched_migrate_task(current, new_cpu); |
2443 | } | 2443 | } |
2444 | 2444 | ||
2445 | /* | 2445 | /* |
2446 | * pull_task - move a task from a remote runqueue to the local runqueue. | 2446 | * pull_task - move a task from a remote runqueue to the local runqueue. |
2447 | * Both runqueues must be locked. | 2447 | * Both runqueues must be locked. |
2448 | */ | 2448 | */ |
2449 | static void pull_task(struct rq *src_rq, struct task_struct *p, | 2449 | static void pull_task(struct rq *src_rq, struct task_struct *p, |
2450 | struct rq *this_rq, int this_cpu) | 2450 | struct rq *this_rq, int this_cpu) |
2451 | { | 2451 | { |
2452 | deactivate_task(src_rq, p, 0); | 2452 | deactivate_task(src_rq, p, 0); |
2453 | set_task_cpu(p, this_cpu); | 2453 | set_task_cpu(p, this_cpu); |
2454 | activate_task(this_rq, p, 0); | 2454 | activate_task(this_rq, p, 0); |
2455 | /* | 2455 | /* |
2456 | * Note that idle threads have a prio of MAX_PRIO, so that this test | 2456 | * Note that idle threads have a prio of MAX_PRIO, so that this test |
2457 | * is always true for them. | 2457 | * is always true for them. |
2458 | */ | 2458 | */ |
2459 | check_preempt_curr(this_rq, p); | 2459 | check_preempt_curr(this_rq, p); |
2460 | } | 2460 | } |
2461 | 2461 | ||
2462 | /* | 2462 | /* |
2463 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 2463 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
2464 | */ | 2464 | */ |
2465 | static | 2465 | static |
2466 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | 2466 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
2467 | struct sched_domain *sd, enum cpu_idle_type idle, | 2467 | struct sched_domain *sd, enum cpu_idle_type idle, |
2468 | int *all_pinned) | 2468 | int *all_pinned) |
2469 | { | 2469 | { |
2470 | /* | 2470 | /* |
2471 | * We do not migrate tasks that: | 2471 | * We do not migrate tasks that: |
2472 | * 1) are running (obviously), or | 2472 | * 1) are running (obviously), or |
2473 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 2473 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
2474 | * 3) are cache-hot on their current CPU. | 2474 | * 3) are cache-hot on their current CPU. |
2475 | */ | 2475 | */ |
2476 | if (!cpu_isset(this_cpu, p->cpus_allowed)) { | 2476 | if (!cpu_isset(this_cpu, p->cpus_allowed)) { |
2477 | schedstat_inc(p, se.nr_failed_migrations_affine); | 2477 | schedstat_inc(p, se.nr_failed_migrations_affine); |
2478 | return 0; | 2478 | return 0; |
2479 | } | 2479 | } |
2480 | *all_pinned = 0; | 2480 | *all_pinned = 0; |
2481 | 2481 | ||
2482 | if (task_running(rq, p)) { | 2482 | if (task_running(rq, p)) { |
2483 | schedstat_inc(p, se.nr_failed_migrations_running); | 2483 | schedstat_inc(p, se.nr_failed_migrations_running); |
2484 | return 0; | 2484 | return 0; |
2485 | } | 2485 | } |
2486 | 2486 | ||
2487 | /* | 2487 | /* |
2488 | * Aggressive migration if: | 2488 | * Aggressive migration if: |
2489 | * 1) task is cache cold, or | 2489 | * 1) task is cache cold, or |
2490 | * 2) too many balance attempts have failed. | 2490 | * 2) too many balance attempts have failed. |
2491 | */ | 2491 | */ |
2492 | 2492 | ||
2493 | if (!task_hot(p, rq->clock, sd) || | 2493 | if (!task_hot(p, rq->clock, sd) || |
2494 | sd->nr_balance_failed > sd->cache_nice_tries) { | 2494 | sd->nr_balance_failed > sd->cache_nice_tries) { |
2495 | #ifdef CONFIG_SCHEDSTATS | 2495 | #ifdef CONFIG_SCHEDSTATS |
2496 | if (task_hot(p, rq->clock, sd)) { | 2496 | if (task_hot(p, rq->clock, sd)) { |
2497 | schedstat_inc(sd, lb_hot_gained[idle]); | 2497 | schedstat_inc(sd, lb_hot_gained[idle]); |
2498 | schedstat_inc(p, se.nr_forced_migrations); | 2498 | schedstat_inc(p, se.nr_forced_migrations); |
2499 | } | 2499 | } |
2500 | #endif | 2500 | #endif |
2501 | return 1; | 2501 | return 1; |
2502 | } | 2502 | } |
2503 | 2503 | ||
2504 | if (task_hot(p, rq->clock, sd)) { | 2504 | if (task_hot(p, rq->clock, sd)) { |
2505 | schedstat_inc(p, se.nr_failed_migrations_hot); | 2505 | schedstat_inc(p, se.nr_failed_migrations_hot); |
2506 | return 0; | 2506 | return 0; |
2507 | } | 2507 | } |
2508 | return 1; | 2508 | return 1; |
2509 | } | 2509 | } |
2510 | 2510 | ||
2511 | static unsigned long | 2511 | static unsigned long |
2512 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2512 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2513 | unsigned long max_load_move, struct sched_domain *sd, | 2513 | unsigned long max_load_move, struct sched_domain *sd, |
2514 | enum cpu_idle_type idle, int *all_pinned, | 2514 | enum cpu_idle_type idle, int *all_pinned, |
2515 | int *this_best_prio, struct rq_iterator *iterator) | 2515 | int *this_best_prio, struct rq_iterator *iterator) |
2516 | { | 2516 | { |
2517 | int loops = 0, pulled = 0, pinned = 0, skip_for_load; | 2517 | int loops = 0, pulled = 0, pinned = 0, skip_for_load; |
2518 | struct task_struct *p; | 2518 | struct task_struct *p; |
2519 | long rem_load_move = max_load_move; | 2519 | long rem_load_move = max_load_move; |
2520 | 2520 | ||
2521 | if (max_load_move == 0) | 2521 | if (max_load_move == 0) |
2522 | goto out; | 2522 | goto out; |
2523 | 2523 | ||
2524 | pinned = 1; | 2524 | pinned = 1; |
2525 | 2525 | ||
2526 | /* | 2526 | /* |
2527 | * Start the load-balancing iterator: | 2527 | * Start the load-balancing iterator: |
2528 | */ | 2528 | */ |
2529 | p = iterator->start(iterator->arg); | 2529 | p = iterator->start(iterator->arg); |
2530 | next: | 2530 | next: |
2531 | if (!p || loops++ > sysctl_sched_nr_migrate) | 2531 | if (!p || loops++ > sysctl_sched_nr_migrate) |
2532 | goto out; | 2532 | goto out; |
2533 | /* | 2533 | /* |
2534 | * To help distribute high priority tasks across CPUs we don't | 2534 | * To help distribute high priority tasks across CPUs we don't |
2535 | * skip a task if it will be the highest priority task (i.e. smallest | 2535 | * skip a task if it will be the highest priority task (i.e. smallest |
2536 | * prio value) on its new queue regardless of its load weight | 2536 | * prio value) on its new queue regardless of its load weight |
2537 | */ | 2537 | */ |
2538 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + | 2538 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + |
2539 | SCHED_LOAD_SCALE_FUZZ; | 2539 | SCHED_LOAD_SCALE_FUZZ; |
2540 | if ((skip_for_load && p->prio >= *this_best_prio) || | 2540 | if ((skip_for_load && p->prio >= *this_best_prio) || |
2541 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | 2541 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
2542 | p = iterator->next(iterator->arg); | 2542 | p = iterator->next(iterator->arg); |
2543 | goto next; | 2543 | goto next; |
2544 | } | 2544 | } |
2545 | 2545 | ||
2546 | pull_task(busiest, p, this_rq, this_cpu); | 2546 | pull_task(busiest, p, this_rq, this_cpu); |
2547 | pulled++; | 2547 | pulled++; |
2548 | rem_load_move -= p->se.load.weight; | 2548 | rem_load_move -= p->se.load.weight; |
2549 | 2549 | ||
2550 | /* | 2550 | /* |
2551 | * We only want to steal up to the prescribed amount of weighted load. | 2551 | * We only want to steal up to the prescribed amount of weighted load. |
2552 | */ | 2552 | */ |
2553 | if (rem_load_move > 0) { | 2553 | if (rem_load_move > 0) { |
2554 | if (p->prio < *this_best_prio) | 2554 | if (p->prio < *this_best_prio) |
2555 | *this_best_prio = p->prio; | 2555 | *this_best_prio = p->prio; |
2556 | p = iterator->next(iterator->arg); | 2556 | p = iterator->next(iterator->arg); |
2557 | goto next; | 2557 | goto next; |
2558 | } | 2558 | } |
2559 | out: | 2559 | out: |
2560 | /* | 2560 | /* |
2561 | * Right now, this is one of only two places pull_task() is called, | 2561 | * Right now, this is one of only two places pull_task() is called, |
2562 | * so we can safely collect pull_task() stats here rather than | 2562 | * so we can safely collect pull_task() stats here rather than |
2563 | * inside pull_task(). | 2563 | * inside pull_task(). |
2564 | */ | 2564 | */ |
2565 | schedstat_add(sd, lb_gained[idle], pulled); | 2565 | schedstat_add(sd, lb_gained[idle], pulled); |
2566 | 2566 | ||
2567 | if (all_pinned) | 2567 | if (all_pinned) |
2568 | *all_pinned = pinned; | 2568 | *all_pinned = pinned; |
2569 | 2569 | ||
2570 | return max_load_move - rem_load_move; | 2570 | return max_load_move - rem_load_move; |
2571 | } | 2571 | } |
2572 | 2572 | ||
2573 | /* | 2573 | /* |
2574 | * move_tasks tries to move up to max_load_move weighted load from busiest to | 2574 | * move_tasks tries to move up to max_load_move weighted load from busiest to |
2575 | * this_rq, as part of a balancing operation within domain "sd". | 2575 | * this_rq, as part of a balancing operation within domain "sd". |
2576 | * Returns 1 if successful and 0 otherwise. | 2576 | * Returns 1 if successful and 0 otherwise. |
2577 | * | 2577 | * |
2578 | * Called with both runqueues locked. | 2578 | * Called with both runqueues locked. |
2579 | */ | 2579 | */ |
2580 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2580 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2581 | unsigned long max_load_move, | 2581 | unsigned long max_load_move, |
2582 | struct sched_domain *sd, enum cpu_idle_type idle, | 2582 | struct sched_domain *sd, enum cpu_idle_type idle, |
2583 | int *all_pinned) | 2583 | int *all_pinned) |
2584 | { | 2584 | { |
2585 | const struct sched_class *class = sched_class_highest; | 2585 | const struct sched_class *class = sched_class_highest; |
2586 | unsigned long total_load_moved = 0; | 2586 | unsigned long total_load_moved = 0; |
2587 | int this_best_prio = this_rq->curr->prio; | 2587 | int this_best_prio = this_rq->curr->prio; |
2588 | 2588 | ||
2589 | do { | 2589 | do { |
2590 | total_load_moved += | 2590 | total_load_moved += |
2591 | class->load_balance(this_rq, this_cpu, busiest, | 2591 | class->load_balance(this_rq, this_cpu, busiest, |
2592 | max_load_move - total_load_moved, | 2592 | max_load_move - total_load_moved, |
2593 | sd, idle, all_pinned, &this_best_prio); | 2593 | sd, idle, all_pinned, &this_best_prio); |
2594 | class = class->next; | 2594 | class = class->next; |
2595 | } while (class && max_load_move > total_load_moved); | 2595 | } while (class && max_load_move > total_load_moved); |
2596 | 2596 | ||
2597 | return total_load_moved > 0; | 2597 | return total_load_moved > 0; |
2598 | } | 2598 | } |
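The class-by-class drain in move_tasks() can be modelled outside the kernel. A hedged sketch, assuming a singly linked list of classes ordered from highest to lowest priority; every name here is invented for the example, only the control flow mirrors the function above.

#include <stdio.h>

/* Invented stand-in for a scheduling class: how much load it could give
 * up, and a link to the next-lower class. */
struct fake_class {
    const char *name;
    unsigned long runnable_load;
    struct fake_class *next;
};

/* Pull at most 'budget' worth of load from one class. */
static unsigned long pull_from_class(struct fake_class *c, unsigned long budget)
{
    unsigned long moved = c->runnable_load < budget ? c->runnable_load : budget;
    c->runnable_load -= moved;
    return moved;
}

int main(void)
{
    struct fake_class fair = { "fair", 700, NULL };
    struct fake_class rt   = { "rt",   300, &fair };
    struct fake_class *class = &rt;        /* highest-priority class first */
    unsigned long max_load_move = 800, total = 0;

    /* Walk the classes until the weighted-load budget is spent. */
    while (class && total < max_load_move) {
        total += pull_from_class(class, max_load_move - total);
        class = class->next;
    }
    printf("moved %lu of %lu\n", total, max_load_move);    /* moved 800 of 800 */
    return 0;
}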
2599 | 2599 | ||
2600 | static int | 2600 | static int |
2601 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2601 | iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2602 | struct sched_domain *sd, enum cpu_idle_type idle, | 2602 | struct sched_domain *sd, enum cpu_idle_type idle, |
2603 | struct rq_iterator *iterator) | 2603 | struct rq_iterator *iterator) |
2604 | { | 2604 | { |
2605 | struct task_struct *p = iterator->start(iterator->arg); | 2605 | struct task_struct *p = iterator->start(iterator->arg); |
2606 | int pinned = 0; | 2606 | int pinned = 0; |
2607 | 2607 | ||
2608 | while (p) { | 2608 | while (p) { |
2609 | if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | 2609 | if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
2610 | pull_task(busiest, p, this_rq, this_cpu); | 2610 | pull_task(busiest, p, this_rq, this_cpu); |
2611 | /* | 2611 | /* |
2612 | * Right now, this is only the second place pull_task() | 2612 | * Right now, this is only the second place pull_task() |
2613 | * is called, so we can safely collect pull_task() | 2613 | * is called, so we can safely collect pull_task() |
2614 | * stats here rather than inside pull_task(). | 2614 | * stats here rather than inside pull_task(). |
2615 | */ | 2615 | */ |
2616 | schedstat_inc(sd, lb_gained[idle]); | 2616 | schedstat_inc(sd, lb_gained[idle]); |
2617 | 2617 | ||
2618 | return 1; | 2618 | return 1; |
2619 | } | 2619 | } |
2620 | p = iterator->next(iterator->arg); | 2620 | p = iterator->next(iterator->arg); |
2621 | } | 2621 | } |
2622 | 2622 | ||
2623 | return 0; | 2623 | return 0; |
2624 | } | 2624 | } |
2625 | 2625 | ||
2626 | /* | 2626 | /* |
2627 | * move_one_task tries to move exactly one task from busiest to this_rq, as | 2627 | * move_one_task tries to move exactly one task from busiest to this_rq, as |
2628 | * part of active balancing operations within "domain". | 2628 | * part of active balancing operations within "domain". |
2629 | * Returns 1 if successful and 0 otherwise. | 2629 | * Returns 1 if successful and 0 otherwise. |
2630 | * | 2630 | * |
2631 | * Called with both runqueues locked. | 2631 | * Called with both runqueues locked. |
2632 | */ | 2632 | */ |
2633 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2633 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2634 | struct sched_domain *sd, enum cpu_idle_type idle) | 2634 | struct sched_domain *sd, enum cpu_idle_type idle) |
2635 | { | 2635 | { |
2636 | const struct sched_class *class; | 2636 | const struct sched_class *class; |
2637 | 2637 | ||
2638 | for (class = sched_class_highest; class; class = class->next) | 2638 | for (class = sched_class_highest; class; class = class->next) |
2639 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | 2639 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) |
2640 | return 1; | 2640 | return 1; |
2641 | 2641 | ||
2642 | return 0; | 2642 | return 0; |
2643 | } | 2643 | } |
2644 | 2644 | ||
2645 | /* | 2645 | /* |
2646 | * find_busiest_group finds and returns the busiest CPU group within the | 2646 | * find_busiest_group finds and returns the busiest CPU group within the |
2647 | * domain. It calculates and returns the amount of weighted load which | 2647 | * domain. It calculates and returns the amount of weighted load which |
2648 | * should be moved to restore balance via the imbalance parameter. | 2648 | * should be moved to restore balance via the imbalance parameter. |
2649 | */ | 2649 | */ |
2650 | static struct sched_group * | 2650 | static struct sched_group * |
2651 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2651 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2652 | unsigned long *imbalance, enum cpu_idle_type idle, | 2652 | unsigned long *imbalance, enum cpu_idle_type idle, |
2653 | int *sd_idle, cpumask_t *cpus, int *balance) | 2653 | int *sd_idle, cpumask_t *cpus, int *balance) |
2654 | { | 2654 | { |
2655 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2655 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
2656 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2656 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
2657 | unsigned long max_pull; | 2657 | unsigned long max_pull; |
2658 | unsigned long busiest_load_per_task, busiest_nr_running; | 2658 | unsigned long busiest_load_per_task, busiest_nr_running; |
2659 | unsigned long this_load_per_task, this_nr_running; | 2659 | unsigned long this_load_per_task, this_nr_running; |
2660 | int load_idx, group_imb = 0; | 2660 | int load_idx, group_imb = 0; |
2661 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2661 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
2662 | int power_savings_balance = 1; | 2662 | int power_savings_balance = 1; |
2663 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | 2663 | unsigned long leader_nr_running = 0, min_load_per_task = 0; |
2664 | unsigned long min_nr_running = ULONG_MAX; | 2664 | unsigned long min_nr_running = ULONG_MAX; |
2665 | struct sched_group *group_min = NULL, *group_leader = NULL; | 2665 | struct sched_group *group_min = NULL, *group_leader = NULL; |
2666 | #endif | 2666 | #endif |
2667 | 2667 | ||
2668 | max_load = this_load = total_load = total_pwr = 0; | 2668 | max_load = this_load = total_load = total_pwr = 0; |
2669 | busiest_load_per_task = busiest_nr_running = 0; | 2669 | busiest_load_per_task = busiest_nr_running = 0; |
2670 | this_load_per_task = this_nr_running = 0; | 2670 | this_load_per_task = this_nr_running = 0; |
2671 | if (idle == CPU_NOT_IDLE) | 2671 | if (idle == CPU_NOT_IDLE) |
2672 | load_idx = sd->busy_idx; | 2672 | load_idx = sd->busy_idx; |
2673 | else if (idle == CPU_NEWLY_IDLE) | 2673 | else if (idle == CPU_NEWLY_IDLE) |
2674 | load_idx = sd->newidle_idx; | 2674 | load_idx = sd->newidle_idx; |
2675 | else | 2675 | else |
2676 | load_idx = sd->idle_idx; | 2676 | load_idx = sd->idle_idx; |
2677 | 2677 | ||
2678 | do { | 2678 | do { |
2679 | unsigned long load, group_capacity, max_cpu_load, min_cpu_load; | 2679 | unsigned long load, group_capacity, max_cpu_load, min_cpu_load; |
2680 | int local_group; | 2680 | int local_group; |
2681 | int i; | 2681 | int i; |
2682 | int __group_imb = 0; | 2682 | int __group_imb = 0; |
2683 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 2683 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
2684 | unsigned long sum_nr_running, sum_weighted_load; | 2684 | unsigned long sum_nr_running, sum_weighted_load; |
2685 | 2685 | ||
2686 | local_group = cpu_isset(this_cpu, group->cpumask); | 2686 | local_group = cpu_isset(this_cpu, group->cpumask); |
2687 | 2687 | ||
2688 | if (local_group) | 2688 | if (local_group) |
2689 | balance_cpu = first_cpu(group->cpumask); | 2689 | balance_cpu = first_cpu(group->cpumask); |
2690 | 2690 | ||
2691 | /* Tally up the load of all CPUs in the group */ | 2691 | /* Tally up the load of all CPUs in the group */ |
2692 | sum_weighted_load = sum_nr_running = avg_load = 0; | 2692 | sum_weighted_load = sum_nr_running = avg_load = 0; |
2693 | max_cpu_load = 0; | 2693 | max_cpu_load = 0; |
2694 | min_cpu_load = ~0UL; | 2694 | min_cpu_load = ~0UL; |
2695 | 2695 | ||
2696 | for_each_cpu_mask(i, group->cpumask) { | 2696 | for_each_cpu_mask(i, group->cpumask) { |
2697 | struct rq *rq; | 2697 | struct rq *rq; |
2698 | 2698 | ||
2699 | if (!cpu_isset(i, *cpus)) | 2699 | if (!cpu_isset(i, *cpus)) |
2700 | continue; | 2700 | continue; |
2701 | 2701 | ||
2702 | rq = cpu_rq(i); | 2702 | rq = cpu_rq(i); |
2703 | 2703 | ||
2704 | if (*sd_idle && rq->nr_running) | 2704 | if (*sd_idle && rq->nr_running) |
2705 | *sd_idle = 0; | 2705 | *sd_idle = 0; |
2706 | 2706 | ||
2707 | /* Bias balancing toward cpus of our domain */ | 2707 | /* Bias balancing toward cpus of our domain */ |
2708 | if (local_group) { | 2708 | if (local_group) { |
2709 | if (idle_cpu(i) && !first_idle_cpu) { | 2709 | if (idle_cpu(i) && !first_idle_cpu) { |
2710 | first_idle_cpu = 1; | 2710 | first_idle_cpu = 1; |
2711 | balance_cpu = i; | 2711 | balance_cpu = i; |
2712 | } | 2712 | } |
2713 | 2713 | ||
2714 | load = target_load(i, load_idx); | 2714 | load = target_load(i, load_idx); |
2715 | } else { | 2715 | } else { |
2716 | load = source_load(i, load_idx); | 2716 | load = source_load(i, load_idx); |
2717 | if (load > max_cpu_load) | 2717 | if (load > max_cpu_load) |
2718 | max_cpu_load = load; | 2718 | max_cpu_load = load; |
2719 | if (min_cpu_load > load) | 2719 | if (min_cpu_load > load) |
2720 | min_cpu_load = load; | 2720 | min_cpu_load = load; |
2721 | } | 2721 | } |
2722 | 2722 | ||
2723 | avg_load += load; | 2723 | avg_load += load; |
2724 | sum_nr_running += rq->nr_running; | 2724 | sum_nr_running += rq->nr_running; |
2725 | sum_weighted_load += weighted_cpuload(i); | 2725 | sum_weighted_load += weighted_cpuload(i); |
2726 | } | 2726 | } |
2727 | 2727 | ||
2728 | /* | 2728 | /* |
2729 | * The first idle cpu, or the first cpu (busiest) in this sched group, | 2729 | * The first idle cpu, or the first cpu (busiest) in this sched group, |
2730 | * is eligible for doing load balancing at this and higher | 2730 | * is eligible for doing load balancing at this and higher |
2731 | * domains. In the newly idle case, we allow all the cpus | 2731 | * domains. In the newly idle case, we allow all the cpus |
2732 | * to do the newly idle load balance. | 2732 | * to do the newly idle load balance. |
2733 | */ | 2733 | */ |
2734 | if (idle != CPU_NEWLY_IDLE && local_group && | 2734 | if (idle != CPU_NEWLY_IDLE && local_group && |
2735 | balance_cpu != this_cpu && balance) { | 2735 | balance_cpu != this_cpu && balance) { |
2736 | *balance = 0; | 2736 | *balance = 0; |
2737 | goto ret; | 2737 | goto ret; |
2738 | } | 2738 | } |
2739 | 2739 | ||
2740 | total_load += avg_load; | 2740 | total_load += avg_load; |
2741 | total_pwr += group->__cpu_power; | 2741 | total_pwr += group->__cpu_power; |
2742 | 2742 | ||
2743 | /* Adjust by relative CPU power of the group */ | 2743 | /* Adjust by relative CPU power of the group */ |
2744 | avg_load = sg_div_cpu_power(group, | 2744 | avg_load = sg_div_cpu_power(group, |
2745 | avg_load * SCHED_LOAD_SCALE); | 2745 | avg_load * SCHED_LOAD_SCALE); |
2746 | 2746 | ||
2747 | if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) | 2747 | if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) |
2748 | __group_imb = 1; | 2748 | __group_imb = 1; |
2749 | 2749 | ||
2750 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 2750 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; |
2751 | 2751 | ||
2752 | if (local_group) { | 2752 | if (local_group) { |
2753 | this_load = avg_load; | 2753 | this_load = avg_load; |
2754 | this = group; | 2754 | this = group; |
2755 | this_nr_running = sum_nr_running; | 2755 | this_nr_running = sum_nr_running; |
2756 | this_load_per_task = sum_weighted_load; | 2756 | this_load_per_task = sum_weighted_load; |
2757 | } else if (avg_load > max_load && | 2757 | } else if (avg_load > max_load && |
2758 | (sum_nr_running > group_capacity || __group_imb)) { | 2758 | (sum_nr_running > group_capacity || __group_imb)) { |
2759 | max_load = avg_load; | 2759 | max_load = avg_load; |
2760 | busiest = group; | 2760 | busiest = group; |
2761 | busiest_nr_running = sum_nr_running; | 2761 | busiest_nr_running = sum_nr_running; |
2762 | busiest_load_per_task = sum_weighted_load; | 2762 | busiest_load_per_task = sum_weighted_load; |
2763 | group_imb = __group_imb; | 2763 | group_imb = __group_imb; |
2764 | } | 2764 | } |
2765 | 2765 | ||
2766 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2766 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
2767 | /* | 2767 | /* |
2768 | * Busy processors will not participate in power savings | 2768 | * Busy processors will not participate in power savings |
2769 | * balance. | 2769 | * balance. |
2770 | */ | 2770 | */ |
2771 | if (idle == CPU_NOT_IDLE || | 2771 | if (idle == CPU_NOT_IDLE || |
2772 | !(sd->flags & SD_POWERSAVINGS_BALANCE)) | 2772 | !(sd->flags & SD_POWERSAVINGS_BALANCE)) |
2773 | goto group_next; | 2773 | goto group_next; |
2774 | 2774 | ||
2775 | /* | 2775 | /* |
2776 | * If the local group is idle or completely loaded | 2776 | * If the local group is idle or completely loaded |
2777 | * no need to do power savings balance at this domain | 2777 | * no need to do power savings balance at this domain |
2778 | */ | 2778 | */ |
2779 | if (local_group && (this_nr_running >= group_capacity || | 2779 | if (local_group && (this_nr_running >= group_capacity || |
2780 | !this_nr_running)) | 2780 | !this_nr_running)) |
2781 | power_savings_balance = 0; | 2781 | power_savings_balance = 0; |
2782 | 2782 | ||
2783 | /* | 2783 | /* |
2784 | * If a group is already running at full capacity or idle, | 2784 | * If a group is already running at full capacity or idle, |
2785 | * don't include that group in power savings calculations | 2785 | * don't include that group in power savings calculations |
2786 | */ | 2786 | */ |
2787 | if (!power_savings_balance || sum_nr_running >= group_capacity | 2787 | if (!power_savings_balance || sum_nr_running >= group_capacity |
2788 | || !sum_nr_running) | 2788 | || !sum_nr_running) |
2789 | goto group_next; | 2789 | goto group_next; |
2790 | 2790 | ||
2791 | /* | 2791 | /* |
2792 | * Calculate the group which has the least non-idle load. | 2792 | * Calculate the group which has the least non-idle load. |
2793 | * This is the group from where we need to pick up the load | 2793 | * This is the group from where we need to pick up the load |
2794 | * for saving power | 2794 | * for saving power |
2795 | */ | 2795 | */ |
2796 | if ((sum_nr_running < min_nr_running) || | 2796 | if ((sum_nr_running < min_nr_running) || |
2797 | (sum_nr_running == min_nr_running && | 2797 | (sum_nr_running == min_nr_running && |
2798 | first_cpu(group->cpumask) < | 2798 | first_cpu(group->cpumask) < |
2799 | first_cpu(group_min->cpumask))) { | 2799 | first_cpu(group_min->cpumask))) { |
2800 | group_min = group; | 2800 | group_min = group; |
2801 | min_nr_running = sum_nr_running; | 2801 | min_nr_running = sum_nr_running; |
2802 | min_load_per_task = sum_weighted_load / | 2802 | min_load_per_task = sum_weighted_load / |
2803 | sum_nr_running; | 2803 | sum_nr_running; |
2804 | } | 2804 | } |
2805 | 2805 | ||
2806 | /* | 2806 | /* |
2807 | * Calculate the group which is running close to its | 2807 | * Calculate the group which is running close to its |
2808 | * capacity but still has room to pick up some load | 2808 | * capacity but still has room to pick up some load |
2809 | * from another group and save more power | 2809 | * from another group and save more power |
2810 | */ | 2810 | */ |
2811 | if (sum_nr_running <= group_capacity - 1) { | 2811 | if (sum_nr_running <= group_capacity - 1) { |
2812 | if (sum_nr_running > leader_nr_running || | 2812 | if (sum_nr_running > leader_nr_running || |
2813 | (sum_nr_running == leader_nr_running && | 2813 | (sum_nr_running == leader_nr_running && |
2814 | first_cpu(group->cpumask) > | 2814 | first_cpu(group->cpumask) > |
2815 | first_cpu(group_leader->cpumask))) { | 2815 | first_cpu(group_leader->cpumask))) { |
2816 | group_leader = group; | 2816 | group_leader = group; |
2817 | leader_nr_running = sum_nr_running; | 2817 | leader_nr_running = sum_nr_running; |
2818 | } | 2818 | } |
2819 | } | 2819 | } |
2820 | group_next: | 2820 | group_next: |
2821 | #endif | 2821 | #endif |
2822 | group = group->next; | 2822 | group = group->next; |
2823 | } while (group != sd->groups); | 2823 | } while (group != sd->groups); |
2824 | 2824 | ||
2825 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) | 2825 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) |
2826 | goto out_balanced; | 2826 | goto out_balanced; |
2827 | 2827 | ||
2828 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 2828 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
2829 | 2829 | ||
2830 | if (this_load >= avg_load || | 2830 | if (this_load >= avg_load || |
2831 | 100*max_load <= sd->imbalance_pct*this_load) | 2831 | 100*max_load <= sd->imbalance_pct*this_load) |
2832 | goto out_balanced; | 2832 | goto out_balanced; |
2833 | 2833 | ||
2834 | busiest_load_per_task /= busiest_nr_running; | 2834 | busiest_load_per_task /= busiest_nr_running; |
2835 | if (group_imb) | 2835 | if (group_imb) |
2836 | busiest_load_per_task = min(busiest_load_per_task, avg_load); | 2836 | busiest_load_per_task = min(busiest_load_per_task, avg_load); |
2837 | 2837 | ||
2838 | /* | 2838 | /* |
2839 | * We're trying to get all the cpus to the average_load, so we don't | 2839 | * We're trying to get all the cpus to the average_load, so we don't |
2840 | * want to push ourselves above the average load, nor do we wish to | 2840 | * want to push ourselves above the average load, nor do we wish to |
2841 | * reduce the max loaded cpu below the average load, as either of these | 2841 | * reduce the max loaded cpu below the average load, as either of these |
2842 | * actions would just result in more rebalancing later, and ping-pong | 2842 | * actions would just result in more rebalancing later, and ping-pong |
2843 | * tasks around. Thus we look for the minimum possible imbalance. | 2843 | * tasks around. Thus we look for the minimum possible imbalance. |
2844 | * Negative imbalances (*we* are more loaded than anyone else) will | 2844 | * Negative imbalances (*we* are more loaded than anyone else) will |
2845 | * be counted as no imbalance for these purposes -- we can't fix that | 2845 | * be counted as no imbalance for these purposes -- we can't fix that |
2846 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2846 | * by pulling tasks to us. Be careful of negative numbers as they'll |
2847 | * appear as very large values with unsigned longs. | 2847 | * appear as very large values with unsigned longs. |
2848 | */ | 2848 | */ |
2849 | if (max_load <= busiest_load_per_task) | 2849 | if (max_load <= busiest_load_per_task) |
2850 | goto out_balanced; | 2850 | goto out_balanced; |
2851 | 2851 | ||
2852 | /* | 2852 | /* |
2853 | * In the presence of smp nice balancing, certain scenarios can have | 2853 | * In the presence of smp nice balancing, certain scenarios can have |
2854 | * max load less than avg load (as we skip the groups at or below | 2854 | * max load less than avg load (as we skip the groups at or below |
2855 | * their cpu_power while calculating max_load). | 2855 | * their cpu_power while calculating max_load). |
2856 | */ | 2856 | */ |
2857 | if (max_load < avg_load) { | 2857 | if (max_load < avg_load) { |
2858 | *imbalance = 0; | 2858 | *imbalance = 0; |
2859 | goto small_imbalance; | 2859 | goto small_imbalance; |
2860 | } | 2860 | } |
2861 | 2861 | ||
2862 | /* Don't want to pull so many tasks that a group would go idle */ | 2862 | /* Don't want to pull so many tasks that a group would go idle */ |
2863 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); | 2863 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
2864 | 2864 | ||
2865 | /* How much load to actually move to equalise the imbalance */ | 2865 | /* How much load to actually move to equalise the imbalance */ |
2866 | *imbalance = min(max_pull * busiest->__cpu_power, | 2866 | *imbalance = min(max_pull * busiest->__cpu_power, |
2867 | (avg_load - this_load) * this->__cpu_power) | 2867 | (avg_load - this_load) * this->__cpu_power) |
2868 | / SCHED_LOAD_SCALE; | 2868 | / SCHED_LOAD_SCALE; |
2869 | 2869 | ||
2870 | /* | 2870 | /* |
2871 | * if *imbalance is less than the average load per runnable task | 2871 | * if *imbalance is less than the average load per runnable task |
2872 | * there is no guarantee that any tasks will be moved so we'll have | 2872 | * there is no guarantee that any tasks will be moved so we'll have |
2873 | * a think about bumping its value to force at least one task to be | 2873 | * a think about bumping its value to force at least one task to be |
2874 | * moved | 2874 | * moved |
2875 | */ | 2875 | */ |
2876 | if (*imbalance < busiest_load_per_task) { | 2876 | if (*imbalance < busiest_load_per_task) { |
2877 | unsigned long tmp, pwr_now, pwr_move; | 2877 | unsigned long tmp, pwr_now, pwr_move; |
2878 | unsigned int imbn; | 2878 | unsigned int imbn; |
2879 | 2879 | ||
2880 | small_imbalance: | 2880 | small_imbalance: |
2881 | pwr_move = pwr_now = 0; | 2881 | pwr_move = pwr_now = 0; |
2882 | imbn = 2; | 2882 | imbn = 2; |
2883 | if (this_nr_running) { | 2883 | if (this_nr_running) { |
2884 | this_load_per_task /= this_nr_running; | 2884 | this_load_per_task /= this_nr_running; |
2885 | if (busiest_load_per_task > this_load_per_task) | 2885 | if (busiest_load_per_task > this_load_per_task) |
2886 | imbn = 1; | 2886 | imbn = 1; |
2887 | } else | 2887 | } else |
2888 | this_load_per_task = SCHED_LOAD_SCALE; | 2888 | this_load_per_task = SCHED_LOAD_SCALE; |
2889 | 2889 | ||
2890 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= | 2890 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= |
2891 | busiest_load_per_task * imbn) { | 2891 | busiest_load_per_task * imbn) { |
2892 | *imbalance = busiest_load_per_task; | 2892 | *imbalance = busiest_load_per_task; |
2893 | return busiest; | 2893 | return busiest; |
2894 | } | 2894 | } |
2895 | 2895 | ||
2896 | /* | 2896 | /* |
2897 | * OK, we don't have enough imbalance to justify moving tasks, | 2897 | * OK, we don't have enough imbalance to justify moving tasks, |
2898 | * however we may be able to increase total CPU power used by | 2898 | * however we may be able to increase total CPU power used by |
2899 | * moving them. | 2899 | * moving them. |
2900 | */ | 2900 | */ |
2901 | 2901 | ||
2902 | pwr_now += busiest->__cpu_power * | 2902 | pwr_now += busiest->__cpu_power * |
2903 | min(busiest_load_per_task, max_load); | 2903 | min(busiest_load_per_task, max_load); |
2904 | pwr_now += this->__cpu_power * | 2904 | pwr_now += this->__cpu_power * |
2905 | min(this_load_per_task, this_load); | 2905 | min(this_load_per_task, this_load); |
2906 | pwr_now /= SCHED_LOAD_SCALE; | 2906 | pwr_now /= SCHED_LOAD_SCALE; |
2907 | 2907 | ||
2908 | /* Amount of load we'd subtract */ | 2908 | /* Amount of load we'd subtract */ |
2909 | tmp = sg_div_cpu_power(busiest, | 2909 | tmp = sg_div_cpu_power(busiest, |
2910 | busiest_load_per_task * SCHED_LOAD_SCALE); | 2910 | busiest_load_per_task * SCHED_LOAD_SCALE); |
2911 | if (max_load > tmp) | 2911 | if (max_load > tmp) |
2912 | pwr_move += busiest->__cpu_power * | 2912 | pwr_move += busiest->__cpu_power * |
2913 | min(busiest_load_per_task, max_load - tmp); | 2913 | min(busiest_load_per_task, max_load - tmp); |
2914 | 2914 | ||
2915 | /* Amount of load we'd add */ | 2915 | /* Amount of load we'd add */ |
2916 | if (max_load * busiest->__cpu_power < | 2916 | if (max_load * busiest->__cpu_power < |
2917 | busiest_load_per_task * SCHED_LOAD_SCALE) | 2917 | busiest_load_per_task * SCHED_LOAD_SCALE) |
2918 | tmp = sg_div_cpu_power(this, | 2918 | tmp = sg_div_cpu_power(this, |
2919 | max_load * busiest->__cpu_power); | 2919 | max_load * busiest->__cpu_power); |
2920 | else | 2920 | else |
2921 | tmp = sg_div_cpu_power(this, | 2921 | tmp = sg_div_cpu_power(this, |
2922 | busiest_load_per_task * SCHED_LOAD_SCALE); | 2922 | busiest_load_per_task * SCHED_LOAD_SCALE); |
2923 | pwr_move += this->__cpu_power * | 2923 | pwr_move += this->__cpu_power * |
2924 | min(this_load_per_task, this_load + tmp); | 2924 | min(this_load_per_task, this_load + tmp); |
2925 | pwr_move /= SCHED_LOAD_SCALE; | 2925 | pwr_move /= SCHED_LOAD_SCALE; |
2926 | 2926 | ||
2927 | /* Move if we gain throughput */ | 2927 | /* Move if we gain throughput */ |
2928 | if (pwr_move > pwr_now) | 2928 | if (pwr_move > pwr_now) |
2929 | *imbalance = busiest_load_per_task; | 2929 | *imbalance = busiest_load_per_task; |
2930 | } | 2930 | } |
2931 | 2931 | ||
2932 | return busiest; | 2932 | return busiest; |
2933 | 2933 | ||
2934 | out_balanced: | 2934 | out_balanced: |
2935 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2935 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
2936 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | 2936 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) |
2937 | goto ret; | 2937 | goto ret; |
2938 | 2938 | ||
2939 | if (this == group_leader && group_leader != group_min) { | 2939 | if (this == group_leader && group_leader != group_min) { |
2940 | *imbalance = min_load_per_task; | 2940 | *imbalance = min_load_per_task; |
2941 | return group_min; | 2941 | return group_min; |
2942 | } | 2942 | } |
2943 | #endif | 2943 | #endif |
2944 | ret: | 2944 | ret: |
2945 | *imbalance = 0; | 2945 | *imbalance = 0; |
2946 | return NULL; | 2946 | return NULL; |
2947 | } | 2947 | } |
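The max_pull / *imbalance computation in the middle of find_busiest_group() is the part worth working through by hand. A small illustrative calculation follows; the load figures stand for the power-normalised values the function has already computed, and the concrete numbers are made up for the example.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

int main(void)
{
    /* Example inputs -- already normalised by group power, as in the kernel. */
    unsigned long max_load = 2048;                 /* busiest group's avg load */
    unsigned long avg_load = 1536;                 /* domain-wide average */
    unsigned long this_load = 1024;                /* local group's avg load */
    unsigned long busiest_load_per_task = 1024;
    unsigned long busiest_power = 1024, this_power = 1024;  /* __cpu_power */

    /* Don't pull so much that the busiest group drops below the average
     * or goes idle. */
    unsigned long max_pull = min_ul(max_load - avg_load,
                                    max_load - busiest_load_per_task);

    unsigned long imbalance = min_ul(max_pull * busiest_power,
                                     (avg_load - this_load) * this_power)
                                    / SCHED_LOAD_SCALE;

    printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
    /* Here: max_pull = min(512, 1024) = 512;
     * imbalance = min(524288, 524288) / 1024 = 512,
     * i.e. half a nice-0 task's weight is moved. */
    return 0;
}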
2948 | 2948 | ||
2949 | /* | 2949 | /* |
2950 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2950 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2951 | */ | 2951 | */ |
2952 | static struct rq * | 2952 | static struct rq * |
2953 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | 2953 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, |
2954 | unsigned long imbalance, cpumask_t *cpus) | 2954 | unsigned long imbalance, cpumask_t *cpus) |
2955 | { | 2955 | { |
2956 | struct rq *busiest = NULL, *rq; | 2956 | struct rq *busiest = NULL, *rq; |
2957 | unsigned long max_load = 0; | 2957 | unsigned long max_load = 0; |
2958 | int i; | 2958 | int i; |
2959 | 2959 | ||
2960 | for_each_cpu_mask(i, group->cpumask) { | 2960 | for_each_cpu_mask(i, group->cpumask) { |
2961 | unsigned long wl; | 2961 | unsigned long wl; |
2962 | 2962 | ||
2963 | if (!cpu_isset(i, *cpus)) | 2963 | if (!cpu_isset(i, *cpus)) |
2964 | continue; | 2964 | continue; |
2965 | 2965 | ||
2966 | rq = cpu_rq(i); | 2966 | rq = cpu_rq(i); |
2967 | wl = weighted_cpuload(i); | 2967 | wl = weighted_cpuload(i); |
2968 | 2968 | ||
2969 | if (rq->nr_running == 1 && wl > imbalance) | 2969 | if (rq->nr_running == 1 && wl > imbalance) |
2970 | continue; | 2970 | continue; |
2971 | 2971 | ||
2972 | if (wl > max_load) { | 2972 | if (wl > max_load) { |
2973 | max_load = wl; | 2973 | max_load = wl; |
2974 | busiest = rq; | 2974 | busiest = rq; |
2975 | } | 2975 | } |
2976 | } | 2976 | } |
2977 | 2977 | ||
2978 | return busiest; | 2978 | return busiest; |
2979 | } | 2979 | } |
2980 | 2980 | ||
2981 | /* | 2981 | /* |
2982 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but | 2982 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but |
2983 | * so long as it is large enough. | 2983 | * so long as it is large enough. |
2984 | */ | 2984 | */ |
2985 | #define MAX_PINNED_INTERVAL 512 | 2985 | #define MAX_PINNED_INTERVAL 512 |
2986 | 2986 | ||
2987 | /* | 2987 | /* |
2988 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2988 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2989 | * tasks if there is an imbalance. | 2989 | * tasks if there is an imbalance. |
2990 | */ | 2990 | */ |
2991 | static int load_balance(int this_cpu, struct rq *this_rq, | 2991 | static int load_balance(int this_cpu, struct rq *this_rq, |
2992 | struct sched_domain *sd, enum cpu_idle_type idle, | 2992 | struct sched_domain *sd, enum cpu_idle_type idle, |
2993 | int *balance) | 2993 | int *balance) |
2994 | { | 2994 | { |
2995 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 2995 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
2996 | struct sched_group *group; | 2996 | struct sched_group *group; |
2997 | unsigned long imbalance; | 2997 | unsigned long imbalance; |
2998 | struct rq *busiest; | 2998 | struct rq *busiest; |
2999 | cpumask_t cpus = CPU_MASK_ALL; | 2999 | cpumask_t cpus = CPU_MASK_ALL; |
3000 | unsigned long flags; | 3000 | unsigned long flags; |
3001 | 3001 | ||
3002 | /* | 3002 | /* |
3003 | * When power savings policy is enabled for the parent domain, idle | 3003 | * When power savings policy is enabled for the parent domain, idle |
3004 | * sibling can pick up load irrespective of busy siblings. In this case, | 3004 | * sibling can pick up load irrespective of busy siblings. In this case, |
3005 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | 3005 | * let the state of idle sibling percolate up as CPU_IDLE, instead of |
3006 | * portraying it as CPU_NOT_IDLE. | 3006 | * portraying it as CPU_NOT_IDLE. |
3007 | */ | 3007 | */ |
3008 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | 3008 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
3009 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3009 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3010 | sd_idle = 1; | 3010 | sd_idle = 1; |
3011 | 3011 | ||
3012 | schedstat_inc(sd, lb_count[idle]); | 3012 | schedstat_inc(sd, lb_count[idle]); |
3013 | 3013 | ||
3014 | redo: | 3014 | redo: |
3015 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3015 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
3016 | &cpus, balance); | 3016 | &cpus, balance); |
3017 | 3017 | ||
3018 | if (*balance == 0) | 3018 | if (*balance == 0) |
3019 | goto out_balanced; | 3019 | goto out_balanced; |
3020 | 3020 | ||
3021 | if (!group) { | 3021 | if (!group) { |
3022 | schedstat_inc(sd, lb_nobusyg[idle]); | 3022 | schedstat_inc(sd, lb_nobusyg[idle]); |
3023 | goto out_balanced; | 3023 | goto out_balanced; |
3024 | } | 3024 | } |
3025 | 3025 | ||
3026 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); | 3026 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); |
3027 | if (!busiest) { | 3027 | if (!busiest) { |
3028 | schedstat_inc(sd, lb_nobusyq[idle]); | 3028 | schedstat_inc(sd, lb_nobusyq[idle]); |
3029 | goto out_balanced; | 3029 | goto out_balanced; |
3030 | } | 3030 | } |
3031 | 3031 | ||
3032 | BUG_ON(busiest == this_rq); | 3032 | BUG_ON(busiest == this_rq); |
3033 | 3033 | ||
3034 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 3034 | schedstat_add(sd, lb_imbalance[idle], imbalance); |
3035 | 3035 | ||
3036 | ld_moved = 0; | 3036 | ld_moved = 0; |
3037 | if (busiest->nr_running > 1) { | 3037 | if (busiest->nr_running > 1) { |
3038 | /* | 3038 | /* |
3039 | * Attempt to move tasks. If find_busiest_group has found | 3039 | * Attempt to move tasks. If find_busiest_group has found |
3040 | * an imbalance but busiest->nr_running <= 1, the group is | 3040 | * an imbalance but busiest->nr_running <= 1, the group is |
3041 | * still unbalanced. ld_moved simply stays zero, so it is | 3041 | * still unbalanced. ld_moved simply stays zero, so it is |
3042 | * correctly treated as an imbalance. | 3042 | * correctly treated as an imbalance. |
3043 | */ | 3043 | */ |
3044 | local_irq_save(flags); | 3044 | local_irq_save(flags); |
3045 | double_rq_lock(this_rq, busiest); | 3045 | double_rq_lock(this_rq, busiest); |
3046 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 3046 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
3047 | imbalance, sd, idle, &all_pinned); | 3047 | imbalance, sd, idle, &all_pinned); |
3048 | double_rq_unlock(this_rq, busiest); | 3048 | double_rq_unlock(this_rq, busiest); |
3049 | local_irq_restore(flags); | 3049 | local_irq_restore(flags); |
3050 | 3050 | ||
3051 | /* | 3051 | /* |
3052 | * some other cpu did the load balance for us. | 3052 | * some other cpu did the load balance for us. |
3053 | */ | 3053 | */ |
3054 | if (ld_moved && this_cpu != smp_processor_id()) | 3054 | if (ld_moved && this_cpu != smp_processor_id()) |
3055 | resched_cpu(this_cpu); | 3055 | resched_cpu(this_cpu); |
3056 | 3056 | ||
3057 | /* All tasks on this runqueue were pinned by CPU affinity */ | 3057 | /* All tasks on this runqueue were pinned by CPU affinity */ |
3058 | if (unlikely(all_pinned)) { | 3058 | if (unlikely(all_pinned)) { |
3059 | cpu_clear(cpu_of(busiest), cpus); | 3059 | cpu_clear(cpu_of(busiest), cpus); |
3060 | if (!cpus_empty(cpus)) | 3060 | if (!cpus_empty(cpus)) |
3061 | goto redo; | 3061 | goto redo; |
3062 | goto out_balanced; | 3062 | goto out_balanced; |
3063 | } | 3063 | } |
3064 | } | 3064 | } |
3065 | 3065 | ||
3066 | if (!ld_moved) { | 3066 | if (!ld_moved) { |
3067 | schedstat_inc(sd, lb_failed[idle]); | 3067 | schedstat_inc(sd, lb_failed[idle]); |
3068 | sd->nr_balance_failed++; | 3068 | sd->nr_balance_failed++; |
3069 | 3069 | ||
3070 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 3070 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
3071 | 3071 | ||
3072 | spin_lock_irqsave(&busiest->lock, flags); | 3072 | spin_lock_irqsave(&busiest->lock, flags); |
3073 | 3073 | ||
3074 | /* don't kick the migration_thread if the curr | 3074 | /* don't kick the migration_thread if the curr |
3075 | * task on busiest cpu can't be moved to this_cpu | 3075 | * task on busiest cpu can't be moved to this_cpu |
3076 | */ | 3076 | */ |
3077 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { | 3077 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { |
3078 | spin_unlock_irqrestore(&busiest->lock, flags); | 3078 | spin_unlock_irqrestore(&busiest->lock, flags); |
3079 | all_pinned = 1; | 3079 | all_pinned = 1; |
3080 | goto out_one_pinned; | 3080 | goto out_one_pinned; |
3081 | } | 3081 | } |
3082 | 3082 | ||
3083 | if (!busiest->active_balance) { | 3083 | if (!busiest->active_balance) { |
3084 | busiest->active_balance = 1; | 3084 | busiest->active_balance = 1; |
3085 | busiest->push_cpu = this_cpu; | 3085 | busiest->push_cpu = this_cpu; |
3086 | active_balance = 1; | 3086 | active_balance = 1; |
3087 | } | 3087 | } |
3088 | spin_unlock_irqrestore(&busiest->lock, flags); | 3088 | spin_unlock_irqrestore(&busiest->lock, flags); |
3089 | if (active_balance) | 3089 | if (active_balance) |
3090 | wake_up_process(busiest->migration_thread); | 3090 | wake_up_process(busiest->migration_thread); |
3091 | 3091 | ||
3092 | /* | 3092 | /* |
3093 | * We've kicked active balancing, reset the failure | 3093 | * We've kicked active balancing, reset the failure |
3094 | * counter. | 3094 | * counter. |
3095 | */ | 3095 | */ |
3096 | sd->nr_balance_failed = sd->cache_nice_tries+1; | 3096 | sd->nr_balance_failed = sd->cache_nice_tries+1; |
3097 | } | 3097 | } |
3098 | } else | 3098 | } else |
3099 | sd->nr_balance_failed = 0; | 3099 | sd->nr_balance_failed = 0; |
3100 | 3100 | ||
3101 | if (likely(!active_balance)) { | 3101 | if (likely(!active_balance)) { |
3102 | /* We were unbalanced, so reset the balancing interval */ | 3102 | /* We were unbalanced, so reset the balancing interval */ |
3103 | sd->balance_interval = sd->min_interval; | 3103 | sd->balance_interval = sd->min_interval; |
3104 | } else { | 3104 | } else { |
3105 | /* | 3105 | /* |
3106 | * If we've begun active balancing, start to back off. This | 3106 | * If we've begun active balancing, start to back off. This |
3107 | * case may not be covered by the all_pinned logic if there | 3107 | * case may not be covered by the all_pinned logic if there |
3108 | * is only 1 task on the busy runqueue (because we don't call | 3108 | * is only 1 task on the busy runqueue (because we don't call |
3109 | * move_tasks). | 3109 | * move_tasks). |
3110 | */ | 3110 | */ |
3111 | if (sd->balance_interval < sd->max_interval) | 3111 | if (sd->balance_interval < sd->max_interval) |
3112 | sd->balance_interval *= 2; | 3112 | sd->balance_interval *= 2; |
3113 | } | 3113 | } |
3114 | 3114 | ||
3115 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3115 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3116 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3116 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3117 | return -1; | 3117 | return -1; |
3118 | return ld_moved; | 3118 | return ld_moved; |
3119 | 3119 | ||
3120 | out_balanced: | 3120 | out_balanced: |
3121 | schedstat_inc(sd, lb_balanced[idle]); | 3121 | schedstat_inc(sd, lb_balanced[idle]); |
3122 | 3122 | ||
3123 | sd->nr_balance_failed = 0; | 3123 | sd->nr_balance_failed = 0; |
3124 | 3124 | ||
3125 | out_one_pinned: | 3125 | out_one_pinned: |
3126 | /* tune up the balancing interval */ | 3126 | /* tune up the balancing interval */ |
3127 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | 3127 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || |
3128 | (sd->balance_interval < sd->max_interval)) | 3128 | (sd->balance_interval < sd->max_interval)) |
3129 | sd->balance_interval *= 2; | 3129 | sd->balance_interval *= 2; |
3130 | 3130 | ||
3131 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3131 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3132 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3132 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3133 | return -1; | 3133 | return -1; |
3134 | return 0; | 3134 | return 0; |
3135 | } | 3135 | } |
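The balance_interval handling at the end of load_balance() is a plain exponential backoff with two caps: pinned-task failures may back off past the domain's max_interval, but never past MAX_PINNED_INTERVAL. A tiny sketch of that policy; the interval values in milliseconds are assumptions for the example.

#include <stdio.h>

#define MAX_PINNED_INTERVAL 512    /* cap used when every candidate task was pinned */

int main(void)
{
    unsigned long balance_interval = 8;    /* assumed sd->min_interval (ms) */
    unsigned long max_interval = 128;      /* assumed sd->max_interval (ms) */
    int all_pinned = 1;                    /* every candidate task was pinned */
    int i;

    for (i = 0; i < 8; i++) {
        /* Same condition as the out_one_pinned path above. */
        if ((all_pinned && balance_interval < MAX_PINNED_INTERVAL) ||
            (balance_interval < max_interval))
            balance_interval *= 2;
        printf("retry %d: interval=%lu ms\n", i, balance_interval);
    }
    /* 8 -> 16 -> ... -> 512, after which the interval stops growing. */
    return 0;
}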
3136 | 3136 | ||
3137 | /* | 3137 | /* |
3138 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 3138 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
3139 | * tasks if there is an imbalance. | 3139 | * tasks if there is an imbalance. |
3140 | * | 3140 | * |
3141 | * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). | 3141 | * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). |
3142 | * this_rq is locked. | 3142 | * this_rq is locked. |
3143 | */ | 3143 | */ |
3144 | static int | 3144 | static int |
3145 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | 3145 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) |
3146 | { | 3146 | { |
3147 | struct sched_group *group; | 3147 | struct sched_group *group; |
3148 | struct rq *busiest = NULL; | 3148 | struct rq *busiest = NULL; |
3149 | unsigned long imbalance; | 3149 | unsigned long imbalance; |
3150 | int ld_moved = 0; | 3150 | int ld_moved = 0; |
3151 | int sd_idle = 0; | 3151 | int sd_idle = 0; |
3152 | int all_pinned = 0; | 3152 | int all_pinned = 0; |
3153 | cpumask_t cpus = CPU_MASK_ALL; | 3153 | cpumask_t cpus = CPU_MASK_ALL; |
3154 | 3154 | ||
3155 | /* | 3155 | /* |
3156 | * When power savings policy is enabled for the parent domain, idle | 3156 | * When power savings policy is enabled for the parent domain, idle |
3157 | * sibling can pick up load irrespective of busy siblings. In this case, | 3157 | * sibling can pick up load irrespective of busy siblings. In this case, |
3158 | * let the state of idle sibling percolate up as IDLE, instead of | 3158 | * let the state of idle sibling percolate up as IDLE, instead of |
3159 | * portraying it as CPU_NOT_IDLE. | 3159 | * portraying it as CPU_NOT_IDLE. |
3160 | */ | 3160 | */ |
3161 | if (sd->flags & SD_SHARE_CPUPOWER && | 3161 | if (sd->flags & SD_SHARE_CPUPOWER && |
3162 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3162 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3163 | sd_idle = 1; | 3163 | sd_idle = 1; |
3164 | 3164 | ||
3165 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); | 3165 | schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); |
3166 | redo: | 3166 | redo: |
3167 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 3167 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
3168 | &sd_idle, &cpus, NULL); | 3168 | &sd_idle, &cpus, NULL); |
3169 | if (!group) { | 3169 | if (!group) { |
3170 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); | 3170 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); |
3171 | goto out_balanced; | 3171 | goto out_balanced; |
3172 | } | 3172 | } |
3173 | 3173 | ||
3174 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, | 3174 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, |
3175 | &cpus); | 3175 | &cpus); |
3176 | if (!busiest) { | 3176 | if (!busiest) { |
3177 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); | 3177 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); |
3178 | goto out_balanced; | 3178 | goto out_balanced; |
3179 | } | 3179 | } |
3180 | 3180 | ||
3181 | BUG_ON(busiest == this_rq); | 3181 | BUG_ON(busiest == this_rq); |
3182 | 3182 | ||
3183 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); | 3183 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); |
3184 | 3184 | ||
3185 | ld_moved = 0; | 3185 | ld_moved = 0; |
3186 | if (busiest->nr_running > 1) { | 3186 | if (busiest->nr_running > 1) { |
3187 | /* Attempt to move tasks */ | 3187 | /* Attempt to move tasks */ |
3188 | double_lock_balance(this_rq, busiest); | 3188 | double_lock_balance(this_rq, busiest); |
3189 | /* this_rq->clock is already updated */ | 3189 | /* this_rq->clock is already updated */ |
3190 | update_rq_clock(busiest); | 3190 | update_rq_clock(busiest); |
3191 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 3191 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
3192 | imbalance, sd, CPU_NEWLY_IDLE, | 3192 | imbalance, sd, CPU_NEWLY_IDLE, |
3193 | &all_pinned); | 3193 | &all_pinned); |
3194 | spin_unlock(&busiest->lock); | 3194 | spin_unlock(&busiest->lock); |
3195 | 3195 | ||
3196 | if (unlikely(all_pinned)) { | 3196 | if (unlikely(all_pinned)) { |
3197 | cpu_clear(cpu_of(busiest), cpus); | 3197 | cpu_clear(cpu_of(busiest), cpus); |
3198 | if (!cpus_empty(cpus)) | 3198 | if (!cpus_empty(cpus)) |
3199 | goto redo; | 3199 | goto redo; |
3200 | } | 3200 | } |
3201 | } | 3201 | } |
3202 | 3202 | ||
3203 | if (!ld_moved) { | 3203 | if (!ld_moved) { |
3204 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); | 3204 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); |
3205 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3205 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3206 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3206 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3207 | return -1; | 3207 | return -1; |
3208 | } else | 3208 | } else |
3209 | sd->nr_balance_failed = 0; | 3209 | sd->nr_balance_failed = 0; |
3210 | 3210 | ||
3211 | return ld_moved; | 3211 | return ld_moved; |
3212 | 3212 | ||
3213 | out_balanced: | 3213 | out_balanced: |
3214 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); | 3214 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); |
3215 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3215 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
3216 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 3216 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
3217 | return -1; | 3217 | return -1; |
3218 | sd->nr_balance_failed = 0; | 3218 | sd->nr_balance_failed = 0; |
3219 | 3219 | ||
3220 | return 0; | 3220 | return 0; |
3221 | } | 3221 | } |
3222 | 3222 | ||
3223 | /* | 3223 | /* |
3224 | * idle_balance is called by schedule() if this_cpu is about to become | 3224 | * idle_balance is called by schedule() if this_cpu is about to become |
3225 | * idle. Attempts to pull tasks from other CPUs. | 3225 | * idle. Attempts to pull tasks from other CPUs. |
3226 | */ | 3226 | */ |
3227 | static void idle_balance(int this_cpu, struct rq *this_rq) | 3227 | static void idle_balance(int this_cpu, struct rq *this_rq) |
3228 | { | 3228 | { |
3229 | struct sched_domain *sd; | 3229 | struct sched_domain *sd; |
3230 | int pulled_task = -1; | 3230 | int pulled_task = -1; |
3231 | unsigned long next_balance = jiffies + HZ; | 3231 | unsigned long next_balance = jiffies + HZ; |
3232 | 3232 | ||
3233 | for_each_domain(this_cpu, sd) { | 3233 | for_each_domain(this_cpu, sd) { |
3234 | unsigned long interval; | 3234 | unsigned long interval; |
3235 | 3235 | ||
3236 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3236 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3237 | continue; | 3237 | continue; |
3238 | 3238 | ||
3239 | if (sd->flags & SD_BALANCE_NEWIDLE) | 3239 | if (sd->flags & SD_BALANCE_NEWIDLE) |
3240 | /* If we've pulled tasks over stop searching: */ | 3240 | /* If we've pulled tasks over stop searching: */ |
3241 | pulled_task = load_balance_newidle(this_cpu, | 3241 | pulled_task = load_balance_newidle(this_cpu, |
3242 | this_rq, sd); | 3242 | this_rq, sd); |
3243 | 3243 | ||
3244 | interval = msecs_to_jiffies(sd->balance_interval); | 3244 | interval = msecs_to_jiffies(sd->balance_interval); |
3245 | if (time_after(next_balance, sd->last_balance + interval)) | 3245 | if (time_after(next_balance, sd->last_balance + interval)) |
3246 | next_balance = sd->last_balance + interval; | 3246 | next_balance = sd->last_balance + interval; |
3247 | if (pulled_task) | 3247 | if (pulled_task) |
3248 | break; | 3248 | break; |
3249 | } | 3249 | } |
3250 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | 3250 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { |
3251 | /* | 3251 | /* |
3252 | * We are going idle. next_balance may be set based on | 3252 | * We are going idle. next_balance may be set based on |
3253 | * a busy processor. So reset next_balance. | 3253 | * a busy processor. So reset next_balance. |
3254 | */ | 3254 | */ |
3255 | this_rq->next_balance = next_balance; | 3255 | this_rq->next_balance = next_balance; |
3256 | } | 3256 | } |
3257 | } | 3257 | } |
3258 | 3258 | ||
3259 | /* | 3259 | /* |
3260 | * active_load_balance is run by migration threads. It pushes running tasks | 3260 | * active_load_balance is run by migration threads. It pushes running tasks |
3261 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | 3261 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be |
3262 | * running on each physical CPU where possible, and avoids physical / | 3262 | * running on each physical CPU where possible, and avoids physical / |
3263 | * logical imbalances. | 3263 | * logical imbalances. |
3264 | * | 3264 | * |
3265 | * Called with busiest_rq locked. | 3265 | * Called with busiest_rq locked. |
3266 | */ | 3266 | */ |
3267 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | 3267 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) |
3268 | { | 3268 | { |
3269 | int target_cpu = busiest_rq->push_cpu; | 3269 | int target_cpu = busiest_rq->push_cpu; |
3270 | struct sched_domain *sd; | 3270 | struct sched_domain *sd; |
3271 | struct rq *target_rq; | 3271 | struct rq *target_rq; |
3272 | 3272 | ||
3273 | /* Is there any task to move? */ | 3273 | /* Is there any task to move? */ |
3274 | if (busiest_rq->nr_running <= 1) | 3274 | if (busiest_rq->nr_running <= 1) |
3275 | return; | 3275 | return; |
3276 | 3276 | ||
3277 | target_rq = cpu_rq(target_cpu); | 3277 | target_rq = cpu_rq(target_cpu); |
3278 | 3278 | ||
3279 | /* | 3279 | /* |
3280 | * This condition is "impossible", if it occurs | 3280 | * This condition is "impossible", if it occurs |
3281 | * we need to fix it. Originally reported by | 3281 | * we need to fix it. Originally reported by |
3282 | * Bjorn Helgaas on a 128-cpu setup. | 3282 | * Bjorn Helgaas on a 128-cpu setup. |
3283 | */ | 3283 | */ |
3284 | BUG_ON(busiest_rq == target_rq); | 3284 | BUG_ON(busiest_rq == target_rq); |
3285 | 3285 | ||
3286 | /* move a task from busiest_rq to target_rq */ | 3286 | /* move a task from busiest_rq to target_rq */ |
3287 | double_lock_balance(busiest_rq, target_rq); | 3287 | double_lock_balance(busiest_rq, target_rq); |
3288 | update_rq_clock(busiest_rq); | 3288 | update_rq_clock(busiest_rq); |
3289 | update_rq_clock(target_rq); | 3289 | update_rq_clock(target_rq); |
3290 | 3290 | ||
3291 | /* Search for an sd spanning us and the target CPU. */ | 3291 | /* Search for an sd spanning us and the target CPU. */ |
3292 | for_each_domain(target_cpu, sd) { | 3292 | for_each_domain(target_cpu, sd) { |
3293 | if ((sd->flags & SD_LOAD_BALANCE) && | 3293 | if ((sd->flags & SD_LOAD_BALANCE) && |
3294 | cpu_isset(busiest_cpu, sd->span)) | 3294 | cpu_isset(busiest_cpu, sd->span)) |
3295 | break; | 3295 | break; |
3296 | } | 3296 | } |
3297 | 3297 | ||
3298 | if (likely(sd)) { | 3298 | if (likely(sd)) { |
3299 | schedstat_inc(sd, alb_count); | 3299 | schedstat_inc(sd, alb_count); |
3300 | 3300 | ||
3301 | if (move_one_task(target_rq, target_cpu, busiest_rq, | 3301 | if (move_one_task(target_rq, target_cpu, busiest_rq, |
3302 | sd, CPU_IDLE)) | 3302 | sd, CPU_IDLE)) |
3303 | schedstat_inc(sd, alb_pushed); | 3303 | schedstat_inc(sd, alb_pushed); |
3304 | else | 3304 | else |
3305 | schedstat_inc(sd, alb_failed); | 3305 | schedstat_inc(sd, alb_failed); |
3306 | } | 3306 | } |
3307 | spin_unlock(&target_rq->lock); | 3307 | spin_unlock(&target_rq->lock); |
3308 | } | 3308 | } |
3309 | 3309 | ||
3310 | #ifdef CONFIG_NO_HZ | 3310 | #ifdef CONFIG_NO_HZ |
3311 | static struct { | 3311 | static struct { |
3312 | atomic_t load_balancer; | 3312 | atomic_t load_balancer; |
3313 | cpumask_t cpu_mask; | 3313 | cpumask_t cpu_mask; |
3314 | } nohz ____cacheline_aligned = { | 3314 | } nohz ____cacheline_aligned = { |
3315 | .load_balancer = ATOMIC_INIT(-1), | 3315 | .load_balancer = ATOMIC_INIT(-1), |
3316 | .cpu_mask = CPU_MASK_NONE, | 3316 | .cpu_mask = CPU_MASK_NONE, |
3317 | }; | 3317 | }; |
3318 | 3318 | ||
3319 | /* | 3319 | /* |
3320 | * This routine will try to nominate the ilb (idle load balancing) | 3320 | * This routine will try to nominate the ilb (idle load balancing) |
3321 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | 3321 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle |
3322 | * load balancing on behalf of all those cpus. If all the cpus in the system | 3322 | * load balancing on behalf of all those cpus. If all the cpus in the system |
3323 | * go into this tickless mode, then there will be no ilb owner (as there is | 3323 | * go into this tickless mode, then there will be no ilb owner (as there is |
3324 | * no need for one) and all the cpus will sleep till the next wakeup event | 3324 | * no need for one) and all the cpus will sleep till the next wakeup event |
3325 | * arrives... | 3325 | * arrives... |
3326 | * | 3326 | * |
3327 | * For the ilb owner, the tick is not stopped, and this tick will be used | 3327 | * For the ilb owner, the tick is not stopped, and this tick will be used |
3328 | * for idle load balancing. The ilb owner will still be part of | 3328 | * for idle load balancing. The ilb owner will still be part of |
3329 | * nohz.cpu_mask. | 3329 | * nohz.cpu_mask. |
3330 | * | 3330 | * |
3331 | * While stopping the tick, this cpu will become the ilb owner if there | 3331 | * While stopping the tick, this cpu will become the ilb owner if there |
3332 | * is no other owner, and will remain the owner until it becomes busy | 3332 | * is no other owner, and will remain the owner until it becomes busy |
3333 | * or until all cpus in the system stop their ticks, at which point | 3333 | * or until all cpus in the system stop their ticks, at which point |
3334 | * there is no need for an ilb owner. | 3334 | * there is no need for an ilb owner. |
3335 | * | 3335 | * |
3336 | * When the ilb owner becomes busy, it nominates another owner, during the | 3336 | * When the ilb owner becomes busy, it nominates another owner, during the |
3337 | * next busy scheduler_tick() | 3337 | * next busy scheduler_tick() |
3338 | */ | 3338 | */ |
3339 | int select_nohz_load_balancer(int stop_tick) | 3339 | int select_nohz_load_balancer(int stop_tick) |
3340 | { | 3340 | { |
3341 | int cpu = smp_processor_id(); | 3341 | int cpu = smp_processor_id(); |
3342 | 3342 | ||
3343 | if (stop_tick) { | 3343 | if (stop_tick) { |
3344 | cpu_set(cpu, nohz.cpu_mask); | 3344 | cpu_set(cpu, nohz.cpu_mask); |
3345 | cpu_rq(cpu)->in_nohz_recently = 1; | 3345 | cpu_rq(cpu)->in_nohz_recently = 1; |
3346 | 3346 | ||
3347 | /* | 3347 | /* |
3348 | * If we are going offline and still the leader, give up! | 3348 | * If we are going offline and still the leader, give up! |
3349 | */ | 3349 | */ |
3350 | if (cpu_is_offline(cpu) && | 3350 | if (cpu_is_offline(cpu) && |
3351 | atomic_read(&nohz.load_balancer) == cpu) { | 3351 | atomic_read(&nohz.load_balancer) == cpu) { |
3352 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3352 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) |
3353 | BUG(); | 3353 | BUG(); |
3354 | return 0; | 3354 | return 0; |
3355 | } | 3355 | } |
3356 | 3356 | ||
3357 | /* time for ilb owner also to sleep */ | 3357 | /* time for ilb owner also to sleep */ |
3358 | if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { | 3358 | if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { |
3359 | if (atomic_read(&nohz.load_balancer) == cpu) | 3359 | if (atomic_read(&nohz.load_balancer) == cpu) |
3360 | atomic_set(&nohz.load_balancer, -1); | 3360 | atomic_set(&nohz.load_balancer, -1); |
3361 | return 0; | 3361 | return 0; |
3362 | } | 3362 | } |
3363 | 3363 | ||
3364 | if (atomic_read(&nohz.load_balancer) == -1) { | 3364 | if (atomic_read(&nohz.load_balancer) == -1) { |
3365 | /* make me the ilb owner */ | 3365 | /* make me the ilb owner */ |
3366 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | 3366 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) |
3367 | return 1; | 3367 | return 1; |
3368 | } else if (atomic_read(&nohz.load_balancer) == cpu) | 3368 | } else if (atomic_read(&nohz.load_balancer) == cpu) |
3369 | return 1; | 3369 | return 1; |
3370 | } else { | 3370 | } else { |
3371 | if (!cpu_isset(cpu, nohz.cpu_mask)) | 3371 | if (!cpu_isset(cpu, nohz.cpu_mask)) |
3372 | return 0; | 3372 | return 0; |
3373 | 3373 | ||
3374 | cpu_clear(cpu, nohz.cpu_mask); | 3374 | cpu_clear(cpu, nohz.cpu_mask); |
3375 | 3375 | ||
3376 | if (atomic_read(&nohz.load_balancer) == cpu) | 3376 | if (atomic_read(&nohz.load_balancer) == cpu) |
3377 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3377 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) |
3378 | BUG(); | 3378 | BUG(); |
3379 | } | 3379 | } |
3380 | return 0; | 3380 | return 0; |
3381 | } | 3381 | } |
3382 | #endif | 3382 | #endif |
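The ilb-owner nomination in select_nohz_load_balancer() is, at its core, an atomic compare-and-swap on a single slot where -1 means "no owner". A minimal user-space sketch of the same claim/release protocol using C11 atomics; all names are invented for the example, and the kernel version additionally handles the offline-CPU and all-CPUs-idle cases shown above.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);    /* -1: no ilb owner */

/* Try to become the idle-load-balance owner, as a tick-stopping CPU would. */
static int claim_ilb(int cpu)
{
    int expected = -1;
    if (atomic_compare_exchange_strong(&load_balancer, &expected, cpu))
        return 1;                               /* we are now the owner */
    return atomic_load(&load_balancer) == cpu;  /* we already were the owner */
}

/* Give up ownership when this CPU becomes busy again. */
static void release_ilb(int cpu)
{
    int expected = cpu;
    atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
    printf("cpu1 claims: %d\n", claim_ilb(1));    /* 1: slot was free */
    printf("cpu2 claims: %d\n", claim_ilb(2));    /* 0: cpu1 owns it */
    release_ilb(1);
    printf("cpu2 claims: %d\n", claim_ilb(2));    /* 1: slot free again */
    return 0;
}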
3383 | 3383 | ||
3384 | static DEFINE_SPINLOCK(balancing); | 3384 | static DEFINE_SPINLOCK(balancing); |
3385 | 3385 | ||
3386 | /* | 3386 | /* |
3387 | * It checks each scheduling domain to see if it is due to be balanced, | 3387 | * It checks each scheduling domain to see if it is due to be balanced, |
3388 | * and initiates a balancing operation if so. | 3388 | * and initiates a balancing operation if so. |
3389 | * | 3389 | * |
3390 | * Balancing parameters are set up in arch_init_sched_domains. | 3390 | * Balancing parameters are set up in arch_init_sched_domains. |
3391 | */ | 3391 | */ |
3392 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 3392 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
3393 | { | 3393 | { |
3394 | int balance = 1; | 3394 | int balance = 1; |
3395 | struct rq *rq = cpu_rq(cpu); | 3395 | struct rq *rq = cpu_rq(cpu); |
3396 | unsigned long interval; | 3396 | unsigned long interval; |
3397 | struct sched_domain *sd; | 3397 | struct sched_domain *sd; |
3398 | /* Earliest time when we have to do rebalance again */ | 3398 | /* Earliest time when we have to do rebalance again */ |
3399 | unsigned long next_balance = jiffies + 60*HZ; | 3399 | unsigned long next_balance = jiffies + 60*HZ; |
3400 | int update_next_balance = 0; | 3400 | int update_next_balance = 0; |
3401 | 3401 | ||
3402 | for_each_domain(cpu, sd) { | 3402 | for_each_domain(cpu, sd) { |
3403 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3403 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3404 | continue; | 3404 | continue; |
3405 | 3405 | ||
3406 | interval = sd->balance_interval; | 3406 | interval = sd->balance_interval; |
3407 | if (idle != CPU_IDLE) | 3407 | if (idle != CPU_IDLE) |
3408 | interval *= sd->busy_factor; | 3408 | interval *= sd->busy_factor; |
3409 | 3409 | ||
3410 | /* scale ms to jiffies */ | 3410 | /* scale ms to jiffies */ |
3411 | interval = msecs_to_jiffies(interval); | 3411 | interval = msecs_to_jiffies(interval); |
3412 | if (unlikely(!interval)) | 3412 | if (unlikely(!interval)) |
3413 | interval = 1; | 3413 | interval = 1; |
3414 | if (interval > HZ*NR_CPUS/10) | 3414 | if (interval > HZ*NR_CPUS/10) |
3415 | interval = HZ*NR_CPUS/10; | 3415 | interval = HZ*NR_CPUS/10; |
3416 | 3416 | ||
3417 | 3417 | ||
3418 | if (sd->flags & SD_SERIALIZE) { | 3418 | if (sd->flags & SD_SERIALIZE) { |
3419 | if (!spin_trylock(&balancing)) | 3419 | if (!spin_trylock(&balancing)) |
3420 | goto out; | 3420 | goto out; |
3421 | } | 3421 | } |
3422 | 3422 | ||
3423 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 3423 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
3424 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 3424 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
3425 | /* | 3425 | /* |
3426 | * We've pulled tasks over so either we're no | 3426 | * We've pulled tasks over so either we're no |
3427 | * longer idle, or one of our SMT siblings is | 3427 | * longer idle, or one of our SMT siblings is |
3428 | * not idle. | 3428 | * not idle. |
3429 | */ | 3429 | */ |
3430 | idle = CPU_NOT_IDLE; | 3430 | idle = CPU_NOT_IDLE; |
3431 | } | 3431 | } |
3432 | sd->last_balance = jiffies; | 3432 | sd->last_balance = jiffies; |
3433 | } | 3433 | } |
3434 | if (sd->flags & SD_SERIALIZE) | 3434 | if (sd->flags & SD_SERIALIZE) |
3435 | spin_unlock(&balancing); | 3435 | spin_unlock(&balancing); |
3436 | out: | 3436 | out: |
3437 | if (time_after(next_balance, sd->last_balance + interval)) { | 3437 | if (time_after(next_balance, sd->last_balance + interval)) { |
3438 | next_balance = sd->last_balance + interval; | 3438 | next_balance = sd->last_balance + interval; |
3439 | update_next_balance = 1; | 3439 | update_next_balance = 1; |
3440 | } | 3440 | } |
3441 | 3441 | ||
3442 | /* | 3442 | /* |
3443 | * Stop the load balance at this level. There is another | 3443 | * Stop the load balance at this level. There is another |
3444 | * CPU in our sched group which is doing load balancing more | 3444 | * CPU in our sched group which is doing load balancing more |
3445 | * actively. | 3445 | * actively. |
3446 | */ | 3446 | */ |
3447 | if (!balance) | 3447 | if (!balance) |
3448 | break; | 3448 | break; |
3449 | } | 3449 | } |
3450 | 3450 | ||
3451 | /* | 3451 | /* |
3452 | * next_balance will be updated only when there is a need. | 3452 | * next_balance will be updated only when there is a need. |
3453 | * When the cpu is attached to the null domain, for example, it will not be | 3453 | * When the cpu is attached to the null domain, for example, it will not be
3454 | * updated. | 3454 | * updated. |
3455 | */ | 3455 | */ |
3456 | if (likely(update_next_balance)) | 3456 | if (likely(update_next_balance)) |
3457 | rq->next_balance = next_balance; | 3457 | rq->next_balance = next_balance; |
3458 | } | 3458 | } |
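The interval handling in rebalance_domains() above scales each domain's balance_interval by busy_factor when the CPU is not idle, converts it from milliseconds to jiffies, and clamps it to at least one tick and at most HZ*NR_CPUS/10. A stand-alone sketch of that computation (the HZ and NR_CPUS values and the helper name are assumptions for illustration; the kernel uses msecs_to_jiffies(), which rounds rather than truncating):

#define HZ       250
#define NR_CPUS  64

/* Scale a domain's balance interval the way rebalance_domains() does. */
static unsigned long effective_interval_jiffies(unsigned long interval_ms,
                                                unsigned int busy_factor,
                                                int cpu_is_idle)
{
        unsigned long interval = interval_ms;

        if (!cpu_is_idle)
                interval *= busy_factor;        /* balance less often when busy */

        interval = interval * HZ / 1000;        /* crude ms -> jiffies */
        if (!interval)
                interval = 1;                   /* never a zero interval */
        if (interval > HZ * NR_CPUS / 10)
                interval = HZ * NR_CPUS / 10;   /* global upper bound */

        return interval;
}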
3459 | 3459 | ||
3460 | /* | 3460 | /* |
3461 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 3461 | * run_rebalance_domains is triggered when needed from the scheduler tick. |
3462 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | 3462 | * In CONFIG_NO_HZ case, the idle load balance owner will do the |
3463 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 3463 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
3464 | */ | 3464 | */ |
3465 | static void run_rebalance_domains(struct softirq_action *h) | 3465 | static void run_rebalance_domains(struct softirq_action *h) |
3466 | { | 3466 | { |
3467 | int this_cpu = smp_processor_id(); | 3467 | int this_cpu = smp_processor_id(); |
3468 | struct rq *this_rq = cpu_rq(this_cpu); | 3468 | struct rq *this_rq = cpu_rq(this_cpu); |
3469 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | 3469 | enum cpu_idle_type idle = this_rq->idle_at_tick ? |
3470 | CPU_IDLE : CPU_NOT_IDLE; | 3470 | CPU_IDLE : CPU_NOT_IDLE; |
3471 | 3471 | ||
3472 | rebalance_domains(this_cpu, idle); | 3472 | rebalance_domains(this_cpu, idle); |
3473 | 3473 | ||
3474 | #ifdef CONFIG_NO_HZ | 3474 | #ifdef CONFIG_NO_HZ |
3475 | /* | 3475 | /* |
3476 | * If this cpu is the owner for idle load balancing, then do the | 3476 | * If this cpu is the owner for idle load balancing, then do the |
3477 | * balancing on behalf of the other idle cpus whose ticks are | 3477 | * balancing on behalf of the other idle cpus whose ticks are |
3478 | * stopped. | 3478 | * stopped. |
3479 | */ | 3479 | */ |
3480 | if (this_rq->idle_at_tick && | 3480 | if (this_rq->idle_at_tick && |
3481 | atomic_read(&nohz.load_balancer) == this_cpu) { | 3481 | atomic_read(&nohz.load_balancer) == this_cpu) { |
3482 | cpumask_t cpus = nohz.cpu_mask; | 3482 | cpumask_t cpus = nohz.cpu_mask; |
3483 | struct rq *rq; | 3483 | struct rq *rq; |
3484 | int balance_cpu; | 3484 | int balance_cpu; |
3485 | 3485 | ||
3486 | cpu_clear(this_cpu, cpus); | 3486 | cpu_clear(this_cpu, cpus); |
3487 | for_each_cpu_mask(balance_cpu, cpus) { | 3487 | for_each_cpu_mask(balance_cpu, cpus) { |
3488 | /* | 3488 | /* |
3489 | * If this cpu gets work to do, stop the load balancing | 3489 | * If this cpu gets work to do, stop the load balancing |
3490 | * work being done for other cpus. Next load | 3490 | * work being done for other cpus. Next load |
3491 | * balancing owner will pick it up. | 3491 | * balancing owner will pick it up. |
3492 | */ | 3492 | */ |
3493 | if (need_resched()) | 3493 | if (need_resched()) |
3494 | break; | 3494 | break; |
3495 | 3495 | ||
3496 | rebalance_domains(balance_cpu, CPU_IDLE); | 3496 | rebalance_domains(balance_cpu, CPU_IDLE); |
3497 | 3497 | ||
3498 | rq = cpu_rq(balance_cpu); | 3498 | rq = cpu_rq(balance_cpu); |
3499 | if (time_after(this_rq->next_balance, rq->next_balance)) | 3499 | if (time_after(this_rq->next_balance, rq->next_balance)) |
3500 | this_rq->next_balance = rq->next_balance; | 3500 | this_rq->next_balance = rq->next_balance; |
3501 | } | 3501 | } |
3502 | } | 3502 | } |
3503 | #endif | 3503 | #endif |
3504 | } | 3504 | } |
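When it runs as the ilb owner, the loop above balances every other tick-stopped CPU and, for each one, pulls this_rq->next_balance back to the earliest deadline it saw, so the owner wakes up no later than any CPU it is covering. A compact sketch of that "adopt the earliest deadline" pass over a plain bitmask (the struct, mask, and helper are simplified stand-ins, not kernel API; the wrap-safe comparison mirrors the kernel's time_after()):

#include <stdint.h>

/* jiffies-style wrap-safe "a is after b" test, as in the kernel's time_after() */
#define time_after(a, b)  ((int64_t)((b) - (a)) < 0)

struct fake_rq { uint64_t next_balance; };

/* Pull my deadline down to the earliest deadline of the CPUs I balance for. */
static void adopt_earliest_deadline(struct fake_rq *me,
                                    struct fake_rq *rqs, int nr_cpus,
                                    uint64_t covered_mask, int my_cpu)
{
        int cpu;

        for (cpu = 0; cpu < nr_cpus; cpu++) {
                if (cpu == my_cpu || !(covered_mask & (1ULL << cpu)))
                        continue;
                if (time_after(me->next_balance, rqs[cpu].next_balance))
                        me->next_balance = rqs[cpu].next_balance;
        }
}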
3505 | 3505 | ||
3506 | /* | 3506 | /* |
3507 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 3507 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
3508 | * | 3508 | * |
3509 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | 3509 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new |
3510 | * idle load balancing owner or decide to stop the periodic load balancing, | 3510 | * idle load balancing owner or decide to stop the periodic load balancing, |
3511 | * if the whole system is idle. | 3511 | * if the whole system is idle. |
3512 | */ | 3512 | */ |
3513 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 3513 | static inline void trigger_load_balance(struct rq *rq, int cpu) |
3514 | { | 3514 | { |
3515 | #ifdef CONFIG_NO_HZ | 3515 | #ifdef CONFIG_NO_HZ |
3516 | /* | 3516 | /* |
3517 | * If we were in the nohz mode recently and busy at the current | 3517 | * If we were in the nohz mode recently and busy at the current |
3518 | * scheduler tick, then check if we need to nominate new idle | 3518 | * scheduler tick, then check if we need to nominate new idle |
3519 | * load balancer. | 3519 | * load balancer. |
3520 | */ | 3520 | */ |
3521 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | 3521 | if (rq->in_nohz_recently && !rq->idle_at_tick) { |
3522 | rq->in_nohz_recently = 0; | 3522 | rq->in_nohz_recently = 0; |
3523 | 3523 | ||
3524 | if (atomic_read(&nohz.load_balancer) == cpu) { | 3524 | if (atomic_read(&nohz.load_balancer) == cpu) { |
3525 | cpu_clear(cpu, nohz.cpu_mask); | 3525 | cpu_clear(cpu, nohz.cpu_mask); |
3526 | atomic_set(&nohz.load_balancer, -1); | 3526 | atomic_set(&nohz.load_balancer, -1); |
3527 | } | 3527 | } |
3528 | 3528 | ||
3529 | if (atomic_read(&nohz.load_balancer) == -1) { | 3529 | if (atomic_read(&nohz.load_balancer) == -1) { |
3530 | /* | 3530 | /* |
3531 | * simple selection for now: Nominate the | 3531 | * simple selection for now: Nominate the |
3532 | * first cpu in the nohz list to be the next | 3532 | * first cpu in the nohz list to be the next |
3533 | * ilb owner. | 3533 | * ilb owner. |
3534 | * | 3534 | * |
3535 | * TBD: Traverse the sched domains and nominate | 3535 | * TBD: Traverse the sched domains and nominate |
3536 | * the nearest cpu in the nohz.cpu_mask. | 3536 | * the nearest cpu in the nohz.cpu_mask. |
3537 | */ | 3537 | */ |
3538 | int ilb = first_cpu(nohz.cpu_mask); | 3538 | int ilb = first_cpu(nohz.cpu_mask); |
3539 | 3539 | ||
3540 | if (ilb != NR_CPUS) | 3540 | if (ilb != NR_CPUS) |
3541 | resched_cpu(ilb); | 3541 | resched_cpu(ilb); |
3542 | } | 3542 | } |
3543 | } | 3543 | } |
3544 | 3544 | ||
3545 | /* | 3545 | /* |
3546 | * If this cpu is idle and doing idle load balancing for all the | 3546 | * If this cpu is idle and doing idle load balancing for all the |
3547 | * cpus with ticks stopped, is it time for that to stop? | 3547 | * cpus with ticks stopped, is it time for that to stop? |
3548 | */ | 3548 | */ |
3549 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | 3549 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && |
3550 | cpus_weight(nohz.cpu_mask) == num_online_cpus()) { | 3550 | cpus_weight(nohz.cpu_mask) == num_online_cpus()) { |
3551 | resched_cpu(cpu); | 3551 | resched_cpu(cpu); |
3552 | return; | 3552 | return; |
3553 | } | 3553 | } |
3554 | 3554 | ||
3555 | /* | 3555 | /* |
3556 | * If this cpu is idle and the idle load balancing is done by | 3556 | * If this cpu is idle and the idle load balancing is done by |
3557 | * someone else, then there is no need to raise the SCHED_SOFTIRQ | 3557 | * someone else, then there is no need to raise the SCHED_SOFTIRQ
3558 | */ | 3558 | */ |
3559 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | 3559 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && |
3560 | cpu_isset(cpu, nohz.cpu_mask)) | 3560 | cpu_isset(cpu, nohz.cpu_mask)) |
3561 | return; | 3561 | return; |
3562 | #endif | 3562 | #endif |
3563 | if (time_after_eq(jiffies, rq->next_balance)) | 3563 | if (time_after_eq(jiffies, rq->next_balance)) |
3564 | raise_softirq(SCHED_SOFTIRQ); | 3564 | raise_softirq(SCHED_SOFTIRQ); |
3565 | } | 3565 | } |
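trigger_load_balance() above reduces to a small decision cascade: an idle ilb owner with the whole system idle kicks itself so that it, too, can stop; an idle non-owner whose tick is already covered does nothing; everyone else raises SCHED_SOFTIRQ once the next_balance deadline has passed. The cascade in isolation, with boolean inputs standing in for the rq/nohz state (names are illustrative only):

enum tick_action { DO_NOTHING, RESCHED_SELF, RAISE_SOFTIRQ };

static enum tick_action tick_decision(int idle_at_tick, int i_am_ilb_owner,
                                      int all_cpus_idle, int my_tick_is_covered,
                                      int balance_due)
{
        if (idle_at_tick && i_am_ilb_owner && all_cpus_idle)
                return RESCHED_SELF;     /* time for the owner to sleep too */
        if (idle_at_tick && !i_am_ilb_owner && my_tick_is_covered)
                return DO_NOTHING;       /* someone else balances for us */
        return balance_due ? RAISE_SOFTIRQ : DO_NOTHING;
}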
3566 | 3566 | ||
3567 | #else /* CONFIG_SMP */ | 3567 | #else /* CONFIG_SMP */ |
3568 | 3568 | ||
3569 | /* | 3569 | /* |
3570 | * on UP we do not need to balance between CPUs: | 3570 | * on UP we do not need to balance between CPUs: |
3571 | */ | 3571 | */ |
3572 | static inline void idle_balance(int cpu, struct rq *rq) | 3572 | static inline void idle_balance(int cpu, struct rq *rq) |
3573 | { | 3573 | { |
3574 | } | 3574 | } |
3575 | 3575 | ||
3576 | #endif | 3576 | #endif |
3577 | 3577 | ||
3578 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3578 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
3579 | 3579 | ||
3580 | EXPORT_PER_CPU_SYMBOL(kstat); | 3580 | EXPORT_PER_CPU_SYMBOL(kstat); |
3581 | 3581 | ||
3582 | /* | 3582 | /* |
3583 | * Return p->sum_exec_runtime plus any more ns on the sched_clock | 3583 | * Return p->sum_exec_runtime plus any more ns on the sched_clock |
3584 | * that have not yet been banked in case the task is currently running. | 3584 | * that have not yet been banked in case the task is currently running. |
3585 | */ | 3585 | */ |
3586 | unsigned long long task_sched_runtime(struct task_struct *p) | 3586 | unsigned long long task_sched_runtime(struct task_struct *p) |
3587 | { | 3587 | { |
3588 | unsigned long flags; | 3588 | unsigned long flags; |
3589 | u64 ns, delta_exec; | 3589 | u64 ns, delta_exec; |
3590 | struct rq *rq; | 3590 | struct rq *rq; |
3591 | 3591 | ||
3592 | rq = task_rq_lock(p, &flags); | 3592 | rq = task_rq_lock(p, &flags); |
3593 | ns = p->se.sum_exec_runtime; | 3593 | ns = p->se.sum_exec_runtime; |
3594 | if (task_current(rq, p)) { | 3594 | if (task_current(rq, p)) { |
3595 | update_rq_clock(rq); | 3595 | update_rq_clock(rq); |
3596 | delta_exec = rq->clock - p->se.exec_start; | 3596 | delta_exec = rq->clock - p->se.exec_start; |
3597 | if ((s64)delta_exec > 0) | 3597 | if ((s64)delta_exec > 0) |
3598 | ns += delta_exec; | 3598 | ns += delta_exec; |
3599 | } | 3599 | } |
3600 | task_rq_unlock(rq, &flags); | 3600 | task_rq_unlock(rq, &flags); |
3601 | 3601 | ||
3602 | return ns; | 3602 | return ns; |
3603 | } | 3603 | } |
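The runtime reported above is the task's banked sum_exec_runtime plus, if the task happens to be on a CPU right now, the nanoseconds it has run since exec_start that have not been folded in yet. The same idea in a free-standing form (the clock source and field names are stand-ins for illustration):

#include <stdint.h>

struct fake_task {
        uint64_t sum_exec_runtime;   /* ns already accounted ("banked") */
        uint64_t exec_start;         /* clock value when it last went on-cpu */
        int      running;            /* currently executing on a cpu? */
};

static uint64_t task_runtime_ns(const struct fake_task *p, uint64_t now)
{
        uint64_t ns = p->sum_exec_runtime;

        if (p->running && now > p->exec_start)
                ns += now - p->exec_start;   /* in-flight, not yet banked */
        return ns;
}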
3604 | 3604 | ||
3605 | /* | 3605 | /* |
3606 | * Account user cpu time to a process. | 3606 | * Account user cpu time to a process. |
3607 | * @p: the process that the cpu time gets accounted to | 3607 | * @p: the process that the cpu time gets accounted to |
3608 | * @cputime: the cpu time spent in user space since the last update | 3608 | * @cputime: the cpu time spent in user space since the last update |
3609 | */ | 3609 | */ |
3610 | void account_user_time(struct task_struct *p, cputime_t cputime) | 3610 | void account_user_time(struct task_struct *p, cputime_t cputime) |
3611 | { | 3611 | { |
3612 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3612 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3613 | cputime64_t tmp; | 3613 | cputime64_t tmp; |
3614 | 3614 | ||
3615 | p->utime = cputime_add(p->utime, cputime); | 3615 | p->utime = cputime_add(p->utime, cputime); |
3616 | 3616 | ||
3617 | /* Add user time to cpustat. */ | 3617 | /* Add user time to cpustat. */ |
3618 | tmp = cputime_to_cputime64(cputime); | 3618 | tmp = cputime_to_cputime64(cputime); |
3619 | if (TASK_NICE(p) > 0) | 3619 | if (TASK_NICE(p) > 0) |
3620 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | 3620 | cpustat->nice = cputime64_add(cpustat->nice, tmp); |
3621 | else | 3621 | else |
3622 | cpustat->user = cputime64_add(cpustat->user, tmp); | 3622 | cpustat->user = cputime64_add(cpustat->user, tmp); |
3623 | } | 3623 | } |
3624 | 3624 | ||
3625 | /* | 3625 | /* |
3626 | * Account guest cpu time to a process. | 3626 | * Account guest cpu time to a process. |
3627 | * @p: the process that the cpu time gets accounted to | 3627 | * @p: the process that the cpu time gets accounted to |
3628 | * @cputime: the cpu time spent in virtual machine since the last update | 3628 | * @cputime: the cpu time spent in virtual machine since the last update |
3629 | */ | 3629 | */ |
3630 | static void account_guest_time(struct task_struct *p, cputime_t cputime) | 3630 | static void account_guest_time(struct task_struct *p, cputime_t cputime) |
3631 | { | 3631 | { |
3632 | cputime64_t tmp; | 3632 | cputime64_t tmp; |
3633 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3633 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3634 | 3634 | ||
3635 | tmp = cputime_to_cputime64(cputime); | 3635 | tmp = cputime_to_cputime64(cputime); |
3636 | 3636 | ||
3637 | p->utime = cputime_add(p->utime, cputime); | 3637 | p->utime = cputime_add(p->utime, cputime); |
3638 | p->gtime = cputime_add(p->gtime, cputime); | 3638 | p->gtime = cputime_add(p->gtime, cputime); |
3639 | 3639 | ||
3640 | cpustat->user = cputime64_add(cpustat->user, tmp); | 3640 | cpustat->user = cputime64_add(cpustat->user, tmp); |
3641 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | 3641 | cpustat->guest = cputime64_add(cpustat->guest, tmp); |
3642 | } | 3642 | } |
3643 | 3643 | ||
3644 | /* | 3644 | /* |
3645 | * Account scaled user cpu time to a process. | 3645 | * Account scaled user cpu time to a process. |
3646 | * @p: the process that the cpu time gets accounted to | 3646 | * @p: the process that the cpu time gets accounted to |
3647 | * @cputime: the cpu time spent in user space since the last update | 3647 | * @cputime: the cpu time spent in user space since the last update |
3648 | */ | 3648 | */ |
3649 | void account_user_time_scaled(struct task_struct *p, cputime_t cputime) | 3649 | void account_user_time_scaled(struct task_struct *p, cputime_t cputime) |
3650 | { | 3650 | { |
3651 | p->utimescaled = cputime_add(p->utimescaled, cputime); | 3651 | p->utimescaled = cputime_add(p->utimescaled, cputime); |
3652 | } | 3652 | } |
3653 | 3653 | ||
3654 | /* | 3654 | /* |
3655 | * Account system cpu time to a process. | 3655 | * Account system cpu time to a process. |
3656 | * @p: the process that the cpu time gets accounted to | 3656 | * @p: the process that the cpu time gets accounted to |
3657 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3657 | * @hardirq_offset: the offset to subtract from hardirq_count() |
3658 | * @cputime: the cpu time spent in kernel space since the last update | 3658 | * @cputime: the cpu time spent in kernel space since the last update |
3659 | */ | 3659 | */ |
3660 | void account_system_time(struct task_struct *p, int hardirq_offset, | 3660 | void account_system_time(struct task_struct *p, int hardirq_offset, |
3661 | cputime_t cputime) | 3661 | cputime_t cputime) |
3662 | { | 3662 | { |
3663 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3663 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3664 | struct rq *rq = this_rq(); | 3664 | struct rq *rq = this_rq(); |
3665 | cputime64_t tmp; | 3665 | cputime64_t tmp; |
3666 | 3666 | ||
3667 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) | 3667 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) |
3668 | return account_guest_time(p, cputime); | 3668 | return account_guest_time(p, cputime); |
3669 | 3669 | ||
3670 | p->stime = cputime_add(p->stime, cputime); | 3670 | p->stime = cputime_add(p->stime, cputime); |
3671 | 3671 | ||
3672 | /* Add system time to cpustat. */ | 3672 | /* Add system time to cpustat. */ |
3673 | tmp = cputime_to_cputime64(cputime); | 3673 | tmp = cputime_to_cputime64(cputime); |
3674 | if (hardirq_count() - hardirq_offset) | 3674 | if (hardirq_count() - hardirq_offset) |
3675 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3675 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
3676 | else if (softirq_count()) | 3676 | else if (softirq_count()) |
3677 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3677 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
3678 | else if (p != rq->idle) | 3678 | else if (p != rq->idle) |
3679 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3679 | cpustat->system = cputime64_add(cpustat->system, tmp); |
3680 | else if (atomic_read(&rq->nr_iowait) > 0) | 3680 | else if (atomic_read(&rq->nr_iowait) > 0) |
3681 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 3681 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
3682 | else | 3682 | else |
3683 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 3683 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
3684 | /* Account for system time used */ | 3684 | /* Account for system time used */ |
3685 | acct_update_integrals(p); | 3685 | acct_update_integrals(p); |
3686 | } | 3686 | } |
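account_system_time() above charges each tick to exactly one bucket, checked in priority order: hardirq context wins over softirq, which wins over ordinary system time; ticks that land on the idle task are split into iowait or idle depending on whether anything is blocked on I/O (guest time was already diverted to account_guest_time() before this point). The same ordering as a small classifier (names are illustrative, not kernel API):

enum cpu_bucket { BUCKET_IRQ, BUCKET_SOFTIRQ, BUCKET_SYSTEM,
                  BUCKET_IOWAIT, BUCKET_IDLE };

static enum cpu_bucket classify_system_tick(int in_hardirq, int in_softirq,
                                            int is_idle_task, int nr_iowait)
{
        if (in_hardirq)
                return BUCKET_IRQ;
        if (in_softirq)
                return BUCKET_SOFTIRQ;
        if (!is_idle_task)
                return BUCKET_SYSTEM;
        return nr_iowait > 0 ? BUCKET_IOWAIT : BUCKET_IDLE;
}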
3687 | 3687 | ||
3688 | /* | 3688 | /* |
3689 | * Account scaled system cpu time to a process. | 3689 | * Account scaled system cpu time to a process. |
3690 | * @p: the process that the cpu time gets accounted to | 3690 | * @p: the process that the cpu time gets accounted to |
3691 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3691 | * @hardirq_offset: the offset to subtract from hardirq_count() |
3692 | * @cputime: the cpu time spent in kernel space since the last update | 3692 | * @cputime: the cpu time spent in kernel space since the last update |
3693 | */ | 3693 | */ |
3694 | void account_system_time_scaled(struct task_struct *p, cputime_t cputime) | 3694 | void account_system_time_scaled(struct task_struct *p, cputime_t cputime) |
3695 | { | 3695 | { |
3696 | p->stimescaled = cputime_add(p->stimescaled, cputime); | 3696 | p->stimescaled = cputime_add(p->stimescaled, cputime); |
3697 | } | 3697 | } |
3698 | 3698 | ||
3699 | /* | 3699 | /* |
3700 | * Account for involuntary wait time. | 3700 | * Account for involuntary wait time. |
3701 | * @p: the process from which the cpu time has been stolen | 3701 | * @p: the process from which the cpu time has been stolen |
3702 | * @steal: the cpu time spent in involuntary wait | 3702 | * @steal: the cpu time spent in involuntary wait |
3703 | */ | 3703 | */ |
3704 | void account_steal_time(struct task_struct *p, cputime_t steal) | 3704 | void account_steal_time(struct task_struct *p, cputime_t steal) |
3705 | { | 3705 | { |
3706 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3706 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3707 | cputime64_t tmp = cputime_to_cputime64(steal); | 3707 | cputime64_t tmp = cputime_to_cputime64(steal); |
3708 | struct rq *rq = this_rq(); | 3708 | struct rq *rq = this_rq(); |
3709 | 3709 | ||
3710 | if (p == rq->idle) { | 3710 | if (p == rq->idle) { |
3711 | p->stime = cputime_add(p->stime, steal); | 3711 | p->stime = cputime_add(p->stime, steal); |
3712 | if (atomic_read(&rq->nr_iowait) > 0) | 3712 | if (atomic_read(&rq->nr_iowait) > 0) |
3713 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 3713 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
3714 | else | 3714 | else |
3715 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 3715 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
3716 | } else | 3716 | } else |
3717 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3717 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
3718 | } | 3718 | } |
3719 | 3719 | ||
3720 | /* | 3720 | /* |
3721 | * This function gets called by the timer code, with HZ frequency. | 3721 | * This function gets called by the timer code, with HZ frequency. |
3722 | * We call it with interrupts disabled. | 3722 | * We call it with interrupts disabled. |
3723 | * | 3723 | * |
3724 | * It also gets called by the fork code, when changing the parent's | 3724 | * It also gets called by the fork code, when changing the parent's |
3725 | * timeslices. | 3725 | * timeslices. |
3726 | */ | 3726 | */ |
3727 | void scheduler_tick(void) | 3727 | void scheduler_tick(void) |
3728 | { | 3728 | { |
3729 | int cpu = smp_processor_id(); | 3729 | int cpu = smp_processor_id(); |
3730 | struct rq *rq = cpu_rq(cpu); | 3730 | struct rq *rq = cpu_rq(cpu); |
3731 | struct task_struct *curr = rq->curr; | 3731 | struct task_struct *curr = rq->curr; |
3732 | u64 next_tick = rq->tick_timestamp + TICK_NSEC; | 3732 | u64 next_tick = rq->tick_timestamp + TICK_NSEC; |
3733 | 3733 | ||
3734 | spin_lock(&rq->lock); | 3734 | spin_lock(&rq->lock); |
3735 | __update_rq_clock(rq); | 3735 | __update_rq_clock(rq); |
3736 | /* | 3736 | /* |
3737 | * Let rq->clock advance by at least TICK_NSEC: | 3737 | * Let rq->clock advance by at least TICK_NSEC: |
3738 | */ | 3738 | */ |
3739 | if (unlikely(rq->clock < next_tick)) { | 3739 | if (unlikely(rq->clock < next_tick)) { |
3740 | rq->clock = next_tick; | 3740 | rq->clock = next_tick; |
3741 | rq->clock_underflows++; | 3741 | rq->clock_underflows++; |
3742 | } | 3742 | } |
3743 | rq->tick_timestamp = rq->clock; | 3743 | rq->tick_timestamp = rq->clock; |
3744 | update_cpu_load(rq); | 3744 | update_cpu_load(rq); |
3745 | curr->sched_class->task_tick(rq, curr, 0); | 3745 | curr->sched_class->task_tick(rq, curr, 0); |
3746 | update_sched_rt_period(rq); | 3746 | update_sched_rt_period(rq); |
3747 | spin_unlock(&rq->lock); | 3747 | spin_unlock(&rq->lock); |
3748 | 3748 | ||
3749 | #ifdef CONFIG_SMP | 3749 | #ifdef CONFIG_SMP |
3750 | rq->idle_at_tick = idle_cpu(cpu); | 3750 | rq->idle_at_tick = idle_cpu(cpu); |
3751 | trigger_load_balance(rq, cpu); | 3751 | trigger_load_balance(rq, cpu); |
3752 | #endif | 3752 | #endif |
3753 | } | 3753 | } |
3754 | 3754 | ||
3755 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) | 3755 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) |
3756 | 3756 | ||
3757 | void fastcall add_preempt_count(int val) | 3757 | void fastcall add_preempt_count(int val) |
3758 | { | 3758 | { |
3759 | /* | 3759 | /* |
3760 | * Underflow? | 3760 | * Underflow? |
3761 | */ | 3761 | */ |
3762 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 3762 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
3763 | return; | 3763 | return; |
3764 | preempt_count() += val; | 3764 | preempt_count() += val; |
3765 | /* | 3765 | /* |
3766 | * Spinlock count overflowing soon? | 3766 | * Spinlock count overflowing soon? |
3767 | */ | 3767 | */ |
3768 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= | 3768 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
3769 | PREEMPT_MASK - 10); | 3769 | PREEMPT_MASK - 10); |
3770 | } | 3770 | } |
3771 | EXPORT_SYMBOL(add_preempt_count); | 3771 | EXPORT_SYMBOL(add_preempt_count); |
3772 | 3772 | ||
3773 | void fastcall sub_preempt_count(int val) | 3773 | void fastcall sub_preempt_count(int val) |
3774 | { | 3774 | { |
3775 | /* | 3775 | /* |
3776 | * Underflow? | 3776 | * Underflow? |
3777 | */ | 3777 | */ |
3778 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) | 3778 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) |
3779 | return; | 3779 | return; |
3780 | /* | 3780 | /* |
3781 | * Is the spinlock portion underflowing? | 3781 | * Is the spinlock portion underflowing? |
3782 | */ | 3782 | */ |
3783 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && | 3783 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
3784 | !(preempt_count() & PREEMPT_MASK))) | 3784 | !(preempt_count() & PREEMPT_MASK))) |
3785 | return; | 3785 | return; |
3786 | 3786 | ||
3787 | preempt_count() -= val; | 3787 | preempt_count() -= val; |
3788 | } | 3788 | } |
3789 | EXPORT_SYMBOL(sub_preempt_count); | 3789 | EXPORT_SYMBOL(sub_preempt_count); |
3790 | 3790 | ||
3791 | #endif | 3791 | #endif |
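The two debug checks above treat preempt_count() as a packed field: the low PREEMPT_MASK bits count nested preempt_disable()/spin_lock() sections, and the checks warn when that sub-counter is close to overflow or when a sub_preempt_count() would drain more from it than was ever added. A user-space sketch with an assumed 8-bit preemption field (the real layout lives in include/linux/hardirq.h and may differ by kernel version and architecture):

#include <stdio.h>

#define PREEMPT_MASK  0x000000ffU   /* assumed: low 8 bits count preempt disables */

static unsigned int preempt_count_val;

static void add_preempt(unsigned int val)
{
        preempt_count_val += val;
        if ((preempt_count_val & PREEMPT_MASK) >= PREEMPT_MASK - 10)
                fprintf(stderr, "warning: preempt counter close to overflow\n");
}

static void sub_preempt(unsigned int val)
{
        /* draining more preempt-disable nesting than is currently recorded? */
        if (val < PREEMPT_MASK && !(preempt_count_val & PREEMPT_MASK)) {
                fprintf(stderr, "warning: preempt counter underflow\n");
                return;
        }
        preempt_count_val -= val;
}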
3792 | 3792 | ||
3793 | /* | 3793 | /* |
3794 | * Print scheduling while atomic bug: | 3794 | * Print scheduling while atomic bug: |
3795 | */ | 3795 | */ |
3796 | static noinline void __schedule_bug(struct task_struct *prev) | 3796 | static noinline void __schedule_bug(struct task_struct *prev) |
3797 | { | 3797 | { |
3798 | struct pt_regs *regs = get_irq_regs(); | 3798 | struct pt_regs *regs = get_irq_regs(); |
3799 | 3799 | ||
3800 | printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", | 3800 | printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", |
3801 | prev->comm, prev->pid, preempt_count()); | 3801 | prev->comm, prev->pid, preempt_count()); |
3802 | 3802 | ||
3803 | debug_show_held_locks(prev); | 3803 | debug_show_held_locks(prev); |
3804 | if (irqs_disabled()) | 3804 | if (irqs_disabled()) |
3805 | print_irqtrace_events(prev); | 3805 | print_irqtrace_events(prev); |
3806 | 3806 | ||
3807 | if (regs) | 3807 | if (regs) |
3808 | show_regs(regs); | 3808 | show_regs(regs); |
3809 | else | 3809 | else |
3810 | dump_stack(); | 3810 | dump_stack(); |
3811 | } | 3811 | } |
3812 | 3812 | ||
3813 | /* | 3813 | /* |
3814 | * Various schedule()-time debugging checks and statistics: | 3814 | * Various schedule()-time debugging checks and statistics: |
3815 | */ | 3815 | */ |
3816 | static inline void schedule_debug(struct task_struct *prev) | 3816 | static inline void schedule_debug(struct task_struct *prev) |
3817 | { | 3817 | { |
3818 | /* | 3818 | /* |
3819 | * Test if we are atomic. Since do_exit() needs to call into | 3819 | * Test if we are atomic. Since do_exit() needs to call into |
3820 | * schedule() atomically, we ignore that path for now. | 3820 | * schedule() atomically, we ignore that path for now. |
3821 | * Otherwise, whine if we are scheduling when we should not be. | 3821 | * Otherwise, whine if we are scheduling when we should not be. |
3822 | */ | 3822 | */ |
3823 | if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) | 3823 | if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) |
3824 | __schedule_bug(prev); | 3824 | __schedule_bug(prev); |
3825 | 3825 | ||
3826 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3826 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3827 | 3827 | ||
3828 | schedstat_inc(this_rq(), sched_count); | 3828 | schedstat_inc(this_rq(), sched_count); |
3829 | #ifdef CONFIG_SCHEDSTATS | 3829 | #ifdef CONFIG_SCHEDSTATS |
3830 | if (unlikely(prev->lock_depth >= 0)) { | 3830 | if (unlikely(prev->lock_depth >= 0)) { |
3831 | schedstat_inc(this_rq(), bkl_count); | 3831 | schedstat_inc(this_rq(), bkl_count); |
3832 | schedstat_inc(prev, sched_info.bkl_count); | 3832 | schedstat_inc(prev, sched_info.bkl_count); |
3833 | } | 3833 | } |
3834 | #endif | 3834 | #endif |
3835 | } | 3835 | } |
3836 | 3836 | ||
3837 | /* | 3837 | /* |
3838 | * Pick up the highest-prio task: | 3838 | * Pick up the highest-prio task: |
3839 | */ | 3839 | */ |
3840 | static inline struct task_struct * | 3840 | static inline struct task_struct * |
3841 | pick_next_task(struct rq *rq, struct task_struct *prev) | 3841 | pick_next_task(struct rq *rq, struct task_struct *prev) |
3842 | { | 3842 | { |
3843 | const struct sched_class *class; | 3843 | const struct sched_class *class; |
3844 | struct task_struct *p; | 3844 | struct task_struct *p; |
3845 | 3845 | ||
3846 | /* | 3846 | /* |
3847 | * Optimization: we know that if all tasks are in | 3847 | * Optimization: we know that if all tasks are in |
3848 | * the fair class we can call that function directly: | 3848 | * the fair class we can call that function directly: |
3849 | */ | 3849 | */ |
3850 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 3850 | if (likely(rq->nr_running == rq->cfs.nr_running)) { |
3851 | p = fair_sched_class.pick_next_task(rq); | 3851 | p = fair_sched_class.pick_next_task(rq); |
3852 | if (likely(p)) | 3852 | if (likely(p)) |
3853 | return p; | 3853 | return p; |
3854 | } | 3854 | } |
3855 | 3855 | ||
3856 | class = sched_class_highest; | 3856 | class = sched_class_highest; |
3857 | for ( ; ; ) { | 3857 | for ( ; ; ) { |
3858 | p = class->pick_next_task(rq); | 3858 | p = class->pick_next_task(rq); |
3859 | if (p) | 3859 | if (p) |
3860 | return p; | 3860 | return p; |
3861 | /* | 3861 | /* |
3862 | * Will never be NULL as the idle class always | 3862 | * Will never be NULL as the idle class always |
3863 | * returns a non-NULL p: | 3863 | * returns a non-NULL p: |
3864 | */ | 3864 | */ |
3865 | class = class->next; | 3865 | class = class->next; |
3866 | } | 3866 | } |
3867 | } | 3867 | } |
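pick_next_task() above first tries the common case, where everything runnable is in the fair class, and otherwise walks the scheduling classes from highest to lowest priority, returning the first task any class offers; the idle class at the end of the chain guarantees the loop terminates. The traversal pattern on its own, over a singly linked list of "classes" (struct and callback names are illustrative, not the kernel's):

#include <stddef.h>

struct task;                                 /* opaque for this sketch */

struct sched_class_sketch {
        const struct sched_class_sketch *next;       /* next lower-priority class */
        struct task *(*pick_next)(void *rq);
};

static struct task *pick_next(const struct sched_class_sketch *highest, void *rq)
{
        const struct sched_class_sketch *class;

        for (class = highest; class; class = class->next) {
                struct task *p = class->pick_next(rq);
                if (p)
                        return p;
        }
        return NULL;    /* unreachable if the last class always returns a task */
}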
3868 | 3868 | ||
3869 | /* | 3869 | /* |
3870 | * schedule() is the main scheduler function. | 3870 | * schedule() is the main scheduler function. |
3871 | */ | 3871 | */ |
3872 | asmlinkage void __sched schedule(void) | 3872 | asmlinkage void __sched schedule(void) |
3873 | { | 3873 | { |
3874 | struct task_struct *prev, *next; | 3874 | struct task_struct *prev, *next; |
3875 | long *switch_count; | 3875 | long *switch_count; |
3876 | struct rq *rq; | 3876 | struct rq *rq; |
3877 | int cpu; | 3877 | int cpu; |
3878 | 3878 | ||
3879 | need_resched: | 3879 | need_resched: |
3880 | preempt_disable(); | 3880 | preempt_disable(); |
3881 | cpu = smp_processor_id(); | 3881 | cpu = smp_processor_id(); |
3882 | rq = cpu_rq(cpu); | 3882 | rq = cpu_rq(cpu); |
3883 | rcu_qsctr_inc(cpu); | 3883 | rcu_qsctr_inc(cpu); |
3884 | prev = rq->curr; | 3884 | prev = rq->curr; |
3885 | switch_count = &prev->nivcsw; | 3885 | switch_count = &prev->nivcsw; |
3886 | 3886 | ||
3887 | release_kernel_lock(prev); | 3887 | release_kernel_lock(prev); |
3888 | need_resched_nonpreemptible: | 3888 | need_resched_nonpreemptible: |
3889 | 3889 | ||
3890 | schedule_debug(prev); | 3890 | schedule_debug(prev); |
3891 | 3891 | ||
3892 | hrtick_clear(rq); | 3892 | hrtick_clear(rq); |
3893 | 3893 | ||
3894 | /* | 3894 | /* |
3895 | * Do the rq-clock update outside the rq lock: | 3895 | * Do the rq-clock update outside the rq lock: |
3896 | */ | 3896 | */ |
3897 | local_irq_disable(); | 3897 | local_irq_disable(); |
3898 | __update_rq_clock(rq); | 3898 | __update_rq_clock(rq); |
3899 | spin_lock(&rq->lock); | 3899 | spin_lock(&rq->lock); |
3900 | clear_tsk_need_resched(prev); | 3900 | clear_tsk_need_resched(prev); |
3901 | 3901 | ||
3902 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3902 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3903 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3903 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
3904 | unlikely(signal_pending(prev)))) { | 3904 | unlikely(signal_pending(prev)))) { |
3905 | prev->state = TASK_RUNNING; | 3905 | prev->state = TASK_RUNNING; |
3906 | } else { | 3906 | } else { |
3907 | deactivate_task(rq, prev, 1); | 3907 | deactivate_task(rq, prev, 1); |
3908 | } | 3908 | } |
3909 | switch_count = &prev->nvcsw; | 3909 | switch_count = &prev->nvcsw; |
3910 | } | 3910 | } |
3911 | 3911 | ||
3912 | #ifdef CONFIG_SMP | 3912 | #ifdef CONFIG_SMP |
3913 | if (prev->sched_class->pre_schedule) | 3913 | if (prev->sched_class->pre_schedule) |
3914 | prev->sched_class->pre_schedule(rq, prev); | 3914 | prev->sched_class->pre_schedule(rq, prev); |
3915 | #endif | 3915 | #endif |
3916 | 3916 | ||
3917 | if (unlikely(!rq->nr_running)) | 3917 | if (unlikely(!rq->nr_running)) |
3918 | idle_balance(cpu, rq); | 3918 | idle_balance(cpu, rq); |
3919 | 3919 | ||
3920 | prev->sched_class->put_prev_task(rq, prev); | 3920 | prev->sched_class->put_prev_task(rq, prev); |
3921 | next = pick_next_task(rq, prev); | 3921 | next = pick_next_task(rq, prev); |
3922 | 3922 | ||
3923 | sched_info_switch(prev, next); | 3923 | sched_info_switch(prev, next); |
3924 | 3924 | ||
3925 | if (likely(prev != next)) { | 3925 | if (likely(prev != next)) { |
3926 | rq->nr_switches++; | 3926 | rq->nr_switches++; |
3927 | rq->curr = next; | 3927 | rq->curr = next; |
3928 | ++*switch_count; | 3928 | ++*switch_count; |
3929 | 3929 | ||
3930 | context_switch(rq, prev, next); /* unlocks the rq */ | 3930 | context_switch(rq, prev, next); /* unlocks the rq */ |
3931 | /* | 3931 | /* |
3932 | * the context switch might have flipped the stack from under | 3932 | * the context switch might have flipped the stack from under |
3933 | * us, hence refresh the local variables. | 3933 | * us, hence refresh the local variables. |
3934 | */ | 3934 | */ |
3935 | cpu = smp_processor_id(); | 3935 | cpu = smp_processor_id(); |
3936 | rq = cpu_rq(cpu); | 3936 | rq = cpu_rq(cpu); |
3937 | } else | 3937 | } else |
3938 | spin_unlock_irq(&rq->lock); | 3938 | spin_unlock_irq(&rq->lock); |
3939 | 3939 | ||
3940 | hrtick_set(rq); | 3940 | hrtick_set(rq); |
3941 | 3941 | ||
3942 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 3942 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
3943 | goto need_resched_nonpreemptible; | 3943 | goto need_resched_nonpreemptible; |
3944 | 3944 | ||
3945 | preempt_enable_no_resched(); | 3945 | preempt_enable_no_resched(); |
3946 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3946 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3947 | goto need_resched; | 3947 | goto need_resched; |
3948 | } | 3948 | } |
3949 | EXPORT_SYMBOL(schedule); | 3949 | EXPORT_SYMBOL(schedule); |
3950 | 3950 | ||
3951 | #ifdef CONFIG_PREEMPT | 3951 | #ifdef CONFIG_PREEMPT |
3952 | /* | 3952 | /* |
3953 | * this is the entry point to schedule() from in-kernel preemption | 3953 | * this is the entry point to schedule() from in-kernel preemption |
3954 | * off of preempt_enable. Kernel preemption off of return from interrupt | 3954 | * off of preempt_enable. Kernel preemption off of return from interrupt
3955 | * happens there instead (preempt_schedule_irq) and calls schedule() directly. | 3955 | * happens there instead (preempt_schedule_irq) and calls schedule() directly.
3956 | */ | 3956 | */ |
3957 | asmlinkage void __sched preempt_schedule(void) | 3957 | asmlinkage void __sched preempt_schedule(void) |
3958 | { | 3958 | { |
3959 | struct thread_info *ti = current_thread_info(); | 3959 | struct thread_info *ti = current_thread_info(); |
3960 | struct task_struct *task = current; | 3960 | struct task_struct *task = current; |
3961 | int saved_lock_depth; | 3961 | int saved_lock_depth; |
3962 | 3962 | ||
3963 | /* | 3963 | /* |
3964 | * If there is a non-zero preempt_count or interrupts are disabled, | 3964 | * If there is a non-zero preempt_count or interrupts are disabled, |
3965 | * we do not want to preempt the current task. Just return.. | 3965 | * we do not want to preempt the current task. Just return.. |
3966 | */ | 3966 | */ |
3967 | if (likely(ti->preempt_count || irqs_disabled())) | 3967 | if (likely(ti->preempt_count || irqs_disabled())) |
3968 | return; | 3968 | return; |
3969 | 3969 | ||
3970 | do { | 3970 | do { |
3971 | add_preempt_count(PREEMPT_ACTIVE); | 3971 | add_preempt_count(PREEMPT_ACTIVE); |
3972 | 3972 | ||
3973 | /* | 3973 | /* |
3974 | * We keep the big kernel semaphore locked, but we | 3974 | * We keep the big kernel semaphore locked, but we |
3975 | * clear ->lock_depth so that schedule() doesn't | 3975 | * clear ->lock_depth so that schedule() doesn't
3976 | * auto-release the semaphore: | 3976 | * auto-release the semaphore: |
3977 | */ | 3977 | */ |
3978 | saved_lock_depth = task->lock_depth; | 3978 | saved_lock_depth = task->lock_depth; |
3979 | task->lock_depth = -1; | 3979 | task->lock_depth = -1; |
3980 | schedule(); | 3980 | schedule(); |
3981 | task->lock_depth = saved_lock_depth; | 3981 | task->lock_depth = saved_lock_depth; |
3982 | sub_preempt_count(PREEMPT_ACTIVE); | 3982 | sub_preempt_count(PREEMPT_ACTIVE); |
3983 | 3983 | ||
3984 | /* | 3984 | /* |
3985 | * Check again in case we missed a preemption opportunity | 3985 | * Check again in case we missed a preemption opportunity |
3986 | * between schedule and now. | 3986 | * between schedule and now. |
3987 | */ | 3987 | */ |
3988 | barrier(); | 3988 | barrier(); |
3989 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); | 3989 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); |
3990 | } | 3990 | } |
3991 | EXPORT_SYMBOL(preempt_schedule); | 3991 | EXPORT_SYMBOL(preempt_schedule); |
3992 | 3992 | ||
3993 | /* | 3993 | /* |
3994 | * this is the entry point to schedule() from kernel preemption | 3994 | * this is the entry point to schedule() from kernel preemption |
3995 | * off of irq context. | 3995 | * off of irq context. |
3996 | * Note that this is called and returns with irqs disabled. This will | 3996 | * Note that this is called and returns with irqs disabled. This will
3997 | * protect us against recursive calling from irq. | 3997 | * protect us against recursive calling from irq. |
3998 | */ | 3998 | */ |
3999 | asmlinkage void __sched preempt_schedule_irq(void) | 3999 | asmlinkage void __sched preempt_schedule_irq(void) |
4000 | { | 4000 | { |
4001 | struct thread_info *ti = current_thread_info(); | 4001 | struct thread_info *ti = current_thread_info(); |
4002 | struct task_struct *task = current; | 4002 | struct task_struct *task = current; |
4003 | int saved_lock_depth; | 4003 | int saved_lock_depth; |
4004 | 4004 | ||
4005 | /* Catch callers which need to be fixed */ | 4005 | /* Catch callers which need to be fixed */ |
4006 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 4006 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
4007 | 4007 | ||
4008 | do { | 4008 | do { |
4009 | add_preempt_count(PREEMPT_ACTIVE); | 4009 | add_preempt_count(PREEMPT_ACTIVE); |
4010 | 4010 | ||
4011 | /* | 4011 | /* |
4012 | * We keep the big kernel semaphore locked, but we | 4012 | * We keep the big kernel semaphore locked, but we |
4013 | * clear ->lock_depth so that schedule() doesn't | 4013 | * clear ->lock_depth so that schedule() doesn't
4014 | * auto-release the semaphore: | 4014 | * auto-release the semaphore: |
4015 | */ | 4015 | */ |
4016 | saved_lock_depth = task->lock_depth; | 4016 | saved_lock_depth = task->lock_depth; |
4017 | task->lock_depth = -1; | 4017 | task->lock_depth = -1; |
4018 | local_irq_enable(); | 4018 | local_irq_enable(); |
4019 | schedule(); | 4019 | schedule(); |
4020 | local_irq_disable(); | 4020 | local_irq_disable(); |
4021 | task->lock_depth = saved_lock_depth; | 4021 | task->lock_depth = saved_lock_depth; |
4022 | sub_preempt_count(PREEMPT_ACTIVE); | 4022 | sub_preempt_count(PREEMPT_ACTIVE); |
4023 | 4023 | ||
4024 | /* | 4024 | /* |
4025 | * Check again in case we missed a preemption opportunity | 4025 | * Check again in case we missed a preemption opportunity |
4026 | * between schedule and now. | 4026 | * between schedule and now. |
4027 | */ | 4027 | */ |
4028 | barrier(); | 4028 | barrier(); |
4029 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); | 4029 | } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); |
4030 | } | 4030 | } |
4031 | 4031 | ||
4032 | #endif /* CONFIG_PREEMPT */ | 4032 | #endif /* CONFIG_PREEMPT */ |
4033 | 4033 | ||
4034 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 4034 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, |
4035 | void *key) | 4035 | void *key) |
4036 | { | 4036 | { |
4037 | return try_to_wake_up(curr->private, mode, sync); | 4037 | return try_to_wake_up(curr->private, mode, sync); |
4038 | } | 4038 | } |
4039 | EXPORT_SYMBOL(default_wake_function); | 4039 | EXPORT_SYMBOL(default_wake_function); |
4040 | 4040 | ||
4041 | /* | 4041 | /* |
4042 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | 4042 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just |
4043 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | 4043 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve |
4044 | * number) then we wake all the non-exclusive tasks and one exclusive task. | 4044 | * number) then we wake all the non-exclusive tasks and one exclusive task. |
4045 | * | 4045 | * |
4046 | * There are circumstances in which we can try to wake a task which has already | 4046 | * There are circumstances in which we can try to wake a task which has already |
4047 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | 4047 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |
4048 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 4048 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
4049 | */ | 4049 | */ |
4050 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 4050 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
4051 | int nr_exclusive, int sync, void *key) | 4051 | int nr_exclusive, int sync, void *key) |
4052 | { | 4052 | { |
4053 | wait_queue_t *curr, *next; | 4053 | wait_queue_t *curr, *next; |
4054 | 4054 | ||
4055 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | 4055 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
4056 | unsigned flags = curr->flags; | 4056 | unsigned flags = curr->flags; |
4057 | 4057 | ||
4058 | if (curr->func(curr, mode, sync, key) && | 4058 | if (curr->func(curr, mode, sync, key) && |
4059 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | 4059 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
4060 | break; | 4060 | break; |
4061 | } | 4061 | } |
4062 | } | 4062 | } |
4063 | 4063 | ||
4064 | /** | 4064 | /** |
4065 | * __wake_up - wake up threads blocked on a waitqueue. | 4065 | * __wake_up - wake up threads blocked on a waitqueue. |
4066 | * @q: the waitqueue | 4066 | * @q: the waitqueue |
4067 | * @mode: which threads | 4067 | * @mode: which threads |
4068 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 4068 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
4069 | * @key: is directly passed to the wakeup function | 4069 | * @key: is directly passed to the wakeup function |
4070 | */ | 4070 | */ |
4071 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | 4071 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, |
4072 | int nr_exclusive, void *key) | 4072 | int nr_exclusive, void *key) |
4073 | { | 4073 | { |
4074 | unsigned long flags; | 4074 | unsigned long flags; |
4075 | 4075 | ||
4076 | spin_lock_irqsave(&q->lock, flags); | 4076 | spin_lock_irqsave(&q->lock, flags); |
4077 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 4077 | __wake_up_common(q, mode, nr_exclusive, 0, key); |
4078 | spin_unlock_irqrestore(&q->lock, flags); | 4078 | spin_unlock_irqrestore(&q->lock, flags); |
4079 | } | 4079 | } |
4080 | EXPORT_SYMBOL(__wake_up); | 4080 | EXPORT_SYMBOL(__wake_up); |
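For context, callers almost never use __wake_up_common() directly; the usual pattern is a wait_event*() macro on the sleeping side and wake_up() on the other, which ends up in __wake_up() above. The snippet below is a typical usage sketch built from the standard wait-queue API of this era, not code from this patch; real code normally protects the condition under a lock or with explicit barriers:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int data_ready;

/* consumer: sleep until the producer has published something */
static int consumer(void)
{
        /* returns -ERESTARTSYS if interrupted by a signal */
        return wait_event_interruptible(my_wq, data_ready != 0);
}

/* producer: publish, then wake the waiters via __wake_up() under the hood */
static void producer(void)
{
        data_ready = 1;
        wake_up(&my_wq);
}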
4081 | 4081 | ||
4082 | /* | 4082 | /* |
4083 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | 4083 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. |
4084 | */ | 4084 | */ |
4085 | void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) | 4085 | void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) |
4086 | { | 4086 | { |
4087 | __wake_up_common(q, mode, 1, 0, NULL); | 4087 | __wake_up_common(q, mode, 1, 0, NULL); |
4088 | } | 4088 | } |
4089 | 4089 | ||
4090 | /** | 4090 | /** |
4091 | * __wake_up_sync - wake up threads blocked on a waitqueue. | 4091 | * __wake_up_sync - wake up threads blocked on a waitqueue. |
4092 | * @q: the waitqueue | 4092 | * @q: the waitqueue |
4093 | * @mode: which threads | 4093 | * @mode: which threads |
4094 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 4094 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
4095 | * | 4095 | * |
4096 | * The sync wakeup differs in that the waker knows that it will schedule | 4096 | * The sync wakeup differs in that the waker knows that it will schedule
4097 | * away soon, so while the target thread will be woken up, it will not | 4097 | * away soon, so while the target thread will be woken up, it will not |
4098 | * be migrated to another CPU - ie. the two threads are 'synchronized' | 4098 | * be migrated to another CPU - ie. the two threads are 'synchronized' |
4099 | * with each other. This can prevent needless bouncing between CPUs. | 4099 | * with each other. This can prevent needless bouncing between CPUs. |
4100 | * | 4100 | * |
4101 | * On UP it can prevent extra preemption. | 4101 | * On UP it can prevent extra preemption. |
4102 | */ | 4102 | */ |
4103 | void fastcall | 4103 | void fastcall |
4104 | __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | 4104 | __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) |
4105 | { | 4105 | { |
4106 | unsigned long flags; | 4106 | unsigned long flags; |
4107 | int sync = 1; | 4107 | int sync = 1; |
4108 | 4108 | ||
4109 | if (unlikely(!q)) | 4109 | if (unlikely(!q)) |
4110 | return; | 4110 | return; |
4111 | 4111 | ||
4112 | if (unlikely(!nr_exclusive)) | 4112 | if (unlikely(!nr_exclusive)) |
4113 | sync = 0; | 4113 | sync = 0; |
4114 | 4114 | ||
4115 | spin_lock_irqsave(&q->lock, flags); | 4115 | spin_lock_irqsave(&q->lock, flags); |
4116 | __wake_up_common(q, mode, nr_exclusive, sync, NULL); | 4116 | __wake_up_common(q, mode, nr_exclusive, sync, NULL); |
4117 | spin_unlock_irqrestore(&q->lock, flags); | 4117 | spin_unlock_irqrestore(&q->lock, flags); |
4118 | } | 4118 | } |
4119 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 4119 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
4120 | 4120 | ||
4121 | void complete(struct completion *x) | 4121 | void complete(struct completion *x) |
4122 | { | 4122 | { |
4123 | unsigned long flags; | 4123 | unsigned long flags; |
4124 | 4124 | ||
4125 | spin_lock_irqsave(&x->wait.lock, flags); | 4125 | spin_lock_irqsave(&x->wait.lock, flags); |
4126 | x->done++; | 4126 | x->done++; |
4127 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | 4127 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, |
4128 | 1, 0, NULL); | 4128 | 1, 0, NULL); |
4129 | spin_unlock_irqrestore(&x->wait.lock, flags); | 4129 | spin_unlock_irqrestore(&x->wait.lock, flags); |
4130 | } | 4130 | } |
4131 | EXPORT_SYMBOL(complete); | 4131 | EXPORT_SYMBOL(complete); |
4132 | 4132 | ||
4133 | void complete_all(struct completion *x) | 4133 | void complete_all(struct completion *x) |
4134 | { | 4134 | { |
4135 | unsigned long flags; | 4135 | unsigned long flags; |
4136 | 4136 | ||
4137 | spin_lock_irqsave(&x->wait.lock, flags); | 4137 | spin_lock_irqsave(&x->wait.lock, flags); |
4138 | x->done += UINT_MAX/2; | 4138 | x->done += UINT_MAX/2; |
4139 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | 4139 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, |
4140 | 0, 0, NULL); | 4140 | 0, 0, NULL); |
4141 | spin_unlock_irqrestore(&x->wait.lock, flags); | 4141 | spin_unlock_irqrestore(&x->wait.lock, flags); |
4142 | } | 4142 | } |
4143 | EXPORT_SYMBOL(complete_all); | 4143 | EXPORT_SYMBOL(complete_all); |
4144 | 4144 | ||
4145 | static inline long __sched | 4145 | static inline long __sched |
4146 | do_wait_for_common(struct completion *x, long timeout, int state) | 4146 | do_wait_for_common(struct completion *x, long timeout, int state) |
4147 | { | 4147 | { |
4148 | if (!x->done) { | 4148 | if (!x->done) { |
4149 | DECLARE_WAITQUEUE(wait, current); | 4149 | DECLARE_WAITQUEUE(wait, current); |
4150 | 4150 | ||
4151 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 4151 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
4152 | __add_wait_queue_tail(&x->wait, &wait); | 4152 | __add_wait_queue_tail(&x->wait, &wait); |
4153 | do { | 4153 | do { |
4154 | if (state == TASK_INTERRUPTIBLE && | 4154 | if (state == TASK_INTERRUPTIBLE && |
4155 | signal_pending(current)) { | 4155 | signal_pending(current)) { |
4156 | __remove_wait_queue(&x->wait, &wait); | 4156 | __remove_wait_queue(&x->wait, &wait); |
4157 | return -ERESTARTSYS; | 4157 | return -ERESTARTSYS; |
4158 | } | 4158 | } |
4159 | __set_current_state(state); | 4159 | __set_current_state(state); |
4160 | spin_unlock_irq(&x->wait.lock); | 4160 | spin_unlock_irq(&x->wait.lock); |
4161 | timeout = schedule_timeout(timeout); | 4161 | timeout = schedule_timeout(timeout); |
4162 | spin_lock_irq(&x->wait.lock); | 4162 | spin_lock_irq(&x->wait.lock); |
4163 | if (!timeout) { | 4163 | if (!timeout) { |
4164 | __remove_wait_queue(&x->wait, &wait); | 4164 | __remove_wait_queue(&x->wait, &wait); |
4165 | return timeout; | 4165 | return timeout; |
4166 | } | 4166 | } |
4167 | } while (!x->done); | 4167 | } while (!x->done); |
4168 | __remove_wait_queue(&x->wait, &wait); | 4168 | __remove_wait_queue(&x->wait, &wait); |
4169 | } | 4169 | } |
4170 | x->done--; | 4170 | x->done--; |
4171 | return timeout; | 4171 | return timeout; |
4172 | } | 4172 | } |
4173 | 4173 | ||
4174 | static long __sched | 4174 | static long __sched |
4175 | wait_for_common(struct completion *x, long timeout, int state) | 4175 | wait_for_common(struct completion *x, long timeout, int state) |
4176 | { | 4176 | { |
4177 | might_sleep(); | 4177 | might_sleep(); |
4178 | 4178 | ||
4179 | spin_lock_irq(&x->wait.lock); | 4179 | spin_lock_irq(&x->wait.lock); |
4180 | timeout = do_wait_for_common(x, timeout, state); | 4180 | timeout = do_wait_for_common(x, timeout, state); |
4181 | spin_unlock_irq(&x->wait.lock); | 4181 | spin_unlock_irq(&x->wait.lock); |
4182 | return timeout; | 4182 | return timeout; |
4183 | } | 4183 | } |
4184 | 4184 | ||
4185 | void __sched wait_for_completion(struct completion *x) | 4185 | void __sched wait_for_completion(struct completion *x) |
4186 | { | 4186 | { |
4187 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | 4187 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); |
4188 | } | 4188 | } |
4189 | EXPORT_SYMBOL(wait_for_completion); | 4189 | EXPORT_SYMBOL(wait_for_completion); |
4190 | 4190 | ||
4191 | unsigned long __sched | 4191 | unsigned long __sched |
4192 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 4192 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
4193 | { | 4193 | { |
4194 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); | 4194 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); |
4195 | } | 4195 | } |
4196 | EXPORT_SYMBOL(wait_for_completion_timeout); | 4196 | EXPORT_SYMBOL(wait_for_completion_timeout); |
4197 | 4197 | ||
4198 | int __sched wait_for_completion_interruptible(struct completion *x) | 4198 | int __sched wait_for_completion_interruptible(struct completion *x) |
4199 | { | 4199 | { |
4200 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | 4200 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); |
4201 | if (t == -ERESTARTSYS) | 4201 | if (t == -ERESTARTSYS) |
4202 | return t; | 4202 | return t; |
4203 | return 0; | 4203 | return 0; |
4204 | } | 4204 | } |
4205 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 4205 | EXPORT_SYMBOL(wait_for_completion_interruptible); |
4206 | 4206 | ||
4207 | unsigned long __sched | 4207 | unsigned long __sched |
4208 | wait_for_completion_interruptible_timeout(struct completion *x, | 4208 | wait_for_completion_interruptible_timeout(struct completion *x, |
4209 | unsigned long timeout) | 4209 | unsigned long timeout) |
4210 | { | 4210 | { |
4211 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); | 4211 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); |
4212 | } | 4212 | } |
4213 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 4213 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
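Completions wrap the same wait-queue machinery into a simple "wait until someone signals done" object. A typical usage sketch with the standard API (not code from this patch):

#include <linux/completion.h>

static struct completion work_done;

static void submitter(void)
{
        init_completion(&work_done);
        /* ... hand work to another context (irq handler, workqueue, ...) ... */
        wait_for_completion(&work_done);        /* sleeps uninterruptibly */
}

static void worker(void)
{
        /* ... do the work ... */
        complete(&work_done);                   /* wakes exactly one waiter */
}

wait_for_completion_timeout() follows the same pattern but, as the implementation above shows, returns 0 if the timeout expired and the remaining jiffies otherwise.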
4214 | 4214 | ||
4215 | static long __sched | 4215 | static long __sched |
4216 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | 4216 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) |
4217 | { | 4217 | { |
4218 | unsigned long flags; | 4218 | unsigned long flags; |
4219 | wait_queue_t wait; | 4219 | wait_queue_t wait; |
4220 | 4220 | ||
4221 | init_waitqueue_entry(&wait, current); | 4221 | init_waitqueue_entry(&wait, current); |
4222 | 4222 | ||
4223 | __set_current_state(state); | 4223 | __set_current_state(state); |
4224 | 4224 | ||
4225 | spin_lock_irqsave(&q->lock, flags); | 4225 | spin_lock_irqsave(&q->lock, flags); |
4226 | __add_wait_queue(q, &wait); | 4226 | __add_wait_queue(q, &wait); |
4227 | spin_unlock(&q->lock); | 4227 | spin_unlock(&q->lock); |
4228 | timeout = schedule_timeout(timeout); | 4228 | timeout = schedule_timeout(timeout); |
4229 | spin_lock_irq(&q->lock); | 4229 | spin_lock_irq(&q->lock); |
4230 | __remove_wait_queue(q, &wait); | 4230 | __remove_wait_queue(q, &wait); |
4231 | spin_unlock_irqrestore(&q->lock, flags); | 4231 | spin_unlock_irqrestore(&q->lock, flags); |
4232 | 4232 | ||
4233 | return timeout; | 4233 | return timeout; |
4234 | } | 4234 | } |
4235 | 4235 | ||
4236 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | 4236 | void __sched interruptible_sleep_on(wait_queue_head_t *q) |
4237 | { | 4237 | { |
4238 | sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | 4238 | sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
4239 | } | 4239 | } |
4240 | EXPORT_SYMBOL(interruptible_sleep_on); | 4240 | EXPORT_SYMBOL(interruptible_sleep_on); |
4241 | 4241 | ||
4242 | long __sched | 4242 | long __sched |
4243 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 4243 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
4244 | { | 4244 | { |
4245 | return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); | 4245 | return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); |
4246 | } | 4246 | } |
4247 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 4247 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
4248 | 4248 | ||
4249 | void __sched sleep_on(wait_queue_head_t *q) | 4249 | void __sched sleep_on(wait_queue_head_t *q) |
4250 | { | 4250 | { |
4251 | sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | 4251 | sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); |
4252 | } | 4252 | } |
4253 | EXPORT_SYMBOL(sleep_on); | 4253 | EXPORT_SYMBOL(sleep_on); |
4254 | 4254 | ||
4255 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 4255 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
4256 | { | 4256 | { |
4257 | return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); | 4257 | return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); |
4258 | } | 4258 | } |
4259 | EXPORT_SYMBOL(sleep_on_timeout); | 4259 | EXPORT_SYMBOL(sleep_on_timeout); |
4260 | 4260 | ||
4261 | #ifdef CONFIG_RT_MUTEXES | 4261 | #ifdef CONFIG_RT_MUTEXES |
4262 | 4262 | ||
4263 | /* | 4263 | /* |
4264 | * rt_mutex_setprio - set the current priority of a task | 4264 | * rt_mutex_setprio - set the current priority of a task |
4265 | * @p: task | 4265 | * @p: task |
4266 | * @prio: prio value (kernel-internal form) | 4266 | * @prio: prio value (kernel-internal form) |
4267 | * | 4267 | * |
4268 | * This function changes the 'effective' priority of a task. It does | 4268 | * This function changes the 'effective' priority of a task. It does |
4269 | * not touch ->normal_prio like __setscheduler(). | 4269 | * not touch ->normal_prio like __setscheduler(). |
4270 | * | 4270 | * |
4271 | * Used by the rt_mutex code to implement priority inheritance logic. | 4271 | * Used by the rt_mutex code to implement priority inheritance logic. |
4272 | */ | 4272 | */ |
4273 | void rt_mutex_setprio(struct task_struct *p, int prio) | 4273 | void rt_mutex_setprio(struct task_struct *p, int prio) |
4274 | { | 4274 | { |
4275 | unsigned long flags; | 4275 | unsigned long flags; |
4276 | int oldprio, on_rq, running; | 4276 | int oldprio, on_rq, running; |
4277 | struct rq *rq; | 4277 | struct rq *rq; |
4278 | const struct sched_class *prev_class = p->sched_class; | 4278 | const struct sched_class *prev_class = p->sched_class; |
4279 | 4279 | ||
4280 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4280 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4281 | 4281 | ||
4282 | rq = task_rq_lock(p, &flags); | 4282 | rq = task_rq_lock(p, &flags); |
4283 | update_rq_clock(rq); | 4283 | update_rq_clock(rq); |
4284 | 4284 | ||
4285 | oldprio = p->prio; | 4285 | oldprio = p->prio; |
4286 | on_rq = p->se.on_rq; | 4286 | on_rq = p->se.on_rq; |
4287 | running = task_current(rq, p); | 4287 | running = task_current(rq, p); |
4288 | if (on_rq) { | 4288 | if (on_rq) { |
4289 | dequeue_task(rq, p, 0); | 4289 | dequeue_task(rq, p, 0); |
4290 | if (running) | 4290 | if (running) |
4291 | p->sched_class->put_prev_task(rq, p); | 4291 | p->sched_class->put_prev_task(rq, p); |
4292 | } | 4292 | } |
4293 | 4293 | ||
4294 | if (rt_prio(prio)) | 4294 | if (rt_prio(prio)) |
4295 | p->sched_class = &rt_sched_class; | 4295 | p->sched_class = &rt_sched_class; |
4296 | else | 4296 | else |
4297 | p->sched_class = &fair_sched_class; | 4297 | p->sched_class = &fair_sched_class; |
4298 | 4298 | ||
4299 | p->prio = prio; | 4299 | p->prio = prio; |
4300 | 4300 | ||
4301 | if (on_rq) { | 4301 | if (on_rq) { |
4302 | if (running) | 4302 | if (running) |
4303 | p->sched_class->set_curr_task(rq); | 4303 | p->sched_class->set_curr_task(rq); |
4304 | 4304 | ||
4305 | enqueue_task(rq, p, 0); | 4305 | enqueue_task(rq, p, 0); |
4306 | 4306 | ||
4307 | check_class_changed(rq, p, prev_class, oldprio, running); | 4307 | check_class_changed(rq, p, prev_class, oldprio, running); |
4308 | } | 4308 | } |
4309 | task_rq_unlock(rq, &flags); | 4309 | task_rq_unlock(rq, &flags); |
4310 | } | 4310 | } |
4311 | 4311 | ||
4312 | #endif | 4312 | #endif |
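For illustration, one common way this path gets exercised from userspace is a priority-inheriting pthread mutex: a contended PI futex is backed by an rt_mutex, and boosting the owner ends up in rt_mutex_setprio() above. A minimal sketch, assuming glibc with PI mutex support (build with -pthread); the critical section is just a placeholder:

#define _GNU_SOURCE		/* PTHREAD_PRIO_INHERIT on older glibc */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pi_lock;

int main(void)
{
	pthread_mutexattr_t attr;

	/* Request priority inheritance: if a higher-priority task blocks
	 * on this mutex, the kernel boosts the current owner. */
	pthread_mutexattr_init(&attr);
	if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT)) {
		fprintf(stderr, "PI mutexes not supported here\n");
		return 1;
	}
	pthread_mutex_init(&pi_lock, &attr);

	pthread_mutex_lock(&pi_lock);
	/* ... critical section ... */
	pthread_mutex_unlock(&pi_lock);

	pthread_mutex_destroy(&pi_lock);
	pthread_mutexattr_destroy(&attr);
	return 0;
}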
4313 | 4313 | ||
4314 | void set_user_nice(struct task_struct *p, long nice) | 4314 | void set_user_nice(struct task_struct *p, long nice) |
4315 | { | 4315 | { |
4316 | int old_prio, delta, on_rq; | 4316 | int old_prio, delta, on_rq; |
4317 | unsigned long flags; | 4317 | unsigned long flags; |
4318 | struct rq *rq; | 4318 | struct rq *rq; |
4319 | 4319 | ||
4320 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 4320 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
4321 | return; | 4321 | return; |
4322 | /* | 4322 | /* |
4323 | * We have to be careful, if called from sys_setpriority(), | 4323 | * We have to be careful, if called from sys_setpriority(), |
4324 | * the task might be in the middle of scheduling on another CPU. | 4324 | * the task might be in the middle of scheduling on another CPU. |
4325 | */ | 4325 | */ |
4326 | rq = task_rq_lock(p, &flags); | 4326 | rq = task_rq_lock(p, &flags); |
4327 | update_rq_clock(rq); | 4327 | update_rq_clock(rq); |
4328 | /* | 4328 | /* |
4329 | * The RT priorities are set via sched_setscheduler(), but we still | 4329 | * The RT priorities are set via sched_setscheduler(), but we still |
4330 | * allow the 'normal' nice value to be set - but as expected | 4330 | * allow the 'normal' nice value to be set - but as expected |
4331 | * it won't have any effect on scheduling until the task is | 4331 | * it won't have any effect on scheduling until the task is |
4332 | * SCHED_FIFO/SCHED_RR: | 4332 | * SCHED_FIFO/SCHED_RR: |
4333 | */ | 4333 | */ |
4334 | if (task_has_rt_policy(p)) { | 4334 | if (task_has_rt_policy(p)) { |
4335 | p->static_prio = NICE_TO_PRIO(nice); | 4335 | p->static_prio = NICE_TO_PRIO(nice); |
4336 | goto out_unlock; | 4336 | goto out_unlock; |
4337 | } | 4337 | } |
4338 | on_rq = p->se.on_rq; | 4338 | on_rq = p->se.on_rq; |
4339 | if (on_rq) | 4339 | if (on_rq) |
4340 | dequeue_task(rq, p, 0); | 4340 | dequeue_task(rq, p, 0); |
4341 | 4341 | ||
4342 | p->static_prio = NICE_TO_PRIO(nice); | 4342 | p->static_prio = NICE_TO_PRIO(nice); |
4343 | set_load_weight(p); | 4343 | set_load_weight(p); |
4344 | old_prio = p->prio; | 4344 | old_prio = p->prio; |
4345 | p->prio = effective_prio(p); | 4345 | p->prio = effective_prio(p); |
4346 | delta = p->prio - old_prio; | 4346 | delta = p->prio - old_prio; |
4347 | 4347 | ||
4348 | if (on_rq) { | 4348 | if (on_rq) { |
4349 | enqueue_task(rq, p, 0); | 4349 | enqueue_task(rq, p, 0); |
4350 | /* | 4350 | /* |
4351 | * If the task increased its priority or is running and | 4351 | * If the task increased its priority or is running and |
4352 | * lowered its priority, then reschedule its CPU: | 4352 | * lowered its priority, then reschedule its CPU: |
4353 | */ | 4353 | */ |
4354 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 4354 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
4355 | resched_task(rq->curr); | 4355 | resched_task(rq->curr); |
4356 | } | 4356 | } |
4357 | out_unlock: | 4357 | out_unlock: |
4358 | task_rq_unlock(rq, &flags); | 4358 | task_rq_unlock(rq, &flags); |
4359 | } | 4359 | } |
4360 | EXPORT_SYMBOL(set_user_nice); | 4360 | EXPORT_SYMBOL(set_user_nice); |
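For reference, set_user_nice() is what nice(2) and setpriority(2) eventually call; a small userspace example that lowers the caller's own priority (the nice value 10 is an arbitrary choice):

#include <sys/time.h>
#include <sys/resource.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
	/* Raise our own nice value to 10, i.e. lower our priority. */
	if (setpriority(PRIO_PROCESS, 0, 10) != 0) {
		perror("setpriority");
		return 1;
	}

	/* getpriority() may legitimately return -1, so check errno. */
	errno = 0;
	int cur = getpriority(PRIO_PROCESS, 0);
	if (cur == -1 && errno != 0) {
		perror("getpriority");
		return 1;
	}
	printf("nice is now %d\n", cur);
	return 0;
}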
4361 | 4361 | ||
4362 | /* | 4362 | /* |
4363 | * can_nice - check if a task can reduce its nice value | 4363 | * can_nice - check if a task can reduce its nice value |
4364 | * @p: task | 4364 | * @p: task |
4365 | * @nice: nice value | 4365 | * @nice: nice value |
4366 | */ | 4366 | */ |
4367 | int can_nice(const struct task_struct *p, const int nice) | 4367 | int can_nice(const struct task_struct *p, const int nice) |
4368 | { | 4368 | { |
4369 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 4369 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
4370 | int nice_rlim = 20 - nice; | 4370 | int nice_rlim = 20 - nice; |
4371 | 4371 | ||
4372 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 4372 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || |
4373 | capable(CAP_SYS_NICE)); | 4373 | capable(CAP_SYS_NICE)); |
4374 | } | 4374 | } |
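As a worked example of the conversion above: asking for nice -10 gives nice_rlim = 20 - (-10) = 30, so an unprivileged task needs an RLIMIT_NICE soft limit of at least 30 (or CAP_SYS_NICE). A purely illustrative userspace check (RLIMIT_NICE is Linux-specific):

#include <sys/resource.h>
#include <stdio.h>

int main(void)
{
	struct rlimit rl;
	int want_nice = -10;
	int nice_rlim = 20 - want_nice;	/* same conversion as can_nice() */

	if (getrlimit(RLIMIT_NICE, &rl) != 0) {
		perror("getrlimit");
		return 1;
	}
	printf("need rlimit-style %d, soft limit is %lld\n",
	       nice_rlim, (long long)rl.rlim_cur);
	return 0;
}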
4375 | 4375 | ||
4376 | #ifdef __ARCH_WANT_SYS_NICE | 4376 | #ifdef __ARCH_WANT_SYS_NICE |
4377 | 4377 | ||
4378 | /* | 4378 | /* |
4379 | * sys_nice - change the priority of the current process. | 4379 | * sys_nice - change the priority of the current process. |
4380 | * @increment: priority increment | 4380 | * @increment: priority increment |
4381 | * | 4381 | * |
4382 | * sys_setpriority is a more generic, but much slower function that | 4382 | * sys_setpriority is a more generic, but much slower function that |
4383 | * does similar things. | 4383 | * does similar things. |
4384 | */ | 4384 | */ |
4385 | asmlinkage long sys_nice(int increment) | 4385 | asmlinkage long sys_nice(int increment) |
4386 | { | 4386 | { |
4387 | long nice, retval; | 4387 | long nice, retval; |
4388 | 4388 | ||
4389 | /* | 4389 | /* |
4390 | * Setpriority might change our priority at the same moment. | 4390 | * Setpriority might change our priority at the same moment. |
4391 | * We don't have to worry. Conceptually one call occurs first | 4391 | * We don't have to worry. Conceptually one call occurs first |
4392 | * and we have a single winner. | 4392 | * and we have a single winner. |
4393 | */ | 4393 | */ |
4394 | if (increment < -40) | 4394 | if (increment < -40) |
4395 | increment = -40; | 4395 | increment = -40; |
4396 | if (increment > 40) | 4396 | if (increment > 40) |
4397 | increment = 40; | 4397 | increment = 40; |
4398 | 4398 | ||
4399 | nice = PRIO_TO_NICE(current->static_prio) + increment; | 4399 | nice = PRIO_TO_NICE(current->static_prio) + increment; |
4400 | if (nice < -20) | 4400 | if (nice < -20) |
4401 | nice = -20; | 4401 | nice = -20; |
4402 | if (nice > 19) | 4402 | if (nice > 19) |
4403 | nice = 19; | 4403 | nice = 19; |
4404 | 4404 | ||
4405 | if (increment < 0 && !can_nice(current, nice)) | 4405 | if (increment < 0 && !can_nice(current, nice)) |
4406 | return -EPERM; | 4406 | return -EPERM; |
4407 | 4407 | ||
4408 | retval = security_task_setnice(current, nice); | 4408 | retval = security_task_setnice(current, nice); |
4409 | if (retval) | 4409 | if (retval) |
4410 | return retval; | 4410 | return retval; |
4411 | 4411 | ||
4412 | set_user_nice(current, nice); | 4412 | set_user_nice(current, nice); |
4413 | return 0; | 4413 | return 0; |
4414 | } | 4414 | } |
4415 | 4415 | ||
4416 | #endif | 4416 | #endif |
4417 | 4417 | ||
4418 | /** | 4418 | /** |
4419 | * task_prio - return the priority value of a given task. | 4419 | * task_prio - return the priority value of a given task. |
4420 | * @p: the task in question. | 4420 | * @p: the task in question. |
4421 | * | 4421 | * |
4422 | * This is the priority value as seen by users in /proc. | 4422 | * This is the priority value as seen by users in /proc. |
4423 | * RT tasks are offset by -100 and so map to negative values. Normal | 4423 | * RT tasks are offset by -100 and so map to negative values. Normal |
4424 | * tasks map nice -20..+19 to 0..39, with nice 0 reading as 20. | 4424 | * tasks map nice -20..+19 to 0..39, with nice 0 reading as 20. |
4425 | */ | 4425 | */ |
4426 | int task_prio(const struct task_struct *p) | 4426 | int task_prio(const struct task_struct *p) |
4427 | { | 4427 | { |
4428 | return p->prio - MAX_RT_PRIO; | 4428 | return p->prio - MAX_RT_PRIO; |
4429 | } | 4429 | } |
4430 | 4430 | ||
4431 | /** | 4431 | /** |
4432 | * task_nice - return the nice value of a given task. | 4432 | * task_nice - return the nice value of a given task. |
4433 | * @p: the task in question. | 4433 | * @p: the task in question. |
4434 | */ | 4434 | */ |
4435 | int task_nice(const struct task_struct *p) | 4435 | int task_nice(const struct task_struct *p) |
4436 | { | 4436 | { |
4437 | return TASK_NICE(p); | 4437 | return TASK_NICE(p); |
4438 | } | 4438 | } |
4439 | EXPORT_SYMBOL_GPL(task_nice); | 4439 | EXPORT_SYMBOL_GPL(task_nice); |
4440 | 4440 | ||
4441 | /** | 4441 | /** |
4442 | * idle_cpu - is a given cpu idle currently? | 4442 | * idle_cpu - is a given cpu idle currently? |
4443 | * @cpu: the processor in question. | 4443 | * @cpu: the processor in question. |
4444 | */ | 4444 | */ |
4445 | int idle_cpu(int cpu) | 4445 | int idle_cpu(int cpu) |
4446 | { | 4446 | { |
4447 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 4447 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; |
4448 | } | 4448 | } |
4449 | 4449 | ||
4450 | /** | 4450 | /** |
4451 | * idle_task - return the idle task for a given cpu. | 4451 | * idle_task - return the idle task for a given cpu. |
4452 | * @cpu: the processor in question. | 4452 | * @cpu: the processor in question. |
4453 | */ | 4453 | */ |
4454 | struct task_struct *idle_task(int cpu) | 4454 | struct task_struct *idle_task(int cpu) |
4455 | { | 4455 | { |
4456 | return cpu_rq(cpu)->idle; | 4456 | return cpu_rq(cpu)->idle; |
4457 | } | 4457 | } |
4458 | 4458 | ||
4459 | /** | 4459 | /** |
4460 | * find_process_by_pid - find a process with a matching PID value. | 4460 | * find_process_by_pid - find a process with a matching PID value. |
4461 | * @pid: the pid in question. | 4461 | * @pid: the pid in question. |
4462 | */ | 4462 | */ |
4463 | static struct task_struct *find_process_by_pid(pid_t pid) | 4463 | static struct task_struct *find_process_by_pid(pid_t pid) |
4464 | { | 4464 | { |
4465 | return pid ? find_task_by_vpid(pid) : current; | 4465 | return pid ? find_task_by_vpid(pid) : current; |
4466 | } | 4466 | } |
4467 | 4467 | ||
4468 | /* Actually do priority change: must hold rq lock. */ | 4468 | /* Actually do priority change: must hold rq lock. */ |
4469 | static void | 4469 | static void |
4470 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 4470 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
4471 | { | 4471 | { |
4472 | BUG_ON(p->se.on_rq); | 4472 | BUG_ON(p->se.on_rq); |
4473 | 4473 | ||
4474 | p->policy = policy; | 4474 | p->policy = policy; |
4475 | switch (p->policy) { | 4475 | switch (p->policy) { |
4476 | case SCHED_NORMAL: | 4476 | case SCHED_NORMAL: |
4477 | case SCHED_BATCH: | 4477 | case SCHED_BATCH: |
4478 | case SCHED_IDLE: | 4478 | case SCHED_IDLE: |
4479 | p->sched_class = &fair_sched_class; | 4479 | p->sched_class = &fair_sched_class; |
4480 | break; | 4480 | break; |
4481 | case SCHED_FIFO: | 4481 | case SCHED_FIFO: |
4482 | case SCHED_RR: | 4482 | case SCHED_RR: |
4483 | p->sched_class = &rt_sched_class; | 4483 | p->sched_class = &rt_sched_class; |
4484 | break; | 4484 | break; |
4485 | } | 4485 | } |
4486 | 4486 | ||
4487 | p->rt_priority = prio; | 4487 | p->rt_priority = prio; |
4488 | p->normal_prio = normal_prio(p); | 4488 | p->normal_prio = normal_prio(p); |
4489 | /* we are holding p->pi_lock already */ | 4489 | /* we are holding p->pi_lock already */ |
4490 | p->prio = rt_mutex_getprio(p); | 4490 | p->prio = rt_mutex_getprio(p); |
4491 | set_load_weight(p); | 4491 | set_load_weight(p); |
4492 | } | 4492 | } |
4493 | 4493 | ||
4494 | /** | 4494 | /** |
4495 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 4495 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. |
4496 | * @p: the task in question. | 4496 | * @p: the task in question. |
4497 | * @policy: new policy. | 4497 | * @policy: new policy. |
4498 | * @param: structure containing the new RT priority. | 4498 | * @param: structure containing the new RT priority. |
4499 | * | 4499 | * |
4500 | * NOTE that the task may be already dead. | 4500 | * NOTE that the task may be already dead. |
4501 | */ | 4501 | */ |
4502 | int sched_setscheduler(struct task_struct *p, int policy, | 4502 | int sched_setscheduler(struct task_struct *p, int policy, |
4503 | struct sched_param *param) | 4503 | struct sched_param *param) |
4504 | { | 4504 | { |
4505 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4505 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4506 | unsigned long flags; | 4506 | unsigned long flags; |
4507 | const struct sched_class *prev_class = p->sched_class; | 4507 | const struct sched_class *prev_class = p->sched_class; |
4508 | struct rq *rq; | 4508 | struct rq *rq; |
4509 | 4509 | ||
4510 | /* may grab non-irq protected spin_locks */ | 4510 | /* may grab non-irq protected spin_locks */ |
4511 | BUG_ON(in_interrupt()); | 4511 | BUG_ON(in_interrupt()); |
4512 | recheck: | 4512 | recheck: |
4513 | /* double check policy once rq lock held */ | 4513 | /* double check policy once rq lock held */ |
4514 | if (policy < 0) | 4514 | if (policy < 0) |
4515 | policy = oldpolicy = p->policy; | 4515 | policy = oldpolicy = p->policy; |
4516 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 4516 | else if (policy != SCHED_FIFO && policy != SCHED_RR && |
4517 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 4517 | policy != SCHED_NORMAL && policy != SCHED_BATCH && |
4518 | policy != SCHED_IDLE) | 4518 | policy != SCHED_IDLE) |
4519 | return -EINVAL; | 4519 | return -EINVAL; |
4520 | /* | 4520 | /* |
4521 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 4521 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
4522 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 4522 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
4523 | * SCHED_BATCH and SCHED_IDLE is 0. | 4523 | * SCHED_BATCH and SCHED_IDLE is 0. |
4524 | */ | 4524 | */ |
4525 | if (param->sched_priority < 0 || | 4525 | if (param->sched_priority < 0 || |
4526 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 4526 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
4527 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 4527 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
4528 | return -EINVAL; | 4528 | return -EINVAL; |
4529 | if (rt_policy(policy) != (param->sched_priority != 0)) | 4529 | if (rt_policy(policy) != (param->sched_priority != 0)) |
4530 | return -EINVAL; | 4530 | return -EINVAL; |
4531 | 4531 | ||
4532 | /* | 4532 | /* |
4533 | * Allow unprivileged RT tasks to decrease priority: | 4533 | * Allow unprivileged RT tasks to decrease priority: |
4534 | */ | 4534 | */ |
4535 | if (!capable(CAP_SYS_NICE)) { | 4535 | if (!capable(CAP_SYS_NICE)) { |
4536 | if (rt_policy(policy)) { | 4536 | if (rt_policy(policy)) { |
4537 | unsigned long rlim_rtprio; | 4537 | unsigned long rlim_rtprio; |
4538 | 4538 | ||
4539 | if (!lock_task_sighand(p, &flags)) | 4539 | if (!lock_task_sighand(p, &flags)) |
4540 | return -ESRCH; | 4540 | return -ESRCH; |
4541 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; | 4541 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; |
4542 | unlock_task_sighand(p, &flags); | 4542 | unlock_task_sighand(p, &flags); |
4543 | 4543 | ||
4544 | /* can't set/change the rt policy */ | 4544 | /* can't set/change the rt policy */ |
4545 | if (policy != p->policy && !rlim_rtprio) | 4545 | if (policy != p->policy && !rlim_rtprio) |
4546 | return -EPERM; | 4546 | return -EPERM; |
4547 | 4547 | ||
4548 | /* can't increase priority */ | 4548 | /* can't increase priority */ |
4549 | if (param->sched_priority > p->rt_priority && | 4549 | if (param->sched_priority > p->rt_priority && |
4550 | param->sched_priority > rlim_rtprio) | 4550 | param->sched_priority > rlim_rtprio) |
4551 | return -EPERM; | 4551 | return -EPERM; |
4552 | } | 4552 | } |
4553 | /* | 4553 | /* |
4554 | * Like positive nice levels, don't allow tasks to | 4554 | * Like positive nice levels, don't allow tasks to |
4555 | * move out of SCHED_IDLE either: | 4555 | * move out of SCHED_IDLE either: |
4556 | */ | 4556 | */ |
4557 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | 4557 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) |
4558 | return -EPERM; | 4558 | return -EPERM; |
4559 | 4559 | ||
4560 | /* can't change other user's priorities */ | 4560 | /* can't change other user's priorities */ |
4561 | if ((current->euid != p->euid) && | 4561 | if ((current->euid != p->euid) && |
4562 | (current->euid != p->uid)) | 4562 | (current->euid != p->uid)) |
4563 | return -EPERM; | 4563 | return -EPERM; |
4564 | } | 4564 | } |
4565 | 4565 | ||
4566 | retval = security_task_setscheduler(p, policy, param); | 4566 | retval = security_task_setscheduler(p, policy, param); |
4567 | if (retval) | 4567 | if (retval) |
4568 | return retval; | 4568 | return retval; |
4569 | /* | 4569 | /* |
4570 | * make sure no PI-waiters arrive (or leave) while we are | 4570 | * make sure no PI-waiters arrive (or leave) while we are |
4571 | * changing the priority of the task: | 4571 | * changing the priority of the task: |
4572 | */ | 4572 | */ |
4573 | spin_lock_irqsave(&p->pi_lock, flags); | 4573 | spin_lock_irqsave(&p->pi_lock, flags); |
4574 | /* | 4574 | /* |
4575 | * To be able to change p->policy safely, the appropriate | 4575 | * To be able to change p->policy safely, the appropriate |
4576 | * runqueue lock must be held. | 4576 | * runqueue lock must be held. |
4577 | */ | 4577 | */ |
4578 | rq = __task_rq_lock(p); | 4578 | rq = __task_rq_lock(p); |
4579 | /* recheck policy now with rq lock held */ | 4579 | /* recheck policy now with rq lock held */ |
4580 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4580 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4581 | policy = oldpolicy = -1; | 4581 | policy = oldpolicy = -1; |
4582 | __task_rq_unlock(rq); | 4582 | __task_rq_unlock(rq); |
4583 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4583 | spin_unlock_irqrestore(&p->pi_lock, flags); |
4584 | goto recheck; | 4584 | goto recheck; |
4585 | } | 4585 | } |
4586 | update_rq_clock(rq); | 4586 | update_rq_clock(rq); |
4587 | on_rq = p->se.on_rq; | 4587 | on_rq = p->se.on_rq; |
4588 | running = task_current(rq, p); | 4588 | running = task_current(rq, p); |
4589 | if (on_rq) { | 4589 | if (on_rq) { |
4590 | deactivate_task(rq, p, 0); | 4590 | deactivate_task(rq, p, 0); |
4591 | if (running) | 4591 | if (running) |
4592 | p->sched_class->put_prev_task(rq, p); | 4592 | p->sched_class->put_prev_task(rq, p); |
4593 | } | 4593 | } |
4594 | 4594 | ||
4595 | oldprio = p->prio; | 4595 | oldprio = p->prio; |
4596 | __setscheduler(rq, p, policy, param->sched_priority); | 4596 | __setscheduler(rq, p, policy, param->sched_priority); |
4597 | 4597 | ||
4598 | if (on_rq) { | 4598 | if (on_rq) { |
4599 | if (running) | 4599 | if (running) |
4600 | p->sched_class->set_curr_task(rq); | 4600 | p->sched_class->set_curr_task(rq); |
4601 | 4601 | ||
4602 | activate_task(rq, p, 0); | 4602 | activate_task(rq, p, 0); |
4603 | 4603 | ||
4604 | check_class_changed(rq, p, prev_class, oldprio, running); | 4604 | check_class_changed(rq, p, prev_class, oldprio, running); |
4605 | } | 4605 | } |
4606 | __task_rq_unlock(rq); | 4606 | __task_rq_unlock(rq); |
4607 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4607 | spin_unlock_irqrestore(&p->pi_lock, flags); |
4608 | 4608 | ||
4609 | rt_mutex_adjust_pi(p); | 4609 | rt_mutex_adjust_pi(p); |
4610 | 4610 | ||
4611 | return 0; | 4611 | return 0; |
4612 | } | 4612 | } |
4613 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 4613 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
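The function above backs the sched_setscheduler(2) system call; a minimal userspace sketch that switches the calling process to SCHED_FIFO (priority 10 is an arbitrary example and, per the checks above, needs CAP_SYS_NICE or a sufficient RLIMIT_RTPRIO):

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0) {	/* 0 == self */
		perror("sched_setscheduler");
		return 1;
	}
	printf("policy is now %d (SCHED_FIFO == %d)\n",
	       sched_getscheduler(0), SCHED_FIFO);
	return 0;
}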
4614 | 4614 | ||
4615 | static int | 4615 | static int |
4616 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 4616 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
4617 | { | 4617 | { |
4618 | struct sched_param lparam; | 4618 | struct sched_param lparam; |
4619 | struct task_struct *p; | 4619 | struct task_struct *p; |
4620 | int retval; | 4620 | int retval; |
4621 | 4621 | ||
4622 | if (!param || pid < 0) | 4622 | if (!param || pid < 0) |
4623 | return -EINVAL; | 4623 | return -EINVAL; |
4624 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) | 4624 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) |
4625 | return -EFAULT; | 4625 | return -EFAULT; |
4626 | 4626 | ||
4627 | rcu_read_lock(); | 4627 | rcu_read_lock(); |
4628 | retval = -ESRCH; | 4628 | retval = -ESRCH; |
4629 | p = find_process_by_pid(pid); | 4629 | p = find_process_by_pid(pid); |
4630 | if (p != NULL) | 4630 | if (p != NULL) |
4631 | retval = sched_setscheduler(p, policy, &lparam); | 4631 | retval = sched_setscheduler(p, policy, &lparam); |
4632 | rcu_read_unlock(); | 4632 | rcu_read_unlock(); |
4633 | 4633 | ||
4634 | return retval; | 4634 | return retval; |
4635 | } | 4635 | } |
4636 | 4636 | ||
4637 | /** | 4637 | /** |
4638 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority | 4638 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority |
4639 | * @pid: the pid in question. | 4639 | * @pid: the pid in question. |
4640 | * @policy: new policy. | 4640 | * @policy: new policy. |
4641 | * @param: structure containing the new RT priority. | 4641 | * @param: structure containing the new RT priority. |
4642 | */ | 4642 | */ |
4643 | asmlinkage long | 4643 | asmlinkage long |
4644 | sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 4644 | sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
4645 | { | 4645 | { |
4646 | /* negative values for policy are not valid */ | 4646 | /* negative values for policy are not valid */ |
4647 | if (policy < 0) | 4647 | if (policy < 0) |
4648 | return -EINVAL; | 4648 | return -EINVAL; |
4649 | 4649 | ||
4650 | return do_sched_setscheduler(pid, policy, param); | 4650 | return do_sched_setscheduler(pid, policy, param); |
4651 | } | 4651 | } |
4652 | 4652 | ||
4653 | /** | 4653 | /** |
4654 | * sys_sched_setparam - set/change the RT priority of a thread | 4654 | * sys_sched_setparam - set/change the RT priority of a thread |
4655 | * @pid: the pid in question. | 4655 | * @pid: the pid in question. |
4656 | * @param: structure containing the new RT priority. | 4656 | * @param: structure containing the new RT priority. |
4657 | */ | 4657 | */ |
4658 | asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | 4658 | asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) |
4659 | { | 4659 | { |
4660 | return do_sched_setscheduler(pid, -1, param); | 4660 | return do_sched_setscheduler(pid, -1, param); |
4661 | } | 4661 | } |
4662 | 4662 | ||
4663 | /** | 4663 | /** |
4664 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 4664 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
4665 | * @pid: the pid in question. | 4665 | * @pid: the pid in question. |
4666 | */ | 4666 | */ |
4667 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4667 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
4668 | { | 4668 | { |
4669 | struct task_struct *p; | 4669 | struct task_struct *p; |
4670 | int retval; | 4670 | int retval; |
4671 | 4671 | ||
4672 | if (pid < 0) | 4672 | if (pid < 0) |
4673 | return -EINVAL; | 4673 | return -EINVAL; |
4674 | 4674 | ||
4675 | retval = -ESRCH; | 4675 | retval = -ESRCH; |
4676 | read_lock(&tasklist_lock); | 4676 | read_lock(&tasklist_lock); |
4677 | p = find_process_by_pid(pid); | 4677 | p = find_process_by_pid(pid); |
4678 | if (p) { | 4678 | if (p) { |
4679 | retval = security_task_getscheduler(p); | 4679 | retval = security_task_getscheduler(p); |
4680 | if (!retval) | 4680 | if (!retval) |
4681 | retval = p->policy; | 4681 | retval = p->policy; |
4682 | } | 4682 | } |
4683 | read_unlock(&tasklist_lock); | 4683 | read_unlock(&tasklist_lock); |
4684 | return retval; | 4684 | return retval; |
4685 | } | 4685 | } |
4686 | 4686 | ||
4687 | /** | 4687 | /** |
4688 | * sys_sched_getparam - get the RT priority of a thread | 4688 | * sys_sched_getparam - get the RT priority of a thread |
4689 | * @pid: the pid in question. | 4689 | * @pid: the pid in question. |
4690 | * @param: structure containing the RT priority. | 4690 | * @param: structure containing the RT priority. |
4691 | */ | 4691 | */ |
4692 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | 4692 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) |
4693 | { | 4693 | { |
4694 | struct sched_param lp; | 4694 | struct sched_param lp; |
4695 | struct task_struct *p; | 4695 | struct task_struct *p; |
4696 | int retval; | 4696 | int retval; |
4697 | 4697 | ||
4698 | if (!param || pid < 0) | 4698 | if (!param || pid < 0) |
4699 | return -EINVAL; | 4699 | return -EINVAL; |
4700 | 4700 | ||
4701 | read_lock(&tasklist_lock); | 4701 | read_lock(&tasklist_lock); |
4702 | p = find_process_by_pid(pid); | 4702 | p = find_process_by_pid(pid); |
4703 | retval = -ESRCH; | 4703 | retval = -ESRCH; |
4704 | if (!p) | 4704 | if (!p) |
4705 | goto out_unlock; | 4705 | goto out_unlock; |
4706 | 4706 | ||
4707 | retval = security_task_getscheduler(p); | 4707 | retval = security_task_getscheduler(p); |
4708 | if (retval) | 4708 | if (retval) |
4709 | goto out_unlock; | 4709 | goto out_unlock; |
4710 | 4710 | ||
4711 | lp.sched_priority = p->rt_priority; | 4711 | lp.sched_priority = p->rt_priority; |
4712 | read_unlock(&tasklist_lock); | 4712 | read_unlock(&tasklist_lock); |
4713 | 4713 | ||
4714 | /* | 4714 | /* |
4715 | * This one might sleep, we cannot do it with a spinlock held ... | 4715 | * This one might sleep, we cannot do it with a spinlock held ... |
4716 | */ | 4716 | */ |
4717 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | 4717 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; |
4718 | 4718 | ||
4719 | return retval; | 4719 | return retval; |
4720 | 4720 | ||
4721 | out_unlock: | 4721 | out_unlock: |
4722 | read_unlock(&tasklist_lock); | 4722 | read_unlock(&tasklist_lock); |
4723 | return retval; | 4723 | return retval; |
4724 | } | 4724 | } |
4725 | 4725 | ||
4726 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 4726 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) |
4727 | { | 4727 | { |
4728 | cpumask_t cpus_allowed; | 4728 | cpumask_t cpus_allowed; |
4729 | struct task_struct *p; | 4729 | struct task_struct *p; |
4730 | int retval; | 4730 | int retval; |
4731 | 4731 | ||
4732 | get_online_cpus(); | 4732 | get_online_cpus(); |
4733 | read_lock(&tasklist_lock); | 4733 | read_lock(&tasklist_lock); |
4734 | 4734 | ||
4735 | p = find_process_by_pid(pid); | 4735 | p = find_process_by_pid(pid); |
4736 | if (!p) { | 4736 | if (!p) { |
4737 | read_unlock(&tasklist_lock); | 4737 | read_unlock(&tasklist_lock); |
4738 | put_online_cpus(); | 4738 | put_online_cpus(); |
4739 | return -ESRCH; | 4739 | return -ESRCH; |
4740 | } | 4740 | } |
4741 | 4741 | ||
4742 | /* | 4742 | /* |
4743 | * It is not safe to call set_cpus_allowed with the | 4743 | * It is not safe to call set_cpus_allowed with the |
4744 | * tasklist_lock held. We will bump the task_struct's | 4744 | * tasklist_lock held. We will bump the task_struct's |
4745 | * usage count and then drop tasklist_lock. | 4745 | * usage count and then drop tasklist_lock. |
4746 | */ | 4746 | */ |
4747 | get_task_struct(p); | 4747 | get_task_struct(p); |
4748 | read_unlock(&tasklist_lock); | 4748 | read_unlock(&tasklist_lock); |
4749 | 4749 | ||
4750 | retval = -EPERM; | 4750 | retval = -EPERM; |
4751 | if ((current->euid != p->euid) && (current->euid != p->uid) && | 4751 | if ((current->euid != p->euid) && (current->euid != p->uid) && |
4752 | !capable(CAP_SYS_NICE)) | 4752 | !capable(CAP_SYS_NICE)) |
4753 | goto out_unlock; | 4753 | goto out_unlock; |
4754 | 4754 | ||
4755 | retval = security_task_setscheduler(p, 0, NULL); | 4755 | retval = security_task_setscheduler(p, 0, NULL); |
4756 | if (retval) | 4756 | if (retval) |
4757 | goto out_unlock; | 4757 | goto out_unlock; |
4758 | 4758 | ||
4759 | cpus_allowed = cpuset_cpus_allowed(p); | 4759 | cpus_allowed = cpuset_cpus_allowed(p); |
4760 | cpus_and(new_mask, new_mask, cpus_allowed); | 4760 | cpus_and(new_mask, new_mask, cpus_allowed); |
4761 | again: | 4761 | again: |
4762 | retval = set_cpus_allowed(p, new_mask); | 4762 | retval = set_cpus_allowed(p, new_mask); |
4763 | 4763 | ||
4764 | if (!retval) { | 4764 | if (!retval) { |
4765 | cpus_allowed = cpuset_cpus_allowed(p); | 4765 | cpus_allowed = cpuset_cpus_allowed(p); |
4766 | if (!cpus_subset(new_mask, cpus_allowed)) { | 4766 | if (!cpus_subset(new_mask, cpus_allowed)) { |
4767 | /* | 4767 | /* |
4768 | * We must have raced with a concurrent cpuset | 4768 | * We must have raced with a concurrent cpuset |
4769 | * update. Just reset the cpus_allowed to the | 4769 | * update. Just reset the cpus_allowed to the |
4770 | * cpuset's cpus_allowed | 4770 | * cpuset's cpus_allowed |
4771 | */ | 4771 | */ |
4772 | new_mask = cpus_allowed; | 4772 | new_mask = cpus_allowed; |
4773 | goto again; | 4773 | goto again; |
4774 | } | 4774 | } |
4775 | } | 4775 | } |
4776 | out_unlock: | 4776 | out_unlock: |
4777 | put_task_struct(p); | 4777 | put_task_struct(p); |
4778 | put_online_cpus(); | 4778 | put_online_cpus(); |
4779 | return retval; | 4779 | return retval; |
4780 | } | 4780 | } |
4781 | 4781 | ||
4782 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | 4782 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, |
4783 | cpumask_t *new_mask) | 4783 | cpumask_t *new_mask) |
4784 | { | 4784 | { |
4785 | if (len < sizeof(cpumask_t)) { | 4785 | if (len < sizeof(cpumask_t)) { |
4786 | memset(new_mask, 0, sizeof(cpumask_t)); | 4786 | memset(new_mask, 0, sizeof(cpumask_t)); |
4787 | } else if (len > sizeof(cpumask_t)) { | 4787 | } else if (len > sizeof(cpumask_t)) { |
4788 | len = sizeof(cpumask_t); | 4788 | len = sizeof(cpumask_t); |
4789 | } | 4789 | } |
4790 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; | 4790 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; |
4791 | } | 4791 | } |
4792 | 4792 | ||
4793 | /** | 4793 | /** |
4794 | * sys_sched_setaffinity - set the cpu affinity of a process | 4794 | * sys_sched_setaffinity - set the cpu affinity of a process |
4795 | * @pid: pid of the process | 4795 | * @pid: pid of the process |
4796 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4796 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4797 | * @user_mask_ptr: user-space pointer to the new cpu mask | 4797 | * @user_mask_ptr: user-space pointer to the new cpu mask |
4798 | */ | 4798 | */ |
4799 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | 4799 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, |
4800 | unsigned long __user *user_mask_ptr) | 4800 | unsigned long __user *user_mask_ptr) |
4801 | { | 4801 | { |
4802 | cpumask_t new_mask; | 4802 | cpumask_t new_mask; |
4803 | int retval; | 4803 | int retval; |
4804 | 4804 | ||
4805 | retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); | 4805 | retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); |
4806 | if (retval) | 4806 | if (retval) |
4807 | return retval; | 4807 | return retval; |
4808 | 4808 | ||
4809 | return sched_setaffinity(pid, new_mask); | 4809 | return sched_setaffinity(pid, new_mask); |
4810 | } | 4810 | } |
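For illustration, userspace reaches this syscall through glibc's sched_setaffinity() wrapper and a cpu_set_t; a short sketch pinning the calling process to CPU 0 and reading the mask back:

#define _GNU_SOURCE		/* for CPU_SET() and friends */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);				/* CPU 0 only */

	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_setaffinity");
		return 1;
	}

	CPU_ZERO(&mask);
	if (sched_getaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_getaffinity");
		return 1;
	}
	printf("pinned to CPU 0: %s\n", CPU_ISSET(0, &mask) ? "yes" : "no");
	return 0;
}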
4811 | 4811 | ||
4812 | /* | 4812 | /* |
4813 | * Represents all CPUs present in the system. | 4813 | * Represents all CPUs present in the system. |
4814 | * In systems capable of hotplug, this map could dynamically grow | 4814 | * In systems capable of hotplug, this map could dynamically grow |
4815 | * as new CPUs are detected in the system via any platform-specific | 4815 | * as new CPUs are detected in the system via any platform-specific |
4816 | * method, such as ACPI. | 4816 | * method, such as ACPI. |
4817 | */ | 4817 | */ |
4818 | 4818 | ||
4819 | cpumask_t cpu_present_map __read_mostly; | 4819 | cpumask_t cpu_present_map __read_mostly; |
4820 | EXPORT_SYMBOL(cpu_present_map); | 4820 | EXPORT_SYMBOL(cpu_present_map); |
4821 | 4821 | ||
4822 | #ifndef CONFIG_SMP | 4822 | #ifndef CONFIG_SMP |
4823 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | 4823 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; |
4824 | EXPORT_SYMBOL(cpu_online_map); | 4824 | EXPORT_SYMBOL(cpu_online_map); |
4825 | 4825 | ||
4826 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | 4826 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; |
4827 | EXPORT_SYMBOL(cpu_possible_map); | 4827 | EXPORT_SYMBOL(cpu_possible_map); |
4828 | #endif | 4828 | #endif |
4829 | 4829 | ||
4830 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 4830 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
4831 | { | 4831 | { |
4832 | struct task_struct *p; | 4832 | struct task_struct *p; |
4833 | int retval; | 4833 | int retval; |
4834 | 4834 | ||
4835 | get_online_cpus(); | 4835 | get_online_cpus(); |
4836 | read_lock(&tasklist_lock); | 4836 | read_lock(&tasklist_lock); |
4837 | 4837 | ||
4838 | retval = -ESRCH; | 4838 | retval = -ESRCH; |
4839 | p = find_process_by_pid(pid); | 4839 | p = find_process_by_pid(pid); |
4840 | if (!p) | 4840 | if (!p) |
4841 | goto out_unlock; | 4841 | goto out_unlock; |
4842 | 4842 | ||
4843 | retval = security_task_getscheduler(p); | 4843 | retval = security_task_getscheduler(p); |
4844 | if (retval) | 4844 | if (retval) |
4845 | goto out_unlock; | 4845 | goto out_unlock; |
4846 | 4846 | ||
4847 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); | 4847 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); |
4848 | 4848 | ||
4849 | out_unlock: | 4849 | out_unlock: |
4850 | read_unlock(&tasklist_lock); | 4850 | read_unlock(&tasklist_lock); |
4851 | put_online_cpus(); | 4851 | put_online_cpus(); |
4852 | 4852 | ||
4853 | return retval; | 4853 | return retval; |
4854 | } | 4854 | } |
4855 | 4855 | ||
4856 | /** | 4856 | /** |
4857 | * sys_sched_getaffinity - get the cpu affinity of a process | 4857 | * sys_sched_getaffinity - get the cpu affinity of a process |
4858 | * @pid: pid of the process | 4858 | * @pid: pid of the process |
4859 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4859 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4860 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 4860 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
4861 | */ | 4861 | */ |
4862 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | 4862 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, |
4863 | unsigned long __user *user_mask_ptr) | 4863 | unsigned long __user *user_mask_ptr) |
4864 | { | 4864 | { |
4865 | int ret; | 4865 | int ret; |
4866 | cpumask_t mask; | 4866 | cpumask_t mask; |
4867 | 4867 | ||
4868 | if (len < sizeof(cpumask_t)) | 4868 | if (len < sizeof(cpumask_t)) |
4869 | return -EINVAL; | 4869 | return -EINVAL; |
4870 | 4870 | ||
4871 | ret = sched_getaffinity(pid, &mask); | 4871 | ret = sched_getaffinity(pid, &mask); |
4872 | if (ret < 0) | 4872 | if (ret < 0) |
4873 | return ret; | 4873 | return ret; |
4874 | 4874 | ||
4875 | if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) | 4875 | if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) |
4876 | return -EFAULT; | 4876 | return -EFAULT; |
4877 | 4877 | ||
4878 | return sizeof(cpumask_t); | 4878 | return sizeof(cpumask_t); |
4879 | } | 4879 | } |
4880 | 4880 | ||
4881 | /** | 4881 | /** |
4882 | * sys_sched_yield - yield the current processor to other threads. | 4882 | * sys_sched_yield - yield the current processor to other threads. |
4883 | * | 4883 | * |
4884 | * This function yields the current CPU to other tasks. If there are no | 4884 | * This function yields the current CPU to other tasks. If there are no |
4885 | * other threads running on this CPU then this function will return. | 4885 | * other threads running on this CPU then this function will return. |
4886 | */ | 4886 | */ |
4887 | asmlinkage long sys_sched_yield(void) | 4887 | asmlinkage long sys_sched_yield(void) |
4888 | { | 4888 | { |
4889 | struct rq *rq = this_rq_lock(); | 4889 | struct rq *rq = this_rq_lock(); |
4890 | 4890 | ||
4891 | schedstat_inc(rq, yld_count); | 4891 | schedstat_inc(rq, yld_count); |
4892 | current->sched_class->yield_task(rq); | 4892 | current->sched_class->yield_task(rq); |
4893 | 4893 | ||
4894 | /* | 4894 | /* |
4895 | * Since we are going to call schedule() anyway, there's | 4895 | * Since we are going to call schedule() anyway, there's |
4896 | * no need to preempt or enable interrupts: | 4896 | * no need to preempt or enable interrupts: |
4897 | */ | 4897 | */ |
4898 | __release(rq->lock); | 4898 | __release(rq->lock); |
4899 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 4899 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
4900 | _raw_spin_unlock(&rq->lock); | 4900 | _raw_spin_unlock(&rq->lock); |
4901 | preempt_enable_no_resched(); | 4901 | preempt_enable_no_resched(); |
4902 | 4902 | ||
4903 | schedule(); | 4903 | schedule(); |
4904 | 4904 | ||
4905 | return 0; | 4905 | return 0; |
4906 | } | 4906 | } |
4907 | 4907 | ||
4908 | static void __cond_resched(void) | 4908 | static void __cond_resched(void) |
4909 | { | 4909 | { |
4910 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 4910 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
4911 | __might_sleep(__FILE__, __LINE__); | 4911 | __might_sleep(__FILE__, __LINE__); |
4912 | #endif | 4912 | #endif |
4913 | /* | 4913 | /* |
4914 | * The BKS might be reacquired before we have dropped | 4914 | * The BKS might be reacquired before we have dropped |
4915 | * PREEMPT_ACTIVE, which could trigger a second | 4915 | * PREEMPT_ACTIVE, which could trigger a second |
4916 | * cond_resched() call. | 4916 | * cond_resched() call. |
4917 | */ | 4917 | */ |
4918 | do { | 4918 | do { |
4919 | add_preempt_count(PREEMPT_ACTIVE); | 4919 | add_preempt_count(PREEMPT_ACTIVE); |
4920 | schedule(); | 4920 | schedule(); |
4921 | sub_preempt_count(PREEMPT_ACTIVE); | 4921 | sub_preempt_count(PREEMPT_ACTIVE); |
4922 | } while (need_resched()); | 4922 | } while (need_resched()); |
4923 | } | 4923 | } |
4924 | 4924 | ||
4925 | #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) | 4925 | #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) |
4926 | int __sched _cond_resched(void) | 4926 | int __sched _cond_resched(void) |
4927 | { | 4927 | { |
4928 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && | 4928 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && |
4929 | system_state == SYSTEM_RUNNING) { | 4929 | system_state == SYSTEM_RUNNING) { |
4930 | __cond_resched(); | 4930 | __cond_resched(); |
4931 | return 1; | 4931 | return 1; |
4932 | } | 4932 | } |
4933 | return 0; | 4933 | return 0; |
4934 | } | 4934 | } |
4935 | EXPORT_SYMBOL(_cond_resched); | 4935 | EXPORT_SYMBOL(_cond_resched); |
4936 | #endif | 4936 | #endif |
4937 | 4937 | ||
4938 | /* | 4938 | /* |
4939 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4939 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, |
4940 | * call schedule, and on return reacquire the lock. | 4940 | * call schedule, and on return reacquire the lock. |
4941 | * | 4941 | * |
4942 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 4942 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
4943 | * operations here to prevent schedule() from being called twice (once via | 4943 | * operations here to prevent schedule() from being called twice (once via |
4944 | * spin_unlock(), once by hand). | 4944 | * spin_unlock(), once by hand). |
4945 | */ | 4945 | */ |
4946 | int cond_resched_lock(spinlock_t *lock) | 4946 | int cond_resched_lock(spinlock_t *lock) |
4947 | { | 4947 | { |
4948 | int resched = need_resched() && system_state == SYSTEM_RUNNING; | ||
4948 | int ret = 0; | 4949 | int ret = 0; |
4949 | 4950 | ||
4950 | if (need_lockbreak(lock)) { | 4951 | if (spin_needbreak(lock) || resched) { |
4951 | spin_unlock(lock); | 4952 | spin_unlock(lock); |
4952 | cpu_relax(); | 4953 | if (resched && need_resched()) |
4953 | ret = 1; | 4954 | __cond_resched(); |
4954 | spin_lock(lock); | 4955 | else |
4955 | } | 4956 | cpu_relax(); |
4956 | if (need_resched() && system_state == SYSTEM_RUNNING) { | ||
4957 | spin_release(&lock->dep_map, 1, _THIS_IP_); | ||
4958 | _raw_spin_unlock(lock); | ||
4959 | preempt_enable_no_resched(); | ||
4960 | __cond_resched(); | ||
4961 | ret = 1; | 4957 | ret = 1; |
4962 | spin_lock(lock); | 4958 | spin_lock(lock); |
4963 | } | 4959 | } |
4964 | return ret; | 4960 | return ret; |
4965 | } | 4961 | } |
4966 | EXPORT_SYMBOL(cond_resched_lock); | 4962 | EXPORT_SYMBOL(cond_resched_lock); |
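The rewritten cond_resched_lock() above is the main user of the new spin_needbreak(): long walks under a single spinlock drop the lock only when another CPU is actually contending it or a reschedule is pending, instead of the old unconditional break_lock/trylock dance. A simplified, hypothetical kernel-style caller; my_table, nr_slots and examine_slot() are made up for illustration:

static void scan_table(struct my_table *t)
{
	int i;

	spin_lock(&t->lock);
	for (i = 0; i < t->nr_slots; i++) {
		examine_slot(t, i);
		/*
		 * Drops and retakes t->lock (returning 1) if someone
		 * else is spinning on it or we need to reschedule.
		 * The integer index survives that window; any cached
		 * pointers into the table would have to be revalidated.
		 */
		cond_resched_lock(&t->lock);
	}
	spin_unlock(&t->lock);
}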
4967 | 4963 | ||
4968 | int __sched cond_resched_softirq(void) | 4964 | int __sched cond_resched_softirq(void) |
4969 | { | 4965 | { |
4970 | BUG_ON(!in_softirq()); | 4966 | BUG_ON(!in_softirq()); |
4971 | 4967 | ||
4972 | if (need_resched() && system_state == SYSTEM_RUNNING) { | 4968 | if (need_resched() && system_state == SYSTEM_RUNNING) { |
4973 | local_bh_enable(); | 4969 | local_bh_enable(); |
4974 | __cond_resched(); | 4970 | __cond_resched(); |
4975 | local_bh_disable(); | 4971 | local_bh_disable(); |
4976 | return 1; | 4972 | return 1; |
4977 | } | 4973 | } |
4978 | return 0; | 4974 | return 0; |
4979 | } | 4975 | } |
4980 | EXPORT_SYMBOL(cond_resched_softirq); | 4976 | EXPORT_SYMBOL(cond_resched_softirq); |
4981 | 4977 | ||
4982 | /** | 4978 | /** |
4983 | * yield - yield the current processor to other threads. | 4979 | * yield - yield the current processor to other threads. |
4984 | * | 4980 | * |
4985 | * This is a shortcut for kernel-space yielding - it marks the | 4981 | * This is a shortcut for kernel-space yielding - it marks the |
4986 | * thread runnable and calls sys_sched_yield(). | 4982 | * thread runnable and calls sys_sched_yield(). |
4987 | */ | 4983 | */ |
4988 | void __sched yield(void) | 4984 | void __sched yield(void) |
4989 | { | 4985 | { |
4990 | set_current_state(TASK_RUNNING); | 4986 | set_current_state(TASK_RUNNING); |
4991 | sys_sched_yield(); | 4987 | sys_sched_yield(); |
4992 | } | 4988 | } |
4993 | EXPORT_SYMBOL(yield); | 4989 | EXPORT_SYMBOL(yield); |
4994 | 4990 | ||
4995 | /* | 4991 | /* |
4996 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 4992 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
4997 | * that process accounting knows that this is a task in IO wait state. | 4993 | * that process accounting knows that this is a task in IO wait state. |
4998 | * | 4994 | * |
4999 | * But don't do that if it is a deliberate, throttling IO wait (this task | 4995 | * But don't do that if it is a deliberate, throttling IO wait (this task |
5000 | * has set its backing_dev_info: the queue against which it should throttle) | 4996 | * has set its backing_dev_info: the queue against which it should throttle) |
5001 | */ | 4997 | */ |
5002 | void __sched io_schedule(void) | 4998 | void __sched io_schedule(void) |
5003 | { | 4999 | { |
5004 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 5000 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
5005 | 5001 | ||
5006 | delayacct_blkio_start(); | 5002 | delayacct_blkio_start(); |
5007 | atomic_inc(&rq->nr_iowait); | 5003 | atomic_inc(&rq->nr_iowait); |
5008 | schedule(); | 5004 | schedule(); |
5009 | atomic_dec(&rq->nr_iowait); | 5005 | atomic_dec(&rq->nr_iowait); |
5010 | delayacct_blkio_end(); | 5006 | delayacct_blkio_end(); |
5011 | } | 5007 | } |
5012 | EXPORT_SYMBOL(io_schedule); | 5008 | EXPORT_SYMBOL(io_schedule); |
5013 | 5009 | ||
5014 | long __sched io_schedule_timeout(long timeout) | 5010 | long __sched io_schedule_timeout(long timeout) |
5015 | { | 5011 | { |
5016 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 5012 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
5017 | long ret; | 5013 | long ret; |
5018 | 5014 | ||
5019 | delayacct_blkio_start(); | 5015 | delayacct_blkio_start(); |
5020 | atomic_inc(&rq->nr_iowait); | 5016 | atomic_inc(&rq->nr_iowait); |
5021 | ret = schedule_timeout(timeout); | 5017 | ret = schedule_timeout(timeout); |
5022 | atomic_dec(&rq->nr_iowait); | 5018 | atomic_dec(&rq->nr_iowait); |
5023 | delayacct_blkio_end(); | 5019 | delayacct_blkio_end(); |
5024 | return ret; | 5020 | return ret; |
5025 | } | 5021 | } |
5026 | 5022 | ||
5027 | /** | 5023 | /** |
5028 | * sys_sched_get_priority_max - return maximum RT priority. | 5024 | * sys_sched_get_priority_max - return maximum RT priority. |
5029 | * @policy: scheduling class. | 5025 | * @policy: scheduling class. |
5030 | * | 5026 | * |
5031 | * this syscall returns the maximum rt_priority that can be used | 5027 | * this syscall returns the maximum rt_priority that can be used |
5032 | * by a given scheduling class. | 5028 | * by a given scheduling class. |
5033 | */ | 5029 | */ |
5034 | asmlinkage long sys_sched_get_priority_max(int policy) | 5030 | asmlinkage long sys_sched_get_priority_max(int policy) |
5035 | { | 5031 | { |
5036 | int ret = -EINVAL; | 5032 | int ret = -EINVAL; |
5037 | 5033 | ||
5038 | switch (policy) { | 5034 | switch (policy) { |
5039 | case SCHED_FIFO: | 5035 | case SCHED_FIFO: |
5040 | case SCHED_RR: | 5036 | case SCHED_RR: |
5041 | ret = MAX_USER_RT_PRIO-1; | 5037 | ret = MAX_USER_RT_PRIO-1; |
5042 | break; | 5038 | break; |
5043 | case SCHED_NORMAL: | 5039 | case SCHED_NORMAL: |
5044 | case SCHED_BATCH: | 5040 | case SCHED_BATCH: |
5045 | case SCHED_IDLE: | 5041 | case SCHED_IDLE: |
5046 | ret = 0; | 5042 | ret = 0; |
5047 | break; | 5043 | break; |
5048 | } | 5044 | } |
5049 | return ret; | 5045 | return ret; |
5050 | } | 5046 | } |
5051 | 5047 | ||
5052 | /** | 5048 | /** |
5053 | * sys_sched_get_priority_min - return minimum RT priority. | 5049 | * sys_sched_get_priority_min - return minimum RT priority. |
5054 | * @policy: scheduling class. | 5050 | * @policy: scheduling class. |
5055 | * | 5051 | * |
5056 | * this syscall returns the minimum rt_priority that can be used | 5052 | * this syscall returns the minimum rt_priority that can be used |
5057 | * by a given scheduling class. | 5053 | * by a given scheduling class. |
5058 | */ | 5054 | */ |
5059 | asmlinkage long sys_sched_get_priority_min(int policy) | 5055 | asmlinkage long sys_sched_get_priority_min(int policy) |
5060 | { | 5056 | { |
5061 | int ret = -EINVAL; | 5057 | int ret = -EINVAL; |
5062 | 5058 | ||
5063 | switch (policy) { | 5059 | switch (policy) { |
5064 | case SCHED_FIFO: | 5060 | case SCHED_FIFO: |
5065 | case SCHED_RR: | 5061 | case SCHED_RR: |
5066 | ret = 1; | 5062 | ret = 1; |
5067 | break; | 5063 | break; |
5068 | case SCHED_NORMAL: | 5064 | case SCHED_NORMAL: |
5069 | case SCHED_BATCH: | 5065 | case SCHED_BATCH: |
5070 | case SCHED_IDLE: | 5066 | case SCHED_IDLE: |
5071 | ret = 0; | 5067 | ret = 0; |
5072 | } | 5068 | } |
5073 | return ret; | 5069 | return ret; |
5074 | } | 5070 | } |
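Rather than hard-coding the 1..MAX_USER_RT_PRIO-1 range, userspace can query these two syscalls; a brief example:

#include <sched.h>
#include <stdio.h>

int main(void)
{
	printf("SCHED_FIFO priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
	printf("SCHED_RR   priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_RR),
	       sched_get_priority_max(SCHED_RR));
	return 0;
}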
5075 | 5071 | ||
5076 | /** | 5072 | /** |
5077 | * sys_sched_rr_get_interval - return the default timeslice of a process. | 5073 | * sys_sched_rr_get_interval - return the default timeslice of a process. |
5078 | * @pid: pid of the process. | 5074 | * @pid: pid of the process. |
5079 | * @interval: userspace pointer to the timeslice value. | 5075 | * @interval: userspace pointer to the timeslice value. |
5080 | * | 5076 | * |
5081 | * this syscall writes the default timeslice value of a given process | 5077 | * this syscall writes the default timeslice value of a given process |
5082 | * into the user-space timespec buffer. A value of '0' means infinity. | 5078 | * into the user-space timespec buffer. A value of '0' means infinity. |
5083 | */ | 5079 | */ |
5084 | asmlinkage | 5080 | asmlinkage |
5085 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 5081 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
5086 | { | 5082 | { |
5087 | struct task_struct *p; | 5083 | struct task_struct *p; |
5088 | unsigned int time_slice; | 5084 | unsigned int time_slice; |
5089 | int retval; | 5085 | int retval; |
5090 | struct timespec t; | 5086 | struct timespec t; |
5091 | 5087 | ||
5092 | if (pid < 0) | 5088 | if (pid < 0) |
5093 | return -EINVAL; | 5089 | return -EINVAL; |
5094 | 5090 | ||
5095 | retval = -ESRCH; | 5091 | retval = -ESRCH; |
5096 | read_lock(&tasklist_lock); | 5092 | read_lock(&tasklist_lock); |
5097 | p = find_process_by_pid(pid); | 5093 | p = find_process_by_pid(pid); |
5098 | if (!p) | 5094 | if (!p) |
5099 | goto out_unlock; | 5095 | goto out_unlock; |
5100 | 5096 | ||
5101 | retval = security_task_getscheduler(p); | 5097 | retval = security_task_getscheduler(p); |
5102 | if (retval) | 5098 | if (retval) |
5103 | goto out_unlock; | 5099 | goto out_unlock; |
5104 | 5100 | ||
5105 | /* | 5101 | /* |
5106 | * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER | 5102 | * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER |
5107 | * tasks that are on an otherwise idle runqueue: | 5103 | * tasks that are on an otherwise idle runqueue: |
5108 | */ | 5104 | */ |
5109 | time_slice = 0; | 5105 | time_slice = 0; |
5110 | if (p->policy == SCHED_RR) { | 5106 | if (p->policy == SCHED_RR) { |
5111 | time_slice = DEF_TIMESLICE; | 5107 | time_slice = DEF_TIMESLICE; |
5112 | } else { | 5108 | } else { |
5113 | struct sched_entity *se = &p->se; | 5109 | struct sched_entity *se = &p->se; |
5114 | unsigned long flags; | 5110 | unsigned long flags; |
5115 | struct rq *rq; | 5111 | struct rq *rq; |
5116 | 5112 | ||
5117 | rq = task_rq_lock(p, &flags); | 5113 | rq = task_rq_lock(p, &flags); |
5118 | if (rq->cfs.load.weight) | 5114 | if (rq->cfs.load.weight) |
5119 | time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); | 5115 | time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); |
5120 | task_rq_unlock(rq, &flags); | 5116 | task_rq_unlock(rq, &flags); |
5121 | } | 5117 | } |
5122 | read_unlock(&tasklist_lock); | 5118 | read_unlock(&tasklist_lock); |
5123 | jiffies_to_timespec(time_slice, &t); | 5119 | jiffies_to_timespec(time_slice, &t); |
5124 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 5120 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
5125 | return retval; | 5121 | return retval; |
5126 | 5122 | ||
5127 | out_unlock: | 5123 | out_unlock: |
5128 | read_unlock(&tasklist_lock); | 5124 | read_unlock(&tasklist_lock); |
5129 | return retval; | 5125 | return retval; |
5130 | } | 5126 | } |
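The matching userspace interface is sched_rr_get_interval(2); a short example printing the caller's timeslice (which, per the comment above, reads back as 0 for SCHED_FIFO tasks and for SCHED_OTHER tasks on an otherwise idle runqueue):

#include <sched.h>
#include <time.h>
#include <stdio.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) != 0) {	/* 0 == self */
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, (long)ts.tv_nsec);
	return 0;
}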
5131 | 5127 | ||
5132 | static const char stat_nam[] = "RSDTtZX"; | 5128 | static const char stat_nam[] = "RSDTtZX"; |
5133 | 5129 | ||
5134 | void sched_show_task(struct task_struct *p) | 5130 | void sched_show_task(struct task_struct *p) |
5135 | { | 5131 | { |
5136 | unsigned long free = 0; | 5132 | unsigned long free = 0; |
5137 | unsigned state; | 5133 | unsigned state; |
5138 | 5134 | ||
5139 | state = p->state ? __ffs(p->state) + 1 : 0; | 5135 | state = p->state ? __ffs(p->state) + 1 : 0; |
5140 | printk(KERN_INFO "%-13.13s %c", p->comm, | 5136 | printk(KERN_INFO "%-13.13s %c", p->comm, |
5141 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5137 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5142 | #if BITS_PER_LONG == 32 | 5138 | #if BITS_PER_LONG == 32 |
5143 | if (state == TASK_RUNNING) | 5139 | if (state == TASK_RUNNING) |
5144 | printk(KERN_CONT " running "); | 5140 | printk(KERN_CONT " running "); |
5145 | else | 5141 | else |
5146 | printk(KERN_CONT " %08lx ", thread_saved_pc(p)); | 5142 | printk(KERN_CONT " %08lx ", thread_saved_pc(p)); |
5147 | #else | 5143 | #else |
5148 | if (state == TASK_RUNNING) | 5144 | if (state == TASK_RUNNING) |
5149 | printk(KERN_CONT " running task "); | 5145 | printk(KERN_CONT " running task "); |
5150 | else | 5146 | else |
5151 | printk(KERN_CONT " %016lx ", thread_saved_pc(p)); | 5147 | printk(KERN_CONT " %016lx ", thread_saved_pc(p)); |
5152 | #endif | 5148 | #endif |
5153 | #ifdef CONFIG_DEBUG_STACK_USAGE | 5149 | #ifdef CONFIG_DEBUG_STACK_USAGE |
5154 | { | 5150 | { |
5155 | unsigned long *n = end_of_stack(p); | 5151 | unsigned long *n = end_of_stack(p); |
5156 | while (!*n) | 5152 | while (!*n) |
5157 | n++; | 5153 | n++; |
5158 | free = (unsigned long)n - (unsigned long)end_of_stack(p); | 5154 | free = (unsigned long)n - (unsigned long)end_of_stack(p); |
5159 | } | 5155 | } |
5160 | #endif | 5156 | #endif |
5161 | printk(KERN_CONT "%5lu %5d %6d\n", free, | 5157 | printk(KERN_CONT "%5lu %5d %6d\n", free, |
5162 | task_pid_nr(p), task_pid_nr(p->real_parent)); | 5158 | task_pid_nr(p), task_pid_nr(p->real_parent)); |
5163 | 5159 | ||
5164 | show_stack(p, NULL); | 5160 | show_stack(p, NULL); |
5165 | } | 5161 | } |
5166 | 5162 | ||
5167 | void show_state_filter(unsigned long state_filter) | 5163 | void show_state_filter(unsigned long state_filter) |
5168 | { | 5164 | { |
5169 | struct task_struct *g, *p; | 5165 | struct task_struct *g, *p; |
5170 | 5166 | ||
5171 | #if BITS_PER_LONG == 32 | 5167 | #if BITS_PER_LONG == 32 |
5172 | printk(KERN_INFO | 5168 | printk(KERN_INFO |
5173 | " task PC stack pid father\n"); | 5169 | " task PC stack pid father\n"); |
5174 | #else | 5170 | #else |
5175 | printk(KERN_INFO | 5171 | printk(KERN_INFO |
5176 | " task PC stack pid father\n"); | 5172 | " task PC stack pid father\n"); |
5177 | #endif | 5173 | #endif |
5178 | read_lock(&tasklist_lock); | 5174 | read_lock(&tasklist_lock); |
5179 | do_each_thread(g, p) { | 5175 | do_each_thread(g, p) { |
5180 | /* | 5176 | /* |
5181 | * reset the NMI-timeout, listing all files on a slow | 5177 | * reset the NMI-timeout, listing all files on a slow |
5182 | * console might take a lot of time: | 5178 | * console might take a lot of time: |
5183 | */ | 5179 | */ |
5184 | touch_nmi_watchdog(); | 5180 | touch_nmi_watchdog(); |
5185 | if (!state_filter || (p->state & state_filter)) | 5181 | if (!state_filter || (p->state & state_filter)) |
5186 | sched_show_task(p); | 5182 | sched_show_task(p); |
5187 | } while_each_thread(g, p); | 5183 | } while_each_thread(g, p); |
5188 | 5184 | ||
5189 | touch_all_softlockup_watchdogs(); | 5185 | touch_all_softlockup_watchdogs(); |
5190 | 5186 | ||
5191 | #ifdef CONFIG_SCHED_DEBUG | 5187 | #ifdef CONFIG_SCHED_DEBUG |
5192 | sysrq_sched_debug_show(); | 5188 | sysrq_sched_debug_show(); |
5193 | #endif | 5189 | #endif |
5194 | read_unlock(&tasklist_lock); | 5190 | read_unlock(&tasklist_lock); |
5195 | /* | 5191 | /* |
5196 | * Only show locks if all tasks are dumped: | 5192 | * Only show locks if all tasks are dumped: |
5197 | */ | 5193 | */ |
5198 | if (state_filter == -1) | 5194 | if (state_filter == -1) |
5199 | debug_show_all_locks(); | 5195 | debug_show_all_locks(); |
5200 | } | 5196 | } |
5201 | 5197 | ||
5202 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) | 5198 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) |
5203 | { | 5199 | { |
5204 | idle->sched_class = &idle_sched_class; | 5200 | idle->sched_class = &idle_sched_class; |
5205 | } | 5201 | } |
5206 | 5202 | ||
5207 | /** | 5203 | /** |
5208 | * init_idle - set up an idle thread for a given CPU | 5204 | * init_idle - set up an idle thread for a given CPU |
5209 | * @idle: task in question | 5205 | * @idle: task in question |
5210 | * @cpu: cpu the idle task belongs to | 5206 | * @cpu: cpu the idle task belongs to |
5211 | * | 5207 | * |
5212 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 5208 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
5213 | * flag, to make booting more robust. | 5209 | * flag, to make booting more robust. |
5214 | */ | 5210 | */ |
5215 | void __cpuinit init_idle(struct task_struct *idle, int cpu) | 5211 | void __cpuinit init_idle(struct task_struct *idle, int cpu) |
5216 | { | 5212 | { |
5217 | struct rq *rq = cpu_rq(cpu); | 5213 | struct rq *rq = cpu_rq(cpu); |
5218 | unsigned long flags; | 5214 | unsigned long flags; |
5219 | 5215 | ||
5220 | __sched_fork(idle); | 5216 | __sched_fork(idle); |
5221 | idle->se.exec_start = sched_clock(); | 5217 | idle->se.exec_start = sched_clock(); |
5222 | 5218 | ||
5223 | idle->prio = idle->normal_prio = MAX_PRIO; | 5219 | idle->prio = idle->normal_prio = MAX_PRIO; |
5224 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 5220 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
5225 | __set_task_cpu(idle, cpu); | 5221 | __set_task_cpu(idle, cpu); |
5226 | 5222 | ||
5227 | spin_lock_irqsave(&rq->lock, flags); | 5223 | spin_lock_irqsave(&rq->lock, flags); |
5228 | rq->curr = rq->idle = idle; | 5224 | rq->curr = rq->idle = idle; |
5229 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5225 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
5230 | idle->oncpu = 1; | 5226 | idle->oncpu = 1; |
5231 | #endif | 5227 | #endif |
5232 | spin_unlock_irqrestore(&rq->lock, flags); | 5228 | spin_unlock_irqrestore(&rq->lock, flags); |
5233 | 5229 | ||
5234 | /* Set the preempt count _outside_ the spinlocks! */ | 5230 | /* Set the preempt count _outside_ the spinlocks! */ |
5235 | task_thread_info(idle)->preempt_count = 0; | 5231 | task_thread_info(idle)->preempt_count = 0; |
5236 | 5232 | ||
5237 | /* | 5233 | /* |
5238 | * The idle tasks have their own, simple scheduling class: | 5234 | * The idle tasks have their own, simple scheduling class: |
5239 | */ | 5235 | */ |
5240 | idle->sched_class = &idle_sched_class; | 5236 | idle->sched_class = &idle_sched_class; |
5241 | } | 5237 | } |
5242 | 5238 | ||
5243 | /* | 5239 | /* |
5244 | * In a system that switches off the HZ timer nohz_cpu_mask | 5240 | * In a system that switches off the HZ timer nohz_cpu_mask |
5245 | * indicates which cpus entered this state. This is used | 5241 | * indicates which cpus entered this state. This is used |
5246 | * in the rcu update to wait only for active cpus. For system | 5242 | * in the rcu update to wait only for active cpus. For system |
5247 | * which do not switch off the HZ timer nohz_cpu_mask should | 5243 | * which do not switch off the HZ timer nohz_cpu_mask should |
5248 | * always be CPU_MASK_NONE. | 5244 | * always be CPU_MASK_NONE. |
5249 | */ | 5245 | */ |
5250 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 5246 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
5251 | 5247 | ||
5252 | /* | 5248 | /* |
5253 | * Increase the granularity value when there are more CPUs, | 5249 | * Increase the granularity value when there are more CPUs, |
5254 | * because with more CPUs the 'effective latency' as visible | 5250 | * because with more CPUs the 'effective latency' as visible |
5255 | * to users decreases. But the relationship is not linear, | 5251 | * to users decreases. But the relationship is not linear, |
5256 | * so pick a second-best guess by going with the log2 of the | 5252 | * so pick a second-best guess by going with the log2 of the |
5257 | * number of CPUs. | 5253 | * number of CPUs. |
5258 | * | 5254 | * |
5259 | * This idea comes from the SD scheduler of Con Kolivas: | 5255 | * This idea comes from the SD scheduler of Con Kolivas: |
5260 | */ | 5256 | */ |
5261 | static inline void sched_init_granularity(void) | 5257 | static inline void sched_init_granularity(void) |
5262 | { | 5258 | { |
5263 | unsigned int factor = 1 + ilog2(num_online_cpus()); | 5259 | unsigned int factor = 1 + ilog2(num_online_cpus()); |
5264 | const unsigned long limit = 200000000; | 5260 | const unsigned long limit = 200000000; |
5265 | 5261 | ||
5266 | sysctl_sched_min_granularity *= factor; | 5262 | sysctl_sched_min_granularity *= factor; |
5267 | if (sysctl_sched_min_granularity > limit) | 5263 | if (sysctl_sched_min_granularity > limit) |
5268 | sysctl_sched_min_granularity = limit; | 5264 | sysctl_sched_min_granularity = limit; |
5269 | 5265 | ||
5270 | sysctl_sched_latency *= factor; | 5266 | sysctl_sched_latency *= factor; |
5271 | if (sysctl_sched_latency > limit) | 5267 | if (sysctl_sched_latency > limit) |
5272 | sysctl_sched_latency = limit; | 5268 | sysctl_sched_latency = limit; |
5273 | 5269 | ||
5274 | sysctl_sched_wakeup_granularity *= factor; | 5270 | sysctl_sched_wakeup_granularity *= factor; |
5275 | sysctl_sched_batch_wakeup_granularity *= factor; | 5271 | sysctl_sched_batch_wakeup_granularity *= factor; |
5276 | } | 5272 | } |
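A quick worked example of the scaling described in the comment above sched_init_granularity() (illustrative arithmetic only, not part of the patch):

/*
 * factor = 1 + ilog2(num_online_cpus()), so for example:
 *
 *    1 CPU  -> factor 1  (tunables keep their defaults)
 *    2 CPUs -> factor 2
 *    4 CPUs -> factor 3
 *   16 CPUs -> factor 5
 *
 * sched_min_granularity and sched_latency are multiplied by this factor
 * and clamped at the 200000000 limit; the two wakeup granularities are
 * scaled by the same factor but not clamped.
 */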
5277 | 5273 | ||
5278 | #ifdef CONFIG_SMP | 5274 | #ifdef CONFIG_SMP |
5279 | /* | 5275 | /* |
5280 | * This is how migration works: | 5276 | * This is how migration works: |
5281 | * | 5277 | * |
5282 | * 1) we queue a struct migration_req structure in the source CPU's | 5278 | * 1) we queue a struct migration_req structure in the source CPU's |
5283 | * runqueue and wake up that CPU's migration thread. | 5279 | * runqueue and wake up that CPU's migration thread. |
5284 | * 2) we down() the locked semaphore => thread blocks. | 5280 | * 2) we down() the locked semaphore => thread blocks. |
5285 | * 3) migration thread wakes up (implicitly it forces the migrated | 5281 | * 3) migration thread wakes up (implicitly it forces the migrated |
5286 | * thread off the CPU) | 5282 | * thread off the CPU) |
5287 | * 4) it gets the migration request and checks whether the migrated | 5283 | * 4) it gets the migration request and checks whether the migrated |
5288 | * task is still in the wrong runqueue. | 5284 | * task is still in the wrong runqueue. |
5289 | * 5) if it's in the wrong runqueue then the migration thread removes | 5285 | * 5) if it's in the wrong runqueue then the migration thread removes |
5290 | * it and puts it into the right queue. | 5286 | * it and puts it into the right queue. |
5291 | * 6) migration thread up()s the semaphore. | 5287 | * 6) migration thread up()s the semaphore. |
5292 | * 7) we wake up and the migration is done. | 5288 | * 7) we wake up and the migration is done. |
5293 | */ | 5289 | */ |
5294 | 5290 | ||
5295 | /* | 5291 | /* |
5296 | * Change a given task's CPU affinity. Migrate the thread to a | 5292 | * Change a given task's CPU affinity. Migrate the thread to a |
5297 | * proper CPU and schedule it away if the CPU it's executing on | 5293 | * proper CPU and schedule it away if the CPU it's executing on |
5298 | * is removed from the allowed bitmask. | 5294 | * is removed from the allowed bitmask. |
5299 | * | 5295 | * |
5300 | * NOTE: the caller must have a valid reference to the task, the | 5296 | * NOTE: the caller must have a valid reference to the task, the |
5301 | * task must not exit() & deallocate itself prematurely. The | 5297 | * task must not exit() & deallocate itself prematurely. The |
5302 | * call is not atomic; no spinlocks may be held. | 5298 | * call is not atomic; no spinlocks may be held. |
5303 | */ | 5299 | */ |
5304 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 5300 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
5305 | { | 5301 | { |
5306 | struct migration_req req; | 5302 | struct migration_req req; |
5307 | unsigned long flags; | 5303 | unsigned long flags; |
5308 | struct rq *rq; | 5304 | struct rq *rq; |
5309 | int ret = 0; | 5305 | int ret = 0; |
5310 | 5306 | ||
5311 | rq = task_rq_lock(p, &flags); | 5307 | rq = task_rq_lock(p, &flags); |
5312 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 5308 | if (!cpus_intersects(new_mask, cpu_online_map)) { |
5313 | ret = -EINVAL; | 5309 | ret = -EINVAL; |
5314 | goto out; | 5310 | goto out; |
5315 | } | 5311 | } |
5316 | 5312 | ||
5317 | if (p->sched_class->set_cpus_allowed) | 5313 | if (p->sched_class->set_cpus_allowed) |
5318 | p->sched_class->set_cpus_allowed(p, &new_mask); | 5314 | p->sched_class->set_cpus_allowed(p, &new_mask); |
5319 | else { | 5315 | else { |
5320 | p->cpus_allowed = new_mask; | 5316 | p->cpus_allowed = new_mask; |
5321 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | 5317 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); |
5322 | } | 5318 | } |
5323 | 5319 | ||
5324 | /* Can the task run on the task's current CPU? If so, we're done */ | 5320 | /* Can the task run on the task's current CPU? If so, we're done */ |
5325 | if (cpu_isset(task_cpu(p), new_mask)) | 5321 | if (cpu_isset(task_cpu(p), new_mask)) |
5326 | goto out; | 5322 | goto out; |
5327 | 5323 | ||
5328 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { | 5324 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { |
5329 | /* Need help from migration thread: drop lock and wait. */ | 5325 | /* Need help from migration thread: drop lock and wait. */ |
5330 | task_rq_unlock(rq, &flags); | 5326 | task_rq_unlock(rq, &flags); |
5331 | wake_up_process(rq->migration_thread); | 5327 | wake_up_process(rq->migration_thread); |
5332 | wait_for_completion(&req.done); | 5328 | wait_for_completion(&req.done); |
5333 | tlb_migrate_finish(p->mm); | 5329 | tlb_migrate_finish(p->mm); |
5334 | return 0; | 5330 | return 0; |
5335 | } | 5331 | } |
5336 | out: | 5332 | out: |
5337 | task_rq_unlock(rq, &flags); | 5333 | task_rq_unlock(rq, &flags); |
5338 | 5334 | ||
5339 | return ret; | 5335 | return ret; |
5340 | } | 5336 | } |
5341 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 5337 | EXPORT_SYMBOL_GPL(set_cpus_allowed); |
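A minimal caller-side sketch of the protocol outlined in the "This is how migration works" comment above (hypothetical, not part of the patch): p is assumed to be a task_struct the caller holds a valid reference to, CPU 2 is an arbitrary choice, and no spinlocks may be held because the call can sleep.

	cpumask_t new_mask = CPU_MASK_NONE;
	int err;

	cpu_set(2, new_mask);			/* allow only CPU 2 */
	err = set_cpus_allowed(p, new_mask);	/* may block while the migration thread moves p */
	if (err)				/* -EINVAL if no requested CPU is online */
		printk(KERN_WARNING "cannot restrict %s to CPU 2: %d\n",
		       p->comm, err);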
5342 | 5338 | ||
5343 | /* | 5339 | /* |
5344 | * Move a (non-current) task off this cpu, onto dest cpu. We're doing | 5340 | * Move a (non-current) task off this cpu, onto dest cpu. We're doing |
5345 | * this because either it can't run here any more (set_cpus_allowed() | 5341 | * this because either it can't run here any more (set_cpus_allowed() |
5346 | * has moved it away from this CPU, or the CPU is going down), or | 5342 | * has moved it away from this CPU, or the CPU is going down), or |
5347 | * because we're attempting to rebalance this task on exec (sched_exec). | 5343 | * because we're attempting to rebalance this task on exec (sched_exec). |
5348 | * | 5344 | * |
5349 | * So we race with normal scheduler movements, but that's OK, as long | 5345 | * So we race with normal scheduler movements, but that's OK, as long |
5350 | * as the task is no longer on this CPU. | 5346 | * as the task is no longer on this CPU. |
5351 | * | 5347 | * |
5352 | * Returns non-zero if task was successfully migrated. | 5348 | * Returns non-zero if task was successfully migrated. |
5353 | */ | 5349 | */ |
5354 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 5350 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
5355 | { | 5351 | { |
5356 | struct rq *rq_dest, *rq_src; | 5352 | struct rq *rq_dest, *rq_src; |
5357 | int ret = 0, on_rq; | 5353 | int ret = 0, on_rq; |
5358 | 5354 | ||
5359 | if (unlikely(cpu_is_offline(dest_cpu))) | 5355 | if (unlikely(cpu_is_offline(dest_cpu))) |
5360 | return ret; | 5356 | return ret; |
5361 | 5357 | ||
5362 | rq_src = cpu_rq(src_cpu); | 5358 | rq_src = cpu_rq(src_cpu); |
5363 | rq_dest = cpu_rq(dest_cpu); | 5359 | rq_dest = cpu_rq(dest_cpu); |
5364 | 5360 | ||
5365 | double_rq_lock(rq_src, rq_dest); | 5361 | double_rq_lock(rq_src, rq_dest); |
5366 | /* Already moved. */ | 5362 | /* Already moved. */ |
5367 | if (task_cpu(p) != src_cpu) | 5363 | if (task_cpu(p) != src_cpu) |
5368 | goto out; | 5364 | goto out; |
5369 | /* Affinity changed (again). */ | 5365 | /* Affinity changed (again). */ |
5370 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 5366 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
5371 | goto out; | 5367 | goto out; |
5372 | 5368 | ||
5373 | on_rq = p->se.on_rq; | 5369 | on_rq = p->se.on_rq; |
5374 | if (on_rq) | 5370 | if (on_rq) |
5375 | deactivate_task(rq_src, p, 0); | 5371 | deactivate_task(rq_src, p, 0); |
5376 | 5372 | ||
5377 | set_task_cpu(p, dest_cpu); | 5373 | set_task_cpu(p, dest_cpu); |
5378 | if (on_rq) { | 5374 | if (on_rq) { |
5379 | activate_task(rq_dest, p, 0); | 5375 | activate_task(rq_dest, p, 0); |
5380 | check_preempt_curr(rq_dest, p); | 5376 | check_preempt_curr(rq_dest, p); |
5381 | } | 5377 | } |
5382 | ret = 1; | 5378 | ret = 1; |
5383 | out: | 5379 | out: |
5384 | double_rq_unlock(rq_src, rq_dest); | 5380 | double_rq_unlock(rq_src, rq_dest); |
5385 | return ret; | 5381 | return ret; |
5386 | } | 5382 | } |
5387 | 5383 | ||
5388 | /* | 5384 | /* |
5389 | * migration_thread - this is a highprio system thread that performs | 5385 | * migration_thread - this is a highprio system thread that performs |
5390 | * thread migration by bumping thread off CPU then 'pushing' onto | 5386 | * thread migration by bumping thread off CPU then 'pushing' onto |
5391 | * another runqueue. | 5387 | * another runqueue. |
5392 | */ | 5388 | */ |
5393 | static int migration_thread(void *data) | 5389 | static int migration_thread(void *data) |
5394 | { | 5390 | { |
5395 | int cpu = (long)data; | 5391 | int cpu = (long)data; |
5396 | struct rq *rq; | 5392 | struct rq *rq; |
5397 | 5393 | ||
5398 | rq = cpu_rq(cpu); | 5394 | rq = cpu_rq(cpu); |
5399 | BUG_ON(rq->migration_thread != current); | 5395 | BUG_ON(rq->migration_thread != current); |
5400 | 5396 | ||
5401 | set_current_state(TASK_INTERRUPTIBLE); | 5397 | set_current_state(TASK_INTERRUPTIBLE); |
5402 | while (!kthread_should_stop()) { | 5398 | while (!kthread_should_stop()) { |
5403 | struct migration_req *req; | 5399 | struct migration_req *req; |
5404 | struct list_head *head; | 5400 | struct list_head *head; |
5405 | 5401 | ||
5406 | spin_lock_irq(&rq->lock); | 5402 | spin_lock_irq(&rq->lock); |
5407 | 5403 | ||
5408 | if (cpu_is_offline(cpu)) { | 5404 | if (cpu_is_offline(cpu)) { |
5409 | spin_unlock_irq(&rq->lock); | 5405 | spin_unlock_irq(&rq->lock); |
5410 | goto wait_to_die; | 5406 | goto wait_to_die; |
5411 | } | 5407 | } |
5412 | 5408 | ||
5413 | if (rq->active_balance) { | 5409 | if (rq->active_balance) { |
5414 | active_load_balance(rq, cpu); | 5410 | active_load_balance(rq, cpu); |
5415 | rq->active_balance = 0; | 5411 | rq->active_balance = 0; |
5416 | } | 5412 | } |
5417 | 5413 | ||
5418 | head = &rq->migration_queue; | 5414 | head = &rq->migration_queue; |
5419 | 5415 | ||
5420 | if (list_empty(head)) { | 5416 | if (list_empty(head)) { |
5421 | spin_unlock_irq(&rq->lock); | 5417 | spin_unlock_irq(&rq->lock); |
5422 | schedule(); | 5418 | schedule(); |
5423 | set_current_state(TASK_INTERRUPTIBLE); | 5419 | set_current_state(TASK_INTERRUPTIBLE); |
5424 | continue; | 5420 | continue; |
5425 | } | 5421 | } |
5426 | req = list_entry(head->next, struct migration_req, list); | 5422 | req = list_entry(head->next, struct migration_req, list); |
5427 | list_del_init(head->next); | 5423 | list_del_init(head->next); |
5428 | 5424 | ||
5429 | spin_unlock(&rq->lock); | 5425 | spin_unlock(&rq->lock); |
5430 | __migrate_task(req->task, cpu, req->dest_cpu); | 5426 | __migrate_task(req->task, cpu, req->dest_cpu); |
5431 | local_irq_enable(); | 5427 | local_irq_enable(); |
5432 | 5428 | ||
5433 | complete(&req->done); | 5429 | complete(&req->done); |
5434 | } | 5430 | } |
5435 | __set_current_state(TASK_RUNNING); | 5431 | __set_current_state(TASK_RUNNING); |
5436 | return 0; | 5432 | return 0; |
5437 | 5433 | ||
5438 | wait_to_die: | 5434 | wait_to_die: |
5439 | /* Wait for kthread_stop */ | 5435 | /* Wait for kthread_stop */ |
5440 | set_current_state(TASK_INTERRUPTIBLE); | 5436 | set_current_state(TASK_INTERRUPTIBLE); |
5441 | while (!kthread_should_stop()) { | 5437 | while (!kthread_should_stop()) { |
5442 | schedule(); | 5438 | schedule(); |
5443 | set_current_state(TASK_INTERRUPTIBLE); | 5439 | set_current_state(TASK_INTERRUPTIBLE); |
5444 | } | 5440 | } |
5445 | __set_current_state(TASK_RUNNING); | 5441 | __set_current_state(TASK_RUNNING); |
5446 | return 0; | 5442 | return 0; |
5447 | } | 5443 | } |
5448 | 5444 | ||
5449 | #ifdef CONFIG_HOTPLUG_CPU | 5445 | #ifdef CONFIG_HOTPLUG_CPU |
5450 | 5446 | ||
5451 | static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) | 5447 | static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) |
5452 | { | 5448 | { |
5453 | int ret; | 5449 | int ret; |
5454 | 5450 | ||
5455 | local_irq_disable(); | 5451 | local_irq_disable(); |
5456 | ret = __migrate_task(p, src_cpu, dest_cpu); | 5452 | ret = __migrate_task(p, src_cpu, dest_cpu); |
5457 | local_irq_enable(); | 5453 | local_irq_enable(); |
5458 | return ret; | 5454 | return ret; |
5459 | } | 5455 | } |
5460 | 5456 | ||
5461 | /* | 5457 | /* |
5462 | * Figure out where task on dead CPU should go, use force if necessary. | 5458 | * Figure out where task on dead CPU should go, use force if necessary. |
5463 | * NOTE: interrupts should be disabled by the caller | 5459 | * NOTE: interrupts should be disabled by the caller |
5464 | */ | 5460 | */ |
5465 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5461 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
5466 | { | 5462 | { |
5467 | unsigned long flags; | 5463 | unsigned long flags; |
5468 | cpumask_t mask; | 5464 | cpumask_t mask; |
5469 | struct rq *rq; | 5465 | struct rq *rq; |
5470 | int dest_cpu; | 5466 | int dest_cpu; |
5471 | 5467 | ||
5472 | do { | 5468 | do { |
5473 | /* On same node? */ | 5469 | /* On same node? */ |
5474 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 5470 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
5475 | cpus_and(mask, mask, p->cpus_allowed); | 5471 | cpus_and(mask, mask, p->cpus_allowed); |
5476 | dest_cpu = any_online_cpu(mask); | 5472 | dest_cpu = any_online_cpu(mask); |
5477 | 5473 | ||
5478 | /* On any allowed CPU? */ | 5474 | /* On any allowed CPU? */ |
5479 | if (dest_cpu == NR_CPUS) | 5475 | if (dest_cpu == NR_CPUS) |
5480 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5476 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5481 | 5477 | ||
5482 | /* No more Mr. Nice Guy. */ | 5478 | /* No more Mr. Nice Guy. */ |
5483 | if (dest_cpu == NR_CPUS) { | 5479 | if (dest_cpu == NR_CPUS) { |
5484 | cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); | 5480 | cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); |
5485 | /* | 5481 | /* |
5486 | * Try to stay on the same cpuset, where the | 5482 | * Try to stay on the same cpuset, where the |
5487 | * current cpuset may be a subset of all cpus. | 5483 | * current cpuset may be a subset of all cpus. |
5488 | * The cpuset_cpus_allowed_locked() variant of | 5484 | * The cpuset_cpus_allowed_locked() variant of |
5489 | * cpuset_cpus_allowed() will not block. It must be | 5485 | * cpuset_cpus_allowed() will not block. It must be |
5490 | * called within calls to cpuset_lock/cpuset_unlock. | 5486 | * called within calls to cpuset_lock/cpuset_unlock. |
5491 | */ | 5487 | */ |
5492 | rq = task_rq_lock(p, &flags); | 5488 | rq = task_rq_lock(p, &flags); |
5493 | p->cpus_allowed = cpus_allowed; | 5489 | p->cpus_allowed = cpus_allowed; |
5494 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5490 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5495 | task_rq_unlock(rq, &flags); | 5491 | task_rq_unlock(rq, &flags); |
5496 | 5492 | ||
5497 | /* | 5493 | /* |
5498 | * Don't tell them about moving exiting tasks or | 5494 | * Don't tell them about moving exiting tasks or |
5499 | * kernel threads (both mm NULL), since they never | 5495 | * kernel threads (both mm NULL), since they never |
5500 | * leave kernel. | 5496 | * leave kernel. |
5501 | */ | 5497 | */ |
5502 | if (p->mm && printk_ratelimit()) { | 5498 | if (p->mm && printk_ratelimit()) { |
5503 | printk(KERN_INFO "process %d (%s) no " | 5499 | printk(KERN_INFO "process %d (%s) no " |
5504 | "longer affine to cpu%d\n", | 5500 | "longer affine to cpu%d\n", |
5505 | task_pid_nr(p), p->comm, dead_cpu); | 5501 | task_pid_nr(p), p->comm, dead_cpu); |
5506 | } | 5502 | } |
5507 | } | 5503 | } |
5508 | } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); | 5504 | } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); |
5509 | } | 5505 | } |
5510 | 5506 | ||
5511 | /* | 5507 | /* |
5512 | * While a dead CPU has no uninterruptible tasks queued at this point, | 5508 | * While a dead CPU has no uninterruptible tasks queued at this point, |
5513 | * it might still have a nonzero ->nr_uninterruptible counter, because | 5509 | * it might still have a nonzero ->nr_uninterruptible counter, because |
5514 | * for performance reasons the counter is not strictly tracking tasks to | 5510 | * for performance reasons the counter is not strictly tracking tasks to |
5515 | * their home CPUs. So we just add the counter to another CPU's counter, | 5511 | * their home CPUs. So we just add the counter to another CPU's counter, |
5516 | * to keep the global sum constant after CPU-down: | 5512 | * to keep the global sum constant after CPU-down: |
5517 | */ | 5513 | */ |
5518 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 5514 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5519 | { | 5515 | { |
5520 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 5516 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); |
5521 | unsigned long flags; | 5517 | unsigned long flags; |
5522 | 5518 | ||
5523 | local_irq_save(flags); | 5519 | local_irq_save(flags); |
5524 | double_rq_lock(rq_src, rq_dest); | 5520 | double_rq_lock(rq_src, rq_dest); |
5525 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 5521 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
5526 | rq_src->nr_uninterruptible = 0; | 5522 | rq_src->nr_uninterruptible = 0; |
5527 | double_rq_unlock(rq_src, rq_dest); | 5523 | double_rq_unlock(rq_src, rq_dest); |
5528 | local_irq_restore(flags); | 5524 | local_irq_restore(flags); |
5529 | } | 5525 | } |
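The invariant described in the comment above, with illustrative numbers only (per-CPU values can drift because the counter is not strictly per-home-CPU, so individual readings may even be negative; only the global sum is meaningful):

/*
 *   before CPU-down:  dead CPU = -2,  some online CPU = 5   =>  sum = 3
 *   after the move:   dead CPU =  0,  that online CPU = 3   =>  sum = 3
 */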
5530 | 5526 | ||
5531 | /* Run through task list and migrate tasks from the dead cpu. */ | 5527 | /* Run through task list and migrate tasks from the dead cpu. */ |
5532 | static void migrate_live_tasks(int src_cpu) | 5528 | static void migrate_live_tasks(int src_cpu) |
5533 | { | 5529 | { |
5534 | struct task_struct *p, *t; | 5530 | struct task_struct *p, *t; |
5535 | 5531 | ||
5536 | read_lock(&tasklist_lock); | 5532 | read_lock(&tasklist_lock); |
5537 | 5533 | ||
5538 | do_each_thread(t, p) { | 5534 | do_each_thread(t, p) { |
5539 | if (p == current) | 5535 | if (p == current) |
5540 | continue; | 5536 | continue; |
5541 | 5537 | ||
5542 | if (task_cpu(p) == src_cpu) | 5538 | if (task_cpu(p) == src_cpu) |
5543 | move_task_off_dead_cpu(src_cpu, p); | 5539 | move_task_off_dead_cpu(src_cpu, p); |
5544 | } while_each_thread(t, p); | 5540 | } while_each_thread(t, p); |
5545 | 5541 | ||
5546 | read_unlock(&tasklist_lock); | 5542 | read_unlock(&tasklist_lock); |
5547 | } | 5543 | } |
5548 | 5544 | ||
5549 | /* | 5545 | /* |
5550 | * Schedules idle task to be the next runnable task on current CPU. | 5546 | * Schedules idle task to be the next runnable task on current CPU. |
5551 | * It does so by boosting its priority to highest possible. | 5547 | * It does so by boosting its priority to highest possible. |
5552 | * Used by CPU offline code. | 5548 | * Used by CPU offline code. |
5553 | */ | 5549 | */ |
5554 | void sched_idle_next(void) | 5550 | void sched_idle_next(void) |
5555 | { | 5551 | { |
5556 | int this_cpu = smp_processor_id(); | 5552 | int this_cpu = smp_processor_id(); |
5557 | struct rq *rq = cpu_rq(this_cpu); | 5553 | struct rq *rq = cpu_rq(this_cpu); |
5558 | struct task_struct *p = rq->idle; | 5554 | struct task_struct *p = rq->idle; |
5559 | unsigned long flags; | 5555 | unsigned long flags; |
5560 | 5556 | ||
5561 | /* cpu has to be offline */ | 5557 | /* cpu has to be offline */ |
5562 | BUG_ON(cpu_online(this_cpu)); | 5558 | BUG_ON(cpu_online(this_cpu)); |
5563 | 5559 | ||
5564 | /* | 5560 | /* |
5565 | * Strictly not necessary since the rest of the CPUs are stopped by now | 5561 | * Strictly not necessary since the rest of the CPUs are stopped by now |
5566 | * and interrupts are disabled on the current cpu. | 5562 | * and interrupts are disabled on the current cpu. |
5567 | */ | 5563 | */ |
5568 | spin_lock_irqsave(&rq->lock, flags); | 5564 | spin_lock_irqsave(&rq->lock, flags); |
5569 | 5565 | ||
5570 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | 5566 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
5571 | 5567 | ||
5572 | update_rq_clock(rq); | 5568 | update_rq_clock(rq); |
5573 | activate_task(rq, p, 0); | 5569 | activate_task(rq, p, 0); |
5574 | 5570 | ||
5575 | spin_unlock_irqrestore(&rq->lock, flags); | 5571 | spin_unlock_irqrestore(&rq->lock, flags); |
5576 | } | 5572 | } |
5577 | 5573 | ||
5578 | /* | 5574 | /* |
5579 | * Ensures that the idle task is using init_mm right before its cpu goes | 5575 | * Ensures that the idle task is using init_mm right before its cpu goes |
5580 | * offline. | 5576 | * offline. |
5581 | */ | 5577 | */ |
5582 | void idle_task_exit(void) | 5578 | void idle_task_exit(void) |
5583 | { | 5579 | { |
5584 | struct mm_struct *mm = current->active_mm; | 5580 | struct mm_struct *mm = current->active_mm; |
5585 | 5581 | ||
5586 | BUG_ON(cpu_online(smp_processor_id())); | 5582 | BUG_ON(cpu_online(smp_processor_id())); |
5587 | 5583 | ||
5588 | if (mm != &init_mm) | 5584 | if (mm != &init_mm) |
5589 | switch_mm(mm, &init_mm, current); | 5585 | switch_mm(mm, &init_mm, current); |
5590 | mmdrop(mm); | 5586 | mmdrop(mm); |
5591 | } | 5587 | } |
5592 | 5588 | ||
5593 | /* called under rq->lock with disabled interrupts */ | 5589 | /* called under rq->lock with disabled interrupts */ |
5594 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | 5590 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
5595 | { | 5591 | { |
5596 | struct rq *rq = cpu_rq(dead_cpu); | 5592 | struct rq *rq = cpu_rq(dead_cpu); |
5597 | 5593 | ||
5598 | /* Must be exiting, otherwise would be on tasklist. */ | 5594 | /* Must be exiting, otherwise would be on tasklist. */ |
5599 | BUG_ON(!p->exit_state); | 5595 | BUG_ON(!p->exit_state); |
5600 | 5596 | ||
5601 | /* Cannot have done final schedule yet: would have vanished. */ | 5597 | /* Cannot have done final schedule yet: would have vanished. */ |
5602 | BUG_ON(p->state == TASK_DEAD); | 5598 | BUG_ON(p->state == TASK_DEAD); |
5603 | 5599 | ||
5604 | get_task_struct(p); | 5600 | get_task_struct(p); |
5605 | 5601 | ||
5606 | /* | 5602 | /* |
5607 | * Drop lock around migration; if someone else moves it, | 5603 | * Drop lock around migration; if someone else moves it, |
5608 | * that's OK. No task can be added to this CPU, so iteration is | 5604 | * that's OK. No task can be added to this CPU, so iteration is |
5609 | * fine. | 5605 | * fine. |
5610 | */ | 5606 | */ |
5611 | spin_unlock_irq(&rq->lock); | 5607 | spin_unlock_irq(&rq->lock); |
5612 | move_task_off_dead_cpu(dead_cpu, p); | 5608 | move_task_off_dead_cpu(dead_cpu, p); |
5613 | spin_lock_irq(&rq->lock); | 5609 | spin_lock_irq(&rq->lock); |
5614 | 5610 | ||
5615 | put_task_struct(p); | 5611 | put_task_struct(p); |
5616 | } | 5612 | } |
5617 | 5613 | ||
5618 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | 5614 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ |
5619 | static void migrate_dead_tasks(unsigned int dead_cpu) | 5615 | static void migrate_dead_tasks(unsigned int dead_cpu) |
5620 | { | 5616 | { |
5621 | struct rq *rq = cpu_rq(dead_cpu); | 5617 | struct rq *rq = cpu_rq(dead_cpu); |
5622 | struct task_struct *next; | 5618 | struct task_struct *next; |
5623 | 5619 | ||
5624 | for ( ; ; ) { | 5620 | for ( ; ; ) { |
5625 | if (!rq->nr_running) | 5621 | if (!rq->nr_running) |
5626 | break; | 5622 | break; |
5627 | update_rq_clock(rq); | 5623 | update_rq_clock(rq); |
5628 | next = pick_next_task(rq, rq->curr); | 5624 | next = pick_next_task(rq, rq->curr); |
5629 | if (!next) | 5625 | if (!next) |
5630 | break; | 5626 | break; |
5631 | migrate_dead(dead_cpu, next); | 5627 | migrate_dead(dead_cpu, next); |
5632 | 5628 | ||
5633 | } | 5629 | } |
5634 | } | 5630 | } |
5635 | #endif /* CONFIG_HOTPLUG_CPU */ | 5631 | #endif /* CONFIG_HOTPLUG_CPU */ |
5636 | 5632 | ||
5637 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 5633 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
5638 | 5634 | ||
5639 | static struct ctl_table sd_ctl_dir[] = { | 5635 | static struct ctl_table sd_ctl_dir[] = { |
5640 | { | 5636 | { |
5641 | .procname = "sched_domain", | 5637 | .procname = "sched_domain", |
5642 | .mode = 0555, | 5638 | .mode = 0555, |
5643 | }, | 5639 | }, |
5644 | {0, }, | 5640 | {0, }, |
5645 | }; | 5641 | }; |
5646 | 5642 | ||
5647 | static struct ctl_table sd_ctl_root[] = { | 5643 | static struct ctl_table sd_ctl_root[] = { |
5648 | { | 5644 | { |
5649 | .ctl_name = CTL_KERN, | 5645 | .ctl_name = CTL_KERN, |
5650 | .procname = "kernel", | 5646 | .procname = "kernel", |
5651 | .mode = 0555, | 5647 | .mode = 0555, |
5652 | .child = sd_ctl_dir, | 5648 | .child = sd_ctl_dir, |
5653 | }, | 5649 | }, |
5654 | {0, }, | 5650 | {0, }, |
5655 | }; | 5651 | }; |
5656 | 5652 | ||
5657 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 5653 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
5658 | { | 5654 | { |
5659 | struct ctl_table *entry = | 5655 | struct ctl_table *entry = |
5660 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); | 5656 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); |
5661 | 5657 | ||
5662 | return entry; | 5658 | return entry; |
5663 | } | 5659 | } |
5664 | 5660 | ||
5665 | static void sd_free_ctl_entry(struct ctl_table **tablep) | 5661 | static void sd_free_ctl_entry(struct ctl_table **tablep) |
5666 | { | 5662 | { |
5667 | struct ctl_table *entry; | 5663 | struct ctl_table *entry; |
5668 | 5664 | ||
5669 | /* | 5665 | /* |
5670 | * In the intermediate directories, both the child directory and | 5666 | * In the intermediate directories, both the child directory and |
5671 | * procname are dynamically allocated and could fail but the mode | 5667 | * procname are dynamically allocated and could fail but the mode |
5672 | * will always be set. In the lowest directory the names are | 5668 | * will always be set. In the lowest directory the names are |
5673 | * static strings and all have proc handlers. | 5669 | * static strings and all have proc handlers. |
5674 | */ | 5670 | */ |
5675 | for (entry = *tablep; entry->mode; entry++) { | 5671 | for (entry = *tablep; entry->mode; entry++) { |
5676 | if (entry->child) | 5672 | if (entry->child) |
5677 | sd_free_ctl_entry(&entry->child); | 5673 | sd_free_ctl_entry(&entry->child); |
5678 | if (entry->proc_handler == NULL) | 5674 | if (entry->proc_handler == NULL) |
5679 | kfree(entry->procname); | 5675 | kfree(entry->procname); |
5680 | } | 5676 | } |
5681 | 5677 | ||
5682 | kfree(*tablep); | 5678 | kfree(*tablep); |
5683 | *tablep = NULL; | 5679 | *tablep = NULL; |
5684 | } | 5680 | } |
5685 | 5681 | ||
5686 | static void | 5682 | static void |
5687 | set_table_entry(struct ctl_table *entry, | 5683 | set_table_entry(struct ctl_table *entry, |
5688 | const char *procname, void *data, int maxlen, | 5684 | const char *procname, void *data, int maxlen, |
5689 | mode_t mode, proc_handler *proc_handler) | 5685 | mode_t mode, proc_handler *proc_handler) |
5690 | { | 5686 | { |
5691 | entry->procname = procname; | 5687 | entry->procname = procname; |
5692 | entry->data = data; | 5688 | entry->data = data; |
5693 | entry->maxlen = maxlen; | 5689 | entry->maxlen = maxlen; |
5694 | entry->mode = mode; | 5690 | entry->mode = mode; |
5695 | entry->proc_handler = proc_handler; | 5691 | entry->proc_handler = proc_handler; |
5696 | } | 5692 | } |
5697 | 5693 | ||
5698 | static struct ctl_table * | 5694 | static struct ctl_table * |
5699 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 5695 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
5700 | { | 5696 | { |
5701 | struct ctl_table *table = sd_alloc_ctl_entry(12); | 5697 | struct ctl_table *table = sd_alloc_ctl_entry(12); |
5702 | 5698 | ||
5703 | if (table == NULL) | 5699 | if (table == NULL) |
5704 | return NULL; | 5700 | return NULL; |
5705 | 5701 | ||
5706 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 5702 | set_table_entry(&table[0], "min_interval", &sd->min_interval, |
5707 | sizeof(long), 0644, proc_doulongvec_minmax); | 5703 | sizeof(long), 0644, proc_doulongvec_minmax); |
5708 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | 5704 | set_table_entry(&table[1], "max_interval", &sd->max_interval, |
5709 | sizeof(long), 0644, proc_doulongvec_minmax); | 5705 | sizeof(long), 0644, proc_doulongvec_minmax); |
5710 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | 5706 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, |
5711 | sizeof(int), 0644, proc_dointvec_minmax); | 5707 | sizeof(int), 0644, proc_dointvec_minmax); |
5712 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | 5708 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, |
5713 | sizeof(int), 0644, proc_dointvec_minmax); | 5709 | sizeof(int), 0644, proc_dointvec_minmax); |
5714 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | 5710 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, |
5715 | sizeof(int), 0644, proc_dointvec_minmax); | 5711 | sizeof(int), 0644, proc_dointvec_minmax); |
5716 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | 5712 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, |
5717 | sizeof(int), 0644, proc_dointvec_minmax); | 5713 | sizeof(int), 0644, proc_dointvec_minmax); |
5718 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | 5714 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, |
5719 | sizeof(int), 0644, proc_dointvec_minmax); | 5715 | sizeof(int), 0644, proc_dointvec_minmax); |
5720 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | 5716 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, |
5721 | sizeof(int), 0644, proc_dointvec_minmax); | 5717 | sizeof(int), 0644, proc_dointvec_minmax); |
5722 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | 5718 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, |
5723 | sizeof(int), 0644, proc_dointvec_minmax); | 5719 | sizeof(int), 0644, proc_dointvec_minmax); |
5724 | set_table_entry(&table[9], "cache_nice_tries", | 5720 | set_table_entry(&table[9], "cache_nice_tries", |
5725 | &sd->cache_nice_tries, | 5721 | &sd->cache_nice_tries, |
5726 | sizeof(int), 0644, proc_dointvec_minmax); | 5722 | sizeof(int), 0644, proc_dointvec_minmax); |
5727 | set_table_entry(&table[10], "flags", &sd->flags, | 5723 | set_table_entry(&table[10], "flags", &sd->flags, |
5728 | sizeof(int), 0644, proc_dointvec_minmax); | 5724 | sizeof(int), 0644, proc_dointvec_minmax); |
5729 | /* &table[11] is terminator */ | 5725 | /* &table[11] is terminator */ |
5730 | 5726 | ||
5731 | return table; | 5727 | return table; |
5732 | } | 5728 | } |
5733 | 5729 | ||
5734 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | 5730 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) |
5735 | { | 5731 | { |
5736 | struct ctl_table *entry, *table; | 5732 | struct ctl_table *entry, *table; |
5737 | struct sched_domain *sd; | 5733 | struct sched_domain *sd; |
5738 | int domain_num = 0, i; | 5734 | int domain_num = 0, i; |
5739 | char buf[32]; | 5735 | char buf[32]; |
5740 | 5736 | ||
5741 | for_each_domain(cpu, sd) | 5737 | for_each_domain(cpu, sd) |
5742 | domain_num++; | 5738 | domain_num++; |
5743 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | 5739 | entry = table = sd_alloc_ctl_entry(domain_num + 1); |
5744 | if (table == NULL) | 5740 | if (table == NULL) |
5745 | return NULL; | 5741 | return NULL; |
5746 | 5742 | ||
5747 | i = 0; | 5743 | i = 0; |
5748 | for_each_domain(cpu, sd) { | 5744 | for_each_domain(cpu, sd) { |
5749 | snprintf(buf, 32, "domain%d", i); | 5745 | snprintf(buf, 32, "domain%d", i); |
5750 | entry->procname = kstrdup(buf, GFP_KERNEL); | 5746 | entry->procname = kstrdup(buf, GFP_KERNEL); |
5751 | entry->mode = 0555; | 5747 | entry->mode = 0555; |
5752 | entry->child = sd_alloc_ctl_domain_table(sd); | 5748 | entry->child = sd_alloc_ctl_domain_table(sd); |
5753 | entry++; | 5749 | entry++; |
5754 | i++; | 5750 | i++; |
5755 | } | 5751 | } |
5756 | return table; | 5752 | return table; |
5757 | } | 5753 | } |
5758 | 5754 | ||
5759 | static struct ctl_table_header *sd_sysctl_header; | 5755 | static struct ctl_table_header *sd_sysctl_header; |
5760 | static void register_sched_domain_sysctl(void) | 5756 | static void register_sched_domain_sysctl(void) |
5761 | { | 5757 | { |
5762 | int i, cpu_num = num_online_cpus(); | 5758 | int i, cpu_num = num_online_cpus(); |
5763 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | 5759 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); |
5764 | char buf[32]; | 5760 | char buf[32]; |
5765 | 5761 | ||
5766 | WARN_ON(sd_ctl_dir[0].child); | 5762 | WARN_ON(sd_ctl_dir[0].child); |
5767 | sd_ctl_dir[0].child = entry; | 5763 | sd_ctl_dir[0].child = entry; |
5768 | 5764 | ||
5769 | if (entry == NULL) | 5765 | if (entry == NULL) |
5770 | return; | 5766 | return; |
5771 | 5767 | ||
5772 | for_each_online_cpu(i) { | 5768 | for_each_online_cpu(i) { |
5773 | snprintf(buf, 32, "cpu%d", i); | 5769 | snprintf(buf, 32, "cpu%d", i); |
5774 | entry->procname = kstrdup(buf, GFP_KERNEL); | 5770 | entry->procname = kstrdup(buf, GFP_KERNEL); |
5775 | entry->mode = 0555; | 5771 | entry->mode = 0555; |
5776 | entry->child = sd_alloc_ctl_cpu_table(i); | 5772 | entry->child = sd_alloc_ctl_cpu_table(i); |
5777 | entry++; | 5773 | entry++; |
5778 | } | 5774 | } |
5779 | 5775 | ||
5780 | WARN_ON(sd_sysctl_header); | 5776 | WARN_ON(sd_sysctl_header); |
5781 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | 5777 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); |
5782 | } | 5778 | } |
5783 | 5779 | ||
5784 | /* may be called multiple times per register */ | 5780 | /* may be called multiple times per register */ |
5785 | static void unregister_sched_domain_sysctl(void) | 5781 | static void unregister_sched_domain_sysctl(void) |
5786 | { | 5782 | { |
5787 | if (sd_sysctl_header) | 5783 | if (sd_sysctl_header) |
5788 | unregister_sysctl_table(sd_sysctl_header); | 5784 | unregister_sysctl_table(sd_sysctl_header); |
5789 | sd_sysctl_header = NULL; | 5785 | sd_sysctl_header = NULL; |
5790 | if (sd_ctl_dir[0].child) | 5786 | if (sd_ctl_dir[0].child) |
5791 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | 5787 | sd_free_ctl_entry(&sd_ctl_dir[0].child); |
5792 | } | 5788 | } |
5793 | #else | 5789 | #else |
5794 | static void register_sched_domain_sysctl(void) | 5790 | static void register_sched_domain_sysctl(void) |
5795 | { | 5791 | { |
5796 | } | 5792 | } |
5797 | static void unregister_sched_domain_sysctl(void) | 5793 | static void unregister_sched_domain_sysctl(void) |
5798 | { | 5794 | { |
5799 | } | 5795 | } |
5800 | #endif | 5796 | #endif |
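For reference, the ctl tables built above nest as kernel -> sched_domain -> cpu<N> -> domain<M>, so on a kernel with SCHED_DEBUG and SYSCTL enabled the per-domain tunables appear under /proc/sys along these lines (entry names taken from the set_table_entry() calls; how many domain<M> directories each CPU gets depends on the topology):

/*
 *   /proc/sys/kernel/sched_domain/cpu<N>/domain<M>/
 *       min_interval    max_interval    busy_idx         idle_idx
 *       newidle_idx     wake_idx        forkexec_idx     busy_factor
 *       imbalance_pct   cache_nice_tries                 flags
 */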
5801 | 5797 | ||
5802 | /* | 5798 | /* |
5803 | * migration_call - callback that gets triggered when a CPU is added. | 5799 | * migration_call - callback that gets triggered when a CPU is added. |
5804 | * Here we can start up the necessary migration thread for the new CPU. | 5800 | * Here we can start up the necessary migration thread for the new CPU. |
5805 | */ | 5801 | */ |
5806 | static int __cpuinit | 5802 | static int __cpuinit |
5807 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | 5803 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
5808 | { | 5804 | { |
5809 | struct task_struct *p; | 5805 | struct task_struct *p; |
5810 | int cpu = (long)hcpu; | 5806 | int cpu = (long)hcpu; |
5811 | unsigned long flags; | 5807 | unsigned long flags; |
5812 | struct rq *rq; | 5808 | struct rq *rq; |
5813 | 5809 | ||
5814 | switch (action) { | 5810 | switch (action) { |
5815 | 5811 | ||
5816 | case CPU_UP_PREPARE: | 5812 | case CPU_UP_PREPARE: |
5817 | case CPU_UP_PREPARE_FROZEN: | 5813 | case CPU_UP_PREPARE_FROZEN: |
5818 | p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); | 5814 | p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); |
5819 | if (IS_ERR(p)) | 5815 | if (IS_ERR(p)) |
5820 | return NOTIFY_BAD; | 5816 | return NOTIFY_BAD; |
5821 | kthread_bind(p, cpu); | 5817 | kthread_bind(p, cpu); |
5822 | /* Must be high prio: stop_machine expects to yield to it. */ | 5818 | /* Must be high prio: stop_machine expects to yield to it. */ |
5823 | rq = task_rq_lock(p, &flags); | 5819 | rq = task_rq_lock(p, &flags); |
5824 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | 5820 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
5825 | task_rq_unlock(rq, &flags); | 5821 | task_rq_unlock(rq, &flags); |
5826 | cpu_rq(cpu)->migration_thread = p; | 5822 | cpu_rq(cpu)->migration_thread = p; |
5827 | break; | 5823 | break; |
5828 | 5824 | ||
5829 | case CPU_ONLINE: | 5825 | case CPU_ONLINE: |
5830 | case CPU_ONLINE_FROZEN: | 5826 | case CPU_ONLINE_FROZEN: |
5831 | /* Strictly unnecessary, as first user will wake it. */ | 5827 | /* Strictly unnecessary, as first user will wake it. */ |
5832 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5828 | wake_up_process(cpu_rq(cpu)->migration_thread); |
5833 | 5829 | ||
5834 | /* Update our root-domain */ | 5830 | /* Update our root-domain */ |
5835 | rq = cpu_rq(cpu); | 5831 | rq = cpu_rq(cpu); |
5836 | spin_lock_irqsave(&rq->lock, flags); | 5832 | spin_lock_irqsave(&rq->lock, flags); |
5837 | if (rq->rd) { | 5833 | if (rq->rd) { |
5838 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 5834 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
5839 | cpu_set(cpu, rq->rd->online); | 5835 | cpu_set(cpu, rq->rd->online); |
5840 | } | 5836 | } |
5841 | spin_unlock_irqrestore(&rq->lock, flags); | 5837 | spin_unlock_irqrestore(&rq->lock, flags); |
5842 | break; | 5838 | break; |
5843 | 5839 | ||
5844 | #ifdef CONFIG_HOTPLUG_CPU | 5840 | #ifdef CONFIG_HOTPLUG_CPU |
5845 | case CPU_UP_CANCELED: | 5841 | case CPU_UP_CANCELED: |
5846 | case CPU_UP_CANCELED_FROZEN: | 5842 | case CPU_UP_CANCELED_FROZEN: |
5847 | if (!cpu_rq(cpu)->migration_thread) | 5843 | if (!cpu_rq(cpu)->migration_thread) |
5848 | break; | 5844 | break; |
5849 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 5845 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
5850 | kthread_bind(cpu_rq(cpu)->migration_thread, | 5846 | kthread_bind(cpu_rq(cpu)->migration_thread, |
5851 | any_online_cpu(cpu_online_map)); | 5847 | any_online_cpu(cpu_online_map)); |
5852 | kthread_stop(cpu_rq(cpu)->migration_thread); | 5848 | kthread_stop(cpu_rq(cpu)->migration_thread); |
5853 | cpu_rq(cpu)->migration_thread = NULL; | 5849 | cpu_rq(cpu)->migration_thread = NULL; |
5854 | break; | 5850 | break; |
5855 | 5851 | ||
5856 | case CPU_DEAD: | 5852 | case CPU_DEAD: |
5857 | case CPU_DEAD_FROZEN: | 5853 | case CPU_DEAD_FROZEN: |
5858 | cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ | 5854 | cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ |
5859 | migrate_live_tasks(cpu); | 5855 | migrate_live_tasks(cpu); |
5860 | rq = cpu_rq(cpu); | 5856 | rq = cpu_rq(cpu); |
5861 | kthread_stop(rq->migration_thread); | 5857 | kthread_stop(rq->migration_thread); |
5862 | rq->migration_thread = NULL; | 5858 | rq->migration_thread = NULL; |
5863 | /* Idle task back to normal (off runqueue, low prio) */ | 5859 | /* Idle task back to normal (off runqueue, low prio) */ |
5864 | spin_lock_irq(&rq->lock); | 5860 | spin_lock_irq(&rq->lock); |
5865 | update_rq_clock(rq); | 5861 | update_rq_clock(rq); |
5866 | deactivate_task(rq, rq->idle, 0); | 5862 | deactivate_task(rq, rq->idle, 0); |
5867 | rq->idle->static_prio = MAX_PRIO; | 5863 | rq->idle->static_prio = MAX_PRIO; |
5868 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | 5864 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); |
5869 | rq->idle->sched_class = &idle_sched_class; | 5865 | rq->idle->sched_class = &idle_sched_class; |
5870 | migrate_dead_tasks(cpu); | 5866 | migrate_dead_tasks(cpu); |
5871 | spin_unlock_irq(&rq->lock); | 5867 | spin_unlock_irq(&rq->lock); |
5872 | cpuset_unlock(); | 5868 | cpuset_unlock(); |
5873 | migrate_nr_uninterruptible(rq); | 5869 | migrate_nr_uninterruptible(rq); |
5874 | BUG_ON(rq->nr_running != 0); | 5870 | BUG_ON(rq->nr_running != 0); |
5875 | 5871 | ||
5876 | /* | 5872 | /* |
5877 | * No need to migrate the tasks: it was best-effort if | 5873 | * No need to migrate the tasks: it was best-effort if |
5878 | * they didn't take sched_hotcpu_mutex. Just wake up | 5874 | * they didn't take sched_hotcpu_mutex. Just wake up |
5879 | * the requestors. | 5875 | * the requestors. |
5880 | */ | 5876 | */ |
5881 | spin_lock_irq(&rq->lock); | 5877 | spin_lock_irq(&rq->lock); |
5882 | while (!list_empty(&rq->migration_queue)) { | 5878 | while (!list_empty(&rq->migration_queue)) { |
5883 | struct migration_req *req; | 5879 | struct migration_req *req; |
5884 | 5880 | ||
5885 | req = list_entry(rq->migration_queue.next, | 5881 | req = list_entry(rq->migration_queue.next, |
5886 | struct migration_req, list); | 5882 | struct migration_req, list); |
5887 | list_del_init(&req->list); | 5883 | list_del_init(&req->list); |
5888 | complete(&req->done); | 5884 | complete(&req->done); |
5889 | } | 5885 | } |
5890 | spin_unlock_irq(&rq->lock); | 5886 | spin_unlock_irq(&rq->lock); |
5891 | break; | 5887 | break; |
5892 | 5888 | ||
5893 | case CPU_DOWN_PREPARE: | 5889 | case CPU_DOWN_PREPARE: |
5894 | /* Update our root-domain */ | 5890 | /* Update our root-domain */ |
5895 | rq = cpu_rq(cpu); | 5891 | rq = cpu_rq(cpu); |
5896 | spin_lock_irqsave(&rq->lock, flags); | 5892 | spin_lock_irqsave(&rq->lock, flags); |
5897 | if (rq->rd) { | 5893 | if (rq->rd) { |
5898 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | 5894 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); |
5899 | cpu_clear(cpu, rq->rd->online); | 5895 | cpu_clear(cpu, rq->rd->online); |
5900 | } | 5896 | } |
5901 | spin_unlock_irqrestore(&rq->lock, flags); | 5897 | spin_unlock_irqrestore(&rq->lock, flags); |
5902 | break; | 5898 | break; |
5903 | #endif | 5899 | #endif |
5904 | } | 5900 | } |
5905 | return NOTIFY_OK; | 5901 | return NOTIFY_OK; |
5906 | } | 5902 | } |
5907 | 5903 | ||
5908 | /* Register at highest priority so that task migration (migrate_all_tasks) | 5904 | /* Register at highest priority so that task migration (migrate_all_tasks) |
5909 | * happens before everything else. | 5905 | * happens before everything else. |
5910 | */ | 5906 | */ |
5911 | static struct notifier_block __cpuinitdata migration_notifier = { | 5907 | static struct notifier_block __cpuinitdata migration_notifier = { |
5912 | .notifier_call = migration_call, | 5908 | .notifier_call = migration_call, |
5913 | .priority = 10 | 5909 | .priority = 10 |
5914 | }; | 5910 | }; |
5915 | 5911 | ||
5916 | void __init migration_init(void) | 5912 | void __init migration_init(void) |
5917 | { | 5913 | { |
5918 | void *cpu = (void *)(long)smp_processor_id(); | 5914 | void *cpu = (void *)(long)smp_processor_id(); |
5919 | int err; | 5915 | int err; |
5920 | 5916 | ||
5921 | /* Start one for the boot CPU: */ | 5917 | /* Start one for the boot CPU: */ |
5922 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 5918 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
5923 | BUG_ON(err == NOTIFY_BAD); | 5919 | BUG_ON(err == NOTIFY_BAD); |
5924 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 5920 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
5925 | register_cpu_notifier(&migration_notifier); | 5921 | register_cpu_notifier(&migration_notifier); |
5926 | } | 5922 | } |
5927 | #endif | 5923 | #endif |
5928 | 5924 | ||
5929 | #ifdef CONFIG_SMP | 5925 | #ifdef CONFIG_SMP |
5930 | 5926 | ||
5931 | /* Number of possible processor ids */ | 5927 | /* Number of possible processor ids */ |
5932 | int nr_cpu_ids __read_mostly = NR_CPUS; | 5928 | int nr_cpu_ids __read_mostly = NR_CPUS; |
5933 | EXPORT_SYMBOL(nr_cpu_ids); | 5929 | EXPORT_SYMBOL(nr_cpu_ids); |
5934 | 5930 | ||
5935 | #ifdef CONFIG_SCHED_DEBUG | 5931 | #ifdef CONFIG_SCHED_DEBUG |
5936 | 5932 | ||
5937 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) | 5933 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) |
5938 | { | 5934 | { |
5939 | struct sched_group *group = sd->groups; | 5935 | struct sched_group *group = sd->groups; |
5940 | cpumask_t groupmask; | 5936 | cpumask_t groupmask; |
5941 | char str[NR_CPUS]; | 5937 | char str[NR_CPUS]; |
5942 | 5938 | ||
5943 | cpumask_scnprintf(str, NR_CPUS, sd->span); | 5939 | cpumask_scnprintf(str, NR_CPUS, sd->span); |
5944 | cpus_clear(groupmask); | 5940 | cpus_clear(groupmask); |
5945 | 5941 | ||
5946 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | 5942 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); |
5947 | 5943 | ||
5948 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 5944 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
5949 | printk("does not load-balance\n"); | 5945 | printk("does not load-balance\n"); |
5950 | if (sd->parent) | 5946 | if (sd->parent) |
5951 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | 5947 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" |
5952 | " has parent"); | 5948 | " has parent"); |
5953 | return -1; | 5949 | return -1; |
5954 | } | 5950 | } |
5955 | 5951 | ||
5956 | printk(KERN_CONT "span %s\n", str); | 5952 | printk(KERN_CONT "span %s\n", str); |
5957 | 5953 | ||
5958 | if (!cpu_isset(cpu, sd->span)) { | 5954 | if (!cpu_isset(cpu, sd->span)) { |
5959 | printk(KERN_ERR "ERROR: domain->span does not contain " | 5955 | printk(KERN_ERR "ERROR: domain->span does not contain " |
5960 | "CPU%d\n", cpu); | 5956 | "CPU%d\n", cpu); |
5961 | } | 5957 | } |
5962 | if (!cpu_isset(cpu, group->cpumask)) { | 5958 | if (!cpu_isset(cpu, group->cpumask)) { |
5963 | printk(KERN_ERR "ERROR: domain->groups does not contain" | 5959 | printk(KERN_ERR "ERROR: domain->groups does not contain" |
5964 | " CPU%d\n", cpu); | 5960 | " CPU%d\n", cpu); |
5965 | } | 5961 | } |
5966 | 5962 | ||
5967 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | 5963 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); |
5968 | do { | 5964 | do { |
5969 | if (!group) { | 5965 | if (!group) { |
5970 | printk("\n"); | 5966 | printk("\n"); |
5971 | printk(KERN_ERR "ERROR: group is NULL\n"); | 5967 | printk(KERN_ERR "ERROR: group is NULL\n"); |
5972 | break; | 5968 | break; |
5973 | } | 5969 | } |
5974 | 5970 | ||
5975 | if (!group->__cpu_power) { | 5971 | if (!group->__cpu_power) { |
5976 | printk(KERN_CONT "\n"); | 5972 | printk(KERN_CONT "\n"); |
5977 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5973 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5978 | "set\n"); | 5974 | "set\n"); |
5979 | break; | 5975 | break; |
5980 | } | 5976 | } |
5981 | 5977 | ||
5982 | if (!cpus_weight(group->cpumask)) { | 5978 | if (!cpus_weight(group->cpumask)) { |
5983 | printk(KERN_CONT "\n"); | 5979 | printk(KERN_CONT "\n"); |
5984 | printk(KERN_ERR "ERROR: empty group\n"); | 5980 | printk(KERN_ERR "ERROR: empty group\n"); |
5985 | break; | 5981 | break; |
5986 | } | 5982 | } |
5987 | 5983 | ||
5988 | if (cpus_intersects(groupmask, group->cpumask)) { | 5984 | if (cpus_intersects(groupmask, group->cpumask)) { |
5989 | printk(KERN_CONT "\n"); | 5985 | printk(KERN_CONT "\n"); |
5990 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5986 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5991 | break; | 5987 | break; |
5992 | } | 5988 | } |
5993 | 5989 | ||
5994 | cpus_or(groupmask, groupmask, group->cpumask); | 5990 | cpus_or(groupmask, groupmask, group->cpumask); |
5995 | 5991 | ||
5996 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); | 5992 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); |
5997 | printk(KERN_CONT " %s", str); | 5993 | printk(KERN_CONT " %s", str); |
5998 | 5994 | ||
5999 | group = group->next; | 5995 | group = group->next; |
6000 | } while (group != sd->groups); | 5996 | } while (group != sd->groups); |
6001 | printk(KERN_CONT "\n"); | 5997 | printk(KERN_CONT "\n"); |
6002 | 5998 | ||
6003 | if (!cpus_equal(sd->span, groupmask)) | 5999 | if (!cpus_equal(sd->span, groupmask)) |
6004 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 6000 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); |
6005 | 6001 | ||
6006 | if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) | 6002 | if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) |
6007 | printk(KERN_ERR "ERROR: parent span is not a superset " | 6003 | printk(KERN_ERR "ERROR: parent span is not a superset " |
6008 | "of domain->span\n"); | 6004 | "of domain->span\n"); |
6009 | return 0; | 6005 | return 0; |
6010 | } | 6006 | } |
6011 | 6007 | ||
6012 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6008 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6013 | { | 6009 | { |
6014 | int level = 0; | 6010 | int level = 0; |
6015 | 6011 | ||
6016 | if (!sd) { | 6012 | if (!sd) { |
6017 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | 6013 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); |
6018 | return; | 6014 | return; |
6019 | } | 6015 | } |
6020 | 6016 | ||
6021 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6017 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6022 | 6018 | ||
6023 | for (;;) { | 6019 | for (;;) { |
6024 | if (sched_domain_debug_one(sd, cpu, level)) | 6020 | if (sched_domain_debug_one(sd, cpu, level)) |
6025 | break; | 6021 | break; |
6026 | level++; | 6022 | level++; |
6027 | sd = sd->parent; | 6023 | sd = sd->parent; |
6028 | if (!sd) | 6024 | if (!sd) |
6029 | break; | 6025 | break; |
6030 | } | 6026 | } |
6031 | } | 6027 | } |
6032 | #else | 6028 | #else |
6033 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6029 | # define sched_domain_debug(sd, cpu) do { } while (0) |
6034 | #endif | 6030 | #endif |
6035 | 6031 | ||
6036 | static int sd_degenerate(struct sched_domain *sd) | 6032 | static int sd_degenerate(struct sched_domain *sd) |
6037 | { | 6033 | { |
6038 | if (cpus_weight(sd->span) == 1) | 6034 | if (cpus_weight(sd->span) == 1) |
6039 | return 1; | 6035 | return 1; |
6040 | 6036 | ||
6041 | /* Following flags need at least 2 groups */ | 6037 | /* Following flags need at least 2 groups */ |
6042 | if (sd->flags & (SD_LOAD_BALANCE | | 6038 | if (sd->flags & (SD_LOAD_BALANCE | |
6043 | SD_BALANCE_NEWIDLE | | 6039 | SD_BALANCE_NEWIDLE | |
6044 | SD_BALANCE_FORK | | 6040 | SD_BALANCE_FORK | |
6045 | SD_BALANCE_EXEC | | 6041 | SD_BALANCE_EXEC | |
6046 | SD_SHARE_CPUPOWER | | 6042 | SD_SHARE_CPUPOWER | |
6047 | SD_SHARE_PKG_RESOURCES)) { | 6043 | SD_SHARE_PKG_RESOURCES)) { |
6048 | if (sd->groups != sd->groups->next) | 6044 | if (sd->groups != sd->groups->next) |
6049 | return 0; | 6045 | return 0; |
6050 | } | 6046 | } |
6051 | 6047 | ||
6052 | /* Following flags don't use groups */ | 6048 | /* Following flags don't use groups */ |
6053 | if (sd->flags & (SD_WAKE_IDLE | | 6049 | if (sd->flags & (SD_WAKE_IDLE | |
6054 | SD_WAKE_AFFINE | | 6050 | SD_WAKE_AFFINE | |
6055 | SD_WAKE_BALANCE)) | 6051 | SD_WAKE_BALANCE)) |
6056 | return 0; | 6052 | return 0; |
6057 | 6053 | ||
6058 | return 1; | 6054 | return 1; |
6059 | } | 6055 | } |
6060 | 6056 | ||
6061 | static int | 6057 | static int |
6062 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | 6058 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) |
6063 | { | 6059 | { |
6064 | unsigned long cflags = sd->flags, pflags = parent->flags; | 6060 | unsigned long cflags = sd->flags, pflags = parent->flags; |
6065 | 6061 | ||
6066 | if (sd_degenerate(parent)) | 6062 | if (sd_degenerate(parent)) |
6067 | return 1; | 6063 | return 1; |
6068 | 6064 | ||
6069 | if (!cpus_equal(sd->span, parent->span)) | 6065 | if (!cpus_equal(sd->span, parent->span)) |
6070 | return 0; | 6066 | return 0; |
6071 | 6067 | ||
6072 | /* Does parent contain flags not in child? */ | 6068 | /* Does parent contain flags not in child? */ |
6073 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | 6069 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ |
6074 | if (cflags & SD_WAKE_AFFINE) | 6070 | if (cflags & SD_WAKE_AFFINE) |
6075 | pflags &= ~SD_WAKE_BALANCE; | 6071 | pflags &= ~SD_WAKE_BALANCE; |
6076 | /* Flags needing groups don't count if only 1 group in parent */ | 6072 | /* Flags needing groups don't count if only 1 group in parent */ |
6077 | if (parent->groups == parent->groups->next) { | 6073 | if (parent->groups == parent->groups->next) { |
6078 | pflags &= ~(SD_LOAD_BALANCE | | 6074 | pflags &= ~(SD_LOAD_BALANCE | |
6079 | SD_BALANCE_NEWIDLE | | 6075 | SD_BALANCE_NEWIDLE | |
6080 | SD_BALANCE_FORK | | 6076 | SD_BALANCE_FORK | |
6081 | SD_BALANCE_EXEC | | 6077 | SD_BALANCE_EXEC | |
6082 | SD_SHARE_CPUPOWER | | 6078 | SD_SHARE_CPUPOWER | |
6083 | SD_SHARE_PKG_RESOURCES); | 6079 | SD_SHARE_PKG_RESOURCES); |
6084 | } | 6080 | } |
6085 | if (~cflags & pflags) | 6081 | if (~cflags & pflags) |
6086 | return 0; | 6082 | return 0; |
6087 | 6083 | ||
6088 | return 1; | 6084 | return 1; |
6089 | } | 6085 | } |
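
A note on the final test above: ~cflags & pflags is non-zero exactly when the parent carries a flag the child lacks, i.e. the parent's flags are not a subset of the child's, so the parent still adds behaviour and is kept. A minimal userspace sketch of that idiom (the flag names are made up for illustration):

	#include <stdio.h>

	#define F_A (1u << 0)
	#define F_B (1u << 1)
	#define F_C (1u << 2)

	int main(void)
	{
		unsigned int cflags = F_A | F_B;	/* child's flags */
		unsigned int pflags = F_A | F_C;	/* parent's flags */

		/* Non-zero iff the parent has a flag the child does not,
		 * i.e. pflags is not a subset of cflags. */
		if (~cflags & pflags)
			printf("parent adds flags 0x%x, keep it\n",
			       ~cflags & pflags);
		return 0;
	}
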
6090 | 6086 | ||
6091 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | 6087 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) |
6092 | { | 6088 | { |
6093 | unsigned long flags; | 6089 | unsigned long flags; |
6094 | const struct sched_class *class; | 6090 | const struct sched_class *class; |
6095 | 6091 | ||
6096 | spin_lock_irqsave(&rq->lock, flags); | 6092 | spin_lock_irqsave(&rq->lock, flags); |
6097 | 6093 | ||
6098 | if (rq->rd) { | 6094 | if (rq->rd) { |
6099 | struct root_domain *old_rd = rq->rd; | 6095 | struct root_domain *old_rd = rq->rd; |
6100 | 6096 | ||
6101 | for (class = sched_class_highest; class; class = class->next) { | 6097 | for (class = sched_class_highest; class; class = class->next) { |
6102 | if (class->leave_domain) | 6098 | if (class->leave_domain) |
6103 | class->leave_domain(rq); | 6099 | class->leave_domain(rq); |
6104 | } | 6100 | } |
6105 | 6101 | ||
6106 | cpu_clear(rq->cpu, old_rd->span); | 6102 | cpu_clear(rq->cpu, old_rd->span); |
6107 | cpu_clear(rq->cpu, old_rd->online); | 6103 | cpu_clear(rq->cpu, old_rd->online); |
6108 | 6104 | ||
6109 | if (atomic_dec_and_test(&old_rd->refcount)) | 6105 | if (atomic_dec_and_test(&old_rd->refcount)) |
6110 | kfree(old_rd); | 6106 | kfree(old_rd); |
6111 | } | 6107 | } |
6112 | 6108 | ||
6113 | atomic_inc(&rd->refcount); | 6109 | atomic_inc(&rd->refcount); |
6114 | rq->rd = rd; | 6110 | rq->rd = rd; |
6115 | 6111 | ||
6116 | cpu_set(rq->cpu, rd->span); | 6112 | cpu_set(rq->cpu, rd->span); |
6117 | if (cpu_isset(rq->cpu, cpu_online_map)) | 6113 | if (cpu_isset(rq->cpu, cpu_online_map)) |
6118 | cpu_set(rq->cpu, rd->online); | 6114 | cpu_set(rq->cpu, rd->online); |
6119 | 6115 | ||
6120 | for (class = sched_class_highest; class; class = class->next) { | 6116 | for (class = sched_class_highest; class; class = class->next) { |
6121 | if (class->join_domain) | 6117 | if (class->join_domain) |
6122 | class->join_domain(rq); | 6118 | class->join_domain(rq); |
6123 | } | 6119 | } |
6124 | 6120 | ||
6125 | spin_unlock_irqrestore(&rq->lock, flags); | 6121 | spin_unlock_irqrestore(&rq->lock, flags); |
6126 | } | 6122 | } |
6127 | 6123 | ||
6128 | static void init_rootdomain(struct root_domain *rd) | 6124 | static void init_rootdomain(struct root_domain *rd) |
6129 | { | 6125 | { |
6130 | memset(rd, 0, sizeof(*rd)); | 6126 | memset(rd, 0, sizeof(*rd)); |
6131 | 6127 | ||
6132 | cpus_clear(rd->span); | 6128 | cpus_clear(rd->span); |
6133 | cpus_clear(rd->online); | 6129 | cpus_clear(rd->online); |
6134 | } | 6130 | } |
6135 | 6131 | ||
6136 | static void init_defrootdomain(void) | 6132 | static void init_defrootdomain(void) |
6137 | { | 6133 | { |
6138 | init_rootdomain(&def_root_domain); | 6134 | init_rootdomain(&def_root_domain); |
6139 | atomic_set(&def_root_domain.refcount, 1); | 6135 | atomic_set(&def_root_domain.refcount, 1); |
6140 | } | 6136 | } |
6141 | 6137 | ||
6142 | static struct root_domain *alloc_rootdomain(void) | 6138 | static struct root_domain *alloc_rootdomain(void) |
6143 | { | 6139 | { |
6144 | struct root_domain *rd; | 6140 | struct root_domain *rd; |
6145 | 6141 | ||
6146 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | 6142 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); |
6147 | if (!rd) | 6143 | if (!rd) |
6148 | return NULL; | 6144 | return NULL; |
6149 | 6145 | ||
6150 | init_rootdomain(rd); | 6146 | init_rootdomain(rd); |
6151 | 6147 | ||
6152 | return rd; | 6148 | return rd; |
6153 | } | 6149 | } |
6154 | 6150 | ||
6155 | /* | 6151 | /* |
6156 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6152 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
6157 | * hold the hotplug lock. | 6153 | * hold the hotplug lock. |
6158 | */ | 6154 | */ |
6159 | static void | 6155 | static void |
6160 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | 6156 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) |
6161 | { | 6157 | { |
6162 | struct rq *rq = cpu_rq(cpu); | 6158 | struct rq *rq = cpu_rq(cpu); |
6163 | struct sched_domain *tmp; | 6159 | struct sched_domain *tmp; |
6164 | 6160 | ||
6165 | /* Remove the sched domains which do not contribute to scheduling. */ | 6161 | /* Remove the sched domains which do not contribute to scheduling. */ |
6166 | for (tmp = sd; tmp; tmp = tmp->parent) { | 6162 | for (tmp = sd; tmp; tmp = tmp->parent) { |
6167 | struct sched_domain *parent = tmp->parent; | 6163 | struct sched_domain *parent = tmp->parent; |
6168 | if (!parent) | 6164 | if (!parent) |
6169 | break; | 6165 | break; |
6170 | if (sd_parent_degenerate(tmp, parent)) { | 6166 | if (sd_parent_degenerate(tmp, parent)) { |
6171 | tmp->parent = parent->parent; | 6167 | tmp->parent = parent->parent; |
6172 | if (parent->parent) | 6168 | if (parent->parent) |
6173 | parent->parent->child = tmp; | 6169 | parent->parent->child = tmp; |
6174 | } | 6170 | } |
6175 | } | 6171 | } |
6176 | 6172 | ||
6177 | if (sd && sd_degenerate(sd)) { | 6173 | if (sd && sd_degenerate(sd)) { |
6178 | sd = sd->parent; | 6174 | sd = sd->parent; |
6179 | if (sd) | 6175 | if (sd) |
6180 | sd->child = NULL; | 6176 | sd->child = NULL; |
6181 | } | 6177 | } |
6182 | 6178 | ||
6183 | sched_domain_debug(sd, cpu); | 6179 | sched_domain_debug(sd, cpu); |
6184 | 6180 | ||
6185 | rq_attach_root(rq, rd); | 6181 | rq_attach_root(rq, rd); |
6186 | rcu_assign_pointer(rq->sd, sd); | 6182 | rcu_assign_pointer(rq->sd, sd); |
6187 | } | 6183 | } |
6188 | 6184 | ||
6189 | /* cpus with isolated domains */ | 6185 | /* cpus with isolated domains */ |
6190 | static cpumask_t cpu_isolated_map = CPU_MASK_NONE; | 6186 | static cpumask_t cpu_isolated_map = CPU_MASK_NONE; |
6191 | 6187 | ||
6192 | /* Set up the mask of cpus configured for isolated domains */ | 6188 | /* Set up the mask of cpus configured for isolated domains */ |
6193 | static int __init isolated_cpu_setup(char *str) | 6189 | static int __init isolated_cpu_setup(char *str) |
6194 | { | 6190 | { |
6195 | int ints[NR_CPUS], i; | 6191 | int ints[NR_CPUS], i; |
6196 | 6192 | ||
6197 | str = get_options(str, ARRAY_SIZE(ints), ints); | 6193 | str = get_options(str, ARRAY_SIZE(ints), ints); |
6198 | cpus_clear(cpu_isolated_map); | 6194 | cpus_clear(cpu_isolated_map); |
6199 | for (i = 1; i <= ints[0]; i++) | 6195 | for (i = 1; i <= ints[0]; i++) |
6200 | if (ints[i] < NR_CPUS) | 6196 | if (ints[i] < NR_CPUS) |
6201 | cpu_set(ints[i], cpu_isolated_map); | 6197 | cpu_set(ints[i], cpu_isolated_map); |
6202 | return 1; | 6198 | return 1; |
6203 | } | 6199 | } |
6204 | 6200 | ||
6205 | __setup("isolcpus=", isolated_cpu_setup); | 6201 | __setup("isolcpus=", isolated_cpu_setup); |
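
The isolcpus= handler above takes a comma-separated CPU list parsed by get_options() and records it in cpu_isolated_map; arch_init_sched_domains() further down then masks those CPUs out of doms_cur with cpus_andnot(), so they never take part in load balancing. A rough userspace approximation of the parse-and-exclude step (strtok()/atoi() and a plain bitmask stand in for get_options() and cpumask_t; the 8-CPU size is invented):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define NR_CPUS 8	/* made-up size for the sketch */

	int main(void)
	{
		unsigned long isolated = 0;
		unsigned long online = (1UL << NR_CPUS) - 1;
		char arg[] = "2,3";		/* as in booting with isolcpus=2,3 */
		char *tok = strtok(arg, ",");

		while (tok) {
			int cpu = atoi(tok);
			if (cpu < NR_CPUS)
				isolated |= 1UL << cpu;	/* cpu_set() equivalent */
			tok = strtok(NULL, ",");
		}

		/* cpus_andnot(doms, online, isolated): isolated CPUs get no
		 * sched domain and are left alone by the load balancer. */
		printf("load-balanced cpus: 0x%lx\n", online & ~isolated);
		return 0;
	}
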
6206 | 6202 | ||
6207 | /* | 6203 | /* |
6208 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | 6204 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
6209 | * to a function which identifies what group (along with sched group) a CPU | 6205 | * to a function which identifies what group (along with sched group) a CPU |
6210 | * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS | 6206 | * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS |
6211 | * (due to the fact that we keep track of groups covered with a cpumask_t). | 6207 | * (due to the fact that we keep track of groups covered with a cpumask_t). |
6212 | * | 6208 | * |
6213 | * init_sched_build_groups will build a circular linked list of the groups | 6209 | * init_sched_build_groups will build a circular linked list of the groups |
6214 | * covered by the given span, and will set each group's ->cpumask correctly, | 6210 | * covered by the given span, and will set each group's ->cpumask correctly, |
6215 | * and ->cpu_power to 0. | 6211 | * and ->cpu_power to 0. |
6216 | */ | 6212 | */ |
6217 | static void | 6213 | static void |
6218 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | 6214 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, |
6219 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, | 6215 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
6220 | struct sched_group **sg)) | 6216 | struct sched_group **sg)) |
6221 | { | 6217 | { |
6222 | struct sched_group *first = NULL, *last = NULL; | 6218 | struct sched_group *first = NULL, *last = NULL; |
6223 | cpumask_t covered = CPU_MASK_NONE; | 6219 | cpumask_t covered = CPU_MASK_NONE; |
6224 | int i; | 6220 | int i; |
6225 | 6221 | ||
6226 | for_each_cpu_mask(i, span) { | 6222 | for_each_cpu_mask(i, span) { |
6227 | struct sched_group *sg; | 6223 | struct sched_group *sg; |
6228 | int group = group_fn(i, cpu_map, &sg); | 6224 | int group = group_fn(i, cpu_map, &sg); |
6229 | int j; | 6225 | int j; |
6230 | 6226 | ||
6231 | if (cpu_isset(i, covered)) | 6227 | if (cpu_isset(i, covered)) |
6232 | continue; | 6228 | continue; |
6233 | 6229 | ||
6234 | sg->cpumask = CPU_MASK_NONE; | 6230 | sg->cpumask = CPU_MASK_NONE; |
6235 | sg->__cpu_power = 0; | 6231 | sg->__cpu_power = 0; |
6236 | 6232 | ||
6237 | for_each_cpu_mask(j, span) { | 6233 | for_each_cpu_mask(j, span) { |
6238 | if (group_fn(j, cpu_map, NULL) != group) | 6234 | if (group_fn(j, cpu_map, NULL) != group) |
6239 | continue; | 6235 | continue; |
6240 | 6236 | ||
6241 | cpu_set(j, covered); | 6237 | cpu_set(j, covered); |
6242 | cpu_set(j, sg->cpumask); | 6238 | cpu_set(j, sg->cpumask); |
6243 | } | 6239 | } |
6244 | if (!first) | 6240 | if (!first) |
6245 | first = sg; | 6241 | first = sg; |
6246 | if (last) | 6242 | if (last) |
6247 | last->next = sg; | 6243 | last->next = sg; |
6248 | last = sg; | 6244 | last = sg; |
6249 | } | 6245 | } |
6250 | last->next = first; | 6246 | last->next = first; |
6251 | } | 6247 | } |
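
init_sched_build_groups() closes the ring with last->next = first, and every consumer in this file walks it with the same do/while-until-back-at-the-head idiom (see sched_domain_debug_one() and init_numa_sched_groups_power() elsewhere in this hunk). A self-contained toy version of that traversal, with a simplified struct standing in for struct sched_group:

	#include <stdio.h>

	/* Simplified stand-in for struct sched_group; illustration only. */
	struct group {
		int cpu;
		struct group *next;
	};

	int main(void)
	{
		struct group g[3] = { { .cpu = 0 }, { .cpu = 1 }, { .cpu = 2 } };
		struct group *head = &g[0], *sg;

		/* Close the ring the way init_sched_build_groups() does:
		 * last->next = first. */
		g[0].next = &g[1];
		g[1].next = &g[2];
		g[2].next = head;

		/* Walk until we are back at the head. */
		sg = head;
		do {
			printf("group starting at cpu %d\n", sg->cpu);
			sg = sg->next;
		} while (sg != head);

		return 0;
	}
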
6252 | 6248 | ||
6253 | #define SD_NODES_PER_DOMAIN 16 | 6249 | #define SD_NODES_PER_DOMAIN 16 |
6254 | 6250 | ||
6255 | #ifdef CONFIG_NUMA | 6251 | #ifdef CONFIG_NUMA |
6256 | 6252 | ||
6257 | /** | 6253 | /** |
6258 | * find_next_best_node - find the next node to include in a sched_domain | 6254 | * find_next_best_node - find the next node to include in a sched_domain |
6259 | * @node: node whose sched_domain we're building | 6255 | * @node: node whose sched_domain we're building |
6260 | * @used_nodes: nodes already in the sched_domain | 6256 | * @used_nodes: nodes already in the sched_domain |
6261 | * | 6257 | * |
6262 | * Find the next node to include in a given scheduling domain. Simply | 6258 | * Find the next node to include in a given scheduling domain. Simply |
6263 | * finds the closest node not already in the @used_nodes map. | 6259 | * finds the closest node not already in the @used_nodes map. |
6264 | * | 6260 | * |
6265 | * Should use nodemask_t. | 6261 | * Should use nodemask_t. |
6266 | */ | 6262 | */ |
6267 | static int find_next_best_node(int node, unsigned long *used_nodes) | 6263 | static int find_next_best_node(int node, unsigned long *used_nodes) |
6268 | { | 6264 | { |
6269 | int i, n, val, min_val, best_node = 0; | 6265 | int i, n, val, min_val, best_node = 0; |
6270 | 6266 | ||
6271 | min_val = INT_MAX; | 6267 | min_val = INT_MAX; |
6272 | 6268 | ||
6273 | for (i = 0; i < MAX_NUMNODES; i++) { | 6269 | for (i = 0; i < MAX_NUMNODES; i++) { |
6274 | /* Start at @node */ | 6270 | /* Start at @node */ |
6275 | n = (node + i) % MAX_NUMNODES; | 6271 | n = (node + i) % MAX_NUMNODES; |
6276 | 6272 | ||
6277 | if (!nr_cpus_node(n)) | 6273 | if (!nr_cpus_node(n)) |
6278 | continue; | 6274 | continue; |
6279 | 6275 | ||
6280 | /* Skip already used nodes */ | 6276 | /* Skip already used nodes */ |
6281 | if (test_bit(n, used_nodes)) | 6277 | if (test_bit(n, used_nodes)) |
6282 | continue; | 6278 | continue; |
6283 | 6279 | ||
6284 | /* Simple min distance search */ | 6280 | /* Simple min distance search */ |
6285 | val = node_distance(node, n); | 6281 | val = node_distance(node, n); |
6286 | 6282 | ||
6287 | if (val < min_val) { | 6283 | if (val < min_val) { |
6288 | min_val = val; | 6284 | min_val = val; |
6289 | best_node = n; | 6285 | best_node = n; |
6290 | } | 6286 | } |
6291 | } | 6287 | } |
6292 | 6288 | ||
6293 | set_bit(best_node, used_nodes); | 6289 | set_bit(best_node, used_nodes); |
6294 | return best_node; | 6290 | return best_node; |
6295 | } | 6291 | } |
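
find_next_best_node() is a plain greedy nearest-neighbour step: among the nodes not yet in used_nodes (and not empty of CPUs), pick the one with the smallest node_distance() and mark it used. A small userspace sketch of that rule with a made-up 4-node distance table (the wraparound scan order and the empty-node check are omitted for brevity):

	#include <stdio.h>
	#include <limits.h>

	#define MAX_NODES 4

	/* Made-up node_distance() table; illustration only. */
	static const int dist[MAX_NODES][MAX_NODES] = {
		{ 10, 20, 40, 30 },
		{ 20, 10, 30, 40 },
		{ 40, 30, 10, 20 },
		{ 30, 40, 20, 10 },
	};

	/* Same greedy rule: closest node not yet marked in *used. */
	static int next_best(int node, unsigned long *used)
	{
		int n, best = 0, min = INT_MAX;

		for (n = 0; n < MAX_NODES; n++) {
			if (*used & (1UL << n))
				continue;
			if (dist[node][n] < min) {
				min = dist[node][n];
				best = n;
			}
		}
		*used |= 1UL << best;
		return best;
	}

	int main(void)
	{
		unsigned long used = 1UL << 0;	/* node 0 is the starting node */
		int i;

		for (i = 1; i < MAX_NODES; i++)
			printf("next best node: %d\n", next_best(0, &used));
		return 0;	/* prints 1, 3, 2 for this table */
	}
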
6296 | 6292 | ||
6297 | /** | 6293 | /** |
6298 | * sched_domain_node_span - get a cpumask for a node's sched_domain | 6294 | * sched_domain_node_span - get a cpumask for a node's sched_domain |
6299 | * @node: node whose cpumask we're constructing | 6295 | * @node: node whose cpumask we're constructing |
6300 | * @size: number of nodes to include in this span | 6296 | * @size: number of nodes to include in this span |
6301 | * | 6297 | * |
6302 | * Given a node, construct a good cpumask for its sched_domain to span. It | 6298 | * Given a node, construct a good cpumask for its sched_domain to span. It |
6303 | * should be one that prevents unnecessary balancing, but also spreads tasks | 6299 | * should be one that prevents unnecessary balancing, but also spreads tasks |
6304 | * out optimally. | 6300 | * out optimally. |
6305 | */ | 6301 | */ |
6306 | static cpumask_t sched_domain_node_span(int node) | 6302 | static cpumask_t sched_domain_node_span(int node) |
6307 | { | 6303 | { |
6308 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 6304 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); |
6309 | cpumask_t span, nodemask; | 6305 | cpumask_t span, nodemask; |
6310 | int i; | 6306 | int i; |
6311 | 6307 | ||
6312 | cpus_clear(span); | 6308 | cpus_clear(span); |
6313 | bitmap_zero(used_nodes, MAX_NUMNODES); | 6309 | bitmap_zero(used_nodes, MAX_NUMNODES); |
6314 | 6310 | ||
6315 | nodemask = node_to_cpumask(node); | 6311 | nodemask = node_to_cpumask(node); |
6316 | cpus_or(span, span, nodemask); | 6312 | cpus_or(span, span, nodemask); |
6317 | set_bit(node, used_nodes); | 6313 | set_bit(node, used_nodes); |
6318 | 6314 | ||
6319 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6315 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6320 | int next_node = find_next_best_node(node, used_nodes); | 6316 | int next_node = find_next_best_node(node, used_nodes); |
6321 | 6317 | ||
6322 | nodemask = node_to_cpumask(next_node); | 6318 | nodemask = node_to_cpumask(next_node); |
6323 | cpus_or(span, span, nodemask); | 6319 | cpus_or(span, span, nodemask); |
6324 | } | 6320 | } |
6325 | 6321 | ||
6326 | return span; | 6322 | return span; |
6327 | } | 6323 | } |
6328 | #endif | 6324 | #endif |
6329 | 6325 | ||
6330 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6326 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6331 | 6327 | ||
6332 | /* | 6328 | /* |
6333 | * SMT sched-domains: | 6329 | * SMT sched-domains: |
6334 | */ | 6330 | */ |
6335 | #ifdef CONFIG_SCHED_SMT | 6331 | #ifdef CONFIG_SCHED_SMT |
6336 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6332 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
6337 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); | 6333 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
6338 | 6334 | ||
6339 | static int | 6335 | static int |
6340 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 6336 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) |
6341 | { | 6337 | { |
6342 | if (sg) | 6338 | if (sg) |
6343 | *sg = &per_cpu(sched_group_cpus, cpu); | 6339 | *sg = &per_cpu(sched_group_cpus, cpu); |
6344 | return cpu; | 6340 | return cpu; |
6345 | } | 6341 | } |
6346 | #endif | 6342 | #endif |
6347 | 6343 | ||
6348 | /* | 6344 | /* |
6349 | * multi-core sched-domains: | 6345 | * multi-core sched-domains: |
6350 | */ | 6346 | */ |
6351 | #ifdef CONFIG_SCHED_MC | 6347 | #ifdef CONFIG_SCHED_MC |
6352 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6348 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
6353 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); | 6349 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
6354 | #endif | 6350 | #endif |
6355 | 6351 | ||
6356 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6352 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6357 | static int | 6353 | static int |
6358 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 6354 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) |
6359 | { | 6355 | { |
6360 | int group; | 6356 | int group; |
6361 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 6357 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); |
6362 | cpus_and(mask, mask, *cpu_map); | 6358 | cpus_and(mask, mask, *cpu_map); |
6363 | group = first_cpu(mask); | 6359 | group = first_cpu(mask); |
6364 | if (sg) | 6360 | if (sg) |
6365 | *sg = &per_cpu(sched_group_core, group); | 6361 | *sg = &per_cpu(sched_group_core, group); |
6366 | return group; | 6362 | return group; |
6367 | } | 6363 | } |
6368 | #elif defined(CONFIG_SCHED_MC) | 6364 | #elif defined(CONFIG_SCHED_MC) |
6369 | static int | 6365 | static int |
6370 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 6366 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) |
6371 | { | 6367 | { |
6372 | if (sg) | 6368 | if (sg) |
6373 | *sg = &per_cpu(sched_group_core, cpu); | 6369 | *sg = &per_cpu(sched_group_core, cpu); |
6374 | return cpu; | 6370 | return cpu; |
6375 | } | 6371 | } |
6376 | #endif | 6372 | #endif |
6377 | 6373 | ||
6378 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6374 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
6379 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); | 6375 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
6380 | 6376 | ||
6381 | static int | 6377 | static int |
6382 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) | 6378 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) |
6383 | { | 6379 | { |
6384 | int group; | 6380 | int group; |
6385 | #ifdef CONFIG_SCHED_MC | 6381 | #ifdef CONFIG_SCHED_MC |
6386 | cpumask_t mask = cpu_coregroup_map(cpu); | 6382 | cpumask_t mask = cpu_coregroup_map(cpu); |
6387 | cpus_and(mask, mask, *cpu_map); | 6383 | cpus_and(mask, mask, *cpu_map); |
6388 | group = first_cpu(mask); | 6384 | group = first_cpu(mask); |
6389 | #elif defined(CONFIG_SCHED_SMT) | 6385 | #elif defined(CONFIG_SCHED_SMT) |
6390 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 6386 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); |
6391 | cpus_and(mask, mask, *cpu_map); | 6387 | cpus_and(mask, mask, *cpu_map); |
6392 | group = first_cpu(mask); | 6388 | group = first_cpu(mask); |
6393 | #else | 6389 | #else |
6394 | group = cpu; | 6390 | group = cpu; |
6395 | #endif | 6391 | #endif |
6396 | if (sg) | 6392 | if (sg) |
6397 | *sg = &per_cpu(sched_group_phys, group); | 6393 | *sg = &per_cpu(sched_group_phys, group); |
6398 | return group; | 6394 | return group; |
6399 | } | 6395 | } |
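
All three cpu_to_*_group() helpers above use the same trick: AND the sibling/core mask with the cpu_map being built and take first_cpu() of the result, so every CPU in the same sharing set resolves to one canonical group index. A tiny sketch of that selection (the mask values are invented):

	#include <stdio.h>

	int main(void)
	{
		unsigned long mask = (1UL << 4) | (1UL << 5);	/* CPUs 4,5 share a core */
		unsigned long cpu_map = 0x3f;			/* CPUs 0-5 being set up */
		int cpu;

		mask &= cpu_map;	/* cpus_and(mask, mask, *cpu_map) */

		/* first_cpu(mask): the lowest set bit is the group id, so both
		 * siblings end up pointing at the same sched_group. */
		for (cpu = 0; cpu < (int)(8 * sizeof(mask)); cpu++)
			if (mask & (1UL << cpu))
				break;

		printf("group = %d\n", cpu);	/* prints 4 */
		return 0;
	}
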
6400 | 6396 | ||
6401 | #ifdef CONFIG_NUMA | 6397 | #ifdef CONFIG_NUMA |
6402 | /* | 6398 | /* |
6403 | * init_sched_build_groups() can't handle what we want to do with node | 6399 | * init_sched_build_groups() can't handle what we want to do with node |
6404 | * groups, so roll our own. Now each node has its own list of groups which | 6400 | * groups, so roll our own. Now each node has its own list of groups which |
6405 | * gets dynamically allocated. | 6401 | * gets dynamically allocated. |
6406 | */ | 6402 | */ |
6407 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 6403 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
6408 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 6404 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
6409 | 6405 | ||
6410 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 6406 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
6411 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); | 6407 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
6412 | 6408 | ||
6413 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, | 6409 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
6414 | struct sched_group **sg) | 6410 | struct sched_group **sg) |
6415 | { | 6411 | { |
6416 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); | 6412 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); |
6417 | int group; | 6413 | int group; |
6418 | 6414 | ||
6419 | cpus_and(nodemask, nodemask, *cpu_map); | 6415 | cpus_and(nodemask, nodemask, *cpu_map); |
6420 | group = first_cpu(nodemask); | 6416 | group = first_cpu(nodemask); |
6421 | 6417 | ||
6422 | if (sg) | 6418 | if (sg) |
6423 | *sg = &per_cpu(sched_group_allnodes, group); | 6419 | *sg = &per_cpu(sched_group_allnodes, group); |
6424 | return group; | 6420 | return group; |
6425 | } | 6421 | } |
6426 | 6422 | ||
6427 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 6423 | static void init_numa_sched_groups_power(struct sched_group *group_head) |
6428 | { | 6424 | { |
6429 | struct sched_group *sg = group_head; | 6425 | struct sched_group *sg = group_head; |
6430 | int j; | 6426 | int j; |
6431 | 6427 | ||
6432 | if (!sg) | 6428 | if (!sg) |
6433 | return; | 6429 | return; |
6434 | do { | 6430 | do { |
6435 | for_each_cpu_mask(j, sg->cpumask) { | 6431 | for_each_cpu_mask(j, sg->cpumask) { |
6436 | struct sched_domain *sd; | 6432 | struct sched_domain *sd; |
6437 | 6433 | ||
6438 | sd = &per_cpu(phys_domains, j); | 6434 | sd = &per_cpu(phys_domains, j); |
6439 | if (j != first_cpu(sd->groups->cpumask)) { | 6435 | if (j != first_cpu(sd->groups->cpumask)) { |
6440 | /* | 6436 | /* |
6441 | * Only add "power" once for each | 6437 | * Only add "power" once for each |
6442 | * physical package. | 6438 | * physical package. |
6443 | */ | 6439 | */ |
6444 | continue; | 6440 | continue; |
6445 | } | 6441 | } |
6446 | 6442 | ||
6447 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 6443 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); |
6448 | } | 6444 | } |
6449 | sg = sg->next; | 6445 | sg = sg->next; |
6450 | } while (sg != group_head); | 6446 | } while (sg != group_head); |
6451 | } | 6447 | } |
6452 | #endif | 6448 | #endif |
6453 | 6449 | ||
6454 | #ifdef CONFIG_NUMA | 6450 | #ifdef CONFIG_NUMA |
6455 | /* Free memory allocated for various sched_group structures */ | 6451 | /* Free memory allocated for various sched_group structures */ |
6456 | static void free_sched_groups(const cpumask_t *cpu_map) | 6452 | static void free_sched_groups(const cpumask_t *cpu_map) |
6457 | { | 6453 | { |
6458 | int cpu, i; | 6454 | int cpu, i; |
6459 | 6455 | ||
6460 | for_each_cpu_mask(cpu, *cpu_map) { | 6456 | for_each_cpu_mask(cpu, *cpu_map) { |
6461 | struct sched_group **sched_group_nodes | 6457 | struct sched_group **sched_group_nodes |
6462 | = sched_group_nodes_bycpu[cpu]; | 6458 | = sched_group_nodes_bycpu[cpu]; |
6463 | 6459 | ||
6464 | if (!sched_group_nodes) | 6460 | if (!sched_group_nodes) |
6465 | continue; | 6461 | continue; |
6466 | 6462 | ||
6467 | for (i = 0; i < MAX_NUMNODES; i++) { | 6463 | for (i = 0; i < MAX_NUMNODES; i++) { |
6468 | cpumask_t nodemask = node_to_cpumask(i); | 6464 | cpumask_t nodemask = node_to_cpumask(i); |
6469 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 6465 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
6470 | 6466 | ||
6471 | cpus_and(nodemask, nodemask, *cpu_map); | 6467 | cpus_and(nodemask, nodemask, *cpu_map); |
6472 | if (cpus_empty(nodemask)) | 6468 | if (cpus_empty(nodemask)) |
6473 | continue; | 6469 | continue; |
6474 | 6470 | ||
6475 | if (sg == NULL) | 6471 | if (sg == NULL) |
6476 | continue; | 6472 | continue; |
6477 | sg = sg->next; | 6473 | sg = sg->next; |
6478 | next_sg: | 6474 | next_sg: |
6479 | oldsg = sg; | 6475 | oldsg = sg; |
6480 | sg = sg->next; | 6476 | sg = sg->next; |
6481 | kfree(oldsg); | 6477 | kfree(oldsg); |
6482 | if (oldsg != sched_group_nodes[i]) | 6478 | if (oldsg != sched_group_nodes[i]) |
6483 | goto next_sg; | 6479 | goto next_sg; |
6484 | } | 6480 | } |
6485 | kfree(sched_group_nodes); | 6481 | kfree(sched_group_nodes); |
6486 | sched_group_nodes_bycpu[cpu] = NULL; | 6482 | sched_group_nodes_bycpu[cpu] = NULL; |
6487 | } | 6483 | } |
6488 | } | 6484 | } |
6489 | #else | 6485 | #else |
6490 | static void free_sched_groups(const cpumask_t *cpu_map) | 6486 | static void free_sched_groups(const cpumask_t *cpu_map) |
6491 | { | 6487 | { |
6492 | } | 6488 | } |
6493 | #endif | 6489 | #endif |
6494 | 6490 | ||
6495 | /* | 6491 | /* |
6496 | * Initialize sched groups cpu_power. | 6492 | * Initialize sched groups cpu_power. |
6497 | * | 6493 | * |
6498 | * cpu_power indicates the capacity of a sched group, which is used while | 6494 | * cpu_power indicates the capacity of a sched group, which is used while |
6499 | * distributing the load between different sched groups in a sched domain. | 6495 | * distributing the load between different sched groups in a sched domain. |
6500 | * Typically cpu_power for all the groups in a sched domain will be the same unless | 6496 | * Typically cpu_power for all the groups in a sched domain will be the same unless |
6501 | * there are asymmetries in the topology. If there are asymmetries, the group | 6497 | * there are asymmetries in the topology. If there are asymmetries, the group |
6502 | * having more cpu_power will pick up more load compared to the group having | 6498 | * having more cpu_power will pick up more load compared to the group having |
6503 | * less cpu_power. | 6499 | * less cpu_power. |
6504 | * | 6500 | * |
6505 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | 6501 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents |
6506 | * the maximum number of tasks a group can handle in the presence of other idle | 6502 | * the maximum number of tasks a group can handle in the presence of other idle |
6507 | * or lightly loaded groups in the same sched domain. | 6503 | * or lightly loaded groups in the same sched domain. |
6508 | */ | 6504 | */ |
6509 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 6505 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
6510 | { | 6506 | { |
6511 | struct sched_domain *child; | 6507 | struct sched_domain *child; |
6512 | struct sched_group *group; | 6508 | struct sched_group *group; |
6513 | 6509 | ||
6514 | WARN_ON(!sd || !sd->groups); | 6510 | WARN_ON(!sd || !sd->groups); |
6515 | 6511 | ||
6516 | if (cpu != first_cpu(sd->groups->cpumask)) | 6512 | if (cpu != first_cpu(sd->groups->cpumask)) |
6517 | return; | 6513 | return; |
6518 | 6514 | ||
6519 | child = sd->child; | 6515 | child = sd->child; |
6520 | 6516 | ||
6521 | sd->groups->__cpu_power = 0; | 6517 | sd->groups->__cpu_power = 0; |
6522 | 6518 | ||
6523 | /* | 6519 | /* |
6524 | * For perf policy, if the groups in the child domain share resources | 6520 | * For perf policy, if the groups in the child domain share resources |
6525 | * (for example cores sharing some portions of the cache hierarchy | 6521 | * (for example cores sharing some portions of the cache hierarchy |
6526 | * or SMT), then set this domain's group cpu_power such that each group | 6522 | * or SMT), then set this domain's group cpu_power such that each group |
6527 | * can handle only one task, when there are other idle groups in the | 6523 | * can handle only one task, when there are other idle groups in the |
6528 | * same sched domain. | 6524 | * same sched domain. |
6529 | */ | 6525 | */ |
6530 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 6526 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && |
6531 | (child->flags & | 6527 | (child->flags & |
6532 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 6528 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { |
6533 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); | 6529 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); |
6534 | return; | 6530 | return; |
6535 | } | 6531 | } |
6536 | 6532 | ||
6537 | /* | 6533 | /* |
6538 | * add the cpu_power of each child group to this group's cpu_power | 6534 | * add the cpu_power of each child group to this group's cpu_power |
6539 | */ | 6535 | */ |
6540 | group = child->groups; | 6536 | group = child->groups; |
6541 | do { | 6537 | do { |
6542 | sg_inc_cpu_power(sd->groups, group->__cpu_power); | 6538 | sg_inc_cpu_power(sd->groups, group->__cpu_power); |
6543 | group = group->next; | 6539 | group = group->next; |
6544 | } while (group != child->groups); | 6540 | } while (group != child->groups); |
6545 | } | 6541 | } |
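
Concretely (assuming the usual SCHED_LOAD_SCALE of 1024, which is not visible in this hunk): under the default perf policy a group whose child domain shares CPU power or package resources is pinned to one task's worth of capacity, while a power-savings configuration, or a level whose children share nothing, gets the sum computed by the do/while loop at the end of the function. A toy calculation:

	#include <stdio.h>

	#define SCHED_LOAD_SCALE 1024	/* assumed value, for illustration */

	int main(void)
	{
		/* Child domain shares CPU power / package resources (SMT
		 * siblings, cores sharing cache): cap at one task. */
		unsigned int capped_group = SCHED_LOAD_SCALE;

		/* Otherwise the group's power is the sum of its child groups,
		 * e.g. two children worth 1024 each. */
		unsigned int summed_group = 2 * SCHED_LOAD_SCALE;

		printf("capped: %u, summed: %u\n", capped_group, summed_group);
		return 0;
	}
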
6546 | 6542 | ||
6547 | /* | 6543 | /* |
6548 | * Build sched domains for a given set of cpus and attach the sched domains | 6544 | * Build sched domains for a given set of cpus and attach the sched domains |
6549 | * to the individual cpus | 6545 | * to the individual cpus |
6550 | */ | 6546 | */ |
6551 | static int build_sched_domains(const cpumask_t *cpu_map) | 6547 | static int build_sched_domains(const cpumask_t *cpu_map) |
6552 | { | 6548 | { |
6553 | int i; | 6549 | int i; |
6554 | struct root_domain *rd; | 6550 | struct root_domain *rd; |
6555 | #ifdef CONFIG_NUMA | 6551 | #ifdef CONFIG_NUMA |
6556 | struct sched_group **sched_group_nodes = NULL; | 6552 | struct sched_group **sched_group_nodes = NULL; |
6557 | int sd_allnodes = 0; | 6553 | int sd_allnodes = 0; |
6558 | 6554 | ||
6559 | /* | 6555 | /* |
6560 | * Allocate the per-node list of sched groups | 6556 | * Allocate the per-node list of sched groups |
6561 | */ | 6557 | */ |
6562 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), | 6558 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), |
6563 | GFP_KERNEL); | 6559 | GFP_KERNEL); |
6564 | if (!sched_group_nodes) { | 6560 | if (!sched_group_nodes) { |
6565 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6561 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
6566 | return -ENOMEM; | 6562 | return -ENOMEM; |
6567 | } | 6563 | } |
6568 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6564 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
6569 | #endif | 6565 | #endif |
6570 | 6566 | ||
6571 | rd = alloc_rootdomain(); | 6567 | rd = alloc_rootdomain(); |
6572 | if (!rd) { | 6568 | if (!rd) { |
6573 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 6569 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
6574 | return -ENOMEM; | 6570 | return -ENOMEM; |
6575 | } | 6571 | } |
6576 | 6572 | ||
6577 | /* | 6573 | /* |
6578 | * Set up domains for cpus specified by the cpu_map. | 6574 | * Set up domains for cpus specified by the cpu_map. |
6579 | */ | 6575 | */ |
6580 | for_each_cpu_mask(i, *cpu_map) { | 6576 | for_each_cpu_mask(i, *cpu_map) { |
6581 | struct sched_domain *sd = NULL, *p; | 6577 | struct sched_domain *sd = NULL, *p; |
6582 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 6578 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
6583 | 6579 | ||
6584 | cpus_and(nodemask, nodemask, *cpu_map); | 6580 | cpus_and(nodemask, nodemask, *cpu_map); |
6585 | 6581 | ||
6586 | #ifdef CONFIG_NUMA | 6582 | #ifdef CONFIG_NUMA |
6587 | if (cpus_weight(*cpu_map) > | 6583 | if (cpus_weight(*cpu_map) > |
6588 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 6584 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
6589 | sd = &per_cpu(allnodes_domains, i); | 6585 | sd = &per_cpu(allnodes_domains, i); |
6590 | *sd = SD_ALLNODES_INIT; | 6586 | *sd = SD_ALLNODES_INIT; |
6591 | sd->span = *cpu_map; | 6587 | sd->span = *cpu_map; |
6592 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); | 6588 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); |
6593 | p = sd; | 6589 | p = sd; |
6594 | sd_allnodes = 1; | 6590 | sd_allnodes = 1; |
6595 | } else | 6591 | } else |
6596 | p = NULL; | 6592 | p = NULL; |
6597 | 6593 | ||
6598 | sd = &per_cpu(node_domains, i); | 6594 | sd = &per_cpu(node_domains, i); |
6599 | *sd = SD_NODE_INIT; | 6595 | *sd = SD_NODE_INIT; |
6600 | sd->span = sched_domain_node_span(cpu_to_node(i)); | 6596 | sd->span = sched_domain_node_span(cpu_to_node(i)); |
6601 | sd->parent = p; | 6597 | sd->parent = p; |
6602 | if (p) | 6598 | if (p) |
6603 | p->child = sd; | 6599 | p->child = sd; |
6604 | cpus_and(sd->span, sd->span, *cpu_map); | 6600 | cpus_and(sd->span, sd->span, *cpu_map); |
6605 | #endif | 6601 | #endif |
6606 | 6602 | ||
6607 | p = sd; | 6603 | p = sd; |
6608 | sd = &per_cpu(phys_domains, i); | 6604 | sd = &per_cpu(phys_domains, i); |
6609 | *sd = SD_CPU_INIT; | 6605 | *sd = SD_CPU_INIT; |
6610 | sd->span = nodemask; | 6606 | sd->span = nodemask; |
6611 | sd->parent = p; | 6607 | sd->parent = p; |
6612 | if (p) | 6608 | if (p) |
6613 | p->child = sd; | 6609 | p->child = sd; |
6614 | cpu_to_phys_group(i, cpu_map, &sd->groups); | 6610 | cpu_to_phys_group(i, cpu_map, &sd->groups); |
6615 | 6611 | ||
6616 | #ifdef CONFIG_SCHED_MC | 6612 | #ifdef CONFIG_SCHED_MC |
6617 | p = sd; | 6613 | p = sd; |
6618 | sd = &per_cpu(core_domains, i); | 6614 | sd = &per_cpu(core_domains, i); |
6619 | *sd = SD_MC_INIT; | 6615 | *sd = SD_MC_INIT; |
6620 | sd->span = cpu_coregroup_map(i); | 6616 | sd->span = cpu_coregroup_map(i); |
6621 | cpus_and(sd->span, sd->span, *cpu_map); | 6617 | cpus_and(sd->span, sd->span, *cpu_map); |
6622 | sd->parent = p; | 6618 | sd->parent = p; |
6623 | p->child = sd; | 6619 | p->child = sd; |
6624 | cpu_to_core_group(i, cpu_map, &sd->groups); | 6620 | cpu_to_core_group(i, cpu_map, &sd->groups); |
6625 | #endif | 6621 | #endif |
6626 | 6622 | ||
6627 | #ifdef CONFIG_SCHED_SMT | 6623 | #ifdef CONFIG_SCHED_SMT |
6628 | p = sd; | 6624 | p = sd; |
6629 | sd = &per_cpu(cpu_domains, i); | 6625 | sd = &per_cpu(cpu_domains, i); |
6630 | *sd = SD_SIBLING_INIT; | 6626 | *sd = SD_SIBLING_INIT; |
6631 | sd->span = per_cpu(cpu_sibling_map, i); | 6627 | sd->span = per_cpu(cpu_sibling_map, i); |
6632 | cpus_and(sd->span, sd->span, *cpu_map); | 6628 | cpus_and(sd->span, sd->span, *cpu_map); |
6633 | sd->parent = p; | 6629 | sd->parent = p; |
6634 | p->child = sd; | 6630 | p->child = sd; |
6635 | cpu_to_cpu_group(i, cpu_map, &sd->groups); | 6631 | cpu_to_cpu_group(i, cpu_map, &sd->groups); |
6636 | #endif | 6632 | #endif |
6637 | } | 6633 | } |
6638 | 6634 | ||
6639 | #ifdef CONFIG_SCHED_SMT | 6635 | #ifdef CONFIG_SCHED_SMT |
6640 | /* Set up CPU (sibling) groups */ | 6636 | /* Set up CPU (sibling) groups */ |
6641 | for_each_cpu_mask(i, *cpu_map) { | 6637 | for_each_cpu_mask(i, *cpu_map) { |
6642 | cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); | 6638 | cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); |
6643 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 6639 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
6644 | if (i != first_cpu(this_sibling_map)) | 6640 | if (i != first_cpu(this_sibling_map)) |
6645 | continue; | 6641 | continue; |
6646 | 6642 | ||
6647 | init_sched_build_groups(this_sibling_map, cpu_map, | 6643 | init_sched_build_groups(this_sibling_map, cpu_map, |
6648 | &cpu_to_cpu_group); | 6644 | &cpu_to_cpu_group); |
6649 | } | 6645 | } |
6650 | #endif | 6646 | #endif |
6651 | 6647 | ||
6652 | #ifdef CONFIG_SCHED_MC | 6648 | #ifdef CONFIG_SCHED_MC |
6653 | /* Set up multi-core groups */ | 6649 | /* Set up multi-core groups */ |
6654 | for_each_cpu_mask(i, *cpu_map) { | 6650 | for_each_cpu_mask(i, *cpu_map) { |
6655 | cpumask_t this_core_map = cpu_coregroup_map(i); | 6651 | cpumask_t this_core_map = cpu_coregroup_map(i); |
6656 | cpus_and(this_core_map, this_core_map, *cpu_map); | 6652 | cpus_and(this_core_map, this_core_map, *cpu_map); |
6657 | if (i != first_cpu(this_core_map)) | 6653 | if (i != first_cpu(this_core_map)) |
6658 | continue; | 6654 | continue; |
6659 | init_sched_build_groups(this_core_map, cpu_map, | 6655 | init_sched_build_groups(this_core_map, cpu_map, |
6660 | &cpu_to_core_group); | 6656 | &cpu_to_core_group); |
6661 | } | 6657 | } |
6662 | #endif | 6658 | #endif |
6663 | 6659 | ||
6664 | /* Set up physical groups */ | 6660 | /* Set up physical groups */ |
6665 | for (i = 0; i < MAX_NUMNODES; i++) { | 6661 | for (i = 0; i < MAX_NUMNODES; i++) { |
6666 | cpumask_t nodemask = node_to_cpumask(i); | 6662 | cpumask_t nodemask = node_to_cpumask(i); |
6667 | 6663 | ||
6668 | cpus_and(nodemask, nodemask, *cpu_map); | 6664 | cpus_and(nodemask, nodemask, *cpu_map); |
6669 | if (cpus_empty(nodemask)) | 6665 | if (cpus_empty(nodemask)) |
6670 | continue; | 6666 | continue; |
6671 | 6667 | ||
6672 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); | 6668 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); |
6673 | } | 6669 | } |
6674 | 6670 | ||
6675 | #ifdef CONFIG_NUMA | 6671 | #ifdef CONFIG_NUMA |
6676 | /* Set up node groups */ | 6672 | /* Set up node groups */ |
6677 | if (sd_allnodes) | 6673 | if (sd_allnodes) |
6678 | init_sched_build_groups(*cpu_map, cpu_map, | 6674 | init_sched_build_groups(*cpu_map, cpu_map, |
6679 | &cpu_to_allnodes_group); | 6675 | &cpu_to_allnodes_group); |
6680 | 6676 | ||
6681 | for (i = 0; i < MAX_NUMNODES; i++) { | 6677 | for (i = 0; i < MAX_NUMNODES; i++) { |
6682 | /* Set up node groups */ | 6678 | /* Set up node groups */ |
6683 | struct sched_group *sg, *prev; | 6679 | struct sched_group *sg, *prev; |
6684 | cpumask_t nodemask = node_to_cpumask(i); | 6680 | cpumask_t nodemask = node_to_cpumask(i); |
6685 | cpumask_t domainspan; | 6681 | cpumask_t domainspan; |
6686 | cpumask_t covered = CPU_MASK_NONE; | 6682 | cpumask_t covered = CPU_MASK_NONE; |
6687 | int j; | 6683 | int j; |
6688 | 6684 | ||
6689 | cpus_and(nodemask, nodemask, *cpu_map); | 6685 | cpus_and(nodemask, nodemask, *cpu_map); |
6690 | if (cpus_empty(nodemask)) { | 6686 | if (cpus_empty(nodemask)) { |
6691 | sched_group_nodes[i] = NULL; | 6687 | sched_group_nodes[i] = NULL; |
6692 | continue; | 6688 | continue; |
6693 | } | 6689 | } |
6694 | 6690 | ||
6695 | domainspan = sched_domain_node_span(i); | 6691 | domainspan = sched_domain_node_span(i); |
6696 | cpus_and(domainspan, domainspan, *cpu_map); | 6692 | cpus_and(domainspan, domainspan, *cpu_map); |
6697 | 6693 | ||
6698 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); | 6694 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
6699 | if (!sg) { | 6695 | if (!sg) { |
6700 | printk(KERN_WARNING "Can not alloc domain group for " | 6696 | printk(KERN_WARNING "Can not alloc domain group for " |
6701 | "node %d\n", i); | 6697 | "node %d\n", i); |
6702 | goto error; | 6698 | goto error; |
6703 | } | 6699 | } |
6704 | sched_group_nodes[i] = sg; | 6700 | sched_group_nodes[i] = sg; |
6705 | for_each_cpu_mask(j, nodemask) { | 6701 | for_each_cpu_mask(j, nodemask) { |
6706 | struct sched_domain *sd; | 6702 | struct sched_domain *sd; |
6707 | 6703 | ||
6708 | sd = &per_cpu(node_domains, j); | 6704 | sd = &per_cpu(node_domains, j); |
6709 | sd->groups = sg; | 6705 | sd->groups = sg; |
6710 | } | 6706 | } |
6711 | sg->__cpu_power = 0; | 6707 | sg->__cpu_power = 0; |
6712 | sg->cpumask = nodemask; | 6708 | sg->cpumask = nodemask; |
6713 | sg->next = sg; | 6709 | sg->next = sg; |
6714 | cpus_or(covered, covered, nodemask); | 6710 | cpus_or(covered, covered, nodemask); |
6715 | prev = sg; | 6711 | prev = sg; |
6716 | 6712 | ||
6717 | for (j = 0; j < MAX_NUMNODES; j++) { | 6713 | for (j = 0; j < MAX_NUMNODES; j++) { |
6718 | cpumask_t tmp, notcovered; | 6714 | cpumask_t tmp, notcovered; |
6719 | int n = (i + j) % MAX_NUMNODES; | 6715 | int n = (i + j) % MAX_NUMNODES; |
6720 | 6716 | ||
6721 | cpus_complement(notcovered, covered); | 6717 | cpus_complement(notcovered, covered); |
6722 | cpus_and(tmp, notcovered, *cpu_map); | 6718 | cpus_and(tmp, notcovered, *cpu_map); |
6723 | cpus_and(tmp, tmp, domainspan); | 6719 | cpus_and(tmp, tmp, domainspan); |
6724 | if (cpus_empty(tmp)) | 6720 | if (cpus_empty(tmp)) |
6725 | break; | 6721 | break; |
6726 | 6722 | ||
6727 | nodemask = node_to_cpumask(n); | 6723 | nodemask = node_to_cpumask(n); |
6728 | cpus_and(tmp, tmp, nodemask); | 6724 | cpus_and(tmp, tmp, nodemask); |
6729 | if (cpus_empty(tmp)) | 6725 | if (cpus_empty(tmp)) |
6730 | continue; | 6726 | continue; |
6731 | 6727 | ||
6732 | sg = kmalloc_node(sizeof(struct sched_group), | 6728 | sg = kmalloc_node(sizeof(struct sched_group), |
6733 | GFP_KERNEL, i); | 6729 | GFP_KERNEL, i); |
6734 | if (!sg) { | 6730 | if (!sg) { |
6735 | printk(KERN_WARNING | 6731 | printk(KERN_WARNING |
6736 | "Can not alloc domain group for node %d\n", j); | 6732 | "Can not alloc domain group for node %d\n", j); |
6737 | goto error; | 6733 | goto error; |
6738 | } | 6734 | } |
6739 | sg->__cpu_power = 0; | 6735 | sg->__cpu_power = 0; |
6740 | sg->cpumask = tmp; | 6736 | sg->cpumask = tmp; |
6741 | sg->next = prev->next; | 6737 | sg->next = prev->next; |
6742 | cpus_or(covered, covered, tmp); | 6738 | cpus_or(covered, covered, tmp); |
6743 | prev->next = sg; | 6739 | prev->next = sg; |
6744 | prev = sg; | 6740 | prev = sg; |
6745 | } | 6741 | } |
6746 | } | 6742 | } |
6747 | #endif | 6743 | #endif |
6748 | 6744 | ||
6749 | /* Calculate CPU power for physical packages and nodes */ | 6745 | /* Calculate CPU power for physical packages and nodes */ |
6750 | #ifdef CONFIG_SCHED_SMT | 6746 | #ifdef CONFIG_SCHED_SMT |
6751 | for_each_cpu_mask(i, *cpu_map) { | 6747 | for_each_cpu_mask(i, *cpu_map) { |
6752 | struct sched_domain *sd = &per_cpu(cpu_domains, i); | 6748 | struct sched_domain *sd = &per_cpu(cpu_domains, i); |
6753 | 6749 | ||
6754 | init_sched_groups_power(i, sd); | 6750 | init_sched_groups_power(i, sd); |
6755 | } | 6751 | } |
6756 | #endif | 6752 | #endif |
6757 | #ifdef CONFIG_SCHED_MC | 6753 | #ifdef CONFIG_SCHED_MC |
6758 | for_each_cpu_mask(i, *cpu_map) { | 6754 | for_each_cpu_mask(i, *cpu_map) { |
6759 | struct sched_domain *sd = &per_cpu(core_domains, i); | 6755 | struct sched_domain *sd = &per_cpu(core_domains, i); |
6760 | 6756 | ||
6761 | init_sched_groups_power(i, sd); | 6757 | init_sched_groups_power(i, sd); |
6762 | } | 6758 | } |
6763 | #endif | 6759 | #endif |
6764 | 6760 | ||
6765 | for_each_cpu_mask(i, *cpu_map) { | 6761 | for_each_cpu_mask(i, *cpu_map) { |
6766 | struct sched_domain *sd = &per_cpu(phys_domains, i); | 6762 | struct sched_domain *sd = &per_cpu(phys_domains, i); |
6767 | 6763 | ||
6768 | init_sched_groups_power(i, sd); | 6764 | init_sched_groups_power(i, sd); |
6769 | } | 6765 | } |
6770 | 6766 | ||
6771 | #ifdef CONFIG_NUMA | 6767 | #ifdef CONFIG_NUMA |
6772 | for (i = 0; i < MAX_NUMNODES; i++) | 6768 | for (i = 0; i < MAX_NUMNODES; i++) |
6773 | init_numa_sched_groups_power(sched_group_nodes[i]); | 6769 | init_numa_sched_groups_power(sched_group_nodes[i]); |
6774 | 6770 | ||
6775 | if (sd_allnodes) { | 6771 | if (sd_allnodes) { |
6776 | struct sched_group *sg; | 6772 | struct sched_group *sg; |
6777 | 6773 | ||
6778 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | 6774 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); |
6779 | init_numa_sched_groups_power(sg); | 6775 | init_numa_sched_groups_power(sg); |
6780 | } | 6776 | } |
6781 | #endif | 6777 | #endif |
6782 | 6778 | ||
6783 | /* Attach the domains */ | 6779 | /* Attach the domains */ |
6784 | for_each_cpu_mask(i, *cpu_map) { | 6780 | for_each_cpu_mask(i, *cpu_map) { |
6785 | struct sched_domain *sd; | 6781 | struct sched_domain *sd; |
6786 | #ifdef CONFIG_SCHED_SMT | 6782 | #ifdef CONFIG_SCHED_SMT |
6787 | sd = &per_cpu(cpu_domains, i); | 6783 | sd = &per_cpu(cpu_domains, i); |
6788 | #elif defined(CONFIG_SCHED_MC) | 6784 | #elif defined(CONFIG_SCHED_MC) |
6789 | sd = &per_cpu(core_domains, i); | 6785 | sd = &per_cpu(core_domains, i); |
6790 | #else | 6786 | #else |
6791 | sd = &per_cpu(phys_domains, i); | 6787 | sd = &per_cpu(phys_domains, i); |
6792 | #endif | 6788 | #endif |
6793 | cpu_attach_domain(sd, rd, i); | 6789 | cpu_attach_domain(sd, rd, i); |
6794 | } | 6790 | } |
6795 | 6791 | ||
6796 | return 0; | 6792 | return 0; |
6797 | 6793 | ||
6798 | #ifdef CONFIG_NUMA | 6794 | #ifdef CONFIG_NUMA |
6799 | error: | 6795 | error: |
6800 | free_sched_groups(cpu_map); | 6796 | free_sched_groups(cpu_map); |
6801 | return -ENOMEM; | 6797 | return -ENOMEM; |
6802 | #endif | 6798 | #endif |
6803 | } | 6799 | } |
6804 | 6800 | ||
6805 | static cpumask_t *doms_cur; /* current sched domains */ | 6801 | static cpumask_t *doms_cur; /* current sched domains */ |
6806 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | 6802 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ |
6807 | 6803 | ||
6808 | /* | 6804 | /* |
6809 | * Special case: If a kmalloc of a doms_cur partition (array of | 6805 | * Special case: If a kmalloc of a doms_cur partition (array of |
6810 | * cpumask_t) fails, then fallback to a single sched domain, | 6806 | * cpumask_t) fails, then fallback to a single sched domain, |
6811 | * as determined by the single cpumask_t fallback_doms. | 6807 | * as determined by the single cpumask_t fallback_doms. |
6812 | */ | 6808 | */ |
6813 | static cpumask_t fallback_doms; | 6809 | static cpumask_t fallback_doms; |
6814 | 6810 | ||
6815 | /* | 6811 | /* |
6816 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6812 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
6817 | * For now this just excludes isolated cpus, but could be used to | 6813 | * For now this just excludes isolated cpus, but could be used to |
6818 | * exclude other special cases in the future. | 6814 | * exclude other special cases in the future. |
6819 | */ | 6815 | */ |
6820 | static int arch_init_sched_domains(const cpumask_t *cpu_map) | 6816 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
6821 | { | 6817 | { |
6822 | int err; | 6818 | int err; |
6823 | 6819 | ||
6824 | ndoms_cur = 1; | 6820 | ndoms_cur = 1; |
6825 | doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 6821 | doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
6826 | if (!doms_cur) | 6822 | if (!doms_cur) |
6827 | doms_cur = &fallback_doms; | 6823 | doms_cur = &fallback_doms; |
6828 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); | 6824 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); |
6829 | err = build_sched_domains(doms_cur); | 6825 | err = build_sched_domains(doms_cur); |
6830 | register_sched_domain_sysctl(); | 6826 | register_sched_domain_sysctl(); |
6831 | 6827 | ||
6832 | return err; | 6828 | return err; |
6833 | } | 6829 | } |
6834 | 6830 | ||
6835 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6831 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
6836 | { | 6832 | { |
6837 | free_sched_groups(cpu_map); | 6833 | free_sched_groups(cpu_map); |
6838 | } | 6834 | } |
6839 | 6835 | ||
6840 | /* | 6836 | /* |
6841 | * Detach sched domains from a group of cpus specified in cpu_map. | 6837 | * Detach sched domains from a group of cpus specified in cpu_map. |
6842 | * These cpus will now be attached to the NULL domain. | 6838 | * These cpus will now be attached to the NULL domain. |
6843 | */ | 6839 | */ |
6844 | static void detach_destroy_domains(const cpumask_t *cpu_map) | 6840 | static void detach_destroy_domains(const cpumask_t *cpu_map) |
6845 | { | 6841 | { |
6846 | int i; | 6842 | int i; |
6847 | 6843 | ||
6848 | unregister_sched_domain_sysctl(); | 6844 | unregister_sched_domain_sysctl(); |
6849 | 6845 | ||
6850 | for_each_cpu_mask(i, *cpu_map) | 6846 | for_each_cpu_mask(i, *cpu_map) |
6851 | cpu_attach_domain(NULL, &def_root_domain, i); | 6847 | cpu_attach_domain(NULL, &def_root_domain, i); |
6852 | synchronize_sched(); | 6848 | synchronize_sched(); |
6853 | arch_destroy_sched_domains(cpu_map); | 6849 | arch_destroy_sched_domains(cpu_map); |
6854 | } | 6850 | } |
6855 | 6851 | ||
6856 | /* | 6852 | /* |
6857 | * Partition sched domains as specified by the 'ndoms_new' | 6853 | * Partition sched domains as specified by the 'ndoms_new' |
6858 | * cpumasks in the array doms_new[] of cpumasks. This compares | 6854 | * cpumasks in the array doms_new[] of cpumasks. This compares |
6859 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | 6855 | * doms_new[] to the current sched domain partitioning, doms_cur[]. |
6860 | * It destroys each deleted domain and builds each new domain. | 6856 | * It destroys each deleted domain and builds each new domain. |
6861 | * | 6857 | * |
6862 | * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. | 6858 | * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. |
6863 | * The masks don't intersect (don't overlap). We should set up one | 6859 | * The masks don't intersect (don't overlap). We should set up one |
6864 | * sched domain for each mask. CPUs not in any of the cpumasks will | 6860 | * sched domain for each mask. CPUs not in any of the cpumasks will |
6865 | * not be load balanced. If the same cpumask appears both in the | 6861 | * not be load balanced. If the same cpumask appears both in the |
6866 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | 6862 | * current 'doms_cur' domains and in the new 'doms_new', we can leave |
6867 | * it as it is. | 6863 | * it as it is. |
6868 | * | 6864 | * |
6869 | * The passed-in 'doms_new' should be kmalloc'd. This routine takes | 6865 | * The passed-in 'doms_new' should be kmalloc'd. This routine takes |
6870 | * ownership of it and will kfree it when done with it. If the caller | 6866 | * ownership of it and will kfree it when done with it. If the caller |
6871 | * failed the kmalloc call, then it can pass in doms_new == NULL, | 6867 | * failed the kmalloc call, then it can pass in doms_new == NULL, |
6872 | * and partition_sched_domains() will fall back to the single partition | 6868 | * and partition_sched_domains() will fall back to the single partition |
6873 | * 'fallback_doms'. | 6869 | * 'fallback_doms'. |
6874 | * | 6870 | * |
6875 | * Call with hotplug lock held | 6871 | * Call with hotplug lock held |
6876 | */ | 6872 | */ |
6877 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | 6873 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) |
6878 | { | 6874 | { |
6879 | int i, j; | 6875 | int i, j; |
6880 | 6876 | ||
6881 | lock_doms_cur(); | 6877 | lock_doms_cur(); |
6882 | 6878 | ||
6883 | /* always unregister in case we don't destroy any domains */ | 6879 | /* always unregister in case we don't destroy any domains */ |
6884 | unregister_sched_domain_sysctl(); | 6880 | unregister_sched_domain_sysctl(); |
6885 | 6881 | ||
6886 | if (doms_new == NULL) { | 6882 | if (doms_new == NULL) { |
6887 | ndoms_new = 1; | 6883 | ndoms_new = 1; |
6888 | doms_new = &fallback_doms; | 6884 | doms_new = &fallback_doms; |
6889 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | 6885 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); |
6890 | } | 6886 | } |
6891 | 6887 | ||
6892 | /* Destroy deleted domains */ | 6888 | /* Destroy deleted domains */ |
6893 | for (i = 0; i < ndoms_cur; i++) { | 6889 | for (i = 0; i < ndoms_cur; i++) { |
6894 | for (j = 0; j < ndoms_new; j++) { | 6890 | for (j = 0; j < ndoms_new; j++) { |
6895 | if (cpus_equal(doms_cur[i], doms_new[j])) | 6891 | if (cpus_equal(doms_cur[i], doms_new[j])) |
6896 | goto match1; | 6892 | goto match1; |
6897 | } | 6893 | } |
6898 | /* no match - a current sched domain not in new doms_new[] */ | 6894 | /* no match - a current sched domain not in new doms_new[] */ |
6899 | detach_destroy_domains(doms_cur + i); | 6895 | detach_destroy_domains(doms_cur + i); |
6900 | match1: | 6896 | match1: |
6901 | ; | 6897 | ; |
6902 | } | 6898 | } |
6903 | 6899 | ||
6904 | /* Build new domains */ | 6900 | /* Build new domains */ |
6905 | for (i = 0; i < ndoms_new; i++) { | 6901 | for (i = 0; i < ndoms_new; i++) { |
6906 | for (j = 0; j < ndoms_cur; j++) { | 6902 | for (j = 0; j < ndoms_cur; j++) { |
6907 | if (cpus_equal(doms_new[i], doms_cur[j])) | 6903 | if (cpus_equal(doms_new[i], doms_cur[j])) |
6908 | goto match2; | 6904 | goto match2; |
6909 | } | 6905 | } |
6910 | /* no match - add a new doms_new */ | 6906 | /* no match - add a new doms_new */ |
6911 | build_sched_domains(doms_new + i); | 6907 | build_sched_domains(doms_new + i); |
6912 | match2: | 6908 | match2: |
6913 | ; | 6909 | ; |
6914 | } | 6910 | } |
6915 | 6911 | ||
6916 | /* Remember the new sched domains */ | 6912 | /* Remember the new sched domains */ |
6917 | if (doms_cur != &fallback_doms) | 6913 | if (doms_cur != &fallback_doms) |
6918 | kfree(doms_cur); | 6914 | kfree(doms_cur); |
6919 | doms_cur = doms_new; | 6915 | doms_cur = doms_new; |
6920 | ndoms_cur = ndoms_new; | 6916 | ndoms_cur = ndoms_new; |
6921 | 6917 | ||
6922 | register_sched_domain_sysctl(); | 6918 | register_sched_domain_sysctl(); |
6923 | 6919 | ||
6924 | unlock_doms_cur(); | 6920 | unlock_doms_cur(); |
6925 | } | 6921 | } |
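
The two labelled loops above implement a straightforward set difference over arrays of cpumasks: pass one tears down every current partition that is absent from the new array, pass two builds every new partition not already present. A self-contained toy with plain bitmasks in place of cpumask_t (the mask values are invented):

	#include <stdio.h>

	int main(void)
	{
		/* Toy partitions; each mask stands in for one cpumask_t. */
		unsigned long cur[] = { 0x0f, 0xf0 };		/* doms_cur */
		unsigned long new[] = { 0x0f, 0xc0, 0x30 };	/* doms_new */
		int ncur = 2, nnew = 3, i, j;

		/* Pass 1: destroy current domains missing from the new set
		 * (detach_destroy_domains() in the real code). */
		for (i = 0; i < ncur; i++) {
			for (j = 0; j < nnew; j++)
				if (cur[i] == new[j])
					goto match1;
			printf("destroy 0x%lx\n", cur[i]);
	match1:		;
		}

		/* Pass 2: build new domains not already present
		 * (build_sched_domains() in the real code). */
		for (i = 0; i < nnew; i++) {
			for (j = 0; j < ncur; j++)
				if (new[i] == cur[j])
					goto match2;
			printf("build 0x%lx\n", new[i]);
	match2:		;
		}
		return 0;
	}
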
6926 | 6922 | ||
6927 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6923 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
6928 | static int arch_reinit_sched_domains(void) | 6924 | static int arch_reinit_sched_domains(void) |
6929 | { | 6925 | { |
6930 | int err; | 6926 | int err; |
6931 | 6927 | ||
6932 | get_online_cpus(); | 6928 | get_online_cpus(); |
6933 | detach_destroy_domains(&cpu_online_map); | 6929 | detach_destroy_domains(&cpu_online_map); |
6934 | err = arch_init_sched_domains(&cpu_online_map); | 6930 | err = arch_init_sched_domains(&cpu_online_map); |
6935 | put_online_cpus(); | 6931 | put_online_cpus(); |
6936 | 6932 | ||
6937 | return err; | 6933 | return err; |
6938 | } | 6934 | } |
6939 | 6935 | ||
6940 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | 6936 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) |
6941 | { | 6937 | { |
6942 | int ret; | 6938 | int ret; |
6943 | 6939 | ||
6944 | if (buf[0] != '0' && buf[0] != '1') | 6940 | if (buf[0] != '0' && buf[0] != '1') |
6945 | return -EINVAL; | 6941 | return -EINVAL; |
6946 | 6942 | ||
6947 | if (smt) | 6943 | if (smt) |
6948 | sched_smt_power_savings = (buf[0] == '1'); | 6944 | sched_smt_power_savings = (buf[0] == '1'); |
6949 | else | 6945 | else |
6950 | sched_mc_power_savings = (buf[0] == '1'); | 6946 | sched_mc_power_savings = (buf[0] == '1'); |
6951 | 6947 | ||
6952 | ret = arch_reinit_sched_domains(); | 6948 | ret = arch_reinit_sched_domains(); |
6953 | 6949 | ||
6954 | return ret ? ret : count; | 6950 | return ret ? ret : count; |
6955 | } | 6951 | } |
6956 | 6952 | ||
6957 | #ifdef CONFIG_SCHED_MC | 6953 | #ifdef CONFIG_SCHED_MC |
6958 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | 6954 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) |
6959 | { | 6955 | { |
6960 | return sprintf(page, "%u\n", sched_mc_power_savings); | 6956 | return sprintf(page, "%u\n", sched_mc_power_savings); |
6961 | } | 6957 | } |
6962 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, | 6958 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, |
6963 | const char *buf, size_t count) | 6959 | const char *buf, size_t count) |
6964 | { | 6960 | { |
6965 | return sched_power_savings_store(buf, count, 0); | 6961 | return sched_power_savings_store(buf, count, 0); |
6966 | } | 6962 | } |
6967 | static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | 6963 | static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, |
6968 | sched_mc_power_savings_store); | 6964 | sched_mc_power_savings_store); |
6969 | #endif | 6965 | #endif |
6970 | 6966 | ||
6971 | #ifdef CONFIG_SCHED_SMT | 6967 | #ifdef CONFIG_SCHED_SMT |
6972 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | 6968 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) |
6973 | { | 6969 | { |
6974 | return sprintf(page, "%u\n", sched_smt_power_savings); | 6970 | return sprintf(page, "%u\n", sched_smt_power_savings); |
6975 | } | 6971 | } |
6976 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, | 6972 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, |
6977 | const char *buf, size_t count) | 6973 | const char *buf, size_t count) |
6978 | { | 6974 | { |
6979 | return sched_power_savings_store(buf, count, 1); | 6975 | return sched_power_savings_store(buf, count, 1); |
6980 | } | 6976 | } |
6981 | static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | 6977 | static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, |
6982 | sched_smt_power_savings_store); | 6978 | sched_smt_power_savings_store); |
6983 | #endif | 6979 | #endif |
6984 | 6980 | ||
6985 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | 6981 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) |
6986 | { | 6982 | { |
6987 | int err = 0; | 6983 | int err = 0; |
6988 | 6984 | ||
6989 | #ifdef CONFIG_SCHED_SMT | 6985 | #ifdef CONFIG_SCHED_SMT |
6990 | if (smt_capable()) | 6986 | if (smt_capable()) |
6991 | err = sysfs_create_file(&cls->kset.kobj, | 6987 | err = sysfs_create_file(&cls->kset.kobj, |
6992 | &attr_sched_smt_power_savings.attr); | 6988 | &attr_sched_smt_power_savings.attr); |
6993 | #endif | 6989 | #endif |
6994 | #ifdef CONFIG_SCHED_MC | 6990 | #ifdef CONFIG_SCHED_MC |
6995 | if (!err && mc_capable()) | 6991 | if (!err && mc_capable()) |
6996 | err = sysfs_create_file(&cls->kset.kobj, | 6992 | err = sysfs_create_file(&cls->kset.kobj, |
6997 | &attr_sched_mc_power_savings.attr); | 6993 | &attr_sched_mc_power_savings.attr); |
6998 | #endif | 6994 | #endif |
6999 | return err; | 6995 | return err; |
7000 | } | 6996 | } |
7001 | #endif | 6997 | #endif |
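The attributes registered above surface as ordinary sysfs files on the cpu sysdev class; on kernels of this vintage that is typically /sys/devices/system/cpu/sched_mc_power_savings and /sys/devices/system/cpu/sched_smt_power_savings (exact path depends on the class passed in). Writing the single character '0' or '1' flips the corresponding policy and triggers a domain rebuild via arch_reinit_sched_domains(); anything else is rejected with -EINVAL by sched_power_savings_store() above.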
7002 | 6998 | ||
7003 | /* | 6999 | /* |
7004 | * Force a reinitialization of the sched domains hierarchy. The domains | 7000 | * Force a reinitialization of the sched domains hierarchy. The domains |
7005 | * and groups cannot be updated in place without racing with the balancing | 7001 | * and groups cannot be updated in place without racing with the balancing |
7006 | * code, so we temporarily attach all running cpus to the NULL domain | 7002 | * code, so we temporarily attach all running cpus to the NULL domain |
7007 | * which will prevent rebalancing while the sched domains are recalculated. | 7003 | * which will prevent rebalancing while the sched domains are recalculated. |
7008 | */ | 7004 | */ |
7009 | static int update_sched_domains(struct notifier_block *nfb, | 7005 | static int update_sched_domains(struct notifier_block *nfb, |
7010 | unsigned long action, void *hcpu) | 7006 | unsigned long action, void *hcpu) |
7011 | { | 7007 | { |
7012 | switch (action) { | 7008 | switch (action) { |
7013 | case CPU_UP_PREPARE: | 7009 | case CPU_UP_PREPARE: |
7014 | case CPU_UP_PREPARE_FROZEN: | 7010 | case CPU_UP_PREPARE_FROZEN: |
7015 | case CPU_DOWN_PREPARE: | 7011 | case CPU_DOWN_PREPARE: |
7016 | case CPU_DOWN_PREPARE_FROZEN: | 7012 | case CPU_DOWN_PREPARE_FROZEN: |
7017 | detach_destroy_domains(&cpu_online_map); | 7013 | detach_destroy_domains(&cpu_online_map); |
7018 | return NOTIFY_OK; | 7014 | return NOTIFY_OK; |
7019 | 7015 | ||
7020 | case CPU_UP_CANCELED: | 7016 | case CPU_UP_CANCELED: |
7021 | case CPU_UP_CANCELED_FROZEN: | 7017 | case CPU_UP_CANCELED_FROZEN: |
7022 | case CPU_DOWN_FAILED: | 7018 | case CPU_DOWN_FAILED: |
7023 | case CPU_DOWN_FAILED_FROZEN: | 7019 | case CPU_DOWN_FAILED_FROZEN: |
7024 | case CPU_ONLINE: | 7020 | case CPU_ONLINE: |
7025 | case CPU_ONLINE_FROZEN: | 7021 | case CPU_ONLINE_FROZEN: |
7026 | case CPU_DEAD: | 7022 | case CPU_DEAD: |
7027 | case CPU_DEAD_FROZEN: | 7023 | case CPU_DEAD_FROZEN: |
7028 | /* | 7024 | /* |
7029 | * Fall through and re-initialise the domains. | 7025 | * Fall through and re-initialise the domains. |
7030 | */ | 7026 | */ |
7031 | break; | 7027 | break; |
7032 | default: | 7028 | default: |
7033 | return NOTIFY_DONE; | 7029 | return NOTIFY_DONE; |
7034 | } | 7030 | } |
7035 | 7031 | ||
7036 | /* The hotplug lock is already held by cpu_up/cpu_down */ | 7032 | /* The hotplug lock is already held by cpu_up/cpu_down */ |
7037 | arch_init_sched_domains(&cpu_online_map); | 7033 | arch_init_sched_domains(&cpu_online_map); |
7038 | 7034 | ||
7039 | return NOTIFY_OK; | 7035 | return NOTIFY_OK; |
7040 | } | 7036 | } |
7041 | 7037 | ||
7042 | void __init sched_init_smp(void) | 7038 | void __init sched_init_smp(void) |
7043 | { | 7039 | { |
7044 | cpumask_t non_isolated_cpus; | 7040 | cpumask_t non_isolated_cpus; |
7045 | 7041 | ||
7046 | get_online_cpus(); | 7042 | get_online_cpus(); |
7047 | arch_init_sched_domains(&cpu_online_map); | 7043 | arch_init_sched_domains(&cpu_online_map); |
7048 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7044 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
7049 | if (cpus_empty(non_isolated_cpus)) | 7045 | if (cpus_empty(non_isolated_cpus)) |
7050 | cpu_set(smp_processor_id(), non_isolated_cpus); | 7046 | cpu_set(smp_processor_id(), non_isolated_cpus); |
7051 | put_online_cpus(); | 7047 | put_online_cpus(); |
7052 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7048 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
7053 | hotcpu_notifier(update_sched_domains, 0); | 7049 | hotcpu_notifier(update_sched_domains, 0); |
7054 | 7050 | ||
7055 | /* Move init over to a non-isolated CPU */ | 7051 | /* Move init over to a non-isolated CPU */ |
7056 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7052 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
7057 | BUG(); | 7053 | BUG(); |
7058 | sched_init_granularity(); | 7054 | sched_init_granularity(); |
7059 | 7055 | ||
7060 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7056 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7061 | if (nr_cpu_ids == 1) | 7057 | if (nr_cpu_ids == 1) |
7062 | return; | 7058 | return; |
7063 | 7059 | ||
7064 | lb_monitor_task = kthread_create(load_balance_monitor, NULL, | 7060 | lb_monitor_task = kthread_create(load_balance_monitor, NULL, |
7065 | "group_balance"); | 7061 | "group_balance"); |
7066 | if (!IS_ERR(lb_monitor_task)) { | 7062 | if (!IS_ERR(lb_monitor_task)) { |
7067 | lb_monitor_task->flags |= PF_NOFREEZE; | 7063 | lb_monitor_task->flags |= PF_NOFREEZE; |
7068 | wake_up_process(lb_monitor_task); | 7064 | wake_up_process(lb_monitor_task); |
7069 | } else { | 7065 | } else { |
7070 | printk(KERN_ERR "Could not create load balance monitor thread" | 7066 | printk(KERN_ERR "Could not create load balance monitor thread" |
7071 | "(error = %ld) \n", PTR_ERR(lb_monitor_task)); | 7067 | "(error = %ld) \n", PTR_ERR(lb_monitor_task)); |
7072 | } | 7068 | } |
7073 | #endif | 7069 | #endif |
7074 | } | 7070 | } |
7075 | #else | 7071 | #else |
7076 | void __init sched_init_smp(void) | 7072 | void __init sched_init_smp(void) |
7077 | { | 7073 | { |
7078 | sched_init_granularity(); | 7074 | sched_init_granularity(); |
7079 | } | 7075 | } |
7080 | #endif /* CONFIG_SMP */ | 7076 | #endif /* CONFIG_SMP */ |
7081 | 7077 | ||
7082 | int in_sched_functions(unsigned long addr) | 7078 | int in_sched_functions(unsigned long addr) |
7083 | { | 7079 | { |
7084 | return in_lock_functions(addr) || | 7080 | return in_lock_functions(addr) || |
7085 | (addr >= (unsigned long)__sched_text_start | 7081 | (addr >= (unsigned long)__sched_text_start |
7086 | && addr < (unsigned long)__sched_text_end); | 7082 | && addr < (unsigned long)__sched_text_end); |
7087 | } | 7083 | } |
7088 | 7084 | ||
7089 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 7085 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
7090 | { | 7086 | { |
7091 | cfs_rq->tasks_timeline = RB_ROOT; | 7087 | cfs_rq->tasks_timeline = RB_ROOT; |
7092 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7088 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7093 | cfs_rq->rq = rq; | 7089 | cfs_rq->rq = rq; |
7094 | #endif | 7090 | #endif |
7095 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7091 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
7096 | } | 7092 | } |
7097 | 7093 | ||
7098 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 7094 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) |
7099 | { | 7095 | { |
7100 | struct rt_prio_array *array; | 7096 | struct rt_prio_array *array; |
7101 | int i; | 7097 | int i; |
7102 | 7098 | ||
7103 | array = &rt_rq->active; | 7099 | array = &rt_rq->active; |
7104 | for (i = 0; i < MAX_RT_PRIO; i++) { | 7100 | for (i = 0; i < MAX_RT_PRIO; i++) { |
7105 | INIT_LIST_HEAD(array->queue + i); | 7101 | INIT_LIST_HEAD(array->queue + i); |
7106 | __clear_bit(i, array->bitmap); | 7102 | __clear_bit(i, array->bitmap); |
7107 | } | 7103 | } |
7108 | /* delimiter for bitsearch: */ | 7104 | /* delimiter for bitsearch: */ |
7109 | __set_bit(MAX_RT_PRIO, array->bitmap); | 7105 | __set_bit(MAX_RT_PRIO, array->bitmap); |
7110 | 7106 | ||
7111 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | 7107 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED |
7112 | rt_rq->highest_prio = MAX_RT_PRIO; | 7108 | rt_rq->highest_prio = MAX_RT_PRIO; |
7113 | #endif | 7109 | #endif |
7114 | #ifdef CONFIG_SMP | 7110 | #ifdef CONFIG_SMP |
7115 | rt_rq->rt_nr_migratory = 0; | 7111 | rt_rq->rt_nr_migratory = 0; |
7116 | rt_rq->overloaded = 0; | 7112 | rt_rq->overloaded = 0; |
7117 | #endif | 7113 | #endif |
7118 | 7114 | ||
7119 | rt_rq->rt_time = 0; | 7115 | rt_rq->rt_time = 0; |
7120 | rt_rq->rt_throttled = 0; | 7116 | rt_rq->rt_throttled = 0; |
7121 | 7117 | ||
7122 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7118 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7123 | rt_rq->rq = rq; | 7119 | rt_rq->rq = rq; |
7124 | #endif | 7120 | #endif |
7125 | } | 7121 | } |
7126 | 7122 | ||
7127 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7123 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7128 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | 7124 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, |
7129 | struct cfs_rq *cfs_rq, struct sched_entity *se, | 7125 | struct cfs_rq *cfs_rq, struct sched_entity *se, |
7130 | int cpu, int add) | 7126 | int cpu, int add) |
7131 | { | 7127 | { |
7132 | tg->cfs_rq[cpu] = cfs_rq; | 7128 | tg->cfs_rq[cpu] = cfs_rq; |
7133 | init_cfs_rq(cfs_rq, rq); | 7129 | init_cfs_rq(cfs_rq, rq); |
7134 | cfs_rq->tg = tg; | 7130 | cfs_rq->tg = tg; |
7135 | if (add) | 7131 | if (add) |
7136 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7132 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7137 | 7133 | ||
7138 | tg->se[cpu] = se; | 7134 | tg->se[cpu] = se; |
7139 | se->cfs_rq = &rq->cfs; | 7135 | se->cfs_rq = &rq->cfs; |
7140 | se->my_q = cfs_rq; | 7136 | se->my_q = cfs_rq; |
7141 | se->load.weight = tg->shares; | 7137 | se->load.weight = tg->shares; |
7142 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | 7138 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); |
7143 | se->parent = NULL; | 7139 | se->parent = NULL; |
7144 | } | 7140 | } |
7145 | 7141 | ||
7146 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | 7142 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, |
7147 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | 7143 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, |
7148 | int cpu, int add) | 7144 | int cpu, int add) |
7149 | { | 7145 | { |
7150 | tg->rt_rq[cpu] = rt_rq; | 7146 | tg->rt_rq[cpu] = rt_rq; |
7151 | init_rt_rq(rt_rq, rq); | 7147 | init_rt_rq(rt_rq, rq); |
7152 | rt_rq->tg = tg; | 7148 | rt_rq->tg = tg; |
7153 | rt_rq->rt_se = rt_se; | 7149 | rt_rq->rt_se = rt_se; |
7154 | if (add) | 7150 | if (add) |
7155 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 7151 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
7156 | 7152 | ||
7157 | tg->rt_se[cpu] = rt_se; | 7153 | tg->rt_se[cpu] = rt_se; |
7158 | rt_se->rt_rq = &rq->rt; | 7154 | rt_se->rt_rq = &rq->rt; |
7159 | rt_se->my_q = rt_rq; | 7155 | rt_se->my_q = rt_rq; |
7160 | rt_se->parent = NULL; | 7156 | rt_se->parent = NULL; |
7161 | INIT_LIST_HEAD(&rt_se->run_list); | 7157 | INIT_LIST_HEAD(&rt_se->run_list); |
7162 | } | 7158 | } |
7163 | #endif | 7159 | #endif |
7164 | 7160 | ||
7165 | void __init sched_init(void) | 7161 | void __init sched_init(void) |
7166 | { | 7162 | { |
7167 | int highest_cpu = 0; | 7163 | int highest_cpu = 0; |
7168 | int i, j; | 7164 | int i, j; |
7169 | 7165 | ||
7170 | #ifdef CONFIG_SMP | 7166 | #ifdef CONFIG_SMP |
7171 | init_defrootdomain(); | 7167 | init_defrootdomain(); |
7172 | #endif | 7168 | #endif |
7173 | 7169 | ||
7174 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7170 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7175 | list_add(&init_task_group.list, &task_groups); | 7171 | list_add(&init_task_group.list, &task_groups); |
7176 | #endif | 7172 | #endif |
7177 | 7173 | ||
7178 | for_each_possible_cpu(i) { | 7174 | for_each_possible_cpu(i) { |
7179 | struct rq *rq; | 7175 | struct rq *rq; |
7180 | 7176 | ||
7181 | rq = cpu_rq(i); | 7177 | rq = cpu_rq(i); |
7182 | spin_lock_init(&rq->lock); | 7178 | spin_lock_init(&rq->lock); |
7183 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | 7179 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
7184 | rq->nr_running = 0; | 7180 | rq->nr_running = 0; |
7185 | rq->clock = 1; | 7181 | rq->clock = 1; |
7186 | init_cfs_rq(&rq->cfs, rq); | 7182 | init_cfs_rq(&rq->cfs, rq); |
7187 | init_rt_rq(&rq->rt, rq); | 7183 | init_rt_rq(&rq->rt, rq); |
7188 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7184 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7189 | init_task_group.shares = init_task_group_load; | 7185 | init_task_group.shares = init_task_group_load; |
7190 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7186 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7191 | init_tg_cfs_entry(rq, &init_task_group, | 7187 | init_tg_cfs_entry(rq, &init_task_group, |
7192 | &per_cpu(init_cfs_rq, i), | 7188 | &per_cpu(init_cfs_rq, i), |
7193 | &per_cpu(init_sched_entity, i), i, 1); | 7189 | &per_cpu(init_sched_entity, i), i, 1); |
7194 | 7190 | ||
7195 | init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ | 7191 | init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ |
7196 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7192 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
7197 | init_tg_rt_entry(rq, &init_task_group, | 7193 | init_tg_rt_entry(rq, &init_task_group, |
7198 | &per_cpu(init_rt_rq, i), | 7194 | &per_cpu(init_rt_rq, i), |
7199 | &per_cpu(init_sched_rt_entity, i), i, 1); | 7195 | &per_cpu(init_sched_rt_entity, i), i, 1); |
7200 | #endif | 7196 | #endif |
7201 | rq->rt_period_expire = 0; | 7197 | rq->rt_period_expire = 0; |
7202 | rq->rt_throttled = 0; | 7198 | rq->rt_throttled = 0; |
7203 | 7199 | ||
7204 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7200 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
7205 | rq->cpu_load[j] = 0; | 7201 | rq->cpu_load[j] = 0; |
7206 | #ifdef CONFIG_SMP | 7202 | #ifdef CONFIG_SMP |
7207 | rq->sd = NULL; | 7203 | rq->sd = NULL; |
7208 | rq->rd = NULL; | 7204 | rq->rd = NULL; |
7209 | rq->active_balance = 0; | 7205 | rq->active_balance = 0; |
7210 | rq->next_balance = jiffies; | 7206 | rq->next_balance = jiffies; |
7211 | rq->push_cpu = 0; | 7207 | rq->push_cpu = 0; |
7212 | rq->cpu = i; | 7208 | rq->cpu = i; |
7213 | rq->migration_thread = NULL; | 7209 | rq->migration_thread = NULL; |
7214 | INIT_LIST_HEAD(&rq->migration_queue); | 7210 | INIT_LIST_HEAD(&rq->migration_queue); |
7215 | rq_attach_root(rq, &def_root_domain); | 7211 | rq_attach_root(rq, &def_root_domain); |
7216 | #endif | 7212 | #endif |
7217 | init_rq_hrtick(rq); | 7213 | init_rq_hrtick(rq); |
7218 | atomic_set(&rq->nr_iowait, 0); | 7214 | atomic_set(&rq->nr_iowait, 0); |
7219 | highest_cpu = i; | 7215 | highest_cpu = i; |
7220 | } | 7216 | } |
7221 | 7217 | ||
7222 | set_load_weight(&init_task); | 7218 | set_load_weight(&init_task); |
7223 | 7219 | ||
7224 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 7220 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
7225 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 7221 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
7226 | #endif | 7222 | #endif |
7227 | 7223 | ||
7228 | #ifdef CONFIG_SMP | 7224 | #ifdef CONFIG_SMP |
7229 | nr_cpu_ids = highest_cpu + 1; | 7225 | nr_cpu_ids = highest_cpu + 1; |
7230 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | 7226 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); |
7231 | #endif | 7227 | #endif |
7232 | 7228 | ||
7233 | #ifdef CONFIG_RT_MUTEXES | 7229 | #ifdef CONFIG_RT_MUTEXES |
7234 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); | 7230 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); |
7235 | #endif | 7231 | #endif |
7236 | 7232 | ||
7237 | /* | 7233 | /* |
7238 | * The boot idle thread does lazy MMU switching as well: | 7234 | * The boot idle thread does lazy MMU switching as well: |
7239 | */ | 7235 | */ |
7240 | atomic_inc(&init_mm.mm_count); | 7236 | atomic_inc(&init_mm.mm_count); |
7241 | enter_lazy_tlb(&init_mm, current); | 7237 | enter_lazy_tlb(&init_mm, current); |
7242 | 7238 | ||
7243 | /* | 7239 | /* |
7244 | * Make us the idle thread. Technically, schedule() should not be | 7240 | * Make us the idle thread. Technically, schedule() should not be |
7245 | * called from this thread, however somewhere below it might be, | 7241 | * called from this thread, however somewhere below it might be, |
7246 | * but because we are the idle thread, we just pick up running again | 7242 | * but because we are the idle thread, we just pick up running again |
7247 | * when this runqueue becomes "idle". | 7243 | * when this runqueue becomes "idle". |
7248 | */ | 7244 | */ |
7249 | init_idle(current, smp_processor_id()); | 7245 | init_idle(current, smp_processor_id()); |
7250 | /* | 7246 | /* |
7251 | * During early bootup we pretend to be a normal task: | 7247 | * During early bootup we pretend to be a normal task: |
7252 | */ | 7248 | */ |
7253 | current->sched_class = &fair_sched_class; | 7249 | current->sched_class = &fair_sched_class; |
7254 | } | 7250 | } |
7255 | 7251 | ||
7256 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 7252 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
7257 | void __might_sleep(char *file, int line) | 7253 | void __might_sleep(char *file, int line) |
7258 | { | 7254 | { |
7259 | #ifdef in_atomic | 7255 | #ifdef in_atomic |
7260 | static unsigned long prev_jiffy; /* ratelimiting */ | 7256 | static unsigned long prev_jiffy; /* ratelimiting */ |
7261 | 7257 | ||
7262 | if ((in_atomic() || irqs_disabled()) && | 7258 | if ((in_atomic() || irqs_disabled()) && |
7263 | system_state == SYSTEM_RUNNING && !oops_in_progress) { | 7259 | system_state == SYSTEM_RUNNING && !oops_in_progress) { |
7264 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 7260 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
7265 | return; | 7261 | return; |
7266 | prev_jiffy = jiffies; | 7262 | prev_jiffy = jiffies; |
7267 | printk(KERN_ERR "BUG: sleeping function called from invalid" | 7263 | printk(KERN_ERR "BUG: sleeping function called from invalid" |
7268 | " context at %s:%d\n", file, line); | 7264 | " context at %s:%d\n", file, line); |
7269 | printk("in_atomic():%d, irqs_disabled():%d\n", | 7265 | printk("in_atomic():%d, irqs_disabled():%d\n", |
7270 | in_atomic(), irqs_disabled()); | 7266 | in_atomic(), irqs_disabled()); |
7271 | debug_show_held_locks(current); | 7267 | debug_show_held_locks(current); |
7272 | if (irqs_disabled()) | 7268 | if (irqs_disabled()) |
7273 | print_irqtrace_events(current); | 7269 | print_irqtrace_events(current); |
7274 | dump_stack(); | 7270 | dump_stack(); |
7275 | } | 7271 | } |
7276 | #endif | 7272 | #endif |
7277 | } | 7273 | } |
7278 | EXPORT_SYMBOL(__might_sleep); | 7274 | EXPORT_SYMBOL(__might_sleep); |
7279 | #endif | 7275 | #endif |
7280 | 7276 | ||
7281 | #ifdef CONFIG_MAGIC_SYSRQ | 7277 | #ifdef CONFIG_MAGIC_SYSRQ |
7282 | static void normalize_task(struct rq *rq, struct task_struct *p) | 7278 | static void normalize_task(struct rq *rq, struct task_struct *p) |
7283 | { | 7279 | { |
7284 | int on_rq; | 7280 | int on_rq; |
7285 | update_rq_clock(rq); | 7281 | update_rq_clock(rq); |
7286 | on_rq = p->se.on_rq; | 7282 | on_rq = p->se.on_rq; |
7287 | if (on_rq) | 7283 | if (on_rq) |
7288 | deactivate_task(rq, p, 0); | 7284 | deactivate_task(rq, p, 0); |
7289 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 7285 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
7290 | if (on_rq) { | 7286 | if (on_rq) { |
7291 | activate_task(rq, p, 0); | 7287 | activate_task(rq, p, 0); |
7292 | resched_task(rq->curr); | 7288 | resched_task(rq->curr); |
7293 | } | 7289 | } |
7294 | } | 7290 | } |
7295 | 7291 | ||
7296 | void normalize_rt_tasks(void) | 7292 | void normalize_rt_tasks(void) |
7297 | { | 7293 | { |
7298 | struct task_struct *g, *p; | 7294 | struct task_struct *g, *p; |
7299 | unsigned long flags; | 7295 | unsigned long flags; |
7300 | struct rq *rq; | 7296 | struct rq *rq; |
7301 | 7297 | ||
7302 | read_lock_irq(&tasklist_lock); | 7298 | read_lock_irq(&tasklist_lock); |
7303 | do_each_thread(g, p) { | 7299 | do_each_thread(g, p) { |
7304 | /* | 7300 | /* |
7305 | * Only normalize user tasks: | 7301 | * Only normalize user tasks: |
7306 | */ | 7302 | */ |
7307 | if (!p->mm) | 7303 | if (!p->mm) |
7308 | continue; | 7304 | continue; |
7309 | 7305 | ||
7310 | p->se.exec_start = 0; | 7306 | p->se.exec_start = 0; |
7311 | #ifdef CONFIG_SCHEDSTATS | 7307 | #ifdef CONFIG_SCHEDSTATS |
7312 | p->se.wait_start = 0; | 7308 | p->se.wait_start = 0; |
7313 | p->se.sleep_start = 0; | 7309 | p->se.sleep_start = 0; |
7314 | p->se.block_start = 0; | 7310 | p->se.block_start = 0; |
7315 | #endif | 7311 | #endif |
7316 | task_rq(p)->clock = 0; | 7312 | task_rq(p)->clock = 0; |
7317 | 7313 | ||
7318 | if (!rt_task(p)) { | 7314 | if (!rt_task(p)) { |
7319 | /* | 7315 | /* |
7320 | * Renice negative nice level userspace | 7316 | * Renice negative nice level userspace |
7321 | * tasks back to 0: | 7317 | * tasks back to 0: |
7322 | */ | 7318 | */ |
7323 | if (TASK_NICE(p) < 0 && p->mm) | 7319 | if (TASK_NICE(p) < 0 && p->mm) |
7324 | set_user_nice(p, 0); | 7320 | set_user_nice(p, 0); |
7325 | continue; | 7321 | continue; |
7326 | } | 7322 | } |
7327 | 7323 | ||
7328 | spin_lock_irqsave(&p->pi_lock, flags); | 7324 | spin_lock_irqsave(&p->pi_lock, flags); |
7329 | rq = __task_rq_lock(p); | 7325 | rq = __task_rq_lock(p); |
7330 | 7326 | ||
7331 | normalize_task(rq, p); | 7327 | normalize_task(rq, p); |
7332 | 7328 | ||
7333 | __task_rq_unlock(rq); | 7329 | __task_rq_unlock(rq); |
7334 | spin_unlock_irqrestore(&p->pi_lock, flags); | 7330 | spin_unlock_irqrestore(&p->pi_lock, flags); |
7335 | } while_each_thread(g, p); | 7331 | } while_each_thread(g, p); |
7336 | 7332 | ||
7337 | read_unlock_irq(&tasklist_lock); | 7333 | read_unlock_irq(&tasklist_lock); |
7338 | } | 7334 | } |
7339 | 7335 | ||
7340 | #endif /* CONFIG_MAGIC_SYSRQ */ | 7336 | #endif /* CONFIG_MAGIC_SYSRQ */ |
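normalize_rt_tasks() is the handler behind the MAGIC_SYSRQ 'n' key (wired up in drivers/char/sysrq.c); assuming the usual sysrq setup, echo n > /proc/sysrq-trigger, or Alt-SysRq-n on a console, runs the walk above and pushes every user RT task back to SCHED_NORMAL.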
7341 | 7337 | ||
7342 | #ifdef CONFIG_IA64 | 7338 | #ifdef CONFIG_IA64 |
7343 | /* | 7339 | /* |
7344 | * These functions are only useful for the IA64 MCA handling. | 7340 | * These functions are only useful for the IA64 MCA handling. |
7345 | * | 7341 | * |
7346 | * They can only be called when the whole system has been | 7342 | * They can only be called when the whole system has been |
7347 | * stopped - every CPU needs to be quiescent, and no scheduling | 7343 | * stopped - every CPU needs to be quiescent, and no scheduling |
7348 | * activity can take place. Using them for anything else would | 7344 | * activity can take place. Using them for anything else would |
7349 | * be a serious bug, and as a result, they aren't even visible | 7345 | * be a serious bug, and as a result, they aren't even visible |
7350 | * under any other configuration. | 7346 | * under any other configuration. |
7351 | */ | 7347 | */ |
7352 | 7348 | ||
7353 | /** | 7349 | /** |
7354 | * curr_task - return the current task for a given cpu. | 7350 | * curr_task - return the current task for a given cpu. |
7355 | * @cpu: the processor in question. | 7351 | * @cpu: the processor in question. |
7356 | * | 7352 | * |
7357 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 7353 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
7358 | */ | 7354 | */ |
7359 | struct task_struct *curr_task(int cpu) | 7355 | struct task_struct *curr_task(int cpu) |
7360 | { | 7356 | { |
7361 | return cpu_curr(cpu); | 7357 | return cpu_curr(cpu); |
7362 | } | 7358 | } |
7363 | 7359 | ||
7364 | /** | 7360 | /** |
7365 | * set_curr_task - set the current task for a given cpu. | 7361 | * set_curr_task - set the current task for a given cpu. |
7366 | * @cpu: the processor in question. | 7362 | * @cpu: the processor in question. |
7367 | * @p: the task pointer to set. | 7363 | * @p: the task pointer to set. |
7368 | * | 7364 | * |
7369 | * Description: This function must only be used when non-maskable interrupts | 7365 | * Description: This function must only be used when non-maskable interrupts |
7370 | * are serviced on a separate stack. It allows the architecture to switch the | 7366 | * are serviced on a separate stack. It allows the architecture to switch the |
7371 | * notion of the current task on a cpu in a non-blocking manner. This function | 7367 | * notion of the current task on a cpu in a non-blocking manner. This function |
7372 | * must be called with all CPUs synchronized and interrupts disabled; the | 7368 | * must be called with all CPUs synchronized and interrupts disabled; the |
7373 | * caller must save the original value of the current task (see | 7369 | * caller must save the original value of the current task (see |
7374 | * curr_task() above) and restore that value before reenabling interrupts and | 7370 | * curr_task() above) and restore that value before reenabling interrupts and |
7375 | * re-starting the system. | 7371 | * re-starting the system. |
7376 | * | 7372 | * |
7377 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 7373 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
7378 | */ | 7374 | */ |
7379 | void set_curr_task(int cpu, struct task_struct *p) | 7375 | void set_curr_task(int cpu, struct task_struct *p) |
7380 | { | 7376 | { |
7381 | cpu_curr(cpu) = p; | 7377 | cpu_curr(cpu) = p; |
7382 | } | 7378 | } |
7383 | 7379 | ||
7384 | #endif | 7380 | #endif |
7385 | 7381 | ||
7386 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7382 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7387 | 7383 | ||
7388 | #ifdef CONFIG_SMP | 7384 | #ifdef CONFIG_SMP |
7389 | /* | 7385 | /* |
7390 | * distribute shares of all task groups among their schedulable entities, | 7386 | * distribute shares of all task groups among their schedulable entities, |
7391 | * to reflect load distribution across cpus. | 7387 | * to reflect load distribution across cpus. |
7392 | */ | 7388 | */ |
7393 | static int rebalance_shares(struct sched_domain *sd, int this_cpu) | 7389 | static int rebalance_shares(struct sched_domain *sd, int this_cpu) |
7394 | { | 7390 | { |
7395 | struct cfs_rq *cfs_rq; | 7391 | struct cfs_rq *cfs_rq; |
7396 | struct rq *rq = cpu_rq(this_cpu); | 7392 | struct rq *rq = cpu_rq(this_cpu); |
7397 | cpumask_t sdspan = sd->span; | 7393 | cpumask_t sdspan = sd->span; |
7398 | int balanced = 1; | 7394 | int balanced = 1; |
7399 | 7395 | ||
7400 | /* Walk thr' all the task groups that we have */ | 7396 | /* Walk thr' all the task groups that we have */ |
7401 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 7397 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
7402 | int i; | 7398 | int i; |
7403 | unsigned long total_load = 0, total_shares; | 7399 | unsigned long total_load = 0, total_shares; |
7404 | struct task_group *tg = cfs_rq->tg; | 7400 | struct task_group *tg = cfs_rq->tg; |
7405 | 7401 | ||
7406 | /* Gather total task load of this group across cpus */ | 7402 | /* Gather total task load of this group across cpus */ |
7407 | for_each_cpu_mask(i, sdspan) | 7403 | for_each_cpu_mask(i, sdspan) |
7408 | total_load += tg->cfs_rq[i]->load.weight; | 7404 | total_load += tg->cfs_rq[i]->load.weight; |
7409 | 7405 | ||
7410 | /* Nothing to do if this group has no load */ | 7406 | /* Nothing to do if this group has no load */ |
7411 | if (!total_load) | 7407 | if (!total_load) |
7412 | continue; | 7408 | continue; |
7413 | 7409 | ||
7414 | /* | 7410 | /* |
7415 | * tg->shares represents the number of cpu shares the task group | 7411 | * tg->shares represents the number of cpu shares the task group |
7416 | * is eligible to hold on a single cpu. On N cpus, it is | 7412 | * is eligible to hold on a single cpu. On N cpus, it is |
7417 | * eligible to hold (N * tg->shares) number of cpu shares. | 7413 | * eligible to hold (N * tg->shares) number of cpu shares. |
7418 | */ | 7414 | */ |
7419 | total_shares = tg->shares * cpus_weight(sdspan); | 7415 | total_shares = tg->shares * cpus_weight(sdspan); |
7420 | 7416 | ||
7421 | /* | 7417 | /* |
7422 | * redistribute total_shares across cpus as per the task load | 7418 | * redistribute total_shares across cpus as per the task load |
7423 | * distribution. | 7419 | * distribution. |
7424 | */ | 7420 | */ |
7425 | for_each_cpu_mask(i, sdspan) { | 7421 | for_each_cpu_mask(i, sdspan) { |
7426 | unsigned long local_load, local_shares; | 7422 | unsigned long local_load, local_shares; |
7427 | 7423 | ||
7428 | local_load = tg->cfs_rq[i]->load.weight; | 7424 | local_load = tg->cfs_rq[i]->load.weight; |
7429 | local_shares = (local_load * total_shares) / total_load; | 7425 | local_shares = (local_load * total_shares) / total_load; |
7430 | if (!local_shares) | 7426 | if (!local_shares) |
7431 | local_shares = MIN_GROUP_SHARES; | 7427 | local_shares = MIN_GROUP_SHARES; |
7432 | if (local_shares == tg->se[i]->load.weight) | 7428 | if (local_shares == tg->se[i]->load.weight) |
7433 | continue; | 7429 | continue; |
7434 | 7430 | ||
7435 | spin_lock_irq(&cpu_rq(i)->lock); | 7431 | spin_lock_irq(&cpu_rq(i)->lock); |
7436 | set_se_shares(tg->se[i], local_shares); | 7432 | set_se_shares(tg->se[i], local_shares); |
7437 | spin_unlock_irq(&cpu_rq(i)->lock); | 7433 | spin_unlock_irq(&cpu_rq(i)->lock); |
7438 | balanced = 0; | 7434 | balanced = 0; |
7439 | } | 7435 | } |
7440 | } | 7436 | } |
7441 | 7437 | ||
7442 | return balanced; | 7438 | return balanced; |
7443 | } | 7439 | } |
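A minimal userspace sketch of the proportional split computed above, with made-up numbers rather than real runqueue weights (illustrative only; nothing here is a kernel API): a group with tg->shares of 1024 spanning two cpus has 2048 shares to hand out, and per-cpu group loads of 3000 and 1000 split that 1536/512.

    #include <stdio.h>

    int main(void)
    {
            /* tg->shares * cpus_weight(sdspan) for shares=1024 on a 2-cpu span */
            unsigned long total_shares = 1024 * 2;
            unsigned long load[2] = { 3000, 1000 };  /* hypothetical per-cpu group load */
            unsigned long total_load = load[0] + load[1];
            int i;

            for (i = 0; i < 2; i++)
                    printf("cpu%d gets %lu shares\n", i,
                           (load[i] * total_shares) / total_load);
            return 0;
    }

This prints 1536 and 512. If a cpu's computed share rounded down to zero, the kernel code above clamps it to MIN_GROUP_SHARES instead.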
7444 | 7440 | ||
7445 | /* | 7441 | /* |
7446 | * How frequently should we rebalance_shares() across cpus? | 7442 | * How frequently should we rebalance_shares() across cpus? |
7447 | * | 7443 | * |
7448 | * The more frequently we rebalance shares, the more accurate is the fairness | 7444 | * The more frequently we rebalance shares, the more accurate is the fairness |
7449 | * of cpu bandwidth distribution between task groups. However higher frequency | 7445 | * of cpu bandwidth distribution between task groups. However higher frequency |
7450 | * also implies increased scheduling overhead. | 7446 | * also implies increased scheduling overhead. |
7451 | * | 7447 | * |
7452 | * sysctl_sched_min_bal_int_shares represents the minimum interval between | 7448 | * sysctl_sched_min_bal_int_shares represents the minimum interval between |
7453 | * consecutive calls to rebalance_shares() in the same sched domain. | 7449 | * consecutive calls to rebalance_shares() in the same sched domain. |
7454 | * | 7450 | * |
7455 | * sysctl_sched_max_bal_int_shares represents the maximum interval between | 7451 | * sysctl_sched_max_bal_int_shares represents the maximum interval between |
7456 | * consecutive calls to rebalance_shares() in the same sched domain. | 7452 | * consecutive calls to rebalance_shares() in the same sched domain. |
7457 | * | 7453 | * |
7458 | * These settings allow for the appropriate trade-off between accuracy of | 7454 | * These settings allow for the appropriate trade-off between accuracy of |
7459 | * fairness and the associated overhead. | 7455 | * fairness and the associated overhead. |
7460 | * | 7456 | * |
7461 | */ | 7457 | */ |
7462 | 7458 | ||
7463 | /* default: 8ms, units: milliseconds */ | 7459 | /* default: 8ms, units: milliseconds */ |
7464 | const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; | 7460 | const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; |
7465 | 7461 | ||
7466 | /* default: 128ms, units: milliseconds */ | 7462 | /* default: 128ms, units: milliseconds */ |
7467 | const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; | 7463 | const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; |
7468 | 7464 | ||
7469 | /* kernel thread that runs rebalance_shares() periodically */ | 7465 | /* kernel thread that runs rebalance_shares() periodically */ |
7470 | static int load_balance_monitor(void *unused) | 7466 | static int load_balance_monitor(void *unused) |
7471 | { | 7467 | { |
7472 | unsigned int timeout = sysctl_sched_min_bal_int_shares; | 7468 | unsigned int timeout = sysctl_sched_min_bal_int_shares; |
7473 | struct sched_param schedparm; | 7469 | struct sched_param schedparm; |
7474 | int ret; | 7470 | int ret; |
7475 | 7471 | ||
7476 | /* | 7472 | /* |
7477 | * We don't want this thread's execution to be limited by the shares | 7473 | * We don't want this thread's execution to be limited by the shares |
7478 | * assigned to default group (init_task_group). Hence make it run | 7474 | * assigned to default group (init_task_group). Hence make it run |
7479 | * as a SCHED_RR RT task at the lowest priority. | 7475 | * as a SCHED_RR RT task at the lowest priority. |
7480 | */ | 7476 | */ |
7481 | schedparm.sched_priority = 1; | 7477 | schedparm.sched_priority = 1; |
7482 | ret = sched_setscheduler(current, SCHED_RR, &schedparm); | 7478 | ret = sched_setscheduler(current, SCHED_RR, &schedparm); |
7483 | if (ret) | 7479 | if (ret) |
7484 | printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" | 7480 | printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" |
7485 | " monitor thread (error = %d) \n", ret); | 7481 | " monitor thread (error = %d) \n", ret); |
7486 | 7482 | ||
7487 | while (!kthread_should_stop()) { | 7483 | while (!kthread_should_stop()) { |
7488 | int i, cpu, balanced = 1; | 7484 | int i, cpu, balanced = 1; |
7489 | 7485 | ||
7490 | /* Prevent cpus going down or coming up */ | 7486 | /* Prevent cpus going down or coming up */ |
7491 | get_online_cpus(); | 7487 | get_online_cpus(); |
7492 | /* lockout changes to doms_cur[] array */ | 7488 | /* lockout changes to doms_cur[] array */ |
7493 | lock_doms_cur(); | 7489 | lock_doms_cur(); |
7494 | /* | 7490 | /* |
7495 | * Enter a rcu read-side critical section to safely walk rq->sd | 7491 | * Enter a rcu read-side critical section to safely walk rq->sd |
7496 | * chain on various cpus and to walk task group list | 7492 | * chain on various cpus and to walk task group list |
7497 | * (rq->leaf_cfs_rq_list) in rebalance_shares(). | 7493 | * (rq->leaf_cfs_rq_list) in rebalance_shares(). |
7498 | */ | 7494 | */ |
7499 | rcu_read_lock(); | 7495 | rcu_read_lock(); |
7500 | 7496 | ||
7501 | for (i = 0; i < ndoms_cur; i++) { | 7497 | for (i = 0; i < ndoms_cur; i++) { |
7502 | cpumask_t cpumap = doms_cur[i]; | 7498 | cpumask_t cpumap = doms_cur[i]; |
7503 | struct sched_domain *sd = NULL, *sd_prev = NULL; | 7499 | struct sched_domain *sd = NULL, *sd_prev = NULL; |
7504 | 7500 | ||
7505 | cpu = first_cpu(cpumap); | 7501 | cpu = first_cpu(cpumap); |
7506 | 7502 | ||
7507 | /* Find the highest domain at which to balance shares */ | 7503 | /* Find the highest domain at which to balance shares */ |
7508 | for_each_domain(cpu, sd) { | 7504 | for_each_domain(cpu, sd) { |
7509 | if (!(sd->flags & SD_LOAD_BALANCE)) | 7505 | if (!(sd->flags & SD_LOAD_BALANCE)) |
7510 | continue; | 7506 | continue; |
7511 | sd_prev = sd; | 7507 | sd_prev = sd; |
7512 | } | 7508 | } |
7513 | 7509 | ||
7514 | sd = sd_prev; | 7510 | sd = sd_prev; |
7515 | /* sd == NULL? No load balance reqd in this domain */ | 7511 | /* sd == NULL? No load balance reqd in this domain */ |
7516 | if (!sd) | 7512 | if (!sd) |
7517 | continue; | 7513 | continue; |
7518 | 7514 | ||
7519 | balanced &= rebalance_shares(sd, cpu); | 7515 | balanced &= rebalance_shares(sd, cpu); |
7520 | } | 7516 | } |
7521 | 7517 | ||
7522 | rcu_read_unlock(); | 7518 | rcu_read_unlock(); |
7523 | 7519 | ||
7524 | unlock_doms_cur(); | 7520 | unlock_doms_cur(); |
7525 | put_online_cpus(); | 7521 | put_online_cpus(); |
7526 | 7522 | ||
7527 | if (!balanced) | 7523 | if (!balanced) |
7528 | timeout = sysctl_sched_min_bal_int_shares; | 7524 | timeout = sysctl_sched_min_bal_int_shares; |
7529 | else if (timeout < sysctl_sched_max_bal_int_shares) | 7525 | else if (timeout < sysctl_sched_max_bal_int_shares) |
7530 | timeout *= 2; | 7526 | timeout *= 2; |
7531 | 7527 | ||
7532 | msleep_interruptible(timeout); | 7528 | msleep_interruptible(timeout); |
7533 | } | 7529 | } |
7534 | 7530 | ||
7535 | return 0; | 7531 | return 0; |
7536 | } | 7532 | } |
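With the default sysctls, the sleep interval between passes backs off exponentially while the groups stay balanced and snaps back to the minimum as soon as an adjustment was needed. A small standalone sketch of that backoff (plain C, not kernel code; the constants stand in for the two sysctls):

    #include <stdio.h>

    int main(void)
    {
            unsigned int min = 8, max = 128;  /* sysctl_sched_{min,max}_bal_int_shares */
            unsigned int timeout = min;
            int pass;

            for (pass = 0; pass < 6; pass++) {
                    printf("pass %d: sleep %u ms\n", pass, timeout);
                    /* pretend every pass found the groups already balanced */
                    if (timeout < max)
                            timeout *= 2;
            }
            return 0;
    }

This walks 8, 16, 32, 64, 128 and then stays at 128 ms; an unbalanced pass would reset the timeout to 8 ms, exactly as the loop above does.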
7537 | #endif /* CONFIG_SMP */ | 7533 | #endif /* CONFIG_SMP */ |
7538 | 7534 | ||
7539 | static void free_sched_group(struct task_group *tg) | 7535 | static void free_sched_group(struct task_group *tg) |
7540 | { | 7536 | { |
7541 | int i; | 7537 | int i; |
7542 | 7538 | ||
7543 | for_each_possible_cpu(i) { | 7539 | for_each_possible_cpu(i) { |
7544 | if (tg->cfs_rq) | 7540 | if (tg->cfs_rq) |
7545 | kfree(tg->cfs_rq[i]); | 7541 | kfree(tg->cfs_rq[i]); |
7546 | if (tg->se) | 7542 | if (tg->se) |
7547 | kfree(tg->se[i]); | 7543 | kfree(tg->se[i]); |
7548 | if (tg->rt_rq) | 7544 | if (tg->rt_rq) |
7549 | kfree(tg->rt_rq[i]); | 7545 | kfree(tg->rt_rq[i]); |
7550 | if (tg->rt_se) | 7546 | if (tg->rt_se) |
7551 | kfree(tg->rt_se[i]); | 7547 | kfree(tg->rt_se[i]); |
7552 | } | 7548 | } |
7553 | 7549 | ||
7554 | kfree(tg->cfs_rq); | 7550 | kfree(tg->cfs_rq); |
7555 | kfree(tg->se); | 7551 | kfree(tg->se); |
7556 | kfree(tg->rt_rq); | 7552 | kfree(tg->rt_rq); |
7557 | kfree(tg->rt_se); | 7553 | kfree(tg->rt_se); |
7558 | kfree(tg); | 7554 | kfree(tg); |
7559 | } | 7555 | } |
7560 | 7556 | ||
7561 | /* allocate runqueue etc for a new task group */ | 7557 | /* allocate runqueue etc for a new task group */ |
7562 | struct task_group *sched_create_group(void) | 7558 | struct task_group *sched_create_group(void) |
7563 | { | 7559 | { |
7564 | struct task_group *tg; | 7560 | struct task_group *tg; |
7565 | struct cfs_rq *cfs_rq; | 7561 | struct cfs_rq *cfs_rq; |
7566 | struct sched_entity *se; | 7562 | struct sched_entity *se; |
7567 | struct rt_rq *rt_rq; | 7563 | struct rt_rq *rt_rq; |
7568 | struct sched_rt_entity *rt_se; | 7564 | struct sched_rt_entity *rt_se; |
7569 | struct rq *rq; | 7565 | struct rq *rq; |
7570 | int i; | 7566 | int i; |
7571 | 7567 | ||
7572 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 7568 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
7573 | if (!tg) | 7569 | if (!tg) |
7574 | return ERR_PTR(-ENOMEM); | 7570 | return ERR_PTR(-ENOMEM); |
7575 | 7571 | ||
7576 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); | 7572 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); |
7577 | if (!tg->cfs_rq) | 7573 | if (!tg->cfs_rq) |
7578 | goto err; | 7574 | goto err; |
7579 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 7575 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); |
7580 | if (!tg->se) | 7576 | if (!tg->se) |
7581 | goto err; | 7577 | goto err; |
7582 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | 7578 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); |
7583 | if (!tg->rt_rq) | 7579 | if (!tg->rt_rq) |
7584 | goto err; | 7580 | goto err; |
7585 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | 7581 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); |
7586 | if (!tg->rt_se) | 7582 | if (!tg->rt_se) |
7587 | goto err; | 7583 | goto err; |
7588 | 7584 | ||
7589 | tg->shares = NICE_0_LOAD; | 7585 | tg->shares = NICE_0_LOAD; |
7590 | tg->rt_ratio = 0; /* XXX */ | 7586 | tg->rt_ratio = 0; /* XXX */ |
7591 | 7587 | ||
7592 | for_each_possible_cpu(i) { | 7588 | for_each_possible_cpu(i) { |
7593 | rq = cpu_rq(i); | 7589 | rq = cpu_rq(i); |
7594 | 7590 | ||
7595 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), | 7591 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), |
7596 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); | 7592 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7597 | if (!cfs_rq) | 7593 | if (!cfs_rq) |
7598 | goto err; | 7594 | goto err; |
7599 | 7595 | ||
7600 | se = kmalloc_node(sizeof(struct sched_entity), | 7596 | se = kmalloc_node(sizeof(struct sched_entity), |
7601 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); | 7597 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7602 | if (!se) | 7598 | if (!se) |
7603 | goto err; | 7599 | goto err; |
7604 | 7600 | ||
7605 | rt_rq = kmalloc_node(sizeof(struct rt_rq), | 7601 | rt_rq = kmalloc_node(sizeof(struct rt_rq), |
7606 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); | 7602 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7607 | if (!rt_rq) | 7603 | if (!rt_rq) |
7608 | goto err; | 7604 | goto err; |
7609 | 7605 | ||
7610 | rt_se = kmalloc_node(sizeof(struct sched_rt_entity), | 7606 | rt_se = kmalloc_node(sizeof(struct sched_rt_entity), |
7611 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); | 7607 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7612 | if (!rt_se) | 7608 | if (!rt_se) |
7613 | goto err; | 7609 | goto err; |
7614 | 7610 | ||
7615 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); | 7611 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); |
7616 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); | 7612 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); |
7617 | } | 7613 | } |
7618 | 7614 | ||
7619 | lock_task_group_list(); | 7615 | lock_task_group_list(); |
7620 | for_each_possible_cpu(i) { | 7616 | for_each_possible_cpu(i) { |
7621 | rq = cpu_rq(i); | 7617 | rq = cpu_rq(i); |
7622 | cfs_rq = tg->cfs_rq[i]; | 7618 | cfs_rq = tg->cfs_rq[i]; |
7623 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7619 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7624 | rt_rq = tg->rt_rq[i]; | 7620 | rt_rq = tg->rt_rq[i]; |
7625 | list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 7621 | list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
7626 | } | 7622 | } |
7627 | list_add_rcu(&tg->list, &task_groups); | 7623 | list_add_rcu(&tg->list, &task_groups); |
7628 | unlock_task_group_list(); | 7624 | unlock_task_group_list(); |
7629 | 7625 | ||
7630 | return tg; | 7626 | return tg; |
7631 | 7627 | ||
7632 | err: | 7628 | err: |
7633 | free_sched_group(tg); | 7629 | free_sched_group(tg); |
7634 | return ERR_PTR(-ENOMEM); | 7630 | return ERR_PTR(-ENOMEM); |
7635 | } | 7631 | } |
7636 | 7632 | ||
7637 | /* rcu callback to free various structures associated with a task group */ | 7633 | /* rcu callback to free various structures associated with a task group */ |
7638 | static void free_sched_group_rcu(struct rcu_head *rhp) | 7634 | static void free_sched_group_rcu(struct rcu_head *rhp) |
7639 | { | 7635 | { |
7640 | /* now it should be safe to free those cfs_rqs */ | 7636 | /* now it should be safe to free those cfs_rqs */ |
7641 | free_sched_group(container_of(rhp, struct task_group, rcu)); | 7637 | free_sched_group(container_of(rhp, struct task_group, rcu)); |
7642 | } | 7638 | } |
7643 | 7639 | ||
7644 | /* Destroy runqueue etc associated with a task group */ | 7640 | /* Destroy runqueue etc associated with a task group */ |
7645 | void sched_destroy_group(struct task_group *tg) | 7641 | void sched_destroy_group(struct task_group *tg) |
7646 | { | 7642 | { |
7647 | struct cfs_rq *cfs_rq = NULL; | 7643 | struct cfs_rq *cfs_rq = NULL; |
7648 | struct rt_rq *rt_rq = NULL; | 7644 | struct rt_rq *rt_rq = NULL; |
7649 | int i; | 7645 | int i; |
7650 | 7646 | ||
7651 | lock_task_group_list(); | 7647 | lock_task_group_list(); |
7652 | for_each_possible_cpu(i) { | 7648 | for_each_possible_cpu(i) { |
7653 | cfs_rq = tg->cfs_rq[i]; | 7649 | cfs_rq = tg->cfs_rq[i]; |
7654 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | 7650 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); |
7655 | rt_rq = tg->rt_rq[i]; | 7651 | rt_rq = tg->rt_rq[i]; |
7656 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | 7652 | list_del_rcu(&rt_rq->leaf_rt_rq_list); |
7657 | } | 7653 | } |
7658 | list_del_rcu(&tg->list); | 7654 | list_del_rcu(&tg->list); |
7659 | unlock_task_group_list(); | 7655 | unlock_task_group_list(); |
7660 | 7656 | ||
7661 | BUG_ON(!cfs_rq); | 7657 | BUG_ON(!cfs_rq); |
7662 | 7658 | ||
7663 | /* wait for possible concurrent references to cfs_rqs to complete */ | 7659 | /* wait for possible concurrent references to cfs_rqs to complete */ |
7664 | call_rcu(&tg->rcu, free_sched_group_rcu); | 7660 | call_rcu(&tg->rcu, free_sched_group_rcu); |
7665 | } | 7661 | } |
7666 | 7662 | ||
7667 | /* change task's runqueue when it moves between groups. | 7663 | /* change task's runqueue when it moves between groups. |
7668 | * The caller of this function should have put the task in its new group | 7664 | * The caller of this function should have put the task in its new group |
7669 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to | 7665 | * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to |
7670 | * reflect its new group. | 7666 | * reflect its new group. |
7671 | */ | 7667 | */ |
7672 | void sched_move_task(struct task_struct *tsk) | 7668 | void sched_move_task(struct task_struct *tsk) |
7673 | { | 7669 | { |
7674 | int on_rq, running; | 7670 | int on_rq, running; |
7675 | unsigned long flags; | 7671 | unsigned long flags; |
7676 | struct rq *rq; | 7672 | struct rq *rq; |
7677 | 7673 | ||
7678 | rq = task_rq_lock(tsk, &flags); | 7674 | rq = task_rq_lock(tsk, &flags); |
7679 | 7675 | ||
7680 | update_rq_clock(rq); | 7676 | update_rq_clock(rq); |
7681 | 7677 | ||
7682 | running = task_current(rq, tsk); | 7678 | running = task_current(rq, tsk); |
7683 | on_rq = tsk->se.on_rq; | 7679 | on_rq = tsk->se.on_rq; |
7684 | 7680 | ||
7685 | if (on_rq) { | 7681 | if (on_rq) { |
7686 | dequeue_task(rq, tsk, 0); | 7682 | dequeue_task(rq, tsk, 0); |
7687 | if (unlikely(running)) | 7683 | if (unlikely(running)) |
7688 | tsk->sched_class->put_prev_task(rq, tsk); | 7684 | tsk->sched_class->put_prev_task(rq, tsk); |
7689 | } | 7685 | } |
7690 | 7686 | ||
7691 | set_task_rq(tsk, task_cpu(tsk)); | 7687 | set_task_rq(tsk, task_cpu(tsk)); |
7692 | 7688 | ||
7693 | if (on_rq) { | 7689 | if (on_rq) { |
7694 | if (unlikely(running)) | 7690 | if (unlikely(running)) |
7695 | tsk->sched_class->set_curr_task(rq); | 7691 | tsk->sched_class->set_curr_task(rq); |
7696 | enqueue_task(rq, tsk, 0); | 7692 | enqueue_task(rq, tsk, 0); |
7697 | } | 7693 | } |
7698 | 7694 | ||
7699 | task_rq_unlock(rq, &flags); | 7695 | task_rq_unlock(rq, &flags); |
7700 | } | 7696 | } |
7701 | 7697 | ||
7702 | /* rq->lock to be locked by caller */ | 7698 | /* rq->lock to be locked by caller */ |
7703 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 7699 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
7704 | { | 7700 | { |
7705 | struct cfs_rq *cfs_rq = se->cfs_rq; | 7701 | struct cfs_rq *cfs_rq = se->cfs_rq; |
7706 | struct rq *rq = cfs_rq->rq; | 7702 | struct rq *rq = cfs_rq->rq; |
7707 | int on_rq; | 7703 | int on_rq; |
7708 | 7704 | ||
7709 | if (!shares) | 7705 | if (!shares) |
7710 | shares = MIN_GROUP_SHARES; | 7706 | shares = MIN_GROUP_SHARES; |
7711 | 7707 | ||
7712 | on_rq = se->on_rq; | 7708 | on_rq = se->on_rq; |
7713 | if (on_rq) { | 7709 | if (on_rq) { |
7714 | dequeue_entity(cfs_rq, se, 0); | 7710 | dequeue_entity(cfs_rq, se, 0); |
7715 | dec_cpu_load(rq, se->load.weight); | 7711 | dec_cpu_load(rq, se->load.weight); |
7716 | } | 7712 | } |
7717 | 7713 | ||
7718 | se->load.weight = shares; | 7714 | se->load.weight = shares; |
7719 | se->load.inv_weight = div64_64((1ULL<<32), shares); | 7715 | se->load.inv_weight = div64_64((1ULL<<32), shares); |
7720 | 7716 | ||
7721 | if (on_rq) { | 7717 | if (on_rq) { |
7722 | enqueue_entity(cfs_rq, se, 0); | 7718 | enqueue_entity(cfs_rq, se, 0); |
7723 | inc_cpu_load(rq, se->load.weight); | 7719 | inc_cpu_load(rq, se->load.weight); |
7724 | } | 7720 | } |
7725 | } | 7721 | } |
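The inverse weight stored above is a fixed-point reciprocal, inv_weight = 2^32 / weight, so a later division by the weight can be done as a multiply and a shift. For the default group weight of 1024 that is 4294967296 / 1024 = 4194304; halving the shares doubles the stored inverse.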
7726 | 7722 | ||
7727 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 7723 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
7728 | { | 7724 | { |
7729 | int i; | 7725 | int i; |
7730 | struct cfs_rq *cfs_rq; | 7726 | struct cfs_rq *cfs_rq; |
7731 | struct rq *rq; | 7727 | struct rq *rq; |
7732 | 7728 | ||
7733 | lock_task_group_list(); | 7729 | lock_task_group_list(); |
7734 | if (tg->shares == shares) | 7730 | if (tg->shares == shares) |
7735 | goto done; | 7731 | goto done; |
7736 | 7732 | ||
7737 | if (shares < MIN_GROUP_SHARES) | 7733 | if (shares < MIN_GROUP_SHARES) |
7738 | shares = MIN_GROUP_SHARES; | 7734 | shares = MIN_GROUP_SHARES; |
7739 | 7735 | ||
7740 | /* | 7736 | /* |
7741 | * Prevent any load balance activity (rebalance_shares, | 7737 | * Prevent any load balance activity (rebalance_shares, |
7742 | * load_balance_fair) from referring to this group first, | 7738 | * load_balance_fair) from referring to this group first, |
7743 | * by taking it off the rq->leaf_cfs_rq_list on each cpu. | 7739 | * by taking it off the rq->leaf_cfs_rq_list on each cpu. |
7744 | */ | 7740 | */ |
7745 | for_each_possible_cpu(i) { | 7741 | for_each_possible_cpu(i) { |
7746 | cfs_rq = tg->cfs_rq[i]; | 7742 | cfs_rq = tg->cfs_rq[i]; |
7747 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | 7743 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); |
7748 | } | 7744 | } |
7749 | 7745 | ||
7750 | /* wait for any ongoing reference to this group to finish */ | 7746 | /* wait for any ongoing reference to this group to finish */ |
7751 | synchronize_sched(); | 7747 | synchronize_sched(); |
7752 | 7748 | ||
7753 | /* | 7749 | /* |
7754 | * Now we are free to modify the group's share on each cpu | 7750 | * Now we are free to modify the group's share on each cpu |
7755 | * w/o tripping rebalance_share or load_balance_fair. | 7751 | * w/o tripping rebalance_share or load_balance_fair. |
7756 | */ | 7752 | */ |
7757 | tg->shares = shares; | 7753 | tg->shares = shares; |
7758 | for_each_possible_cpu(i) { | 7754 | for_each_possible_cpu(i) { |
7759 | spin_lock_irq(&cpu_rq(i)->lock); | 7755 | spin_lock_irq(&cpu_rq(i)->lock); |
7760 | set_se_shares(tg->se[i], shares); | 7756 | set_se_shares(tg->se[i], shares); |
7761 | spin_unlock_irq(&cpu_rq(i)->lock); | 7757 | spin_unlock_irq(&cpu_rq(i)->lock); |
7762 | } | 7758 | } |
7763 | 7759 | ||
7764 | /* | 7760 | /* |
7765 | * Enable load balance activity on this group, by inserting it back on | 7761 | * Enable load balance activity on this group, by inserting it back on |
7766 | * each cpu's rq->leaf_cfs_rq_list. | 7762 | * each cpu's rq->leaf_cfs_rq_list. |
7767 | */ | 7763 | */ |
7768 | for_each_possible_cpu(i) { | 7764 | for_each_possible_cpu(i) { |
7769 | rq = cpu_rq(i); | 7765 | rq = cpu_rq(i); |
7770 | cfs_rq = tg->cfs_rq[i]; | 7766 | cfs_rq = tg->cfs_rq[i]; |
7771 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7767 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7772 | } | 7768 | } |
7773 | done: | 7769 | done: |
7774 | unlock_task_group_list(); | 7770 | unlock_task_group_list(); |
7775 | return 0; | 7771 | return 0; |
7776 | } | 7772 | } |
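The update above follows the usual RCU republication pattern: list_del_rcu() takes every per-cpu cfs_rq of the group out of the walkers' view, synchronize_sched() waits out any walker that may already hold a reference, the per-cpu shares are then rewritten under only the runqueue locks, and list_add_rcu() puts the group back on each rq->leaf_cfs_rq_list.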
7777 | 7773 | ||
7778 | unsigned long sched_group_shares(struct task_group *tg) | 7774 | unsigned long sched_group_shares(struct task_group *tg) |
7779 | { | 7775 | { |
7780 | return tg->shares; | 7776 | return tg->shares; |
7781 | } | 7777 | } |
7782 | 7778 | ||
7783 | /* | 7779 | /* |
7784 | * Ensure the total rt_ratio <= sysctl_sched_rt_ratio | 7780 | * Ensure the total rt_ratio <= sysctl_sched_rt_ratio |
7785 | */ | 7781 | */ |
7786 | int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) | 7782 | int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) |
7787 | { | 7783 | { |
7788 | struct task_group *tgi; | 7784 | struct task_group *tgi; |
7789 | unsigned long total = 0; | 7785 | unsigned long total = 0; |
7790 | 7786 | ||
7791 | rcu_read_lock(); | 7787 | rcu_read_lock(); |
7792 | list_for_each_entry_rcu(tgi, &task_groups, list) | 7788 | list_for_each_entry_rcu(tgi, &task_groups, list) |
7793 | total += tgi->rt_ratio; | 7789 | total += tgi->rt_ratio; |
7794 | rcu_read_unlock(); | 7790 | rcu_read_unlock(); |
7795 | 7791 | ||
7796 | if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) | 7792 | if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) |
7797 | return -EINVAL; | 7793 | return -EINVAL; |
7798 | 7794 | ||
7799 | tg->rt_ratio = rt_ratio; | 7795 | tg->rt_ratio = rt_ratio; |
7800 | return 0; | 7796 | return 0; |
7801 | } | 7797 | } |
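As a worked example with hypothetical numbers: if sysctl_sched_rt_ratio were 100 and all groups currently summed to 80, of which this group holds 10, then a new rt_ratio of 30 just passes the check (80 + 30 - 10 = 100) while 31 would be rejected with -EINVAL.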
7802 | 7798 | ||
7803 | unsigned long sched_group_rt_ratio(struct task_group *tg) | 7799 | unsigned long sched_group_rt_ratio(struct task_group *tg) |
7804 | { | 7800 | { |
7805 | return tg->rt_ratio; | 7801 | return tg->rt_ratio; |
7806 | } | 7802 | } |
7807 | 7803 | ||
7808 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7804 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7809 | 7805 | ||
7810 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 7806 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
7811 | 7807 | ||
7812 | /* return corresponding task_group object of a cgroup */ | 7808 | /* return corresponding task_group object of a cgroup */ |
7813 | static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | 7809 | static inline struct task_group *cgroup_tg(struct cgroup *cgrp) |
7814 | { | 7810 | { |
7815 | return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), | 7811 | return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), |
7816 | struct task_group, css); | 7812 | struct task_group, css); |
7817 | } | 7813 | } |
7818 | 7814 | ||
7819 | static struct cgroup_subsys_state * | 7815 | static struct cgroup_subsys_state * |
7820 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | 7816 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) |
7821 | { | 7817 | { |
7822 | struct task_group *tg; | 7818 | struct task_group *tg; |
7823 | 7819 | ||
7824 | if (!cgrp->parent) { | 7820 | if (!cgrp->parent) { |
7825 | /* This is early initialization for the top cgroup */ | 7821 | /* This is early initialization for the top cgroup */ |
7826 | init_task_group.css.cgroup = cgrp; | 7822 | init_task_group.css.cgroup = cgrp; |
7827 | return &init_task_group.css; | 7823 | return &init_task_group.css; |
7828 | } | 7824 | } |
7829 | 7825 | ||
7830 | /* we support only 1-level deep hierarchical scheduler atm */ | 7826 | /* we support only 1-level deep hierarchical scheduler atm */ |
7831 | if (cgrp->parent->parent) | 7827 | if (cgrp->parent->parent) |
7832 | return ERR_PTR(-EINVAL); | 7828 | return ERR_PTR(-EINVAL); |
7833 | 7829 | ||
7834 | tg = sched_create_group(); | 7830 | tg = sched_create_group(); |
7835 | if (IS_ERR(tg)) | 7831 | if (IS_ERR(tg)) |
7836 | return ERR_PTR(-ENOMEM); | 7832 | return ERR_PTR(-ENOMEM); |
7837 | 7833 | ||
7838 | /* Bind the cgroup to task_group object we just created */ | 7834 | /* Bind the cgroup to task_group object we just created */ |
7839 | tg->css.cgroup = cgrp; | 7835 | tg->css.cgroup = cgrp; |
7840 | 7836 | ||
7841 | return &tg->css; | 7837 | return &tg->css; |
7842 | } | 7838 | } |
7843 | 7839 | ||
7844 | static void | 7840 | static void |
7845 | cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | 7841 | cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
7846 | { | 7842 | { |
7847 | struct task_group *tg = cgroup_tg(cgrp); | 7843 | struct task_group *tg = cgroup_tg(cgrp); |
7848 | 7844 | ||
7849 | sched_destroy_group(tg); | 7845 | sched_destroy_group(tg); |
7850 | } | 7846 | } |
7851 | 7847 | ||
7852 | static int | 7848 | static int |
7853 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 7849 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
7854 | struct task_struct *tsk) | 7850 | struct task_struct *tsk) |
7855 | { | 7851 | { |
7856 | /* We don't support RT-tasks being in separate groups */ | 7852 | /* We don't support RT-tasks being in separate groups */ |
7857 | if (tsk->sched_class != &fair_sched_class) | 7853 | if (tsk->sched_class != &fair_sched_class) |
7858 | return -EINVAL; | 7854 | return -EINVAL; |
7859 | 7855 | ||
7860 | return 0; | 7856 | return 0; |
7861 | } | 7857 | } |
7862 | 7858 | ||
7863 | static void | 7859 | static void |
7864 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 7860 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
7865 | struct cgroup *old_cont, struct task_struct *tsk) | 7861 | struct cgroup *old_cont, struct task_struct *tsk) |
7866 | { | 7862 | { |
7867 | sched_move_task(tsk); | 7863 | sched_move_task(tsk); |
7868 | } | 7864 | } |
7869 | 7865 | ||
7870 | static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, | 7866 | static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, |
7871 | u64 shareval) | 7867 | u64 shareval) |
7872 | { | 7868 | { |
7873 | return sched_group_set_shares(cgroup_tg(cgrp), shareval); | 7869 | return sched_group_set_shares(cgroup_tg(cgrp), shareval); |
7874 | } | 7870 | } |
7875 | 7871 | ||
7876 | static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | 7872 | static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) |
7877 | { | 7873 | { |
7878 | struct task_group *tg = cgroup_tg(cgrp); | 7874 | struct task_group *tg = cgroup_tg(cgrp); |
7879 | 7875 | ||
7880 | return (u64) tg->shares; | 7876 | return (u64) tg->shares; |
7881 | } | 7877 | } |
7882 | 7878 | ||
7883 | static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, | 7879 | static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, |
7884 | u64 rt_ratio_val) | 7880 | u64 rt_ratio_val) |
7885 | { | 7881 | { |
7886 | return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); | 7882 | return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); |
7887 | } | 7883 | } |
7888 | 7884 | ||
7889 | static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) | 7885 | static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) |
7890 | { | 7886 | { |
7891 | struct task_group *tg = cgroup_tg(cgrp); | 7887 | struct task_group *tg = cgroup_tg(cgrp); |
7892 | 7888 | ||
7893 | return (u64) tg->rt_ratio; | 7889 | return (u64) tg->rt_ratio; |
7894 | } | 7890 | } |
7895 | 7891 | ||
7896 | static struct cftype cpu_files[] = { | 7892 | static struct cftype cpu_files[] = { |
7897 | { | 7893 | { |
7898 | .name = "shares", | 7894 | .name = "shares", |
7899 | .read_uint = cpu_shares_read_uint, | 7895 | .read_uint = cpu_shares_read_uint, |
7900 | .write_uint = cpu_shares_write_uint, | 7896 | .write_uint = cpu_shares_write_uint, |
7901 | }, | 7897 | }, |
7902 | { | 7898 | { |
7903 | .name = "rt_ratio", | 7899 | .name = "rt_ratio", |
7904 | .read_uint = cpu_rt_ratio_read_uint, | 7900 | .read_uint = cpu_rt_ratio_read_uint, |
7905 | .write_uint = cpu_rt_ratio_write_uint, | 7901 | .write_uint = cpu_rt_ratio_write_uint, |
7906 | }, | 7902 | }, |
7907 | }; | 7903 | }; |
7908 | 7904 | ||
7909 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 7905 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
7910 | { | 7906 | { |
7911 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | 7907 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); |
7912 | } | 7908 | } |
7913 | 7909 | ||
7914 | struct cgroup_subsys cpu_cgroup_subsys = { | 7910 | struct cgroup_subsys cpu_cgroup_subsys = { |
7915 | .name = "cpu", | 7911 | .name = "cpu", |
7916 | .create = cpu_cgroup_create, | 7912 | .create = cpu_cgroup_create, |
7917 | .destroy = cpu_cgroup_destroy, | 7913 | .destroy = cpu_cgroup_destroy, |
7918 | .can_attach = cpu_cgroup_can_attach, | 7914 | .can_attach = cpu_cgroup_can_attach, |
7919 | .attach = cpu_cgroup_attach, | 7915 | .attach = cpu_cgroup_attach, |
7920 | .populate = cpu_cgroup_populate, | 7916 | .populate = cpu_cgroup_populate, |
7921 | .subsys_id = cpu_cgroup_subsys_id, | 7917 | .subsys_id = cpu_cgroup_subsys_id, |
7922 | .early_init = 1, | 7918 | .early_init = 1, |
7923 | }; | 7919 | }; |
7924 | 7920 | ||
7925 | #endif /* CONFIG_FAIR_CGROUP_SCHED */ | 7921 | #endif /* CONFIG_FAIR_CGROUP_SCHED */ |
7926 | 7922 | ||
7927 | #ifdef CONFIG_CGROUP_CPUACCT | 7923 | #ifdef CONFIG_CGROUP_CPUACCT |
7928 | 7924 | ||
7929 | /* | 7925 | /* |
7930 | * CPU accounting code for task groups. | 7926 | * CPU accounting code for task groups. |
7931 | * | 7927 | * |
7932 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | 7928 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh |
7933 | * (balbir@in.ibm.com). | 7929 | * (balbir@in.ibm.com). |
7934 | */ | 7930 | */ |
7935 | 7931 | ||
7936 | /* track cpu usage of a group of tasks */ | 7932 | /* track cpu usage of a group of tasks */ |
7937 | struct cpuacct { | 7933 | struct cpuacct { |
7938 | struct cgroup_subsys_state css; | 7934 | struct cgroup_subsys_state css; |
7939 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 7935 | /* cpuusage holds pointer to a u64-type object on every cpu */ |
7940 | u64 *cpuusage; | 7936 | u64 *cpuusage; |
7941 | }; | 7937 | }; |
7942 | 7938 | ||
7943 | struct cgroup_subsys cpuacct_subsys; | 7939 | struct cgroup_subsys cpuacct_subsys; |
7944 | 7940 | ||
7945 | /* return cpu accounting group corresponding to this container */ | 7941 | /* return cpu accounting group corresponding to this container */ |
7946 | static inline struct cpuacct *cgroup_ca(struct cgroup *cont) | 7942 | static inline struct cpuacct *cgroup_ca(struct cgroup *cont) |
7947 | { | 7943 | { |
7948 | return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), | 7944 | return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), |
7949 | struct cpuacct, css); | 7945 | struct cpuacct, css); |
7950 | } | 7946 | } |
7951 | 7947 | ||
7952 | /* return cpu accounting group to which this task belongs */ | 7948 | /* return cpu accounting group to which this task belongs */ |
7953 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | 7949 | static inline struct cpuacct *task_ca(struct task_struct *tsk) |
7954 | { | 7950 | { |
7955 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | 7951 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), |
7956 | struct cpuacct, css); | 7952 | struct cpuacct, css); |
7957 | } | 7953 | } |
7958 | 7954 | ||
7959 | /* create a new cpu accounting group */ | 7955 | /* create a new cpu accounting group */ |
7960 | static struct cgroup_subsys_state *cpuacct_create( | 7956 | static struct cgroup_subsys_state *cpuacct_create( |
7961 | struct cgroup_subsys *ss, struct cgroup *cont) | 7957 | struct cgroup_subsys *ss, struct cgroup *cont) |
7962 | { | 7958 | { |
7963 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 7959 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
7964 | 7960 | ||
7965 | if (!ca) | 7961 | if (!ca) |
7966 | return ERR_PTR(-ENOMEM); | 7962 | return ERR_PTR(-ENOMEM); |
7967 | 7963 | ||
7968 | ca->cpuusage = alloc_percpu(u64); | 7964 | ca->cpuusage = alloc_percpu(u64); |
7969 | if (!ca->cpuusage) { | 7965 | if (!ca->cpuusage) { |
7970 | kfree(ca); | 7966 | kfree(ca); |
7971 | return ERR_PTR(-ENOMEM); | 7967 | return ERR_PTR(-ENOMEM); |
7972 | } | 7968 | } |
7973 | 7969 | ||
7974 | return &ca->css; | 7970 | return &ca->css; |
7975 | } | 7971 | } |
7976 | 7972 | ||
7977 | /* destroy an existing cpu accounting group */ | 7973 | /* destroy an existing cpu accounting group */ |
7978 | static void | 7974 | static void |
7979 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 7975 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) |
7980 | { | 7976 | { |
7981 | struct cpuacct *ca = cgroup_ca(cont); | 7977 | struct cpuacct *ca = cgroup_ca(cont); |
7982 | 7978 | ||
7983 | free_percpu(ca->cpuusage); | 7979 | free_percpu(ca->cpuusage); |
7984 | kfree(ca); | 7980 | kfree(ca); |
7985 | } | 7981 | } |
7986 | 7982 | ||
7987 | /* return total cpu usage (in nanoseconds) of a group */ | 7983 | /* return total cpu usage (in nanoseconds) of a group */ |
7988 | static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | 7984 | static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) |
7989 | { | 7985 | { |
7990 | struct cpuacct *ca = cgroup_ca(cont); | 7986 | struct cpuacct *ca = cgroup_ca(cont); |
7991 | u64 totalcpuusage = 0; | 7987 | u64 totalcpuusage = 0; |
7992 | int i; | 7988 | int i; |
7993 | 7989 | ||
7994 | for_each_possible_cpu(i) { | 7990 | for_each_possible_cpu(i) { |
7995 | u64 *cpuusage = percpu_ptr(ca->cpuusage, i); | 7991 | u64 *cpuusage = percpu_ptr(ca->cpuusage, i); |
7996 | 7992 | ||
7997 | /* | 7993 | /* |
7998 | * Take rq->lock to make 64-bit addition safe on 32-bit | 7994 | * Take rq->lock to make 64-bit addition safe on 32-bit |
7999 | * platforms. | 7995 | * platforms. |
8000 | */ | 7996 | */ |
8001 | spin_lock_irq(&cpu_rq(i)->lock); | 7997 | spin_lock_irq(&cpu_rq(i)->lock); |
8002 | totalcpuusage += *cpuusage; | 7998 | totalcpuusage += *cpuusage; |
8003 | spin_unlock_irq(&cpu_rq(i)->lock); | 7999 | spin_unlock_irq(&cpu_rq(i)->lock); |
8004 | } | 8000 | } |
8005 | 8001 | ||
8006 | return totalcpuusage; | 8002 | return totalcpuusage; |
8007 | } | 8003 | } |
8008 | 8004 | ||
8009 | static struct cftype files[] = { | 8005 | static struct cftype files[] = { |
8010 | { | 8006 | { |
8011 | .name = "usage", | 8007 | .name = "usage", |
8012 | .read_uint = cpuusage_read, | 8008 | .read_uint = cpuusage_read, |
8013 | }, | 8009 | }, |
8014 | }; | 8010 | }; |
8015 | 8011 | ||
8016 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 8012 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
8017 | { | 8013 | { |
8018 | return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 8014 | return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); |
8019 | } | 8015 | } |
8020 | 8016 | ||
8021 | /* | 8017 | /* |
8022 | * charge this task's execution time to its accounting group. | 8018 | * charge this task's execution time to its accounting group. |
8023 | * | 8019 | * |
8024 | * called with rq->lock held. | 8020 | * called with rq->lock held. |
8025 | */ | 8021 | */ |
8026 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | 8022 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) |
8027 | { | 8023 | { |
8028 | struct cpuacct *ca; | 8024 | struct cpuacct *ca; |
8029 | 8025 | ||
8030 | if (!cpuacct_subsys.active) | 8026 | if (!cpuacct_subsys.active) |
8031 | return; | 8027 | return; |
8032 | 8028 | ||
8033 | ca = task_ca(tsk); | 8029 | ca = task_ca(tsk); |
8034 | if (ca) { | 8030 | if (ca) { |
8035 | u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); | 8031 | u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); |
8036 | 8032 | ||
8037 | *cpuusage += cputime; | 8033 | *cpuusage += cputime; |
8038 | } | 8034 | } |
8039 | } | 8035 | } |
8040 | 8036 | ||
8041 | struct cgroup_subsys cpuacct_subsys = { | 8037 | struct cgroup_subsys cpuacct_subsys = { |
8042 | .name = "cpuacct", | 8038 | .name = "cpuacct", |
8043 | .create = cpuacct_create, | 8039 | .create = cpuacct_create, |
8044 | .destroy = cpuacct_destroy, | 8040 | .destroy = cpuacct_destroy, |
8045 | .populate = cpuacct_populate, | 8041 | .populate = cpuacct_populate, |
8046 | .subsys_id = cpuacct_subsys_id, | 8042 | .subsys_id = cpuacct_subsys_id, |
8047 | }; | 8043 | }; |
8048 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8044 | #endif /* CONFIG_CGROUP_CPUACCT */ |
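The cpuacct hunks above keep one u64 per CPU and take rq->lock around each read so the summation in cpuusage_read() cannot observe a torn 64-bit value on 32-bit platforms. A small userland analogue of that pattern, assuming pthreads and invented names (not part of the commit):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NWORKERS 4

struct worker_counter {
	pthread_mutex_t lock;
	uint64_t usage;
};

static struct worker_counter counters[NWORKERS];

/* analogous to cpuacct_charge(): account time to one worker's counter */
static void charge(int cpu, uint64_t delta)
{
	pthread_mutex_lock(&counters[cpu].lock);
	counters[cpu].usage += delta;
	pthread_mutex_unlock(&counters[cpu].lock);
}

/* analogous to cpuusage_read(): lock each counter so the 64-bit load is safe */
static uint64_t usage_read(void)
{
	uint64_t total = 0;
	int i;

	for (i = 0; i < NWORKERS; i++) {
		pthread_mutex_lock(&counters[i].lock);
		total += counters[i].usage;
		pthread_mutex_unlock(&counters[i].lock);
	}
	return total;
}

int main(void)
{
	int i;

	for (i = 0; i < NWORKERS; i++)
		pthread_mutex_init(&counters[i].lock, NULL);

	charge(0, 1000);
	charge(1, 2500);
	printf("total usage: %llu\n", (unsigned long long)usage_read());
	return 0;
}

In the kernel the lock is the runqueue lock that cpuacct_charge() already runs under (see the "called with rq->lock held" comment), so the charge path pays nothing extra; only the reader takes the lock explicitly.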
kernel/spinlock.c
1 | /* | 1 | /* |
2 | * Copyright (2004) Linus Torvalds | 2 | * Copyright (2004) Linus Torvalds |
3 | * | 3 | * |
4 | * Author: Zwane Mwaikambo <zwane@fsmlabs.com> | 4 | * Author: Zwane Mwaikambo <zwane@fsmlabs.com> |
5 | * | 5 | * |
6 | * Copyright (2004, 2005) Ingo Molnar | 6 | * Copyright (2004, 2005) Ingo Molnar |
7 | * | 7 | * |
8 | * This file contains the spinlock/rwlock implementations for the | 8 | * This file contains the spinlock/rwlock implementations for the |
9 | * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) | 9 | * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) |
10 | * | 10 | * |
11 | * Note that some architectures have special knowledge about the | 11 | * Note that some architectures have special knowledge about the |
12 | * stack frames of these functions in their profile_pc. If you | 12 | * stack frames of these functions in their profile_pc. If you |
13 | * change anything significant here that could change the stack | 13 | * change anything significant here that could change the stack |
14 | * frame contact the architecture maintainers. | 14 | * frame contact the architecture maintainers. |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/linkage.h> | 17 | #include <linux/linkage.h> |
18 | #include <linux/preempt.h> | 18 | #include <linux/preempt.h> |
19 | #include <linux/spinlock.h> | 19 | #include <linux/spinlock.h> |
20 | #include <linux/interrupt.h> | 20 | #include <linux/interrupt.h> |
21 | #include <linux/debug_locks.h> | 21 | #include <linux/debug_locks.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | 23 | ||
24 | int __lockfunc _spin_trylock(spinlock_t *lock) | 24 | int __lockfunc _spin_trylock(spinlock_t *lock) |
25 | { | 25 | { |
26 | preempt_disable(); | 26 | preempt_disable(); |
27 | if (_raw_spin_trylock(lock)) { | 27 | if (_raw_spin_trylock(lock)) { |
28 | spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); | 28 | spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); |
29 | return 1; | 29 | return 1; |
30 | } | 30 | } |
31 | 31 | ||
32 | preempt_enable(); | 32 | preempt_enable(); |
33 | return 0; | 33 | return 0; |
34 | } | 34 | } |
35 | EXPORT_SYMBOL(_spin_trylock); | 35 | EXPORT_SYMBOL(_spin_trylock); |
36 | 36 | ||
37 | int __lockfunc _read_trylock(rwlock_t *lock) | 37 | int __lockfunc _read_trylock(rwlock_t *lock) |
38 | { | 38 | { |
39 | preempt_disable(); | 39 | preempt_disable(); |
40 | if (_raw_read_trylock(lock)) { | 40 | if (_raw_read_trylock(lock)) { |
41 | rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); | 41 | rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); |
42 | return 1; | 42 | return 1; |
43 | } | 43 | } |
44 | 44 | ||
45 | preempt_enable(); | 45 | preempt_enable(); |
46 | return 0; | 46 | return 0; |
47 | } | 47 | } |
48 | EXPORT_SYMBOL(_read_trylock); | 48 | EXPORT_SYMBOL(_read_trylock); |
49 | 49 | ||
50 | int __lockfunc _write_trylock(rwlock_t *lock) | 50 | int __lockfunc _write_trylock(rwlock_t *lock) |
51 | { | 51 | { |
52 | preempt_disable(); | 52 | preempt_disable(); |
53 | if (_raw_write_trylock(lock)) { | 53 | if (_raw_write_trylock(lock)) { |
54 | rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); | 54 | rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); |
55 | return 1; | 55 | return 1; |
56 | } | 56 | } |
57 | 57 | ||
58 | preempt_enable(); | 58 | preempt_enable(); |
59 | return 0; | 59 | return 0; |
60 | } | 60 | } |
61 | EXPORT_SYMBOL(_write_trylock); | 61 | EXPORT_SYMBOL(_write_trylock); |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * If lockdep is enabled then we use the non-preemption spin-ops | 64 | * If lockdep is enabled then we use the non-preemption spin-ops |
65 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are | 65 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are |
66 | * not re-enabled during lock-acquire (which the preempt-spin-ops do): | 66 | * not re-enabled during lock-acquire (which the preempt-spin-ops do): |
67 | */ | 67 | */ |
68 | #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ | 68 | #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) |
69 | defined(CONFIG_DEBUG_LOCK_ALLOC) | ||
70 | 69 | ||
71 | void __lockfunc _read_lock(rwlock_t *lock) | 70 | void __lockfunc _read_lock(rwlock_t *lock) |
72 | { | 71 | { |
73 | preempt_disable(); | 72 | preempt_disable(); |
74 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 73 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
75 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); | 74 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
76 | } | 75 | } |
77 | EXPORT_SYMBOL(_read_lock); | 76 | EXPORT_SYMBOL(_read_lock); |
78 | 77 | ||
79 | unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) | 78 | unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) |
80 | { | 79 | { |
81 | unsigned long flags; | 80 | unsigned long flags; |
82 | 81 | ||
83 | local_irq_save(flags); | 82 | local_irq_save(flags); |
84 | preempt_disable(); | 83 | preempt_disable(); |
85 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 84 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
86 | /* | 85 | /* |
87 | * On lockdep we dont want the hand-coded irq-enable of | 86 | * On lockdep we dont want the hand-coded irq-enable of |
88 | * _raw_spin_lock_flags() code, because lockdep assumes | 87 | * _raw_spin_lock_flags() code, because lockdep assumes |
89 | * that interrupts are not re-enabled during lock-acquire: | 88 | * that interrupts are not re-enabled during lock-acquire: |
90 | */ | 89 | */ |
91 | #ifdef CONFIG_LOCKDEP | 90 | #ifdef CONFIG_LOCKDEP |
92 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); | 91 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
93 | #else | 92 | #else |
94 | _raw_spin_lock_flags(lock, &flags); | 93 | _raw_spin_lock_flags(lock, &flags); |
95 | #endif | 94 | #endif |
96 | return flags; | 95 | return flags; |
97 | } | 96 | } |
98 | EXPORT_SYMBOL(_spin_lock_irqsave); | 97 | EXPORT_SYMBOL(_spin_lock_irqsave); |
99 | 98 | ||
100 | void __lockfunc _spin_lock_irq(spinlock_t *lock) | 99 | void __lockfunc _spin_lock_irq(spinlock_t *lock) |
101 | { | 100 | { |
102 | local_irq_disable(); | 101 | local_irq_disable(); |
103 | preempt_disable(); | 102 | preempt_disable(); |
104 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 103 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
105 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); | 104 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
106 | } | 105 | } |
107 | EXPORT_SYMBOL(_spin_lock_irq); | 106 | EXPORT_SYMBOL(_spin_lock_irq); |
108 | 107 | ||
109 | void __lockfunc _spin_lock_bh(spinlock_t *lock) | 108 | void __lockfunc _spin_lock_bh(spinlock_t *lock) |
110 | { | 109 | { |
111 | local_bh_disable(); | 110 | local_bh_disable(); |
112 | preempt_disable(); | 111 | preempt_disable(); |
113 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 112 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
114 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); | 113 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
115 | } | 114 | } |
116 | EXPORT_SYMBOL(_spin_lock_bh); | 115 | EXPORT_SYMBOL(_spin_lock_bh); |
117 | 116 | ||
118 | unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) | 117 | unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) |
119 | { | 118 | { |
120 | unsigned long flags; | 119 | unsigned long flags; |
121 | 120 | ||
122 | local_irq_save(flags); | 121 | local_irq_save(flags); |
123 | preempt_disable(); | 122 | preempt_disable(); |
124 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 123 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
125 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); | 124 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
126 | return flags; | 125 | return flags; |
127 | } | 126 | } |
128 | EXPORT_SYMBOL(_read_lock_irqsave); | 127 | EXPORT_SYMBOL(_read_lock_irqsave); |
129 | 128 | ||
130 | void __lockfunc _read_lock_irq(rwlock_t *lock) | 129 | void __lockfunc _read_lock_irq(rwlock_t *lock) |
131 | { | 130 | { |
132 | local_irq_disable(); | 131 | local_irq_disable(); |
133 | preempt_disable(); | 132 | preempt_disable(); |
134 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 133 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
135 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); | 134 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
136 | } | 135 | } |
137 | EXPORT_SYMBOL(_read_lock_irq); | 136 | EXPORT_SYMBOL(_read_lock_irq); |
138 | 137 | ||
139 | void __lockfunc _read_lock_bh(rwlock_t *lock) | 138 | void __lockfunc _read_lock_bh(rwlock_t *lock) |
140 | { | 139 | { |
141 | local_bh_disable(); | 140 | local_bh_disable(); |
142 | preempt_disable(); | 141 | preempt_disable(); |
143 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | 142 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); |
144 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); | 143 | LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); |
145 | } | 144 | } |
146 | EXPORT_SYMBOL(_read_lock_bh); | 145 | EXPORT_SYMBOL(_read_lock_bh); |
147 | 146 | ||
148 | unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) | 147 | unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) |
149 | { | 148 | { |
150 | unsigned long flags; | 149 | unsigned long flags; |
151 | 150 | ||
152 | local_irq_save(flags); | 151 | local_irq_save(flags); |
153 | preempt_disable(); | 152 | preempt_disable(); |
154 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 153 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
155 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); | 154 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
156 | return flags; | 155 | return flags; |
157 | } | 156 | } |
158 | EXPORT_SYMBOL(_write_lock_irqsave); | 157 | EXPORT_SYMBOL(_write_lock_irqsave); |
159 | 158 | ||
160 | void __lockfunc _write_lock_irq(rwlock_t *lock) | 159 | void __lockfunc _write_lock_irq(rwlock_t *lock) |
161 | { | 160 | { |
162 | local_irq_disable(); | 161 | local_irq_disable(); |
163 | preempt_disable(); | 162 | preempt_disable(); |
164 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 163 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
165 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); | 164 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
166 | } | 165 | } |
167 | EXPORT_SYMBOL(_write_lock_irq); | 166 | EXPORT_SYMBOL(_write_lock_irq); |
168 | 167 | ||
169 | void __lockfunc _write_lock_bh(rwlock_t *lock) | 168 | void __lockfunc _write_lock_bh(rwlock_t *lock) |
170 | { | 169 | { |
171 | local_bh_disable(); | 170 | local_bh_disable(); |
172 | preempt_disable(); | 171 | preempt_disable(); |
173 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 172 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
174 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); | 173 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
175 | } | 174 | } |
176 | EXPORT_SYMBOL(_write_lock_bh); | 175 | EXPORT_SYMBOL(_write_lock_bh); |
177 | 176 | ||
178 | void __lockfunc _spin_lock(spinlock_t *lock) | 177 | void __lockfunc _spin_lock(spinlock_t *lock) |
179 | { | 178 | { |
180 | preempt_disable(); | 179 | preempt_disable(); |
181 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 180 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
182 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); | 181 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
183 | } | 182 | } |
184 | 183 | ||
185 | EXPORT_SYMBOL(_spin_lock); | 184 | EXPORT_SYMBOL(_spin_lock); |
186 | 185 | ||
187 | void __lockfunc _write_lock(rwlock_t *lock) | 186 | void __lockfunc _write_lock(rwlock_t *lock) |
188 | { | 187 | { |
189 | preempt_disable(); | 188 | preempt_disable(); |
190 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | 189 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
191 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); | 190 | LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); |
192 | } | 191 | } |
193 | 192 | ||
194 | EXPORT_SYMBOL(_write_lock); | 193 | EXPORT_SYMBOL(_write_lock); |
195 | 194 | ||
196 | #else /* CONFIG_PREEMPT: */ | 195 | #else /* CONFIG_PREEMPT: */ |
197 | 196 | ||
198 | /* | 197 | /* |
199 | * This could be a long-held lock. We both prepare to spin for a long | 198 | * This could be a long-held lock. We both prepare to spin for a long |
200 | * time (making _this_ CPU preemptable if possible), and we also signal | 199 | * time (making _this_ CPU preemptable if possible), and we also signal |
201 | * towards that other CPU that it should break the lock ASAP. | 200 | * towards that other CPU that it should break the lock ASAP. |
202 | * | 201 | * |
203 | * (We do this in a function because inlining it would be excessive.) | 202 | * (We do this in a function because inlining it would be excessive.) |
204 | */ | 203 | */ |
205 | 204 | ||
206 | #define BUILD_LOCK_OPS(op, locktype) \ | 205 | #define BUILD_LOCK_OPS(op, locktype) \ |
207 | void __lockfunc _##op##_lock(locktype##_t *lock) \ | 206 | void __lockfunc _##op##_lock(locktype##_t *lock) \ |
208 | { \ | 207 | { \ |
209 | for (;;) { \ | 208 | for (;;) { \ |
210 | preempt_disable(); \ | 209 | preempt_disable(); \ |
211 | if (likely(_raw_##op##_trylock(lock))) \ | 210 | if (likely(_raw_##op##_trylock(lock))) \ |
212 | break; \ | 211 | break; \ |
213 | preempt_enable(); \ | 212 | preempt_enable(); \ |
214 | \ | 213 | \ |
215 | if (!(lock)->break_lock) \ | 214 | if (!(lock)->break_lock) \ |
216 | (lock)->break_lock = 1; \ | 215 | (lock)->break_lock = 1; \ |
217 | while (!op##_can_lock(lock) && (lock)->break_lock) \ | 216 | while (!op##_can_lock(lock) && (lock)->break_lock) \ |
218 | _raw_##op##_relax(&lock->raw_lock); \ | 217 | _raw_##op##_relax(&lock->raw_lock); \ |
219 | } \ | 218 | } \ |
220 | (lock)->break_lock = 0; \ | 219 | (lock)->break_lock = 0; \ |
221 | } \ | 220 | } \ |
222 | \ | 221 | \ |
223 | EXPORT_SYMBOL(_##op##_lock); \ | 222 | EXPORT_SYMBOL(_##op##_lock); \ |
224 | \ | 223 | \ |
225 | unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ | 224 | unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ |
226 | { \ | 225 | { \ |
227 | unsigned long flags; \ | 226 | unsigned long flags; \ |
228 | \ | 227 | \ |
229 | for (;;) { \ | 228 | for (;;) { \ |
230 | preempt_disable(); \ | 229 | preempt_disable(); \ |
231 | local_irq_save(flags); \ | 230 | local_irq_save(flags); \ |
232 | if (likely(_raw_##op##_trylock(lock))) \ | 231 | if (likely(_raw_##op##_trylock(lock))) \ |
233 | break; \ | 232 | break; \ |
234 | local_irq_restore(flags); \ | 233 | local_irq_restore(flags); \ |
235 | preempt_enable(); \ | 234 | preempt_enable(); \ |
236 | \ | 235 | \ |
237 | if (!(lock)->break_lock) \ | 236 | if (!(lock)->break_lock) \ |
238 | (lock)->break_lock = 1; \ | 237 | (lock)->break_lock = 1; \ |
239 | while (!op##_can_lock(lock) && (lock)->break_lock) \ | 238 | while (!op##_can_lock(lock) && (lock)->break_lock) \ |
240 | _raw_##op##_relax(&lock->raw_lock); \ | 239 | _raw_##op##_relax(&lock->raw_lock); \ |
241 | } \ | 240 | } \ |
242 | (lock)->break_lock = 0; \ | 241 | (lock)->break_lock = 0; \ |
243 | return flags; \ | 242 | return flags; \ |
244 | } \ | 243 | } \ |
245 | \ | 244 | \ |
246 | EXPORT_SYMBOL(_##op##_lock_irqsave); \ | 245 | EXPORT_SYMBOL(_##op##_lock_irqsave); \ |
247 | \ | 246 | \ |
248 | void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ | 247 | void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ |
249 | { \ | 248 | { \ |
250 | _##op##_lock_irqsave(lock); \ | 249 | _##op##_lock_irqsave(lock); \ |
251 | } \ | 250 | } \ |
252 | \ | 251 | \ |
253 | EXPORT_SYMBOL(_##op##_lock_irq); \ | 252 | EXPORT_SYMBOL(_##op##_lock_irq); \ |
254 | \ | 253 | \ |
255 | void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ | 254 | void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ |
256 | { \ | 255 | { \ |
257 | unsigned long flags; \ | 256 | unsigned long flags; \ |
258 | \ | 257 | \ |
259 | /* */ \ | 258 | /* */ \ |
260 | /* Careful: we must exclude softirqs too, hence the */ \ | 259 | /* Careful: we must exclude softirqs too, hence the */ \ |
261 | /* irq-disabling. We use the generic preemption-aware */ \ | 260 | /* irq-disabling. We use the generic preemption-aware */ \ |
262 | /* function: */ \ | 261 | /* function: */ \ |
263 | /**/ \ | 262 | /**/ \ |
264 | flags = _##op##_lock_irqsave(lock); \ | 263 | flags = _##op##_lock_irqsave(lock); \ |
265 | local_bh_disable(); \ | 264 | local_bh_disable(); \ |
266 | local_irq_restore(flags); \ | 265 | local_irq_restore(flags); \ |
267 | } \ | 266 | } \ |
268 | \ | 267 | \ |
269 | EXPORT_SYMBOL(_##op##_lock_bh) | 268 | EXPORT_SYMBOL(_##op##_lock_bh) |
270 | 269 | ||
271 | /* | 270 | /* |
272 | * Build preemption-friendly versions of the following | 271 | * Build preemption-friendly versions of the following |
273 | * lock-spinning functions: | 272 | * lock-spinning functions: |
274 | * | 273 | * |
275 | * _[spin|read|write]_lock() | 274 | * _[spin|read|write]_lock() |
276 | * _[spin|read|write]_lock_irq() | 275 | * _[spin|read|write]_lock_irq() |
277 | * _[spin|read|write]_lock_irqsave() | 276 | * _[spin|read|write]_lock_irqsave() |
278 | * _[spin|read|write]_lock_bh() | 277 | * _[spin|read|write]_lock_bh() |
279 | */ | 278 | */ |
280 | BUILD_LOCK_OPS(spin, spinlock); | 279 | BUILD_LOCK_OPS(spin, spinlock); |
281 | BUILD_LOCK_OPS(read, rwlock); | 280 | BUILD_LOCK_OPS(read, rwlock); |
282 | BUILD_LOCK_OPS(write, rwlock); | 281 | BUILD_LOCK_OPS(write, rwlock); |
283 | 282 | ||
284 | #endif /* CONFIG_PREEMPT */ | 283 | #endif /* CONFIG_PREEMPT */ |
285 | 284 | ||
286 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 285 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
287 | 286 | ||
288 | void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) | 287 | void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) |
289 | { | 288 | { |
290 | preempt_disable(); | 289 | preempt_disable(); |
291 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | 290 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); |
292 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); | 291 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
293 | } | 292 | } |
294 | 293 | ||
295 | EXPORT_SYMBOL(_spin_lock_nested); | 294 | EXPORT_SYMBOL(_spin_lock_nested); |
296 | unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) | 295 | unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) |
297 | { | 296 | { |
298 | unsigned long flags; | 297 | unsigned long flags; |
299 | 298 | ||
300 | local_irq_save(flags); | 299 | local_irq_save(flags); |
301 | preempt_disable(); | 300 | preempt_disable(); |
302 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | 301 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); |
303 | /* | 302 | /* |
304 | * On lockdep we dont want the hand-coded irq-enable of | 303 | * On lockdep we dont want the hand-coded irq-enable of |
305 | * _raw_spin_lock_flags() code, because lockdep assumes | 304 | * _raw_spin_lock_flags() code, because lockdep assumes |
306 | * that interrupts are not re-enabled during lock-acquire: | 305 | * that interrupts are not re-enabled during lock-acquire: |
307 | */ | 306 | */ |
308 | #ifdef CONFIG_LOCKDEP | 307 | #ifdef CONFIG_LOCKDEP |
309 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); | 308 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); |
310 | #else | 309 | #else |
311 | _raw_spin_lock_flags(lock, &flags); | 310 | _raw_spin_lock_flags(lock, &flags); |
312 | #endif | 311 | #endif |
313 | return flags; | 312 | return flags; |
314 | } | 313 | } |
315 | 314 | ||
316 | EXPORT_SYMBOL(_spin_lock_irqsave_nested); | 315 | EXPORT_SYMBOL(_spin_lock_irqsave_nested); |
317 | 316 | ||
318 | #endif | 317 | #endif |
319 | 318 | ||
320 | void __lockfunc _spin_unlock(spinlock_t *lock) | 319 | void __lockfunc _spin_unlock(spinlock_t *lock) |
321 | { | 320 | { |
322 | spin_release(&lock->dep_map, 1, _RET_IP_); | 321 | spin_release(&lock->dep_map, 1, _RET_IP_); |
323 | _raw_spin_unlock(lock); | 322 | _raw_spin_unlock(lock); |
324 | preempt_enable(); | 323 | preempt_enable(); |
325 | } | 324 | } |
326 | EXPORT_SYMBOL(_spin_unlock); | 325 | EXPORT_SYMBOL(_spin_unlock); |
327 | 326 | ||
328 | void __lockfunc _write_unlock(rwlock_t *lock) | 327 | void __lockfunc _write_unlock(rwlock_t *lock) |
329 | { | 328 | { |
330 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | 329 | rwlock_release(&lock->dep_map, 1, _RET_IP_); |
331 | _raw_write_unlock(lock); | 330 | _raw_write_unlock(lock); |
332 | preempt_enable(); | 331 | preempt_enable(); |
333 | } | 332 | } |
334 | EXPORT_SYMBOL(_write_unlock); | 333 | EXPORT_SYMBOL(_write_unlock); |
335 | 334 | ||
336 | void __lockfunc _read_unlock(rwlock_t *lock) | 335 | void __lockfunc _read_unlock(rwlock_t *lock) |
337 | { | 336 | { |
338 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | 337 | rwlock_release(&lock->dep_map, 1, _RET_IP_); |
339 | _raw_read_unlock(lock); | 338 | _raw_read_unlock(lock); |
340 | preempt_enable(); | 339 | preempt_enable(); |
341 | } | 340 | } |
342 | EXPORT_SYMBOL(_read_unlock); | 341 | EXPORT_SYMBOL(_read_unlock); |
343 | 342 | ||
344 | void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) | 343 | void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) |
345 | { | 344 | { |
346 | spin_release(&lock->dep_map, 1, _RET_IP_); | 345 | spin_release(&lock->dep_map, 1, _RET_IP_); |
347 | _raw_spin_unlock(lock); | 346 | _raw_spin_unlock(lock); |
348 | local_irq_restore(flags); | 347 | local_irq_restore(flags); |
349 | preempt_enable(); | 348 | preempt_enable(); |
350 | } | 349 | } |
351 | EXPORT_SYMBOL(_spin_unlock_irqrestore); | 350 | EXPORT_SYMBOL(_spin_unlock_irqrestore); |
352 | 351 | ||
353 | void __lockfunc _spin_unlock_irq(spinlock_t *lock) | 352 | void __lockfunc _spin_unlock_irq(spinlock_t *lock) |
354 | { | 353 | { |
355 | spin_release(&lock->dep_map, 1, _RET_IP_); | 354 | spin_release(&lock->dep_map, 1, _RET_IP_); |
356 | _raw_spin_unlock(lock); | 355 | _raw_spin_unlock(lock); |
357 | local_irq_enable(); | 356 | local_irq_enable(); |
358 | preempt_enable(); | 357 | preempt_enable(); |
359 | } | 358 | } |
360 | EXPORT_SYMBOL(_spin_unlock_irq); | 359 | EXPORT_SYMBOL(_spin_unlock_irq); |
361 | 360 | ||
362 | void __lockfunc _spin_unlock_bh(spinlock_t *lock) | 361 | void __lockfunc _spin_unlock_bh(spinlock_t *lock) |
363 | { | 362 | { |
364 | spin_release(&lock->dep_map, 1, _RET_IP_); | 363 | spin_release(&lock->dep_map, 1, _RET_IP_); |
365 | _raw_spin_unlock(lock); | 364 | _raw_spin_unlock(lock); |
366 | preempt_enable_no_resched(); | 365 | preempt_enable_no_resched(); |
367 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); | 366 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
368 | } | 367 | } |
369 | EXPORT_SYMBOL(_spin_unlock_bh); | 368 | EXPORT_SYMBOL(_spin_unlock_bh); |
370 | 369 | ||
371 | void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) | 370 | void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) |
372 | { | 371 | { |
373 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | 372 | rwlock_release(&lock->dep_map, 1, _RET_IP_); |
374 | _raw_read_unlock(lock); | 373 | _raw_read_unlock(lock); |
375 | local_irq_restore(flags); | 374 | local_irq_restore(flags); |
376 | preempt_enable(); | 375 | preempt_enable(); |
377 | } | 376 | } |
378 | EXPORT_SYMBOL(_read_unlock_irqrestore); | 377 | EXPORT_SYMBOL(_read_unlock_irqrestore); |
379 | 378 | ||
380 | void __lockfunc _read_unlock_irq(rwlock_t *lock) | 379 | void __lockfunc _read_unlock_irq(rwlock_t *lock) |
381 | { | 380 | { |
382 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | 381 | rwlock_release(&lock->dep_map, 1, _RET_IP_); |
383 | _raw_read_unlock(lock); | 382 | _raw_read_unlock(lock); |
384 | local_irq_enable(); | 383 | local_irq_enable(); |
385 | preempt_enable(); | 384 | preempt_enable(); |
386 | } | 385 | } |
387 | EXPORT_SYMBOL(_read_unlock_irq); | 386 | EXPORT_SYMBOL(_read_unlock_irq); |
388 | 387 | ||
389 | void __lockfunc _read_unlock_bh(rwlock_t *lock) | 388 | void __lockfunc _read_unlock_bh(rwlock_t *lock) |
390 | { | 389 | { |
391 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | 390 | rwlock_release(&lock->dep_map, 1, _RET_IP_); |
392 | _raw_read_unlock(lock); | 391 | _raw_read_unlock(lock); |
393 | preempt_enable_no_resched(); | 392 | preempt_enable_no_resched(); |
394 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); | 393 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
395 | } | 394 | } |
396 | EXPORT_SYMBOL(_read_unlock_bh); | 395 | EXPORT_SYMBOL(_read_unlock_bh); |
397 | 396 | ||
398 | void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) | 397 | void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) |
399 | { | 398 | { |
400 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | 399 | rwlock_release(&lock->dep_map, 1, _RET_IP_); |
401 | _raw_write_unlock(lock); | 400 | _raw_write_unlock(lock); |
402 | local_irq_restore(flags); | 401 | local_irq_restore(flags); |
403 | preempt_enable(); | 402 | preempt_enable(); |
404 | } | 403 | } |
405 | EXPORT_SYMBOL(_write_unlock_irqrestore); | 404 | EXPORT_SYMBOL(_write_unlock_irqrestore); |
406 | 405 | ||
407 | void __lockfunc _write_unlock_irq(rwlock_t *lock) | 406 | void __lockfunc _write_unlock_irq(rwlock_t *lock) |
408 | { | 407 | { |
409 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | 408 | rwlock_release(&lock->dep_map, 1, _RET_IP_); |
410 | _raw_write_unlock(lock); | 409 | _raw_write_unlock(lock); |
411 | local_irq_enable(); | 410 | local_irq_enable(); |
412 | preempt_enable(); | 411 | preempt_enable(); |
413 | } | 412 | } |
414 | EXPORT_SYMBOL(_write_unlock_irq); | 413 | EXPORT_SYMBOL(_write_unlock_irq); |
415 | 414 | ||
416 | void __lockfunc _write_unlock_bh(rwlock_t *lock) | 415 | void __lockfunc _write_unlock_bh(rwlock_t *lock) |
417 | { | 416 | { |
418 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | 417 | rwlock_release(&lock->dep_map, 1, _RET_IP_); |
419 | _raw_write_unlock(lock); | 418 | _raw_write_unlock(lock); |
420 | preempt_enable_no_resched(); | 419 | preempt_enable_no_resched(); |
421 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); | 420 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
422 | } | 421 | } |
423 | EXPORT_SYMBOL(_write_unlock_bh); | 422 | EXPORT_SYMBOL(_write_unlock_bh); |
424 | 423 | ||
425 | int __lockfunc _spin_trylock_bh(spinlock_t *lock) | 424 | int __lockfunc _spin_trylock_bh(spinlock_t *lock) |
426 | { | 425 | { |
427 | local_bh_disable(); | 426 | local_bh_disable(); |
428 | preempt_disable(); | 427 | preempt_disable(); |
429 | if (_raw_spin_trylock(lock)) { | 428 | if (_raw_spin_trylock(lock)) { |
430 | spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); | 429 | spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); |
431 | return 1; | 430 | return 1; |
432 | } | 431 | } |
433 | 432 | ||
434 | preempt_enable_no_resched(); | 433 | preempt_enable_no_resched(); |
435 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); | 434 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
436 | return 0; | 435 | return 0; |
437 | } | 436 | } |
438 | EXPORT_SYMBOL(_spin_trylock_bh); | 437 | EXPORT_SYMBOL(_spin_trylock_bh); |
439 | 438 | ||
440 | int in_lock_functions(unsigned long addr) | 439 | int in_lock_functions(unsigned long addr) |
441 | { | 440 | { |
442 | /* Linker adds these: start and end of __lockfunc functions */ | 441 | /* Linker adds these: start and end of __lockfunc functions */ |
443 | extern char __lock_text_start[], __lock_text_end[]; | 442 | extern char __lock_text_start[], __lock_text_end[]; |
444 | 443 | ||
445 | return addr >= (unsigned long)__lock_text_start | 444 | return addr >= (unsigned long)__lock_text_start |
446 | && addr < (unsigned long)__lock_text_end; | 445 | && addr < (unsigned long)__lock_text_end; |
447 | } | 446 | } |
448 | EXPORT_SYMBOL(in_lock_functions); | 447 | EXPORT_SYMBOL(in_lock_functions); |
449 | 448 |
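The BUILD_LOCK_OPS() variants above are the lockbreak path: a waiter keeps retrying the trylock and, while it spins, raises break_lock so the current holder can notice the contention and drop the lock early. A rough, self-contained sketch of that shape in C11 atomics follows; all names are invented, and this is an illustration rather than the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_lock {
	atomic_bool locked;
	atomic_bool break_lock;		/* set by waiters, polled by the holder */
};

static bool demo_trylock(struct demo_lock *l)
{
	bool expected = false;

	return atomic_compare_exchange_strong_explicit(&l->locked, &expected, true,
						       memory_order_acquire,
						       memory_order_relaxed);
}

static void demo_lock_acquire(struct demo_lock *l)
{
	for (;;) {
		if (demo_trylock(l))
			break;

		/* tell the holder that someone is waiting */
		atomic_store_explicit(&l->break_lock, true, memory_order_relaxed);

		/* spin until the lock looks free, then retry the trylock */
		while (atomic_load_explicit(&l->locked, memory_order_relaxed) &&
		       atomic_load_explicit(&l->break_lock, memory_order_relaxed))
			;	/* a cpu_relax()-style pause belongs here */
	}
	atomic_store_explicit(&l->break_lock, false, memory_order_relaxed);
}

static void demo_unlock(struct demo_lock *l)
{
	atomic_store_explicit(&l->locked, false, memory_order_release);
}

/* the holder polls this to decide whether to drop and re-take the lock */
static bool demo_needbreak(struct demo_lock *l)
{
	return atomic_load_explicit(&l->break_lock, memory_order_relaxed);
}

int main(void)
{
	struct demo_lock l = { ATOMIC_VAR_INIT(false), ATOMIC_VAR_INIT(false) };

	demo_lock_acquire(&l);
	/* no second thread here, so no waiter has raised the flag */
	printf("break requested: %d\n", demo_needbreak(&l));
	demo_unlock(&l);
	return 0;
}

The real macro additionally toggles preemption, interrupts and bottom halves around each attempt; only the trylock/break_lock dance is shown here.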
mm/memory.c
1 | /* | 1 | /* |
2 | * linux/mm/memory.c | 2 | * linux/mm/memory.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | /* | 7 | /* |
8 | * demand-loading started 01.12.91 - seems it is high on the list of | 8 | * demand-loading started 01.12.91 - seems it is high on the list of |
9 | * things wanted, and it should be easy to implement. - Linus | 9 | * things wanted, and it should be easy to implement. - Linus |
10 | */ | 10 | */ |
11 | 11 | ||
12 | /* | 12 | /* |
13 | * Ok, demand-loading was easy, shared pages a little bit tricker. Shared | 13 | * Ok, demand-loading was easy, shared pages a little bit tricker. Shared |
14 | * pages started 02.12.91, seems to work. - Linus. | 14 | * pages started 02.12.91, seems to work. - Linus. |
15 | * | 15 | * |
16 | * Tested sharing by executing about 30 /bin/sh: under the old kernel it | 16 | * Tested sharing by executing about 30 /bin/sh: under the old kernel it |
17 | * would have taken more than the 6M I have free, but it worked well as | 17 | * would have taken more than the 6M I have free, but it worked well as |
18 | * far as I could see. | 18 | * far as I could see. |
19 | * | 19 | * |
20 | * Also corrected some "invalidate()"s - I wasn't doing enough of them. | 20 | * Also corrected some "invalidate()"s - I wasn't doing enough of them. |
21 | */ | 21 | */ |
22 | 22 | ||
23 | /* | 23 | /* |
24 | * Real VM (paging to/from disk) started 18.12.91. Much more work and | 24 | * Real VM (paging to/from disk) started 18.12.91. Much more work and |
25 | * thought has to go into this. Oh, well.. | 25 | * thought has to go into this. Oh, well.. |
26 | * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. | 26 | * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. |
27 | * Found it. Everything seems to work now. | 27 | * Found it. Everything seems to work now. |
28 | * 20.12.91 - Ok, making the swap-device changeable like the root. | 28 | * 20.12.91 - Ok, making the swap-device changeable like the root. |
29 | */ | 29 | */ |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * 05.04.94 - Multi-page memory management added for v1.1. | 32 | * 05.04.94 - Multi-page memory management added for v1.1. |
33 | * Idea by Alex Bligh (alex@cconcepts.co.uk) | 33 | * Idea by Alex Bligh (alex@cconcepts.co.uk) |
34 | * | 34 | * |
35 | * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG | 35 | * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG |
36 | * (Gerhard.Wichert@pdb.siemens.de) | 36 | * (Gerhard.Wichert@pdb.siemens.de) |
37 | * | 37 | * |
38 | * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) | 38 | * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) |
39 | */ | 39 | */ |
40 | 40 | ||
41 | #include <linux/kernel_stat.h> | 41 | #include <linux/kernel_stat.h> |
42 | #include <linux/mm.h> | 42 | #include <linux/mm.h> |
43 | #include <linux/hugetlb.h> | 43 | #include <linux/hugetlb.h> |
44 | #include <linux/mman.h> | 44 | #include <linux/mman.h> |
45 | #include <linux/swap.h> | 45 | #include <linux/swap.h> |
46 | #include <linux/highmem.h> | 46 | #include <linux/highmem.h> |
47 | #include <linux/pagemap.h> | 47 | #include <linux/pagemap.h> |
48 | #include <linux/rmap.h> | 48 | #include <linux/rmap.h> |
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/delayacct.h> | 50 | #include <linux/delayacct.h> |
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | 53 | ||
54 | #include <asm/pgalloc.h> | 54 | #include <asm/pgalloc.h> |
55 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
56 | #include <asm/tlb.h> | 56 | #include <asm/tlb.h> |
57 | #include <asm/tlbflush.h> | 57 | #include <asm/tlbflush.h> |
58 | #include <asm/pgtable.h> | 58 | #include <asm/pgtable.h> |
59 | 59 | ||
60 | #include <linux/swapops.h> | 60 | #include <linux/swapops.h> |
61 | #include <linux/elf.h> | 61 | #include <linux/elf.h> |
62 | 62 | ||
63 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 63 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
64 | /* use the per-pgdat data instead for discontigmem - mbligh */ | 64 | /* use the per-pgdat data instead for discontigmem - mbligh */ |
65 | unsigned long max_mapnr; | 65 | unsigned long max_mapnr; |
66 | struct page *mem_map; | 66 | struct page *mem_map; |
67 | 67 | ||
68 | EXPORT_SYMBOL(max_mapnr); | 68 | EXPORT_SYMBOL(max_mapnr); |
69 | EXPORT_SYMBOL(mem_map); | 69 | EXPORT_SYMBOL(mem_map); |
70 | #endif | 70 | #endif |
71 | 71 | ||
72 | unsigned long num_physpages; | 72 | unsigned long num_physpages; |
73 | /* | 73 | /* |
74 | * A number of key systems in x86 including ioremap() rely on the assumption | 74 | * A number of key systems in x86 including ioremap() rely on the assumption |
75 | * that high_memory defines the upper bound on direct map memory, then end | 75 | * that high_memory defines the upper bound on direct map memory, then end |
76 | * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and | 76 | * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and |
77 | * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL | 77 | * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL |
78 | * and ZONE_HIGHMEM. | 78 | * and ZONE_HIGHMEM. |
79 | */ | 79 | */ |
80 | void * high_memory; | 80 | void * high_memory; |
81 | 81 | ||
82 | EXPORT_SYMBOL(num_physpages); | 82 | EXPORT_SYMBOL(num_physpages); |
83 | EXPORT_SYMBOL(high_memory); | 83 | EXPORT_SYMBOL(high_memory); |
84 | 84 | ||
85 | int randomize_va_space __read_mostly = 1; | 85 | int randomize_va_space __read_mostly = 1; |
86 | 86 | ||
87 | static int __init disable_randmaps(char *s) | 87 | static int __init disable_randmaps(char *s) |
88 | { | 88 | { |
89 | randomize_va_space = 0; | 89 | randomize_va_space = 0; |
90 | return 1; | 90 | return 1; |
91 | } | 91 | } |
92 | __setup("norandmaps", disable_randmaps); | 92 | __setup("norandmaps", disable_randmaps); |
93 | 93 | ||
94 | 94 | ||
95 | /* | 95 | /* |
96 | * If a p?d_bad entry is found while walking page tables, report | 96 | * If a p?d_bad entry is found while walking page tables, report |
97 | * the error, before resetting entry to p?d_none. Usually (but | 97 | * the error, before resetting entry to p?d_none. Usually (but |
98 | * very seldom) called out from the p?d_none_or_clear_bad macros. | 98 | * very seldom) called out from the p?d_none_or_clear_bad macros. |
99 | */ | 99 | */ |
100 | 100 | ||
101 | void pgd_clear_bad(pgd_t *pgd) | 101 | void pgd_clear_bad(pgd_t *pgd) |
102 | { | 102 | { |
103 | pgd_ERROR(*pgd); | 103 | pgd_ERROR(*pgd); |
104 | pgd_clear(pgd); | 104 | pgd_clear(pgd); |
105 | } | 105 | } |
106 | 106 | ||
107 | void pud_clear_bad(pud_t *pud) | 107 | void pud_clear_bad(pud_t *pud) |
108 | { | 108 | { |
109 | pud_ERROR(*pud); | 109 | pud_ERROR(*pud); |
110 | pud_clear(pud); | 110 | pud_clear(pud); |
111 | } | 111 | } |
112 | 112 | ||
113 | void pmd_clear_bad(pmd_t *pmd) | 113 | void pmd_clear_bad(pmd_t *pmd) |
114 | { | 114 | { |
115 | pmd_ERROR(*pmd); | 115 | pmd_ERROR(*pmd); |
116 | pmd_clear(pmd); | 116 | pmd_clear(pmd); |
117 | } | 117 | } |
118 | 118 | ||
119 | /* | 119 | /* |
120 | * Note: this doesn't free the actual pages themselves. That | 120 | * Note: this doesn't free the actual pages themselves. That |
121 | * has been handled earlier when unmapping all the memory regions. | 121 | * has been handled earlier when unmapping all the memory regions. |
122 | */ | 122 | */ |
123 | static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) | 123 | static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) |
124 | { | 124 | { |
125 | struct page *page = pmd_page(*pmd); | 125 | struct page *page = pmd_page(*pmd); |
126 | pmd_clear(pmd); | 126 | pmd_clear(pmd); |
127 | pte_lock_deinit(page); | 127 | pte_lock_deinit(page); |
128 | pte_free_tlb(tlb, page); | 128 | pte_free_tlb(tlb, page); |
129 | dec_zone_page_state(page, NR_PAGETABLE); | 129 | dec_zone_page_state(page, NR_PAGETABLE); |
130 | tlb->mm->nr_ptes--; | 130 | tlb->mm->nr_ptes--; |
131 | } | 131 | } |
132 | 132 | ||
133 | static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | 133 | static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, |
134 | unsigned long addr, unsigned long end, | 134 | unsigned long addr, unsigned long end, |
135 | unsigned long floor, unsigned long ceiling) | 135 | unsigned long floor, unsigned long ceiling) |
136 | { | 136 | { |
137 | pmd_t *pmd; | 137 | pmd_t *pmd; |
138 | unsigned long next; | 138 | unsigned long next; |
139 | unsigned long start; | 139 | unsigned long start; |
140 | 140 | ||
141 | start = addr; | 141 | start = addr; |
142 | pmd = pmd_offset(pud, addr); | 142 | pmd = pmd_offset(pud, addr); |
143 | do { | 143 | do { |
144 | next = pmd_addr_end(addr, end); | 144 | next = pmd_addr_end(addr, end); |
145 | if (pmd_none_or_clear_bad(pmd)) | 145 | if (pmd_none_or_clear_bad(pmd)) |
146 | continue; | 146 | continue; |
147 | free_pte_range(tlb, pmd); | 147 | free_pte_range(tlb, pmd); |
148 | } while (pmd++, addr = next, addr != end); | 148 | } while (pmd++, addr = next, addr != end); |
149 | 149 | ||
150 | start &= PUD_MASK; | 150 | start &= PUD_MASK; |
151 | if (start < floor) | 151 | if (start < floor) |
152 | return; | 152 | return; |
153 | if (ceiling) { | 153 | if (ceiling) { |
154 | ceiling &= PUD_MASK; | 154 | ceiling &= PUD_MASK; |
155 | if (!ceiling) | 155 | if (!ceiling) |
156 | return; | 156 | return; |
157 | } | 157 | } |
158 | if (end - 1 > ceiling - 1) | 158 | if (end - 1 > ceiling - 1) |
159 | return; | 159 | return; |
160 | 160 | ||
161 | pmd = pmd_offset(pud, start); | 161 | pmd = pmd_offset(pud, start); |
162 | pud_clear(pud); | 162 | pud_clear(pud); |
163 | pmd_free_tlb(tlb, pmd); | 163 | pmd_free_tlb(tlb, pmd); |
164 | } | 164 | } |
165 | 165 | ||
166 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 166 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, |
167 | unsigned long addr, unsigned long end, | 167 | unsigned long addr, unsigned long end, |
168 | unsigned long floor, unsigned long ceiling) | 168 | unsigned long floor, unsigned long ceiling) |
169 | { | 169 | { |
170 | pud_t *pud; | 170 | pud_t *pud; |
171 | unsigned long next; | 171 | unsigned long next; |
172 | unsigned long start; | 172 | unsigned long start; |
173 | 173 | ||
174 | start = addr; | 174 | start = addr; |
175 | pud = pud_offset(pgd, addr); | 175 | pud = pud_offset(pgd, addr); |
176 | do { | 176 | do { |
177 | next = pud_addr_end(addr, end); | 177 | next = pud_addr_end(addr, end); |
178 | if (pud_none_or_clear_bad(pud)) | 178 | if (pud_none_or_clear_bad(pud)) |
179 | continue; | 179 | continue; |
180 | free_pmd_range(tlb, pud, addr, next, floor, ceiling); | 180 | free_pmd_range(tlb, pud, addr, next, floor, ceiling); |
181 | } while (pud++, addr = next, addr != end); | 181 | } while (pud++, addr = next, addr != end); |
182 | 182 | ||
183 | start &= PGDIR_MASK; | 183 | start &= PGDIR_MASK; |
184 | if (start < floor) | 184 | if (start < floor) |
185 | return; | 185 | return; |
186 | if (ceiling) { | 186 | if (ceiling) { |
187 | ceiling &= PGDIR_MASK; | 187 | ceiling &= PGDIR_MASK; |
188 | if (!ceiling) | 188 | if (!ceiling) |
189 | return; | 189 | return; |
190 | } | 190 | } |
191 | if (end - 1 > ceiling - 1) | 191 | if (end - 1 > ceiling - 1) |
192 | return; | 192 | return; |
193 | 193 | ||
194 | pud = pud_offset(pgd, start); | 194 | pud = pud_offset(pgd, start); |
195 | pgd_clear(pgd); | 195 | pgd_clear(pgd); |
196 | pud_free_tlb(tlb, pud); | 196 | pud_free_tlb(tlb, pud); |
197 | } | 197 | } |
198 | 198 | ||
199 | /* | 199 | /* |
200 | * This function frees user-level page tables of a process. | 200 | * This function frees user-level page tables of a process. |
201 | * | 201 | * |
202 | * Must be called with pagetable lock held. | 202 | * Must be called with pagetable lock held. |
203 | */ | 203 | */ |
204 | void free_pgd_range(struct mmu_gather **tlb, | 204 | void free_pgd_range(struct mmu_gather **tlb, |
205 | unsigned long addr, unsigned long end, | 205 | unsigned long addr, unsigned long end, |
206 | unsigned long floor, unsigned long ceiling) | 206 | unsigned long floor, unsigned long ceiling) |
207 | { | 207 | { |
208 | pgd_t *pgd; | 208 | pgd_t *pgd; |
209 | unsigned long next; | 209 | unsigned long next; |
210 | unsigned long start; | 210 | unsigned long start; |
211 | 211 | ||
212 | /* | 212 | /* |
213 | * The next few lines have given us lots of grief... | 213 | * The next few lines have given us lots of grief... |
214 | * | 214 | * |
215 | * Why are we testing PMD* at this top level? Because often | 215 | * Why are we testing PMD* at this top level? Because often |
216 | * there will be no work to do at all, and we'd prefer not to | 216 | * there will be no work to do at all, and we'd prefer not to |
217 | * go all the way down to the bottom just to discover that. | 217 | * go all the way down to the bottom just to discover that. |
218 | * | 218 | * |
219 | * Why all these "- 1"s? Because 0 represents both the bottom | 219 | * Why all these "- 1"s? Because 0 represents both the bottom |
220 | * of the address space and the top of it (using -1 for the | 220 | * of the address space and the top of it (using -1 for the |
221 | * top wouldn't help much: the masks would do the wrong thing). | 221 | * top wouldn't help much: the masks would do the wrong thing). |
222 | * The rule is that addr 0 and floor 0 refer to the bottom of | 222 | * The rule is that addr 0 and floor 0 refer to the bottom of |
223 | * the address space, but end 0 and ceiling 0 refer to the top | 223 | * the address space, but end 0 and ceiling 0 refer to the top |
224 | * Comparisons need to use "end - 1" and "ceiling - 1" (though | 224 | * Comparisons need to use "end - 1" and "ceiling - 1" (though |
225 | * that end 0 case should be mythical). | 225 | * that end 0 case should be mythical). |
226 | * | 226 | * |
227 | * Wherever addr is brought up or ceiling brought down, we must | 227 | * Wherever addr is brought up or ceiling brought down, we must |
228 | * be careful to reject "the opposite 0" before it confuses the | 228 | * be careful to reject "the opposite 0" before it confuses the |
229 | * subsequent tests. But what about where end is brought down | 229 | * subsequent tests. But what about where end is brought down |
230 | * by PMD_SIZE below? no, end can't go down to 0 there. | 230 | * by PMD_SIZE below? no, end can't go down to 0 there. |
231 | * | 231 | * |
232 | * Whereas we round start (addr) and ceiling down, by different | 232 | * Whereas we round start (addr) and ceiling down, by different |
233 | * masks at different levels, in order to test whether a table | 233 | * masks at different levels, in order to test whether a table |
234 | * now has no other vmas using it, so can be freed, we don't | 234 | * now has no other vmas using it, so can be freed, we don't |
235 | * bother to round floor or end up - the tests don't need that. | 235 | * bother to round floor or end up - the tests don't need that. |
236 | */ | 236 | */ |
237 | 237 | ||
238 | addr &= PMD_MASK; | 238 | addr &= PMD_MASK; |
239 | if (addr < floor) { | 239 | if (addr < floor) { |
240 | addr += PMD_SIZE; | 240 | addr += PMD_SIZE; |
241 | if (!addr) | 241 | if (!addr) |
242 | return; | 242 | return; |
243 | } | 243 | } |
244 | if (ceiling) { | 244 | if (ceiling) { |
245 | ceiling &= PMD_MASK; | 245 | ceiling &= PMD_MASK; |
246 | if (!ceiling) | 246 | if (!ceiling) |
247 | return; | 247 | return; |
248 | } | 248 | } |
249 | if (end - 1 > ceiling - 1) | 249 | if (end - 1 > ceiling - 1) |
250 | end -= PMD_SIZE; | 250 | end -= PMD_SIZE; |
251 | if (addr > end - 1) | 251 | if (addr > end - 1) |
252 | return; | 252 | return; |
253 | 253 | ||
254 | start = addr; | 254 | start = addr; |
255 | pgd = pgd_offset((*tlb)->mm, addr); | 255 | pgd = pgd_offset((*tlb)->mm, addr); |
256 | do { | 256 | do { |
257 | next = pgd_addr_end(addr, end); | 257 | next = pgd_addr_end(addr, end); |
258 | if (pgd_none_or_clear_bad(pgd)) | 258 | if (pgd_none_or_clear_bad(pgd)) |
259 | continue; | 259 | continue; |
260 | free_pud_range(*tlb, pgd, addr, next, floor, ceiling); | 260 | free_pud_range(*tlb, pgd, addr, next, floor, ceiling); |
261 | } while (pgd++, addr = next, addr != end); | 261 | } while (pgd++, addr = next, addr != end); |
262 | } | 262 | } |
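The long comment above is easier to digest with concrete numbers. Below is a minimal userspace sketch of just the clamping arithmetic at the top of free_pgd_range(); PMD_SHIFT, the addresses and the helper name are invented for the illustration, and the printout only reports which range the pgd walk would then cover.

#include <stdio.h>

#define PMD_SHIFT       21                      /* illustrative value, not any real arch */
#define PMD_SIZE        (1UL << PMD_SHIFT)
#define PMD_MASK        (~(PMD_SIZE - 1))

/* Same clamping as the top of free_pgd_range(), reporting the outcome. */
static void clamp_range(unsigned long addr, unsigned long end,
                        unsigned long floor, unsigned long ceiling)
{
        addr &= PMD_MASK;
        if (addr < floor) {
                addr += PMD_SIZE;
                if (!addr) {                    /* wrapped past the top */
                        printf("nothing to free\n");
                        return;
                }
        }
        if (ceiling) {
                ceiling &= PMD_MASK;
                if (!ceiling) {                 /* rounded down to "bottom" */
                        printf("nothing to free\n");
                        return;
                }
        }
        /* ceiling == 0 means "top": ceiling - 1 wraps to ULONG_MAX, so end
         * is never pulled back in that case. */
        if (end - 1 > ceiling - 1)
                end -= PMD_SIZE;
        if (addr > end - 1) {
                printf("nothing to free\n");
                return;
        }
        printf("pgd walk covers [%#lx, %#lx)\n", addr, end);
}

int main(void)
{
        /* ceiling 0 = top of address space: end stays where it is */
        clamp_range(0x300000, 0x700000, 0, 0);
        /* ceiling rounds down into the range: end is pulled back by one PMD
         * so tables that may still be in use near ceiling are left alone */
        clamp_range(0x300000, 0x700000, 0, 0x750000);
        return 0;
}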
263 | 263 | ||
264 | void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | 264 | void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, |
265 | unsigned long floor, unsigned long ceiling) | 265 | unsigned long floor, unsigned long ceiling) |
266 | { | 266 | { |
267 | while (vma) { | 267 | while (vma) { |
268 | struct vm_area_struct *next = vma->vm_next; | 268 | struct vm_area_struct *next = vma->vm_next; |
269 | unsigned long addr = vma->vm_start; | 269 | unsigned long addr = vma->vm_start; |
270 | 270 | ||
271 | /* | 271 | /* |
272 | * Hide vma from rmap and vmtruncate before freeing pgtables | 272 | * Hide vma from rmap and vmtruncate before freeing pgtables |
273 | */ | 273 | */ |
274 | anon_vma_unlink(vma); | 274 | anon_vma_unlink(vma); |
275 | unlink_file_vma(vma); | 275 | unlink_file_vma(vma); |
276 | 276 | ||
277 | if (is_vm_hugetlb_page(vma)) { | 277 | if (is_vm_hugetlb_page(vma)) { |
278 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, | 278 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, |
279 | floor, next? next->vm_start: ceiling); | 279 | floor, next? next->vm_start: ceiling); |
280 | } else { | 280 | } else { |
281 | /* | 281 | /* |
282 | * Optimization: gather nearby vmas into one call down | 282 | * Optimization: gather nearby vmas into one call down |
283 | */ | 283 | */ |
284 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE | 284 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE |
285 | && !is_vm_hugetlb_page(next)) { | 285 | && !is_vm_hugetlb_page(next)) { |
286 | vma = next; | 286 | vma = next; |
287 | next = vma->vm_next; | 287 | next = vma->vm_next; |
288 | anon_vma_unlink(vma); | 288 | anon_vma_unlink(vma); |
289 | unlink_file_vma(vma); | 289 | unlink_file_vma(vma); |
290 | } | 290 | } |
291 | free_pgd_range(tlb, addr, vma->vm_end, | 291 | free_pgd_range(tlb, addr, vma->vm_end, |
292 | floor, next? next->vm_start: ceiling); | 292 | floor, next? next->vm_start: ceiling); |
293 | } | 293 | } |
294 | vma = next; | 294 | vma = next; |
295 | } | 295 | } |
296 | } | 296 | } |
297 | 297 | ||
298 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | 298 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) |
299 | { | 299 | { |
300 | struct page *new = pte_alloc_one(mm, address); | 300 | struct page *new = pte_alloc_one(mm, address); |
301 | if (!new) | 301 | if (!new) |
302 | return -ENOMEM; | 302 | return -ENOMEM; |
303 | 303 | ||
304 | pte_lock_init(new); | 304 | pte_lock_init(new); |
305 | spin_lock(&mm->page_table_lock); | 305 | spin_lock(&mm->page_table_lock); |
306 | if (pmd_present(*pmd)) { /* Another has populated it */ | 306 | if (pmd_present(*pmd)) { /* Another has populated it */ |
307 | pte_lock_deinit(new); | 307 | pte_lock_deinit(new); |
308 | pte_free(new); | 308 | pte_free(new); |
309 | } else { | 309 | } else { |
310 | mm->nr_ptes++; | 310 | mm->nr_ptes++; |
311 | inc_zone_page_state(new, NR_PAGETABLE); | 311 | inc_zone_page_state(new, NR_PAGETABLE); |
312 | pmd_populate(mm, pmd, new); | 312 | pmd_populate(mm, pmd, new); |
313 | } | 313 | } |
314 | spin_unlock(&mm->page_table_lock); | 314 | spin_unlock(&mm->page_table_lock); |
315 | return 0; | 315 | return 0; |
316 | } | 316 | } |
317 | 317 | ||
318 | int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | 318 | int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) |
319 | { | 319 | { |
320 | pte_t *new = pte_alloc_one_kernel(&init_mm, address); | 320 | pte_t *new = pte_alloc_one_kernel(&init_mm, address); |
321 | if (!new) | 321 | if (!new) |
322 | return -ENOMEM; | 322 | return -ENOMEM; |
323 | 323 | ||
324 | spin_lock(&init_mm.page_table_lock); | 324 | spin_lock(&init_mm.page_table_lock); |
325 | if (pmd_present(*pmd)) /* Another has populated it */ | 325 | if (pmd_present(*pmd)) /* Another has populated it */ |
326 | pte_free_kernel(new); | 326 | pte_free_kernel(new); |
327 | else | 327 | else |
328 | pmd_populate_kernel(&init_mm, pmd, new); | 328 | pmd_populate_kernel(&init_mm, pmd, new); |
329 | spin_unlock(&init_mm.page_table_lock); | 329 | spin_unlock(&init_mm.page_table_lock); |
330 | return 0; | 330 | return 0; |
331 | } | 331 | } |
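Both __pte_alloc() and __pte_alloc_kernel() follow the same idiom: do the possibly-sleeping allocation with no lock held, then re-check under page_table_lock and throw the allocation away if another thread populated the pmd first. Here is a minimal userspace sketch of that idiom, with a pthread mutex standing in for the kernel lock; every name below is invented for the illustration.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct table { int entries[512]; };

static struct table *slot;                      /* plays the role of the pmd entry */
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

/* Race-tolerant lazy initialization: allocate outside the lock, then
 * install-or-discard under the lock, as __pte_alloc() does. */
static int ensure_table(void)
{
        struct table *new = calloc(1, sizeof(*new));    /* may fail: done unlocked */
        if (!new)
                return -1;

        pthread_mutex_lock(&slot_lock);
        if (slot)                       /* another thread populated it first */
                free(new);
        else
                slot = new;             /* we won the race: publish it */
        pthread_mutex_unlock(&slot_lock);
        return 0;
}

static void *worker(void *arg)
{
        (void)arg;
        ensure_table();
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        printf("table at %p\n", (void *)slot);
        return 0;
}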
332 | 332 | ||
333 | static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | 333 | static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) |
334 | { | 334 | { |
335 | if (file_rss) | 335 | if (file_rss) |
336 | add_mm_counter(mm, file_rss, file_rss); | 336 | add_mm_counter(mm, file_rss, file_rss); |
337 | if (anon_rss) | 337 | if (anon_rss) |
338 | add_mm_counter(mm, anon_rss, anon_rss); | 338 | add_mm_counter(mm, anon_rss, anon_rss); |
339 | } | 339 | } |
340 | 340 | ||
341 | /* | 341 | /* |
342 | * This function is called to print an error when a bad pte | 342 | * This function is called to print an error when a bad pte |
343 | * is found. For example, we might have a PFN-mapped pte in | 343 | * is found. For example, we might have a PFN-mapped pte in |
344 | * a region that doesn't allow it. | 344 | * a region that doesn't allow it. |
345 | * | 345 | * |
346 | * The calling function must still handle the error. | 346 | * The calling function must still handle the error. |
347 | */ | 347 | */ |
348 | void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) | 348 | void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) |
349 | { | 349 | { |
350 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " | 350 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " |
351 | "vm_flags = %lx, vaddr = %lx\n", | 351 | "vm_flags = %lx, vaddr = %lx\n", |
352 | (long long)pte_val(pte), | 352 | (long long)pte_val(pte), |
353 | (vma->vm_mm == current->mm ? current->comm : "???"), | 353 | (vma->vm_mm == current->mm ? current->comm : "???"), |
354 | vma->vm_flags, vaddr); | 354 | vma->vm_flags, vaddr); |
355 | dump_stack(); | 355 | dump_stack(); |
356 | } | 356 | } |
357 | 357 | ||
358 | static inline int is_cow_mapping(unsigned int flags) | 358 | static inline int is_cow_mapping(unsigned int flags) |
359 | { | 359 | { |
360 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 360 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
361 | } | 361 | } |
362 | 362 | ||
363 | /* | 363 | /* |
364 | * This function gets the "struct page" associated with a pte. | 364 | * This function gets the "struct page" associated with a pte. |
365 | * | 365 | * |
366 | * NOTE! Some mappings do not have "struct pages". A raw PFN mapping | 366 | * NOTE! Some mappings do not have "struct pages". A raw PFN mapping |
367 | * will have each page table entry just pointing to a raw page frame | 367 | * will have each page table entry just pointing to a raw page frame |
368 | * number, and as far as the VM layer is concerned, those do not have | 368 | * number, and as far as the VM layer is concerned, those do not have |
369 | * pages associated with them - even if the PFN might point to memory | 369 | * pages associated with them - even if the PFN might point to memory |
370 | * that otherwise is perfectly fine and has a "struct page". | 370 | * that otherwise is perfectly fine and has a "struct page". |
371 | * | 371 | * |
372 | * The way we recognize those mappings is through the rules set up | 372 | * The way we recognize those mappings is through the rules set up |
373 | * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, | 373 | * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, |
374 | * and the vm_pgoff will point to the first PFN mapped: thus every | 374 | * and the vm_pgoff will point to the first PFN mapped: thus every |
375 | * page that is a raw mapping will always honor the rule | 375 | * page that is a raw mapping will always honor the rule |
376 | * | 376 | * |
377 | * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) | 377 | * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) |
378 | * | 378 | * |
379 | * and if that isn't true, the page has been COW'ed (in which case it | 379 | * and if that isn't true, the page has been COW'ed (in which case it |
380 | * _does_ have a "struct page" associated with it even if it is in a | 380 | * _does_ have a "struct page" associated with it even if it is in a |
381 | * VM_PFNMAP range). | 381 | * VM_PFNMAP range). |
382 | */ | 382 | */ |
383 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) | 383 | struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) |
384 | { | 384 | { |
385 | unsigned long pfn = pte_pfn(pte); | 385 | unsigned long pfn = pte_pfn(pte); |
386 | 386 | ||
387 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { | 387 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { |
388 | unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; | 388 | unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; |
389 | if (pfn == vma->vm_pgoff + off) | 389 | if (pfn == vma->vm_pgoff + off) |
390 | return NULL; | 390 | return NULL; |
391 | if (!is_cow_mapping(vma->vm_flags)) | 391 | if (!is_cow_mapping(vma->vm_flags)) |
392 | return NULL; | 392 | return NULL; |
393 | } | 393 | } |
394 | 394 | ||
395 | #ifdef CONFIG_DEBUG_VM | 395 | #ifdef CONFIG_DEBUG_VM |
396 | /* | 396 | /* |
397 | * Add some anal sanity checks for now. Eventually, | 397 | * Add some anal sanity checks for now. Eventually, |
398 | * we should just do "return pfn_to_page(pfn)", but | 398 | * we should just do "return pfn_to_page(pfn)", but |
399 | * in the meantime we check that we get a valid pfn, | 399 | * in the meantime we check that we get a valid pfn, |
400 | * and that the resulting page looks ok. | 400 | * and that the resulting page looks ok. |
401 | */ | 401 | */ |
402 | if (unlikely(!pfn_valid(pfn))) { | 402 | if (unlikely(!pfn_valid(pfn))) { |
403 | print_bad_pte(vma, pte, addr); | 403 | print_bad_pte(vma, pte, addr); |
404 | return NULL; | 404 | return NULL; |
405 | } | 405 | } |
406 | #endif | 406 | #endif |
407 | 407 | ||
408 | /* | 408 | /* |
409 | * NOTE! We still have PageReserved() pages in the page | 409 | * NOTE! We still have PageReserved() pages in the page |
410 | * tables. | 410 | * tables. |
411 | * | 411 | * |
412 | * The PAGE_ZERO() pages and various VDSO mappings can | 412 | * The PAGE_ZERO() pages and various VDSO mappings can |
413 | * cause them to exist. | 413 | * cause them to exist. |
414 | */ | 414 | */ |
415 | return pfn_to_page(pfn); | 415 | return pfn_to_page(pfn); |
416 | } | 416 | } |
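The linearity rule quoted in the comment block above vm_normal_page() can be sanity-checked with plain arithmetic. This is a userspace sketch only; PAGE_SHIFT, the toy vma fields and the pfn values are made up for the example.

#include <stdio.h>

#define PAGE_SHIFT      12                      /* illustrative */

struct toy_vma {
        unsigned long vm_start;
        unsigned long vm_pgoff;                 /* first pfn mapped at vm_start */
};

/* Mirrors the test in vm_normal_page(): a pte whose pfn still satisfies the
 * linear rule is a raw PFN mapping with no struct page behind it. */
static int is_linear_pfnmap(struct toy_vma *vma, unsigned long addr,
                            unsigned long pfn)
{
        unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

        return pfn == vma->vm_pgoff + off;
}

int main(void)
{
        struct toy_vma vma = { .vm_start = 0x40000000, .vm_pgoff = 0x80000 };

        /* third page of the mapping still points at the pfn the mapping set up */
        printf("%d\n", is_linear_pfnmap(&vma, 0x40002000, 0x80002));    /* 1: raw pfn */
        /* same address, different pfn: the pte was COW'ed, so it has a page */
        printf("%d\n", is_linear_pfnmap(&vma, 0x40002000, 0x12345));    /* 0: normal page */
        return 0;
}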
417 | 417 | ||
418 | /* | 418 | /* |
419 | * copy one vm_area from one task to the other. Assumes the page tables | 419 | * copy one vm_area from one task to the other. Assumes the page tables |
420 | * already present in the new task to be cleared in the whole range | 420 | * already present in the new task to be cleared in the whole range |
421 | * covered by this vma. | 421 | * covered by this vma. |
422 | */ | 422 | */ |
423 | 423 | ||
424 | static inline void | 424 | static inline void |
425 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 425 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
426 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, | 426 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, |
427 | unsigned long addr, int *rss) | 427 | unsigned long addr, int *rss) |
428 | { | 428 | { |
429 | unsigned long vm_flags = vma->vm_flags; | 429 | unsigned long vm_flags = vma->vm_flags; |
430 | pte_t pte = *src_pte; | 430 | pte_t pte = *src_pte; |
431 | struct page *page; | 431 | struct page *page; |
432 | 432 | ||
433 | /* pte contains position in swap or file, so copy. */ | 433 | /* pte contains position in swap or file, so copy. */ |
434 | if (unlikely(!pte_present(pte))) { | 434 | if (unlikely(!pte_present(pte))) { |
435 | if (!pte_file(pte)) { | 435 | if (!pte_file(pte)) { |
436 | swp_entry_t entry = pte_to_swp_entry(pte); | 436 | swp_entry_t entry = pte_to_swp_entry(pte); |
437 | 437 | ||
438 | swap_duplicate(entry); | 438 | swap_duplicate(entry); |
439 | /* make sure dst_mm is on swapoff's mmlist. */ | 439 | /* make sure dst_mm is on swapoff's mmlist. */ |
440 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 440 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
441 | spin_lock(&mmlist_lock); | 441 | spin_lock(&mmlist_lock); |
442 | if (list_empty(&dst_mm->mmlist)) | 442 | if (list_empty(&dst_mm->mmlist)) |
443 | list_add(&dst_mm->mmlist, | 443 | list_add(&dst_mm->mmlist, |
444 | &src_mm->mmlist); | 444 | &src_mm->mmlist); |
445 | spin_unlock(&mmlist_lock); | 445 | spin_unlock(&mmlist_lock); |
446 | } | 446 | } |
447 | if (is_write_migration_entry(entry) && | 447 | if (is_write_migration_entry(entry) && |
448 | is_cow_mapping(vm_flags)) { | 448 | is_cow_mapping(vm_flags)) { |
449 | /* | 449 | /* |
450 | * COW mappings require pages in both parent | 450 | * COW mappings require pages in both parent |
451 | * and child to be set to read. | 451 | * and child to be set to read. |
452 | */ | 452 | */ |
453 | make_migration_entry_read(&entry); | 453 | make_migration_entry_read(&entry); |
454 | pte = swp_entry_to_pte(entry); | 454 | pte = swp_entry_to_pte(entry); |
455 | set_pte_at(src_mm, addr, src_pte, pte); | 455 | set_pte_at(src_mm, addr, src_pte, pte); |
456 | } | 456 | } |
457 | } | 457 | } |
458 | goto out_set_pte; | 458 | goto out_set_pte; |
459 | } | 459 | } |
460 | 460 | ||
461 | /* | 461 | /* |
462 | * If it's a COW mapping, write protect it both | 462 | * If it's a COW mapping, write protect it both |
463 | * in the parent and the child | 463 | * in the parent and the child |
464 | */ | 464 | */ |
465 | if (is_cow_mapping(vm_flags)) { | 465 | if (is_cow_mapping(vm_flags)) { |
466 | ptep_set_wrprotect(src_mm, addr, src_pte); | 466 | ptep_set_wrprotect(src_mm, addr, src_pte); |
467 | pte = pte_wrprotect(pte); | 467 | pte = pte_wrprotect(pte); |
468 | } | 468 | } |
469 | 469 | ||
470 | /* | 470 | /* |
471 | * If it's a shared mapping, mark it clean in | 471 | * If it's a shared mapping, mark it clean in |
472 | * the child | 472 | * the child |
473 | */ | 473 | */ |
474 | if (vm_flags & VM_SHARED) | 474 | if (vm_flags & VM_SHARED) |
475 | pte = pte_mkclean(pte); | 475 | pte = pte_mkclean(pte); |
476 | pte = pte_mkold(pte); | 476 | pte = pte_mkold(pte); |
477 | 477 | ||
478 | page = vm_normal_page(vma, addr, pte); | 478 | page = vm_normal_page(vma, addr, pte); |
479 | if (page) { | 479 | if (page) { |
480 | get_page(page); | 480 | get_page(page); |
481 | page_dup_rmap(page, vma, addr); | 481 | page_dup_rmap(page, vma, addr); |
482 | rss[!!PageAnon(page)]++; | 482 | rss[!!PageAnon(page)]++; |
483 | } | 483 | } |
484 | 484 | ||
485 | out_set_pte: | 485 | out_set_pte: |
486 | set_pte_at(dst_mm, addr, dst_pte, pte); | 486 | set_pte_at(dst_mm, addr, dst_pte, pte); |
487 | } | 487 | } |
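What the write-protection in copy_one_pte() buys is ordinary fork-time copy-on-write semantics. The small userspace program below only demonstrates the user-visible behaviour those pte manipulations implement; it is not kernel code and the page size literal is just a convenient length for mmap().

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;
        strcpy(p, "parent");

        pid_t pid = fork();
        if (pid == 0) {
                strcpy(p, "child");     /* faults, breaks COW, gets a private copy */
                _exit(0);
        }
        waitpid(pid, NULL, 0);
        printf("parent still sees: %s\n", p);   /* prints "parent" */
        return 0;
}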
488 | 488 | ||
489 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 489 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
490 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | 490 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, |
491 | unsigned long addr, unsigned long end) | 491 | unsigned long addr, unsigned long end) |
492 | { | 492 | { |
493 | pte_t *src_pte, *dst_pte; | 493 | pte_t *src_pte, *dst_pte; |
494 | spinlock_t *src_ptl, *dst_ptl; | 494 | spinlock_t *src_ptl, *dst_ptl; |
495 | int progress = 0; | 495 | int progress = 0; |
496 | int rss[2]; | 496 | int rss[2]; |
497 | 497 | ||
498 | again: | 498 | again: |
499 | rss[1] = rss[0] = 0; | 499 | rss[1] = rss[0] = 0; |
500 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | 500 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); |
501 | if (!dst_pte) | 501 | if (!dst_pte) |
502 | return -ENOMEM; | 502 | return -ENOMEM; |
503 | src_pte = pte_offset_map_nested(src_pmd, addr); | 503 | src_pte = pte_offset_map_nested(src_pmd, addr); |
504 | src_ptl = pte_lockptr(src_mm, src_pmd); | 504 | src_ptl = pte_lockptr(src_mm, src_pmd); |
505 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | 505 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); |
506 | arch_enter_lazy_mmu_mode(); | 506 | arch_enter_lazy_mmu_mode(); |
507 | 507 | ||
508 | do { | 508 | do { |
509 | /* | 509 | /* |
510 | * We are holding two locks at this point - either of them | 510 | * We are holding two locks at this point - either of them |
511 | * could generate latencies in another task on another CPU. | 511 | * could generate latencies in another task on another CPU. |
512 | */ | 512 | */ |
513 | if (progress >= 32) { | 513 | if (progress >= 32) { |
514 | progress = 0; | 514 | progress = 0; |
515 | if (need_resched() || | 515 | if (need_resched() || |
516 | need_lockbreak(src_ptl) || | 516 | spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) |
517 | need_lockbreak(dst_ptl)) | ||
518 | break; | 517 | break; |
519 | } | 518 | } |
520 | if (pte_none(*src_pte)) { | 519 | if (pte_none(*src_pte)) { |
521 | progress++; | 520 | progress++; |
522 | continue; | 521 | continue; |
523 | } | 522 | } |
524 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); | 523 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); |
525 | progress += 8; | 524 | progress += 8; |
526 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | 525 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); |
527 | 526 | ||
528 | arch_leave_lazy_mmu_mode(); | 527 | arch_leave_lazy_mmu_mode(); |
529 | spin_unlock(src_ptl); | 528 | spin_unlock(src_ptl); |
530 | pte_unmap_nested(src_pte - 1); | 529 | pte_unmap_nested(src_pte - 1); |
531 | add_mm_rss(dst_mm, rss[0], rss[1]); | 530 | add_mm_rss(dst_mm, rss[0], rss[1]); |
532 | pte_unmap_unlock(dst_pte - 1, dst_ptl); | 531 | pte_unmap_unlock(dst_pte - 1, dst_ptl); |
533 | cond_resched(); | 532 | cond_resched(); |
534 | if (addr != end) | 533 | if (addr != end) |
535 | goto again; | 534 | goto again; |
536 | return 0; | 535 | return 0; |
537 | } | 536 | } |
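The hunk above is one of the call sites this patch converts from need_lockbreak() to spin_needbreak(). The idea — the lock holder periodically asks the lock itself whether anyone is queued, and backs off if so — can be modelled in userspace with a ticket lock, where the holder sees contention whenever next - owner is greater than one. Everything below (the lock type, the helper names, the 32-entry batch) is an invented sketch of the pattern, not kernel code.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

struct ticket_lock {
        atomic_uint next;               /* next ticket to hand out */
        atomic_uint owner;              /* ticket currently being served */
};

static void tl_lock(struct ticket_lock *l)
{
        unsigned int me = atomic_fetch_add(&l->next, 1);

        while (atomic_load(&l->owner) != me)
                sched_yield();          /* polite userspace "spin" */
}

static void tl_unlock(struct ticket_lock *l)
{
        atomic_fetch_add(&l->owner, 1);
}

/* Contention check that reads only the lock word itself: someone besides
 * the current holder has taken a ticket. */
static int tl_is_contended(struct ticket_lock *l)
{
        return atomic_load(&l->next) - atomic_load(&l->owner) > 1;
}

static struct ticket_lock src_ptl, dst_ptl;

/* Shaped like the copy loop above: every 32 entries, give waiters a turn. */
static void copy_many_entries(int nr)
{
        int progress = 0;

        tl_lock(&src_ptl);
        tl_lock(&dst_ptl);
        for (int i = 0; i < nr; i++) {
                /* ... copy one entry here ... */
                if (++progress >= 32) {
                        progress = 0;
                        if (tl_is_contended(&src_ptl) ||
                            tl_is_contended(&dst_ptl)) {
                                tl_unlock(&dst_ptl);
                                tl_unlock(&src_ptl);
                                /* the waiter runs here */
                                tl_lock(&src_ptl);
                                tl_lock(&dst_ptl);
                        }
                }
        }
        tl_unlock(&dst_ptl);
        tl_unlock(&src_ptl);
}

static void *waiter(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100; i++) {
                tl_lock(&src_ptl);      /* queues up, making the lock contended */
                tl_unlock(&src_ptl);
        }
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);
        copy_many_entries(1000000);
        pthread_join(t, NULL);
        printf("done\n");
        return 0;
}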
538 | 537 | ||
539 | static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 538 | static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
540 | pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, | 539 | pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, |
541 | unsigned long addr, unsigned long end) | 540 | unsigned long addr, unsigned long end) |
542 | { | 541 | { |
543 | pmd_t *src_pmd, *dst_pmd; | 542 | pmd_t *src_pmd, *dst_pmd; |
544 | unsigned long next; | 543 | unsigned long next; |
545 | 544 | ||
546 | dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); | 545 | dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); |
547 | if (!dst_pmd) | 546 | if (!dst_pmd) |
548 | return -ENOMEM; | 547 | return -ENOMEM; |
549 | src_pmd = pmd_offset(src_pud, addr); | 548 | src_pmd = pmd_offset(src_pud, addr); |
550 | do { | 549 | do { |
551 | next = pmd_addr_end(addr, end); | 550 | next = pmd_addr_end(addr, end); |
552 | if (pmd_none_or_clear_bad(src_pmd)) | 551 | if (pmd_none_or_clear_bad(src_pmd)) |
553 | continue; | 552 | continue; |
554 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, | 553 | if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, |
555 | vma, addr, next)) | 554 | vma, addr, next)) |
556 | return -ENOMEM; | 555 | return -ENOMEM; |
557 | } while (dst_pmd++, src_pmd++, addr = next, addr != end); | 556 | } while (dst_pmd++, src_pmd++, addr = next, addr != end); |
558 | return 0; | 557 | return 0; |
559 | } | 558 | } |
560 | 559 | ||
561 | static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 560 | static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
562 | pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, | 561 | pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, |
563 | unsigned long addr, unsigned long end) | 562 | unsigned long addr, unsigned long end) |
564 | { | 563 | { |
565 | pud_t *src_pud, *dst_pud; | 564 | pud_t *src_pud, *dst_pud; |
566 | unsigned long next; | 565 | unsigned long next; |
567 | 566 | ||
568 | dst_pud = pud_alloc(dst_mm, dst_pgd, addr); | 567 | dst_pud = pud_alloc(dst_mm, dst_pgd, addr); |
569 | if (!dst_pud) | 568 | if (!dst_pud) |
570 | return -ENOMEM; | 569 | return -ENOMEM; |
571 | src_pud = pud_offset(src_pgd, addr); | 570 | src_pud = pud_offset(src_pgd, addr); |
572 | do { | 571 | do { |
573 | next = pud_addr_end(addr, end); | 572 | next = pud_addr_end(addr, end); |
574 | if (pud_none_or_clear_bad(src_pud)) | 573 | if (pud_none_or_clear_bad(src_pud)) |
575 | continue; | 574 | continue; |
576 | if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, | 575 | if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, |
577 | vma, addr, next)) | 576 | vma, addr, next)) |
578 | return -ENOMEM; | 577 | return -ENOMEM; |
579 | } while (dst_pud++, src_pud++, addr = next, addr != end); | 578 | } while (dst_pud++, src_pud++, addr = next, addr != end); |
580 | return 0; | 579 | return 0; |
581 | } | 580 | } |
582 | 581 | ||
583 | int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 582 | int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
584 | struct vm_area_struct *vma) | 583 | struct vm_area_struct *vma) |
585 | { | 584 | { |
586 | pgd_t *src_pgd, *dst_pgd; | 585 | pgd_t *src_pgd, *dst_pgd; |
587 | unsigned long next; | 586 | unsigned long next; |
588 | unsigned long addr = vma->vm_start; | 587 | unsigned long addr = vma->vm_start; |
589 | unsigned long end = vma->vm_end; | 588 | unsigned long end = vma->vm_end; |
590 | 589 | ||
591 | /* | 590 | /* |
592 | * Don't copy ptes where a page fault will fill them correctly. | 591 | * Don't copy ptes where a page fault will fill them correctly. |
593 | * Fork becomes much lighter when there are big shared or private | 592 | * Fork becomes much lighter when there are big shared or private |
594 | * readonly mappings. The tradeoff is that copy_page_range is more | 593 | * readonly mappings. The tradeoff is that copy_page_range is more |
595 | * efficient than faulting. | 594 | * efficient than faulting. |
596 | */ | 595 | */ |
597 | if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { | 596 | if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { |
598 | if (!vma->anon_vma) | 597 | if (!vma->anon_vma) |
599 | return 0; | 598 | return 0; |
600 | } | 599 | } |
601 | 600 | ||
602 | if (is_vm_hugetlb_page(vma)) | 601 | if (is_vm_hugetlb_page(vma)) |
603 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 602 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
604 | 603 | ||
605 | dst_pgd = pgd_offset(dst_mm, addr); | 604 | dst_pgd = pgd_offset(dst_mm, addr); |
606 | src_pgd = pgd_offset(src_mm, addr); | 605 | src_pgd = pgd_offset(src_mm, addr); |
607 | do { | 606 | do { |
608 | next = pgd_addr_end(addr, end); | 607 | next = pgd_addr_end(addr, end); |
609 | if (pgd_none_or_clear_bad(src_pgd)) | 608 | if (pgd_none_or_clear_bad(src_pgd)) |
610 | continue; | 609 | continue; |
611 | if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, | 610 | if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, |
612 | vma, addr, next)) | 611 | vma, addr, next)) |
613 | return -ENOMEM; | 612 | return -ENOMEM; |
614 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 613 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
615 | return 0; | 614 | return 0; |
616 | } | 615 | } |
617 | 616 | ||
618 | static unsigned long zap_pte_range(struct mmu_gather *tlb, | 617 | static unsigned long zap_pte_range(struct mmu_gather *tlb, |
619 | struct vm_area_struct *vma, pmd_t *pmd, | 618 | struct vm_area_struct *vma, pmd_t *pmd, |
620 | unsigned long addr, unsigned long end, | 619 | unsigned long addr, unsigned long end, |
621 | long *zap_work, struct zap_details *details) | 620 | long *zap_work, struct zap_details *details) |
622 | { | 621 | { |
623 | struct mm_struct *mm = tlb->mm; | 622 | struct mm_struct *mm = tlb->mm; |
624 | pte_t *pte; | 623 | pte_t *pte; |
625 | spinlock_t *ptl; | 624 | spinlock_t *ptl; |
626 | int file_rss = 0; | 625 | int file_rss = 0; |
627 | int anon_rss = 0; | 626 | int anon_rss = 0; |
628 | 627 | ||
629 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 628 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
630 | arch_enter_lazy_mmu_mode(); | 629 | arch_enter_lazy_mmu_mode(); |
631 | do { | 630 | do { |
632 | pte_t ptent = *pte; | 631 | pte_t ptent = *pte; |
633 | if (pte_none(ptent)) { | 632 | if (pte_none(ptent)) { |
634 | (*zap_work)--; | 633 | (*zap_work)--; |
635 | continue; | 634 | continue; |
636 | } | 635 | } |
637 | 636 | ||
638 | (*zap_work) -= PAGE_SIZE; | 637 | (*zap_work) -= PAGE_SIZE; |
639 | 638 | ||
640 | if (pte_present(ptent)) { | 639 | if (pte_present(ptent)) { |
641 | struct page *page; | 640 | struct page *page; |
642 | 641 | ||
643 | page = vm_normal_page(vma, addr, ptent); | 642 | page = vm_normal_page(vma, addr, ptent); |
644 | if (unlikely(details) && page) { | 643 | if (unlikely(details) && page) { |
645 | /* | 644 | /* |
646 | * unmap_shared_mapping_pages() wants to | 645 | * unmap_shared_mapping_pages() wants to |
647 | * invalidate cache without truncating: | 646 | * invalidate cache without truncating: |
648 | * unmap shared but keep private pages. | 647 | * unmap shared but keep private pages. |
649 | */ | 648 | */ |
650 | if (details->check_mapping && | 649 | if (details->check_mapping && |
651 | details->check_mapping != page->mapping) | 650 | details->check_mapping != page->mapping) |
652 | continue; | 651 | continue; |
653 | /* | 652 | /* |
654 | * Each page->index must be checked when | 653 | * Each page->index must be checked when |
655 | * invalidating or truncating nonlinear. | 654 | * invalidating or truncating nonlinear. |
656 | */ | 655 | */ |
657 | if (details->nonlinear_vma && | 656 | if (details->nonlinear_vma && |
658 | (page->index < details->first_index || | 657 | (page->index < details->first_index || |
659 | page->index > details->last_index)) | 658 | page->index > details->last_index)) |
660 | continue; | 659 | continue; |
661 | } | 660 | } |
662 | ptent = ptep_get_and_clear_full(mm, addr, pte, | 661 | ptent = ptep_get_and_clear_full(mm, addr, pte, |
663 | tlb->fullmm); | 662 | tlb->fullmm); |
664 | tlb_remove_tlb_entry(tlb, pte, addr); | 663 | tlb_remove_tlb_entry(tlb, pte, addr); |
665 | if (unlikely(!page)) | 664 | if (unlikely(!page)) |
666 | continue; | 665 | continue; |
667 | if (unlikely(details) && details->nonlinear_vma | 666 | if (unlikely(details) && details->nonlinear_vma |
668 | && linear_page_index(details->nonlinear_vma, | 667 | && linear_page_index(details->nonlinear_vma, |
669 | addr) != page->index) | 668 | addr) != page->index) |
670 | set_pte_at(mm, addr, pte, | 669 | set_pte_at(mm, addr, pte, |
671 | pgoff_to_pte(page->index)); | 670 | pgoff_to_pte(page->index)); |
672 | if (PageAnon(page)) | 671 | if (PageAnon(page)) |
673 | anon_rss--; | 672 | anon_rss--; |
674 | else { | 673 | else { |
675 | if (pte_dirty(ptent)) | 674 | if (pte_dirty(ptent)) |
676 | set_page_dirty(page); | 675 | set_page_dirty(page); |
677 | if (pte_young(ptent)) | 676 | if (pte_young(ptent)) |
678 | SetPageReferenced(page); | 677 | SetPageReferenced(page); |
679 | file_rss--; | 678 | file_rss--; |
680 | } | 679 | } |
681 | page_remove_rmap(page, vma); | 680 | page_remove_rmap(page, vma); |
682 | tlb_remove_page(tlb, page); | 681 | tlb_remove_page(tlb, page); |
683 | continue; | 682 | continue; |
684 | } | 683 | } |
685 | /* | 684 | /* |
686 | * If details->check_mapping, we leave swap entries; | 685 | * If details->check_mapping, we leave swap entries; |
687 | * if details->nonlinear_vma, we leave file entries. | 686 | * if details->nonlinear_vma, we leave file entries. |
688 | */ | 687 | */ |
689 | if (unlikely(details)) | 688 | if (unlikely(details)) |
690 | continue; | 689 | continue; |
691 | if (!pte_file(ptent)) | 690 | if (!pte_file(ptent)) |
692 | free_swap_and_cache(pte_to_swp_entry(ptent)); | 691 | free_swap_and_cache(pte_to_swp_entry(ptent)); |
693 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 692 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
694 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 693 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); |
695 | 694 | ||
696 | add_mm_rss(mm, file_rss, anon_rss); | 695 | add_mm_rss(mm, file_rss, anon_rss); |
697 | arch_leave_lazy_mmu_mode(); | 696 | arch_leave_lazy_mmu_mode(); |
698 | pte_unmap_unlock(pte - 1, ptl); | 697 | pte_unmap_unlock(pte - 1, ptl); |
699 | 698 | ||
700 | return addr; | 699 | return addr; |
701 | } | 700 | } |
702 | 701 | ||
703 | static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | 702 | static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, |
704 | struct vm_area_struct *vma, pud_t *pud, | 703 | struct vm_area_struct *vma, pud_t *pud, |
705 | unsigned long addr, unsigned long end, | 704 | unsigned long addr, unsigned long end, |
706 | long *zap_work, struct zap_details *details) | 705 | long *zap_work, struct zap_details *details) |
707 | { | 706 | { |
708 | pmd_t *pmd; | 707 | pmd_t *pmd; |
709 | unsigned long next; | 708 | unsigned long next; |
710 | 709 | ||
711 | pmd = pmd_offset(pud, addr); | 710 | pmd = pmd_offset(pud, addr); |
712 | do { | 711 | do { |
713 | next = pmd_addr_end(addr, end); | 712 | next = pmd_addr_end(addr, end); |
714 | if (pmd_none_or_clear_bad(pmd)) { | 713 | if (pmd_none_or_clear_bad(pmd)) { |
715 | (*zap_work)--; | 714 | (*zap_work)--; |
716 | continue; | 715 | continue; |
717 | } | 716 | } |
718 | next = zap_pte_range(tlb, vma, pmd, addr, next, | 717 | next = zap_pte_range(tlb, vma, pmd, addr, next, |
719 | zap_work, details); | 718 | zap_work, details); |
720 | } while (pmd++, addr = next, (addr != end && *zap_work > 0)); | 719 | } while (pmd++, addr = next, (addr != end && *zap_work > 0)); |
721 | 720 | ||
722 | return addr; | 721 | return addr; |
723 | } | 722 | } |
724 | 723 | ||
725 | static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | 724 | static inline unsigned long zap_pud_range(struct mmu_gather *tlb, |
726 | struct vm_area_struct *vma, pgd_t *pgd, | 725 | struct vm_area_struct *vma, pgd_t *pgd, |
727 | unsigned long addr, unsigned long end, | 726 | unsigned long addr, unsigned long end, |
728 | long *zap_work, struct zap_details *details) | 727 | long *zap_work, struct zap_details *details) |
729 | { | 728 | { |
730 | pud_t *pud; | 729 | pud_t *pud; |
731 | unsigned long next; | 730 | unsigned long next; |
732 | 731 | ||
733 | pud = pud_offset(pgd, addr); | 732 | pud = pud_offset(pgd, addr); |
734 | do { | 733 | do { |
735 | next = pud_addr_end(addr, end); | 734 | next = pud_addr_end(addr, end); |
736 | if (pud_none_or_clear_bad(pud)) { | 735 | if (pud_none_or_clear_bad(pud)) { |
737 | (*zap_work)--; | 736 | (*zap_work)--; |
738 | continue; | 737 | continue; |
739 | } | 738 | } |
740 | next = zap_pmd_range(tlb, vma, pud, addr, next, | 739 | next = zap_pmd_range(tlb, vma, pud, addr, next, |
741 | zap_work, details); | 740 | zap_work, details); |
742 | } while (pud++, addr = next, (addr != end && *zap_work > 0)); | 741 | } while (pud++, addr = next, (addr != end && *zap_work > 0)); |
743 | 742 | ||
744 | return addr; | 743 | return addr; |
745 | } | 744 | } |
746 | 745 | ||
747 | static unsigned long unmap_page_range(struct mmu_gather *tlb, | 746 | static unsigned long unmap_page_range(struct mmu_gather *tlb, |
748 | struct vm_area_struct *vma, | 747 | struct vm_area_struct *vma, |
749 | unsigned long addr, unsigned long end, | 748 | unsigned long addr, unsigned long end, |
750 | long *zap_work, struct zap_details *details) | 749 | long *zap_work, struct zap_details *details) |
751 | { | 750 | { |
752 | pgd_t *pgd; | 751 | pgd_t *pgd; |
753 | unsigned long next; | 752 | unsigned long next; |
754 | 753 | ||
755 | if (details && !details->check_mapping && !details->nonlinear_vma) | 754 | if (details && !details->check_mapping && !details->nonlinear_vma) |
756 | details = NULL; | 755 | details = NULL; |
757 | 756 | ||
758 | BUG_ON(addr >= end); | 757 | BUG_ON(addr >= end); |
759 | tlb_start_vma(tlb, vma); | 758 | tlb_start_vma(tlb, vma); |
760 | pgd = pgd_offset(vma->vm_mm, addr); | 759 | pgd = pgd_offset(vma->vm_mm, addr); |
761 | do { | 760 | do { |
762 | next = pgd_addr_end(addr, end); | 761 | next = pgd_addr_end(addr, end); |
763 | if (pgd_none_or_clear_bad(pgd)) { | 762 | if (pgd_none_or_clear_bad(pgd)) { |
764 | (*zap_work)--; | 763 | (*zap_work)--; |
765 | continue; | 764 | continue; |
766 | } | 765 | } |
767 | next = zap_pud_range(tlb, vma, pgd, addr, next, | 766 | next = zap_pud_range(tlb, vma, pgd, addr, next, |
768 | zap_work, details); | 767 | zap_work, details); |
769 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); | 768 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); |
770 | tlb_end_vma(tlb, vma); | 769 | tlb_end_vma(tlb, vma); |
771 | 770 | ||
772 | return addr; | 771 | return addr; |
773 | } | 772 | } |
774 | 773 | ||
775 | #ifdef CONFIG_PREEMPT | 774 | #ifdef CONFIG_PREEMPT |
776 | # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) | 775 | # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) |
777 | #else | 776 | #else |
778 | /* No preempt: go for improved straight-line efficiency */ | 777 | /* No preempt: go for improved straight-line efficiency */ |
779 | # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) | 778 | # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) |
780 | #endif | 779 | #endif |
781 | 780 | ||
782 | /** | 781 | /** |
783 | * unmap_vmas - unmap a range of memory covered by a list of vma's | 782 | * unmap_vmas - unmap a range of memory covered by a list of vma's |
784 | * @tlbp: address of the caller's struct mmu_gather | 783 | * @tlbp: address of the caller's struct mmu_gather |
785 | * @vma: the starting vma | 784 | * @vma: the starting vma |
786 | * @start_addr: virtual address at which to start unmapping | 785 | * @start_addr: virtual address at which to start unmapping |
787 | * @end_addr: virtual address at which to end unmapping | 786 | * @end_addr: virtual address at which to end unmapping |
788 | * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here | 787 | * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here |
789 | * @details: details of nonlinear truncation or shared cache invalidation | 788 | * @details: details of nonlinear truncation or shared cache invalidation |
790 | * | 789 | * |
791 | * Returns the end address of the unmapping (restart addr if interrupted). | 790 | * Returns the end address of the unmapping (restart addr if interrupted). |
792 | * | 791 | * |
793 | * Unmap all pages in the vma list. | 792 | * Unmap all pages in the vma list. |
794 | * | 793 | * |
795 | * We aim to not hold locks for too long (for scheduling latency reasons). | 794 | * We aim to not hold locks for too long (for scheduling latency reasons). |
796 | * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to | 795 | * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to |
797 | * return the ending mmu_gather to the caller. | 796 | * return the ending mmu_gather to the caller. |
798 | * | 797 | * |
799 | * Only addresses between `start' and `end' will be unmapped. | 798 | * Only addresses between `start' and `end' will be unmapped. |
800 | * | 799 | * |
801 | * The VMA list must be sorted in ascending virtual address order. | 800 | * The VMA list must be sorted in ascending virtual address order. |
802 | * | 801 | * |
803 | * unmap_vmas() assumes that the caller will flush the whole unmapped address | 802 | * unmap_vmas() assumes that the caller will flush the whole unmapped address |
804 | * range after unmap_vmas() returns. So the only responsibility here is to | 803 | * range after unmap_vmas() returns. So the only responsibility here is to |
805 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() | 804 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() |
806 | * drops the lock and schedules. | 805 | * drops the lock and schedules. |
807 | */ | 806 | */ |
808 | unsigned long unmap_vmas(struct mmu_gather **tlbp, | 807 | unsigned long unmap_vmas(struct mmu_gather **tlbp, |
809 | struct vm_area_struct *vma, unsigned long start_addr, | 808 | struct vm_area_struct *vma, unsigned long start_addr, |
810 | unsigned long end_addr, unsigned long *nr_accounted, | 809 | unsigned long end_addr, unsigned long *nr_accounted, |
811 | struct zap_details *details) | 810 | struct zap_details *details) |
812 | { | 811 | { |
813 | long zap_work = ZAP_BLOCK_SIZE; | 812 | long zap_work = ZAP_BLOCK_SIZE; |
814 | unsigned long tlb_start = 0; /* For tlb_finish_mmu */ | 813 | unsigned long tlb_start = 0; /* For tlb_finish_mmu */ |
815 | int tlb_start_valid = 0; | 814 | int tlb_start_valid = 0; |
816 | unsigned long start = start_addr; | 815 | unsigned long start = start_addr; |
817 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; | 816 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; |
818 | int fullmm = (*tlbp)->fullmm; | 817 | int fullmm = (*tlbp)->fullmm; |
819 | 818 | ||
820 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { | 819 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { |
821 | unsigned long end; | 820 | unsigned long end; |
822 | 821 | ||
823 | start = max(vma->vm_start, start_addr); | 822 | start = max(vma->vm_start, start_addr); |
824 | if (start >= vma->vm_end) | 823 | if (start >= vma->vm_end) |
825 | continue; | 824 | continue; |
826 | end = min(vma->vm_end, end_addr); | 825 | end = min(vma->vm_end, end_addr); |
827 | if (end <= vma->vm_start) | 826 | if (end <= vma->vm_start) |
828 | continue; | 827 | continue; |
829 | 828 | ||
830 | if (vma->vm_flags & VM_ACCOUNT) | 829 | if (vma->vm_flags & VM_ACCOUNT) |
831 | *nr_accounted += (end - start) >> PAGE_SHIFT; | 830 | *nr_accounted += (end - start) >> PAGE_SHIFT; |
832 | 831 | ||
833 | while (start != end) { | 832 | while (start != end) { |
834 | if (!tlb_start_valid) { | 833 | if (!tlb_start_valid) { |
835 | tlb_start = start; | 834 | tlb_start = start; |
836 | tlb_start_valid = 1; | 835 | tlb_start_valid = 1; |
837 | } | 836 | } |
838 | 837 | ||
839 | if (unlikely(is_vm_hugetlb_page(vma))) { | 838 | if (unlikely(is_vm_hugetlb_page(vma))) { |
840 | unmap_hugepage_range(vma, start, end); | 839 | unmap_hugepage_range(vma, start, end); |
841 | zap_work -= (end - start) / | 840 | zap_work -= (end - start) / |
842 | (HPAGE_SIZE / PAGE_SIZE); | 841 | (HPAGE_SIZE / PAGE_SIZE); |
843 | start = end; | 842 | start = end; |
844 | } else | 843 | } else |
845 | start = unmap_page_range(*tlbp, vma, | 844 | start = unmap_page_range(*tlbp, vma, |
846 | start, end, &zap_work, details); | 845 | start, end, &zap_work, details); |
847 | 846 | ||
848 | if (zap_work > 0) { | 847 | if (zap_work > 0) { |
849 | BUG_ON(start != end); | 848 | BUG_ON(start != end); |
850 | break; | 849 | break; |
851 | } | 850 | } |
852 | 851 | ||
853 | tlb_finish_mmu(*tlbp, tlb_start, start); | 852 | tlb_finish_mmu(*tlbp, tlb_start, start); |
854 | 853 | ||
855 | if (need_resched() || | 854 | if (need_resched() || |
856 | (i_mmap_lock && need_lockbreak(i_mmap_lock))) { | 855 | (i_mmap_lock && spin_needbreak(i_mmap_lock))) { |
857 | if (i_mmap_lock) { | 856 | if (i_mmap_lock) { |
858 | *tlbp = NULL; | 857 | *tlbp = NULL; |
859 | goto out; | 858 | goto out; |
860 | } | 859 | } |
861 | cond_resched(); | 860 | cond_resched(); |
862 | } | 861 | } |
863 | 862 | ||
864 | *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); | 863 | *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); |
865 | tlb_start_valid = 0; | 864 | tlb_start_valid = 0; |
866 | zap_work = ZAP_BLOCK_SIZE; | 865 | zap_work = ZAP_BLOCK_SIZE; |
867 | } | 866 | } |
868 | } | 867 | } |
869 | out: | 868 | out: |
870 | return start; /* which is now the end (or restart) address */ | 869 | return start; /* which is now the end (or restart) address */ |
871 | } | 870 | } |
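To put ZAP_BLOCK_SIZE in perspective with some assumed numbers: with 4 KiB pages and only present ptes (each of which charges PAGE_SIZE to zap_work), tearing down a 1 GiB mapping covers 262144 pages. Under CONFIG_PREEMPT that is batches of 8 pages, so roughly 32768 points at which the loop above can finish the mmu_gather, notice need_resched() or spin_needbreak(i_mmap_lock), and give the lock up; without preemption the batches are 1024 pages, giving about 256 such points. The counts shift when empty ptes or missing pmds/puds are skipped, since those only cost 1 unit of zap_work per entry.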
872 | 871 | ||
873 | /** | 872 | /** |
874 | * zap_page_range - remove user pages in a given range | 873 | * zap_page_range - remove user pages in a given range |
875 | * @vma: vm_area_struct holding the applicable pages | 874 | * @vma: vm_area_struct holding the applicable pages |
876 | * @address: starting address of pages to zap | 875 | * @address: starting address of pages to zap |
877 | * @size: number of bytes to zap | 876 | * @size: number of bytes to zap |
878 | * @details: details of nonlinear truncation or shared cache invalidation | 877 | * @details: details of nonlinear truncation or shared cache invalidation |
879 | */ | 878 | */ |
880 | unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | 879 | unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, |
881 | unsigned long size, struct zap_details *details) | 880 | unsigned long size, struct zap_details *details) |
882 | { | 881 | { |
883 | struct mm_struct *mm = vma->vm_mm; | 882 | struct mm_struct *mm = vma->vm_mm; |
884 | struct mmu_gather *tlb; | 883 | struct mmu_gather *tlb; |
885 | unsigned long end = address + size; | 884 | unsigned long end = address + size; |
886 | unsigned long nr_accounted = 0; | 885 | unsigned long nr_accounted = 0; |
887 | 886 | ||
888 | lru_add_drain(); | 887 | lru_add_drain(); |
889 | tlb = tlb_gather_mmu(mm, 0); | 888 | tlb = tlb_gather_mmu(mm, 0); |
890 | update_hiwater_rss(mm); | 889 | update_hiwater_rss(mm); |
891 | end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); | 890 | end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); |
892 | if (tlb) | 891 | if (tlb) |
893 | tlb_finish_mmu(tlb, address, end); | 892 | tlb_finish_mmu(tlb, address, end); |
894 | return end; | 893 | return end; |
895 | } | 894 | } |
896 | 895 | ||
897 | /* | 896 | /* |
898 | * Do a quick page-table lookup for a single page. | 897 | * Do a quick page-table lookup for a single page. |
899 | */ | 898 | */ |
900 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 899 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, |
901 | unsigned int flags) | 900 | unsigned int flags) |
902 | { | 901 | { |
903 | pgd_t *pgd; | 902 | pgd_t *pgd; |
904 | pud_t *pud; | 903 | pud_t *pud; |
905 | pmd_t *pmd; | 904 | pmd_t *pmd; |
906 | pte_t *ptep, pte; | 905 | pte_t *ptep, pte; |
907 | spinlock_t *ptl; | 906 | spinlock_t *ptl; |
908 | struct page *page; | 907 | struct page *page; |
909 | struct mm_struct *mm = vma->vm_mm; | 908 | struct mm_struct *mm = vma->vm_mm; |
910 | 909 | ||
911 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | 910 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); |
912 | if (!IS_ERR(page)) { | 911 | if (!IS_ERR(page)) { |
913 | BUG_ON(flags & FOLL_GET); | 912 | BUG_ON(flags & FOLL_GET); |
914 | goto out; | 913 | goto out; |
915 | } | 914 | } |
916 | 915 | ||
917 | page = NULL; | 916 | page = NULL; |
918 | pgd = pgd_offset(mm, address); | 917 | pgd = pgd_offset(mm, address); |
919 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | 918 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) |
920 | goto no_page_table; | 919 | goto no_page_table; |
921 | 920 | ||
922 | pud = pud_offset(pgd, address); | 921 | pud = pud_offset(pgd, address); |
923 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | 922 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) |
924 | goto no_page_table; | 923 | goto no_page_table; |
925 | 924 | ||
926 | pmd = pmd_offset(pud, address); | 925 | pmd = pmd_offset(pud, address); |
927 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | 926 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
928 | goto no_page_table; | 927 | goto no_page_table; |
929 | 928 | ||
930 | if (pmd_huge(*pmd)) { | 929 | if (pmd_huge(*pmd)) { |
931 | BUG_ON(flags & FOLL_GET); | 930 | BUG_ON(flags & FOLL_GET); |
932 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 931 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
933 | goto out; | 932 | goto out; |
934 | } | 933 | } |
935 | 934 | ||
936 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | 935 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); |
937 | if (!ptep) | 936 | if (!ptep) |
938 | goto out; | 937 | goto out; |
939 | 938 | ||
940 | pte = *ptep; | 939 | pte = *ptep; |
941 | if (!pte_present(pte)) | 940 | if (!pte_present(pte)) |
942 | goto unlock; | 941 | goto unlock; |
943 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 942 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
944 | goto unlock; | 943 | goto unlock; |
945 | page = vm_normal_page(vma, address, pte); | 944 | page = vm_normal_page(vma, address, pte); |
946 | if (unlikely(!page)) | 945 | if (unlikely(!page)) |
947 | goto unlock; | 946 | goto unlock; |
948 | 947 | ||
949 | if (flags & FOLL_GET) | 948 | if (flags & FOLL_GET) |
950 | get_page(page); | 949 | get_page(page); |
951 | if (flags & FOLL_TOUCH) { | 950 | if (flags & FOLL_TOUCH) { |
952 | if ((flags & FOLL_WRITE) && | 951 | if ((flags & FOLL_WRITE) && |
953 | !pte_dirty(pte) && !PageDirty(page)) | 952 | !pte_dirty(pte) && !PageDirty(page)) |
954 | set_page_dirty(page); | 953 | set_page_dirty(page); |
955 | mark_page_accessed(page); | 954 | mark_page_accessed(page); |
956 | } | 955 | } |
957 | unlock: | 956 | unlock: |
958 | pte_unmap_unlock(ptep, ptl); | 957 | pte_unmap_unlock(ptep, ptl); |
959 | out: | 958 | out: |
960 | return page; | 959 | return page; |
961 | 960 | ||
962 | no_page_table: | 961 | no_page_table: |
963 | /* | 962 | /* |
964 | * When core dumping an enormous anonymous area that nobody | 963 | * When core dumping an enormous anonymous area that nobody |
965 | * has touched so far, we don't want to allocate page tables. | 964 | * has touched so far, we don't want to allocate page tables. |
966 | */ | 965 | */ |
967 | if (flags & FOLL_ANON) { | 966 | if (flags & FOLL_ANON) { |
968 | page = ZERO_PAGE(0); | 967 | page = ZERO_PAGE(0); |
969 | if (flags & FOLL_GET) | 968 | if (flags & FOLL_GET) |
970 | get_page(page); | 969 | get_page(page); |
971 | BUG_ON(flags & FOLL_WRITE); | 970 | BUG_ON(flags & FOLL_WRITE); |
972 | } | 971 | } |
973 | return page; | 972 | return page; |
974 | } | 973 | } |
975 | 974 | ||
976 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 975 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
977 | unsigned long start, int len, int write, int force, | 976 | unsigned long start, int len, int write, int force, |
978 | struct page **pages, struct vm_area_struct **vmas) | 977 | struct page **pages, struct vm_area_struct **vmas) |
979 | { | 978 | { |
980 | int i; | 979 | int i; |
981 | unsigned int vm_flags; | 980 | unsigned int vm_flags; |
982 | 981 | ||
983 | /* | 982 | /* |
984 | * Require read or write permissions. | 983 | * Require read or write permissions. |
985 | * If 'force' is set, we only require the "MAY" flags. | 984 | * If 'force' is set, we only require the "MAY" flags. |
986 | */ | 985 | */ |
987 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 986 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
988 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 987 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
989 | i = 0; | 988 | i = 0; |
990 | 989 | ||
991 | do { | 990 | do { |
992 | struct vm_area_struct *vma; | 991 | struct vm_area_struct *vma; |
993 | unsigned int foll_flags; | 992 | unsigned int foll_flags; |
994 | 993 | ||
995 | vma = find_extend_vma(mm, start); | 994 | vma = find_extend_vma(mm, start); |
996 | if (!vma && in_gate_area(tsk, start)) { | 995 | if (!vma && in_gate_area(tsk, start)) { |
997 | unsigned long pg = start & PAGE_MASK; | 996 | unsigned long pg = start & PAGE_MASK; |
998 | struct vm_area_struct *gate_vma = get_gate_vma(tsk); | 997 | struct vm_area_struct *gate_vma = get_gate_vma(tsk); |
999 | pgd_t *pgd; | 998 | pgd_t *pgd; |
1000 | pud_t *pud; | 999 | pud_t *pud; |
1001 | pmd_t *pmd; | 1000 | pmd_t *pmd; |
1002 | pte_t *pte; | 1001 | pte_t *pte; |
1003 | if (write) /* user gate pages are read-only */ | 1002 | if (write) /* user gate pages are read-only */ |
1004 | return i ? : -EFAULT; | 1003 | return i ? : -EFAULT; |
1005 | if (pg > TASK_SIZE) | 1004 | if (pg > TASK_SIZE) |
1006 | pgd = pgd_offset_k(pg); | 1005 | pgd = pgd_offset_k(pg); |
1007 | else | 1006 | else |
1008 | pgd = pgd_offset_gate(mm, pg); | 1007 | pgd = pgd_offset_gate(mm, pg); |
1009 | BUG_ON(pgd_none(*pgd)); | 1008 | BUG_ON(pgd_none(*pgd)); |
1010 | pud = pud_offset(pgd, pg); | 1009 | pud = pud_offset(pgd, pg); |
1011 | BUG_ON(pud_none(*pud)); | 1010 | BUG_ON(pud_none(*pud)); |
1012 | pmd = pmd_offset(pud, pg); | 1011 | pmd = pmd_offset(pud, pg); |
1013 | if (pmd_none(*pmd)) | 1012 | if (pmd_none(*pmd)) |
1014 | return i ? : -EFAULT; | 1013 | return i ? : -EFAULT; |
1015 | pte = pte_offset_map(pmd, pg); | 1014 | pte = pte_offset_map(pmd, pg); |
1016 | if (pte_none(*pte)) { | 1015 | if (pte_none(*pte)) { |
1017 | pte_unmap(pte); | 1016 | pte_unmap(pte); |
1018 | return i ? : -EFAULT; | 1017 | return i ? : -EFAULT; |
1019 | } | 1018 | } |
1020 | if (pages) { | 1019 | if (pages) { |
1021 | struct page *page = vm_normal_page(gate_vma, start, *pte); | 1020 | struct page *page = vm_normal_page(gate_vma, start, *pte); |
1022 | pages[i] = page; | 1021 | pages[i] = page; |
1023 | if (page) | 1022 | if (page) |
1024 | get_page(page); | 1023 | get_page(page); |
1025 | } | 1024 | } |
1026 | pte_unmap(pte); | 1025 | pte_unmap(pte); |
1027 | if (vmas) | 1026 | if (vmas) |
1028 | vmas[i] = gate_vma; | 1027 | vmas[i] = gate_vma; |
1029 | i++; | 1028 | i++; |
1030 | start += PAGE_SIZE; | 1029 | start += PAGE_SIZE; |
1031 | len--; | 1030 | len--; |
1032 | continue; | 1031 | continue; |
1033 | } | 1032 | } |
1034 | 1033 | ||
1035 | if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 1034 | if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) |
1036 | || !(vm_flags & vma->vm_flags)) | 1035 | || !(vm_flags & vma->vm_flags)) |
1037 | return i ? : -EFAULT; | 1036 | return i ? : -EFAULT; |
1038 | 1037 | ||
1039 | if (is_vm_hugetlb_page(vma)) { | 1038 | if (is_vm_hugetlb_page(vma)) { |
1040 | i = follow_hugetlb_page(mm, vma, pages, vmas, | 1039 | i = follow_hugetlb_page(mm, vma, pages, vmas, |
1041 | &start, &len, i, write); | 1040 | &start, &len, i, write); |
1042 | continue; | 1041 | continue; |
1043 | } | 1042 | } |
1044 | 1043 | ||
1045 | foll_flags = FOLL_TOUCH; | 1044 | foll_flags = FOLL_TOUCH; |
1046 | if (pages) | 1045 | if (pages) |
1047 | foll_flags |= FOLL_GET; | 1046 | foll_flags |= FOLL_GET; |
1048 | if (!write && !(vma->vm_flags & VM_LOCKED) && | 1047 | if (!write && !(vma->vm_flags & VM_LOCKED) && |
1049 | (!vma->vm_ops || (!vma->vm_ops->nopage && | 1048 | (!vma->vm_ops || (!vma->vm_ops->nopage && |
1050 | !vma->vm_ops->fault))) | 1049 | !vma->vm_ops->fault))) |
1051 | foll_flags |= FOLL_ANON; | 1050 | foll_flags |= FOLL_ANON; |
1052 | 1051 | ||
1053 | do { | 1052 | do { |
1054 | struct page *page; | 1053 | struct page *page; |
1055 | 1054 | ||
1056 | /* | 1055 | /* |
1057 | * If tsk is ooming, cut off its access to large memory | 1056 | * If tsk is ooming, cut off its access to large memory |
1058 | * allocations. It has a pending SIGKILL, but it can't | 1057 | * allocations. It has a pending SIGKILL, but it can't |
1059 | * be processed until returning to user space. | 1058 | * be processed until returning to user space. |
1060 | */ | 1059 | */ |
1061 | if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) | 1060 | if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) |
1062 | return -ENOMEM; | 1061 | return -ENOMEM; |
1063 | 1062 | ||
1064 | if (write) | 1063 | if (write) |
1065 | foll_flags |= FOLL_WRITE; | 1064 | foll_flags |= FOLL_WRITE; |
1066 | 1065 | ||
1067 | cond_resched(); | 1066 | cond_resched(); |
1068 | while (!(page = follow_page(vma, start, foll_flags))) { | 1067 | while (!(page = follow_page(vma, start, foll_flags))) { |
1069 | int ret; | 1068 | int ret; |
1070 | ret = handle_mm_fault(mm, vma, start, | 1069 | ret = handle_mm_fault(mm, vma, start, |
1071 | foll_flags & FOLL_WRITE); | 1070 | foll_flags & FOLL_WRITE); |
1072 | if (ret & VM_FAULT_ERROR) { | 1071 | if (ret & VM_FAULT_ERROR) { |
1073 | if (ret & VM_FAULT_OOM) | 1072 | if (ret & VM_FAULT_OOM) |
1074 | return i ? i : -ENOMEM; | 1073 | return i ? i : -ENOMEM; |
1075 | else if (ret & VM_FAULT_SIGBUS) | 1074 | else if (ret & VM_FAULT_SIGBUS) |
1076 | return i ? i : -EFAULT; | 1075 | return i ? i : -EFAULT; |
1077 | BUG(); | 1076 | BUG(); |
1078 | } | 1077 | } |
1079 | if (ret & VM_FAULT_MAJOR) | 1078 | if (ret & VM_FAULT_MAJOR) |
1080 | tsk->maj_flt++; | 1079 | tsk->maj_flt++; |
1081 | else | 1080 | else |
1082 | tsk->min_flt++; | 1081 | tsk->min_flt++; |
1083 | 1082 | ||
1084 | /* | 1083 | /* |
1085 | * The VM_FAULT_WRITE bit tells us that | 1084 | * The VM_FAULT_WRITE bit tells us that |
1086 | * do_wp_page has broken COW when necessary, | 1085 | * do_wp_page has broken COW when necessary, |
1087 | * even if maybe_mkwrite decided not to set | 1086 | * even if maybe_mkwrite decided not to set |
1088 | * pte_write. We can thus safely do subsequent | 1087 | * pte_write. We can thus safely do subsequent |
1089 | * page lookups as if they were reads. | 1088 | * page lookups as if they were reads. |
1090 | */ | 1089 | */ |
1091 | if (ret & VM_FAULT_WRITE) | 1090 | if (ret & VM_FAULT_WRITE) |
1092 | foll_flags &= ~FOLL_WRITE; | 1091 | foll_flags &= ~FOLL_WRITE; |
1093 | 1092 | ||
1094 | cond_resched(); | 1093 | cond_resched(); |
1095 | } | 1094 | } |
1096 | if (pages) { | 1095 | if (pages) { |
1097 | pages[i] = page; | 1096 | pages[i] = page; |
1098 | 1097 | ||
1099 | flush_anon_page(vma, page, start); | 1098 | flush_anon_page(vma, page, start); |
1100 | flush_dcache_page(page); | 1099 | flush_dcache_page(page); |
1101 | } | 1100 | } |
1102 | if (vmas) | 1101 | if (vmas) |
1103 | vmas[i] = vma; | 1102 | vmas[i] = vma; |
1104 | i++; | 1103 | i++; |
1105 | start += PAGE_SIZE; | 1104 | start += PAGE_SIZE; |
1106 | len--; | 1105 | len--; |
1107 | } while (len && start < vma->vm_end); | 1106 | } while (len && start < vma->vm_end); |
1108 | } while (len); | 1107 | } while (len); |
1109 | return i; | 1108 | return i; |
1110 | } | 1109 | } |
1111 | EXPORT_SYMBOL(get_user_pages); | 1110 | EXPORT_SYMBOL(get_user_pages); |
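Since get_user_pages() is exported, drivers of this era call it to pin user memory for I/O. The sketch below shows the usual calling convention for the signature above — mmap_sem held for read, one page, no force. It is kernel code that only builds in-tree, and the wrapper name and the simplified error handling are invented for the example.

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/sched.h>

/* Pin a single user page so it can be used as an I/O buffer. */
static int pin_one_user_page(unsigned long uaddr, int write,
                             struct page **page)
{
        int ret;

        down_read(&current->mm->mmap_sem);      /* gup walks our page tables */
        ret = get_user_pages(current, current->mm,
                             uaddr & PAGE_MASK, 1,     /* one page */
                             write, 0,                 /* no force */
                             page, NULL);
        up_read(&current->mm->mmap_sem);

        return ret == 1 ? 0 : -EFAULT;
}

/*
 * When the I/O completes the caller undoes the pin:
 *
 *      if (write)
 *              set_page_dirty_lock(page);
 *      put_page(page);
 */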
1112 | 1111 | ||
1113 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) | 1112 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) |
1114 | { | 1113 | { |
1115 | pgd_t * pgd = pgd_offset(mm, addr); | 1114 | pgd_t * pgd = pgd_offset(mm, addr); |
1116 | pud_t * pud = pud_alloc(mm, pgd, addr); | 1115 | pud_t * pud = pud_alloc(mm, pgd, addr); |
1117 | if (pud) { | 1116 | if (pud) { |
1118 | pmd_t * pmd = pmd_alloc(mm, pud, addr); | 1117 | pmd_t * pmd = pmd_alloc(mm, pud, addr); |
1119 | if (pmd) | 1118 | if (pmd) |
1120 | return pte_alloc_map_lock(mm, pmd, addr, ptl); | 1119 | return pte_alloc_map_lock(mm, pmd, addr, ptl); |
1121 | } | 1120 | } |
1122 | return NULL; | 1121 | return NULL; |
1123 | } | 1122 | } |
1124 | 1123 | ||
1125 | /* | 1124 | /* |
1126 | * This is the old fallback for page remapping. | 1125 | * This is the old fallback for page remapping. |
1127 | * | 1126 | * |
1128 | * For historical reasons, it only allows reserved pages. Only | 1127 | * For historical reasons, it only allows reserved pages. Only |
1129 | * old drivers should use this, and they needed to mark their | 1128 | * old drivers should use this, and they needed to mark their |
1130 | * pages reserved for the old functions anyway. | 1129 | * pages reserved for the old functions anyway. |
1131 | */ | 1130 | */ |
1132 | static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) | 1131 | static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) |
1133 | { | 1132 | { |
1134 | int retval; | 1133 | int retval; |
1135 | pte_t *pte; | 1134 | pte_t *pte; |
1136 | spinlock_t *ptl; | 1135 | spinlock_t *ptl; |
1137 | 1136 | ||
1138 | retval = -EINVAL; | 1137 | retval = -EINVAL; |
1139 | if (PageAnon(page)) | 1138 | if (PageAnon(page)) |
1140 | goto out; | 1139 | goto out; |
1141 | retval = -ENOMEM; | 1140 | retval = -ENOMEM; |
1142 | flush_dcache_page(page); | 1141 | flush_dcache_page(page); |
1143 | pte = get_locked_pte(mm, addr, &ptl); | 1142 | pte = get_locked_pte(mm, addr, &ptl); |
1144 | if (!pte) | 1143 | if (!pte) |
1145 | goto out; | 1144 | goto out; |
1146 | retval = -EBUSY; | 1145 | retval = -EBUSY; |
1147 | if (!pte_none(*pte)) | 1146 | if (!pte_none(*pte)) |
1148 | goto out_unlock; | 1147 | goto out_unlock; |
1149 | 1148 | ||
1150 | /* Ok, finally just insert the thing.. */ | 1149 | /* Ok, finally just insert the thing.. */ |
1151 | get_page(page); | 1150 | get_page(page); |
1152 | inc_mm_counter(mm, file_rss); | 1151 | inc_mm_counter(mm, file_rss); |
1153 | page_add_file_rmap(page); | 1152 | page_add_file_rmap(page); |
1154 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 1153 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
1155 | 1154 | ||
1156 | retval = 0; | 1155 | retval = 0; |
1157 | out_unlock: | 1156 | out_unlock: |
1158 | pte_unmap_unlock(pte, ptl); | 1157 | pte_unmap_unlock(pte, ptl); |
1159 | out: | 1158 | out: |
1160 | return retval; | 1159 | return retval; |
1161 | } | 1160 | } |
1162 | 1161 | ||
1163 | /** | 1162 | /** |
1164 | * vm_insert_page - insert single page into user vma | 1163 | * vm_insert_page - insert single page into user vma |
1165 | * @vma: user vma to map to | 1164 | * @vma: user vma to map to |
1166 | * @addr: target user address of this page | 1165 | * @addr: target user address of this page |
1167 | * @page: source kernel page | 1166 | * @page: source kernel page |
1168 | * | 1167 | * |
1169 | * This allows drivers to insert individual pages they've allocated | 1168 | * This allows drivers to insert individual pages they've allocated |
1170 | * into a user vma. | 1169 | * into a user vma. |
1171 | * | 1170 | * |
1172 | * The page has to be a nice clean _individual_ kernel allocation. | 1171 | * The page has to be a nice clean _individual_ kernel allocation. |
1173 | * If you allocate a compound page, you need to have marked it as | 1172 | * If you allocate a compound page, you need to have marked it as |
1174 | * such (__GFP_COMP), or manually just split the page up yourself | 1173 | * such (__GFP_COMP), or manually just split the page up yourself |
1175 | * (see split_page()). | 1174 | * (see split_page()). |
1176 | * | 1175 | * |
1177 | * NOTE! Traditionally this was done with "remap_pfn_range()" which | 1176 | * NOTE! Traditionally this was done with "remap_pfn_range()" which |
1178 | * took an arbitrary page protection parameter. This doesn't allow | 1177 | * took an arbitrary page protection parameter. This doesn't allow |
1179 | * that. Your vma protection will have to be set up correctly, which | 1178 | * that. Your vma protection will have to be set up correctly, which |
1180 | * means that if you want a shared writable mapping, you'd better | 1179 | * means that if you want a shared writable mapping, you'd better |
1181 | * ask for a shared writable mapping! | 1180 | * ask for a shared writable mapping! |
1182 | * | 1181 | * |
1183 | * The page does not need to be reserved. | 1182 | * The page does not need to be reserved. |
1184 | */ | 1183 | */ |
1185 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) | 1184 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) |
1186 | { | 1185 | { |
1187 | if (addr < vma->vm_start || addr >= vma->vm_end) | 1186 | if (addr < vma->vm_start || addr >= vma->vm_end) |
1188 | return -EFAULT; | 1187 | return -EFAULT; |
1189 | if (!page_count(page)) | 1188 | if (!page_count(page)) |
1190 | return -EINVAL; | 1189 | return -EINVAL; |
1191 | vma->vm_flags |= VM_INSERTPAGE; | 1190 | vma->vm_flags |= VM_INSERTPAGE; |
1192 | return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); | 1191 | return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); |
1193 | } | 1192 | } |
1194 | EXPORT_SYMBOL(vm_insert_page); | 1193 | EXPORT_SYMBOL(vm_insert_page); |
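A minimal usage sketch (not part of the diff): a driver ->mmap handler mapping a single kernel page with vm_insert_page(). The names mydrv_mmap and mydrv_page are hypothetical, and the page is assumed to have been allocated elsewhere with alloc_page(GFP_KERNEL).

    #include <linux/fs.h>
    #include <linux/mm.h>

    static struct page *mydrv_page;	/* assumed: one page from alloc_page() */

    static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
    {
    	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
    		return -EINVAL;

    	/* Insert the kernel page at the start of the user vma. */
    	return vm_insert_page(vma, vma->vm_start, mydrv_page);
    }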
1195 | 1194 | ||
1196 | /** | 1195 | /** |
1197 | * vm_insert_pfn - insert single pfn into user vma | 1196 | * vm_insert_pfn - insert single pfn into user vma |
1198 | * @vma: user vma to map to | 1197 | * @vma: user vma to map to |
1199 | * @addr: target user address of this page | 1198 | * @addr: target user address of this page |
1200 | * @pfn: source kernel pfn | 1199 | * @pfn: source kernel pfn |
1201 | * | 1200 | * |
1202 | * Similar to vm_insert_page, this allows drivers to insert individual pages | 1201 | * Similar to vm_insert_page, this allows drivers to insert individual pages |
1203 | * they've allocated into a user vma. Same comments apply. | 1202 | * they've allocated into a user vma. Same comments apply. |
1204 | * | 1203 | * |
1205 | * This function should only be called from a vm_ops->fault handler, and | 1204 | * This function should only be called from a vm_ops->fault handler, and |
1206 | * in that case the handler should return NULL. | 1205 | * in that case the handler should return NULL. |
1207 | */ | 1206 | */ |
1208 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 1207 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
1209 | unsigned long pfn) | 1208 | unsigned long pfn) |
1210 | { | 1209 | { |
1211 | struct mm_struct *mm = vma->vm_mm; | 1210 | struct mm_struct *mm = vma->vm_mm; |
1212 | int retval; | 1211 | int retval; |
1213 | pte_t *pte, entry; | 1212 | pte_t *pte, entry; |
1214 | spinlock_t *ptl; | 1213 | spinlock_t *ptl; |
1215 | 1214 | ||
1216 | BUG_ON(!(vma->vm_flags & VM_PFNMAP)); | 1215 | BUG_ON(!(vma->vm_flags & VM_PFNMAP)); |
1217 | BUG_ON(is_cow_mapping(vma->vm_flags)); | 1216 | BUG_ON(is_cow_mapping(vma->vm_flags)); |
1218 | 1217 | ||
1219 | retval = -ENOMEM; | 1218 | retval = -ENOMEM; |
1220 | pte = get_locked_pte(mm, addr, &ptl); | 1219 | pte = get_locked_pte(mm, addr, &ptl); |
1221 | if (!pte) | 1220 | if (!pte) |
1222 | goto out; | 1221 | goto out; |
1223 | retval = -EBUSY; | 1222 | retval = -EBUSY; |
1224 | if (!pte_none(*pte)) | 1223 | if (!pte_none(*pte)) |
1225 | goto out_unlock; | 1224 | goto out_unlock; |
1226 | 1225 | ||
1227 | /* Ok, finally just insert the thing.. */ | 1226 | /* Ok, finally just insert the thing.. */ |
1228 | entry = pfn_pte(pfn, vma->vm_page_prot); | 1227 | entry = pfn_pte(pfn, vma->vm_page_prot); |
1229 | set_pte_at(mm, addr, pte, entry); | 1228 | set_pte_at(mm, addr, pte, entry); |
1230 | update_mmu_cache(vma, addr, entry); | 1229 | update_mmu_cache(vma, addr, entry); |
1231 | 1230 | ||
1232 | retval = 0; | 1231 | retval = 0; |
1233 | out_unlock: | 1232 | out_unlock: |
1234 | pte_unmap_unlock(pte, ptl); | 1233 | pte_unmap_unlock(pte, ptl); |
1235 | 1234 | ||
1236 | out: | 1235 | out: |
1237 | return retval; | 1236 | return retval; |
1238 | } | 1237 | } |
1239 | EXPORT_SYMBOL(vm_insert_pfn); | 1238 | EXPORT_SYMBOL(vm_insert_pfn); |
1240 | 1239 | ||
1241 | /* | 1240 | /* |
1242 | * maps a range of physical memory into the requested pages. the old | 1241 | * maps a range of physical memory into the requested pages. the old |
1243 | * mappings are removed. any references to nonexistent pages result | 1242 | * mappings are removed. any references to nonexistent pages result |
1244 | * in null mappings (currently treated as "copy-on-access") | 1243 | * in null mappings (currently treated as "copy-on-access") |
1245 | */ | 1244 | */ |
1246 | static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, | 1245 | static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, |
1247 | unsigned long addr, unsigned long end, | 1246 | unsigned long addr, unsigned long end, |
1248 | unsigned long pfn, pgprot_t prot) | 1247 | unsigned long pfn, pgprot_t prot) |
1249 | { | 1248 | { |
1250 | pte_t *pte; | 1249 | pte_t *pte; |
1251 | spinlock_t *ptl; | 1250 | spinlock_t *ptl; |
1252 | 1251 | ||
1253 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | 1252 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1254 | if (!pte) | 1253 | if (!pte) |
1255 | return -ENOMEM; | 1254 | return -ENOMEM; |
1256 | arch_enter_lazy_mmu_mode(); | 1255 | arch_enter_lazy_mmu_mode(); |
1257 | do { | 1256 | do { |
1258 | BUG_ON(!pte_none(*pte)); | 1257 | BUG_ON(!pte_none(*pte)); |
1259 | set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); | 1258 | set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); |
1260 | pfn++; | 1259 | pfn++; |
1261 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1260 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1262 | arch_leave_lazy_mmu_mode(); | 1261 | arch_leave_lazy_mmu_mode(); |
1263 | pte_unmap_unlock(pte - 1, ptl); | 1262 | pte_unmap_unlock(pte - 1, ptl); |
1264 | return 0; | 1263 | return 0; |
1265 | } | 1264 | } |
1266 | 1265 | ||
1267 | static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, | 1266 | static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, |
1268 | unsigned long addr, unsigned long end, | 1267 | unsigned long addr, unsigned long end, |
1269 | unsigned long pfn, pgprot_t prot) | 1268 | unsigned long pfn, pgprot_t prot) |
1270 | { | 1269 | { |
1271 | pmd_t *pmd; | 1270 | pmd_t *pmd; |
1272 | unsigned long next; | 1271 | unsigned long next; |
1273 | 1272 | ||
1274 | pfn -= addr >> PAGE_SHIFT; | 1273 | pfn -= addr >> PAGE_SHIFT; |
1275 | pmd = pmd_alloc(mm, pud, addr); | 1274 | pmd = pmd_alloc(mm, pud, addr); |
1276 | if (!pmd) | 1275 | if (!pmd) |
1277 | return -ENOMEM; | 1276 | return -ENOMEM; |
1278 | do { | 1277 | do { |
1279 | next = pmd_addr_end(addr, end); | 1278 | next = pmd_addr_end(addr, end); |
1280 | if (remap_pte_range(mm, pmd, addr, next, | 1279 | if (remap_pte_range(mm, pmd, addr, next, |
1281 | pfn + (addr >> PAGE_SHIFT), prot)) | 1280 | pfn + (addr >> PAGE_SHIFT), prot)) |
1282 | return -ENOMEM; | 1281 | return -ENOMEM; |
1283 | } while (pmd++, addr = next, addr != end); | 1282 | } while (pmd++, addr = next, addr != end); |
1284 | return 0; | 1283 | return 0; |
1285 | } | 1284 | } |
1286 | 1285 | ||
1287 | static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, | 1286 | static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, |
1288 | unsigned long addr, unsigned long end, | 1287 | unsigned long addr, unsigned long end, |
1289 | unsigned long pfn, pgprot_t prot) | 1288 | unsigned long pfn, pgprot_t prot) |
1290 | { | 1289 | { |
1291 | pud_t *pud; | 1290 | pud_t *pud; |
1292 | unsigned long next; | 1291 | unsigned long next; |
1293 | 1292 | ||
1294 | pfn -= addr >> PAGE_SHIFT; | 1293 | pfn -= addr >> PAGE_SHIFT; |
1295 | pud = pud_alloc(mm, pgd, addr); | 1294 | pud = pud_alloc(mm, pgd, addr); |
1296 | if (!pud) | 1295 | if (!pud) |
1297 | return -ENOMEM; | 1296 | return -ENOMEM; |
1298 | do { | 1297 | do { |
1299 | next = pud_addr_end(addr, end); | 1298 | next = pud_addr_end(addr, end); |
1300 | if (remap_pmd_range(mm, pud, addr, next, | 1299 | if (remap_pmd_range(mm, pud, addr, next, |
1301 | pfn + (addr >> PAGE_SHIFT), prot)) | 1300 | pfn + (addr >> PAGE_SHIFT), prot)) |
1302 | return -ENOMEM; | 1301 | return -ENOMEM; |
1303 | } while (pud++, addr = next, addr != end); | 1302 | } while (pud++, addr = next, addr != end); |
1304 | return 0; | 1303 | return 0; |
1305 | } | 1304 | } |
1306 | 1305 | ||
1307 | /** | 1306 | /** |
1308 | * remap_pfn_range - remap kernel memory to userspace | 1307 | * remap_pfn_range - remap kernel memory to userspace |
1309 | * @vma: user vma to map to | 1308 | * @vma: user vma to map to |
1310 | * @addr: target user address to start at | 1309 | * @addr: target user address to start at |
1311 | * @pfn: physical address of kernel memory | 1310 | * @pfn: physical address of kernel memory |
1312 | * @size: size of map area | 1311 | * @size: size of map area |
1313 | * @prot: page protection flags for this mapping | 1312 | * @prot: page protection flags for this mapping |
1314 | * | 1313 | * |
1315 | * Note: this is only safe if the mm semaphore is held when called. | 1314 | * Note: this is only safe if the mm semaphore is held when called. |
1316 | */ | 1315 | */ |
1317 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | 1316 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, |
1318 | unsigned long pfn, unsigned long size, pgprot_t prot) | 1317 | unsigned long pfn, unsigned long size, pgprot_t prot) |
1319 | { | 1318 | { |
1320 | pgd_t *pgd; | 1319 | pgd_t *pgd; |
1321 | unsigned long next; | 1320 | unsigned long next; |
1322 | unsigned long end = addr + PAGE_ALIGN(size); | 1321 | unsigned long end = addr + PAGE_ALIGN(size); |
1323 | struct mm_struct *mm = vma->vm_mm; | 1322 | struct mm_struct *mm = vma->vm_mm; |
1324 | int err; | 1323 | int err; |
1325 | 1324 | ||
1326 | /* | 1325 | /* |
1327 | * Physically remapped pages are special. Tell the | 1326 | * Physically remapped pages are special. Tell the |
1328 | * rest of the world about it: | 1327 | * rest of the world about it: |
1329 | * VM_IO tells people not to look at these pages | 1328 | * VM_IO tells people not to look at these pages |
1330 | * (accesses can have side effects). | 1329 | * (accesses can have side effects). |
1331 | * VM_RESERVED is specified all over the place, because | 1330 | * VM_RESERVED is specified all over the place, because |
1332 | * in 2.4 it kept swapout's vma scan off this vma; but | 1331 | * in 2.4 it kept swapout's vma scan off this vma; but |
1333 | * in 2.6 the LRU scan won't even find its pages, so this | 1332 | * in 2.6 the LRU scan won't even find its pages, so this |
1334 | * flag means no more than count its pages in reserved_vm, | 1333 | * flag means no more than count its pages in reserved_vm, |
1335 | * and omit it from core dump, even when VM_IO turned off. | 1334 | * and omit it from core dump, even when VM_IO turned off. |
1336 | * VM_PFNMAP tells the core MM that the base pages are just | 1335 | * VM_PFNMAP tells the core MM that the base pages are just |
1337 | * raw PFN mappings, and do not have a "struct page" associated | 1336 | * raw PFN mappings, and do not have a "struct page" associated |
1338 | * with them. | 1337 | * with them. |
1339 | * | 1338 | * |
1340 | * There's a horrible special case to handle copy-on-write | 1339 | * There's a horrible special case to handle copy-on-write |
1341 | * behaviour that some programs depend on. We mark the "original" | 1340 | * behaviour that some programs depend on. We mark the "original" |
1342 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". | 1341 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". |
1343 | */ | 1342 | */ |
1344 | if (is_cow_mapping(vma->vm_flags)) { | 1343 | if (is_cow_mapping(vma->vm_flags)) { |
1345 | if (addr != vma->vm_start || end != vma->vm_end) | 1344 | if (addr != vma->vm_start || end != vma->vm_end) |
1346 | return -EINVAL; | 1345 | return -EINVAL; |
1347 | vma->vm_pgoff = pfn; | 1346 | vma->vm_pgoff = pfn; |
1348 | } | 1347 | } |
1349 | 1348 | ||
1350 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | 1349 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; |
1351 | 1350 | ||
1352 | BUG_ON(addr >= end); | 1351 | BUG_ON(addr >= end); |
1353 | pfn -= addr >> PAGE_SHIFT; | 1352 | pfn -= addr >> PAGE_SHIFT; |
1354 | pgd = pgd_offset(mm, addr); | 1353 | pgd = pgd_offset(mm, addr); |
1355 | flush_cache_range(vma, addr, end); | 1354 | flush_cache_range(vma, addr, end); |
1356 | do { | 1355 | do { |
1357 | next = pgd_addr_end(addr, end); | 1356 | next = pgd_addr_end(addr, end); |
1358 | err = remap_pud_range(mm, pgd, addr, next, | 1357 | err = remap_pud_range(mm, pgd, addr, next, |
1359 | pfn + (addr >> PAGE_SHIFT), prot); | 1358 | pfn + (addr >> PAGE_SHIFT), prot); |
1360 | if (err) | 1359 | if (err) |
1361 | break; | 1360 | break; |
1362 | } while (pgd++, addr = next, addr != end); | 1361 | } while (pgd++, addr = next, addr != end); |
1363 | return err; | 1362 | return err; |
1364 | } | 1363 | } |
1365 | EXPORT_SYMBOL(remap_pfn_range); | 1364 | EXPORT_SYMBOL(remap_pfn_range); |
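A minimal usage sketch (not part of the diff): the traditional remap_pfn_range() call from a driver ->mmap handler. mydrv_mmap, mydrv_buf_phys and mydrv_buf_size are hypothetical names for a physically contiguous buffer owned by the driver; mmap_sem is already held by the mmap() path, as the note above requires.

    static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
    {
    	unsigned long size = vma->vm_end - vma->vm_start;

    	if (size > mydrv_buf_size)
    		return -EINVAL;

    	/* Map the buffer's pfns with the vma's own protection. */
    	return remap_pfn_range(vma, vma->vm_start,
    			       mydrv_buf_phys >> PAGE_SHIFT,
    			       size, vma->vm_page_prot);
    }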
1366 | 1365 | ||
1367 | static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | 1366 | static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, |
1368 | unsigned long addr, unsigned long end, | 1367 | unsigned long addr, unsigned long end, |
1369 | pte_fn_t fn, void *data) | 1368 | pte_fn_t fn, void *data) |
1370 | { | 1369 | { |
1371 | pte_t *pte; | 1370 | pte_t *pte; |
1372 | int err; | 1371 | int err; |
1373 | struct page *pmd_page; | 1372 | struct page *pmd_page; |
1374 | spinlock_t *uninitialized_var(ptl); | 1373 | spinlock_t *uninitialized_var(ptl); |
1375 | 1374 | ||
1376 | pte = (mm == &init_mm) ? | 1375 | pte = (mm == &init_mm) ? |
1377 | pte_alloc_kernel(pmd, addr) : | 1376 | pte_alloc_kernel(pmd, addr) : |
1378 | pte_alloc_map_lock(mm, pmd, addr, &ptl); | 1377 | pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1379 | if (!pte) | 1378 | if (!pte) |
1380 | return -ENOMEM; | 1379 | return -ENOMEM; |
1381 | 1380 | ||
1382 | BUG_ON(pmd_huge(*pmd)); | 1381 | BUG_ON(pmd_huge(*pmd)); |
1383 | 1382 | ||
1384 | pmd_page = pmd_page(*pmd); | 1383 | pmd_page = pmd_page(*pmd); |
1385 | 1384 | ||
1386 | do { | 1385 | do { |
1387 | err = fn(pte, pmd_page, addr, data); | 1386 | err = fn(pte, pmd_page, addr, data); |
1388 | if (err) | 1387 | if (err) |
1389 | break; | 1388 | break; |
1390 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1389 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1391 | 1390 | ||
1392 | if (mm != &init_mm) | 1391 | if (mm != &init_mm) |
1393 | pte_unmap_unlock(pte-1, ptl); | 1392 | pte_unmap_unlock(pte-1, ptl); |
1394 | return err; | 1393 | return err; |
1395 | } | 1394 | } |
1396 | 1395 | ||
1397 | static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, | 1396 | static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, |
1398 | unsigned long addr, unsigned long end, | 1397 | unsigned long addr, unsigned long end, |
1399 | pte_fn_t fn, void *data) | 1398 | pte_fn_t fn, void *data) |
1400 | { | 1399 | { |
1401 | pmd_t *pmd; | 1400 | pmd_t *pmd; |
1402 | unsigned long next; | 1401 | unsigned long next; |
1403 | int err; | 1402 | int err; |
1404 | 1403 | ||
1405 | pmd = pmd_alloc(mm, pud, addr); | 1404 | pmd = pmd_alloc(mm, pud, addr); |
1406 | if (!pmd) | 1405 | if (!pmd) |
1407 | return -ENOMEM; | 1406 | return -ENOMEM; |
1408 | do { | 1407 | do { |
1409 | next = pmd_addr_end(addr, end); | 1408 | next = pmd_addr_end(addr, end); |
1410 | err = apply_to_pte_range(mm, pmd, addr, next, fn, data); | 1409 | err = apply_to_pte_range(mm, pmd, addr, next, fn, data); |
1411 | if (err) | 1410 | if (err) |
1412 | break; | 1411 | break; |
1413 | } while (pmd++, addr = next, addr != end); | 1412 | } while (pmd++, addr = next, addr != end); |
1414 | return err; | 1413 | return err; |
1415 | } | 1414 | } |
1416 | 1415 | ||
1417 | static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, | 1416 | static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, |
1418 | unsigned long addr, unsigned long end, | 1417 | unsigned long addr, unsigned long end, |
1419 | pte_fn_t fn, void *data) | 1418 | pte_fn_t fn, void *data) |
1420 | { | 1419 | { |
1421 | pud_t *pud; | 1420 | pud_t *pud; |
1422 | unsigned long next; | 1421 | unsigned long next; |
1423 | int err; | 1422 | int err; |
1424 | 1423 | ||
1425 | pud = pud_alloc(mm, pgd, addr); | 1424 | pud = pud_alloc(mm, pgd, addr); |
1426 | if (!pud) | 1425 | if (!pud) |
1427 | return -ENOMEM; | 1426 | return -ENOMEM; |
1428 | do { | 1427 | do { |
1429 | next = pud_addr_end(addr, end); | 1428 | next = pud_addr_end(addr, end); |
1430 | err = apply_to_pmd_range(mm, pud, addr, next, fn, data); | 1429 | err = apply_to_pmd_range(mm, pud, addr, next, fn, data); |
1431 | if (err) | 1430 | if (err) |
1432 | break; | 1431 | break; |
1433 | } while (pud++, addr = next, addr != end); | 1432 | } while (pud++, addr = next, addr != end); |
1434 | return err; | 1433 | return err; |
1435 | } | 1434 | } |
1436 | 1435 | ||
1437 | /* | 1436 | /* |
1438 | * Scan a region of virtual memory, filling in page tables as necessary | 1437 | * Scan a region of virtual memory, filling in page tables as necessary |
1439 | * and calling a provided function on each leaf page table. | 1438 | * and calling a provided function on each leaf page table. |
1440 | */ | 1439 | */ |
1441 | int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | 1440 | int apply_to_page_range(struct mm_struct *mm, unsigned long addr, |
1442 | unsigned long size, pte_fn_t fn, void *data) | 1441 | unsigned long size, pte_fn_t fn, void *data) |
1443 | { | 1442 | { |
1444 | pgd_t *pgd; | 1443 | pgd_t *pgd; |
1445 | unsigned long next; | 1444 | unsigned long next; |
1446 | unsigned long end = addr + size; | 1445 | unsigned long end = addr + size; |
1447 | int err; | 1446 | int err; |
1448 | 1447 | ||
1449 | BUG_ON(addr >= end); | 1448 | BUG_ON(addr >= end); |
1450 | pgd = pgd_offset(mm, addr); | 1449 | pgd = pgd_offset(mm, addr); |
1451 | do { | 1450 | do { |
1452 | next = pgd_addr_end(addr, end); | 1451 | next = pgd_addr_end(addr, end); |
1453 | err = apply_to_pud_range(mm, pgd, addr, next, fn, data); | 1452 | err = apply_to_pud_range(mm, pgd, addr, next, fn, data); |
1454 | if (err) | 1453 | if (err) |
1455 | break; | 1454 | break; |
1456 | } while (pgd++, addr = next, addr != end); | 1455 | } while (pgd++, addr = next, addr != end); |
1457 | return err; | 1456 | return err; |
1458 | } | 1457 | } |
1459 | EXPORT_SYMBOL_GPL(apply_to_page_range); | 1458 | EXPORT_SYMBOL_GPL(apply_to_page_range); |
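A minimal sketch (not part of the diff) of a pte_fn_t callback for apply_to_page_range(), following the call shape visible above (pte, pmd_page, addr, data). count_present and the variables around the call are hypothetical; for a user mm the caller is assumed to hold that mm's mmap_sem.

    static int count_present(pte_t *pte, struct page *pmd_page,
    			 unsigned long addr, void *data)
    {
    	if (pte_present(*pte))
    		(*(unsigned long *)data)++;
    	return 0;	/* a non-zero return stops the walk early */
    }

    	unsigned long present = 0;
    	int err = apply_to_page_range(mm, start, len, count_present, &present);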
1460 | 1459 | ||
1461 | /* | 1460 | /* |
1462 | * handle_pte_fault chooses page fault handler according to an entry | 1461 | * handle_pte_fault chooses page fault handler according to an entry |
1463 | * which was read non-atomically. Before making any commitment, on | 1462 | * which was read non-atomically. Before making any commitment, on |
1464 | * those architectures or configurations (e.g. i386 with PAE) which | 1463 | * those architectures or configurations (e.g. i386 with PAE) which |
1465 | * might give a mix of unmatched parts, do_swap_page and do_file_page | 1464 | * might give a mix of unmatched parts, do_swap_page and do_file_page |
1466 | * must check under lock before unmapping the pte and proceeding | 1465 | * must check under lock before unmapping the pte and proceeding |
1467 | * (but do_wp_page is only called after already making such a check; | 1466 | * (but do_wp_page is only called after already making such a check; |
1468 | * and do_anonymous_page and do_no_page can safely check later on). | 1467 | * and do_anonymous_page and do_no_page can safely check later on). |
1469 | */ | 1468 | */ |
1470 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | 1469 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, |
1471 | pte_t *page_table, pte_t orig_pte) | 1470 | pte_t *page_table, pte_t orig_pte) |
1472 | { | 1471 | { |
1473 | int same = 1; | 1472 | int same = 1; |
1474 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) | 1473 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) |
1475 | if (sizeof(pte_t) > sizeof(unsigned long)) { | 1474 | if (sizeof(pte_t) > sizeof(unsigned long)) { |
1476 | spinlock_t *ptl = pte_lockptr(mm, pmd); | 1475 | spinlock_t *ptl = pte_lockptr(mm, pmd); |
1477 | spin_lock(ptl); | 1476 | spin_lock(ptl); |
1478 | same = pte_same(*page_table, orig_pte); | 1477 | same = pte_same(*page_table, orig_pte); |
1479 | spin_unlock(ptl); | 1478 | spin_unlock(ptl); |
1480 | } | 1479 | } |
1481 | #endif | 1480 | #endif |
1482 | pte_unmap(page_table); | 1481 | pte_unmap(page_table); |
1483 | return same; | 1482 | return same; |
1484 | } | 1483 | } |
1485 | 1484 | ||
1486 | /* | 1485 | /* |
1487 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when | 1486 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when |
1488 | * servicing faults for write access. In the normal case, we always want | 1487 | * servicing faults for write access. In the normal case, we always want |
1489 | * pte_mkwrite. But get_user_pages can cause write faults for mappings | 1488 | * pte_mkwrite. But get_user_pages can cause write faults for mappings |
1490 | * that do not have writing enabled, when used by access_process_vm. | 1489 | * that do not have writing enabled, when used by access_process_vm. |
1491 | */ | 1490 | */ |
1492 | static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | 1491 | static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) |
1493 | { | 1492 | { |
1494 | if (likely(vma->vm_flags & VM_WRITE)) | 1493 | if (likely(vma->vm_flags & VM_WRITE)) |
1495 | pte = pte_mkwrite(pte); | 1494 | pte = pte_mkwrite(pte); |
1496 | return pte; | 1495 | return pte; |
1497 | } | 1496 | } |
1498 | 1497 | ||
1499 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 1498 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
1500 | { | 1499 | { |
1501 | /* | 1500 | /* |
1502 | * If the source page was a PFN mapping, we don't have | 1501 | * If the source page was a PFN mapping, we don't have |
1503 | * a "struct page" for it. We do a best-effort copy by | 1502 | * a "struct page" for it. We do a best-effort copy by |
1504 | * just copying from the original user address. If that | 1503 | * just copying from the original user address. If that |
1505 | * fails, we just zero-fill it. Live with it. | 1504 | * fails, we just zero-fill it. Live with it. |
1506 | */ | 1505 | */ |
1507 | if (unlikely(!src)) { | 1506 | if (unlikely(!src)) { |
1508 | void *kaddr = kmap_atomic(dst, KM_USER0); | 1507 | void *kaddr = kmap_atomic(dst, KM_USER0); |
1509 | void __user *uaddr = (void __user *)(va & PAGE_MASK); | 1508 | void __user *uaddr = (void __user *)(va & PAGE_MASK); |
1510 | 1509 | ||
1511 | /* | 1510 | /* |
1512 | * This really shouldn't fail, because the page is there | 1511 | * This really shouldn't fail, because the page is there |
1513 | * in the page tables. But it might just be unreadable, | 1512 | * in the page tables. But it might just be unreadable, |
1514 | * in which case we just give up and fill the result with | 1513 | * in which case we just give up and fill the result with |
1515 | * zeroes. | 1514 | * zeroes. |
1516 | */ | 1515 | */ |
1517 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) | 1516 | if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) |
1518 | memset(kaddr, 0, PAGE_SIZE); | 1517 | memset(kaddr, 0, PAGE_SIZE); |
1519 | kunmap_atomic(kaddr, KM_USER0); | 1518 | kunmap_atomic(kaddr, KM_USER0); |
1520 | flush_dcache_page(dst); | 1519 | flush_dcache_page(dst); |
1521 | return; | 1520 | return; |
1522 | 1521 | ||
1523 | } | 1522 | } |
1524 | copy_user_highpage(dst, src, va, vma); | 1523 | copy_user_highpage(dst, src, va, vma); |
1525 | } | 1524 | } |
1526 | 1525 | ||
1527 | /* | 1526 | /* |
1528 | * This routine handles present pages, when users try to write | 1527 | * This routine handles present pages, when users try to write |
1529 | * to a shared page. It is done by copying the page to a new address | 1528 | * to a shared page. It is done by copying the page to a new address |
1530 | * and decrementing the shared-page counter for the old page. | 1529 | * and decrementing the shared-page counter for the old page. |
1531 | * | 1530 | * |
1532 | * Note that this routine assumes that the protection checks have been | 1531 | * Note that this routine assumes that the protection checks have been |
1533 | * done by the caller (the low-level page fault routine in most cases). | 1532 | * done by the caller (the low-level page fault routine in most cases). |
1534 | * Thus we can safely just mark it writable once we've done any necessary | 1533 | * Thus we can safely just mark it writable once we've done any necessary |
1535 | * COW. | 1534 | * COW. |
1536 | * | 1535 | * |
1537 | * We also mark the page dirty at this point even though the page will | 1536 | * We also mark the page dirty at this point even though the page will |
1538 | * change only once the write actually happens. This avoids a few races, | 1537 | * change only once the write actually happens. This avoids a few races, |
1539 | * and potentially makes it more efficient. | 1538 | * and potentially makes it more efficient. |
1540 | * | 1539 | * |
1541 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 1540 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
1542 | * but allow concurrent faults), with pte both mapped and locked. | 1541 | * but allow concurrent faults), with pte both mapped and locked. |
1543 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 1542 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
1544 | */ | 1543 | */ |
1545 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1544 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1546 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 1545 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
1547 | spinlock_t *ptl, pte_t orig_pte) | 1546 | spinlock_t *ptl, pte_t orig_pte) |
1548 | { | 1547 | { |
1549 | struct page *old_page, *new_page; | 1548 | struct page *old_page, *new_page; |
1550 | pte_t entry; | 1549 | pte_t entry; |
1551 | int reuse = 0, ret = 0; | 1550 | int reuse = 0, ret = 0; |
1552 | int page_mkwrite = 0; | 1551 | int page_mkwrite = 0; |
1553 | struct page *dirty_page = NULL; | 1552 | struct page *dirty_page = NULL; |
1554 | 1553 | ||
1555 | old_page = vm_normal_page(vma, address, orig_pte); | 1554 | old_page = vm_normal_page(vma, address, orig_pte); |
1556 | if (!old_page) | 1555 | if (!old_page) |
1557 | goto gotten; | 1556 | goto gotten; |
1558 | 1557 | ||
1559 | /* | 1558 | /* |
1560 | * Take out anonymous pages first, anonymous shared vmas are | 1559 | * Take out anonymous pages first, anonymous shared vmas are |
1561 | * not dirty accountable. | 1560 | * not dirty accountable. |
1562 | */ | 1561 | */ |
1563 | if (PageAnon(old_page)) { | 1562 | if (PageAnon(old_page)) { |
1564 | if (!TestSetPageLocked(old_page)) { | 1563 | if (!TestSetPageLocked(old_page)) { |
1565 | reuse = can_share_swap_page(old_page); | 1564 | reuse = can_share_swap_page(old_page); |
1566 | unlock_page(old_page); | 1565 | unlock_page(old_page); |
1567 | } | 1566 | } |
1568 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 1567 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
1569 | (VM_WRITE|VM_SHARED))) { | 1568 | (VM_WRITE|VM_SHARED))) { |
1570 | /* | 1569 | /* |
1571 | * Only catch write-faults on shared writable pages, | 1570 | * Only catch write-faults on shared writable pages, |
1572 | * read-only shared pages can get COWed by | 1571 | * read-only shared pages can get COWed by |
1573 | * get_user_pages(.write=1, .force=1). | 1572 | * get_user_pages(.write=1, .force=1). |
1574 | */ | 1573 | */ |
1575 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 1574 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
1576 | /* | 1575 | /* |
1577 | * Notify the address space that the page is about to | 1576 | * Notify the address space that the page is about to |
1578 | * become writable so that it can prohibit this or wait | 1577 | * become writable so that it can prohibit this or wait |
1579 | * for the page to get into an appropriate state. | 1578 | * for the page to get into an appropriate state. |
1580 | * | 1579 | * |
1581 | * We do this without the lock held, so that it can | 1580 | * We do this without the lock held, so that it can |
1582 | * sleep if it needs to. | 1581 | * sleep if it needs to. |
1583 | */ | 1582 | */ |
1584 | page_cache_get(old_page); | 1583 | page_cache_get(old_page); |
1585 | pte_unmap_unlock(page_table, ptl); | 1584 | pte_unmap_unlock(page_table, ptl); |
1586 | 1585 | ||
1587 | if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) | 1586 | if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) |
1588 | goto unwritable_page; | 1587 | goto unwritable_page; |
1589 | 1588 | ||
1590 | /* | 1589 | /* |
1591 | * Since we dropped the lock we need to revalidate | 1590 | * Since we dropped the lock we need to revalidate |
1592 | * the PTE as someone else may have changed it. If | 1591 | * the PTE as someone else may have changed it. If |
1593 | * they did, we just return, as we can count on the | 1592 | * they did, we just return, as we can count on the |
1594 | * MMU to tell us if they didn't also make it writable. | 1593 | * MMU to tell us if they didn't also make it writable. |
1595 | */ | 1594 | */ |
1596 | page_table = pte_offset_map_lock(mm, pmd, address, | 1595 | page_table = pte_offset_map_lock(mm, pmd, address, |
1597 | &ptl); | 1596 | &ptl); |
1598 | page_cache_release(old_page); | 1597 | page_cache_release(old_page); |
1599 | if (!pte_same(*page_table, orig_pte)) | 1598 | if (!pte_same(*page_table, orig_pte)) |
1600 | goto unlock; | 1599 | goto unlock; |
1601 | 1600 | ||
1602 | page_mkwrite = 1; | 1601 | page_mkwrite = 1; |
1603 | } | 1602 | } |
1604 | dirty_page = old_page; | 1603 | dirty_page = old_page; |
1605 | get_page(dirty_page); | 1604 | get_page(dirty_page); |
1606 | reuse = 1; | 1605 | reuse = 1; |
1607 | } | 1606 | } |
1608 | 1607 | ||
1609 | if (reuse) { | 1608 | if (reuse) { |
1610 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1609 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
1611 | entry = pte_mkyoung(orig_pte); | 1610 | entry = pte_mkyoung(orig_pte); |
1612 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1611 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1613 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 1612 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
1614 | update_mmu_cache(vma, address, entry); | 1613 | update_mmu_cache(vma, address, entry); |
1615 | ret |= VM_FAULT_WRITE; | 1614 | ret |= VM_FAULT_WRITE; |
1616 | goto unlock; | 1615 | goto unlock; |
1617 | } | 1616 | } |
1618 | 1617 | ||
1619 | /* | 1618 | /* |
1620 | * Ok, we need to copy. Oh, well.. | 1619 | * Ok, we need to copy. Oh, well.. |
1621 | */ | 1620 | */ |
1622 | page_cache_get(old_page); | 1621 | page_cache_get(old_page); |
1623 | gotten: | 1622 | gotten: |
1624 | pte_unmap_unlock(page_table, ptl); | 1623 | pte_unmap_unlock(page_table, ptl); |
1625 | 1624 | ||
1626 | if (unlikely(anon_vma_prepare(vma))) | 1625 | if (unlikely(anon_vma_prepare(vma))) |
1627 | goto oom; | 1626 | goto oom; |
1628 | VM_BUG_ON(old_page == ZERO_PAGE(0)); | 1627 | VM_BUG_ON(old_page == ZERO_PAGE(0)); |
1629 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1628 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1630 | if (!new_page) | 1629 | if (!new_page) |
1631 | goto oom; | 1630 | goto oom; |
1632 | cow_user_page(new_page, old_page, address, vma); | 1631 | cow_user_page(new_page, old_page, address, vma); |
1633 | 1632 | ||
1634 | /* | 1633 | /* |
1635 | * Re-check the pte - we dropped the lock | 1634 | * Re-check the pte - we dropped the lock |
1636 | */ | 1635 | */ |
1637 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 1636 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
1638 | if (likely(pte_same(*page_table, orig_pte))) { | 1637 | if (likely(pte_same(*page_table, orig_pte))) { |
1639 | if (old_page) { | 1638 | if (old_page) { |
1640 | page_remove_rmap(old_page, vma); | 1639 | page_remove_rmap(old_page, vma); |
1641 | if (!PageAnon(old_page)) { | 1640 | if (!PageAnon(old_page)) { |
1642 | dec_mm_counter(mm, file_rss); | 1641 | dec_mm_counter(mm, file_rss); |
1643 | inc_mm_counter(mm, anon_rss); | 1642 | inc_mm_counter(mm, anon_rss); |
1644 | } | 1643 | } |
1645 | } else | 1644 | } else |
1646 | inc_mm_counter(mm, anon_rss); | 1645 | inc_mm_counter(mm, anon_rss); |
1647 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1646 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
1648 | entry = mk_pte(new_page, vma->vm_page_prot); | 1647 | entry = mk_pte(new_page, vma->vm_page_prot); |
1649 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1648 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1650 | /* | 1649 | /* |
1651 | * Clear the pte entry and flush it first, before updating the | 1650 | * Clear the pte entry and flush it first, before updating the |
1652 | * pte with the new entry. This will avoid a race condition | 1651 | * pte with the new entry. This will avoid a race condition |
1653 | * seen in the presence of one thread doing SMC and another | 1652 | * seen in the presence of one thread doing SMC and another |
1654 | * thread doing COW. | 1653 | * thread doing COW. |
1655 | */ | 1654 | */ |
1656 | ptep_clear_flush(vma, address, page_table); | 1655 | ptep_clear_flush(vma, address, page_table); |
1657 | set_pte_at(mm, address, page_table, entry); | 1656 | set_pte_at(mm, address, page_table, entry); |
1658 | update_mmu_cache(vma, address, entry); | 1657 | update_mmu_cache(vma, address, entry); |
1659 | lru_cache_add_active(new_page); | 1658 | lru_cache_add_active(new_page); |
1660 | page_add_new_anon_rmap(new_page, vma, address); | 1659 | page_add_new_anon_rmap(new_page, vma, address); |
1661 | 1660 | ||
1662 | /* Free the old page.. */ | 1661 | /* Free the old page.. */ |
1663 | new_page = old_page; | 1662 | new_page = old_page; |
1664 | ret |= VM_FAULT_WRITE; | 1663 | ret |= VM_FAULT_WRITE; |
1665 | } | 1664 | } |
1666 | if (new_page) | 1665 | if (new_page) |
1667 | page_cache_release(new_page); | 1666 | page_cache_release(new_page); |
1668 | if (old_page) | 1667 | if (old_page) |
1669 | page_cache_release(old_page); | 1668 | page_cache_release(old_page); |
1670 | unlock: | 1669 | unlock: |
1671 | pte_unmap_unlock(page_table, ptl); | 1670 | pte_unmap_unlock(page_table, ptl); |
1672 | if (dirty_page) { | 1671 | if (dirty_page) { |
1673 | if (vma->vm_file) | 1672 | if (vma->vm_file) |
1674 | file_update_time(vma->vm_file); | 1673 | file_update_time(vma->vm_file); |
1675 | 1674 | ||
1676 | /* | 1675 | /* |
1677 | * Yes, Virginia, this is actually required to prevent a race | 1676 | * Yes, Virginia, this is actually required to prevent a race |
1678 | * with clear_page_dirty_for_io() from clearing the page dirty | 1677 | * with clear_page_dirty_for_io() from clearing the page dirty |
1679 | * bit after it clears all dirty ptes, but before a racing | 1678 | * bit after it clears all dirty ptes, but before a racing |
1680 | * do_wp_page installs a dirty pte. | 1679 | * do_wp_page installs a dirty pte. |
1681 | * | 1680 | * |
1682 | * do_no_page is protected similarly. | 1681 | * do_no_page is protected similarly. |
1683 | */ | 1682 | */ |
1684 | wait_on_page_locked(dirty_page); | 1683 | wait_on_page_locked(dirty_page); |
1685 | set_page_dirty_balance(dirty_page, page_mkwrite); | 1684 | set_page_dirty_balance(dirty_page, page_mkwrite); |
1686 | put_page(dirty_page); | 1685 | put_page(dirty_page); |
1687 | } | 1686 | } |
1688 | return ret; | 1687 | return ret; |
1689 | oom: | 1688 | oom: |
1690 | if (old_page) | 1689 | if (old_page) |
1691 | page_cache_release(old_page); | 1690 | page_cache_release(old_page); |
1692 | return VM_FAULT_OOM; | 1691 | return VM_FAULT_OOM; |
1693 | 1692 | ||
1694 | unwritable_page: | 1693 | unwritable_page: |
1695 | page_cache_release(old_page); | 1694 | page_cache_release(old_page); |
1696 | return VM_FAULT_SIGBUS; | 1695 | return VM_FAULT_SIGBUS; |
1697 | } | 1696 | } |
1698 | 1697 | ||
1699 | /* | 1698 | /* |
1700 | * Helper functions for unmap_mapping_range(). | 1699 | * Helper functions for unmap_mapping_range(). |
1701 | * | 1700 | * |
1702 | * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ | 1701 | * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ |
1703 | * | 1702 | * |
1704 | * We have to restart searching the prio_tree whenever we drop the lock, | 1703 | * We have to restart searching the prio_tree whenever we drop the lock, |
1705 | * since the iterator is only valid while the lock is held, and anyway | 1704 | * since the iterator is only valid while the lock is held, and anyway |
1706 | * a later vma might be split and reinserted earlier while lock dropped. | 1705 | * a later vma might be split and reinserted earlier while lock dropped. |
1707 | * | 1706 | * |
1708 | * The list of nonlinear vmas could be handled more efficiently, using | 1707 | * The list of nonlinear vmas could be handled more efficiently, using |
1709 | * a placeholder, but handle it in the same way until a need is shown. | 1708 | * a placeholder, but handle it in the same way until a need is shown. |
1710 | * It is important to search the prio_tree before nonlinear list: a vma | 1709 | * It is important to search the prio_tree before nonlinear list: a vma |
1711 | * may become nonlinear and be shifted from prio_tree to nonlinear list | 1710 | * may become nonlinear and be shifted from prio_tree to nonlinear list |
1712 | * while the lock is dropped; but never shifted from list to prio_tree. | 1711 | * while the lock is dropped; but never shifted from list to prio_tree. |
1713 | * | 1712 | * |
1714 | * In order to make forward progress despite restarting the search, | 1713 | * In order to make forward progress despite restarting the search, |
1715 | * vm_truncate_count is used to mark a vma as now dealt with, so we can | 1714 | * vm_truncate_count is used to mark a vma as now dealt with, so we can |
1716 | * quickly skip it next time around. Since the prio_tree search only | 1715 | * quickly skip it next time around. Since the prio_tree search only |
1717 | * shows us those vmas affected by unmapping the range in question, we | 1716 | * shows us those vmas affected by unmapping the range in question, we |
1718 | * can't efficiently keep all vmas in step with mapping->truncate_count: | 1717 | * can't efficiently keep all vmas in step with mapping->truncate_count: |
1719 | * so instead reset them all whenever it wraps back to 0 (then go to 1). | 1718 | * so instead reset them all whenever it wraps back to 0 (then go to 1). |
1720 | * mapping->truncate_count and vma->vm_truncate_count are protected by | 1719 | * mapping->truncate_count and vma->vm_truncate_count are protected by |
1721 | * i_mmap_lock. | 1720 | * i_mmap_lock. |
1722 | * | 1721 | * |
1723 | * In order to make forward progress despite repeatedly restarting some | 1722 | * In order to make forward progress despite repeatedly restarting some |
1724 | * large vma, note the restart_addr from unmap_vmas when it breaks out: | 1723 | * large vma, note the restart_addr from unmap_vmas when it breaks out: |
1725 | * and restart from that address when we reach that vma again. It might | 1724 | * and restart from that address when we reach that vma again. It might |
1726 | * have been split or merged, shrunk or extended, but never shifted: so | 1725 | * have been split or merged, shrunk or extended, but never shifted: so |
1727 | * restart_addr remains valid so long as it remains in the vma's range. | 1726 | * restart_addr remains valid so long as it remains in the vma's range. |
1728 | * unmap_mapping_range forces truncate_count to leap over page-aligned | 1727 | * unmap_mapping_range forces truncate_count to leap over page-aligned |
1729 | * values so we can save vma's restart_addr in its truncate_count field. | 1728 | * values so we can save vma's restart_addr in its truncate_count field. |
1730 | */ | 1729 | */ |
1731 | #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) | 1730 | #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) |
1732 | 1731 | ||
1733 | static void reset_vma_truncate_counts(struct address_space *mapping) | 1732 | static void reset_vma_truncate_counts(struct address_space *mapping) |
1734 | { | 1733 | { |
1735 | struct vm_area_struct *vma; | 1734 | struct vm_area_struct *vma; |
1736 | struct prio_tree_iter iter; | 1735 | struct prio_tree_iter iter; |
1737 | 1736 | ||
1738 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) | 1737 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) |
1739 | vma->vm_truncate_count = 0; | 1738 | vma->vm_truncate_count = 0; |
1740 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | 1739 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) |
1741 | vma->vm_truncate_count = 0; | 1740 | vma->vm_truncate_count = 0; |
1742 | } | 1741 | } |
1743 | 1742 | ||
1744 | static int unmap_mapping_range_vma(struct vm_area_struct *vma, | 1743 | static int unmap_mapping_range_vma(struct vm_area_struct *vma, |
1745 | unsigned long start_addr, unsigned long end_addr, | 1744 | unsigned long start_addr, unsigned long end_addr, |
1746 | struct zap_details *details) | 1745 | struct zap_details *details) |
1747 | { | 1746 | { |
1748 | unsigned long restart_addr; | 1747 | unsigned long restart_addr; |
1749 | int need_break; | 1748 | int need_break; |
1750 | 1749 | ||
1751 | /* | 1750 | /* |
1752 | * files that support invalidating or truncating portions of the | 1751 | * files that support invalidating or truncating portions of the |
1753 | * file from under mmaped areas must have their ->fault function | 1752 | * file from under mmaped areas must have their ->fault function |
1754 | * return a locked page (and set VM_FAULT_LOCKED in the return). | 1753 | * return a locked page (and set VM_FAULT_LOCKED in the return). |
1755 | * This provides synchronisation against concurrent unmapping here. | 1754 | * This provides synchronisation against concurrent unmapping here. |
1756 | */ | 1755 | */ |
1757 | 1756 | ||
1758 | again: | 1757 | again: |
1759 | restart_addr = vma->vm_truncate_count; | 1758 | restart_addr = vma->vm_truncate_count; |
1760 | if (is_restart_addr(restart_addr) && start_addr < restart_addr) { | 1759 | if (is_restart_addr(restart_addr) && start_addr < restart_addr) { |
1761 | start_addr = restart_addr; | 1760 | start_addr = restart_addr; |
1762 | if (start_addr >= end_addr) { | 1761 | if (start_addr >= end_addr) { |
1763 | /* Top of vma has been split off since last time */ | 1762 | /* Top of vma has been split off since last time */ |
1764 | vma->vm_truncate_count = details->truncate_count; | 1763 | vma->vm_truncate_count = details->truncate_count; |
1765 | return 0; | 1764 | return 0; |
1766 | } | 1765 | } |
1767 | } | 1766 | } |
1768 | 1767 | ||
1769 | restart_addr = zap_page_range(vma, start_addr, | 1768 | restart_addr = zap_page_range(vma, start_addr, |
1770 | end_addr - start_addr, details); | 1769 | end_addr - start_addr, details); |
1771 | need_break = need_resched() || | 1770 | need_break = need_resched() || spin_needbreak(details->i_mmap_lock); |
1772 | need_lockbreak(details->i_mmap_lock); | ||
1773 | 1771 | ||
1774 | if (restart_addr >= end_addr) { | 1772 | if (restart_addr >= end_addr) { |
1775 | /* We have now completed this vma: mark it so */ | 1773 | /* We have now completed this vma: mark it so */ |
1776 | vma->vm_truncate_count = details->truncate_count; | 1774 | vma->vm_truncate_count = details->truncate_count; |
1777 | if (!need_break) | 1775 | if (!need_break) |
1778 | return 0; | 1776 | return 0; |
1779 | } else { | 1777 | } else { |
1780 | /* Note restart_addr in vma's truncate_count field */ | 1778 | /* Note restart_addr in vma's truncate_count field */ |
1781 | vma->vm_truncate_count = restart_addr; | 1779 | vma->vm_truncate_count = restart_addr; |
1782 | if (!need_break) | 1780 | if (!need_break) |
1783 | goto again; | 1781 | goto again; |
1784 | } | 1782 | } |
1785 | 1783 | ||
1786 | spin_unlock(details->i_mmap_lock); | 1784 | spin_unlock(details->i_mmap_lock); |
1787 | cond_resched(); | 1785 | cond_resched(); |
1788 | spin_lock(details->i_mmap_lock); | 1786 | spin_lock(details->i_mmap_lock); |
1789 | return -EINTR; | 1787 | return -EINTR; |
1790 | } | 1788 | } |
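This hunk is where the commit lands in mm/memory.c: the open-coded need_lockbreak(details->i_mmap_lock) check becomes spin_needbreak(details->i_mmap_lock), which is typesafe and keeps working when CONFIG_GENERIC_LOCKBREAK (and thus break_lock) is not built in. A minimal sketch of the general lock-break pattern, with more_work() and do_one_chunk() as hypothetical helpers:

    	spin_lock(&lock);
    	while (more_work()) {
    		do_one_chunk();
    		if (need_resched() || spin_needbreak(&lock)) {
    			/* Let waiters in and give the scheduler a chance. */
    			spin_unlock(&lock);
    			cond_resched();
    			spin_lock(&lock);
    		}
    	}
    	spin_unlock(&lock);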
1791 | 1789 | ||
1792 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | 1790 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, |
1793 | struct zap_details *details) | 1791 | struct zap_details *details) |
1794 | { | 1792 | { |
1795 | struct vm_area_struct *vma; | 1793 | struct vm_area_struct *vma; |
1796 | struct prio_tree_iter iter; | 1794 | struct prio_tree_iter iter; |
1797 | pgoff_t vba, vea, zba, zea; | 1795 | pgoff_t vba, vea, zba, zea; |
1798 | 1796 | ||
1799 | restart: | 1797 | restart: |
1800 | vma_prio_tree_foreach(vma, &iter, root, | 1798 | vma_prio_tree_foreach(vma, &iter, root, |
1801 | details->first_index, details->last_index) { | 1799 | details->first_index, details->last_index) { |
1802 | /* Skip quickly over those we have already dealt with */ | 1800 | /* Skip quickly over those we have already dealt with */ |
1803 | if (vma->vm_truncate_count == details->truncate_count) | 1801 | if (vma->vm_truncate_count == details->truncate_count) |
1804 | continue; | 1802 | continue; |
1805 | 1803 | ||
1806 | vba = vma->vm_pgoff; | 1804 | vba = vma->vm_pgoff; |
1807 | vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; | 1805 | vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; |
1808 | /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ | 1806 | /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ |
1809 | zba = details->first_index; | 1807 | zba = details->first_index; |
1810 | if (zba < vba) | 1808 | if (zba < vba) |
1811 | zba = vba; | 1809 | zba = vba; |
1812 | zea = details->last_index; | 1810 | zea = details->last_index; |
1813 | if (zea > vea) | 1811 | if (zea > vea) |
1814 | zea = vea; | 1812 | zea = vea; |
1815 | 1813 | ||
1816 | if (unmap_mapping_range_vma(vma, | 1814 | if (unmap_mapping_range_vma(vma, |
1817 | ((zba - vba) << PAGE_SHIFT) + vma->vm_start, | 1815 | ((zba - vba) << PAGE_SHIFT) + vma->vm_start, |
1818 | ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, | 1816 | ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, |
1819 | details) < 0) | 1817 | details) < 0) |
1820 | goto restart; | 1818 | goto restart; |
1821 | } | 1819 | } |
1822 | } | 1820 | } |
1823 | 1821 | ||
1824 | static inline void unmap_mapping_range_list(struct list_head *head, | 1822 | static inline void unmap_mapping_range_list(struct list_head *head, |
1825 | struct zap_details *details) | 1823 | struct zap_details *details) |
1826 | { | 1824 | { |
1827 | struct vm_area_struct *vma; | 1825 | struct vm_area_struct *vma; |
1828 | 1826 | ||
1829 | /* | 1827 | /* |
1830 | * In nonlinear VMAs there is no correspondence between virtual address | 1828 | * In nonlinear VMAs there is no correspondence between virtual address |
1831 | * offset and file offset. So we must perform an exhaustive search | 1829 | * offset and file offset. So we must perform an exhaustive search |
1832 | * across *all* the pages in each nonlinear VMA, not just the pages | 1830 | * across *all* the pages in each nonlinear VMA, not just the pages |
1833 | * whose virtual address lies outside the file truncation point. | 1831 | * whose virtual address lies outside the file truncation point. |
1834 | */ | 1832 | */ |
1835 | restart: | 1833 | restart: |
1836 | list_for_each_entry(vma, head, shared.vm_set.list) { | 1834 | list_for_each_entry(vma, head, shared.vm_set.list) { |
1837 | /* Skip quickly over those we have already dealt with */ | 1835 | /* Skip quickly over those we have already dealt with */ |
1838 | if (vma->vm_truncate_count == details->truncate_count) | 1836 | if (vma->vm_truncate_count == details->truncate_count) |
1839 | continue; | 1837 | continue; |
1840 | details->nonlinear_vma = vma; | 1838 | details->nonlinear_vma = vma; |
1841 | if (unmap_mapping_range_vma(vma, vma->vm_start, | 1839 | if (unmap_mapping_range_vma(vma, vma->vm_start, |
1842 | vma->vm_end, details) < 0) | 1840 | vma->vm_end, details) < 0) |
1843 | goto restart; | 1841 | goto restart; |
1844 | } | 1842 | } |
1845 | } | 1843 | } |
1846 | 1844 | ||
1847 | /** | 1845 | /** |
1848 | * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. | 1846 | * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. |
1849 | * @mapping: the address space containing mmaps to be unmapped. | 1847 | * @mapping: the address space containing mmaps to be unmapped. |
1850 | * @holebegin: byte in first page to unmap, relative to the start of | 1848 | * @holebegin: byte in first page to unmap, relative to the start of |
1851 | * the underlying file. This will be rounded down to a PAGE_SIZE | 1849 | * the underlying file. This will be rounded down to a PAGE_SIZE |
1852 | * boundary. Note that this is different from vmtruncate(), which | 1850 | * boundary. Note that this is different from vmtruncate(), which |
1853 | * must keep the partial page. In contrast, we must get rid of | 1851 | * must keep the partial page. In contrast, we must get rid of |
1854 | * partial pages. | 1852 | * partial pages. |
1855 | * @holelen: size of prospective hole in bytes. This will be rounded | 1853 | * @holelen: size of prospective hole in bytes. This will be rounded |
1856 | * up to a PAGE_SIZE boundary. A holelen of zero truncates to the | 1854 | * up to a PAGE_SIZE boundary. A holelen of zero truncates to the |
1857 | * end of the file. | 1855 | * end of the file. |
1858 | * @even_cows: 1 when truncating a file, unmap even private COWed pages; | 1856 | * @even_cows: 1 when truncating a file, unmap even private COWed pages; |
1859 | * but 0 when invalidating pagecache, don't throw away private data. | 1857 | * but 0 when invalidating pagecache, don't throw away private data. |
1860 | */ | 1858 | */ |
1861 | void unmap_mapping_range(struct address_space *mapping, | 1859 | void unmap_mapping_range(struct address_space *mapping, |
1862 | loff_t const holebegin, loff_t const holelen, int even_cows) | 1860 | loff_t const holebegin, loff_t const holelen, int even_cows) |
1863 | { | 1861 | { |
1864 | struct zap_details details; | 1862 | struct zap_details details; |
1865 | pgoff_t hba = holebegin >> PAGE_SHIFT; | 1863 | pgoff_t hba = holebegin >> PAGE_SHIFT; |
1866 | pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; | 1864 | pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; |
1867 | 1865 | ||
1868 | /* Check for overflow. */ | 1866 | /* Check for overflow. */ |
1869 | if (sizeof(holelen) > sizeof(hlen)) { | 1867 | if (sizeof(holelen) > sizeof(hlen)) { |
1870 | long long holeend = | 1868 | long long holeend = |
1871 | (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; | 1869 | (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; |
1872 | if (holeend & ~(long long)ULONG_MAX) | 1870 | if (holeend & ~(long long)ULONG_MAX) |
1873 | hlen = ULONG_MAX - hba + 1; | 1871 | hlen = ULONG_MAX - hba + 1; |
1874 | } | 1872 | } |
1875 | 1873 | ||
1876 | details.check_mapping = even_cows? NULL: mapping; | 1874 | details.check_mapping = even_cows? NULL: mapping; |
1877 | details.nonlinear_vma = NULL; | 1875 | details.nonlinear_vma = NULL; |
1878 | details.first_index = hba; | 1876 | details.first_index = hba; |
1879 | details.last_index = hba + hlen - 1; | 1877 | details.last_index = hba + hlen - 1; |
1880 | if (details.last_index < details.first_index) | 1878 | if (details.last_index < details.first_index) |
1881 | details.last_index = ULONG_MAX; | 1879 | details.last_index = ULONG_MAX; |
1882 | details.i_mmap_lock = &mapping->i_mmap_lock; | 1880 | details.i_mmap_lock = &mapping->i_mmap_lock; |
1883 | 1881 | ||
1884 | spin_lock(&mapping->i_mmap_lock); | 1882 | spin_lock(&mapping->i_mmap_lock); |
1885 | 1883 | ||
1886 | /* Protect against endless unmapping loops */ | 1884 | /* Protect against endless unmapping loops */ |
1887 | mapping->truncate_count++; | 1885 | mapping->truncate_count++; |
1888 | if (unlikely(is_restart_addr(mapping->truncate_count))) { | 1886 | if (unlikely(is_restart_addr(mapping->truncate_count))) { |
1889 | if (mapping->truncate_count == 0) | 1887 | if (mapping->truncate_count == 0) |
1890 | reset_vma_truncate_counts(mapping); | 1888 | reset_vma_truncate_counts(mapping); |
1891 | mapping->truncate_count++; | 1889 | mapping->truncate_count++; |
1892 | } | 1890 | } |
1893 | details.truncate_count = mapping->truncate_count; | 1891 | details.truncate_count = mapping->truncate_count; |
1894 | 1892 | ||
1895 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) | 1893 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) |
1896 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 1894 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
1897 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | 1895 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) |
1898 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | 1896 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); |
1899 | spin_unlock(&mapping->i_mmap_lock); | 1897 | spin_unlock(&mapping->i_mmap_lock); |
1900 | } | 1898 | } |
1901 | EXPORT_SYMBOL(unmap_mapping_range); | 1899 | EXPORT_SYMBOL(unmap_mapping_range); |
1902 | 1900 | ||
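A rough illustration of the rounding described in the unmap_mapping_range() comment above: holebegin is rounded down to a page boundary and holelen is rounded up, which is exactly the hba/hlen computation at the top of the function. The 4096-byte page size and the sample offsets below are assumptions chosen only for the example.

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_SHIFT 12

    int main(void)
    {
            unsigned long holebegin = 5000;   /* assumed example offset into the file */
            unsigned long holelen   = 10000;  /* assumed example hole length */

            /* first affected page index: holebegin rounded down to a page boundary */
            unsigned long hba  = holebegin >> PAGE_SHIFT;
            /* number of pages: holelen rounded up to a page boundary */
            unsigned long hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

            printf("unmap page indices %lu..%lu\n", hba, hba + hlen - 1);
            return 0;
    }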
1903 | /** | 1901 | /** |
1904 | * vmtruncate - unmap mappings "freed" by truncate() syscall | 1902 | * vmtruncate - unmap mappings "freed" by truncate() syscall |
1905 | * @inode: inode of the file used | 1903 | * @inode: inode of the file used |
1906 | * @offset: file offset to start truncating | 1904 | * @offset: file offset to start truncating |
1907 | * | 1905 | * |
1908 | * NOTE! We have to be ready to update the memory sharing | 1906 | * NOTE! We have to be ready to update the memory sharing |
1909 | * between the file and the memory map for a potential last | 1907 | * between the file and the memory map for a potential last |
1910 | * incomplete page. Ugly, but necessary. | 1908 | * incomplete page. Ugly, but necessary. |
1911 | */ | 1909 | */ |
1912 | int vmtruncate(struct inode * inode, loff_t offset) | 1910 | int vmtruncate(struct inode * inode, loff_t offset) |
1913 | { | 1911 | { |
1914 | struct address_space *mapping = inode->i_mapping; | 1912 | struct address_space *mapping = inode->i_mapping; |
1915 | unsigned long limit; | 1913 | unsigned long limit; |
1916 | 1914 | ||
1917 | if (inode->i_size < offset) | 1915 | if (inode->i_size < offset) |
1918 | goto do_expand; | 1916 | goto do_expand; |
1919 | /* | 1917 | /* |
1920 | * truncation of in-use swapfiles is disallowed - it would cause | 1918 | * truncation of in-use swapfiles is disallowed - it would cause |
1921 | * subsequent swapout to scribble on the now-freed blocks. | 1919 | * subsequent swapout to scribble on the now-freed blocks. |
1922 | */ | 1920 | */ |
1923 | if (IS_SWAPFILE(inode)) | 1921 | if (IS_SWAPFILE(inode)) |
1924 | goto out_busy; | 1922 | goto out_busy; |
1925 | i_size_write(inode, offset); | 1923 | i_size_write(inode, offset); |
1926 | 1924 | ||
1927 | /* | 1925 | /* |
1928 | * unmap_mapping_range is called twice, first simply for efficiency | 1926 | * unmap_mapping_range is called twice, first simply for efficiency |
1929 | * so that truncate_inode_pages does fewer single-page unmaps. However | 1927 | * so that truncate_inode_pages does fewer single-page unmaps. However |
1930 | * after this first call, and before truncate_inode_pages finishes, | 1928 | * after this first call, and before truncate_inode_pages finishes, |
1931 | * it is possible for private pages to be COWed, which remain after | 1929 | * it is possible for private pages to be COWed, which remain after |
1932 | * truncate_inode_pages finishes, hence the second unmap_mapping_range | 1930 | * truncate_inode_pages finishes, hence the second unmap_mapping_range |
1933 | * call must be made for correctness. | 1931 | * call must be made for correctness. |
1934 | */ | 1932 | */ |
1935 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | 1933 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); |
1936 | truncate_inode_pages(mapping, offset); | 1934 | truncate_inode_pages(mapping, offset); |
1937 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | 1935 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); |
1938 | goto out_truncate; | 1936 | goto out_truncate; |
1939 | 1937 | ||
1940 | do_expand: | 1938 | do_expand: |
1941 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 1939 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; |
1942 | if (limit != RLIM_INFINITY && offset > limit) | 1940 | if (limit != RLIM_INFINITY && offset > limit) |
1943 | goto out_sig; | 1941 | goto out_sig; |
1944 | if (offset > inode->i_sb->s_maxbytes) | 1942 | if (offset > inode->i_sb->s_maxbytes) |
1945 | goto out_big; | 1943 | goto out_big; |
1946 | i_size_write(inode, offset); | 1944 | i_size_write(inode, offset); |
1947 | 1945 | ||
1948 | out_truncate: | 1946 | out_truncate: |
1949 | if (inode->i_op && inode->i_op->truncate) | 1947 | if (inode->i_op && inode->i_op->truncate) |
1950 | inode->i_op->truncate(inode); | 1948 | inode->i_op->truncate(inode); |
1951 | return 0; | 1949 | return 0; |
1952 | out_sig: | 1950 | out_sig: |
1953 | send_sig(SIGXFSZ, current, 0); | 1951 | send_sig(SIGXFSZ, current, 0); |
1954 | out_big: | 1952 | out_big: |
1955 | return -EFBIG; | 1953 | return -EFBIG; |
1956 | out_busy: | 1954 | out_busy: |
1957 | return -ETXTBSY; | 1955 | return -ETXTBSY; |
1958 | } | 1956 | } |
1959 | EXPORT_SYMBOL(vmtruncate); | 1957 | EXPORT_SYMBOL(vmtruncate); |
1960 | 1958 | ||
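From user space, the effect of vmtruncate() can be observed directly: once the file shrinks, a mapping of a page that now lies wholly beyond EOF faults with SIGBUS, because the pte was torn down by unmap_mapping_range() and cannot be refilled from the shorter file. A minimal sketch, with the scratch file name and sizes assumed:

    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void bus_handler(int sig)
    {
            (void)sig;
            /* write() is async-signal-safe; printf() is not */
            const char msg[] = "SIGBUS: page beyond truncated EOF\n";
            write(STDOUT_FILENO, msg, sizeof(msg) - 1);
            _exit(0);
    }

    int main(void)
    {
            const char *path = "/tmp/trunc-demo";      /* assumed scratch file */
            int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
            if (fd < 0) { perror("open"); return 1; }

            ftruncate(fd, 2 * 4096);                   /* two pages of file data */
            char *map = mmap(NULL, 2 * 4096, PROT_READ | PROT_WRITE,
                             MAP_SHARED, fd, 0);
            if (map == MAP_FAILED) { perror("mmap"); return 1; }

            signal(SIGBUS, bus_handler);
            map[4096] = 'x';                           /* fine: page 1 is still backed */

            ftruncate(fd, 4096);                       /* shrink: page 1 is now past EOF */
            map[4096] = 'y';                           /* pte was unmapped: SIGBUS */

            fprintf(stderr, "unexpected: no SIGBUS delivered\n");
            return 1;
    }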
1961 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | 1959 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) |
1962 | { | 1960 | { |
1963 | struct address_space *mapping = inode->i_mapping; | 1961 | struct address_space *mapping = inode->i_mapping; |
1964 | 1962 | ||
1965 | /* | 1963 | /* |
1966 | * If the underlying filesystem is not going to provide | 1964 | * If the underlying filesystem is not going to provide |
1967 | * a way to truncate a range of blocks (punch a hole) - | 1965 | * a way to truncate a range of blocks (punch a hole) - |
1968 | * we should return failure right now. | 1966 | * we should return failure right now. |
1969 | */ | 1967 | */ |
1970 | if (!inode->i_op || !inode->i_op->truncate_range) | 1968 | if (!inode->i_op || !inode->i_op->truncate_range) |
1971 | return -ENOSYS; | 1969 | return -ENOSYS; |
1972 | 1970 | ||
1973 | mutex_lock(&inode->i_mutex); | 1971 | mutex_lock(&inode->i_mutex); |
1974 | down_write(&inode->i_alloc_sem); | 1972 | down_write(&inode->i_alloc_sem); |
1975 | unmap_mapping_range(mapping, offset, (end - offset), 1); | 1973 | unmap_mapping_range(mapping, offset, (end - offset), 1); |
1976 | truncate_inode_pages_range(mapping, offset, end); | 1974 | truncate_inode_pages_range(mapping, offset, end); |
1977 | unmap_mapping_range(mapping, offset, (end - offset), 1); | 1975 | unmap_mapping_range(mapping, offset, (end - offset), 1); |
1978 | inode->i_op->truncate_range(inode, offset, end); | 1976 | inode->i_op->truncate_range(inode, offset, end); |
1979 | up_write(&inode->i_alloc_sem); | 1977 | up_write(&inode->i_alloc_sem); |
1980 | mutex_unlock(&inode->i_mutex); | 1978 | mutex_unlock(&inode->i_mutex); |
1981 | 1979 | ||
1982 | return 0; | 1980 | return 0; |
1983 | } | 1981 | } |
1984 | 1982 | ||
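vmtruncate_range() is the kernel half of hole punching: madvise(MADV_REMOVE) on a shared mapping ends up here, and the -ENOSYS above is what the caller sees when the filesystem provides no ->truncate_range (at this point essentially only tmpfs does). A small sketch of the user-visible behaviour, with the tmpfs path and sizes assumed:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef MADV_REMOVE
    #define MADV_REMOVE 9                              /* Linux value, if the libc headers hide it */
    #endif

    int main(void)
    {
            const char *path = "/dev/shm/punch-demo";  /* assumed to live on tmpfs */
            long pagesz = sysconf(_SC_PAGESIZE);
            int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
            if (fd < 0) { perror("open"); return 1; }

            ftruncate(fd, 4 * pagesz);
            char *map = mmap(NULL, 4 * pagesz, PROT_READ | PROT_WRITE,
                             MAP_SHARED, fd, 0);
            if (map == MAP_FAILED) { perror("mmap"); return 1; }

            for (long i = 0; i < 4 * pagesz; i++)
                    map[i] = 'a';                      /* instantiate backing pages */

            /* punch out pages 1..2; i_size is unchanged, the blocks are released */
            if (madvise(map + pagesz, 2 * pagesz, MADV_REMOVE) != 0)
                    perror("madvise(MADV_REMOVE)");    /* e.g. -ENOSYS without ->truncate_range */

            printf("page 1 reads back as %d (hole is zero filled)\n", map[pagesz]);
            munmap(map, 4 * pagesz);
            close(fd);
            unlink(path);
            return 0;
    }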
1985 | /** | 1983 | /** |
1986 | * swapin_readahead - swap in pages in hope we need them soon | 1984 | * swapin_readahead - swap in pages in hope we need them soon |
1987 | * @entry: swap entry of this memory | 1985 | * @entry: swap entry of this memory |
1988 | * @addr: address to start | 1986 | * @addr: address to start |
1989 | * @vma: user vma this address belongs to | 1987 | * @vma: user vma this address belongs to |
1990 | * | 1988 | * |
1991 | * Primitive swap readahead code. We simply read an aligned block of | 1989 | * Primitive swap readahead code. We simply read an aligned block of |
1992 | * (1 << page_cluster) entries in the swap area. This method is chosen | 1990 | * (1 << page_cluster) entries in the swap area. This method is chosen |
1993 | * because it doesn't cost us any seek time. We also make sure to queue | 1991 | * because it doesn't cost us any seek time. We also make sure to queue |
1994 | * the 'original' request together with the readahead ones... | 1992 | * the 'original' request together with the readahead ones... |
1995 | * | 1993 | * |
1996 | * This has been extended to use the NUMA policies from the mm triggering | 1994 | * This has been extended to use the NUMA policies from the mm triggering |
1997 | * the readahead. | 1995 | * the readahead. |
1998 | * | 1996 | * |
1999 | * Caller must hold down_read on the vma->vm_mm if vma is not NULL. | 1997 | * Caller must hold down_read on the vma->vm_mm if vma is not NULL. |
2000 | */ | 1998 | */ |
2001 | void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) | 1999 | void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) |
2002 | { | 2000 | { |
2003 | #ifdef CONFIG_NUMA | 2001 | #ifdef CONFIG_NUMA |
2004 | struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; | 2002 | struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; |
2005 | #endif | 2003 | #endif |
2006 | int i, num; | 2004 | int i, num; |
2007 | struct page *new_page; | 2005 | struct page *new_page; |
2008 | unsigned long offset; | 2006 | unsigned long offset; |
2009 | 2007 | ||
2010 | /* | 2008 | /* |
2011 | * Get the number of handles we should do readahead io to. | 2009 | * Get the number of handles we should do readahead io to. |
2012 | */ | 2010 | */ |
2013 | num = valid_swaphandles(entry, &offset); | 2011 | num = valid_swaphandles(entry, &offset); |
2014 | for (i = 0; i < num; offset++, i++) { | 2012 | for (i = 0; i < num; offset++, i++) { |
2015 | /* Ok, do the async read-ahead now */ | 2013 | /* Ok, do the async read-ahead now */ |
2016 | new_page = read_swap_cache_async(swp_entry(swp_type(entry), | 2014 | new_page = read_swap_cache_async(swp_entry(swp_type(entry), |
2017 | offset), vma, addr); | 2015 | offset), vma, addr); |
2018 | if (!new_page) | 2016 | if (!new_page) |
2019 | break; | 2017 | break; |
2020 | page_cache_release(new_page); | 2018 | page_cache_release(new_page); |
2021 | #ifdef CONFIG_NUMA | 2019 | #ifdef CONFIG_NUMA |
2022 | /* | 2020 | /* |
2023 | * Find the next applicable VMA for the NUMA policy. | 2021 | * Find the next applicable VMA for the NUMA policy. |
2024 | */ | 2022 | */ |
2025 | addr += PAGE_SIZE; | 2023 | addr += PAGE_SIZE; |
2026 | if (addr == 0) | 2024 | if (addr == 0) |
2027 | vma = NULL; | 2025 | vma = NULL; |
2028 | if (vma) { | 2026 | if (vma) { |
2029 | if (addr >= vma->vm_end) { | 2027 | if (addr >= vma->vm_end) { |
2030 | vma = next_vma; | 2028 | vma = next_vma; |
2031 | next_vma = vma ? vma->vm_next : NULL; | 2029 | next_vma = vma ? vma->vm_next : NULL; |
2032 | } | 2030 | } |
2033 | if (vma && addr < vma->vm_start) | 2031 | if (vma && addr < vma->vm_start) |
2034 | vma = NULL; | 2032 | vma = NULL; |
2035 | } else { | 2033 | } else { |
2036 | if (next_vma && addr >= next_vma->vm_start) { | 2034 | if (next_vma && addr >= next_vma->vm_start) { |
2037 | vma = next_vma; | 2035 | vma = next_vma; |
2038 | next_vma = vma->vm_next; | 2036 | next_vma = vma->vm_next; |
2039 | } | 2037 | } |
2040 | } | 2038 | } |
2041 | #endif | 2039 | #endif |
2042 | } | 2040 | } |
2043 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 2041 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
2044 | } | 2042 | } |
2045 | 2043 | ||
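The "aligned block of (1 << page_cluster) entries" that the swapin_readahead() comment describes reduces to simple masking arithmetic. A simplified sketch of that window computation, leaving out the clamping that valid_swaphandles() also applies against the swap area; the page_cluster value and the faulting offset are assumed for the example:

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_cluster = 3;             /* assumed: an 8-entry window */
            unsigned long target = 1234;                /* assumed swap offset that faulted */

            unsigned long win   = 1UL << page_cluster;
            unsigned long start = target & ~(win - 1);  /* round down to the window boundary */

            printf("fault at swap offset %lu -> read offsets %lu..%lu\n",
                   target, start, start + win - 1);
            return 0;
    }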
2046 | /* | 2044 | /* |
2047 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2045 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2048 | * but allow concurrent faults), and pte mapped but not yet locked. | 2046 | * but allow concurrent faults), and pte mapped but not yet locked. |
2049 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2047 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2050 | */ | 2048 | */ |
2051 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2049 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2052 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2050 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2053 | int write_access, pte_t orig_pte) | 2051 | int write_access, pte_t orig_pte) |
2054 | { | 2052 | { |
2055 | spinlock_t *ptl; | 2053 | spinlock_t *ptl; |
2056 | struct page *page; | 2054 | struct page *page; |
2057 | swp_entry_t entry; | 2055 | swp_entry_t entry; |
2058 | pte_t pte; | 2056 | pte_t pte; |
2059 | int ret = 0; | 2057 | int ret = 0; |
2060 | 2058 | ||
2061 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2059 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
2062 | goto out; | 2060 | goto out; |
2063 | 2061 | ||
2064 | entry = pte_to_swp_entry(orig_pte); | 2062 | entry = pte_to_swp_entry(orig_pte); |
2065 | if (is_migration_entry(entry)) { | 2063 | if (is_migration_entry(entry)) { |
2066 | migration_entry_wait(mm, pmd, address); | 2064 | migration_entry_wait(mm, pmd, address); |
2067 | goto out; | 2065 | goto out; |
2068 | } | 2066 | } |
2069 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2067 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
2070 | page = lookup_swap_cache(entry); | 2068 | page = lookup_swap_cache(entry); |
2071 | if (!page) { | 2069 | if (!page) { |
2072 | grab_swap_token(); /* Contend for token _before_ read-in */ | 2070 | grab_swap_token(); /* Contend for token _before_ read-in */ |
2073 | swapin_readahead(entry, address, vma); | 2071 | swapin_readahead(entry, address, vma); |
2074 | page = read_swap_cache_async(entry, vma, address); | 2072 | page = read_swap_cache_async(entry, vma, address); |
2075 | if (!page) { | 2073 | if (!page) { |
2076 | /* | 2074 | /* |
2077 | * Back out if somebody else faulted in this pte | 2075 | * Back out if somebody else faulted in this pte |
2078 | * while we released the pte lock. | 2076 | * while we released the pte lock. |
2079 | */ | 2077 | */ |
2080 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2078 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2081 | if (likely(pte_same(*page_table, orig_pte))) | 2079 | if (likely(pte_same(*page_table, orig_pte))) |
2082 | ret = VM_FAULT_OOM; | 2080 | ret = VM_FAULT_OOM; |
2083 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2081 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2084 | goto unlock; | 2082 | goto unlock; |
2085 | } | 2083 | } |
2086 | 2084 | ||
2087 | /* Had to read the page from swap area: Major fault */ | 2085 | /* Had to read the page from swap area: Major fault */ |
2088 | ret = VM_FAULT_MAJOR; | 2086 | ret = VM_FAULT_MAJOR; |
2089 | count_vm_event(PGMAJFAULT); | 2087 | count_vm_event(PGMAJFAULT); |
2090 | } | 2088 | } |
2091 | 2089 | ||
2092 | mark_page_accessed(page); | 2090 | mark_page_accessed(page); |
2093 | lock_page(page); | 2091 | lock_page(page); |
2094 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2092 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2095 | 2093 | ||
2096 | /* | 2094 | /* |
2097 | * Back out if somebody else already faulted in this pte. | 2095 | * Back out if somebody else already faulted in this pte. |
2098 | */ | 2096 | */ |
2099 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2097 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2100 | if (unlikely(!pte_same(*page_table, orig_pte))) | 2098 | if (unlikely(!pte_same(*page_table, orig_pte))) |
2101 | goto out_nomap; | 2099 | goto out_nomap; |
2102 | 2100 | ||
2103 | if (unlikely(!PageUptodate(page))) { | 2101 | if (unlikely(!PageUptodate(page))) { |
2104 | ret = VM_FAULT_SIGBUS; | 2102 | ret = VM_FAULT_SIGBUS; |
2105 | goto out_nomap; | 2103 | goto out_nomap; |
2106 | } | 2104 | } |
2107 | 2105 | ||
2108 | /* The page isn't present yet, go ahead with the fault. */ | 2106 | /* The page isn't present yet, go ahead with the fault. */ |
2109 | 2107 | ||
2110 | inc_mm_counter(mm, anon_rss); | 2108 | inc_mm_counter(mm, anon_rss); |
2111 | pte = mk_pte(page, vma->vm_page_prot); | 2109 | pte = mk_pte(page, vma->vm_page_prot); |
2112 | if (write_access && can_share_swap_page(page)) { | 2110 | if (write_access && can_share_swap_page(page)) { |
2113 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2111 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2114 | write_access = 0; | 2112 | write_access = 0; |
2115 | } | 2113 | } |
2116 | 2114 | ||
2117 | flush_icache_page(vma, page); | 2115 | flush_icache_page(vma, page); |
2118 | set_pte_at(mm, address, page_table, pte); | 2116 | set_pte_at(mm, address, page_table, pte); |
2119 | page_add_anon_rmap(page, vma, address); | 2117 | page_add_anon_rmap(page, vma, address); |
2120 | 2118 | ||
2121 | swap_free(entry); | 2119 | swap_free(entry); |
2122 | if (vm_swap_full()) | 2120 | if (vm_swap_full()) |
2123 | remove_exclusive_swap_page(page); | 2121 | remove_exclusive_swap_page(page); |
2124 | unlock_page(page); | 2122 | unlock_page(page); |
2125 | 2123 | ||
2126 | if (write_access) { | 2124 | if (write_access) { |
2127 | /* XXX: We could OR the do_wp_page code with this one? */ | 2125 | /* XXX: We could OR the do_wp_page code with this one? */ |
2128 | if (do_wp_page(mm, vma, address, | 2126 | if (do_wp_page(mm, vma, address, |
2129 | page_table, pmd, ptl, pte) & VM_FAULT_OOM) | 2127 | page_table, pmd, ptl, pte) & VM_FAULT_OOM) |
2130 | ret = VM_FAULT_OOM; | 2128 | ret = VM_FAULT_OOM; |
2131 | goto out; | 2129 | goto out; |
2132 | } | 2130 | } |
2133 | 2131 | ||
2134 | /* No need to invalidate - it was non-present before */ | 2132 | /* No need to invalidate - it was non-present before */ |
2135 | update_mmu_cache(vma, address, pte); | 2133 | update_mmu_cache(vma, address, pte); |
2136 | unlock: | 2134 | unlock: |
2137 | pte_unmap_unlock(page_table, ptl); | 2135 | pte_unmap_unlock(page_table, ptl); |
2138 | out: | 2136 | out: |
2139 | return ret; | 2137 | return ret; |
2140 | out_nomap: | 2138 | out_nomap: |
2141 | pte_unmap_unlock(page_table, ptl); | 2139 | pte_unmap_unlock(page_table, ptl); |
2142 | unlock_page(page); | 2140 | unlock_page(page); |
2143 | page_cache_release(page); | 2141 | page_cache_release(page); |
2144 | return ret; | 2142 | return ret; |
2145 | } | 2143 | } |
2146 | 2144 | ||
2147 | /* | 2145 | /* |
2148 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2146 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2149 | * but allow concurrent faults), and pte mapped but not yet locked. | 2147 | * but allow concurrent faults), and pte mapped but not yet locked. |
2150 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2148 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2151 | */ | 2149 | */ |
2152 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2150 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2153 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2151 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2154 | int write_access) | 2152 | int write_access) |
2155 | { | 2153 | { |
2156 | struct page *page; | 2154 | struct page *page; |
2157 | spinlock_t *ptl; | 2155 | spinlock_t *ptl; |
2158 | pte_t entry; | 2156 | pte_t entry; |
2159 | 2157 | ||
2160 | /* Allocate our own private page. */ | 2158 | /* Allocate our own private page. */ |
2161 | pte_unmap(page_table); | 2159 | pte_unmap(page_table); |
2162 | 2160 | ||
2163 | if (unlikely(anon_vma_prepare(vma))) | 2161 | if (unlikely(anon_vma_prepare(vma))) |
2164 | goto oom; | 2162 | goto oom; |
2165 | page = alloc_zeroed_user_highpage_movable(vma, address); | 2163 | page = alloc_zeroed_user_highpage_movable(vma, address); |
2166 | if (!page) | 2164 | if (!page) |
2167 | goto oom; | 2165 | goto oom; |
2168 | 2166 | ||
2169 | entry = mk_pte(page, vma->vm_page_prot); | 2167 | entry = mk_pte(page, vma->vm_page_prot); |
2170 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2168 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2171 | 2169 | ||
2172 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2170 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2173 | if (!pte_none(*page_table)) | 2171 | if (!pte_none(*page_table)) |
2174 | goto release; | 2172 | goto release; |
2175 | inc_mm_counter(mm, anon_rss); | 2173 | inc_mm_counter(mm, anon_rss); |
2176 | lru_cache_add_active(page); | 2174 | lru_cache_add_active(page); |
2177 | page_add_new_anon_rmap(page, vma, address); | 2175 | page_add_new_anon_rmap(page, vma, address); |
2178 | set_pte_at(mm, address, page_table, entry); | 2176 | set_pte_at(mm, address, page_table, entry); |
2179 | 2177 | ||
2180 | /* No need to invalidate - it was non-present before */ | 2178 | /* No need to invalidate - it was non-present before */ |
2181 | update_mmu_cache(vma, address, entry); | 2179 | update_mmu_cache(vma, address, entry); |
2182 | unlock: | 2180 | unlock: |
2183 | pte_unmap_unlock(page_table, ptl); | 2181 | pte_unmap_unlock(page_table, ptl); |
2184 | return 0; | 2182 | return 0; |
2185 | release: | 2183 | release: |
2186 | page_cache_release(page); | 2184 | page_cache_release(page); |
2187 | goto unlock; | 2185 | goto unlock; |
2188 | oom: | 2186 | oom: |
2189 | return VM_FAULT_OOM; | 2187 | return VM_FAULT_OOM; |
2190 | } | 2188 | } |
2191 | 2189 | ||
2192 | /* | 2190 | /* |
2193 | * __do_fault() tries to create a new page mapping. It aggressively | 2191 | * __do_fault() tries to create a new page mapping. It aggressively |
2194 | * tries to share with existing pages, but makes a separate copy if | 2192 | * tries to share with existing pages, but makes a separate copy if |
2195 | * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid | 2193 | * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid |
2196 | * the next page fault. | 2194 | * the next page fault. |
2197 | * | 2195 | * |
2198 | * As this is called only for pages that do not currently exist, we | 2196 | * As this is called only for pages that do not currently exist, we |
2199 | * do not need to flush old virtual caches or the TLB. | 2197 | * do not need to flush old virtual caches or the TLB. |
2200 | * | 2198 | * |
2201 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2199 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2202 | * but allow concurrent faults), and pte neither mapped nor locked. | 2200 | * but allow concurrent faults), and pte neither mapped nor locked. |
2203 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2201 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2204 | */ | 2202 | */ |
2205 | static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2203 | static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2206 | unsigned long address, pmd_t *pmd, | 2204 | unsigned long address, pmd_t *pmd, |
2207 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | 2205 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) |
2208 | { | 2206 | { |
2209 | pte_t *page_table; | 2207 | pte_t *page_table; |
2210 | spinlock_t *ptl; | 2208 | spinlock_t *ptl; |
2211 | struct page *page; | 2209 | struct page *page; |
2212 | pte_t entry; | 2210 | pte_t entry; |
2213 | int anon = 0; | 2211 | int anon = 0; |
2214 | struct page *dirty_page = NULL; | 2212 | struct page *dirty_page = NULL; |
2215 | struct vm_fault vmf; | 2213 | struct vm_fault vmf; |
2216 | int ret; | 2214 | int ret; |
2217 | int page_mkwrite = 0; | 2215 | int page_mkwrite = 0; |
2218 | 2216 | ||
2219 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 2217 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); |
2220 | vmf.pgoff = pgoff; | 2218 | vmf.pgoff = pgoff; |
2221 | vmf.flags = flags; | 2219 | vmf.flags = flags; |
2222 | vmf.page = NULL; | 2220 | vmf.page = NULL; |
2223 | 2221 | ||
2224 | BUG_ON(vma->vm_flags & VM_PFNMAP); | 2222 | BUG_ON(vma->vm_flags & VM_PFNMAP); |
2225 | 2223 | ||
2226 | if (likely(vma->vm_ops->fault)) { | 2224 | if (likely(vma->vm_ops->fault)) { |
2227 | ret = vma->vm_ops->fault(vma, &vmf); | 2225 | ret = vma->vm_ops->fault(vma, &vmf); |
2228 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 2226 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) |
2229 | return ret; | 2227 | return ret; |
2230 | } else { | 2228 | } else { |
2231 | /* Legacy ->nopage path */ | 2229 | /* Legacy ->nopage path */ |
2232 | ret = 0; | 2230 | ret = 0; |
2233 | vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); | 2231 | vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); |
2234 | /* no page was available -- either SIGBUS or OOM */ | 2232 | /* no page was available -- either SIGBUS or OOM */ |
2235 | if (unlikely(vmf.page == NOPAGE_SIGBUS)) | 2233 | if (unlikely(vmf.page == NOPAGE_SIGBUS)) |
2236 | return VM_FAULT_SIGBUS; | 2234 | return VM_FAULT_SIGBUS; |
2237 | else if (unlikely(vmf.page == NOPAGE_OOM)) | 2235 | else if (unlikely(vmf.page == NOPAGE_OOM)) |
2238 | return VM_FAULT_OOM; | 2236 | return VM_FAULT_OOM; |
2239 | } | 2237 | } |
2240 | 2238 | ||
2241 | /* | 2239 | /* |
2242 | * For consistency in subsequent calls, make the faulted page always | 2240 | * For consistency in subsequent calls, make the faulted page always |
2243 | * locked. | 2241 | * locked. |
2244 | */ | 2242 | */ |
2245 | if (unlikely(!(ret & VM_FAULT_LOCKED))) | 2243 | if (unlikely(!(ret & VM_FAULT_LOCKED))) |
2246 | lock_page(vmf.page); | 2244 | lock_page(vmf.page); |
2247 | else | 2245 | else |
2248 | VM_BUG_ON(!PageLocked(vmf.page)); | 2246 | VM_BUG_ON(!PageLocked(vmf.page)); |
2249 | 2247 | ||
2250 | /* | 2248 | /* |
2251 | * Should we do an early C-O-W break? | 2249 | * Should we do an early C-O-W break? |
2252 | */ | 2250 | */ |
2253 | page = vmf.page; | 2251 | page = vmf.page; |
2254 | if (flags & FAULT_FLAG_WRITE) { | 2252 | if (flags & FAULT_FLAG_WRITE) { |
2255 | if (!(vma->vm_flags & VM_SHARED)) { | 2253 | if (!(vma->vm_flags & VM_SHARED)) { |
2256 | anon = 1; | 2254 | anon = 1; |
2257 | if (unlikely(anon_vma_prepare(vma))) { | 2255 | if (unlikely(anon_vma_prepare(vma))) { |
2258 | ret = VM_FAULT_OOM; | 2256 | ret = VM_FAULT_OOM; |
2259 | goto out; | 2257 | goto out; |
2260 | } | 2258 | } |
2261 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, | 2259 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, |
2262 | vma, address); | 2260 | vma, address); |
2263 | if (!page) { | 2261 | if (!page) { |
2264 | ret = VM_FAULT_OOM; | 2262 | ret = VM_FAULT_OOM; |
2265 | goto out; | 2263 | goto out; |
2266 | } | 2264 | } |
2267 | copy_user_highpage(page, vmf.page, address, vma); | 2265 | copy_user_highpage(page, vmf.page, address, vma); |
2268 | } else { | 2266 | } else { |
2269 | /* | 2267 | /* |
2270 | * If the page will be shareable, see if the backing | 2268 | * If the page will be shareable, see if the backing |
2271 | * address space wants to know that the page is about | 2269 | * address space wants to know that the page is about |
2272 | * to become writable | 2270 | * to become writable |
2273 | */ | 2271 | */ |
2274 | if (vma->vm_ops->page_mkwrite) { | 2272 | if (vma->vm_ops->page_mkwrite) { |
2275 | unlock_page(page); | 2273 | unlock_page(page); |
2276 | if (vma->vm_ops->page_mkwrite(vma, page) < 0) { | 2274 | if (vma->vm_ops->page_mkwrite(vma, page) < 0) { |
2277 | ret = VM_FAULT_SIGBUS; | 2275 | ret = VM_FAULT_SIGBUS; |
2278 | anon = 1; /* no anon but release vmf.page */ | 2276 | anon = 1; /* no anon but release vmf.page */ |
2279 | goto out_unlocked; | 2277 | goto out_unlocked; |
2280 | } | 2278 | } |
2281 | lock_page(page); | 2279 | lock_page(page); |
2282 | /* | 2280 | /* |
2283 | * XXX: this is not quite right (racy vs | 2281 | * XXX: this is not quite right (racy vs |
2284 | * invalidate) to unlock and relock the page | 2282 | * invalidate) to unlock and relock the page |
2285 | * like this, however a better fix requires | 2283 | * like this, however a better fix requires |
2286 | * reworking page_mkwrite locking API, which | 2284 | * reworking page_mkwrite locking API, which |
2287 | * is better done later. | 2285 | * is better done later. |
2288 | */ | 2286 | */ |
2289 | if (!page->mapping) { | 2287 | if (!page->mapping) { |
2290 | ret = 0; | 2288 | ret = 0; |
2291 | anon = 1; /* no anon but release vmf.page */ | 2289 | anon = 1; /* no anon but release vmf.page */ |
2292 | goto out; | 2290 | goto out; |
2293 | } | 2291 | } |
2294 | page_mkwrite = 1; | 2292 | page_mkwrite = 1; |
2295 | } | 2293 | } |
2296 | } | 2294 | } |
2297 | 2295 | ||
2298 | } | 2296 | } |
2299 | 2297 | ||
2300 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2298 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2301 | 2299 | ||
2302 | /* | 2300 | /* |
2303 | * This silly early PAGE_DIRTY setting removes a race | 2301 | * This silly early PAGE_DIRTY setting removes a race |
2304 | * due to the bad i386 page protection. But it's valid | 2302 | * due to the bad i386 page protection. But it's valid |
2305 | * for other architectures too. | 2303 | * for other architectures too. |
2306 | * | 2304 | * |
2307 | * Note that if write_access is true, we either now have | 2305 | * Note that if write_access is true, we either now have |
2308 | * an exclusive copy of the page, or this is a shared mapping, | 2306 | * an exclusive copy of the page, or this is a shared mapping, |
2309 | * so we can make it writable and dirty to avoid having to | 2307 | * so we can make it writable and dirty to avoid having to |
2310 | * handle that later. | 2308 | * handle that later. |
2311 | */ | 2309 | */ |
2312 | /* Only go through if we didn't race with anybody else... */ | 2310 | /* Only go through if we didn't race with anybody else... */ |
2313 | if (likely(pte_same(*page_table, orig_pte))) { | 2311 | if (likely(pte_same(*page_table, orig_pte))) { |
2314 | flush_icache_page(vma, page); | 2312 | flush_icache_page(vma, page); |
2315 | entry = mk_pte(page, vma->vm_page_prot); | 2313 | entry = mk_pte(page, vma->vm_page_prot); |
2316 | if (flags & FAULT_FLAG_WRITE) | 2314 | if (flags & FAULT_FLAG_WRITE) |
2317 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2315 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2318 | set_pte_at(mm, address, page_table, entry); | 2316 | set_pte_at(mm, address, page_table, entry); |
2319 | if (anon) { | 2317 | if (anon) { |
2320 | inc_mm_counter(mm, anon_rss); | 2318 | inc_mm_counter(mm, anon_rss); |
2321 | lru_cache_add_active(page); | 2319 | lru_cache_add_active(page); |
2322 | page_add_new_anon_rmap(page, vma, address); | 2320 | page_add_new_anon_rmap(page, vma, address); |
2323 | } else { | 2321 | } else { |
2324 | inc_mm_counter(mm, file_rss); | 2322 | inc_mm_counter(mm, file_rss); |
2325 | page_add_file_rmap(page); | 2323 | page_add_file_rmap(page); |
2326 | if (flags & FAULT_FLAG_WRITE) { | 2324 | if (flags & FAULT_FLAG_WRITE) { |
2327 | dirty_page = page; | 2325 | dirty_page = page; |
2328 | get_page(dirty_page); | 2326 | get_page(dirty_page); |
2329 | } | 2327 | } |
2330 | } | 2328 | } |
2331 | 2329 | ||
2332 | /* no need to invalidate: a not-present page won't be cached */ | 2330 | /* no need to invalidate: a not-present page won't be cached */ |
2333 | update_mmu_cache(vma, address, entry); | 2331 | update_mmu_cache(vma, address, entry); |
2334 | } else { | 2332 | } else { |
2335 | if (anon) | 2333 | if (anon) |
2336 | page_cache_release(page); | 2334 | page_cache_release(page); |
2337 | else | 2335 | else |
2338 | anon = 1; /* no anon but release faulted_page */ | 2336 | anon = 1; /* no anon but release faulted_page */ |
2339 | } | 2337 | } |
2340 | 2338 | ||
2341 | pte_unmap_unlock(page_table, ptl); | 2339 | pte_unmap_unlock(page_table, ptl); |
2342 | 2340 | ||
2343 | out: | 2341 | out: |
2344 | unlock_page(vmf.page); | 2342 | unlock_page(vmf.page); |
2345 | out_unlocked: | 2343 | out_unlocked: |
2346 | if (anon) | 2344 | if (anon) |
2347 | page_cache_release(vmf.page); | 2345 | page_cache_release(vmf.page); |
2348 | else if (dirty_page) { | 2346 | else if (dirty_page) { |
2349 | if (vma->vm_file) | 2347 | if (vma->vm_file) |
2350 | file_update_time(vma->vm_file); | 2348 | file_update_time(vma->vm_file); |
2351 | 2349 | ||
2352 | set_page_dirty_balance(dirty_page, page_mkwrite); | 2350 | set_page_dirty_balance(dirty_page, page_mkwrite); |
2353 | put_page(dirty_page); | 2351 | put_page(dirty_page); |
2354 | } | 2352 | } |
2355 | 2353 | ||
2356 | return ret; | 2354 | return ret; |
2357 | } | 2355 | } |
2358 | 2356 | ||
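The early copy-on-write break in __do_fault() is what a write fault on a MAP_PRIVATE file mapping triggers: the page cache page is copied into a fresh anonymous page, so the store never reaches the file. A user-space view of that case, with the scratch file name and contents assumed:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            const char *path = "/tmp/cow-demo";        /* assumed scratch file */
            int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
            if (fd < 0) { perror("open"); return 1; }
            write(fd, "original", 8);
            ftruncate(fd, 4096);

            /* MAP_PRIVATE: writes must not reach the file, so the write fault copies */
            char *map = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE, fd, 0);
            if (map == MAP_FAILED) { perror("mmap"); return 1; }

            memcpy(map, "modified", 8);                /* write fault installs an anon copy */

            char buf[9] = { 0 };
            pread(fd, buf, 8, 0);
            printf("mapping sees \"%.8s\", file still holds \"%s\"\n", map, buf);

            munmap(map, 4096);
            close(fd);
            unlink(path);
            return 0;
    }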
2359 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2357 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2360 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2358 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2361 | int write_access, pte_t orig_pte) | 2359 | int write_access, pte_t orig_pte) |
2362 | { | 2360 | { |
2363 | pgoff_t pgoff = (((address & PAGE_MASK) | 2361 | pgoff_t pgoff = (((address & PAGE_MASK) |
2364 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 2362 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
2365 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); | 2363 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); |
2366 | 2364 | ||
2367 | pte_unmap(page_table); | 2365 | pte_unmap(page_table); |
2368 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 2366 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
2369 | } | 2367 | } |
2370 | 2368 | ||
2371 | 2369 | ||
2372 | /* | 2370 | /* |
2373 | * do_no_pfn() tries to create a new page mapping for a page without | 2371 | * do_no_pfn() tries to create a new page mapping for a page without |
2374 | * a struct page backing it | 2372 | * a struct page backing it |
2375 | * | 2373 | * |
2376 | * As this is called only for pages that do not currently exist, we | 2374 | * As this is called only for pages that do not currently exist, we |
2377 | * do not need to flush old virtual caches or the TLB. | 2375 | * do not need to flush old virtual caches or the TLB. |
2378 | * | 2376 | * |
2379 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2377 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2380 | * but allow concurrent faults), and pte mapped but not yet locked. | 2378 | * but allow concurrent faults), and pte mapped but not yet locked. |
2381 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2379 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2382 | * | 2380 | * |
2383 | * It is expected that the ->nopfn handler always returns the same pfn | 2381 | * It is expected that the ->nopfn handler always returns the same pfn |
2384 | * for a given virtual mapping. | 2382 | * for a given virtual mapping. |
2385 | * | 2383 | * |
2386 | * Mark this `noinline' to prevent it from bloating the main pagefault code. | 2384 | * Mark this `noinline' to prevent it from bloating the main pagefault code. |
2387 | */ | 2385 | */ |
2388 | static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, | 2386 | static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, |
2389 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2387 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2390 | int write_access) | 2388 | int write_access) |
2391 | { | 2389 | { |
2392 | spinlock_t *ptl; | 2390 | spinlock_t *ptl; |
2393 | pte_t entry; | 2391 | pte_t entry; |
2394 | unsigned long pfn; | 2392 | unsigned long pfn; |
2395 | 2393 | ||
2396 | pte_unmap(page_table); | 2394 | pte_unmap(page_table); |
2397 | BUG_ON(!(vma->vm_flags & VM_PFNMAP)); | 2395 | BUG_ON(!(vma->vm_flags & VM_PFNMAP)); |
2398 | BUG_ON(is_cow_mapping(vma->vm_flags)); | 2396 | BUG_ON(is_cow_mapping(vma->vm_flags)); |
2399 | 2397 | ||
2400 | pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); | 2398 | pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); |
2401 | if (unlikely(pfn == NOPFN_OOM)) | 2399 | if (unlikely(pfn == NOPFN_OOM)) |
2402 | return VM_FAULT_OOM; | 2400 | return VM_FAULT_OOM; |
2403 | else if (unlikely(pfn == NOPFN_SIGBUS)) | 2401 | else if (unlikely(pfn == NOPFN_SIGBUS)) |
2404 | return VM_FAULT_SIGBUS; | 2402 | return VM_FAULT_SIGBUS; |
2405 | else if (unlikely(pfn == NOPFN_REFAULT)) | 2403 | else if (unlikely(pfn == NOPFN_REFAULT)) |
2406 | return 0; | 2404 | return 0; |
2407 | 2405 | ||
2408 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2406 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
2409 | 2407 | ||
2410 | /* Only go through if we didn't race with anybody else... */ | 2408 | /* Only go through if we didn't race with anybody else... */ |
2411 | if (pte_none(*page_table)) { | 2409 | if (pte_none(*page_table)) { |
2412 | entry = pfn_pte(pfn, vma->vm_page_prot); | 2410 | entry = pfn_pte(pfn, vma->vm_page_prot); |
2413 | if (write_access) | 2411 | if (write_access) |
2414 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2412 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2415 | set_pte_at(mm, address, page_table, entry); | 2413 | set_pte_at(mm, address, page_table, entry); |
2416 | } | 2414 | } |
2417 | pte_unmap_unlock(page_table, ptl); | 2415 | pte_unmap_unlock(page_table, ptl); |
2418 | return 0; | 2416 | return 0; |
2419 | } | 2417 | } |
2420 | 2418 | ||
2421 | /* | 2419 | /* |
2422 | * Fault of a previously existing named mapping. Repopulate the pte | 2420 | * Fault of a previously existing named mapping. Repopulate the pte |
2423 | * from the encoded file_pte if possible. This enables swappable | 2421 | * from the encoded file_pte if possible. This enables swappable |
2424 | * nonlinear vmas. | 2422 | * nonlinear vmas. |
2425 | * | 2423 | * |
2426 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2424 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2427 | * but allow concurrent faults), and pte mapped but not yet locked. | 2425 | * but allow concurrent faults), and pte mapped but not yet locked. |
2428 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2426 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2429 | */ | 2427 | */ |
2430 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2428 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2431 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2429 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2432 | int write_access, pte_t orig_pte) | 2430 | int write_access, pte_t orig_pte) |
2433 | { | 2431 | { |
2434 | unsigned int flags = FAULT_FLAG_NONLINEAR | | 2432 | unsigned int flags = FAULT_FLAG_NONLINEAR | |
2435 | (write_access ? FAULT_FLAG_WRITE : 0); | 2433 | (write_access ? FAULT_FLAG_WRITE : 0); |
2436 | pgoff_t pgoff; | 2434 | pgoff_t pgoff; |
2437 | 2435 | ||
2438 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2436 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
2439 | return 0; | 2437 | return 0; |
2440 | 2438 | ||
2441 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || | 2439 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || |
2442 | !(vma->vm_flags & VM_CAN_NONLINEAR))) { | 2440 | !(vma->vm_flags & VM_CAN_NONLINEAR))) { |
2443 | /* | 2441 | /* |
2444 | * Page table corrupted: show pte and kill process. | 2442 | * Page table corrupted: show pte and kill process. |
2445 | */ | 2443 | */ |
2446 | print_bad_pte(vma, orig_pte, address); | 2444 | print_bad_pte(vma, orig_pte, address); |
2447 | return VM_FAULT_OOM; | 2445 | return VM_FAULT_OOM; |
2448 | } | 2446 | } |
2449 | 2447 | ||
2450 | pgoff = pte_to_pgoff(orig_pte); | 2448 | pgoff = pte_to_pgoff(orig_pte); |
2451 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 2449 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
2452 | } | 2450 | } |
2453 | 2451 | ||
2454 | /* | 2452 | /* |
2455 | * These routines also need to handle stuff like marking pages dirty | 2453 | * These routines also need to handle stuff like marking pages dirty |
2456 | * and/or accessed for architectures that don't do it in hardware (most | 2454 | * and/or accessed for architectures that don't do it in hardware (most |
2457 | * RISC architectures). The early dirtying is also good on the i386. | 2455 | * RISC architectures). The early dirtying is also good on the i386. |
2458 | * | 2456 | * |
2459 | * There is also a hook called "update_mmu_cache()" that architectures | 2457 | * There is also a hook called "update_mmu_cache()" that architectures |
2460 | * with external mmu caches can use to update those (ie the Sparc or | 2458 | * with external mmu caches can use to update those (ie the Sparc or |
2461 | * PowerPC hashed page tables that act as extended TLBs). | 2459 | * PowerPC hashed page tables that act as extended TLBs). |
2462 | * | 2460 | * |
2463 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2461 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2464 | * but allow concurrent faults), and pte mapped but not yet locked. | 2462 | * but allow concurrent faults), and pte mapped but not yet locked. |
2465 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2463 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2466 | */ | 2464 | */ |
2467 | static inline int handle_pte_fault(struct mm_struct *mm, | 2465 | static inline int handle_pte_fault(struct mm_struct *mm, |
2468 | struct vm_area_struct *vma, unsigned long address, | 2466 | struct vm_area_struct *vma, unsigned long address, |
2469 | pte_t *pte, pmd_t *pmd, int write_access) | 2467 | pte_t *pte, pmd_t *pmd, int write_access) |
2470 | { | 2468 | { |
2471 | pte_t entry; | 2469 | pte_t entry; |
2472 | spinlock_t *ptl; | 2470 | spinlock_t *ptl; |
2473 | 2471 | ||
2474 | entry = *pte; | 2472 | entry = *pte; |
2475 | if (!pte_present(entry)) { | 2473 | if (!pte_present(entry)) { |
2476 | if (pte_none(entry)) { | 2474 | if (pte_none(entry)) { |
2477 | if (vma->vm_ops) { | 2475 | if (vma->vm_ops) { |
2478 | if (vma->vm_ops->fault || vma->vm_ops->nopage) | 2476 | if (vma->vm_ops->fault || vma->vm_ops->nopage) |
2479 | return do_linear_fault(mm, vma, address, | 2477 | return do_linear_fault(mm, vma, address, |
2480 | pte, pmd, write_access, entry); | 2478 | pte, pmd, write_access, entry); |
2481 | if (unlikely(vma->vm_ops->nopfn)) | 2479 | if (unlikely(vma->vm_ops->nopfn)) |
2482 | return do_no_pfn(mm, vma, address, pte, | 2480 | return do_no_pfn(mm, vma, address, pte, |
2483 | pmd, write_access); | 2481 | pmd, write_access); |
2484 | } | 2482 | } |
2485 | return do_anonymous_page(mm, vma, address, | 2483 | return do_anonymous_page(mm, vma, address, |
2486 | pte, pmd, write_access); | 2484 | pte, pmd, write_access); |
2487 | } | 2485 | } |
2488 | if (pte_file(entry)) | 2486 | if (pte_file(entry)) |
2489 | return do_nonlinear_fault(mm, vma, address, | 2487 | return do_nonlinear_fault(mm, vma, address, |
2490 | pte, pmd, write_access, entry); | 2488 | pte, pmd, write_access, entry); |
2491 | return do_swap_page(mm, vma, address, | 2489 | return do_swap_page(mm, vma, address, |
2492 | pte, pmd, write_access, entry); | 2490 | pte, pmd, write_access, entry); |
2493 | } | 2491 | } |
2494 | 2492 | ||
2495 | ptl = pte_lockptr(mm, pmd); | 2493 | ptl = pte_lockptr(mm, pmd); |
2496 | spin_lock(ptl); | 2494 | spin_lock(ptl); |
2497 | if (unlikely(!pte_same(*pte, entry))) | 2495 | if (unlikely(!pte_same(*pte, entry))) |
2498 | goto unlock; | 2496 | goto unlock; |
2499 | if (write_access) { | 2497 | if (write_access) { |
2500 | if (!pte_write(entry)) | 2498 | if (!pte_write(entry)) |
2501 | return do_wp_page(mm, vma, address, | 2499 | return do_wp_page(mm, vma, address, |
2502 | pte, pmd, ptl, entry); | 2500 | pte, pmd, ptl, entry); |
2503 | entry = pte_mkdirty(entry); | 2501 | entry = pte_mkdirty(entry); |
2504 | } | 2502 | } |
2505 | entry = pte_mkyoung(entry); | 2503 | entry = pte_mkyoung(entry); |
2506 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { | 2504 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { |
2507 | update_mmu_cache(vma, address, entry); | 2505 | update_mmu_cache(vma, address, entry); |
2508 | } else { | 2506 | } else { |
2509 | /* | 2507 | /* |
2510 | * This is needed only for protection faults but the arch code | 2508 | * This is needed only for protection faults but the arch code |
2511 | * is not yet telling us if this is a protection fault or not. | 2509 | * is not yet telling us if this is a protection fault or not. |
2512 | * This still avoids useless tlb flushes for .text page faults | 2510 | * This still avoids useless tlb flushes for .text page faults |
2513 | * with threads. | 2511 | * with threads. |
2514 | */ | 2512 | */ |
2515 | if (write_access) | 2513 | if (write_access) |
2516 | flush_tlb_page(vma, address); | 2514 | flush_tlb_page(vma, address); |
2517 | } | 2515 | } |
2518 | unlock: | 2516 | unlock: |
2519 | pte_unmap_unlock(pte, ptl); | 2517 | pte_unmap_unlock(pte, ptl); |
2520 | return 0; | 2518 | return 0; |
2521 | } | 2519 | } |
2522 | 2520 | ||
2523 | /* | 2521 | /* |
2524 | * By the time we get here, we already hold the mm semaphore | 2522 | * By the time we get here, we already hold the mm semaphore |
2525 | */ | 2523 | */ |
2526 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2524 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
2527 | unsigned long address, int write_access) | 2525 | unsigned long address, int write_access) |
2528 | { | 2526 | { |
2529 | pgd_t *pgd; | 2527 | pgd_t *pgd; |
2530 | pud_t *pud; | 2528 | pud_t *pud; |
2531 | pmd_t *pmd; | 2529 | pmd_t *pmd; |
2532 | pte_t *pte; | 2530 | pte_t *pte; |
2533 | 2531 | ||
2534 | __set_current_state(TASK_RUNNING); | 2532 | __set_current_state(TASK_RUNNING); |
2535 | 2533 | ||
2536 | count_vm_event(PGFAULT); | 2534 | count_vm_event(PGFAULT); |
2537 | 2535 | ||
2538 | if (unlikely(is_vm_hugetlb_page(vma))) | 2536 | if (unlikely(is_vm_hugetlb_page(vma))) |
2539 | return hugetlb_fault(mm, vma, address, write_access); | 2537 | return hugetlb_fault(mm, vma, address, write_access); |
2540 | 2538 | ||
2541 | pgd = pgd_offset(mm, address); | 2539 | pgd = pgd_offset(mm, address); |
2542 | pud = pud_alloc(mm, pgd, address); | 2540 | pud = pud_alloc(mm, pgd, address); |
2543 | if (!pud) | 2541 | if (!pud) |
2544 | return VM_FAULT_OOM; | 2542 | return VM_FAULT_OOM; |
2545 | pmd = pmd_alloc(mm, pud, address); | 2543 | pmd = pmd_alloc(mm, pud, address); |
2546 | if (!pmd) | 2544 | if (!pmd) |
2547 | return VM_FAULT_OOM; | 2545 | return VM_FAULT_OOM; |
2548 | pte = pte_alloc_map(mm, pmd, address); | 2546 | pte = pte_alloc_map(mm, pmd, address); |
2549 | if (!pte) | 2547 | if (!pte) |
2550 | return VM_FAULT_OOM; | 2548 | return VM_FAULT_OOM; |
2551 | 2549 | ||
2552 | return handle_pte_fault(mm, vma, address, pte, pmd, write_access); | 2550 | return handle_pte_fault(mm, vma, address, pte, pmd, write_access); |
2553 | } | 2551 | } |
2554 | 2552 | ||
2555 | #ifndef __PAGETABLE_PUD_FOLDED | 2553 | #ifndef __PAGETABLE_PUD_FOLDED |
2556 | /* | 2554 | /* |
2557 | * Allocate page upper directory. | 2555 | * Allocate page upper directory. |
2558 | * We've already handled the fast-path in-line. | 2556 | * We've already handled the fast-path in-line. |
2559 | */ | 2557 | */ |
2560 | int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) | 2558 | int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) |
2561 | { | 2559 | { |
2562 | pud_t *new = pud_alloc_one(mm, address); | 2560 | pud_t *new = pud_alloc_one(mm, address); |
2563 | if (!new) | 2561 | if (!new) |
2564 | return -ENOMEM; | 2562 | return -ENOMEM; |
2565 | 2563 | ||
2566 | spin_lock(&mm->page_table_lock); | 2564 | spin_lock(&mm->page_table_lock); |
2567 | if (pgd_present(*pgd)) /* Another has populated it */ | 2565 | if (pgd_present(*pgd)) /* Another has populated it */ |
2568 | pud_free(new); | 2566 | pud_free(new); |
2569 | else | 2567 | else |
2570 | pgd_populate(mm, pgd, new); | 2568 | pgd_populate(mm, pgd, new); |
2571 | spin_unlock(&mm->page_table_lock); | 2569 | spin_unlock(&mm->page_table_lock); |
2572 | return 0; | 2570 | return 0; |
2573 | } | 2571 | } |
2574 | #endif /* __PAGETABLE_PUD_FOLDED */ | 2572 | #endif /* __PAGETABLE_PUD_FOLDED */ |
2575 | 2573 | ||
2576 | #ifndef __PAGETABLE_PMD_FOLDED | 2574 | #ifndef __PAGETABLE_PMD_FOLDED |
2577 | /* | 2575 | /* |
2578 | * Allocate page middle directory. | 2576 | * Allocate page middle directory. |
2579 | * We've already handled the fast-path in-line. | 2577 | * We've already handled the fast-path in-line. |
2580 | */ | 2578 | */ |
2581 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | 2579 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) |
2582 | { | 2580 | { |
2583 | pmd_t *new = pmd_alloc_one(mm, address); | 2581 | pmd_t *new = pmd_alloc_one(mm, address); |
2584 | if (!new) | 2582 | if (!new) |
2585 | return -ENOMEM; | 2583 | return -ENOMEM; |
2586 | 2584 | ||
2587 | spin_lock(&mm->page_table_lock); | 2585 | spin_lock(&mm->page_table_lock); |
2588 | #ifndef __ARCH_HAS_4LEVEL_HACK | 2586 | #ifndef __ARCH_HAS_4LEVEL_HACK |
2589 | if (pud_present(*pud)) /* Another has populated it */ | 2587 | if (pud_present(*pud)) /* Another has populated it */ |
2590 | pmd_free(new); | 2588 | pmd_free(new); |
2591 | else | 2589 | else |
2592 | pud_populate(mm, pud, new); | 2590 | pud_populate(mm, pud, new); |
2593 | #else | 2591 | #else |
2594 | if (pgd_present(*pud)) /* Another has populated it */ | 2592 | if (pgd_present(*pud)) /* Another has populated it */ |
2595 | pmd_free(new); | 2593 | pmd_free(new); |
2596 | else | 2594 | else |
2597 | pgd_populate(mm, pud, new); | 2595 | pgd_populate(mm, pud, new); |
2598 | #endif /* __ARCH_HAS_4LEVEL_HACK */ | 2596 | #endif /* __ARCH_HAS_4LEVEL_HACK */ |
2599 | spin_unlock(&mm->page_table_lock); | 2597 | spin_unlock(&mm->page_table_lock); |
2600 | return 0; | 2598 | return 0; |
2601 | } | 2599 | } |
2602 | #endif /* __PAGETABLE_PMD_FOLDED */ | 2600 | #endif /* __PAGETABLE_PMD_FOLDED */ |
2603 | 2601 | ||
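__pud_alloc() and __pmd_alloc() both follow the same idiom: allocate the new table outside the lock, then re-check under mm->page_table_lock and free the allocation if another thread populated the entry first. A generic sketch of that pattern using pthreads; the struct and function names here are illustrative, not kernel interfaces:

    #include <pthread.h>
    #include <stdlib.h>

    struct dir {
            pthread_mutex_t lock;
            void *entry;                 /* NULL until somebody populates it */
    };

    /* Returns 0 on success, -1 if the allocation failed. */
    static int dir_populate(struct dir *d, size_t size)
    {
            /* allocate before taking the lock, as pud_alloc_one()/pmd_alloc_one() are */
            void *table = calloc(1, size);
            if (!table)
                    return -1;

            pthread_mutex_lock(&d->lock);
            if (d->entry)                /* another thread has populated it */
                    free(table);
            else
                    d->entry = table;
            pthread_mutex_unlock(&d->lock);
            return 0;
    }

    int main(void)
    {
            struct dir d = { PTHREAD_MUTEX_INITIALIZER, NULL };
            return dir_populate(&d, 64) ? 1 : 0;
    }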
2604 | int make_pages_present(unsigned long addr, unsigned long end) | 2602 | int make_pages_present(unsigned long addr, unsigned long end) |
2605 | { | 2603 | { |
2606 | int ret, len, write; | 2604 | int ret, len, write; |
2607 | struct vm_area_struct * vma; | 2605 | struct vm_area_struct * vma; |
2608 | 2606 | ||
2609 | vma = find_vma(current->mm, addr); | 2607 | vma = find_vma(current->mm, addr); |
2610 | if (!vma) | 2608 | if (!vma) |
2611 | return -1; | 2609 | return -1; |
2612 | write = (vma->vm_flags & VM_WRITE) != 0; | 2610 | write = (vma->vm_flags & VM_WRITE) != 0; |
2613 | BUG_ON(addr >= end); | 2611 | BUG_ON(addr >= end); |
2614 | BUG_ON(end > vma->vm_end); | 2612 | BUG_ON(end > vma->vm_end); |
2615 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | 2613 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; |
2616 | ret = get_user_pages(current, current->mm, addr, | 2614 | ret = get_user_pages(current, current->mm, addr, |
2617 | len, write, 0, NULL, NULL); | 2615 | len, write, 0, NULL, NULL); |
2618 | if (ret < 0) | 2616 | if (ret < 0) |
2619 | return ret; | 2617 | return ret; |
2620 | return ret == len ? 0 : -1; | 2618 | return ret == len ? 0 : -1; |
2621 | } | 2619 | } |
2622 | 2620 | ||
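The length computation in make_pages_present() counts the whole pages spanning [addr, end): the index one past the last touched page minus the index of the first one. A worked example of that arithmetic with assumed addresses and 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            unsigned long addr = 0x1000f00;   /* assumed: somewhere inside page 0x1000 */
            unsigned long end  = 0x1003001;   /* assumed: one byte into page 0x1003 */

            unsigned long len = DIV_ROUND_UP(end, PAGE_SIZE) - addr / PAGE_SIZE;

            printf("need to fault in %lu pages (0x%lx..0x%lx)\n",
                   len, addr / PAGE_SIZE, DIV_ROUND_UP(end, PAGE_SIZE) - 1);
            return 0;
    }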
2623 | /* | 2621 | /* |
2624 | * Map a vmalloc()-space virtual address to the physical page. | 2622 | * Map a vmalloc()-space virtual address to the physical page. |
2625 | */ | 2623 | */ |
2626 | struct page * vmalloc_to_page(void * vmalloc_addr) | 2624 | struct page * vmalloc_to_page(void * vmalloc_addr) |
2627 | { | 2625 | { |
2628 | unsigned long addr = (unsigned long) vmalloc_addr; | 2626 | unsigned long addr = (unsigned long) vmalloc_addr; |
2629 | struct page *page = NULL; | 2627 | struct page *page = NULL; |
2630 | pgd_t *pgd = pgd_offset_k(addr); | 2628 | pgd_t *pgd = pgd_offset_k(addr); |
2631 | pud_t *pud; | 2629 | pud_t *pud; |
2632 | pmd_t *pmd; | 2630 | pmd_t *pmd; |
2633 | pte_t *ptep, pte; | 2631 | pte_t *ptep, pte; |
2634 | 2632 | ||
2635 | if (!pgd_none(*pgd)) { | 2633 | if (!pgd_none(*pgd)) { |
2636 | pud = pud_offset(pgd, addr); | 2634 | pud = pud_offset(pgd, addr); |
2637 | if (!pud_none(*pud)) { | 2635 | if (!pud_none(*pud)) { |
2638 | pmd = pmd_offset(pud, addr); | 2636 | pmd = pmd_offset(pud, addr); |
2639 | if (!pmd_none(*pmd)) { | 2637 | if (!pmd_none(*pmd)) { |
2640 | ptep = pte_offset_map(pmd, addr); | 2638 | ptep = pte_offset_map(pmd, addr); |
2641 | pte = *ptep; | 2639 | pte = *ptep; |
2642 | if (pte_present(pte)) | 2640 | if (pte_present(pte)) |
2643 | page = pte_page(pte); | 2641 | page = pte_page(pte); |
2644 | pte_unmap(ptep); | 2642 | pte_unmap(ptep); |
2645 | } | 2643 | } |
2646 | } | 2644 | } |
2647 | } | 2645 | } |
2648 | return page; | 2646 | return page; |
2649 | } | 2647 | } |
2650 | 2648 | ||
2651 | EXPORT_SYMBOL(vmalloc_to_page); | 2649 | EXPORT_SYMBOL(vmalloc_to_page); |
2652 | 2650 | ||
2653 | /* | 2651 | /* |
2654 | * Map a vmalloc()-space virtual address to the physical page frame number. | 2652 | * Map a vmalloc()-space virtual address to the physical page frame number. |
2655 | */ | 2653 | */ |
2656 | unsigned long vmalloc_to_pfn(void * vmalloc_addr) | 2654 | unsigned long vmalloc_to_pfn(void * vmalloc_addr) |
2657 | { | 2655 | { |
2658 | return page_to_pfn(vmalloc_to_page(vmalloc_addr)); | 2656 | return page_to_pfn(vmalloc_to_page(vmalloc_addr)); |
2659 | } | 2657 | } |
2660 | 2658 | ||
2661 | EXPORT_SYMBOL(vmalloc_to_pfn); | 2659 | EXPORT_SYMBOL(vmalloc_to_pfn); |
2662 | 2660 | ||
2663 | #if !defined(__HAVE_ARCH_GATE_AREA) | 2661 | #if !defined(__HAVE_ARCH_GATE_AREA) |
2664 | 2662 | ||
2665 | #if defined(AT_SYSINFO_EHDR) | 2663 | #if defined(AT_SYSINFO_EHDR) |
2666 | static struct vm_area_struct gate_vma; | 2664 | static struct vm_area_struct gate_vma; |
2667 | 2665 | ||
2668 | static int __init gate_vma_init(void) | 2666 | static int __init gate_vma_init(void) |
2669 | { | 2667 | { |
2670 | gate_vma.vm_mm = NULL; | 2668 | gate_vma.vm_mm = NULL; |
2671 | gate_vma.vm_start = FIXADDR_USER_START; | 2669 | gate_vma.vm_start = FIXADDR_USER_START; |
2672 | gate_vma.vm_end = FIXADDR_USER_END; | 2670 | gate_vma.vm_end = FIXADDR_USER_END; |
2673 | gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; | 2671 | gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; |
2674 | gate_vma.vm_page_prot = __P101; | 2672 | gate_vma.vm_page_prot = __P101; |
2675 | /* | 2673 | /* |
2676 | * Make sure the vDSO gets into every core dump. | 2674 | * Make sure the vDSO gets into every core dump. |
2677 | * Dumping its contents makes post-mortem fully interpretable later | 2675 | * Dumping its contents makes post-mortem fully interpretable later |
2678 | * without matching up the same kernel and hardware config to see | 2676 | * without matching up the same kernel and hardware config to see |
2679 | * what PC values meant. | 2677 | * what PC values meant. |
2680 | */ | 2678 | */ |
2681 | gate_vma.vm_flags |= VM_ALWAYSDUMP; | 2679 | gate_vma.vm_flags |= VM_ALWAYSDUMP; |
2682 | return 0; | 2680 | return 0; |
2683 | } | 2681 | } |
2684 | __initcall(gate_vma_init); | 2682 | __initcall(gate_vma_init); |
2685 | #endif | 2683 | #endif |
2686 | 2684 | ||
2687 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | 2685 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) |
2688 | { | 2686 | { |
2689 | #ifdef AT_SYSINFO_EHDR | 2687 | #ifdef AT_SYSINFO_EHDR |
2690 | return &gate_vma; | 2688 | return &gate_vma; |
2691 | #else | 2689 | #else |
2692 | return NULL; | 2690 | return NULL; |
2693 | #endif | 2691 | #endif |
2694 | } | 2692 | } |
2695 | 2693 | ||
2696 | int in_gate_area_no_task(unsigned long addr) | 2694 | int in_gate_area_no_task(unsigned long addr) |
2697 | { | 2695 | { |
2698 | #ifdef AT_SYSINFO_EHDR | 2696 | #ifdef AT_SYSINFO_EHDR |
2699 | if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) | 2697 | if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) |
2700 | return 1; | 2698 | return 1; |
2701 | #endif | 2699 | #endif |
2702 | return 0; | 2700 | return 0; |
2703 | } | 2701 | } |
2704 | 2702 | ||
2705 | #endif /* __HAVE_ARCH_GATE_AREA */ | 2703 | #endif /* __HAVE_ARCH_GATE_AREA */ |
2706 | 2704 | ||
2707 | /* | 2705 | /* |
2708 | * Access another process' address space. | 2706 | * Access another process' address space. |
2709 | * Source/target buffer must be kernel space, | 2707 | * Source/target buffer must be kernel space, |
2710 | * Do not walk the page table directly, use get_user_pages | 2708 | * Do not walk the page table directly, use get_user_pages |
2711 | */ | 2709 | */ |
2712 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | 2710 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) |
2713 | { | 2711 | { |
2714 | struct mm_struct *mm; | 2712 | struct mm_struct *mm; |
2715 | struct vm_area_struct *vma; | 2713 | struct vm_area_struct *vma; |
2716 | struct page *page; | 2714 | struct page *page; |
2717 | void *old_buf = buf; | 2715 | void *old_buf = buf; |
2718 | 2716 | ||
2719 | mm = get_task_mm(tsk); | 2717 | mm = get_task_mm(tsk); |
2720 | if (!mm) | 2718 | if (!mm) |
2721 | return 0; | 2719 | return 0; |
2722 | 2720 | ||
2723 | down_read(&mm->mmap_sem); | 2721 | down_read(&mm->mmap_sem); |
2724 | /* ignore errors, just check how much was successfully transferred */ | 2722 | /* ignore errors, just check how much was successfully transferred */ |
2725 | while (len) { | 2723 | while (len) { |
2726 | int bytes, ret, offset; | 2724 | int bytes, ret, offset; |
2727 | void *maddr; | 2725 | void *maddr; |
2728 | 2726 | ||
2729 | ret = get_user_pages(tsk, mm, addr, 1, | 2727 | ret = get_user_pages(tsk, mm, addr, 1, |
2730 | write, 1, &page, &vma); | 2728 | write, 1, &page, &vma); |
2731 | if (ret <= 0) | 2729 | if (ret <= 0) |
2732 | break; | 2730 | break; |
2733 | 2731 | ||
2734 | bytes = len; | 2732 | bytes = len; |
2735 | offset = addr & (PAGE_SIZE-1); | 2733 | offset = addr & (PAGE_SIZE-1); |
2736 | if (bytes > PAGE_SIZE-offset) | 2734 | if (bytes > PAGE_SIZE-offset) |
2737 | bytes = PAGE_SIZE-offset; | 2735 | bytes = PAGE_SIZE-offset; |
2738 | 2736 | ||
2739 | maddr = kmap(page); | 2737 | maddr = kmap(page); |
2740 | if (write) { | 2738 | if (write) { |
2741 | copy_to_user_page(vma, page, addr, | 2739 | copy_to_user_page(vma, page, addr, |
2742 | maddr + offset, buf, bytes); | 2740 | maddr + offset, buf, bytes); |
2743 | set_page_dirty_lock(page); | 2741 | set_page_dirty_lock(page); |
2744 | } else { | 2742 | } else { |
2745 | copy_from_user_page(vma, page, addr, | 2743 | copy_from_user_page(vma, page, addr, |
2746 | buf, maddr + offset, bytes); | 2744 | buf, maddr + offset, bytes); |
2747 | } | 2745 | } |
2748 | kunmap(page); | 2746 | kunmap(page); |
2749 | page_cache_release(page); | 2747 | page_cache_release(page); |
2750 | len -= bytes; | 2748 | len -= bytes; |
2751 | buf += bytes; | 2749 | buf += bytes; |
2752 | addr += bytes; | 2750 | addr += bytes; |
2753 | } | 2751 | } |
2754 | up_read(&mm->mmap_sem); | 2752 | up_read(&mm->mmap_sem); |
2755 | mmput(mm); | 2753 | mmput(mm); |
2756 | 2754 | ||
2757 | return buf - old_buf; | 2755 | return buf - old_buf; |
2758 | } | 2756 | } |
2759 | 2757 | ||
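access_process_vm() is the helper behind ptrace() data transfers and reads of /proc/<pid>/mem, which is also the easiest way to exercise it from user space. A hedged sketch of the latter: the target pid and address are command-line inputs, and depending on kernel version the caller may also need to be ptrace-attached to the target for the read to be permitted.

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Read len bytes at addr in process pid via /proc/<pid>/mem. */
    static ssize_t read_other(pid_t pid, unsigned long addr, void *buf, size_t len)
    {
            char path[64];
            snprintf(path, sizeof(path), "/proc/%d/mem", pid);

            int fd = open(path, O_RDONLY);
            if (fd < 0)
                    return -1;

            /* pread() on /proc/<pid>/mem ends up in access_process_vm() */
            ssize_t n = pread(fd, buf, len, (off_t)addr);
            close(fd);
            return n;
    }

    int main(int argc, char **argv)
    {
            if (argc != 3) {
                    fprintf(stderr, "usage: %s <pid> <hex-addr>\n", argv[0]);
                    return 1;
            }
            unsigned char buf[16];
            ssize_t n = read_other(atoi(argv[1]), strtoul(argv[2], NULL, 16),
                                   buf, sizeof(buf));
            if (n < 0) { perror("read"); return 1; }
            for (ssize_t i = 0; i < n; i++)
                    printf("%02x ", (unsigned)buf[i]);
            printf("\n");
            return 0;
    }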