Commit 17d9f311eca13a42bf950198a358be1420d19c5f

Authored by Daniel Hellstrom
Committed by David S. Miller
1 parent 90d3ac15e5

SCHED_TTWU_QUEUE is not longer needed since sparc32 now implements IPI

Signed-off-by: Daniel Hellstrom <daniel@gaisler.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 2 changed files with 1 additions and 6 deletions Inline Diff

1 config ARCH 1 config ARCH
2 string 2 string
3 option env="ARCH" 3 option env="ARCH"
4 4
5 config KERNELVERSION 5 config KERNELVERSION
6 string 6 string
7 option env="KERNELVERSION" 7 option env="KERNELVERSION"
8 8
9 config DEFCONFIG_LIST 9 config DEFCONFIG_LIST
10 string 10 string
11 depends on !UML 11 depends on !UML
12 option defconfig_list 12 option defconfig_list
13 default "/lib/modules/$UNAME_RELEASE/.config" 13 default "/lib/modules/$UNAME_RELEASE/.config"
14 default "/etc/kernel-config" 14 default "/etc/kernel-config"
15 default "/boot/config-$UNAME_RELEASE" 15 default "/boot/config-$UNAME_RELEASE"
16 default "$ARCH_DEFCONFIG" 16 default "$ARCH_DEFCONFIG"
17 default "arch/$ARCH/defconfig" 17 default "arch/$ARCH/defconfig"
18 18
19 config CONSTRUCTORS 19 config CONSTRUCTORS
20 bool 20 bool
21 depends on !UML 21 depends on !UML
22 default y 22 default y
23 23
24 config HAVE_IRQ_WORK 24 config HAVE_IRQ_WORK
25 bool 25 bool
26 26
27 config IRQ_WORK 27 config IRQ_WORK
28 bool 28 bool
29 depends on HAVE_IRQ_WORK 29 depends on HAVE_IRQ_WORK
30 30
31 menu "General setup" 31 menu "General setup"
32 32
33 config EXPERIMENTAL 33 config EXPERIMENTAL
34 bool "Prompt for development and/or incomplete code/drivers" 34 bool "Prompt for development and/or incomplete code/drivers"
35 ---help--- 35 ---help---
36 Some of the various things that Linux supports (such as network 36 Some of the various things that Linux supports (such as network
37 drivers, file systems, network protocols, etc.) can be in a state 37 drivers, file systems, network protocols, etc.) can be in a state
38 of development where the functionality, stability, or the level of 38 of development where the functionality, stability, or the level of
39 testing is not yet high enough for general use. This is usually 39 testing is not yet high enough for general use. This is usually
40 known as the "alpha-test" phase among developers. If a feature is 40 known as the "alpha-test" phase among developers. If a feature is
41 currently in alpha-test, then the developers usually discourage 41 currently in alpha-test, then the developers usually discourage
42 uninformed widespread use of this feature by the general public to 42 uninformed widespread use of this feature by the general public to
43 avoid "Why doesn't this work?" type mail messages. However, active 43 avoid "Why doesn't this work?" type mail messages. However, active
44 testing and use of these systems is welcomed. Just be aware that it 44 testing and use of these systems is welcomed. Just be aware that it
45 may not meet the normal level of reliability or it may fail to work 45 may not meet the normal level of reliability or it may fail to work
46 in some special cases. Detailed bug reports from people familiar 46 in some special cases. Detailed bug reports from people familiar
47 with the kernel internals are usually welcomed by the developers 47 with the kernel internals are usually welcomed by the developers
48 (before submitting bug reports, please read the documents 48 (before submitting bug reports, please read the documents
49 <file:README>, <file:MAINTAINERS>, <file:REPORTING-BUGS>, 49 <file:README>, <file:MAINTAINERS>, <file:REPORTING-BUGS>,
50 <file:Documentation/BUG-HUNTING>, and 50 <file:Documentation/BUG-HUNTING>, and
51 <file:Documentation/oops-tracing.txt> in the kernel source). 51 <file:Documentation/oops-tracing.txt> in the kernel source).
52 52
53 This option will also make obsoleted drivers available. These are 53 This option will also make obsoleted drivers available. These are
54 drivers that have been replaced by something else, and/or are 54 drivers that have been replaced by something else, and/or are
55 scheduled to be removed in a future kernel release. 55 scheduled to be removed in a future kernel release.
56 56
57 Unless you intend to help test and develop a feature or driver that 57 Unless you intend to help test and develop a feature or driver that
58 falls into this category, or you have a situation that requires 58 falls into this category, or you have a situation that requires
59 using these features, you should probably say N here, which will 59 using these features, you should probably say N here, which will
60 cause the configurator to present you with fewer choices. If 60 cause the configurator to present you with fewer choices. If
61 you say Y here, you will be offered the choice of using features or 61 you say Y here, you will be offered the choice of using features or
62 drivers that are currently considered to be in the alpha-test phase. 62 drivers that are currently considered to be in the alpha-test phase.
63 63
64 config BROKEN 64 config BROKEN
65 bool 65 bool
66 66
67 config BROKEN_ON_SMP 67 config BROKEN_ON_SMP
68 bool 68 bool
69 depends on BROKEN || !SMP 69 depends on BROKEN || !SMP
70 default y 70 default y
71 71
72 config INIT_ENV_ARG_LIMIT 72 config INIT_ENV_ARG_LIMIT
73 int 73 int
74 default 32 if !UML 74 default 32 if !UML
75 default 128 if UML 75 default 128 if UML
76 help 76 help
77 Maximum of each of the number of arguments and environment 77 Maximum of each of the number of arguments and environment
78 variables passed to init from the kernel command line. 78 variables passed to init from the kernel command line.
79 79
80 80
81 config CROSS_COMPILE 81 config CROSS_COMPILE
82 string "Cross-compiler tool prefix" 82 string "Cross-compiler tool prefix"
83 help 83 help
84 Same as running 'make CROSS_COMPILE=prefix-' but stored for 84 Same as running 'make CROSS_COMPILE=prefix-' but stored for
85 default make runs in this kernel build directory. You don't 85 default make runs in this kernel build directory. You don't
86 need to set this unless you want the configured kernel build 86 need to set this unless you want the configured kernel build
87 directory to select the cross-compiler automatically. 87 directory to select the cross-compiler automatically.
88 88
89 config LOCALVERSION 89 config LOCALVERSION
90 string "Local version - append to kernel release" 90 string "Local version - append to kernel release"
91 help 91 help
92 Append an extra string to the end of your kernel version. 92 Append an extra string to the end of your kernel version.
93 This will show up when you type uname, for example. 93 This will show up when you type uname, for example.
94 The string you set here will be appended after the contents of 94 The string you set here will be appended after the contents of
95 any files with a filename matching localversion* in your 95 any files with a filename matching localversion* in your
96 object and source tree, in that order. Your total string can 96 object and source tree, in that order. Your total string can
97 be a maximum of 64 characters. 97 be a maximum of 64 characters.
98 98
99 config LOCALVERSION_AUTO 99 config LOCALVERSION_AUTO
100 bool "Automatically append version information to the version string" 100 bool "Automatically append version information to the version string"
101 default y 101 default y
102 help 102 help
103 This will try to automatically determine if the current tree is a 103 This will try to automatically determine if the current tree is a
104 release tree by looking for git tags that belong to the current 104 release tree by looking for git tags that belong to the current
105 top of tree revision. 105 top of tree revision.
106 106
107 A string of the format -gxxxxxxxx will be added to the localversion 107 A string of the format -gxxxxxxxx will be added to the localversion
108 if a git-based tree is found. The string generated by this will be 108 if a git-based tree is found. The string generated by this will be
109 appended after any matching localversion* files, and after the value 109 appended after any matching localversion* files, and after the value
110 set in CONFIG_LOCALVERSION. 110 set in CONFIG_LOCALVERSION.
111 111
112 (The actual string used here is the first eight characters produced 112 (The actual string used here is the first eight characters produced
113 by running the command: 113 by running the command:
114 114
115 $ git rev-parse --verify HEAD 115 $ git rev-parse --verify HEAD
116 116
117 which is done within the script "scripts/setlocalversion".) 117 which is done within the script "scripts/setlocalversion".)
118 118
119 config HAVE_KERNEL_GZIP 119 config HAVE_KERNEL_GZIP
120 bool 120 bool
121 121
122 config HAVE_KERNEL_BZIP2 122 config HAVE_KERNEL_BZIP2
123 bool 123 bool
124 124
125 config HAVE_KERNEL_LZMA 125 config HAVE_KERNEL_LZMA
126 bool 126 bool
127 127
128 config HAVE_KERNEL_XZ 128 config HAVE_KERNEL_XZ
129 bool 129 bool
130 130
131 config HAVE_KERNEL_LZO 131 config HAVE_KERNEL_LZO
132 bool 132 bool
133 133
134 choice 134 choice
135 prompt "Kernel compression mode" 135 prompt "Kernel compression mode"
136 default KERNEL_GZIP 136 default KERNEL_GZIP
137 depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO 137 depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO
138 help 138 help
139 The linux kernel is a kind of self-extracting executable. 139 The linux kernel is a kind of self-extracting executable.
140 Several compression algorithms are available, which differ 140 Several compression algorithms are available, which differ
141 in efficiency, compression and decompression speed. 141 in efficiency, compression and decompression speed.
142 Compression speed is only relevant when building a kernel. 142 Compression speed is only relevant when building a kernel.
143 Decompression speed is relevant at each boot. 143 Decompression speed is relevant at each boot.
144 144
145 If you have any problems with bzip2 or lzma compressed 145 If you have any problems with bzip2 or lzma compressed
146 kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older 146 kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older
147 version of this functionality (bzip2 only), for 2.4, was 147 version of this functionality (bzip2 only), for 2.4, was
148 supplied by Christian Ludwig) 148 supplied by Christian Ludwig)
149 149
150 High compression options are mostly useful for users, who 150 High compression options are mostly useful for users, who
151 are low on disk space (embedded systems), but for whom ram 151 are low on disk space (embedded systems), but for whom ram
152 size matters less. 152 size matters less.
153 153
154 If in doubt, select 'gzip' 154 If in doubt, select 'gzip'
155 155
156 config KERNEL_GZIP 156 config KERNEL_GZIP
157 bool "Gzip" 157 bool "Gzip"
158 depends on HAVE_KERNEL_GZIP 158 depends on HAVE_KERNEL_GZIP
159 help 159 help
160 The old and tried gzip compression. It provides a good balance 160 The old and tried gzip compression. It provides a good balance
161 between compression ratio and decompression speed. 161 between compression ratio and decompression speed.
162 162
163 config KERNEL_BZIP2 163 config KERNEL_BZIP2
164 bool "Bzip2" 164 bool "Bzip2"
165 depends on HAVE_KERNEL_BZIP2 165 depends on HAVE_KERNEL_BZIP2
166 help 166 help
167 Its compression ratio and speed is intermediate. 167 Its compression ratio and speed is intermediate.
168 Decompression speed is slowest among the three. The kernel 168 Decompression speed is slowest among the three. The kernel
169 size is about 10% smaller with bzip2, in comparison to gzip. 169 size is about 10% smaller with bzip2, in comparison to gzip.
170 Bzip2 uses a large amount of memory. For modern kernels you 170 Bzip2 uses a large amount of memory. For modern kernels you
171 will need at least 8MB RAM or more for booting. 171 will need at least 8MB RAM or more for booting.
172 172
173 config KERNEL_LZMA 173 config KERNEL_LZMA
174 bool "LZMA" 174 bool "LZMA"
175 depends on HAVE_KERNEL_LZMA 175 depends on HAVE_KERNEL_LZMA
176 help 176 help
177 The most recent compression algorithm. 177 The most recent compression algorithm.
178 Its ratio is best, decompression speed is between the other 178 Its ratio is best, decompression speed is between the other
179 two. Compression is slowest. The kernel size is about 33% 179 two. Compression is slowest. The kernel size is about 33%
180 smaller with LZMA in comparison to gzip. 180 smaller with LZMA in comparison to gzip.
181 181
182 config KERNEL_XZ 182 config KERNEL_XZ
183 bool "XZ" 183 bool "XZ"
184 depends on HAVE_KERNEL_XZ 184 depends on HAVE_KERNEL_XZ
185 help 185 help
186 XZ uses the LZMA2 algorithm and instruction set specific 186 XZ uses the LZMA2 algorithm and instruction set specific
187 BCJ filters which can improve compression ratio of executable 187 BCJ filters which can improve compression ratio of executable
188 code. The size of the kernel is about 30% smaller with XZ in 188 code. The size of the kernel is about 30% smaller with XZ in
189 comparison to gzip. On architectures for which there is a BCJ 189 comparison to gzip. On architectures for which there is a BCJ
190 filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ 190 filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ
191 will create a few percent smaller kernel than plain LZMA. 191 will create a few percent smaller kernel than plain LZMA.
192 192
193 The speed is about the same as with LZMA: The decompression 193 The speed is about the same as with LZMA: The decompression
194 speed of XZ is better than that of bzip2 but worse than gzip 194 speed of XZ is better than that of bzip2 but worse than gzip
195 and LZO. Compression is slow. 195 and LZO. Compression is slow.
196 196
197 config KERNEL_LZO 197 config KERNEL_LZO
198 bool "LZO" 198 bool "LZO"
199 depends on HAVE_KERNEL_LZO 199 depends on HAVE_KERNEL_LZO
200 help 200 help
201 Its compression ratio is the poorest among the 4. The kernel 201 Its compression ratio is the poorest among the 4. The kernel
202 size is about 10% bigger than gzip; however its speed 202 size is about 10% bigger than gzip; however its speed
203 (both compression and decompression) is the fastest. 203 (both compression and decompression) is the fastest.
204 204
205 endchoice 205 endchoice
206 206
207 config SWAP 207 config SWAP
208 bool "Support for paging of anonymous memory (swap)" 208 bool "Support for paging of anonymous memory (swap)"
209 depends on MMU && BLOCK 209 depends on MMU && BLOCK
210 default y 210 default y
211 help 211 help
212 This option allows you to choose whether you want to have support 212 This option allows you to choose whether you want to have support
213 for so called swap devices or swap files in your kernel that are 213 for so called swap devices or swap files in your kernel that are
214 used to provide more virtual memory than the actual RAM present 214 used to provide more virtual memory than the actual RAM present
215 in your computer. If unsure say Y. 215 in your computer. If unsure say Y.
216 216
217 config SYSVIPC 217 config SYSVIPC
218 bool "System V IPC" 218 bool "System V IPC"
219 ---help--- 219 ---help---
220 Inter Process Communication is a suite of library functions and 220 Inter Process Communication is a suite of library functions and
221 system calls which let processes (running programs) synchronize and 221 system calls which let processes (running programs) synchronize and
222 exchange information. It is generally considered to be a good thing, 222 exchange information. It is generally considered to be a good thing,
223 and some programs won't run unless you say Y here. In particular, if 223 and some programs won't run unless you say Y here. In particular, if
224 you want to run the DOS emulator dosemu under Linux (read the 224 you want to run the DOS emulator dosemu under Linux (read the
225 DOSEMU-HOWTO, available from <http://www.tldp.org/docs.html#howto>), 225 DOSEMU-HOWTO, available from <http://www.tldp.org/docs.html#howto>),
226 you'll need to say Y here. 226 you'll need to say Y here.
227 227
228 You can find documentation about IPC with "info ipc" and also in 228 You can find documentation about IPC with "info ipc" and also in
229 section 6.4 of the Linux Programmer's Guide, available from 229 section 6.4 of the Linux Programmer's Guide, available from
230 <http://www.tldp.org/guides.html>. 230 <http://www.tldp.org/guides.html>.
231 231
232 config SYSVIPC_SYSCTL 232 config SYSVIPC_SYSCTL
233 bool 233 bool
234 depends on SYSVIPC 234 depends on SYSVIPC
235 depends on SYSCTL 235 depends on SYSCTL
236 default y 236 default y
237 237
238 config POSIX_MQUEUE 238 config POSIX_MQUEUE
239 bool "POSIX Message Queues" 239 bool "POSIX Message Queues"
240 depends on NET && EXPERIMENTAL 240 depends on NET && EXPERIMENTAL
241 ---help--- 241 ---help---
242 POSIX variant of message queues is a part of IPC. In POSIX message 242 POSIX variant of message queues is a part of IPC. In POSIX message
243 queues every message has a priority which decides about succession 243 queues every message has a priority which decides about succession
244 of receiving it by a process. If you want to compile and run 244 of receiving it by a process. If you want to compile and run
245 programs written e.g. for Solaris with use of its POSIX message 245 programs written e.g. for Solaris with use of its POSIX message
246 queues (functions mq_*) say Y here. 246 queues (functions mq_*) say Y here.
247 247
248 POSIX message queues are visible as a filesystem called 'mqueue' 248 POSIX message queues are visible as a filesystem called 'mqueue'
249 and can be mounted somewhere if you want to do filesystem 249 and can be mounted somewhere if you want to do filesystem
250 operations on message queues. 250 operations on message queues.
251 251
252 If unsure, say Y. 252 If unsure, say Y.
253 253
254 config POSIX_MQUEUE_SYSCTL 254 config POSIX_MQUEUE_SYSCTL
255 bool 255 bool
256 depends on POSIX_MQUEUE 256 depends on POSIX_MQUEUE
257 depends on SYSCTL 257 depends on SYSCTL
258 default y 258 default y
259 259
260 config BSD_PROCESS_ACCT 260 config BSD_PROCESS_ACCT
261 bool "BSD Process Accounting" 261 bool "BSD Process Accounting"
262 help 262 help
263 If you say Y here, a user level program will be able to instruct the 263 If you say Y here, a user level program will be able to instruct the
264 kernel (via a special system call) to write process accounting 264 kernel (via a special system call) to write process accounting
265 information to a file: whenever a process exits, information about 265 information to a file: whenever a process exits, information about
266 that process will be appended to the file by the kernel. The 266 that process will be appended to the file by the kernel. The
267 information includes things such as creation time, owning user, 267 information includes things such as creation time, owning user,
268 command name, memory usage, controlling terminal etc. (the complete 268 command name, memory usage, controlling terminal etc. (the complete
269 list is in the struct acct in <file:include/linux/acct.h>). It is 269 list is in the struct acct in <file:include/linux/acct.h>). It is
270 up to the user level program to do useful things with this 270 up to the user level program to do useful things with this
271 information. This is generally a good idea, so say Y. 271 information. This is generally a good idea, so say Y.
272 272
273 config BSD_PROCESS_ACCT_V3 273 config BSD_PROCESS_ACCT_V3
274 bool "BSD Process Accounting version 3 file format" 274 bool "BSD Process Accounting version 3 file format"
275 depends on BSD_PROCESS_ACCT 275 depends on BSD_PROCESS_ACCT
276 default n 276 default n
277 help 277 help
278 If you say Y here, the process accounting information is written 278 If you say Y here, the process accounting information is written
279 in a new file format that also logs the process IDs of each 279 in a new file format that also logs the process IDs of each
280 process and it's parent. Note that this file format is incompatible 280 process and it's parent. Note that this file format is incompatible
281 with previous v0/v1/v2 file formats, so you will need updated tools 281 with previous v0/v1/v2 file formats, so you will need updated tools
282 for processing it. A preliminary version of these tools is available 282 for processing it. A preliminary version of these tools is available
283 at <http://www.gnu.org/software/acct/>. 283 at <http://www.gnu.org/software/acct/>.
284 284
285 config FHANDLE 285 config FHANDLE
286 bool "open by fhandle syscalls" 286 bool "open by fhandle syscalls"
287 select EXPORTFS 287 select EXPORTFS
288 help 288 help
289 If you say Y here, a user level program will be able to map 289 If you say Y here, a user level program will be able to map
290 file names to handle and then later use the handle for 290 file names to handle and then later use the handle for
291 different file system operations. This is useful in implementing 291 different file system operations. This is useful in implementing
292 userspace file servers, which now track files using handles instead 292 userspace file servers, which now track files using handles instead
293 of names. The handle would remain the same even if file names 293 of names. The handle would remain the same even if file names
294 get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) 294 get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
295 syscalls. 295 syscalls.
296 296
297 config TASKSTATS 297 config TASKSTATS
298 bool "Export task/process statistics through netlink (EXPERIMENTAL)" 298 bool "Export task/process statistics through netlink (EXPERIMENTAL)"
299 depends on NET 299 depends on NET
300 default n 300 default n
301 help 301 help
302 Export selected statistics for tasks/processes through the 302 Export selected statistics for tasks/processes through the
303 generic netlink interface. Unlike BSD process accounting, the 303 generic netlink interface. Unlike BSD process accounting, the
304 statistics are available during the lifetime of tasks/processes as 304 statistics are available during the lifetime of tasks/processes as
305 responses to commands. Like BSD accounting, they are sent to user 305 responses to commands. Like BSD accounting, they are sent to user
306 space on task exit. 306 space on task exit.
307 307
308 Say N if unsure. 308 Say N if unsure.
309 309
310 config TASK_DELAY_ACCT 310 config TASK_DELAY_ACCT
311 bool "Enable per-task delay accounting (EXPERIMENTAL)" 311 bool "Enable per-task delay accounting (EXPERIMENTAL)"
312 depends on TASKSTATS 312 depends on TASKSTATS
313 help 313 help
314 Collect information on time spent by a task waiting for system 314 Collect information on time spent by a task waiting for system
315 resources like cpu, synchronous block I/O completion and swapping 315 resources like cpu, synchronous block I/O completion and swapping
316 in pages. Such statistics can help in setting a task's priorities 316 in pages. Such statistics can help in setting a task's priorities
317 relative to other tasks for cpu, io, rss limits etc. 317 relative to other tasks for cpu, io, rss limits etc.
318 318
319 Say N if unsure. 319 Say N if unsure.
320 320
321 config TASK_XACCT 321 config TASK_XACCT
322 bool "Enable extended accounting over taskstats (EXPERIMENTAL)" 322 bool "Enable extended accounting over taskstats (EXPERIMENTAL)"
323 depends on TASKSTATS 323 depends on TASKSTATS
324 help 324 help
325 Collect extended task accounting data and send the data 325 Collect extended task accounting data and send the data
326 to userland for processing over the taskstats interface. 326 to userland for processing over the taskstats interface.
327 327
328 Say N if unsure. 328 Say N if unsure.
329 329
330 config TASK_IO_ACCOUNTING 330 config TASK_IO_ACCOUNTING
331 bool "Enable per-task storage I/O accounting (EXPERIMENTAL)" 331 bool "Enable per-task storage I/O accounting (EXPERIMENTAL)"
332 depends on TASK_XACCT 332 depends on TASK_XACCT
333 help 333 help
334 Collect information on the number of bytes of storage I/O which this 334 Collect information on the number of bytes of storage I/O which this
335 task has caused. 335 task has caused.
336 336
337 Say N if unsure. 337 Say N if unsure.
338 338
339 config AUDIT 339 config AUDIT
340 bool "Auditing support" 340 bool "Auditing support"
341 depends on NET 341 depends on NET
342 help 342 help
343 Enable auditing infrastructure that can be used with another 343 Enable auditing infrastructure that can be used with another
344 kernel subsystem, such as SELinux (which requires this for 344 kernel subsystem, such as SELinux (which requires this for
345 logging of avc messages output). Does not do system-call 345 logging of avc messages output). Does not do system-call
346 auditing without CONFIG_AUDITSYSCALL. 346 auditing without CONFIG_AUDITSYSCALL.
347 347
348 config AUDITSYSCALL 348 config AUDITSYSCALL
349 bool "Enable system-call auditing support" 349 bool "Enable system-call auditing support"
350 depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH) 350 depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH)
351 default y if SECURITY_SELINUX 351 default y if SECURITY_SELINUX
352 help 352 help
353 Enable low-overhead system-call auditing infrastructure that 353 Enable low-overhead system-call auditing infrastructure that
354 can be used independently or with another kernel subsystem, 354 can be used independently or with another kernel subsystem,
355 such as SELinux. 355 such as SELinux.
356 356
357 config AUDIT_WATCH 357 config AUDIT_WATCH
358 def_bool y 358 def_bool y
359 depends on AUDITSYSCALL 359 depends on AUDITSYSCALL
360 select FSNOTIFY 360 select FSNOTIFY
361 361
362 config AUDIT_TREE 362 config AUDIT_TREE
363 def_bool y 363 def_bool y
364 depends on AUDITSYSCALL 364 depends on AUDITSYSCALL
365 select FSNOTIFY 365 select FSNOTIFY
366 366
367 source "kernel/irq/Kconfig" 367 source "kernel/irq/Kconfig"
368 368
369 menu "RCU Subsystem" 369 menu "RCU Subsystem"
370 370
371 choice 371 choice
372 prompt "RCU Implementation" 372 prompt "RCU Implementation"
373 default TREE_RCU 373 default TREE_RCU
374 374
375 config TREE_RCU 375 config TREE_RCU
376 bool "Tree-based hierarchical RCU" 376 bool "Tree-based hierarchical RCU"
377 depends on !PREEMPT && SMP 377 depends on !PREEMPT && SMP
378 help 378 help
379 This option selects the RCU implementation that is 379 This option selects the RCU implementation that is
380 designed for very large SMP system with hundreds or 380 designed for very large SMP system with hundreds or
381 thousands of CPUs. It also scales down nicely to 381 thousands of CPUs. It also scales down nicely to
382 smaller systems. 382 smaller systems.
383 383
384 config TREE_PREEMPT_RCU 384 config TREE_PREEMPT_RCU
385 bool "Preemptible tree-based hierarchical RCU" 385 bool "Preemptible tree-based hierarchical RCU"
386 depends on PREEMPT 386 depends on PREEMPT
387 help 387 help
388 This option selects the RCU implementation that is 388 This option selects the RCU implementation that is
389 designed for very large SMP systems with hundreds or 389 designed for very large SMP systems with hundreds or
390 thousands of CPUs, but for which real-time response 390 thousands of CPUs, but for which real-time response
391 is also required. It also scales down nicely to 391 is also required. It also scales down nicely to
392 smaller systems. 392 smaller systems.
393 393
394 config TINY_RCU 394 config TINY_RCU
395 bool "UP-only small-memory-footprint RCU" 395 bool "UP-only small-memory-footprint RCU"
396 depends on !SMP 396 depends on !SMP
397 help 397 help
398 This option selects the RCU implementation that is 398 This option selects the RCU implementation that is
399 designed for UP systems from which real-time response 399 designed for UP systems from which real-time response
400 is not required. This option greatly reduces the 400 is not required. This option greatly reduces the
401 memory footprint of RCU. 401 memory footprint of RCU.
402 402
403 config TINY_PREEMPT_RCU 403 config TINY_PREEMPT_RCU
404 bool "Preemptible UP-only small-memory-footprint RCU" 404 bool "Preemptible UP-only small-memory-footprint RCU"
405 depends on !SMP && PREEMPT 405 depends on !SMP && PREEMPT
406 help 406 help
407 This option selects the RCU implementation that is designed 407 This option selects the RCU implementation that is designed
408 for real-time UP systems. This option greatly reduces the 408 for real-time UP systems. This option greatly reduces the
409 memory footprint of RCU. 409 memory footprint of RCU.
410 410
411 endchoice 411 endchoice
412 412
413 config PREEMPT_RCU 413 config PREEMPT_RCU
414 def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU ) 414 def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU )
415 help 415 help
416 This option enables preemptible-RCU code that is common between 416 This option enables preemptible-RCU code that is common between
417 the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. 417 the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
418 418
419 config RCU_TRACE 419 config RCU_TRACE
420 bool "Enable tracing for RCU" 420 bool "Enable tracing for RCU"
421 help 421 help
422 This option provides tracing in RCU which presents stats 422 This option provides tracing in RCU which presents stats
423 in debugfs for debugging RCU implementation. 423 in debugfs for debugging RCU implementation.
424 424
425 Say Y here if you want to enable RCU tracing 425 Say Y here if you want to enable RCU tracing
426 Say N if you are unsure. 426 Say N if you are unsure.
427 427
428 config RCU_FANOUT 428 config RCU_FANOUT
429 int "Tree-based hierarchical RCU fanout value" 429 int "Tree-based hierarchical RCU fanout value"
430 range 2 64 if 64BIT 430 range 2 64 if 64BIT
431 range 2 32 if !64BIT 431 range 2 32 if !64BIT
432 depends on TREE_RCU || TREE_PREEMPT_RCU 432 depends on TREE_RCU || TREE_PREEMPT_RCU
433 default 64 if 64BIT 433 default 64 if 64BIT
434 default 32 if !64BIT 434 default 32 if !64BIT
435 help 435 help
436 This option controls the fanout of hierarchical implementations 436 This option controls the fanout of hierarchical implementations
437 of RCU, allowing RCU to work efficiently on machines with 437 of RCU, allowing RCU to work efficiently on machines with
438 large numbers of CPUs. This value must be at least the fourth 438 large numbers of CPUs. This value must be at least the fourth
439 root of NR_CPUS, which allows NR_CPUS to be insanely large. 439 root of NR_CPUS, which allows NR_CPUS to be insanely large.
440 The default value of RCU_FANOUT should be used for production 440 The default value of RCU_FANOUT should be used for production
441 systems, but if you are stress-testing the RCU implementation 441 systems, but if you are stress-testing the RCU implementation
442 itself, small RCU_FANOUT values allow you to test large-system 442 itself, small RCU_FANOUT values allow you to test large-system
443 code paths on small(er) systems. 443 code paths on small(er) systems.
444 444
445 Select a specific number if testing RCU itself. 445 Select a specific number if testing RCU itself.
446 Take the default if unsure. 446 Take the default if unsure.
447 447
448 config RCU_FANOUT_EXACT 448 config RCU_FANOUT_EXACT
449 bool "Disable tree-based hierarchical RCU auto-balancing" 449 bool "Disable tree-based hierarchical RCU auto-balancing"
450 depends on TREE_RCU || TREE_PREEMPT_RCU 450 depends on TREE_RCU || TREE_PREEMPT_RCU
451 default n 451 default n
452 help 452 help
453 This option forces use of the exact RCU_FANOUT value specified, 453 This option forces use of the exact RCU_FANOUT value specified,
454 regardless of imbalances in the hierarchy. This is useful for 454 regardless of imbalances in the hierarchy. This is useful for
455 testing RCU itself, and might one day be useful on systems with 455 testing RCU itself, and might one day be useful on systems with
456 strong NUMA behavior. 456 strong NUMA behavior.
457 457
458 Without RCU_FANOUT_EXACT, the code will balance the hierarchy. 458 Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
459 459
460 Say N if unsure. 460 Say N if unsure.
461 461
462 config RCU_FAST_NO_HZ 462 config RCU_FAST_NO_HZ
463 bool "Accelerate last non-dyntick-idle CPU's grace periods" 463 bool "Accelerate last non-dyntick-idle CPU's grace periods"
464 depends on TREE_RCU && NO_HZ && SMP 464 depends on TREE_RCU && NO_HZ && SMP
465 default n 465 default n
466 help 466 help
467 This option causes RCU to attempt to accelerate grace periods 467 This option causes RCU to attempt to accelerate grace periods
468 in order to allow the final CPU to enter dynticks-idle state 468 in order to allow the final CPU to enter dynticks-idle state
469 more quickly. On the other hand, this option increases the 469 more quickly. On the other hand, this option increases the
470 overhead of the dynticks-idle checking, particularly on systems 470 overhead of the dynticks-idle checking, particularly on systems
471 with large numbers of CPUs. 471 with large numbers of CPUs.
472 472
473 Say Y if energy efficiency is critically important, particularly 473 Say Y if energy efficiency is critically important, particularly
474 if you have relatively few CPUs. 474 if you have relatively few CPUs.
475 475
476 Say N if you are unsure. 476 Say N if you are unsure.
477 477
478 config TREE_RCU_TRACE 478 config TREE_RCU_TRACE
479 def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU ) 479 def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU )
480 select DEBUG_FS 480 select DEBUG_FS
481 help 481 help
482 This option provides tracing for the TREE_RCU and 482 This option provides tracing for the TREE_RCU and
483 TREE_PREEMPT_RCU implementations, permitting Makefile to 483 TREE_PREEMPT_RCU implementations, permitting Makefile to
484 trivially select kernel/rcutree_trace.c. 484 trivially select kernel/rcutree_trace.c.
485 485
486 config RCU_BOOST 486 config RCU_BOOST
487 bool "Enable RCU priority boosting" 487 bool "Enable RCU priority boosting"
488 depends on RT_MUTEXES && TINY_PREEMPT_RCU 488 depends on RT_MUTEXES && TINY_PREEMPT_RCU
489 default n 489 default n
490 help 490 help
491 This option boosts the priority of preempted RCU readers that 491 This option boosts the priority of preempted RCU readers that
492 block the current preemptible RCU grace period for too long. 492 block the current preemptible RCU grace period for too long.
493 This option also prevents heavy loads from blocking RCU 493 This option also prevents heavy loads from blocking RCU
494 callback invocation for all flavors of RCU. 494 callback invocation for all flavors of RCU.
495 495
496 Say Y here if you are working with real-time apps or heavy loads 496 Say Y here if you are working with real-time apps or heavy loads
497 Say N here if you are unsure. 497 Say N here if you are unsure.
498 498
499 config RCU_BOOST_PRIO 499 config RCU_BOOST_PRIO
500 int "Real-time priority to boost RCU readers to" 500 int "Real-time priority to boost RCU readers to"
501 range 1 99 501 range 1 99
502 depends on RCU_BOOST 502 depends on RCU_BOOST
503 default 1 503 default 1
504 help 504 help
505 This option specifies the real-time priority to which preempted 505 This option specifies the real-time priority to which preempted
506 RCU readers are to be boosted. If you are working with CPU-bound 506 RCU readers are to be boosted. If you are working with CPU-bound
507 real-time applications, you should specify a priority higher then 507 real-time applications, you should specify a priority higher then
508 the highest-priority CPU-bound application. 508 the highest-priority CPU-bound application.
509 509
510 Specify the real-time priority, or take the default if unsure. 510 Specify the real-time priority, or take the default if unsure.
511 511
512 config RCU_BOOST_DELAY 512 config RCU_BOOST_DELAY
513 int "Milliseconds to delay boosting after RCU grace-period start" 513 int "Milliseconds to delay boosting after RCU grace-period start"
514 range 0 3000 514 range 0 3000
515 depends on RCU_BOOST 515 depends on RCU_BOOST
516 default 500 516 default 500
517 help 517 help
518 This option specifies the time to wait after the beginning of 518 This option specifies the time to wait after the beginning of
519 a given grace period before priority-boosting preempted RCU 519 a given grace period before priority-boosting preempted RCU
520 readers blocking that grace period. Note that any RCU reader 520 readers blocking that grace period. Note that any RCU reader
521 blocking an expedited RCU grace period is boosted immediately. 521 blocking an expedited RCU grace period is boosted immediately.
522 522
523 Accept the default if unsure. 523 Accept the default if unsure.
524 524
525 endmenu # "RCU Subsystem" 525 endmenu # "RCU Subsystem"
526 526
527 config IKCONFIG 527 config IKCONFIG
528 tristate "Kernel .config support" 528 tristate "Kernel .config support"
529 ---help--- 529 ---help---
530 This option enables the complete Linux kernel ".config" file 530 This option enables the complete Linux kernel ".config" file
531 contents to be saved in the kernel. It provides documentation 531 contents to be saved in the kernel. It provides documentation
532 of which kernel options are used in a running kernel or in an 532 of which kernel options are used in a running kernel or in an
533 on-disk kernel. This information can be extracted from the kernel 533 on-disk kernel. This information can be extracted from the kernel
534 image file with the script scripts/extract-ikconfig and used as 534 image file with the script scripts/extract-ikconfig and used as
535 input to rebuild the current kernel or to build another kernel. 535 input to rebuild the current kernel or to build another kernel.
536 It can also be extracted from a running kernel by reading 536 It can also be extracted from a running kernel by reading
537 /proc/config.gz if enabled (below). 537 /proc/config.gz if enabled (below).
538 538
539 config IKCONFIG_PROC 539 config IKCONFIG_PROC
540 bool "Enable access to .config through /proc/config.gz" 540 bool "Enable access to .config through /proc/config.gz"
541 depends on IKCONFIG && PROC_FS 541 depends on IKCONFIG && PROC_FS
542 ---help--- 542 ---help---
543 This option enables access to the kernel configuration file 543 This option enables access to the kernel configuration file
544 through /proc/config.gz. 544 through /proc/config.gz.
545 545
546 config LOG_BUF_SHIFT 546 config LOG_BUF_SHIFT
547 int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" 547 int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
548 range 12 21 548 range 12 21
549 default 17 549 default 17
550 help 550 help
551 Select kernel log buffer size as a power of 2. 551 Select kernel log buffer size as a power of 2.
552 Examples: 552 Examples:
553 17 => 128 KB 553 17 => 128 KB
554 16 => 64 KB 554 16 => 64 KB
555 15 => 32 KB 555 15 => 32 KB
556 14 => 16 KB 556 14 => 16 KB
557 13 => 8 KB 557 13 => 8 KB
558 12 => 4 KB 558 12 => 4 KB
559 559
560 # 560 #
561 # Architectures with an unreliable sched_clock() should select this: 561 # Architectures with an unreliable sched_clock() should select this:
562 # 562 #
563 config HAVE_UNSTABLE_SCHED_CLOCK 563 config HAVE_UNSTABLE_SCHED_CLOCK
564 bool 564 bool
565 565
566 menuconfig CGROUPS 566 menuconfig CGROUPS
567 boolean "Control Group support" 567 boolean "Control Group support"
568 depends on EVENTFD 568 depends on EVENTFD
569 help 569 help
570 This option adds support for grouping sets of processes together, for 570 This option adds support for grouping sets of processes together, for
571 use with process control subsystems such as Cpusets, CFS, memory 571 use with process control subsystems such as Cpusets, CFS, memory
572 controls or device isolation. 572 controls or device isolation.
573 See 573 See
574 - Documentation/scheduler/sched-design-CFS.txt (CFS) 574 - Documentation/scheduler/sched-design-CFS.txt (CFS)
575 - Documentation/cgroups/ (features for grouping, isolation 575 - Documentation/cgroups/ (features for grouping, isolation
576 and resource control) 576 and resource control)
577 577
578 Say N if unsure. 578 Say N if unsure.
579 579
580 if CGROUPS 580 if CGROUPS
581 581
582 config CGROUP_DEBUG 582 config CGROUP_DEBUG
583 bool "Example debug cgroup subsystem" 583 bool "Example debug cgroup subsystem"
584 default n 584 default n
585 help 585 help
586 This option enables a simple cgroup subsystem that 586 This option enables a simple cgroup subsystem that
587 exports useful debugging information about the cgroups 587 exports useful debugging information about the cgroups
588 framework. 588 framework.
589 589
590 Say N if unsure. 590 Say N if unsure.
591 591
592 config CGROUP_NS 592 config CGROUP_NS
593 bool "Namespace cgroup subsystem" 593 bool "Namespace cgroup subsystem"
594 help 594 help
595 Provides a simple namespace cgroup subsystem to 595 Provides a simple namespace cgroup subsystem to
596 provide hierarchical naming of sets of namespaces, 596 provide hierarchical naming of sets of namespaces,
597 for instance virtual servers and checkpoint/restart 597 for instance virtual servers and checkpoint/restart
598 jobs. 598 jobs.
599 599
600 config CGROUP_FREEZER 600 config CGROUP_FREEZER
601 bool "Freezer cgroup subsystem" 601 bool "Freezer cgroup subsystem"
602 help 602 help
603 Provides a way to freeze and unfreeze all tasks in a 603 Provides a way to freeze and unfreeze all tasks in a
604 cgroup. 604 cgroup.
605 605
606 config CGROUP_DEVICE 606 config CGROUP_DEVICE
607 bool "Device controller for cgroups" 607 bool "Device controller for cgroups"
608 help 608 help
609 Provides a cgroup implementing whitelists for devices which 609 Provides a cgroup implementing whitelists for devices which
610 a process in the cgroup can mknod or open. 610 a process in the cgroup can mknod or open.
611 611
612 config CPUSETS 612 config CPUSETS
613 bool "Cpuset support" 613 bool "Cpuset support"
614 help 614 help
615 This option will let you create and manage CPUSETs which 615 This option will let you create and manage CPUSETs which
616 allow dynamically partitioning a system into sets of CPUs and 616 allow dynamically partitioning a system into sets of CPUs and
617 Memory Nodes and assigning tasks to run only within those sets. 617 Memory Nodes and assigning tasks to run only within those sets.
618 This is primarily useful on large SMP or NUMA systems. 618 This is primarily useful on large SMP or NUMA systems.
619 619
620 Say N if unsure. 620 Say N if unsure.
621 621
622 config PROC_PID_CPUSET 622 config PROC_PID_CPUSET
623 bool "Include legacy /proc/<pid>/cpuset file" 623 bool "Include legacy /proc/<pid>/cpuset file"
624 depends on CPUSETS 624 depends on CPUSETS
625 default y 625 default y
626 626
627 config CGROUP_CPUACCT 627 config CGROUP_CPUACCT
628 bool "Simple CPU accounting cgroup subsystem" 628 bool "Simple CPU accounting cgroup subsystem"
629 help 629 help
630 Provides a simple Resource Controller for monitoring the 630 Provides a simple Resource Controller for monitoring the
631 total CPU consumed by the tasks in a cgroup. 631 total CPU consumed by the tasks in a cgroup.
632 632
633 config RESOURCE_COUNTERS 633 config RESOURCE_COUNTERS
634 bool "Resource counters" 634 bool "Resource counters"
635 help 635 help
636 This option enables controller independent resource accounting 636 This option enables controller independent resource accounting
637 infrastructure that works with cgroups. 637 infrastructure that works with cgroups.
638 638
639 config CGROUP_MEM_RES_CTLR 639 config CGROUP_MEM_RES_CTLR
640 bool "Memory Resource Controller for Control Groups" 640 bool "Memory Resource Controller for Control Groups"
641 depends on RESOURCE_COUNTERS 641 depends on RESOURCE_COUNTERS
642 select MM_OWNER 642 select MM_OWNER
643 help 643 help
644 Provides a memory resource controller that manages both anonymous 644 Provides a memory resource controller that manages both anonymous
645 memory and page cache. (See Documentation/cgroups/memory.txt) 645 memory and page cache. (See Documentation/cgroups/memory.txt)
646 646
647 Note that setting this option increases fixed memory overhead 647 Note that setting this option increases fixed memory overhead
648 associated with each page of memory in the system. By this, 648 associated with each page of memory in the system. By this,
649 20(40)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory 649 20(40)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory
650 usage tracking struct at boot. Total amount of this is printed out 650 usage tracking struct at boot. Total amount of this is printed out
651 at boot. 651 at boot.
652 652
653 Only enable when you're ok with these trade offs and really 653 Only enable when you're ok with these trade offs and really
654 sure you need the memory resource controller. Even when you enable 654 sure you need the memory resource controller. Even when you enable
655 this, you can set "cgroup_disable=memory" at your boot option to 655 this, you can set "cgroup_disable=memory" at your boot option to
656 disable memory resource controller and you can avoid overheads. 656 disable memory resource controller and you can avoid overheads.
657 (and lose benefits of memory resource controller) 657 (and lose benefits of memory resource controller)
658 658
659 This config option also selects MM_OWNER config option, which 659 This config option also selects MM_OWNER config option, which
660 could in turn add some fork/exit overhead. 660 could in turn add some fork/exit overhead.
661 661
662 config CGROUP_MEM_RES_CTLR_SWAP 662 config CGROUP_MEM_RES_CTLR_SWAP
663 bool "Memory Resource Controller Swap Extension" 663 bool "Memory Resource Controller Swap Extension"
664 depends on CGROUP_MEM_RES_CTLR && SWAP 664 depends on CGROUP_MEM_RES_CTLR && SWAP
665 help 665 help
666 Add swap management feature to memory resource controller. When you 666 Add swap management feature to memory resource controller. When you
667 enable this, you can limit mem+swap usage per cgroup. In other words, 667 enable this, you can limit mem+swap usage per cgroup. In other words,
668 when you disable this, memory resource controller has no cares to 668 when you disable this, memory resource controller has no cares to
669 usage of swap...a process can exhaust all of the swap. This extension 669 usage of swap...a process can exhaust all of the swap. This extension
670 is useful when you want to avoid exhaustion swap but this itself 670 is useful when you want to avoid exhaustion swap but this itself
671 adds more overheads and consumes memory for remembering information. 671 adds more overheads and consumes memory for remembering information.
672 Especially if you use 32bit system or small memory system, please 672 Especially if you use 32bit system or small memory system, please
673 be careful about enabling this. When memory resource controller 673 be careful about enabling this. When memory resource controller
674 is disabled by boot option, this will be automatically disabled and 674 is disabled by boot option, this will be automatically disabled and
675 there will be no overhead from this. Even when you set this config=y, 675 there will be no overhead from this. Even when you set this config=y,
676 if boot option "noswapaccount" is set, swap will not be accounted. 676 if boot option "noswapaccount" is set, swap will not be accounted.
677 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page 677 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
678 size is 4096bytes, 512k per 1Gbytes of swap. 678 size is 4096bytes, 512k per 1Gbytes of swap.
679 config CGROUP_MEM_RES_CTLR_SWAP_ENABLED 679 config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
680 bool "Memory Resource Controller Swap Extension enabled by default" 680 bool "Memory Resource Controller Swap Extension enabled by default"
681 depends on CGROUP_MEM_RES_CTLR_SWAP 681 depends on CGROUP_MEM_RES_CTLR_SWAP
682 default y 682 default y
683 help 683 help
684 Memory Resource Controller Swap Extension comes with its price in 684 Memory Resource Controller Swap Extension comes with its price in
685 a bigger memory consumption. General purpose distribution kernels 685 a bigger memory consumption. General purpose distribution kernels
686 which want to enable the feature but keep it disabled by default 686 which want to enable the feature but keep it disabled by default
687 and let the user enable it by swapaccount boot command line 687 and let the user enable it by swapaccount boot command line
688 parameter should have this option unselected. 688 parameter should have this option unselected.
689 For those who want to have the feature enabled by default should 689 For those who want to have the feature enabled by default should
690 select this option (if, for some reason, they need to disable it 690 select this option (if, for some reason, they need to disable it
691 then noswapaccount does the trick). 691 then noswapaccount does the trick).
692 692
693 config CGROUP_PERF 693 config CGROUP_PERF
694 bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" 694 bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
695 depends on PERF_EVENTS && CGROUPS 695 depends on PERF_EVENTS && CGROUPS
696 help 696 help
697 This option extends the per-cpu mode to restrict monitoring to 697 This option extends the per-cpu mode to restrict monitoring to
698 threads which belong to the cgroup specified and run on the 698 threads which belong to the cgroup specified and run on the
699 designated cpu. 699 designated cpu.
700 700
701 Say N if unsure. 701 Say N if unsure.
702 702
703 menuconfig CGROUP_SCHED 703 menuconfig CGROUP_SCHED
704 bool "Group CPU scheduler" 704 bool "Group CPU scheduler"
705 depends on EXPERIMENTAL 705 depends on EXPERIMENTAL
706 default n 706 default n
707 help 707 help
708 This feature lets CPU scheduler recognize task groups and control CPU 708 This feature lets CPU scheduler recognize task groups and control CPU
709 bandwidth allocation to such task groups. It uses cgroups to group 709 bandwidth allocation to such task groups. It uses cgroups to group
710 tasks. 710 tasks.
711 711
712 if CGROUP_SCHED 712 if CGROUP_SCHED
713 config FAIR_GROUP_SCHED 713 config FAIR_GROUP_SCHED
714 bool "Group scheduling for SCHED_OTHER" 714 bool "Group scheduling for SCHED_OTHER"
715 depends on CGROUP_SCHED 715 depends on CGROUP_SCHED
716 default CGROUP_SCHED 716 default CGROUP_SCHED
717 717
718 config RT_GROUP_SCHED 718 config RT_GROUP_SCHED
719 bool "Group scheduling for SCHED_RR/FIFO" 719 bool "Group scheduling for SCHED_RR/FIFO"
720 depends on EXPERIMENTAL 720 depends on EXPERIMENTAL
721 depends on CGROUP_SCHED 721 depends on CGROUP_SCHED
722 default n 722 default n
723 help 723 help
724 This feature lets you explicitly allocate real CPU bandwidth 724 This feature lets you explicitly allocate real CPU bandwidth
725 to task groups. If enabled, it will also make it impossible to 725 to task groups. If enabled, it will also make it impossible to
726 schedule realtime tasks for non-root users until you allocate 726 schedule realtime tasks for non-root users until you allocate
727 realtime bandwidth for them. 727 realtime bandwidth for them.
728 See Documentation/scheduler/sched-rt-group.txt for more information. 728 See Documentation/scheduler/sched-rt-group.txt for more information.
729 729
730 endif #CGROUP_SCHED 730 endif #CGROUP_SCHED
731 731
732 config BLK_CGROUP 732 config BLK_CGROUP
733 tristate "Block IO controller" 733 tristate "Block IO controller"
734 depends on BLOCK 734 depends on BLOCK
735 default n 735 default n
736 ---help--- 736 ---help---
737 Generic block IO controller cgroup interface. This is the common 737 Generic block IO controller cgroup interface. This is the common
738 cgroup interface which should be used by various IO controlling 738 cgroup interface which should be used by various IO controlling
739 policies. 739 policies.
740 740
741 Currently, CFQ IO scheduler uses it to recognize task groups and 741 Currently, CFQ IO scheduler uses it to recognize task groups and
742 control disk bandwidth allocation (proportional time slice allocation) 742 control disk bandwidth allocation (proportional time slice allocation)
743 to such task groups. It is also used by bio throttling logic in 743 to such task groups. It is also used by bio throttling logic in
744 block layer to implement upper limit in IO rates on a device. 744 block layer to implement upper limit in IO rates on a device.
745 745
746 This option only enables generic Block IO controller infrastructure. 746 This option only enables generic Block IO controller infrastructure.
747 One needs to also enable actual IO controlling logic/policy. For 747 One needs to also enable actual IO controlling logic/policy. For
748 enabling proportional weight division of disk bandwidth in CFQ, set 748 enabling proportional weight division of disk bandwidth in CFQ, set
749 CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set 749 CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
750 CONFIG_BLK_DEV_THROTTLING=y. 750 CONFIG_BLK_DEV_THROTTLING=y.
751 751
752 See Documentation/cgroups/blkio-controller.txt for more information. 752 See Documentation/cgroups/blkio-controller.txt for more information.
753 753
754 config DEBUG_BLK_CGROUP 754 config DEBUG_BLK_CGROUP
755 bool "Enable Block IO controller debugging" 755 bool "Enable Block IO controller debugging"
756 depends on BLK_CGROUP 756 depends on BLK_CGROUP
757 default n 757 default n
758 ---help--- 758 ---help---
759 Enable some debugging help. Currently it exports additional stat 759 Enable some debugging help. Currently it exports additional stat
760 files in a cgroup which can be useful for debugging. 760 files in a cgroup which can be useful for debugging.
761 761
762 endif # CGROUPS 762 endif # CGROUPS
763 763
764 menuconfig NAMESPACES 764 menuconfig NAMESPACES
765 bool "Namespaces support" if EXPERT 765 bool "Namespaces support" if EXPERT
766 default !EXPERT 766 default !EXPERT
767 help 767 help
768 Provides the way to make tasks work with different objects using 768 Provides the way to make tasks work with different objects using
769 the same id. For example same IPC id may refer to different objects 769 the same id. For example same IPC id may refer to different objects
770 or same user id or pid may refer to different tasks when used in 770 or same user id or pid may refer to different tasks when used in
771 different namespaces. 771 different namespaces.
772 772
773 if NAMESPACES 773 if NAMESPACES
774 774
775 config UTS_NS 775 config UTS_NS
776 bool "UTS namespace" 776 bool "UTS namespace"
777 default y 777 default y
778 help 778 help
779 In this namespace tasks see different info provided with the 779 In this namespace tasks see different info provided with the
780 uname() system call 780 uname() system call
781 781
782 config IPC_NS 782 config IPC_NS
783 bool "IPC namespace" 783 bool "IPC namespace"
784 depends on (SYSVIPC || POSIX_MQUEUE) 784 depends on (SYSVIPC || POSIX_MQUEUE)
785 default y 785 default y
786 help 786 help
787 In this namespace tasks work with IPC ids which correspond to 787 In this namespace tasks work with IPC ids which correspond to
788 different IPC objects in different namespaces. 788 different IPC objects in different namespaces.
789 789
790 config USER_NS 790 config USER_NS
791 bool "User namespace (EXPERIMENTAL)" 791 bool "User namespace (EXPERIMENTAL)"
792 depends on EXPERIMENTAL 792 depends on EXPERIMENTAL
793 default y 793 default y
794 help 794 help
795 This allows containers, i.e. vservers, to use user namespaces 795 This allows containers, i.e. vservers, to use user namespaces
796 to provide different user info for different servers. 796 to provide different user info for different servers.
797 If unsure, say N. 797 If unsure, say N.
798 798
799 config PID_NS 799 config PID_NS
800 bool "PID Namespaces" 800 bool "PID Namespaces"
801 default y 801 default y
802 help 802 help
803 Support process id namespaces. This allows having multiple 803 Support process id namespaces. This allows having multiple
804 processes with the same pid as long as they are in different 804 processes with the same pid as long as they are in different
805 pid namespaces. This is a building block of containers. 805 pid namespaces. This is a building block of containers.
806 806
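A rough illustration of the "same pid in different namespaces" point above (a sketch, not part of this patch; needs CAP_SYS_ADMIN): the first process cloned into a new PID namespace sees itself as pid 1.

/* Hypothetical sketch: clone() with CLONE_NEWPID. */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static char child_stack[64 * 1024];

static int child(void *arg)
{
	/* getpid() reports the pid inside the new namespace */
	printf("child sees pid %d\n", (int)getpid());	/* prints 1 */
	return 0;
}

int main(void)
{
	pid_t pid = clone(child, child_stack + sizeof(child_stack),
			  CLONE_NEWPID | SIGCHLD, NULL);
	if (pid == -1) {
		perror("clone");
		return 1;
	}
	printf("parent sees child as pid %d\n", (int)pid);
	waitpid(pid, NULL, 0);
	return 0;
}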
807 config NET_NS 807 config NET_NS
808 bool "Network namespace" 808 bool "Network namespace"
809 depends on NET 809 depends on NET
810 default y 810 default y
811 help 811 help
812 Allow user space to create what appear to be multiple instances 812 Allow user space to create what appear to be multiple instances
813 of the network stack. 813 of the network stack.
814 814
815 endif # NAMESPACES 815 endif # NAMESPACES
816 816
817 config SCHED_AUTOGROUP 817 config SCHED_AUTOGROUP
818 bool "Automatic process group scheduling" 818 bool "Automatic process group scheduling"
819 select EVENTFD 819 select EVENTFD
820 select CGROUPS 820 select CGROUPS
821 select CGROUP_SCHED 821 select CGROUP_SCHED
822 select FAIR_GROUP_SCHED 822 select FAIR_GROUP_SCHED
823 help 823 help
824 This option optimizes the scheduler for common desktop workloads by 824 This option optimizes the scheduler for common desktop workloads by
825 automatically creating and populating task groups. This separation 825 automatically creating and populating task groups. This separation
826 of workloads isolates aggressive CPU burners (like build jobs) from 826 of workloads isolates aggressive CPU burners (like build jobs) from
827 desktop applications. Task group autogeneration is currently based 827 desktop applications. Task group autogeneration is currently based
828 upon task session. 828 upon task session.
829 829
830 config SCHED_TTWU_QUEUE
831 bool
832 depends on !SPARC32
833 default y
834
835 config MM_OWNER 830 config MM_OWNER
836 bool 831 bool
837 832
838 config SYSFS_DEPRECATED 833 config SYSFS_DEPRECATED
839 bool "Enable deprecated sysfs features to support old userspace tools" 834 bool "Enable deprecated sysfs features to support old userspace tools"
840 depends on SYSFS 835 depends on SYSFS
841 default n 836 default n
842 help 837 help
843 This option adds code that switches the layout of the "block" class 838 This option adds code that switches the layout of the "block" class
844 devices so that they do not show up in /sys/class/block/, but only in 839 devices so that they do not show up in /sys/class/block/, but only in
845 /sys/block/. 840 /sys/block/.
846 841
847 This switch is only active when the sysfs.deprecated=1 boot option is 842 This switch is only active when the sysfs.deprecated=1 boot option is
848 passed or the SYSFS_DEPRECATED_V2 option is set. 843 passed or the SYSFS_DEPRECATED_V2 option is set.
849 844
850 This option allows new kernels to run on old distributions and tools, 845 This option allows new kernels to run on old distributions and tools,
851 which might get confused by /sys/class/block/. Since 2007/2008 all 846 which might get confused by /sys/class/block/. Since 2007/2008 all
852 major distributions and tools handle this just fine. 847 major distributions and tools handle this just fine.
853 848
854 Recent distributions and userspace tools after 2009/2010 depend on 849 Recent distributions and userspace tools after 2009/2010 depend on
855 the existence of /sys/class/block/, and will not work with this 850 the existence of /sys/class/block/, and will not work with this
856 option enabled. 851 option enabled.
857 852
858 Only if you are using a new kernel on an old distribution, you might 853 Only if you are using a new kernel on an old distribution, you might
859 need to say Y here. 854 need to say Y here.
860 855
861 config SYSFS_DEPRECATED_V2 856 config SYSFS_DEPRECATED_V2
862 bool "Enable deprecated sysfs features by default" 857 bool "Enable deprecated sysfs features by default"
863 default n 858 default n
864 depends on SYSFS 859 depends on SYSFS
865 depends on SYSFS_DEPRECATED 860 depends on SYSFS_DEPRECATED
866 help 861 help
867 Enable deprecated sysfs by default. 862 Enable deprecated sysfs by default.
868 863
869 See the CONFIG_SYSFS_DEPRECATED option for more details about this 864 See the CONFIG_SYSFS_DEPRECATED option for more details about this
870 option. 865 option.
871 866
872 Only if you are using a new kernel on an old distribution, you might 867 Only if you are using a new kernel on an old distribution, you might
873 need to say Y here. Even then, odds are you would not need it 868 need to say Y here. Even then, odds are you would not need it
874 enabled; you can always pass the boot option if absolutely necessary. 869 enabled; you can always pass the boot option if absolutely necessary.
875 870
876 config RELAY 871 config RELAY
877 bool "Kernel->user space relay support (formerly relayfs)" 872 bool "Kernel->user space relay support (formerly relayfs)"
878 help 873 help
879 This option enables support for the relay interface in 874 This option enables support for the relay interface in
880 certain file systems (such as debugfs). 875 certain file systems (such as debugfs).
881 It is designed to provide an efficient mechanism for tools and 876 It is designed to provide an efficient mechanism for tools and
882 facilities to relay large amounts of data from kernel space to 877 facilities to relay large amounts of data from kernel space to
883 user space. 878 user space.
884 879
885 If unsure, say N. 880 If unsure, say N.
886 881
887 config BLK_DEV_INITRD 882 config BLK_DEV_INITRD
888 bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support" 883 bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support"
889 depends on BROKEN || !FRV 884 depends on BROKEN || !FRV
890 help 885 help
891 The initial RAM filesystem is a ramfs which is loaded by the 886 The initial RAM filesystem is a ramfs which is loaded by the
892 boot loader (loadlin or lilo) and that is mounted as root 887 boot loader (loadlin or lilo) and that is mounted as root
893 before the normal boot procedure. It is typically used to 888 before the normal boot procedure. It is typically used to
894 load modules needed to mount the "real" root file system, 889 load modules needed to mount the "real" root file system,
895 etc. See <file:Documentation/initrd.txt> for details. 890 etc. See <file:Documentation/initrd.txt> for details.
896 891
897 If RAM disk support (BLK_DEV_RAM) is also included, this 892 If RAM disk support (BLK_DEV_RAM) is also included, this
898 also enables initial RAM disk (initrd) support and adds 893 also enables initial RAM disk (initrd) support and adds
899 15 Kbytes (more on some other architectures) to the kernel size. 894 15 Kbytes (more on some other architectures) to the kernel size.
900 895
901 If unsure say Y. 896 If unsure say Y.
902 897
903 if BLK_DEV_INITRD 898 if BLK_DEV_INITRD
904 899
905 source "usr/Kconfig" 900 source "usr/Kconfig"
906 901
907 endif 902 endif
908 903
909 config CC_OPTIMIZE_FOR_SIZE 904 config CC_OPTIMIZE_FOR_SIZE
910 bool "Optimize for size" 905 bool "Optimize for size"
911 default y 906 default y
912 help 907 help
913 Enabling this option will pass "-Os" instead of "-O2" to gcc 908 Enabling this option will pass "-Os" instead of "-O2" to gcc
914 resulting in a smaller kernel. 909 resulting in a smaller kernel.
915 910
916 If unsure, say Y. 911 If unsure, say Y.
917 912
918 config SYSCTL 913 config SYSCTL
919 bool 914 bool
920 915
921 config ANON_INODES 916 config ANON_INODES
922 bool 917 bool
923 918
924 menuconfig EXPERT 919 menuconfig EXPERT
925 bool "Configure standard kernel features (expert users)" 920 bool "Configure standard kernel features (expert users)"
926 help 921 help
927 This option allows certain base kernel options and settings 922 This option allows certain base kernel options and settings
928 to be disabled or tweaked. This is for specialized 923 to be disabled or tweaked. This is for specialized
929 environments which can tolerate a "non-standard" kernel. 924 environments which can tolerate a "non-standard" kernel.
930 Only use this if you really know what you are doing. 925 Only use this if you really know what you are doing.
931 926
932 config EMBEDDED 927 config EMBEDDED
933 bool "Embedded system" 928 bool "Embedded system"
934 select EXPERT 929 select EXPERT
935 help 930 help
936 This option should be enabled if compiling the kernel for 931 This option should be enabled if compiling the kernel for
937 an embedded system so certain expert options are available 932 an embedded system so certain expert options are available
938 for configuration. 933 for configuration.
939 934
940 config UID16 935 config UID16
941 bool "Enable 16-bit UID system calls" if EXPERT 936 bool "Enable 16-bit UID system calls" if EXPERT
942 depends on ARM || BLACKFIN || CRIS || FRV || H8300 || X86_32 || M68K || (S390 && !64BIT) || SUPERH || SPARC32 || (SPARC64 && COMPAT) || UML || (X86_64 && IA32_EMULATION) 937 depends on ARM || BLACKFIN || CRIS || FRV || H8300 || X86_32 || M68K || (S390 && !64BIT) || SUPERH || SPARC32 || (SPARC64 && COMPAT) || UML || (X86_64 && IA32_EMULATION)
943 default y 938 default y
944 help 939 help
945 This enables the legacy 16-bit UID syscall wrappers. 940 This enables the legacy 16-bit UID syscall wrappers.
946 941
947 config SYSCTL_SYSCALL 942 config SYSCTL_SYSCALL
948 bool "Sysctl syscall support" if EXPERT 943 bool "Sysctl syscall support" if EXPERT
949 depends on PROC_SYSCTL 944 depends on PROC_SYSCTL
950 default y 945 default y
951 select SYSCTL 946 select SYSCTL
952 ---help--- 947 ---help---
953 sys_sysctl uses binary paths that have been found challenging 948 sys_sysctl uses binary paths that have been found challenging
954 to properly maintain and use. The interface in /proc/sys 949 to properly maintain and use. The interface in /proc/sys
955 using paths with ASCII names is now the primary path to this 950 using paths with ASCII names is now the primary path to this
956 information. 951 information.
957 952
958 Almost nothing uses the binary sysctl interface, so if you are 953 Almost nothing uses the binary sysctl interface, so if you are
959 trying to save some space it is probably safe to disable this, 954 trying to save some space it is probably safe to disable this,
960 making your kernel marginally smaller. 955 making your kernel marginally smaller.
961 956
962 If unsure say Y here. 957 If unsure say Y here.
963 958
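For reference, a small sketch of the /proc/sys path the help text points to (not part of this change; kernel.ostype is just a convenient read-only entry):

/* Reading a sysctl the recommended way: through /proc/sys, not sys_sysctl(). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[128];
	ssize_t n;
	int fd = open("/proc/sys/kernel/ostype", O_RDONLY);

	if (fd == -1) {
		perror("open");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("kernel.ostype = %s", buf);	/* e.g. "Linux\n" */
	}
	close(fd);
	return 0;
}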
964 config KALLSYMS 959 config KALLSYMS
965 bool "Load all symbols for debugging/ksymoops" if EXPERT 960 bool "Load all symbols for debugging/ksymoops" if EXPERT
966 default y 961 default y
967 help 962 help
968 Say Y here to let the kernel print out symbolic crash information and 963 Say Y here to let the kernel print out symbolic crash information and
969 symbolic stack backtraces. This increases the size of the kernel 964 symbolic stack backtraces. This increases the size of the kernel
970 somewhat, as all symbols have to be loaded into the kernel image. 965 somewhat, as all symbols have to be loaded into the kernel image.
971 966
972 config KALLSYMS_ALL 967 config KALLSYMS_ALL
973 bool "Include all symbols in kallsyms" 968 bool "Include all symbols in kallsyms"
974 depends on DEBUG_KERNEL && KALLSYMS 969 depends on DEBUG_KERNEL && KALLSYMS
975 help 970 help
976 Normally kallsyms only contains the symbols of functions, for nicer 971 Normally kallsyms only contains the symbols of functions, for nicer
977 OOPS messages. Some debuggers can use kallsyms for other 972 OOPS messages. Some debuggers can use kallsyms for other
978 symbols too: say Y here to include all symbols, if you need them 973 symbols too: say Y here to include all symbols, if you need them
979 and you don't care about adding 300k to the size of your kernel. 974 and you don't care about adding 300k to the size of your kernel.
980 975
981 Say N. 976 Say N.
982 977
983 config KALLSYMS_EXTRA_PASS 978 config KALLSYMS_EXTRA_PASS
984 bool "Do an extra kallsyms pass" 979 bool "Do an extra kallsyms pass"
985 depends on KALLSYMS 980 depends on KALLSYMS
986 help 981 help
987 If kallsyms is not working correctly, the build will fail with 982 If kallsyms is not working correctly, the build will fail with
988 inconsistent kallsyms data. If that occurs, log a bug report and 983 inconsistent kallsyms data. If that occurs, log a bug report and
989 turn on KALLSYMS_EXTRA_PASS which should result in a stable build. 984 turn on KALLSYMS_EXTRA_PASS which should result in a stable build.
990 Always say N here unless you find a bug in kallsyms, which must be 985 Always say N here unless you find a bug in kallsyms, which must be
991 reported. KALLSYMS_EXTRA_PASS is only a temporary workaround while 986 reported. KALLSYMS_EXTRA_PASS is only a temporary workaround while
992 you wait for kallsyms to be fixed. 987 you wait for kallsyms to be fixed.
993 988
994 989
995 config HOTPLUG 990 config HOTPLUG
996 bool "Support for hot-pluggable devices" if EXPERT 991 bool "Support for hot-pluggable devices" if EXPERT
997 default y 992 default y
998 help 993 help
999 This option is provided for the case where no hotplug or uevent 994 This option is provided for the case where no hotplug or uevent
1000 capabilities are wanted by the kernel. You should only consider 995 capabilities are wanted by the kernel. You should only consider
1001 disabling this option for embedded systems that do not use modules, a 996 disabling this option for embedded systems that do not use modules, a
1002 dynamic /dev tree, or dynamic device discovery. Just say Y. 997 dynamic /dev tree, or dynamic device discovery. Just say Y.
1003 998
1004 config PRINTK 999 config PRINTK
1005 default y 1000 default y
1006 bool "Enable support for printk" if EXPERT 1001 bool "Enable support for printk" if EXPERT
1007 help 1002 help
1008 This option enables normal printk support. Removing it 1003 This option enables normal printk support. Removing it
1009 eliminates most of the message strings from the kernel image 1004 eliminates most of the message strings from the kernel image
1010 and makes the kernel more or less silent. As this makes it 1005 and makes the kernel more or less silent. As this makes it
1011 very difficult to diagnose system problems, saying N here is 1006 very difficult to diagnose system problems, saying N here is
1012 strongly discouraged. 1007 strongly discouraged.
1013 1008
1014 config BUG 1009 config BUG
1015 bool "BUG() support" if EXPERT 1010 bool "BUG() support" if EXPERT
1016 default y 1011 default y
1017 help 1012 help
1018 Disabling this option eliminates support for BUG and WARN, reducing 1013 Disabling this option eliminates support for BUG and WARN, reducing
1019 the size of your kernel image and potentially quietly ignoring 1014 the size of your kernel image and potentially quietly ignoring
1020 numerous fatal conditions. You should only consider disabling this 1015 numerous fatal conditions. You should only consider disabling this
1021 option for embedded systems with no facilities for reporting errors. 1016 option for embedded systems with no facilities for reporting errors.
1022 Just say Y. 1017 Just say Y.
1023 1018
1024 config ELF_CORE 1019 config ELF_CORE
1025 default y 1020 default y
1026 bool "Enable ELF core dumps" if EXPERT 1021 bool "Enable ELF core dumps" if EXPERT
1027 help 1022 help
1028 Enable support for generating core dumps. Disabling saves about 4k. 1023 Enable support for generating core dumps. Disabling saves about 4k.
1029 1024
1030 config PCSPKR_PLATFORM 1025 config PCSPKR_PLATFORM
1031 bool "Enable PC-Speaker support" if EXPERT 1026 bool "Enable PC-Speaker support" if EXPERT
1032 depends on ALPHA || X86 || MIPS || PPC_PREP || PPC_CHRP || PPC_PSERIES 1027 depends on ALPHA || X86 || MIPS || PPC_PREP || PPC_CHRP || PPC_PSERIES
1033 default y 1028 default y
1034 help 1029 help
1035 This option allows disabling the internal PC-Speaker 1030 This option allows disabling the internal PC-Speaker
1036 support, saving some memory. 1031 support, saving some memory.
1037 1032
1038 config BASE_FULL 1033 config BASE_FULL
1039 default y 1034 default y
1040 bool "Enable full-sized data structures for core" if EXPERT 1035 bool "Enable full-sized data structures for core" if EXPERT
1041 help 1036 help
1042 Disabling this option reduces the size of miscellaneous core 1037 Disabling this option reduces the size of miscellaneous core
1043 kernel data structures. This saves memory on small machines, 1038 kernel data structures. This saves memory on small machines,
1044 but may reduce performance. 1039 but may reduce performance.
1045 1040
1046 config FUTEX 1041 config FUTEX
1047 bool "Enable futex support" if EXPERT 1042 bool "Enable futex support" if EXPERT
1048 default y 1043 default y
1049 select RT_MUTEXES 1044 select RT_MUTEXES
1050 help 1045 help
1051 Disabling this option will cause the kernel to be built without 1046 Disabling this option will cause the kernel to be built without
1052 support for "fast userspace mutexes". The resulting kernel may not 1047 support for "fast userspace mutexes". The resulting kernel may not
1053 run glibc-based applications correctly. 1048 run glibc-based applications correctly.
1054 1049
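A minimal sketch of the futex(2) syscall this option provides (not part of this change; glibc has no wrapper, so the raw syscall is used, and the wait is made non-blocking by passing a stale expected value):

/* FUTEX_WAIT returns EAGAIN here because the futex word already differs
 * from the expected value, so the call does not sleep. */
#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t futex_word = 1;

int main(void)
{
	long r;

	/* Expecting 0, but the word is 1 -> kernel returns EAGAIN. */
	r = syscall(SYS_futex, &futex_word, FUTEX_WAIT, 0, NULL, NULL, 0);
	if (r == -1)
		printf("FUTEX_WAIT: %s\n", strerror(errno));

	/* Wake up to one waiter (there are none, so this returns 0). */
	r = syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
	printf("FUTEX_WAKE woke %ld waiter(s)\n", r);
	return 0;
}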
1055 config EPOLL 1050 config EPOLL
1056 bool "Enable eventpoll support" if EXPERT 1051 bool "Enable eventpoll support" if EXPERT
1057 default y 1052 default y
1058 select ANON_INODES 1053 select ANON_INODES
1059 help 1054 help
1060 Disabling this option will cause the kernel to be built without 1055 Disabling this option will cause the kernel to be built without
1061 support for the epoll family of system calls. 1056 support for the epoll family of system calls.
1062 1057
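For reference, a small sketch of the epoll interface (not part of this change):

/* Wait for stdin to become readable using epoll. */
#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = STDIN_FILENO };
	struct epoll_event out;
	int epfd = epoll_create1(0);

	if (epfd == -1 ||
	    epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) == -1) {
		perror("epoll setup");
		return 1;
	}
	/* Block until stdin has data (or an error/EOF condition). */
	if (epoll_wait(epfd, &out, 1, -1) == 1)
		printf("fd %d is readable\n", out.data.fd);
	close(epfd);
	return 0;
}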
1063 config SIGNALFD 1058 config SIGNALFD
1064 bool "Enable signalfd() system call" if EXPERT 1059 bool "Enable signalfd() system call" if EXPERT
1065 select ANON_INODES 1060 select ANON_INODES
1066 default y 1061 default y
1067 help 1062 help
1068 Enable the signalfd() system call, which allows signals to be received 1063 Enable the signalfd() system call, which allows signals to be received
1069 on a file descriptor. 1064 on a file descriptor.
1070 1065
1071 If unsure, say Y. 1066 If unsure, say Y.
1072 1067
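A short sketch of the signalfd() usage pattern described above (not part of this change; the signal must be blocked before it can be read from the descriptor):

/* Receive SIGINT as data on a file descriptor instead of via a handler. */
#include <signal.h>
#include <stdio.h>
#include <sys/signalfd.h>
#include <unistd.h>

int main(void)
{
	sigset_t mask;
	struct signalfd_siginfo si;
	int sfd;

	sigemptyset(&mask);
	sigaddset(&mask, SIGINT);
	sigprocmask(SIG_BLOCK, &mask, NULL);	/* block normal delivery first */

	sfd = signalfd(-1, &mask, 0);
	if (sfd == -1) {
		perror("signalfd");
		return 1;
	}
	printf("press Ctrl-C...\n");
	if (read(sfd, &si, sizeof(si)) == sizeof(si))
		printf("got signal %u\n", si.ssi_signo);
	close(sfd);
	return 0;
}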
1073 config TIMERFD 1068 config TIMERFD
1074 bool "Enable timerfd() system call" if EXPERT 1069 bool "Enable timerfd() system call" if EXPERT
1075 select ANON_INODES 1070 select ANON_INODES
1076 default y 1071 default y
1077 help 1072 help
1078 Enable the timerfd() system call, which allows timer 1073 Enable the timerfd() system call, which allows timer
1079 events to be received on a file descriptor. 1074 events to be received on a file descriptor.
1080 1075
1081 If unsure, say Y. 1076 If unsure, say Y.
1082 1077
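A short sketch of the timerfd interface (not part of this change): expirations arrive as an 8-byte count read from the descriptor.

/* One-shot 1-second timer delivered through a file descriptor. */
#include <stdint.h>
#include <stdio.h>
#include <sys/timerfd.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct itimerspec its = {
		.it_value    = { .tv_sec = 1, .tv_nsec = 0 },	/* first expiry */
		.it_interval = { .tv_sec = 0, .tv_nsec = 0 },	/* one-shot */
	};
	uint64_t expirations;
	int tfd = timerfd_create(CLOCK_MONOTONIC, 0);

	if (tfd == -1 || timerfd_settime(tfd, 0, &its, NULL) == -1) {
		perror("timerfd");
		return 1;
	}
	if (read(tfd, &expirations, sizeof(expirations)) == sizeof(expirations))
		printf("timer fired %llu time(s)\n",
		       (unsigned long long)expirations);
	close(tfd);
	return 0;
}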
1083 config EVENTFD 1078 config EVENTFD
1084 bool "Enable eventfd() system call" if EXPERT 1079 bool "Enable eventfd() system call" if EXPERT
1085 select ANON_INODES 1080 select ANON_INODES
1086 default y 1081 default y
1087 help 1082 help
1088 Enable the eventfd() system call, which allows both kernel 1083 Enable the eventfd() system call, which allows both kernel
1089 notifications (e.g. KAIO) and userspace notifications to be received. 1084 notifications (e.g. KAIO) and userspace notifications to be received.
1090 1085
1091 If unsure, say Y. 1086 If unsure, say Y.
1092 1087
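For reference, a minimal sketch of the eventfd counter semantics (not part of this change):

/* eventfd keeps an 8-byte kernel-side counter: writes add to it,
 * a read returns the accumulated value and resets it. */
#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	uint64_t value;
	int efd = eventfd(0, 0);		/* counter starts at 0 */

	if (efd == -1) {
		perror("eventfd");
		return 1;
	}
	value = 3;
	write(efd, &value, sizeof(value));	/* add 3 to the counter */
	value = 2;
	write(efd, &value, sizeof(value));	/* add 2 more */

	read(efd, &value, sizeof(value));	/* reads 5 and resets to 0 */
	printf("eventfd counter read back: %llu\n", (unsigned long long)value);
	close(efd);
	return 0;
}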
1093 config SHMEM 1088 config SHMEM
1094 bool "Use full shmem filesystem" if EXPERT 1089 bool "Use full shmem filesystem" if EXPERT
1095 default y 1090 default y
1096 depends on MMU 1091 depends on MMU
1097 help 1092 help
1098 The shmem is an internal filesystem used to manage shared memory. 1093 The shmem is an internal filesystem used to manage shared memory.
1099 It is backed by swap and manages resource limits. It is also exported 1094 It is backed by swap and manages resource limits. It is also exported
1100 to userspace as tmpfs if TMPFS is enabled. Disabling this 1095 to userspace as tmpfs if TMPFS is enabled. Disabling this
1101 option replaces shmem and tmpfs with the much simpler ramfs code, 1096 option replaces shmem and tmpfs with the much simpler ramfs code,
1102 which may be appropriate on small systems without swap. 1097 which may be appropriate on small systems without swap.
1103 1098
1104 config AIO 1099 config AIO
1105 bool "Enable AIO support" if EXPERT 1100 bool "Enable AIO support" if EXPERT
1106 default y 1101 default y
1107 help 1102 help
1108 This option enables POSIX asynchronous I/O which may be used 1103 This option enables POSIX asynchronous I/O which may be used
1109 by some high performance threaded applications. Disabling 1104 by some high performance threaded applications. Disabling
1110 this option saves about 7k. 1105 this option saves about 7k.
1111 1106
1112 config HAVE_PERF_EVENTS 1107 config HAVE_PERF_EVENTS
1113 bool 1108 bool
1114 help 1109 help
1115 See tools/perf/design.txt for details. 1110 See tools/perf/design.txt for details.
1116 1111
1117 config PERF_USE_VMALLOC 1112 config PERF_USE_VMALLOC
1118 bool 1113 bool
1119 help 1114 help
1120 See tools/perf/design.txt for details 1115 See tools/perf/design.txt for details
1121 1116
1122 menu "Kernel Performance Events And Counters" 1117 menu "Kernel Performance Events And Counters"
1123 1118
1124 config PERF_EVENTS 1119 config PERF_EVENTS
1125 bool "Kernel performance events and counters" 1120 bool "Kernel performance events and counters"
1126 default y if (PROFILING || PERF_COUNTERS) 1121 default y if (PROFILING || PERF_COUNTERS)
1127 depends on HAVE_PERF_EVENTS 1122 depends on HAVE_PERF_EVENTS
1128 select ANON_INODES 1123 select ANON_INODES
1129 select IRQ_WORK 1124 select IRQ_WORK
1130 help 1125 help
1131 Enable kernel support for various performance events provided 1126 Enable kernel support for various performance events provided
1132 by software and hardware. 1127 by software and hardware.
1133 1128
1134 Software events are supported either built-in or via the 1129 Software events are supported either built-in or via the
1135 use of generic tracepoints. 1130 use of generic tracepoints.
1136 1131
1137 Most modern CPUs support performance events via performance 1132 Most modern CPUs support performance events via performance
1138 counter registers. These registers count the number of certain 1133 counter registers. These registers count the number of certain
1139 types of hw events, such as instructions executed, cache misses 1134 types of hw events, such as instructions executed, cache misses
1140 suffered, or branches mis-predicted - without slowing down the 1135 suffered, or branches mis-predicted - without slowing down the
1141 kernel or applications. These registers can also trigger interrupts 1136 kernel or applications. These registers can also trigger interrupts
1142 when a threshold number of events have passed - and can thus be 1137 when a threshold number of events have passed - and can thus be
1143 used to profile the code that runs on that CPU. 1138 used to profile the code that runs on that CPU.
1144 1139
1145 The Linux Performance Event subsystem provides an abstraction of 1140 The Linux Performance Event subsystem provides an abstraction of
1146 these software and hardware event capabilities, available via a 1141 these software and hardware event capabilities, available via a
1147 system call and used by the "perf" utility in tools/perf/. It 1142 system call and used by the "perf" utility in tools/perf/. It
1148 provides per task and per CPU counters, and it provides event 1143 provides per task and per CPU counters, and it provides event
1149 capabilities on top of those. 1144 capabilities on top of those.
1150 1145
1151 Say Y if unsure. 1146 Say Y if unsure.
1152 1147
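A rough sketch of the system call interface mentioned above (not part of this change; perf_event_open() has no glibc wrapper, so the raw syscall is used, and the hardware counter may be unavailable in VMs):

/* Count instructions retired by the current task over a short busy loop. */
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	volatile long sink = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd == -1) {
		perror("perf_event_open");
		return 1;
	}
	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	for (long i = 0; i < 1000000; i++)
		sink += i;
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions: %lld\n", count);
	close(fd);
	return 0;
}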
1153 config PERF_COUNTERS 1148 config PERF_COUNTERS
1154 bool "Kernel performance counters (old config option)" 1149 bool "Kernel performance counters (old config option)"
1155 depends on HAVE_PERF_EVENTS 1150 depends on HAVE_PERF_EVENTS
1156 help 1151 help
1157 This config has been obsoleted by the PERF_EVENTS 1152 This config has been obsoleted by the PERF_EVENTS
1158 config option - please see that one for details. 1153 config option - please see that one for details.
1159 1154
1160 It has no effect on the kernel whether you enable 1155 It has no effect on the kernel whether you enable
1161 it or not, it is a compatibility placeholder. 1156 it or not, it is a compatibility placeholder.
1162 1157
1163 Say N if unsure. 1158 Say N if unsure.
1164 1159
1165 config DEBUG_PERF_USE_VMALLOC 1160 config DEBUG_PERF_USE_VMALLOC
1166 default n 1161 default n
1167 bool "Debug: use vmalloc to back perf mmap() buffers" 1162 bool "Debug: use vmalloc to back perf mmap() buffers"
1168 depends on PERF_EVENTS && DEBUG_KERNEL 1163 depends on PERF_EVENTS && DEBUG_KERNEL
1169 select PERF_USE_VMALLOC 1164 select PERF_USE_VMALLOC
1170 help 1165 help
1171 Use vmalloc memory to back perf mmap() buffers. 1166 Use vmalloc memory to back perf mmap() buffers.
1172 1167
1173 Mostly useful for debugging the vmalloc code on platforms 1168 Mostly useful for debugging the vmalloc code on platforms
1174 that don't require it. 1169 that don't require it.
1175 1170
1176 Say N if unsure. 1171 Say N if unsure.
1177 1172
1178 endmenu 1173 endmenu
1179 1174
1180 config VM_EVENT_COUNTERS 1175 config VM_EVENT_COUNTERS
1181 default y 1176 default y
1182 bool "Enable VM event counters for /proc/vmstat" if EXPERT 1177 bool "Enable VM event counters for /proc/vmstat" if EXPERT
1183 help 1178 help
1184 VM event counters are needed for event counts to be shown. 1179 VM event counters are needed for event counts to be shown.
1185 This option allows the disabling of the VM event counters 1180 This option allows the disabling of the VM event counters
1186 on EXPERT systems. /proc/vmstat will only show page counts 1181 on EXPERT systems. /proc/vmstat will only show page counts
1187 if VM event counters are disabled. 1182 if VM event counters are disabled.
1188 1183
1189 config PCI_QUIRKS 1184 config PCI_QUIRKS
1190 default y 1185 default y
1191 bool "Enable PCI quirk workarounds" if EXPERT 1186 bool "Enable PCI quirk workarounds" if EXPERT
1192 depends on PCI 1187 depends on PCI
1193 help 1188 help
1194 This enables workarounds for various PCI chipset 1189 This enables workarounds for various PCI chipset
1195 bugs/quirks. Disable this only if your target machine is 1190 bugs/quirks. Disable this only if your target machine is
1196 unaffected by PCI quirks. 1191 unaffected by PCI quirks.
1197 1192
1198 config SLUB_DEBUG 1193 config SLUB_DEBUG
1199 default y 1194 default y
1200 bool "Enable SLUB debugging support" if EXPERT 1195 bool "Enable SLUB debugging support" if EXPERT
1201 depends on SLUB && SYSFS 1196 depends on SLUB && SYSFS
1202 help 1197 help
1203 SLUB has extensive debug support features. Disabling these can 1198 SLUB has extensive debug support features. Disabling these can
1204 result in significant savings in code size. This also disables 1199 result in significant savings in code size. This also disables
1205 SLUB sysfs support. /sys/slab will not exist and there will be 1200 SLUB sysfs support. /sys/slab will not exist and there will be
1206 no support for cache validation etc. 1201 no support for cache validation etc.
1207 1202
1208 config COMPAT_BRK 1203 config COMPAT_BRK
1209 bool "Disable heap randomization" 1204 bool "Disable heap randomization"
1210 default y 1205 default y
1211 help 1206 help
1212 Randomizing heap placement makes heap exploits harder, but it 1207 Randomizing heap placement makes heap exploits harder, but it
1213 also breaks ancient binaries (including anything libc5 based). 1208 also breaks ancient binaries (including anything libc5 based).
1214 This option changes the bootup default to heap randomization 1209 This option changes the bootup default to heap randomization
1215 disabled, and can be overridden at runtime by setting 1210 disabled, and can be overridden at runtime by setting
1216 /proc/sys/kernel/randomize_va_space to 2. 1211 /proc/sys/kernel/randomize_va_space to 2.
1217 1212
1218 On non-ancient distros (post-2000 ones) N is usually a safe choice. 1213 On non-ancient distros (post-2000 ones) N is usually a safe choice.
1219 1214
1220 choice 1215 choice
1221 prompt "Choose SLAB allocator" 1216 prompt "Choose SLAB allocator"
1222 default SLUB 1217 default SLUB
1223 help 1218 help
1224 This option allows selecting a slab allocator. 1219 This option allows selecting a slab allocator.
1225 1220
1226 config SLAB 1221 config SLAB
1227 bool "SLAB" 1222 bool "SLAB"
1228 help 1223 help
1229 The regular slab allocator that is established and known to work 1224 The regular slab allocator that is established and known to work
1230 well in all environments. It organizes cache hot objects in 1225 well in all environments. It organizes cache hot objects in
1231 per cpu and per node queues. 1226 per cpu and per node queues.
1232 1227
1233 config SLUB 1228 config SLUB
1234 bool "SLUB (Unqueued Allocator)" 1229 bool "SLUB (Unqueued Allocator)"
1235 help 1230 help
1236 SLUB is a slab allocator that minimizes cache line usage 1231 SLUB is a slab allocator that minimizes cache line usage
1237 instead of managing queues of cached objects (SLAB approach). 1232 instead of managing queues of cached objects (SLAB approach).
1238 Per cpu caching is realized using slabs of objects instead 1233 Per cpu caching is realized using slabs of objects instead
1239 of queues of objects. SLUB can use memory efficiently 1234 of queues of objects. SLUB can use memory efficiently
1240 and has enhanced diagnostics. SLUB is the default choice for 1235 and has enhanced diagnostics. SLUB is the default choice for
1241 a slab allocator. 1236 a slab allocator.
1242 1237
1243 config SLOB 1238 config SLOB
1244 depends on EXPERT 1239 depends on EXPERT
1245 bool "SLOB (Simple Allocator)" 1240 bool "SLOB (Simple Allocator)"
1246 help 1241 help
1247 SLOB replaces the stock allocator with a drastically simpler 1242 SLOB replaces the stock allocator with a drastically simpler
1248 allocator. SLOB is generally more space efficient but 1243 allocator. SLOB is generally more space efficient but
1249 does not perform as well on large systems. 1244 does not perform as well on large systems.
1250 1245
1251 endchoice 1246 endchoice
1252 1247
1253 config MMAP_ALLOW_UNINITIALIZED 1248 config MMAP_ALLOW_UNINITIALIZED
1254 bool "Allow mmapped anonymous memory to be uninitialized" 1249 bool "Allow mmapped anonymous memory to be uninitialized"
1255 depends on EXPERT && !MMU 1250 depends on EXPERT && !MMU
1256 default n 1251 default n
1257 help 1252 help
1258 Normally, and according to the Linux spec, anonymous memory obtained 1253 Normally, and according to the Linux spec, anonymous memory obtained
1259 from mmap() has its contents cleared before it is passed to 1254 from mmap() has its contents cleared before it is passed to
1260 userspace. Enabling this config option allows you to request that 1255 userspace. Enabling this config option allows you to request that
1261 mmap() skip that if it is given the MAP_UNINITIALIZED flag, thus 1256 mmap() skip that if it is given the MAP_UNINITIALIZED flag, thus
1262 providing a huge performance boost. If this option is not enabled, 1257 providing a huge performance boost. If this option is not enabled,
1263 then the flag will be ignored. 1258 then the flag will be ignored.
1264 1259
1265 This is taken advantage of by uClibc's malloc(), and also by 1260 This is taken advantage of by uClibc's malloc(), and also by
1266 ELF-FDPIC binfmt's brk and stack allocator. 1261 ELF-FDPIC binfmt's brk and stack allocator.
1267 1262
1268 Because of the obvious security issues, this option should only be 1263 Because of the obvious security issues, this option should only be
1269 enabled on embedded devices where you control what is run in 1264 enabled on embedded devices where you control what is run in
1270 userspace. Since that isn't generally a problem on no-MMU systems, 1265 userspace. Since that isn't generally a problem on no-MMU systems,
1271 it is normally safe to say Y here. 1266 it is normally safe to say Y here.
1272 1267
1273 See Documentation/nommu-mmap.txt for more information. 1268 See Documentation/nommu-mmap.txt for more information.
1274 1269
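For reference, a sketch of how userspace asks for uninitialized anonymous memory (not part of this change; the flag is only honoured on no-MMU kernels built with this option, and the fallback #define value is taken from the kernel's uapi headers):

/* On an ordinary MMU kernel the flag is ignored and pages arrive zeroed. */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0x4000000	/* value from the kernel headers */
#endif

int main(void)
{
	size_t len = 64 * 1024;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Prints 0 on MMU systems; may be stale data on no-MMU with the
	 * option enabled. */
	printf("first byte: %d\n", p[0]);
	munmap(p, len);
	return 0;
}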
1275 config PROFILING 1270 config PROFILING
1276 bool "Profiling support" 1271 bool "Profiling support"
1277 help 1272 help
1278 Say Y here to enable the extended profiling support mechanisms used 1273 Say Y here to enable the extended profiling support mechanisms used
1279 by profilers such as OProfile. 1274 by profilers such as OProfile.
1280 1275
1281 # 1276 #
1282 # Place an empty function call at each tracepoint site. Can be 1277 # Place an empty function call at each tracepoint site. Can be
1283 # dynamically changed for a probe function. 1278 # dynamically changed for a probe function.
1284 # 1279 #
1285 config TRACEPOINTS 1280 config TRACEPOINTS
1286 bool 1281 bool
1287 1282
1288 source "arch/Kconfig" 1283 source "arch/Kconfig"
1289 1284
1290 endmenu # General setup 1285 endmenu # General setup
1291 1286
1292 config HAVE_GENERIC_DMA_COHERENT 1287 config HAVE_GENERIC_DMA_COHERENT
1293 bool 1288 bool
1294 default n 1289 default n
1295 1290
1296 config SLABINFO 1291 config SLABINFO
1297 bool 1292 bool
1298 depends on PROC_FS 1293 depends on PROC_FS
1299 depends on SLAB || SLUB_DEBUG 1294 depends on SLAB || SLUB_DEBUG
1300 default y 1295 default y
1301 1296
1302 config RT_MUTEXES 1297 config RT_MUTEXES
1303 boolean 1298 boolean
1304 1299
1305 config BASE_SMALL 1300 config BASE_SMALL
1306 int 1301 int
1307 default 0 if BASE_FULL 1302 default 0 if BASE_FULL
1308 default 1 if !BASE_FULL 1303 default 1 if !BASE_FULL
1309 1304
1310 menuconfig MODULES 1305 menuconfig MODULES
1311 bool "Enable loadable module support" 1306 bool "Enable loadable module support"
1312 help 1307 help
1313 Kernel modules are small pieces of compiled code which can 1308 Kernel modules are small pieces of compiled code which can
1314 be inserted in the running kernel, rather than being 1309 be inserted in the running kernel, rather than being
1315 permanently built into the kernel. You use the "modprobe" 1310 permanently built into the kernel. You use the "modprobe"
1316 tool to add (and sometimes remove) them. If you say Y here, 1311 tool to add (and sometimes remove) them. If you say Y here,
1317 many parts of the kernel can be built as modules (by 1312 many parts of the kernel can be built as modules (by
1318 answering M instead of Y where indicated): this is most 1313 answering M instead of Y where indicated): this is most
1319 useful for infrequently used options which are not required 1314 useful for infrequently used options which are not required
1320 for booting. For more information, see the man pages for 1315 for booting. For more information, see the man pages for
1321 modprobe, lsmod, modinfo, insmod and rmmod. 1316 modprobe, lsmod, modinfo, insmod and rmmod.
1322 1317
1323 If you say Y here, you will need to run "make 1318 If you say Y here, you will need to run "make
1324 modules_install" to put the modules under /lib/modules/ 1319 modules_install" to put the modules under /lib/modules/
1325 where modprobe can find them (you may need to be root to do 1320 where modprobe can find them (you may need to be root to do
1326 this). 1321 this).
1327 1322
1328 If unsure, say Y. 1323 If unsure, say Y.
1329 1324
1330 if MODULES 1325 if MODULES
1331 1326
1332 config MODULE_FORCE_LOAD 1327 config MODULE_FORCE_LOAD
1333 bool "Forced module loading" 1328 bool "Forced module loading"
1334 default n 1329 default n
1335 help 1330 help
1336 Allow loading of modules without version information (ie. modprobe 1331 Allow loading of modules without version information (ie. modprobe
1337 --force). Forced module loading sets the 'F' (forced) taint flag and 1332 --force). Forced module loading sets the 'F' (forced) taint flag and
1338 is usually a really bad idea. 1333 is usually a really bad idea.
1339 1334
1340 config MODULE_UNLOAD 1335 config MODULE_UNLOAD
1341 bool "Module unloading" 1336 bool "Module unloading"
1342 help 1337 help
1343 Without this option you will not be able to unload any 1338 Without this option you will not be able to unload any
1344 modules (note that some modules may not be unloadable 1339 modules (note that some modules may not be unloadable
1345 anyway), which makes your kernel smaller, faster 1340 anyway), which makes your kernel smaller, faster
1346 and simpler. If unsure, say Y. 1341 and simpler. If unsure, say Y.
1347 1342
1348 config MODULE_FORCE_UNLOAD 1343 config MODULE_FORCE_UNLOAD
1349 bool "Forced module unloading" 1344 bool "Forced module unloading"
1350 depends on MODULE_UNLOAD && EXPERIMENTAL 1345 depends on MODULE_UNLOAD && EXPERIMENTAL
1351 help 1346 help
1352 This option allows you to force a module to unload, even if the 1347 This option allows you to force a module to unload, even if the
1353 kernel believes it is unsafe: the kernel will remove the module 1348 kernel believes it is unsafe: the kernel will remove the module
1354 without waiting for anyone to stop using it (using the -f option to 1349 without waiting for anyone to stop using it (using the -f option to
1355 rmmod). This is mainly for kernel developers and desperate users. 1350 rmmod). This is mainly for kernel developers and desperate users.
1356 If unsure, say N. 1351 If unsure, say N.
1357 1352
1358 config MODVERSIONS 1353 config MODVERSIONS
1359 bool "Module versioning support" 1354 bool "Module versioning support"
1360 help 1355 help
1361 Usually, you have to use modules compiled with your kernel. 1356 Usually, you have to use modules compiled with your kernel.
1362 Saying Y here makes it sometimes possible to use modules 1357 Saying Y here makes it sometimes possible to use modules
1363 compiled for different kernels, by adding enough information 1358 compiled for different kernels, by adding enough information
1364 to the modules to (hopefully) spot any changes which would 1359 to the modules to (hopefully) spot any changes which would
1365 make them incompatible with the kernel you are running. If 1360 make them incompatible with the kernel you are running. If
1366 unsure, say N. 1361 unsure, say N.
1367 1362
1368 config MODULE_SRCVERSION_ALL 1363 config MODULE_SRCVERSION_ALL
1369 bool "Source checksum for all modules" 1364 bool "Source checksum for all modules"
1370 help 1365 help
1371 Modules which contain a MODULE_VERSION get an extra "srcversion" 1366 Modules which contain a MODULE_VERSION get an extra "srcversion"
1372 field inserted into their modinfo section, which contains a 1367 field inserted into their modinfo section, which contains a
1373 sum of the source files which made it. This helps maintainers 1368 sum of the source files which made it. This helps maintainers
1374 see exactly which source was used to build a module (since 1369 see exactly which source was used to build a module (since
1375 others sometimes change the module source without updating 1370 others sometimes change the module source without updating
1376 the version). With this option, such a "srcversion" field 1371 the version). With this option, such a "srcversion" field
1377 will be created for all modules. If unsure, say N. 1372 will be created for all modules. If unsure, say N.
1378 1373
1379 endif # MODULES 1374 endif # MODULES
1380 1375
1381 config INIT_ALL_POSSIBLE 1376 config INIT_ALL_POSSIBLE
1382 bool 1377 bool
1383 help 1378 help
1384 Back when each arch used to define their own cpu_online_map and 1379 Back when each arch used to define their own cpu_online_map and
1385 cpu_possible_map, some of them chose to initialize cpu_possible_map 1380 cpu_possible_map, some of them chose to initialize cpu_possible_map
1386 with all 1s, and others with all 0s. When they were centralised, 1381 with all 1s, and others with all 0s. When they were centralised,
1387 it was better to provide this option than to break all the archs 1382 it was better to provide this option than to break all the archs
1388 and have several arch maintainers pursuing me down dark alleys. 1383 and have several arch maintainers pursuing me down dark alleys.
1389 1384
1390 config STOP_MACHINE 1385 config STOP_MACHINE
1391 bool 1386 bool
1392 default y 1387 default y
1393 depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU 1388 depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU
1394 help 1389 help
1395 Need stop_machine() primitive. 1390 Need stop_machine() primitive.
1396 1391
1397 source "block/Kconfig" 1392 source "block/Kconfig"
1398 1393
1399 config PREEMPT_NOTIFIERS 1394 config PREEMPT_NOTIFIERS
1400 bool 1395 bool
1401 1396
1402 config PADATA 1397 config PADATA
1403 depends on SMP 1398 depends on SMP
1404 bool 1399 bool
1405 1400
1406 source "kernel/Kconfig.locks" 1401 source "kernel/Kconfig.locks"
1407 1402
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz 26 * Thomas Gleixner, Mike Kravetz
27 */ 27 */
28 28
29 #include <linux/mm.h> 29 #include <linux/mm.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/nmi.h> 31 #include <linux/nmi.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/uaccess.h> 33 #include <linux/uaccess.h>
34 #include <linux/highmem.h> 34 #include <linux/highmem.h>
35 #include <asm/mmu_context.h> 35 #include <asm/mmu_context.h>
36 #include <linux/interrupt.h> 36 #include <linux/interrupt.h>
37 #include <linux/capability.h> 37 #include <linux/capability.h>
38 #include <linux/completion.h> 38 #include <linux/completion.h>
39 #include <linux/kernel_stat.h> 39 #include <linux/kernel_stat.h>
40 #include <linux/debug_locks.h> 40 #include <linux/debug_locks.h>
41 #include <linux/perf_event.h> 41 #include <linux/perf_event.h>
42 #include <linux/security.h> 42 #include <linux/security.h>
43 #include <linux/notifier.h> 43 #include <linux/notifier.h>
44 #include <linux/profile.h> 44 #include <linux/profile.h>
45 #include <linux/freezer.h> 45 #include <linux/freezer.h>
46 #include <linux/vmalloc.h> 46 #include <linux/vmalloc.h>
47 #include <linux/blkdev.h> 47 #include <linux/blkdev.h>
48 #include <linux/delay.h> 48 #include <linux/delay.h>
49 #include <linux/pid_namespace.h> 49 #include <linux/pid_namespace.h>
50 #include <linux/smp.h> 50 #include <linux/smp.h>
51 #include <linux/threads.h> 51 #include <linux/threads.h>
52 #include <linux/timer.h> 52 #include <linux/timer.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/cpu.h> 54 #include <linux/cpu.h>
55 #include <linux/cpuset.h> 55 #include <linux/cpuset.h>
56 #include <linux/percpu.h> 56 #include <linux/percpu.h>
57 #include <linux/proc_fs.h> 57 #include <linux/proc_fs.h>
58 #include <linux/seq_file.h> 58 #include <linux/seq_file.h>
59 #include <linux/stop_machine.h> 59 #include <linux/stop_machine.h>
60 #include <linux/sysctl.h> 60 #include <linux/sysctl.h>
61 #include <linux/syscalls.h> 61 #include <linux/syscalls.h>
62 #include <linux/times.h> 62 #include <linux/times.h>
63 #include <linux/tsacct_kern.h> 63 #include <linux/tsacct_kern.h>
64 #include <linux/kprobes.h> 64 #include <linux/kprobes.h>
65 #include <linux/delayacct.h> 65 #include <linux/delayacct.h>
66 #include <linux/unistd.h> 66 #include <linux/unistd.h>
67 #include <linux/pagemap.h> 67 #include <linux/pagemap.h>
68 #include <linux/hrtimer.h> 68 #include <linux/hrtimer.h>
69 #include <linux/tick.h> 69 #include <linux/tick.h>
70 #include <linux/debugfs.h> 70 #include <linux/debugfs.h>
71 #include <linux/ctype.h> 71 #include <linux/ctype.h>
72 #include <linux/ftrace.h> 72 #include <linux/ftrace.h>
73 #include <linux/slab.h> 73 #include <linux/slab.h>
74 74
75 #include <asm/tlb.h> 75 #include <asm/tlb.h>
76 #include <asm/irq_regs.h> 76 #include <asm/irq_regs.h>
77 #include <asm/mutex.h> 77 #include <asm/mutex.h>
78 78
79 #include "sched_cpupri.h" 79 #include "sched_cpupri.h"
80 #include "workqueue_sched.h" 80 #include "workqueue_sched.h"
81 #include "sched_autogroup.h" 81 #include "sched_autogroup.h"
82 82
83 #define CREATE_TRACE_POINTS 83 #define CREATE_TRACE_POINTS
84 #include <trace/events/sched.h> 84 #include <trace/events/sched.h>
85 85
86 /* 86 /*
87 * Convert user-nice values [ -20 ... 0 ... 19 ] 87 * Convert user-nice values [ -20 ... 0 ... 19 ]
88 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 88 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
89 * and back. 89 * and back.
90 */ 90 */
91 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 91 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
92 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 92 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
93 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 93 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
94 94
95 /* 95 /*
96 * 'User priority' is the nice value converted to something we 96 * 'User priority' is the nice value converted to something we
97 * can work with better when scaling various scheduler parameters, 97 * can work with better when scaling various scheduler parameters,
98 * it's a [ 0 ... 39 ] range. 98 * it's a [ 0 ... 39 ] range.
99 */ 99 */
100 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 100 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
101 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 101 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
102 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 102 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
103 103
104 /* 104 /*
105 * Helpers for converting nanosecond timing to jiffy resolution 105 * Helpers for converting nanosecond timing to jiffy resolution
106 */ 106 */
107 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 107 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
108 108
109 #define NICE_0_LOAD SCHED_LOAD_SCALE 109 #define NICE_0_LOAD SCHED_LOAD_SCALE
110 #define NICE_0_SHIFT SCHED_LOAD_SHIFT 110 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
111 111
112 /* 112 /*
113 * These are the 'tuning knobs' of the scheduler: 113 * These are the 'tuning knobs' of the scheduler:
114 * 114 *
115 * default timeslice is 100 msecs (used only for SCHED_RR tasks). 115 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
116 * Timeslices get refilled after they expire. 116 * Timeslices get refilled after they expire.
117 */ 117 */
118 #define DEF_TIMESLICE (100 * HZ / 1000) 118 #define DEF_TIMESLICE (100 * HZ / 1000)
119 119
120 /* 120 /*
121 * single value that denotes runtime == period, ie unlimited time. 121 * single value that denotes runtime == period, ie unlimited time.
122 */ 122 */
123 #define RUNTIME_INF ((u64)~0ULL) 123 #define RUNTIME_INF ((u64)~0ULL)
124 124
125 static inline int rt_policy(int policy) 125 static inline int rt_policy(int policy)
126 { 126 {
127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
128 return 1; 128 return 1;
129 return 0; 129 return 0;
130 } 130 }
131 131
132 static inline int task_has_rt_policy(struct task_struct *p) 132 static inline int task_has_rt_policy(struct task_struct *p)
133 { 133 {
134 return rt_policy(p->policy); 134 return rt_policy(p->policy);
135 } 135 }
136 136
137 /* 137 /*
138 * This is the priority-queue data structure of the RT scheduling class: 138 * This is the priority-queue data structure of the RT scheduling class:
139 */ 139 */
140 struct rt_prio_array { 140 struct rt_prio_array {
141 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 141 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
142 struct list_head queue[MAX_RT_PRIO]; 142 struct list_head queue[MAX_RT_PRIO];
143 }; 143 };
144 144
145 struct rt_bandwidth { 145 struct rt_bandwidth {
146 /* nests inside the rq lock: */ 146 /* nests inside the rq lock: */
147 raw_spinlock_t rt_runtime_lock; 147 raw_spinlock_t rt_runtime_lock;
148 ktime_t rt_period; 148 ktime_t rt_period;
149 u64 rt_runtime; 149 u64 rt_runtime;
150 struct hrtimer rt_period_timer; 150 struct hrtimer rt_period_timer;
151 }; 151 };
152 152
153 static struct rt_bandwidth def_rt_bandwidth; 153 static struct rt_bandwidth def_rt_bandwidth;
154 154
155 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 155 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
156 156
157 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 157 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
158 { 158 {
159 struct rt_bandwidth *rt_b = 159 struct rt_bandwidth *rt_b =
160 container_of(timer, struct rt_bandwidth, rt_period_timer); 160 container_of(timer, struct rt_bandwidth, rt_period_timer);
161 ktime_t now; 161 ktime_t now;
162 int overrun; 162 int overrun;
163 int idle = 0; 163 int idle = 0;
164 164
165 for (;;) { 165 for (;;) {
166 now = hrtimer_cb_get_time(timer); 166 now = hrtimer_cb_get_time(timer);
167 overrun = hrtimer_forward(timer, now, rt_b->rt_period); 167 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
168 168
169 if (!overrun) 169 if (!overrun)
170 break; 170 break;
171 171
172 idle = do_sched_rt_period_timer(rt_b, overrun); 172 idle = do_sched_rt_period_timer(rt_b, overrun);
173 } 173 }
174 174
175 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 175 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
176 } 176 }
177 177
178 static 178 static
179 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) 179 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
180 { 180 {
181 rt_b->rt_period = ns_to_ktime(period); 181 rt_b->rt_period = ns_to_ktime(period);
182 rt_b->rt_runtime = runtime; 182 rt_b->rt_runtime = runtime;
183 183
184 raw_spin_lock_init(&rt_b->rt_runtime_lock); 184 raw_spin_lock_init(&rt_b->rt_runtime_lock);
185 185
186 hrtimer_init(&rt_b->rt_period_timer, 186 hrtimer_init(&rt_b->rt_period_timer,
187 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 187 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
188 rt_b->rt_period_timer.function = sched_rt_period_timer; 188 rt_b->rt_period_timer.function = sched_rt_period_timer;
189 } 189 }
190 190
191 static inline int rt_bandwidth_enabled(void) 191 static inline int rt_bandwidth_enabled(void)
192 { 192 {
193 return sysctl_sched_rt_runtime >= 0; 193 return sysctl_sched_rt_runtime >= 0;
194 } 194 }
195 195
196 static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 196 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
197 { 197 {
198 ktime_t now; 198 ktime_t now;
199 199
200 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 200 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
201 return; 201 return;
202 202
203 if (hrtimer_active(&rt_b->rt_period_timer)) 203 if (hrtimer_active(&rt_b->rt_period_timer))
204 return; 204 return;
205 205
206 raw_spin_lock(&rt_b->rt_runtime_lock); 206 raw_spin_lock(&rt_b->rt_runtime_lock);
207 for (;;) { 207 for (;;) {
208 unsigned long delta; 208 unsigned long delta;
209 ktime_t soft, hard; 209 ktime_t soft, hard;
210 210
211 if (hrtimer_active(&rt_b->rt_period_timer)) 211 if (hrtimer_active(&rt_b->rt_period_timer))
212 break; 212 break;
213 213
214 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 214 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
215 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 215 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
216 216
217 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); 217 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
218 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 218 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
219 delta = ktime_to_ns(ktime_sub(hard, soft)); 219 delta = ktime_to_ns(ktime_sub(hard, soft));
220 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 220 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
221 HRTIMER_MODE_ABS_PINNED, 0); 221 HRTIMER_MODE_ABS_PINNED, 0);
222 } 222 }
223 raw_spin_unlock(&rt_b->rt_runtime_lock); 223 raw_spin_unlock(&rt_b->rt_runtime_lock);
224 } 224 }
225 225
226 #ifdef CONFIG_RT_GROUP_SCHED 226 #ifdef CONFIG_RT_GROUP_SCHED
227 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 227 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
228 { 228 {
229 hrtimer_cancel(&rt_b->rt_period_timer); 229 hrtimer_cancel(&rt_b->rt_period_timer);
230 } 230 }
231 #endif 231 #endif
232 232
233 /* 233 /*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 * sched_domains_mutex serializes calls to arch_init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237 static DEFINE_MUTEX(sched_domains_mutex); 237 static DEFINE_MUTEX(sched_domains_mutex);
238 238
239 #ifdef CONFIG_CGROUP_SCHED 239 #ifdef CONFIG_CGROUP_SCHED
240 240
241 #include <linux/cgroup.h> 241 #include <linux/cgroup.h>
242 242
243 struct cfs_rq; 243 struct cfs_rq;
244 244
245 static LIST_HEAD(task_groups); 245 static LIST_HEAD(task_groups);
246 246
247 /* task group related information */ 247 /* task group related information */
248 struct task_group { 248 struct task_group {
249 struct cgroup_subsys_state css; 249 struct cgroup_subsys_state css;
250 250
251 #ifdef CONFIG_FAIR_GROUP_SCHED 251 #ifdef CONFIG_FAIR_GROUP_SCHED
252 /* schedulable entities of this group on each cpu */ 252 /* schedulable entities of this group on each cpu */
253 struct sched_entity **se; 253 struct sched_entity **se;
254 /* runqueue "owned" by this group on each cpu */ 254 /* runqueue "owned" by this group on each cpu */
255 struct cfs_rq **cfs_rq; 255 struct cfs_rq **cfs_rq;
256 unsigned long shares; 256 unsigned long shares;
257 257
258 atomic_t load_weight; 258 atomic_t load_weight;
259 #endif 259 #endif
260 260
261 #ifdef CONFIG_RT_GROUP_SCHED 261 #ifdef CONFIG_RT_GROUP_SCHED
262 struct sched_rt_entity **rt_se; 262 struct sched_rt_entity **rt_se;
263 struct rt_rq **rt_rq; 263 struct rt_rq **rt_rq;
264 264
265 struct rt_bandwidth rt_bandwidth; 265 struct rt_bandwidth rt_bandwidth;
266 #endif 266 #endif
267 267
268 struct rcu_head rcu; 268 struct rcu_head rcu;
269 struct list_head list; 269 struct list_head list;
270 270
271 struct task_group *parent; 271 struct task_group *parent;
272 struct list_head siblings; 272 struct list_head siblings;
273 struct list_head children; 273 struct list_head children;
274 274
275 #ifdef CONFIG_SCHED_AUTOGROUP 275 #ifdef CONFIG_SCHED_AUTOGROUP
276 struct autogroup *autogroup; 276 struct autogroup *autogroup;
277 #endif 277 #endif
278 }; 278 };
279 279
280 /* task_group_lock serializes the addition/removal of task groups */ 280 /* task_group_lock serializes the addition/removal of task groups */
281 static DEFINE_SPINLOCK(task_group_lock); 281 static DEFINE_SPINLOCK(task_group_lock);
282 282
283 #ifdef CONFIG_FAIR_GROUP_SCHED 283 #ifdef CONFIG_FAIR_GROUP_SCHED
284 284
285 # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD 285 # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
286 286
287 /* 287 /*
288 * A weight of 0 or 1 can cause arithmetic problems. 288 * A weight of 0 or 1 can cause arithmetic problems.
289 * The weight of a cfs_rq is the sum of the weights of the entities 289 * The weight of a cfs_rq is the sum of the weights of the entities
290 * queued on it, so the weight of an entity should not be 290 * queued on it, so the weight of an entity should not be
291 * too large, and neither should the shares value of a task group. 291 * too large, and neither should the shares value of a task group.
292 * (The default weight is 1024 - so there's no practical 292 * (The default weight is 1024 - so there's no practical
293 * limitation from this.) 293 * limitation from this.)
294 */ 294 */
295 #define MIN_SHARES 2 295 #define MIN_SHARES 2
296 #define MAX_SHARES (1UL << 18) 296 #define MAX_SHARES (1UL << 18)
297 297
298 static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 298 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
299 #endif 299 #endif
300 300
301 /* Default task group. 301 /* Default task group.
302 * Every task in the system belongs to this group at bootup. 302 * Every task in the system belongs to this group at bootup.
303 */ 303 */
304 struct task_group root_task_group; 304 struct task_group root_task_group;
305 305
306 #endif /* CONFIG_CGROUP_SCHED */ 306 #endif /* CONFIG_CGROUP_SCHED */
307 307
308 /* CFS-related fields in a runqueue */ 308 /* CFS-related fields in a runqueue */
309 struct cfs_rq { 309 struct cfs_rq {
310 struct load_weight load; 310 struct load_weight load;
311 unsigned long nr_running; 311 unsigned long nr_running;
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315 #ifndef CONFIG_64BIT 315 #ifndef CONFIG_64BIT
316 u64 min_vruntime_copy; 316 u64 min_vruntime_copy;
317 #endif 317 #endif
318 318
319 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
320 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
321 321
322 struct list_head tasks; 322 struct list_head tasks;
323 struct list_head *balance_iterator; 323 struct list_head *balance_iterator;
324 324
325 /* 325 /*
326 * 'curr' points to the currently running entity on this cfs_rq. 326 * 'curr' points to the currently running entity on this cfs_rq.
327 * It is set to NULL otherwise (i.e. when none are currently running). 327 * It is set to NULL otherwise (i.e. when none are currently running).
328 */ 328 */
329 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
330 330
331 unsigned int nr_spread_over; 331 unsigned int nr_spread_over;
332 332
333 #ifdef CONFIG_FAIR_GROUP_SCHED 333 #ifdef CONFIG_FAIR_GROUP_SCHED
334 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 334 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
335 335
336 /* 336 /*
337 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 337 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
338 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 338 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
339 * (like users, containers etc.) 339 * (like users, containers etc.)
340 * 340 *
341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
342 * list is used during load balance. 342 * list is used during load balance.
343 */ 343 */
344 int on_list; 344 int on_list;
345 struct list_head leaf_cfs_rq_list; 345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 346 struct task_group *tg; /* group that "owns" this runqueue */
347 347
348 #ifdef CONFIG_SMP 348 #ifdef CONFIG_SMP
349 /* 349 /*
350 * the part of load.weight contributed by tasks 350 * the part of load.weight contributed by tasks
351 */ 351 */
352 unsigned long task_weight; 352 unsigned long task_weight;
353 353
354 /* 354 /*
355 * h_load = weight * f(tg) 355 * h_load = weight * f(tg)
356 * 356 *
357 * Where f(tg) is the recursive weight fraction assigned to 357 * Where f(tg) is the recursive weight fraction assigned to
358 * this group. 358 * this group.
359 */ 359 */
360 unsigned long h_load; 360 unsigned long h_load;
361 361
362 /* 362 /*
363 * Maintaining per-cpu shares distribution for group scheduling 363 * Maintaining per-cpu shares distribution for group scheduling
364 * 364 *
365 * load_stamp is the last time we updated the load average 365 * load_stamp is the last time we updated the load average
366 * load_last is the last time we updated the load average and saw load 366 * load_last is the last time we updated the load average and saw load
367 * load_unacc_exec_time is currently unaccounted execution time 367 * load_unacc_exec_time is currently unaccounted execution time
368 */ 368 */
369 u64 load_avg; 369 u64 load_avg;
370 u64 load_period; 370 u64 load_period;
371 u64 load_stamp, load_last, load_unacc_exec_time; 371 u64 load_stamp, load_last, load_unacc_exec_time;
372 372
373 unsigned long load_contribution; 373 unsigned long load_contribution;
374 #endif 374 #endif
375 #endif 375 #endif
376 }; 376 };
377 377
378 /* Real-Time classes' related field in a runqueue: */ 378 /* Real-Time classes' related field in a runqueue: */
379 struct rt_rq { 379 struct rt_rq {
380 struct rt_prio_array active; 380 struct rt_prio_array active;
381 unsigned long rt_nr_running; 381 unsigned long rt_nr_running;
382 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 382 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
383 struct { 383 struct {
384 int curr; /* highest queued rt task prio */ 384 int curr; /* highest queued rt task prio */
385 #ifdef CONFIG_SMP 385 #ifdef CONFIG_SMP
386 int next; /* next highest */ 386 int next; /* next highest */
387 #endif 387 #endif
388 } highest_prio; 388 } highest_prio;
389 #endif 389 #endif
390 #ifdef CONFIG_SMP 390 #ifdef CONFIG_SMP
391 unsigned long rt_nr_migratory; 391 unsigned long rt_nr_migratory;
392 unsigned long rt_nr_total; 392 unsigned long rt_nr_total;
393 int overloaded; 393 int overloaded;
394 struct plist_head pushable_tasks; 394 struct plist_head pushable_tasks;
395 #endif 395 #endif
396 int rt_throttled; 396 int rt_throttled;
397 u64 rt_time; 397 u64 rt_time;
398 u64 rt_runtime; 398 u64 rt_runtime;
399 /* Nests inside the rq lock: */ 399 /* Nests inside the rq lock: */
400 raw_spinlock_t rt_runtime_lock; 400 raw_spinlock_t rt_runtime_lock;
401 401
402 #ifdef CONFIG_RT_GROUP_SCHED 402 #ifdef CONFIG_RT_GROUP_SCHED
403 unsigned long rt_nr_boosted; 403 unsigned long rt_nr_boosted;
404 404
405 struct rq *rq; 405 struct rq *rq;
406 struct list_head leaf_rt_rq_list; 406 struct list_head leaf_rt_rq_list;
407 struct task_group *tg; 407 struct task_group *tg;
408 #endif 408 #endif
409 }; 409 };
410 410
411 #ifdef CONFIG_SMP 411 #ifdef CONFIG_SMP
412 412
413 /* 413 /*
414 * We add the notion of a root-domain which will be used to define per-domain 414 * We add the notion of a root-domain which will be used to define per-domain
415 * variables. Each exclusive cpuset essentially defines an island domain by 415 * variables. Each exclusive cpuset essentially defines an island domain by
416 * fully partitioning the member cpus from any other cpuset. Whenever a new 416 * fully partitioning the member cpus from any other cpuset. Whenever a new
417 * exclusive cpuset is created, we also create and attach a new root-domain 417 * exclusive cpuset is created, we also create and attach a new root-domain
418 * object. 418 * object.
419 * 419 *
420 */ 420 */
421 struct root_domain { 421 struct root_domain {
422 atomic_t refcount; 422 atomic_t refcount;
423 cpumask_var_t span; 423 cpumask_var_t span;
424 cpumask_var_t online; 424 cpumask_var_t online;
425 425
426 /* 426 /*
427 * The "RT overload" flag: it gets set if a CPU has more than 427 * The "RT overload" flag: it gets set if a CPU has more than
428 * one runnable RT task. 428 * one runnable RT task.
429 */ 429 */
430 cpumask_var_t rto_mask; 430 cpumask_var_t rto_mask;
431 atomic_t rto_count; 431 atomic_t rto_count;
432 struct cpupri cpupri; 432 struct cpupri cpupri;
433 }; 433 };
434 434
435 /* 435 /*
436 * By default the system creates a single root-domain with all cpus as 436 * By default the system creates a single root-domain with all cpus as
437 * members (mimicking the global state we have today). 437 * members (mimicking the global state we have today).
438 */ 438 */
439 static struct root_domain def_root_domain; 439 static struct root_domain def_root_domain;
440 440
441 #endif /* CONFIG_SMP */ 441 #endif /* CONFIG_SMP */
442 442
443 /* 443 /*
444 * This is the main, per-CPU runqueue data structure. 444 * This is the main, per-CPU runqueue data structure.
445 * 445 *
446 * Locking rule: in places that lock multiple runqueues at once 446 * Locking rule: in places that lock multiple runqueues at once
447 * (such as the load balancing or the thread migration code), lock 447 * (such as the load balancing or the thread migration code), lock
448 * acquire operations must be ordered by ascending runqueue address. 448 * acquire operations must be ordered by ascending runqueue address.
449 */ 449 */
450 struct rq { 450 struct rq {
451 /* runqueue lock: */ 451 /* runqueue lock: */
452 raw_spinlock_t lock; 452 raw_spinlock_t lock;
453 453
454 /* 454 /*
455 * nr_running and cpu_load should be in the same cacheline because 455 * nr_running and cpu_load should be in the same cacheline because
456 * remote CPUs use both these fields when doing load calculation. 456 * remote CPUs use both these fields when doing load calculation.
457 */ 457 */
458 unsigned long nr_running; 458 unsigned long nr_running;
459 #define CPU_LOAD_IDX_MAX 5 459 #define CPU_LOAD_IDX_MAX 5
460 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 460 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
461 unsigned long last_load_update_tick; 461 unsigned long last_load_update_tick;
462 #ifdef CONFIG_NO_HZ 462 #ifdef CONFIG_NO_HZ
463 u64 nohz_stamp; 463 u64 nohz_stamp;
464 unsigned char nohz_balance_kick; 464 unsigned char nohz_balance_kick;
465 #endif 465 #endif
466 unsigned int skip_clock_update; 466 unsigned int skip_clock_update;
467 467
468 /* capture load from *all* tasks on this cpu: */ 468 /* capture load from *all* tasks on this cpu: */
469 struct load_weight load; 469 struct load_weight load;
470 unsigned long nr_load_updates; 470 unsigned long nr_load_updates;
471 u64 nr_switches; 471 u64 nr_switches;
472 472
473 struct cfs_rq cfs; 473 struct cfs_rq cfs;
474 struct rt_rq rt; 474 struct rt_rq rt;
475 475
476 #ifdef CONFIG_FAIR_GROUP_SCHED 476 #ifdef CONFIG_FAIR_GROUP_SCHED
477 /* list of leaf cfs_rq on this cpu: */ 477 /* list of leaf cfs_rq on this cpu: */
478 struct list_head leaf_cfs_rq_list; 478 struct list_head leaf_cfs_rq_list;
479 #endif 479 #endif
480 #ifdef CONFIG_RT_GROUP_SCHED 480 #ifdef CONFIG_RT_GROUP_SCHED
481 struct list_head leaf_rt_rq_list; 481 struct list_head leaf_rt_rq_list;
482 #endif 482 #endif
483 483
484 /* 484 /*
485 * This is part of a global counter where only the total sum 485 * This is part of a global counter where only the total sum
486 * over all CPUs matters. A task can increase this counter on 486 * over all CPUs matters. A task can increase this counter on
487 * one CPU and if it got migrated afterwards it may decrease 487 * one CPU and if it got migrated afterwards it may decrease
488 * it on another CPU. Always updated under the runqueue lock: 488 * it on another CPU. Always updated under the runqueue lock:
489 */ 489 */
490 unsigned long nr_uninterruptible; 490 unsigned long nr_uninterruptible;
491 491
492 struct task_struct *curr, *idle, *stop; 492 struct task_struct *curr, *idle, *stop;
493 unsigned long next_balance; 493 unsigned long next_balance;
494 struct mm_struct *prev_mm; 494 struct mm_struct *prev_mm;
495 495
496 u64 clock; 496 u64 clock;
497 u64 clock_task; 497 u64 clock_task;
498 498
499 atomic_t nr_iowait; 499 atomic_t nr_iowait;
500 500
501 #ifdef CONFIG_SMP 501 #ifdef CONFIG_SMP
502 struct root_domain *rd; 502 struct root_domain *rd;
503 struct sched_domain *sd; 503 struct sched_domain *sd;
504 504
505 unsigned long cpu_power; 505 unsigned long cpu_power;
506 506
507 unsigned char idle_at_tick; 507 unsigned char idle_at_tick;
508 /* For active balancing */ 508 /* For active balancing */
509 int post_schedule; 509 int post_schedule;
510 int active_balance; 510 int active_balance;
511 int push_cpu; 511 int push_cpu;
512 struct cpu_stop_work active_balance_work; 512 struct cpu_stop_work active_balance_work;
513 /* cpu of this runqueue: */ 513 /* cpu of this runqueue: */
514 int cpu; 514 int cpu;
515 int online; 515 int online;
516 516
517 unsigned long avg_load_per_task; 517 unsigned long avg_load_per_task;
518 518
519 u64 rt_avg; 519 u64 rt_avg;
520 u64 age_stamp; 520 u64 age_stamp;
521 u64 idle_stamp; 521 u64 idle_stamp;
522 u64 avg_idle; 522 u64 avg_idle;
523 #endif 523 #endif
524 524
525 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 525 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
526 u64 prev_irq_time; 526 u64 prev_irq_time;
527 #endif 527 #endif
528 528
529 /* calc_load related fields */ 529 /* calc_load related fields */
530 unsigned long calc_load_update; 530 unsigned long calc_load_update;
531 long calc_load_active; 531 long calc_load_active;
532 532
533 #ifdef CONFIG_SCHED_HRTICK 533 #ifdef CONFIG_SCHED_HRTICK
534 #ifdef CONFIG_SMP 534 #ifdef CONFIG_SMP
535 int hrtick_csd_pending; 535 int hrtick_csd_pending;
536 struct call_single_data hrtick_csd; 536 struct call_single_data hrtick_csd;
537 #endif 537 #endif
538 struct hrtimer hrtick_timer; 538 struct hrtimer hrtick_timer;
539 #endif 539 #endif
540 540
541 #ifdef CONFIG_SCHEDSTATS 541 #ifdef CONFIG_SCHEDSTATS
542 /* latency stats */ 542 /* latency stats */
543 struct sched_info rq_sched_info; 543 struct sched_info rq_sched_info;
544 unsigned long long rq_cpu_time; 544 unsigned long long rq_cpu_time;
545 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 545 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
546 546
547 /* sys_sched_yield() stats */ 547 /* sys_sched_yield() stats */
548 unsigned int yld_count; 548 unsigned int yld_count;
549 549
550 /* schedule() stats */ 550 /* schedule() stats */
551 unsigned int sched_switch; 551 unsigned int sched_switch;
552 unsigned int sched_count; 552 unsigned int sched_count;
553 unsigned int sched_goidle; 553 unsigned int sched_goidle;
554 554
555 /* try_to_wake_up() stats */ 555 /* try_to_wake_up() stats */
556 unsigned int ttwu_count; 556 unsigned int ttwu_count;
557 unsigned int ttwu_local; 557 unsigned int ttwu_local;
558 #endif 558 #endif
559 559
560 #ifdef CONFIG_SMP 560 #ifdef CONFIG_SMP
561 struct task_struct *wake_list; 561 struct task_struct *wake_list;
562 #endif 562 #endif
563 }; 563 };
564 564
565 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 565 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
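The locking rule stated in the struct rq comment above (take multiple runqueue locks in ascending address order) is what prevents ABBA deadlocks when two CPUs need the same pair of runqueues. Below is a minimal userspace sketch of that rule using pthreads; the demo_* names and the two-element runqueue array are illustrative assumptions, not the kernel's double_rq_lock() itself.

#include <pthread.h>
#include <stdio.h>

/* Minimal stand-in for a runqueue: just a lock and a counter. */
struct demo_rq {
    pthread_mutex_t lock;
    long nr_running;
};

/*
 * Lock two runqueues in ascending address order: a fixed global order means
 * two threads locking the same pair can never hold one lock each and wait
 * forever for the other.
 */
static void demo_double_rq_lock(struct demo_rq *a, struct demo_rq *b)
{
    if (a == b) {
        pthread_mutex_lock(&a->lock);
    } else if (a < b) {
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
    } else {
        pthread_mutex_lock(&b->lock);
        pthread_mutex_lock(&a->lock);
    }
}

static void demo_double_rq_unlock(struct demo_rq *a, struct demo_rq *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
    struct demo_rq rq[2] = {
        { PTHREAD_MUTEX_INITIALIZER, 0 },
        { PTHREAD_MUTEX_INITIALIZER, 1 },
    };

    /* Move one "task" from rq[1] to rq[0] while holding both locks. */
    demo_double_rq_lock(&rq[1], &rq[0]);
    rq[1].nr_running--;
    rq[0].nr_running++;
    demo_double_rq_unlock(&rq[1], &rq[0]);

    printf("rq0=%ld rq1=%ld\n", rq[0].nr_running, rq[1].nr_running);
    return 0;
}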
566 566
567 567
568 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); 568 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569 569
570 static inline int cpu_of(struct rq *rq) 570 static inline int cpu_of(struct rq *rq)
571 { 571 {
572 #ifdef CONFIG_SMP 572 #ifdef CONFIG_SMP
573 return rq->cpu; 573 return rq->cpu;
574 #else 574 #else
575 return 0; 575 return 0;
576 #endif 576 #endif
577 } 577 }
578 578
579 #define rcu_dereference_check_sched_domain(p) \ 579 #define rcu_dereference_check_sched_domain(p) \
580 rcu_dereference_check((p), \ 580 rcu_dereference_check((p), \
581 rcu_read_lock_sched_held() || \ 581 rcu_read_lock_sched_held() || \
582 lockdep_is_held(&sched_domains_mutex)) 582 lockdep_is_held(&sched_domains_mutex))
583 583
584 /* 584 /*
585 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 585 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
586 * See detach_destroy_domains: synchronize_sched for details. 586 * See detach_destroy_domains: synchronize_sched for details.
587 * 587 *
588 * The domain tree of any CPU may only be accessed from within 588 * The domain tree of any CPU may only be accessed from within
589 * preempt-disabled sections. 589 * preempt-disabled sections.
590 */ 590 */
591 #define for_each_domain(cpu, __sd) \ 591 #define for_each_domain(cpu, __sd) \
592 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 592 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
593 593
594 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 594 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
595 #define this_rq() (&__get_cpu_var(runqueues)) 595 #define this_rq() (&__get_cpu_var(runqueues))
596 #define task_rq(p) cpu_rq(task_cpu(p)) 596 #define task_rq(p) cpu_rq(task_cpu(p))
597 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 597 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
598 #define raw_rq() (&__raw_get_cpu_var(runqueues)) 598 #define raw_rq() (&__raw_get_cpu_var(runqueues))
599 599
600 #ifdef CONFIG_CGROUP_SCHED 600 #ifdef CONFIG_CGROUP_SCHED
601 601
602 /* 602 /*
603 * Return the group to which this task belongs. 603 * Return the group to which this task belongs.
604 * 604 *
605 * We use task_subsys_state_check() and extend the RCU verification 605 * We use task_subsys_state_check() and extend the RCU verification
606 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() 606 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
607 * holds that lock for each task it moves into the cgroup. Therefore 607 * holds that lock for each task it moves into the cgroup. Therefore
608 * by holding that lock, we pin the task to the current cgroup. 608 * by holding that lock, we pin the task to the current cgroup.
609 */ 609 */
610 static inline struct task_group *task_group(struct task_struct *p) 610 static inline struct task_group *task_group(struct task_struct *p)
611 { 611 {
612 struct task_group *tg; 612 struct task_group *tg;
613 struct cgroup_subsys_state *css; 613 struct cgroup_subsys_state *css;
614 614
615 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 615 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
616 lockdep_is_held(&p->pi_lock)); 616 lockdep_is_held(&p->pi_lock));
617 tg = container_of(css, struct task_group, css); 617 tg = container_of(css, struct task_group, css);
618 618
619 return autogroup_task_group(p, tg); 619 return autogroup_task_group(p, tg);
620 } 620 }
621 621
622 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 622 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
623 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 623 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
624 { 624 {
625 #ifdef CONFIG_FAIR_GROUP_SCHED 625 #ifdef CONFIG_FAIR_GROUP_SCHED
626 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 626 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
627 p->se.parent = task_group(p)->se[cpu]; 627 p->se.parent = task_group(p)->se[cpu];
628 #endif 628 #endif
629 629
630 #ifdef CONFIG_RT_GROUP_SCHED 630 #ifdef CONFIG_RT_GROUP_SCHED
631 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 631 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
632 p->rt.parent = task_group(p)->rt_se[cpu]; 632 p->rt.parent = task_group(p)->rt_se[cpu];
633 #endif 633 #endif
634 } 634 }
635 635
636 #else /* CONFIG_CGROUP_SCHED */ 636 #else /* CONFIG_CGROUP_SCHED */
637 637
638 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 638 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
639 static inline struct task_group *task_group(struct task_struct *p) 639 static inline struct task_group *task_group(struct task_struct *p)
640 { 640 {
641 return NULL; 641 return NULL;
642 } 642 }
643 643
644 #endif /* CONFIG_CGROUP_SCHED */ 644 #endif /* CONFIG_CGROUP_SCHED */
645 645
646 static void update_rq_clock_task(struct rq *rq, s64 delta); 646 static void update_rq_clock_task(struct rq *rq, s64 delta);
647 647
648 static void update_rq_clock(struct rq *rq) 648 static void update_rq_clock(struct rq *rq)
649 { 649 {
650 s64 delta; 650 s64 delta;
651 651
652 if (rq->skip_clock_update) 652 if (rq->skip_clock_update)
653 return; 653 return;
654 654
655 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 655 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
656 rq->clock += delta; 656 rq->clock += delta;
657 update_rq_clock_task(rq, delta); 657 update_rq_clock_task(rq, delta);
658 } 658 }
659 659
660 /* 660 /*
661 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 661 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
662 */ 662 */
663 #ifdef CONFIG_SCHED_DEBUG 663 #ifdef CONFIG_SCHED_DEBUG
664 # define const_debug __read_mostly 664 # define const_debug __read_mostly
665 #else 665 #else
666 # define const_debug static const 666 # define const_debug static const
667 #endif 667 #endif
668 668
669 /** 669 /**
670 * runqueue_is_locked - Returns true if the current cpu runqueue is locked 670 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
671 * @cpu: the processor in question. 671 * @cpu: the processor in question.
672 * 672 *
673 * This interface allows printk to be called with the runqueue lock 673 * This interface allows printk to be called with the runqueue lock
674 * held and know whether or not it is OK to wake up the klogd. 674 * held and know whether or not it is OK to wake up the klogd.
675 */ 675 */
676 int runqueue_is_locked(int cpu) 676 int runqueue_is_locked(int cpu)
677 { 677 {
678 return raw_spin_is_locked(&cpu_rq(cpu)->lock); 678 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
679 } 679 }
680 680
681 /* 681 /*
682 * Debugging: various feature bits 682 * Debugging: various feature bits
683 */ 683 */
684 684
685 #define SCHED_FEAT(name, enabled) \ 685 #define SCHED_FEAT(name, enabled) \
686 __SCHED_FEAT_##name , 686 __SCHED_FEAT_##name ,
687 687
688 enum { 688 enum {
689 #include "sched_features.h" 689 #include "sched_features.h"
690 }; 690 };
691 691
692 #undef SCHED_FEAT 692 #undef SCHED_FEAT
693 693
694 #define SCHED_FEAT(name, enabled) \ 694 #define SCHED_FEAT(name, enabled) \
695 (1UL << __SCHED_FEAT_##name) * enabled | 695 (1UL << __SCHED_FEAT_##name) * enabled |
696 696
697 const_debug unsigned int sysctl_sched_features = 697 const_debug unsigned int sysctl_sched_features =
698 #include "sched_features.h" 698 #include "sched_features.h"
699 0; 699 0;
700 700
701 #undef SCHED_FEAT 701 #undef SCHED_FEAT
702 702
703 #ifdef CONFIG_SCHED_DEBUG 703 #ifdef CONFIG_SCHED_DEBUG
704 #define SCHED_FEAT(name, enabled) \ 704 #define SCHED_FEAT(name, enabled) \
705 #name , 705 #name ,
706 706
707 static __read_mostly char *sched_feat_names[] = { 707 static __read_mostly char *sched_feat_names[] = {
708 #include "sched_features.h" 708 #include "sched_features.h"
709 NULL 709 NULL
710 }; 710 };
711 711
712 #undef SCHED_FEAT 712 #undef SCHED_FEAT
713 713
714 static int sched_feat_show(struct seq_file *m, void *v) 714 static int sched_feat_show(struct seq_file *m, void *v)
715 { 715 {
716 int i; 716 int i;
717 717
718 for (i = 0; sched_feat_names[i]; i++) { 718 for (i = 0; sched_feat_names[i]; i++) {
719 if (!(sysctl_sched_features & (1UL << i))) 719 if (!(sysctl_sched_features & (1UL << i)))
720 seq_puts(m, "NO_"); 720 seq_puts(m, "NO_");
721 seq_printf(m, "%s ", sched_feat_names[i]); 721 seq_printf(m, "%s ", sched_feat_names[i]);
722 } 722 }
723 seq_puts(m, "\n"); 723 seq_puts(m, "\n");
724 724
725 return 0; 725 return 0;
726 } 726 }
727 727
728 static ssize_t 728 static ssize_t
729 sched_feat_write(struct file *filp, const char __user *ubuf, 729 sched_feat_write(struct file *filp, const char __user *ubuf,
730 size_t cnt, loff_t *ppos) 730 size_t cnt, loff_t *ppos)
731 { 731 {
732 char buf[64]; 732 char buf[64];
733 char *cmp; 733 char *cmp;
734 int neg = 0; 734 int neg = 0;
735 int i; 735 int i;
736 736
737 if (cnt > 63) 737 if (cnt > 63)
738 cnt = 63; 738 cnt = 63;
739 739
740 if (copy_from_user(&buf, ubuf, cnt)) 740 if (copy_from_user(&buf, ubuf, cnt))
741 return -EFAULT; 741 return -EFAULT;
742 742
743 buf[cnt] = 0; 743 buf[cnt] = 0;
744 cmp = strstrip(buf); 744 cmp = strstrip(buf);
745 745
746 if (strncmp(cmp, "NO_", 3) == 0) { 746 if (strncmp(cmp, "NO_", 3) == 0) {
747 neg = 1; 747 neg = 1;
748 cmp += 3; 748 cmp += 3;
749 } 749 }
750 750
751 for (i = 0; sched_feat_names[i]; i++) { 751 for (i = 0; sched_feat_names[i]; i++) {
752 if (strcmp(cmp, sched_feat_names[i]) == 0) { 752 if (strcmp(cmp, sched_feat_names[i]) == 0) {
753 if (neg) 753 if (neg)
754 sysctl_sched_features &= ~(1UL << i); 754 sysctl_sched_features &= ~(1UL << i);
755 else 755 else
756 sysctl_sched_features |= (1UL << i); 756 sysctl_sched_features |= (1UL << i);
757 break; 757 break;
758 } 758 }
759 } 759 }
760 760
761 if (!sched_feat_names[i]) 761 if (!sched_feat_names[i])
762 return -EINVAL; 762 return -EINVAL;
763 763
764 *ppos += cnt; 764 *ppos += cnt;
765 765
766 return cnt; 766 return cnt;
767 } 767 }
768 768
769 static int sched_feat_open(struct inode *inode, struct file *filp) 769 static int sched_feat_open(struct inode *inode, struct file *filp)
770 { 770 {
771 return single_open(filp, sched_feat_show, NULL); 771 return single_open(filp, sched_feat_show, NULL);
772 } 772 }
773 773
774 static const struct file_operations sched_feat_fops = { 774 static const struct file_operations sched_feat_fops = {
775 .open = sched_feat_open, 775 .open = sched_feat_open,
776 .write = sched_feat_write, 776 .write = sched_feat_write,
777 .read = seq_read, 777 .read = seq_read,
778 .llseek = seq_lseek, 778 .llseek = seq_lseek,
779 .release = single_release, 779 .release = single_release,
780 }; 780 };
781 781
782 static __init int sched_init_debug(void) 782 static __init int sched_init_debug(void)
783 { 783 {
784 debugfs_create_file("sched_features", 0644, NULL, NULL, 784 debugfs_create_file("sched_features", 0644, NULL, NULL,
785 &sched_feat_fops); 785 &sched_feat_fops);
786 786
787 return 0; 787 return 0;
788 } 788 }
789 late_initcall(sched_init_debug); 789 late_initcall(sched_init_debug);
790 790
791 #endif 791 #endif
792 792
793 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 793 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
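The SCHED_FEAT construct above is an X-macro: sched_features.h is included several times with different SCHED_FEAT() definitions, once to build an enum of bit positions, once to build the default feature bitmask, under CONFIG_SCHED_DEBUG once more to build a name table, and finally sched_feat(x) tests a single bit. The following self-contained sketch shows the same trick; because a single file cannot re-include a header of its own, it uses a list macro instead, and the feature names and all demo_* identifiers are illustrative.

#include <stdio.h>

/* Stand-in for re-including sched_features.h: one list, expanded three ways. */
#define DEMO_FEATURES(F) \
    F(GENTLE_FAIR_SLEEPERS, 1) \
    F(START_DEBIT, 1) \
    F(HRTICK, 0)

/* Expansion 1: an enum of bit positions. */
#define F_ENUM(name, enabled) __DEMO_FEAT_##name,
enum { DEMO_FEATURES(F_ENUM) __DEMO_FEAT_NR };
#undef F_ENUM

/* Expansion 2: the default bitmask (enabled features only). */
#define F_MASK(name, enabled) (1UL << __DEMO_FEAT_##name) * (enabled) |
static unsigned long demo_features = DEMO_FEATURES(F_MASK) 0;
#undef F_MASK

/* Expansion 3: a printable name table, as used by the debugfs show/write code. */
#define F_NAME(name, enabled) #name,
static const char *demo_feat_names[] = { DEMO_FEATURES(F_NAME) NULL };
#undef F_NAME

/* Same shape as sched_feat(x): test a single feature bit. */
#define demo_feat(x) (demo_features & (1UL << __DEMO_FEAT_##x))

int main(void)
{
    for (int i = 0; demo_feat_names[i]; i++)
        printf("%s%s ", (demo_features & (1UL << i)) ? "" : "NO_",
               demo_feat_names[i]);
    printf("\nHRTICK enabled? %s\n", demo_feat(HRTICK) ? "yes" : "no");
    return 0;
}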
794 794
795 /* 795 /*
796 * Number of tasks to iterate in a single balance run. 796 * Number of tasks to iterate in a single balance run.
797 * Limited because this is done with IRQs disabled. 797 * Limited because this is done with IRQs disabled.
798 */ 798 */
799 const_debug unsigned int sysctl_sched_nr_migrate = 32; 799 const_debug unsigned int sysctl_sched_nr_migrate = 32;
800 800
801 /* 801 /*
802 * period over which we average the RT time consumption, measured 802 * period over which we average the RT time consumption, measured
803 * in ms. 803 * in ms.
804 * 804 *
805 * default: 1s 805 * default: 1s
806 */ 806 */
807 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 807 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
808 808
809 /* 809 /*
810 * period over which we measure -rt task cpu usage in us. 810 * period over which we measure -rt task cpu usage in us.
811 * default: 1s 811 * default: 1s
812 */ 812 */
813 unsigned int sysctl_sched_rt_period = 1000000; 813 unsigned int sysctl_sched_rt_period = 1000000;
814 814
815 static __read_mostly int scheduler_running; 815 static __read_mostly int scheduler_running;
816 816
817 /* 817 /*
818 * part of the period that we allow rt tasks to run in us. 818 * part of the period that we allow rt tasks to run in us.
819 * default: 0.95s 819 * default: 0.95s
820 */ 820 */
821 int sysctl_sched_rt_runtime = 950000; 821 int sysctl_sched_rt_runtime = 950000;
822 822
823 static inline u64 global_rt_period(void) 823 static inline u64 global_rt_period(void)
824 { 824 {
825 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 825 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
826 } 826 }
827 827
828 static inline u64 global_rt_runtime(void) 828 static inline u64 global_rt_runtime(void)
829 { 829 {
830 if (sysctl_sched_rt_runtime < 0) 830 if (sysctl_sched_rt_runtime < 0)
831 return RUNTIME_INF; 831 return RUNTIME_INF;
832 832
833 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 833 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
834 } 834 }
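As a quick worked example of the two helpers above with the default sysctl values: the period is 1,000,000 us = 1 s and the runtime is 950,000 us = 0.95 s, so realtime tasks may by default consume at most 95% of every one-second period, while a negative runtime maps to RUNTIME_INF (no throttling). The standalone sketch below redoes that conversion; NSEC_PER_USEC and RUNTIME_INF are redefined locally for illustration only.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL
#define RUNTIME_INF ((uint64_t)~0ULL)

static int sysctl_sched_rt_period = 1000000;  /* us, default: 1s    */
static int sysctl_sched_rt_runtime = 950000;  /* us, default: 0.95s */

static uint64_t demo_rt_period(void)
{
    return (uint64_t)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static uint64_t demo_rt_runtime(void)
{
    if (sysctl_sched_rt_runtime < 0)
        return RUNTIME_INF;
    return (uint64_t)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

int main(void)
{
    printf("period   = %llu ns\n", (unsigned long long)demo_rt_period());
    printf("runtime  = %llu ns\n", (unsigned long long)demo_rt_runtime());
    printf("rt share = %.0f%% of each period\n",
           100.0 * demo_rt_runtime() / demo_rt_period());
    return 0;
}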
835 835
836 #ifndef prepare_arch_switch 836 #ifndef prepare_arch_switch
837 # define prepare_arch_switch(next) do { } while (0) 837 # define prepare_arch_switch(next) do { } while (0)
838 #endif 838 #endif
839 #ifndef finish_arch_switch 839 #ifndef finish_arch_switch
840 # define finish_arch_switch(prev) do { } while (0) 840 # define finish_arch_switch(prev) do { } while (0)
841 #endif 841 #endif
842 842
843 static inline int task_current(struct rq *rq, struct task_struct *p) 843 static inline int task_current(struct rq *rq, struct task_struct *p)
844 { 844 {
845 return rq->curr == p; 845 return rq->curr == p;
846 } 846 }
847 847
848 static inline int task_running(struct rq *rq, struct task_struct *p) 848 static inline int task_running(struct rq *rq, struct task_struct *p)
849 { 849 {
850 #ifdef CONFIG_SMP 850 #ifdef CONFIG_SMP
851 return p->on_cpu; 851 return p->on_cpu;
852 #else 852 #else
853 return task_current(rq, p); 853 return task_current(rq, p);
854 #endif 854 #endif
855 } 855 }
856 856
857 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 857 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
858 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 858 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
859 { 859 {
860 #ifdef CONFIG_SMP 860 #ifdef CONFIG_SMP
861 /* 861 /*
862 * We can optimise this out completely for !SMP, because the 862 * We can optimise this out completely for !SMP, because the
863 * SMP rebalancing from interrupt is the only thing that cares 863 * SMP rebalancing from interrupt is the only thing that cares
864 * here. 864 * here.
865 */ 865 */
866 next->on_cpu = 1; 866 next->on_cpu = 1;
867 #endif 867 #endif
868 } 868 }
869 869
870 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 870 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
871 { 871 {
872 #ifdef CONFIG_SMP 872 #ifdef CONFIG_SMP
873 /* 873 /*
874 * After ->on_cpu is cleared, the task can be moved to a different CPU. 874 * After ->on_cpu is cleared, the task can be moved to a different CPU.
875 * We must ensure this doesn't happen until the switch is completely 875 * We must ensure this doesn't happen until the switch is completely
876 * finished. 876 * finished.
877 */ 877 */
878 smp_wmb(); 878 smp_wmb();
879 prev->on_cpu = 0; 879 prev->on_cpu = 0;
880 #endif 880 #endif
881 #ifdef CONFIG_DEBUG_SPINLOCK 881 #ifdef CONFIG_DEBUG_SPINLOCK
882 /* this is a valid case when another task releases the spinlock */ 882 /* this is a valid case when another task releases the spinlock */
883 rq->lock.owner = current; 883 rq->lock.owner = current;
884 #endif 884 #endif
885 /* 885 /*
886 * If we are tracking spinlock dependencies then we have to 886 * If we are tracking spinlock dependencies then we have to
887 * fix up the runqueue lock - which gets 'carried over' from 887 * fix up the runqueue lock - which gets 'carried over' from
888 * prev into current: 888 * prev into current:
889 */ 889 */
890 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 890 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
891 891
892 raw_spin_unlock_irq(&rq->lock); 892 raw_spin_unlock_irq(&rq->lock);
893 } 893 }
894 894
895 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 895 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
896 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 896 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
897 { 897 {
898 #ifdef CONFIG_SMP 898 #ifdef CONFIG_SMP
899 /* 899 /*
900 * We can optimise this out completely for !SMP, because the 900 * We can optimise this out completely for !SMP, because the
901 * SMP rebalancing from interrupt is the only thing that cares 901 * SMP rebalancing from interrupt is the only thing that cares
902 * here. 902 * here.
903 */ 903 */
904 next->on_cpu = 1; 904 next->on_cpu = 1;
905 #endif 905 #endif
906 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 906 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
907 raw_spin_unlock_irq(&rq->lock); 907 raw_spin_unlock_irq(&rq->lock);
908 #else 908 #else
909 raw_spin_unlock(&rq->lock); 909 raw_spin_unlock(&rq->lock);
910 #endif 910 #endif
911 } 911 }
912 912
913 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 913 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
914 { 914 {
915 #ifdef CONFIG_SMP 915 #ifdef CONFIG_SMP
916 /* 916 /*
917 * After ->on_cpu is cleared, the task can be moved to a different CPU. 917 * After ->on_cpu is cleared, the task can be moved to a different CPU.
918 * We must ensure this doesn't happen until the switch is completely 918 * We must ensure this doesn't happen until the switch is completely
919 * finished. 919 * finished.
920 */ 920 */
921 smp_wmb(); 921 smp_wmb();
922 prev->on_cpu = 0; 922 prev->on_cpu = 0;
923 #endif 923 #endif
924 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 924 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
925 local_irq_enable(); 925 local_irq_enable();
926 #endif 926 #endif
927 } 927 }
928 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 928 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
929 929
930 /* 930 /*
931 * __task_rq_lock - lock the rq @p resides on. 931 * __task_rq_lock - lock the rq @p resides on.
932 */ 932 */
933 static inline struct rq *__task_rq_lock(struct task_struct *p) 933 static inline struct rq *__task_rq_lock(struct task_struct *p)
934 __acquires(rq->lock) 934 __acquires(rq->lock)
935 { 935 {
936 struct rq *rq; 936 struct rq *rq;
937 937
938 lockdep_assert_held(&p->pi_lock); 938 lockdep_assert_held(&p->pi_lock);
939 939
940 for (;;) { 940 for (;;) {
941 rq = task_rq(p); 941 rq = task_rq(p);
942 raw_spin_lock(&rq->lock); 942 raw_spin_lock(&rq->lock);
943 if (likely(rq == task_rq(p))) 943 if (likely(rq == task_rq(p)))
944 return rq; 944 return rq;
945 raw_spin_unlock(&rq->lock); 945 raw_spin_unlock(&rq->lock);
946 } 946 }
947 } 947 }
948 948
949 /* 949 /*
950 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 950 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
951 */ 951 */
952 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 952 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
953 __acquires(p->pi_lock) 953 __acquires(p->pi_lock)
954 __acquires(rq->lock) 954 __acquires(rq->lock)
955 { 955 {
956 struct rq *rq; 956 struct rq *rq;
957 957
958 for (;;) { 958 for (;;) {
959 raw_spin_lock_irqsave(&p->pi_lock, *flags); 959 raw_spin_lock_irqsave(&p->pi_lock, *flags);
960 rq = task_rq(p); 960 rq = task_rq(p);
961 raw_spin_lock(&rq->lock); 961 raw_spin_lock(&rq->lock);
962 if (likely(rq == task_rq(p))) 962 if (likely(rq == task_rq(p)))
963 return rq; 963 return rq;
964 raw_spin_unlock(&rq->lock); 964 raw_spin_unlock(&rq->lock);
965 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 965 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
966 } 966 }
967 } 967 }
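Both lock helpers above use the same retry idiom: look up the runqueue the task appears to be on, take its lock, then re-check task_rq(p) under that lock, because the task may have been migrated between the lookup and the acquisition. A minimal userspace sketch of that check-lock-recheck loop follows, using pthreads; the demo_* names and the fixed two-CPU array are illustrative assumptions.

#include <pthread.h>
#include <stdio.h>

#define DEMO_NR_CPUS 2

static struct demo_rq { pthread_mutex_t lock; } demo_runqueues[DEMO_NR_CPUS] = {
    { PTHREAD_MUTEX_INITIALIZER }, { PTHREAD_MUTEX_INITIALIZER },
};

struct demo_task { int cpu; /* which runqueue the task is currently on */ };

static struct demo_rq *demo_task_rq(struct demo_task *p)
{
    return &demo_runqueues[p->cpu];
}

/*
 * Lock the runqueue the task is on.  Since p->cpu can change while we hold
 * no lock, re-check it after acquiring and retry if the task moved in the
 * meantime -- the same loop as __task_rq_lock() above.
 */
static struct demo_rq *demo_task_rq_lock(struct demo_task *p)
{
    for (;;) {
        struct demo_rq *rq = demo_task_rq(p);

        pthread_mutex_lock(&rq->lock);
        if (rq == demo_task_rq(p))
            return rq;                    /* still on the same runqueue */
        pthread_mutex_unlock(&rq->lock);  /* migrated: try again */
    }
}

int main(void)
{
    struct demo_task t = { .cpu = 1 };
    struct demo_rq *rq = demo_task_rq_lock(&t);

    printf("locked the runqueue of cpu %d\n", t.cpu);
    pthread_mutex_unlock(&rq->lock);
    return 0;
}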
968 968
969 static void __task_rq_unlock(struct rq *rq) 969 static void __task_rq_unlock(struct rq *rq)
970 __releases(rq->lock) 970 __releases(rq->lock)
971 { 971 {
972 raw_spin_unlock(&rq->lock); 972 raw_spin_unlock(&rq->lock);
973 } 973 }
974 974
975 static inline void 975 static inline void
976 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 976 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
977 __releases(rq->lock) 977 __releases(rq->lock)
978 __releases(p->pi_lock) 978 __releases(p->pi_lock)
979 { 979 {
980 raw_spin_unlock(&rq->lock); 980 raw_spin_unlock(&rq->lock);
981 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 981 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
982 } 982 }
983 983
984 /* 984 /*
985 * this_rq_lock - lock this runqueue and disable interrupts. 985 * this_rq_lock - lock this runqueue and disable interrupts.
986 */ 986 */
987 static struct rq *this_rq_lock(void) 987 static struct rq *this_rq_lock(void)
988 __acquires(rq->lock) 988 __acquires(rq->lock)
989 { 989 {
990 struct rq *rq; 990 struct rq *rq;
991 991
992 local_irq_disable(); 992 local_irq_disable();
993 rq = this_rq(); 993 rq = this_rq();
994 raw_spin_lock(&rq->lock); 994 raw_spin_lock(&rq->lock);
995 995
996 return rq; 996 return rq;
997 } 997 }
998 998
999 #ifdef CONFIG_SCHED_HRTICK 999 #ifdef CONFIG_SCHED_HRTICK
1000 /* 1000 /*
1001 * Use HR-timers to deliver accurate preemption points. 1001 * Use HR-timers to deliver accurate preemption points.
1002 * 1002 *
1003 * It's all a bit involved since we cannot program an hrtimer while holding the 1003 * It's all a bit involved since we cannot program an hrtimer while holding the
1004 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a 1004 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
1005 * reschedule event. 1005 * reschedule event.
1006 * 1006 *
1007 * When we get rescheduled we reprogram the hrtick_timer outside of the 1007 * When we get rescheduled we reprogram the hrtick_timer outside of the
1008 * rq->lock. 1008 * rq->lock.
1009 */ 1009 */
1010 1010
1011 /* 1011 /*
1012 * Use hrtick when: 1012 * Use hrtick when:
1013 * - enabled by features 1013 * - enabled by features
1014 * - hrtimer is actually high res 1014 * - hrtimer is actually high res
1015 */ 1015 */
1016 static inline int hrtick_enabled(struct rq *rq) 1016 static inline int hrtick_enabled(struct rq *rq)
1017 { 1017 {
1018 if (!sched_feat(HRTICK)) 1018 if (!sched_feat(HRTICK))
1019 return 0; 1019 return 0;
1020 if (!cpu_active(cpu_of(rq))) 1020 if (!cpu_active(cpu_of(rq)))
1021 return 0; 1021 return 0;
1022 return hrtimer_is_hres_active(&rq->hrtick_timer); 1022 return hrtimer_is_hres_active(&rq->hrtick_timer);
1023 } 1023 }
1024 1024
1025 static void hrtick_clear(struct rq *rq) 1025 static void hrtick_clear(struct rq *rq)
1026 { 1026 {
1027 if (hrtimer_active(&rq->hrtick_timer)) 1027 if (hrtimer_active(&rq->hrtick_timer))
1028 hrtimer_cancel(&rq->hrtick_timer); 1028 hrtimer_cancel(&rq->hrtick_timer);
1029 } 1029 }
1030 1030
1031 /* 1031 /*
1032 * High-resolution timer tick. 1032 * High-resolution timer tick.
1033 * Runs from hardirq context with interrupts disabled. 1033 * Runs from hardirq context with interrupts disabled.
1034 */ 1034 */
1035 static enum hrtimer_restart hrtick(struct hrtimer *timer) 1035 static enum hrtimer_restart hrtick(struct hrtimer *timer)
1036 { 1036 {
1037 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 1037 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1038 1038
1039 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1039 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1040 1040
1041 raw_spin_lock(&rq->lock); 1041 raw_spin_lock(&rq->lock);
1042 update_rq_clock(rq); 1042 update_rq_clock(rq);
1043 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1043 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1044 raw_spin_unlock(&rq->lock); 1044 raw_spin_unlock(&rq->lock);
1045 1045
1046 return HRTIMER_NORESTART; 1046 return HRTIMER_NORESTART;
1047 } 1047 }
1048 1048
1049 #ifdef CONFIG_SMP 1049 #ifdef CONFIG_SMP
1050 /* 1050 /*
1051 * called from hardirq (IPI) context 1051 * called from hardirq (IPI) context
1052 */ 1052 */
1053 static void __hrtick_start(void *arg) 1053 static void __hrtick_start(void *arg)
1054 { 1054 {
1055 struct rq *rq = arg; 1055 struct rq *rq = arg;
1056 1056
1057 raw_spin_lock(&rq->lock); 1057 raw_spin_lock(&rq->lock);
1058 hrtimer_restart(&rq->hrtick_timer); 1058 hrtimer_restart(&rq->hrtick_timer);
1059 rq->hrtick_csd_pending = 0; 1059 rq->hrtick_csd_pending = 0;
1060 raw_spin_unlock(&rq->lock); 1060 raw_spin_unlock(&rq->lock);
1061 } 1061 }
1062 1062
1063 /* 1063 /*
1064 * Called to set the hrtick timer state. 1064 * Called to set the hrtick timer state.
1065 * 1065 *
1066 * called with rq->lock held and irqs disabled 1066 * called with rq->lock held and irqs disabled
1067 */ 1067 */
1068 static void hrtick_start(struct rq *rq, u64 delay) 1068 static void hrtick_start(struct rq *rq, u64 delay)
1069 { 1069 {
1070 struct hrtimer *timer = &rq->hrtick_timer; 1070 struct hrtimer *timer = &rq->hrtick_timer;
1071 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 1071 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1072 1072
1073 hrtimer_set_expires(timer, time); 1073 hrtimer_set_expires(timer, time);
1074 1074
1075 if (rq == this_rq()) { 1075 if (rq == this_rq()) {
1076 hrtimer_restart(timer); 1076 hrtimer_restart(timer);
1077 } else if (!rq->hrtick_csd_pending) { 1077 } else if (!rq->hrtick_csd_pending) {
1078 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 1078 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1079 rq->hrtick_csd_pending = 1; 1079 rq->hrtick_csd_pending = 1;
1080 } 1080 }
1081 } 1081 }
1082 1082
1083 static int 1083 static int
1084 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 1084 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1085 { 1085 {
1086 int cpu = (int)(long)hcpu; 1086 int cpu = (int)(long)hcpu;
1087 1087
1088 switch (action) { 1088 switch (action) {
1089 case CPU_UP_CANCELED: 1089 case CPU_UP_CANCELED:
1090 case CPU_UP_CANCELED_FROZEN: 1090 case CPU_UP_CANCELED_FROZEN:
1091 case CPU_DOWN_PREPARE: 1091 case CPU_DOWN_PREPARE:
1092 case CPU_DOWN_PREPARE_FROZEN: 1092 case CPU_DOWN_PREPARE_FROZEN:
1093 case CPU_DEAD: 1093 case CPU_DEAD:
1094 case CPU_DEAD_FROZEN: 1094 case CPU_DEAD_FROZEN:
1095 hrtick_clear(cpu_rq(cpu)); 1095 hrtick_clear(cpu_rq(cpu));
1096 return NOTIFY_OK; 1096 return NOTIFY_OK;
1097 } 1097 }
1098 1098
1099 return NOTIFY_DONE; 1099 return NOTIFY_DONE;
1100 } 1100 }
1101 1101
1102 static __init void init_hrtick(void) 1102 static __init void init_hrtick(void)
1103 { 1103 {
1104 hotcpu_notifier(hotplug_hrtick, 0); 1104 hotcpu_notifier(hotplug_hrtick, 0);
1105 } 1105 }
1106 #else 1106 #else
1107 /* 1107 /*
1108 * Called to set the hrtick timer state. 1108 * Called to set the hrtick timer state.
1109 * 1109 *
1110 * called with rq->lock held and irqs disabled 1110 * called with rq->lock held and irqs disabled
1111 */ 1111 */
1112 static void hrtick_start(struct rq *rq, u64 delay) 1112 static void hrtick_start(struct rq *rq, u64 delay)
1113 { 1113 {
1114 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 1114 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1115 HRTIMER_MODE_REL_PINNED, 0); 1115 HRTIMER_MODE_REL_PINNED, 0);
1116 } 1116 }
1117 1117
1118 static inline void init_hrtick(void) 1118 static inline void init_hrtick(void)
1119 { 1119 {
1120 } 1120 }
1121 #endif /* CONFIG_SMP */ 1121 #endif /* CONFIG_SMP */
1122 1122
1123 static void init_rq_hrtick(struct rq *rq) 1123 static void init_rq_hrtick(struct rq *rq)
1124 { 1124 {
1125 #ifdef CONFIG_SMP 1125 #ifdef CONFIG_SMP
1126 rq->hrtick_csd_pending = 0; 1126 rq->hrtick_csd_pending = 0;
1127 1127
1128 rq->hrtick_csd.flags = 0; 1128 rq->hrtick_csd.flags = 0;
1129 rq->hrtick_csd.func = __hrtick_start; 1129 rq->hrtick_csd.func = __hrtick_start;
1130 rq->hrtick_csd.info = rq; 1130 rq->hrtick_csd.info = rq;
1131 #endif 1131 #endif
1132 1132
1133 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1133 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1134 rq->hrtick_timer.function = hrtick; 1134 rq->hrtick_timer.function = hrtick;
1135 } 1135 }
1136 #else /* CONFIG_SCHED_HRTICK */ 1136 #else /* CONFIG_SCHED_HRTICK */
1137 static inline void hrtick_clear(struct rq *rq) 1137 static inline void hrtick_clear(struct rq *rq)
1138 { 1138 {
1139 } 1139 }
1140 1140
1141 static inline void init_rq_hrtick(struct rq *rq) 1141 static inline void init_rq_hrtick(struct rq *rq)
1142 { 1142 {
1143 } 1143 }
1144 1144
1145 static inline void init_hrtick(void) 1145 static inline void init_hrtick(void)
1146 { 1146 {
1147 } 1147 }
1148 #endif /* CONFIG_SCHED_HRTICK */ 1148 #endif /* CONFIG_SCHED_HRTICK */
1149 1149
1150 /* 1150 /*
1151 * resched_task - mark a task 'to be rescheduled now'. 1151 * resched_task - mark a task 'to be rescheduled now'.
1152 * 1152 *
1153 * On UP this means the setting of the need_resched flag, on SMP it 1153 * On UP this means the setting of the need_resched flag, on SMP it
1154 * might also involve a cross-CPU call to trigger the scheduler on 1154 * might also involve a cross-CPU call to trigger the scheduler on
1155 * the target CPU. 1155 * the target CPU.
1156 */ 1156 */
1157 #ifdef CONFIG_SMP 1157 #ifdef CONFIG_SMP
1158 1158
1159 #ifndef tsk_is_polling 1159 #ifndef tsk_is_polling
1160 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1160 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1161 #endif 1161 #endif
1162 1162
1163 static void resched_task(struct task_struct *p) 1163 static void resched_task(struct task_struct *p)
1164 { 1164 {
1165 int cpu; 1165 int cpu;
1166 1166
1167 assert_raw_spin_locked(&task_rq(p)->lock); 1167 assert_raw_spin_locked(&task_rq(p)->lock);
1168 1168
1169 if (test_tsk_need_resched(p)) 1169 if (test_tsk_need_resched(p))
1170 return; 1170 return;
1171 1171
1172 set_tsk_need_resched(p); 1172 set_tsk_need_resched(p);
1173 1173
1174 cpu = task_cpu(p); 1174 cpu = task_cpu(p);
1175 if (cpu == smp_processor_id()) 1175 if (cpu == smp_processor_id())
1176 return; 1176 return;
1177 1177
1178 /* NEED_RESCHED must be visible before we test polling */ 1178 /* NEED_RESCHED must be visible before we test polling */
1179 smp_mb(); 1179 smp_mb();
1180 if (!tsk_is_polling(p)) 1180 if (!tsk_is_polling(p))
1181 smp_send_reschedule(cpu); 1181 smp_send_reschedule(cpu);
1182 } 1182 }
1183 1183
1184 static void resched_cpu(int cpu) 1184 static void resched_cpu(int cpu)
1185 { 1185 {
1186 struct rq *rq = cpu_rq(cpu); 1186 struct rq *rq = cpu_rq(cpu);
1187 unsigned long flags; 1187 unsigned long flags;
1188 1188
1189 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 1189 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1190 return; 1190 return;
1191 resched_task(cpu_curr(cpu)); 1191 resched_task(cpu_curr(cpu));
1192 raw_spin_unlock_irqrestore(&rq->lock, flags); 1192 raw_spin_unlock_irqrestore(&rq->lock, flags);
1193 } 1193 }
1194 1194
1195 #ifdef CONFIG_NO_HZ 1195 #ifdef CONFIG_NO_HZ
1196 /* 1196 /*
1197 * In the semi idle case, use the nearest busy cpu for migrating timers 1197 * In the semi idle case, use the nearest busy cpu for migrating timers
1198 * from an idle cpu. This is good for power-savings. 1198 * from an idle cpu. This is good for power-savings.
1199 * 1199 *
1200 * We don't do a similar optimization for a completely idle system, as 1200 * We don't do a similar optimization for a completely idle system, as
1201 * selecting an idle cpu will add more delay to the timers than intended 1201 * selecting an idle cpu will add more delay to the timers than intended
1202 * (as that cpu's timer base may not be up to date wrt jiffies etc). 1202 * (as that cpu's timer base may not be up to date wrt jiffies etc).
1203 */ 1203 */
1204 int get_nohz_timer_target(void) 1204 int get_nohz_timer_target(void)
1205 { 1205 {
1206 int cpu = smp_processor_id(); 1206 int cpu = smp_processor_id();
1207 int i; 1207 int i;
1208 struct sched_domain *sd; 1208 struct sched_domain *sd;
1209 1209
1210 for_each_domain(cpu, sd) { 1210 for_each_domain(cpu, sd) {
1211 for_each_cpu(i, sched_domain_span(sd)) 1211 for_each_cpu(i, sched_domain_span(sd))
1212 if (!idle_cpu(i)) 1212 if (!idle_cpu(i))
1213 return i; 1213 return i;
1214 } 1214 }
1215 return cpu; 1215 return cpu;
1216 } 1216 }
1217 /* 1217 /*
1218 * When add_timer_on() enqueues a timer into the timer wheel of an 1218 * When add_timer_on() enqueues a timer into the timer wheel of an
1219 * idle CPU then this timer might expire before the next timer event 1219 * idle CPU then this timer might expire before the next timer event
1220 * which is scheduled to wake up that CPU. In case of a completely 1220 * which is scheduled to wake up that CPU. In case of a completely
1221 * idle system the next event might even be infinite time into the 1221 * idle system the next event might even be infinite time into the
1222 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1222 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1223 * leaves the inner idle loop so the newly added timer is taken into 1223 * leaves the inner idle loop so the newly added timer is taken into
1224 * account when the CPU goes back to idle and evaluates the timer 1224 * account when the CPU goes back to idle and evaluates the timer
1225 * wheel for the next timer event. 1225 * wheel for the next timer event.
1226 */ 1226 */
1227 void wake_up_idle_cpu(int cpu) 1227 void wake_up_idle_cpu(int cpu)
1228 { 1228 {
1229 struct rq *rq = cpu_rq(cpu); 1229 struct rq *rq = cpu_rq(cpu);
1230 1230
1231 if (cpu == smp_processor_id()) 1231 if (cpu == smp_processor_id())
1232 return; 1232 return;
1233 1233
1234 /* 1234 /*
1235 * This is safe, as this function is called with the timer 1235 * This is safe, as this function is called with the timer
1236 * wheel base lock of (cpu) held. When the CPU is on the way 1236 * wheel base lock of (cpu) held. When the CPU is on the way
1237 * to idle and has not yet set rq->curr to idle then it will 1237 * to idle and has not yet set rq->curr to idle then it will
1238 * be serialized on the timer wheel base lock and take the new 1238 * be serialized on the timer wheel base lock and take the new
1239 * timer into account automatically. 1239 * timer into account automatically.
1240 */ 1240 */
1241 if (rq->curr != rq->idle) 1241 if (rq->curr != rq->idle)
1242 return; 1242 return;
1243 1243
1244 /* 1244 /*
1245 * We can set TIF_RESCHED on the idle task of the other CPU 1245 * We can set TIF_RESCHED on the idle task of the other CPU
1246 * lockless. The worst case is that the other CPU runs the 1246 * lockless. The worst case is that the other CPU runs the
1247 * idle task through an additional NOOP schedule() 1247 * idle task through an additional NOOP schedule()
1248 */ 1248 */
1249 set_tsk_need_resched(rq->idle); 1249 set_tsk_need_resched(rq->idle);
1250 1250
1251 /* NEED_RESCHED must be visible before we test polling */ 1251 /* NEED_RESCHED must be visible before we test polling */
1252 smp_mb(); 1252 smp_mb();
1253 if (!tsk_is_polling(rq->idle)) 1253 if (!tsk_is_polling(rq->idle))
1254 smp_send_reschedule(cpu); 1254 smp_send_reschedule(cpu);
1255 } 1255 }
1256 1256
1257 #endif /* CONFIG_NO_HZ */ 1257 #endif /* CONFIG_NO_HZ */
1258 1258
1259 static u64 sched_avg_period(void) 1259 static u64 sched_avg_period(void)
1260 { 1260 {
1261 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 1261 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1262 } 1262 }
1263 1263
1264 static void sched_avg_update(struct rq *rq) 1264 static void sched_avg_update(struct rq *rq)
1265 { 1265 {
1266 s64 period = sched_avg_period(); 1266 s64 period = sched_avg_period();
1267 1267
1268 while ((s64)(rq->clock - rq->age_stamp) > period) { 1268 while ((s64)(rq->clock - rq->age_stamp) > period) {
1269 /* 1269 /*
1270 * Inline assembly required to prevent the compiler 1270 * Inline assembly required to prevent the compiler
1271 * optimising this loop into a divmod call. 1271 * optimising this loop into a divmod call.
1272 * See __iter_div_u64_rem() for another example of this. 1272 * See __iter_div_u64_rem() for another example of this.
1273 */ 1273 */
1274 asm("" : "+rm" (rq->age_stamp)); 1274 asm("" : "+rm" (rq->age_stamp));
1275 rq->age_stamp += period; 1275 rq->age_stamp += period;
1276 rq->rt_avg /= 2; 1276 rq->rt_avg /= 2;
1277 } 1277 }
1278 } 1278 }
1279 1279
1280 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1280 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1281 { 1281 {
1282 rq->rt_avg += rt_delta; 1282 rq->rt_avg += rt_delta;
1283 sched_avg_update(rq); 1283 sched_avg_update(rq);
1284 } 1284 }
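The pair of functions above accumulates RT runtime into rq->rt_avg and then ages it: sched_avg_period() is half of sysctl_sched_time_avg (500 ms by default), and sched_avg_update() halves rt_avg once for every full period the clock has moved past, so the average decays geometrically instead of being recomputed with a division. A small standalone sketch of that behaviour follows; the demo_* names are illustrative and the kernel's inline-asm compiler barrier is deliberately dropped.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

static unsigned int demo_sched_time_avg = 1000;   /* ms, mirrors the 1s default */

struct demo_rq {
    uint64_t clock;      /* current time, in ns */
    uint64_t age_stamp;  /* start of the current averaging window, in ns */
    uint64_t rt_avg;     /* decaying sum of RT runtime, in ns */
};

static int64_t demo_avg_period(void)
{
    return (int64_t)demo_sched_time_avg * NSEC_PER_MSEC / 2;   /* 500 ms */
}

/* Age rt_avg: halve it once for every full period the clock has moved past. */
static void demo_avg_update(struct demo_rq *rq)
{
    int64_t period = demo_avg_period();

    while ((int64_t)(rq->clock - rq->age_stamp) > period) {
        rq->age_stamp += period;
        rq->rt_avg /= 2;
    }
}

/* Account rt_delta ns of RT execution, then age the average. */
static void demo_rt_avg_update(struct demo_rq *rq, uint64_t rt_delta)
{
    rq->rt_avg += rt_delta;
    demo_avg_update(rq);
}

int main(void)
{
    struct demo_rq rq = { .clock = 0, .age_stamp = 0, .rt_avg = 0 };

    demo_rt_avg_update(&rq, 400 * NSEC_PER_MSEC);   /* 400 ms of RT time */

    /* 1.2 s later two full 500 ms periods have elapsed: 400 -> 200 -> 100 ms. */
    rq.clock += 1200 * NSEC_PER_MSEC;
    demo_avg_update(&rq);
    printf("rt_avg: %llu ms\n", (unsigned long long)(rq.rt_avg / NSEC_PER_MSEC));
    return 0;
}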
1285 1285
1286 #else /* !CONFIG_SMP */ 1286 #else /* !CONFIG_SMP */
1287 static void resched_task(struct task_struct *p) 1287 static void resched_task(struct task_struct *p)
1288 { 1288 {
1289 assert_raw_spin_locked(&task_rq(p)->lock); 1289 assert_raw_spin_locked(&task_rq(p)->lock);
1290 set_tsk_need_resched(p); 1290 set_tsk_need_resched(p);
1291 } 1291 }
1292 1292
1293 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1293 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1294 { 1294 {
1295 } 1295 }
1296 1296
1297 static void sched_avg_update(struct rq *rq) 1297 static void sched_avg_update(struct rq *rq)
1298 { 1298 {
1299 } 1299 }
1300 #endif /* CONFIG_SMP */ 1300 #endif /* CONFIG_SMP */
1301 1301
1302 #if BITS_PER_LONG == 32 1302 #if BITS_PER_LONG == 32
1303 # define WMULT_CONST (~0UL) 1303 # define WMULT_CONST (~0UL)
1304 #else 1304 #else
1305 # define WMULT_CONST (1UL << 32) 1305 # define WMULT_CONST (1UL << 32)
1306 #endif 1306 #endif
1307 1307
1308 #define WMULT_SHIFT 32 1308 #define WMULT_SHIFT 32
1309 1309
1310 /* 1310 /*
1311 * Shift right and round: 1311 * Shift right and round:
1312 */ 1312 */
1313 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1313 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1314 1314
1315 /* 1315 /*
1316 * delta *= weight / lw 1316 * delta *= weight / lw
1317 */ 1317 */
1318 static unsigned long 1318 static unsigned long
1319 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1319 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1320 struct load_weight *lw) 1320 struct load_weight *lw)
1321 { 1321 {
1322 u64 tmp; 1322 u64 tmp;
1323 1323
1324 if (!lw->inv_weight) { 1324 if (!lw->inv_weight) {
1325 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1325 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1326 lw->inv_weight = 1; 1326 lw->inv_weight = 1;
1327 else 1327 else
1328 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1328 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1329 / (lw->weight+1); 1329 / (lw->weight+1);
1330 } 1330 }
1331 1331
1332 tmp = (u64)delta_exec * weight; 1332 tmp = (u64)delta_exec * weight;
1333 /* 1333 /*
1334 * Check whether we'd overflow the 64-bit multiplication: 1334 * Check whether we'd overflow the 64-bit multiplication:
1335 */ 1335 */
1336 if (unlikely(tmp > WMULT_CONST)) 1336 if (unlikely(tmp > WMULT_CONST))
1337 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1337 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1338 WMULT_SHIFT/2); 1338 WMULT_SHIFT/2);
1339 else 1339 else
1340 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1340 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1341 1341
1342 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1342 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1343 } 1343 }
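calc_delta_mine() above computes delta * weight / lw->weight without a 64-bit division on the hot path: it caches inv_weight, roughly 2^32 / lw->weight, and turns the division into a multiply followed by a rounded 32-bit shift (the SRR() macro). The standalone sketch below checks that fixed-point trick against plain integer division; it uses illustrative demo_* names, recomputes the inverse on every call, and omits the overflow split that the kernel applies for very large products, so the two results agree only to within a small rounding error.

#include <stdio.h>
#include <stdint.h>

#define DEMO_WMULT_CONST (1ULL << 32)
#define DEMO_WMULT_SHIFT 32

/* Shift right and round, as in the SRR() macro above. */
#define DEMO_SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

/*
 * delta * weight / lw_weight, approximated as
 * (delta * weight * inv_weight) >> 32, with inv_weight ~= 2^32 / lw_weight.
 */
static uint64_t demo_calc_delta(uint64_t delta, uint64_t weight, uint64_t lw_weight)
{
    uint32_t inv_weight = 1 + (DEMO_WMULT_CONST - lw_weight / 2) / (lw_weight + 1);

    return DEMO_SRR(delta * weight * inv_weight, DEMO_WMULT_SHIFT);
}

int main(void)
{
    /* 4 ms of runtime for a nice-0 task (weight 1024) on a cfs_rq of weight 3072. */
    uint64_t delta = 4000000, weight = 1024, lw = 3072;

    printf("fixed-point: %llu ns\n",
           (unsigned long long)demo_calc_delta(delta, weight, lw));
    printf("exact      : %llu ns\n",
           (unsigned long long)(delta * weight / lw));
    return 0;
}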
1344 1344
1345 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1345 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1346 { 1346 {
1347 lw->weight += inc; 1347 lw->weight += inc;
1348 lw->inv_weight = 0; 1348 lw->inv_weight = 0;
1349 } 1349 }
1350 1350
1351 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1351 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1352 { 1352 {
1353 lw->weight -= dec; 1353 lw->weight -= dec;
1354 lw->inv_weight = 0; 1354 lw->inv_weight = 0;
1355 } 1355 }
1356 1356
1357 static inline void update_load_set(struct load_weight *lw, unsigned long w) 1357 static inline void update_load_set(struct load_weight *lw, unsigned long w)
1358 { 1358 {
1359 lw->weight = w; 1359 lw->weight = w;
1360 lw->inv_weight = 0; 1360 lw->inv_weight = 0;
1361 } 1361 }
1362 1362
1363 /* 1363 /*
1364 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1364 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1365 * of tasks with abnormal "nice" values across CPUs, the contribution that 1365 * of tasks with abnormal "nice" values across CPUs, the contribution that
1366 * each task makes to its run queue's load is weighted according to its 1366 * each task makes to its run queue's load is weighted according to its
1367 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1367 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1368 * scaled version of the new time slice allocation that they receive on time 1368 * scaled version of the new time slice allocation that they receive on time
1369 * slice expiry etc. 1369 * slice expiry etc.
1370 */ 1370 */
1371 1371
1372 #define WEIGHT_IDLEPRIO 3 1372 #define WEIGHT_IDLEPRIO 3
1373 #define WMULT_IDLEPRIO 1431655765 1373 #define WMULT_IDLEPRIO 1431655765
1374 1374
1375 /* 1375 /*
1376 * Nice levels are multiplicative, with a gentle 10% change for every 1376 * Nice levels are multiplicative, with a gentle 10% change for every
1377 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1377 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1378 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1378 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1379 * that remained on nice 0. 1379 * that remained on nice 0.
1380 * 1380 *
1381 * The "10% effect" is relative and cumulative: from _any_ nice level, 1381 * The "10% effect" is relative and cumulative: from _any_ nice level,
1382 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1382 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1383 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1383 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1384 * If a task goes up by ~10% and another task goes down by ~10% then 1384 * If a task goes up by ~10% and another task goes down by ~10% then
1385 * the relative distance between them is ~25%.) 1385 * the relative distance between them is ~25%.)
1386 */ 1386 */
1387 static const int prio_to_weight[40] = { 1387 static const int prio_to_weight[40] = {
1388 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1388 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1389 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1389 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1390 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1390 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1391 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1391 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1392 /* 0 */ 1024, 820, 655, 526, 423, 1392 /* 0 */ 1024, 820, 655, 526, 423,
1393 /* 5 */ 335, 272, 215, 172, 137, 1393 /* 5 */ 335, 272, 215, 172, 137,
1394 /* 10 */ 110, 87, 70, 56, 45, 1394 /* 10 */ 110, 87, 70, 56, 45,
1395 /* 15 */ 36, 29, 23, 18, 15, 1395 /* 15 */ 36, 29, 23, 18, 15,
1396 }; 1396 };
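Adjacent entries in this table differ by a factor of roughly 1.25, which is where the "10% effect" described above comes from. A small standalone program (illustrative, not kernel code) shows the resulting CPU shares for two CPU-bound tasks one nice level apart:

#include <stdio.h>

int main(void)
{
        /*
         * One task at nice 0 (weight 1024) and one at nice 1 (weight 820,
         * i.e. 1024 divided by ~1.25) competing for a single CPU: CFS hands
         * out CPU time in proportion to weight, so the split is ~55% / ~45%.
         */
        double w0 = 1024, w1 = 820;

        printf("nice 0: %.1f%%  nice 1: %.1f%%\n",
               100 * w0 / (w0 + w1), 100 * w1 / (w0 + w1));
        return 0;
}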
1397 1397
1398 /* 1398 /*
1399 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1399 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1400 * 1400 *
1401 * In cases where the weight does not change often, we can use the 1401 * In cases where the weight does not change often, we can use the
1402 * precalculated inverse to speed up arithmetic by turning divisions 1402 * precalculated inverse to speed up arithmetic by turning divisions
1403 * into multiplications: 1403 * into multiplications:
1404 */ 1404 */
1405 static const u32 prio_to_wmult[40] = { 1405 static const u32 prio_to_wmult[40] = {
1406 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1406 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1407 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1407 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1408 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1408 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1409 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1409 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1410 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1410 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1411 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1411 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1412 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1412 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1413 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1413 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1414 }; 1414 };
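A quick way to confirm the 2^32/x relationship is to recompute a couple of entries; this throwaway snippet (illustrative only) checks the nice 0 and nice -20 columns against prio_to_weight[]:

#include <stdio.h>

int main(void)
{
        /* 2^32 / 1024 = 4194304 (nice 0), 2^32 / 88761 = 48388 (nice -20) */
        printf("%llu %llu\n", (1ULL << 32) / 1024, (1ULL << 32) / 88761);
        return 0;
}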
1415 1415
1416 /* Time spent by the tasks of the cpu accounting group executing in ... */ 1416 /* Time spent by the tasks of the cpu accounting group executing in ... */
1417 enum cpuacct_stat_index { 1417 enum cpuacct_stat_index {
1418 CPUACCT_STAT_USER, /* ... user mode */ 1418 CPUACCT_STAT_USER, /* ... user mode */
1419 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1419 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1420 1420
1421 CPUACCT_STAT_NSTATS, 1421 CPUACCT_STAT_NSTATS,
1422 }; 1422 };
1423 1423
1424 #ifdef CONFIG_CGROUP_CPUACCT 1424 #ifdef CONFIG_CGROUP_CPUACCT
1425 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1425 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1426 static void cpuacct_update_stats(struct task_struct *tsk, 1426 static void cpuacct_update_stats(struct task_struct *tsk,
1427 enum cpuacct_stat_index idx, cputime_t val); 1427 enum cpuacct_stat_index idx, cputime_t val);
1428 #else 1428 #else
1429 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1429 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1430 static inline void cpuacct_update_stats(struct task_struct *tsk, 1430 static inline void cpuacct_update_stats(struct task_struct *tsk,
1431 enum cpuacct_stat_index idx, cputime_t val) {} 1431 enum cpuacct_stat_index idx, cputime_t val) {}
1432 #endif 1432 #endif
1433 1433
1434 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1434 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1435 { 1435 {
1436 update_load_add(&rq->load, load); 1436 update_load_add(&rq->load, load);
1437 } 1437 }
1438 1438
1439 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1439 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1440 { 1440 {
1441 update_load_sub(&rq->load, load); 1441 update_load_sub(&rq->load, load);
1442 } 1442 }
1443 1443
1444 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1444 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1445 typedef int (*tg_visitor)(struct task_group *, void *); 1445 typedef int (*tg_visitor)(struct task_group *, void *);
1446 1446
1447 /* 1447 /*
1448 * Iterate the full tree, calling @down when first entering a node and @up when 1448 * Iterate the full tree, calling @down when first entering a node and @up when
1449 * leaving it for the final time. 1449 * leaving it for the final time.
1450 */ 1450 */
1451 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1451 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1452 { 1452 {
1453 struct task_group *parent, *child; 1453 struct task_group *parent, *child;
1454 int ret; 1454 int ret;
1455 1455
1456 rcu_read_lock(); 1456 rcu_read_lock();
1457 parent = &root_task_group; 1457 parent = &root_task_group;
1458 down: 1458 down:
1459 ret = (*down)(parent, data); 1459 ret = (*down)(parent, data);
1460 if (ret) 1460 if (ret)
1461 goto out_unlock; 1461 goto out_unlock;
1462 list_for_each_entry_rcu(child, &parent->children, siblings) { 1462 list_for_each_entry_rcu(child, &parent->children, siblings) {
1463 parent = child; 1463 parent = child;
1464 goto down; 1464 goto down;
1465 1465
1466 up: 1466 up:
1467 continue; 1467 continue;
1468 } 1468 }
1469 ret = (*up)(parent, data); 1469 ret = (*up)(parent, data);
1470 if (ret) 1470 if (ret)
1471 goto out_unlock; 1471 goto out_unlock;
1472 1472
1473 child = parent; 1473 child = parent;
1474 parent = parent->parent; 1474 parent = parent->parent;
1475 if (parent) 1475 if (parent)
1476 goto up; 1476 goto up;
1477 out_unlock: 1477 out_unlock:
1478 rcu_read_unlock(); 1478 rcu_read_unlock();
1479 1479
1480 return ret; 1480 return ret;
1481 } 1481 }
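The goto-based loop above is an iterative depth-first walk: @down runs in pre-order, @up in post-order, and a non-zero return from either visitor aborts the walk. A recursive sketch over a hypothetical first-child/next-sibling node type (an assumption for illustration, not the kernel's struct task_group layout) may make the control flow clearer:

/* Hypothetical stand-in for struct task_group. */
struct node {
        struct node *child;     /* first child */
        struct node *sibling;   /* next sibling */
};

typedef int (*visitor)(struct node *, void *);

/* Recursive equivalent of walk_tg_tree(): @down on entry, @up on exit. */
int walk(struct node *n, visitor down, visitor up, void *data)
{
        int ret = down(n, data);

        if (ret)
                return ret;
        for (struct node *c = n->child; c; c = c->sibling) {
                ret = walk(c, down, up, data);
                if (ret)
                        return ret;
        }
        return up(n, data);
}

The kernel version is written without recursion, which keeps the walk off the small kernel stack however deep the group hierarchy gets, and it holds rcu_read_lock() across the whole traversal.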
1482 1482
1483 static int tg_nop(struct task_group *tg, void *data) 1483 static int tg_nop(struct task_group *tg, void *data)
1484 { 1484 {
1485 return 0; 1485 return 0;
1486 } 1486 }
1487 #endif 1487 #endif
1488 1488
1489 #ifdef CONFIG_SMP 1489 #ifdef CONFIG_SMP
1490 /* Used instead of source_load when we know the type == 0 */ 1490 /* Used instead of source_load when we know the type == 0 */
1491 static unsigned long weighted_cpuload(const int cpu) 1491 static unsigned long weighted_cpuload(const int cpu)
1492 { 1492 {
1493 return cpu_rq(cpu)->load.weight; 1493 return cpu_rq(cpu)->load.weight;
1494 } 1494 }
1495 1495
1496 /* 1496 /*
1497 * Return a low guess at the load of a migration-source cpu weighted 1497 * Return a low guess at the load of a migration-source cpu weighted
1498 * according to the scheduling class and "nice" value. 1498 * according to the scheduling class and "nice" value.
1499 * 1499 *
1500 * We want to under-estimate the load of migration sources, to 1500 * We want to under-estimate the load of migration sources, to
1501 * balance conservatively. 1501 * balance conservatively.
1502 */ 1502 */
1503 static unsigned long source_load(int cpu, int type) 1503 static unsigned long source_load(int cpu, int type)
1504 { 1504 {
1505 struct rq *rq = cpu_rq(cpu); 1505 struct rq *rq = cpu_rq(cpu);
1506 unsigned long total = weighted_cpuload(cpu); 1506 unsigned long total = weighted_cpuload(cpu);
1507 1507
1508 if (type == 0 || !sched_feat(LB_BIAS)) 1508 if (type == 0 || !sched_feat(LB_BIAS))
1509 return total; 1509 return total;
1510 1510
1511 return min(rq->cpu_load[type-1], total); 1511 return min(rq->cpu_load[type-1], total);
1512 } 1512 }
1513 1513
1514 /* 1514 /*
1515 * Return a high guess at the load of a migration-target cpu weighted 1515 * Return a high guess at the load of a migration-target cpu weighted
1516 * according to the scheduling class and "nice" value. 1516 * according to the scheduling class and "nice" value.
1517 */ 1517 */
1518 static unsigned long target_load(int cpu, int type) 1518 static unsigned long target_load(int cpu, int type)
1519 { 1519 {
1520 struct rq *rq = cpu_rq(cpu); 1520 struct rq *rq = cpu_rq(cpu);
1521 unsigned long total = weighted_cpuload(cpu); 1521 unsigned long total = weighted_cpuload(cpu);
1522 1522
1523 if (type == 0 || !sched_feat(LB_BIAS)) 1523 if (type == 0 || !sched_feat(LB_BIAS))
1524 return total; 1524 return total;
1525 1525
1526 return max(rq->cpu_load[type-1], total); 1526 return max(rq->cpu_load[type-1], total);
1527 } 1527 }
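Worked example: with LB_BIAS enabled and type > 0, suppose a CPU's instantaneous weighted load is 2048 while its decayed cpu_load[type-1] history is 1536. source_load() then reports 1536 (the min) and target_load() reports 2048 (the max); under-stating would-be sources and over-stating would-be targets makes the balancer err on the side of leaving tasks where they are.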
1528 1528
1529 static unsigned long power_of(int cpu) 1529 static unsigned long power_of(int cpu)
1530 { 1530 {
1531 return cpu_rq(cpu)->cpu_power; 1531 return cpu_rq(cpu)->cpu_power;
1532 } 1532 }
1533 1533
1534 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1534 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1535 1535
1536 static unsigned long cpu_avg_load_per_task(int cpu) 1536 static unsigned long cpu_avg_load_per_task(int cpu)
1537 { 1537 {
1538 struct rq *rq = cpu_rq(cpu); 1538 struct rq *rq = cpu_rq(cpu);
1539 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1539 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1540 1540
1541 if (nr_running) 1541 if (nr_running)
1542 rq->avg_load_per_task = rq->load.weight / nr_running; 1542 rq->avg_load_per_task = rq->load.weight / nr_running;
1543 else 1543 else
1544 rq->avg_load_per_task = 0; 1544 rq->avg_load_per_task = 0;
1545 1545
1546 return rq->avg_load_per_task; 1546 return rq->avg_load_per_task;
1547 } 1547 }
1548 1548
1549 #ifdef CONFIG_FAIR_GROUP_SCHED 1549 #ifdef CONFIG_FAIR_GROUP_SCHED
1550 1550
1551 /* 1551 /*
1552 * Compute the cpu's hierarchical load factor for each task group. 1552 * Compute the cpu's hierarchical load factor for each task group.
1553 * This needs to be done in a top-down fashion because the load of a child 1553 * This needs to be done in a top-down fashion because the load of a child
1554 * group is a fraction of its parent's load. 1554 * group is a fraction of its parent's load.
1555 */ 1555 */
1556 static int tg_load_down(struct task_group *tg, void *data) 1556 static int tg_load_down(struct task_group *tg, void *data)
1557 { 1557 {
1558 unsigned long load; 1558 unsigned long load;
1559 long cpu = (long)data; 1559 long cpu = (long)data;
1560 1560
1561 if (!tg->parent) { 1561 if (!tg->parent) {
1562 load = cpu_rq(cpu)->load.weight; 1562 load = cpu_rq(cpu)->load.weight;
1563 } else { 1563 } else {
1564 load = tg->parent->cfs_rq[cpu]->h_load; 1564 load = tg->parent->cfs_rq[cpu]->h_load;
1565 load *= tg->se[cpu]->load.weight; 1565 load *= tg->se[cpu]->load.weight;
1566 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1566 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1567 } 1567 }
1568 1568
1569 tg->cfs_rq[cpu]->h_load = load; 1569 tg->cfs_rq[cpu]->h_load = load;
1570 1570
1571 return 0; 1571 return 0;
1572 } 1572 }
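Worked example with made-up weights: if the root cfs_rq on some CPU carries a load.weight of 3072, a first-level group whose sched_entity weight there is 1024 ends up with h_load = 3072 * 1024 / (3072 + 1) ≈ 1023; a child group holding 512 of that group's 1024 queue weight then gets ≈ 1023 * 512 / (1024 + 1) ≈ 511. The "+ 1" in the divisor only guards against dividing by an empty parent queue.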
1573 1573
1574 static void update_h_load(long cpu) 1574 static void update_h_load(long cpu)
1575 { 1575 {
1576 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1576 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1577 } 1577 }
1578 1578
1579 #endif 1579 #endif
1580 1580
1581 #ifdef CONFIG_PREEMPT 1581 #ifdef CONFIG_PREEMPT
1582 1582
1583 static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1583 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1584 1584
1585 /* 1585 /*
1586 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1586 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1587 * way at the expense of forcing extra atomic operations in all 1587 * way at the expense of forcing extra atomic operations in all
1588 * invocations. This assures that the double_lock is acquired using the 1588 * invocations. This assures that the double_lock is acquired using the
1589 * same underlying policy as the spinlock_t on this architecture, which 1589 * same underlying policy as the spinlock_t on this architecture, which
1590 * reduces latency compared to the unfair variant below. However, it 1590 * reduces latency compared to the unfair variant below. However, it
1591 * also adds more overhead and therefore may reduce throughput. 1591 * also adds more overhead and therefore may reduce throughput.
1592 */ 1592 */
1593 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1593 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1594 __releases(this_rq->lock) 1594 __releases(this_rq->lock)
1595 __acquires(busiest->lock) 1595 __acquires(busiest->lock)
1596 __acquires(this_rq->lock) 1596 __acquires(this_rq->lock)
1597 { 1597 {
1598 raw_spin_unlock(&this_rq->lock); 1598 raw_spin_unlock(&this_rq->lock);
1599 double_rq_lock(this_rq, busiest); 1599 double_rq_lock(this_rq, busiest);
1600 1600
1601 return 1; 1601 return 1;
1602 } 1602 }
1603 1603
1604 #else 1604 #else
1605 /* 1605 /*
1606 * Unfair double_lock_balance: Optimizes throughput at the expense of 1606 * Unfair double_lock_balance: Optimizes throughput at the expense of
1607 * latency by eliminating extra atomic operations when the locks are 1607 * latency by eliminating extra atomic operations when the locks are
1608 * already in proper order on entry. This favors lower cpu-ids and will 1608 * already in proper order on entry. This favors lower cpu-ids and will
1609 * grant the double lock to lower cpus over higher ids under contention, 1609 * grant the double lock to lower cpus over higher ids under contention,
1610 * regardless of entry order into the function. 1610 * regardless of entry order into the function.
1611 */ 1611 */
1612 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1612 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1613 __releases(this_rq->lock) 1613 __releases(this_rq->lock)
1614 __acquires(busiest->lock) 1614 __acquires(busiest->lock)
1615 __acquires(this_rq->lock) 1615 __acquires(this_rq->lock)
1616 { 1616 {
1617 int ret = 0; 1617 int ret = 0;
1618 1618
1619 if (unlikely(!raw_spin_trylock(&busiest->lock))) { 1619 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1620 if (busiest < this_rq) { 1620 if (busiest < this_rq) {
1621 raw_spin_unlock(&this_rq->lock); 1621 raw_spin_unlock(&this_rq->lock);
1622 raw_spin_lock(&busiest->lock); 1622 raw_spin_lock(&busiest->lock);
1623 raw_spin_lock_nested(&this_rq->lock, 1623 raw_spin_lock_nested(&this_rq->lock,
1624 SINGLE_DEPTH_NESTING); 1624 SINGLE_DEPTH_NESTING);
1625 ret = 1; 1625 ret = 1;
1626 } else 1626 } else
1627 raw_spin_lock_nested(&busiest->lock, 1627 raw_spin_lock_nested(&busiest->lock,
1628 SINGLE_DEPTH_NESTING); 1628 SINGLE_DEPTH_NESTING);
1629 } 1629 }
1630 return ret; 1630 return ret;
1631 } 1631 }
1632 1632
1633 #endif /* CONFIG_PREEMPT */ 1633 #endif /* CONFIG_PREEMPT */
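Both variants ultimately rely on acquiring the lower-addressed lock first whenever both locks must be taken from scratch, which is the standard way to rule out an AB/BA deadlock between two CPUs balancing toward each other. A user-space pthread sketch of the same address-ordering rule (illustrative only; the pointer comparison mirrors the busiest < this_rq test above):

#include <pthread.h>
#include <stdint.h>

/*
 * Lock two mutexes in a fixed global order (lowest address first) so that
 * two threads locking the same pair with swapped arguments cannot deadlock.
 */
void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
        } else if ((uintptr_t)a < (uintptr_t)b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}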
1634 1634
1635 /* 1635 /*
1636 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1636 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1637 */ 1637 */
1638 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1638 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1639 { 1639 {
1640 if (unlikely(!irqs_disabled())) { 1640 if (unlikely(!irqs_disabled())) {
1641 /* printk() doesn't work well under rq->lock */ 1641 /* printk() doesn't work well under rq->lock */
1642 raw_spin_unlock(&this_rq->lock); 1642 raw_spin_unlock(&this_rq->lock);
1643 BUG_ON(1); 1643 BUG_ON(1);
1644 } 1644 }
1645 1645
1646 return _double_lock_balance(this_rq, busiest); 1646 return _double_lock_balance(this_rq, busiest);
1647 } 1647 }
1648 1648
1649 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1649 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1650 __releases(busiest->lock) 1650 __releases(busiest->lock)
1651 { 1651 {
1652 raw_spin_unlock(&busiest->lock); 1652 raw_spin_unlock(&busiest->lock);
1653 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1653 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1654 } 1654 }
1655 1655
1656 /* 1656 /*
1657 * double_rq_lock - safely lock two runqueues 1657 * double_rq_lock - safely lock two runqueues
1658 * 1658 *
1659 * Note this does not disable interrupts like task_rq_lock, 1659 * Note this does not disable interrupts like task_rq_lock,
1660 * you need to do so manually before calling. 1660 * you need to do so manually before calling.
1661 */ 1661 */
1662 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1662 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1663 __acquires(rq1->lock) 1663 __acquires(rq1->lock)
1664 __acquires(rq2->lock) 1664 __acquires(rq2->lock)
1665 { 1665 {
1666 BUG_ON(!irqs_disabled()); 1666 BUG_ON(!irqs_disabled());
1667 if (rq1 == rq2) { 1667 if (rq1 == rq2) {
1668 raw_spin_lock(&rq1->lock); 1668 raw_spin_lock(&rq1->lock);
1669 __acquire(rq2->lock); /* Fake it out ;) */ 1669 __acquire(rq2->lock); /* Fake it out ;) */
1670 } else { 1670 } else {
1671 if (rq1 < rq2) { 1671 if (rq1 < rq2) {
1672 raw_spin_lock(&rq1->lock); 1672 raw_spin_lock(&rq1->lock);
1673 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 1673 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1674 } else { 1674 } else {
1675 raw_spin_lock(&rq2->lock); 1675 raw_spin_lock(&rq2->lock);
1676 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1676 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1677 } 1677 }
1678 } 1678 }
1679 } 1679 }
1680 1680
1681 /* 1681 /*
1682 * double_rq_unlock - safely unlock two runqueues 1682 * double_rq_unlock - safely unlock two runqueues
1683 * 1683 *
1684 * Note this does not restore interrupts like task_rq_unlock, 1684 * Note this does not restore interrupts like task_rq_unlock,
1685 * you need to do so manually after calling. 1685 * you need to do so manually after calling.
1686 */ 1686 */
1687 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1687 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1688 __releases(rq1->lock) 1688 __releases(rq1->lock)
1689 __releases(rq2->lock) 1689 __releases(rq2->lock)
1690 { 1690 {
1691 raw_spin_unlock(&rq1->lock); 1691 raw_spin_unlock(&rq1->lock);
1692 if (rq1 != rq2) 1692 if (rq1 != rq2)
1693 raw_spin_unlock(&rq2->lock); 1693 raw_spin_unlock(&rq2->lock);
1694 else 1694 else
1695 __release(rq2->lock); 1695 __release(rq2->lock);
1696 } 1696 }
1697 1697
1698 #else /* CONFIG_SMP */ 1698 #else /* CONFIG_SMP */
1699 1699
1700 /* 1700 /*
1701 * double_rq_lock - safely lock two runqueues 1701 * double_rq_lock - safely lock two runqueues
1702 * 1702 *
1703 * Note this does not disable interrupts like task_rq_lock, 1703 * Note this does not disable interrupts like task_rq_lock,
1704 * you need to do so manually before calling. 1704 * you need to do so manually before calling.
1705 */ 1705 */
1706 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1706 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1707 __acquires(rq1->lock) 1707 __acquires(rq1->lock)
1708 __acquires(rq2->lock) 1708 __acquires(rq2->lock)
1709 { 1709 {
1710 BUG_ON(!irqs_disabled()); 1710 BUG_ON(!irqs_disabled());
1711 BUG_ON(rq1 != rq2); 1711 BUG_ON(rq1 != rq2);
1712 raw_spin_lock(&rq1->lock); 1712 raw_spin_lock(&rq1->lock);
1713 __acquire(rq2->lock); /* Fake it out ;) */ 1713 __acquire(rq2->lock); /* Fake it out ;) */
1714 } 1714 }
1715 1715
1716 /* 1716 /*
1717 * double_rq_unlock - safely unlock two runqueues 1717 * double_rq_unlock - safely unlock two runqueues
1718 * 1718 *
1719 * Note this does not restore interrupts like task_rq_unlock, 1719 * Note this does not restore interrupts like task_rq_unlock,
1720 * you need to do so manually after calling. 1720 * you need to do so manually after calling.
1721 */ 1721 */
1722 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1722 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1723 __releases(rq1->lock) 1723 __releases(rq1->lock)
1724 __releases(rq2->lock) 1724 __releases(rq2->lock)
1725 { 1725 {
1726 BUG_ON(rq1 != rq2); 1726 BUG_ON(rq1 != rq2);
1727 raw_spin_unlock(&rq1->lock); 1727 raw_spin_unlock(&rq1->lock);
1728 __release(rq2->lock); 1728 __release(rq2->lock);
1729 } 1729 }
1730 1730
1731 #endif 1731 #endif
1732 1732
1733 static void calc_load_account_idle(struct rq *this_rq); 1733 static void calc_load_account_idle(struct rq *this_rq);
1734 static void update_sysctl(void); 1734 static void update_sysctl(void);
1735 static int get_update_sysctl_factor(void); 1735 static int get_update_sysctl_factor(void);
1736 static void update_cpu_load(struct rq *this_rq); 1736 static void update_cpu_load(struct rq *this_rq);
1737 1737
1738 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1738 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1739 { 1739 {
1740 set_task_rq(p, cpu); 1740 set_task_rq(p, cpu);
1741 #ifdef CONFIG_SMP 1741 #ifdef CONFIG_SMP
1742 /* 1742 /*
1743 * After ->cpu is set to a new value, task_rq_lock(p, ...) can be 1743 * After ->cpu is set to a new value, task_rq_lock(p, ...) can be
1744 * successfully executed on another CPU. We must ensure that updates of 1744 * successfully executed on another CPU. We must ensure that updates of
1745 * per-task data have been completed by this moment. 1745 * per-task data have been completed by this moment.
1746 */ 1746 */
1747 smp_wmb(); 1747 smp_wmb();
1748 task_thread_info(p)->cpu = cpu; 1748 task_thread_info(p)->cpu = cpu;
1749 #endif 1749 #endif
1750 } 1750 }
1751 1751
1752 static const struct sched_class rt_sched_class; 1752 static const struct sched_class rt_sched_class;
1753 1753
1754 #define sched_class_highest (&stop_sched_class) 1754 #define sched_class_highest (&stop_sched_class)
1755 #define for_each_class(class) \ 1755 #define for_each_class(class) \
1756 for (class = sched_class_highest; class; class = class->next) 1756 for (class = sched_class_highest; class; class = class->next)
1757 1757
1758 #include "sched_stats.h" 1758 #include "sched_stats.h"
1759 1759
1760 static void inc_nr_running(struct rq *rq) 1760 static void inc_nr_running(struct rq *rq)
1761 { 1761 {
1762 rq->nr_running++; 1762 rq->nr_running++;
1763 } 1763 }
1764 1764
1765 static void dec_nr_running(struct rq *rq) 1765 static void dec_nr_running(struct rq *rq)
1766 { 1766 {
1767 rq->nr_running--; 1767 rq->nr_running--;
1768 } 1768 }
1769 1769
1770 static void set_load_weight(struct task_struct *p) 1770 static void set_load_weight(struct task_struct *p)
1771 { 1771 {
1772 /* 1772 /*
1773 * SCHED_IDLE tasks get minimal weight: 1773 * SCHED_IDLE tasks get minimal weight:
1774 */ 1774 */
1775 if (p->policy == SCHED_IDLE) { 1775 if (p->policy == SCHED_IDLE) {
1776 p->se.load.weight = WEIGHT_IDLEPRIO; 1776 p->se.load.weight = WEIGHT_IDLEPRIO;
1777 p->se.load.inv_weight = WMULT_IDLEPRIO; 1777 p->se.load.inv_weight = WMULT_IDLEPRIO;
1778 return; 1778 return;
1779 } 1779 }
1780 1780
1781 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1781 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1782 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1782 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1783 } 1783 }
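With the usual MAX_RT_PRIO of 100 and static_prio = 120 + nice, a nice-0 task indexes entry 20 of the tables above (weight 1024, inv_weight 4194304), nice 19 lands on the last entry (weight 15) and nice -20 on the first (weight 88761); SCHED_IDLE tasks bypass the tables entirely and get the even smaller WEIGHT_IDLEPRIO of 3.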
1784 1784
1785 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1785 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1786 { 1786 {
1787 update_rq_clock(rq); 1787 update_rq_clock(rq);
1788 sched_info_queued(p); 1788 sched_info_queued(p);
1789 p->sched_class->enqueue_task(rq, p, flags); 1789 p->sched_class->enqueue_task(rq, p, flags);
1790 } 1790 }
1791 1791
1792 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1792 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1793 { 1793 {
1794 update_rq_clock(rq); 1794 update_rq_clock(rq);
1795 sched_info_dequeued(p); 1795 sched_info_dequeued(p);
1796 p->sched_class->dequeue_task(rq, p, flags); 1796 p->sched_class->dequeue_task(rq, p, flags);
1797 } 1797 }
1798 1798
1799 /* 1799 /*
1800 * activate_task - move a task to the runqueue. 1800 * activate_task - move a task to the runqueue.
1801 */ 1801 */
1802 static void activate_task(struct rq *rq, struct task_struct *p, int flags) 1802 static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1803 { 1803 {
1804 if (task_contributes_to_load(p)) 1804 if (task_contributes_to_load(p))
1805 rq->nr_uninterruptible--; 1805 rq->nr_uninterruptible--;
1806 1806
1807 enqueue_task(rq, p, flags); 1807 enqueue_task(rq, p, flags);
1808 inc_nr_running(rq); 1808 inc_nr_running(rq);
1809 } 1809 }
1810 1810
1811 /* 1811 /*
1812 * deactivate_task - remove a task from the runqueue. 1812 * deactivate_task - remove a task from the runqueue.
1813 */ 1813 */
1814 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 1814 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1815 { 1815 {
1816 if (task_contributes_to_load(p)) 1816 if (task_contributes_to_load(p))
1817 rq->nr_uninterruptible++; 1817 rq->nr_uninterruptible++;
1818 1818
1819 dequeue_task(rq, p, flags); 1819 dequeue_task(rq, p, flags);
1820 dec_nr_running(rq); 1820 dec_nr_running(rq);
1821 } 1821 }
1822 1822
1823 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 1823 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1824 1824
1825 /* 1825 /*
1826 * There are no locks covering percpu hardirq/softirq time. 1826 * There are no locks covering percpu hardirq/softirq time.
1827 * They are only modified in account_system_vtime, on the corresponding CPU 1827 * They are only modified in account_system_vtime, on the corresponding CPU
1828 * with interrupts disabled. So, writes are safe. 1828 * with interrupts disabled. So, writes are safe.
1829 * They are read and saved off onto struct rq in update_rq_clock(). 1829 * They are read and saved off onto struct rq in update_rq_clock().
1830 * This may result in another CPU reading this CPU's irq time and can 1830 * This may result in another CPU reading this CPU's irq time and can
1831 * race with irq/account_system_vtime on this CPU. We would either get the old 1831 * race with irq/account_system_vtime on this CPU. We would either get the old
1832 * or the new value, with a side effect of accounting a slice of irq time to the 1832 * or the new value, with a side effect of accounting a slice of irq time to the
1833 * wrong task when an irq is in progress while we read rq->clock. That is a worthy 1833 * wrong task when an irq is in progress while we read rq->clock. That is a worthy
1834 * compromise in place of having locks on each irq in account_system_time. 1834 * compromise in place of having locks on each irq in account_system_time.
1835 */ 1835 */
1836 static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1836 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1837 static DEFINE_PER_CPU(u64, cpu_softirq_time); 1837 static DEFINE_PER_CPU(u64, cpu_softirq_time);
1838 1838
1839 static DEFINE_PER_CPU(u64, irq_start_time); 1839 static DEFINE_PER_CPU(u64, irq_start_time);
1840 static int sched_clock_irqtime; 1840 static int sched_clock_irqtime;
1841 1841
1842 void enable_sched_clock_irqtime(void) 1842 void enable_sched_clock_irqtime(void)
1843 { 1843 {
1844 sched_clock_irqtime = 1; 1844 sched_clock_irqtime = 1;
1845 } 1845 }
1846 1846
1847 void disable_sched_clock_irqtime(void) 1847 void disable_sched_clock_irqtime(void)
1848 { 1848 {
1849 sched_clock_irqtime = 0; 1849 sched_clock_irqtime = 0;
1850 } 1850 }
1851 1851
1852 #ifndef CONFIG_64BIT 1852 #ifndef CONFIG_64BIT
1853 static DEFINE_PER_CPU(seqcount_t, irq_time_seq); 1853 static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1854 1854
1855 static inline void irq_time_write_begin(void) 1855 static inline void irq_time_write_begin(void)
1856 { 1856 {
1857 __this_cpu_inc(irq_time_seq.sequence); 1857 __this_cpu_inc(irq_time_seq.sequence);
1858 smp_wmb(); 1858 smp_wmb();
1859 } 1859 }
1860 1860
1861 static inline void irq_time_write_end(void) 1861 static inline void irq_time_write_end(void)
1862 { 1862 {
1863 smp_wmb(); 1863 smp_wmb();
1864 __this_cpu_inc(irq_time_seq.sequence); 1864 __this_cpu_inc(irq_time_seq.sequence);
1865 } 1865 }
1866 1866
1867 static inline u64 irq_time_read(int cpu) 1867 static inline u64 irq_time_read(int cpu)
1868 { 1868 {
1869 u64 irq_time; 1869 u64 irq_time;
1870 unsigned seq; 1870 unsigned seq;
1871 1871
1872 do { 1872 do {
1873 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); 1873 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1874 irq_time = per_cpu(cpu_softirq_time, cpu) + 1874 irq_time = per_cpu(cpu_softirq_time, cpu) +
1875 per_cpu(cpu_hardirq_time, cpu); 1875 per_cpu(cpu_hardirq_time, cpu);
1876 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); 1876 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1877 1877
1878 return irq_time; 1878 return irq_time;
1879 } 1879 }
1880 #else /* CONFIG_64BIT */ 1880 #else /* CONFIG_64BIT */
1881 static inline void irq_time_write_begin(void) 1881 static inline void irq_time_write_begin(void)
1882 { 1882 {
1883 } 1883 }
1884 1884
1885 static inline void irq_time_write_end(void) 1885 static inline void irq_time_write_end(void)
1886 { 1886 {
1887 } 1887 }
1888 1888
1889 static inline u64 irq_time_read(int cpu) 1889 static inline u64 irq_time_read(int cpu)
1890 { 1890 {
1891 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1891 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1892 } 1892 }
1893 #endif /* CONFIG_64BIT */ 1893 #endif /* CONFIG_64BIT */
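The 32-bit case needs the sequence counter because the two u64 counters cannot be read atomically there. Concretely, if cpu_hardirq_time rolls over from 0x00000000ffffffff to 0x0000000100000000 while a reader samples it, combining the new high word with the old low word would yield 0x00000001ffffffff, about four seconds of phantom irq time; the odd/even seqcount forces such a reader to retry. On 64-bit a single load is atomic, so the counters are summed directly.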
1894 1894
1895 /* 1895 /*
1896 * Called before incrementing preempt_count on {soft,}irq_enter 1896 * Called before incrementing preempt_count on {soft,}irq_enter
1897 * and before decrementing preempt_count on {soft,}irq_exit. 1897 * and before decrementing preempt_count on {soft,}irq_exit.
1898 */ 1898 */
1899 void account_system_vtime(struct task_struct *curr) 1899 void account_system_vtime(struct task_struct *curr)
1900 { 1900 {
1901 unsigned long flags; 1901 unsigned long flags;
1902 s64 delta; 1902 s64 delta;
1903 int cpu; 1903 int cpu;
1904 1904
1905 if (!sched_clock_irqtime) 1905 if (!sched_clock_irqtime)
1906 return; 1906 return;
1907 1907
1908 local_irq_save(flags); 1908 local_irq_save(flags);
1909 1909
1910 cpu = smp_processor_id(); 1910 cpu = smp_processor_id();
1911 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 1911 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1912 __this_cpu_add(irq_start_time, delta); 1912 __this_cpu_add(irq_start_time, delta);
1913 1913
1914 irq_time_write_begin(); 1914 irq_time_write_begin();
1915 /* 1915 /*
1916 * We do not account for softirq time from ksoftirqd here. 1916 * We do not account for softirq time from ksoftirqd here.
1917 * We want to continue accounting softirq time to ksoftirqd thread 1917 * We want to continue accounting softirq time to ksoftirqd thread
1918 * in that case, so as not to confuse the scheduler with a special task 1918 * in that case, so as not to confuse the scheduler with a special task
1919 * that does not consume any time but still wants to run. 1919 * that does not consume any time but still wants to run.
1920 */ 1920 */
1921 if (hardirq_count()) 1921 if (hardirq_count())
1922 __this_cpu_add(cpu_hardirq_time, delta); 1922 __this_cpu_add(cpu_hardirq_time, delta);
1923 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) 1923 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1924 __this_cpu_add(cpu_softirq_time, delta); 1924 __this_cpu_add(cpu_softirq_time, delta);
1925 1925
1926 irq_time_write_end(); 1926 irq_time_write_end();
1927 local_irq_restore(flags); 1927 local_irq_restore(flags);
1928 } 1928 }
1929 EXPORT_SYMBOL_GPL(account_system_vtime); 1929 EXPORT_SYMBOL_GPL(account_system_vtime);
1930 1930
1931 static void update_rq_clock_task(struct rq *rq, s64 delta) 1931 static void update_rq_clock_task(struct rq *rq, s64 delta)
1932 { 1932 {
1933 s64 irq_delta; 1933 s64 irq_delta;
1934 1934
1935 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1935 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1936 1936
1937 /* 1937 /*
1938 * Since irq_time is only updated on {soft,}irq_exit, we might run into 1938 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1939 * this case when a previous update_rq_clock() happened inside a 1939 * this case when a previous update_rq_clock() happened inside a
1940 * {soft,}irq region. 1940 * {soft,}irq region.
1941 * 1941 *
1942 * When this happens, we stop ->clock_task and only update the 1942 * When this happens, we stop ->clock_task and only update the
1943 * prev_irq_time stamp to account for the part that fit, so that a next 1943 * prev_irq_time stamp to account for the part that fit, so that a next
1944 * update will consume the rest. This ensures ->clock_task is 1944 * update will consume the rest. This ensures ->clock_task is
1945 * monotonic. 1945 * monotonic.
1946 * 1946 *
1947 * It does, however, cause some slight misattribution of {soft,}irq 1947 * It does, however, cause some slight misattribution of {soft,}irq
1948 * time, a more accurate solution would be to update the irq_time using 1948 * time, a more accurate solution would be to update the irq_time using
1949 * the current rq->clock timestamp, except that would require using 1949 * the current rq->clock timestamp, except that would require using
1950 * atomic ops. 1950 * atomic ops.
1951 */ 1951 */
1952 if (irq_delta > delta) 1952 if (irq_delta > delta)
1953 irq_delta = delta; 1953 irq_delta = delta;
1954 1954
1955 rq->prev_irq_time += irq_delta; 1955 rq->prev_irq_time += irq_delta;
1956 delta -= irq_delta; 1956 delta -= irq_delta;
1957 rq->clock_task += delta; 1957 rq->clock_task += delta;
1958 1958
1959 if (irq_delta && sched_feat(NONIRQ_POWER)) 1959 if (irq_delta && sched_feat(NONIRQ_POWER))
1960 sched_rt_avg_update(rq, irq_delta); 1960 sched_rt_avg_update(rq, irq_delta);
1961 } 1961 }
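Concrete numbers make the clamp easier to follow; this throwaway snippet (illustrative only) replays one update in which the wall-clock delta is 5 ms but 7 ms of irq time has accumulated since the last update:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int64_t delta = 5000000, irq_delta = 7000000;   /* nanoseconds */
        int64_t prev_irq_time = 0, clock_task = 0;

        if (irq_delta > delta)
                irq_delta = delta;
        prev_irq_time += irq_delta;     /* 5000000: all of this round is irq time */
        delta -= irq_delta;             /* 0 */
        clock_task += delta;            /* unchanged, but never moves backwards */

        printf("prev_irq_time=%lld clock_task=%lld\n",
               (long long)prev_irq_time, (long long)clock_task);
        return 0;
}

The 2 ms that did not fit stays in the per-cpu irq counters and is charged by the next update, which is exactly the monotonicity argument made in the comment above.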
1962 1962
1963 static int irqtime_account_hi_update(void) 1963 static int irqtime_account_hi_update(void)
1964 { 1964 {
1965 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 1965 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1966 unsigned long flags; 1966 unsigned long flags;
1967 u64 latest_ns; 1967 u64 latest_ns;
1968 int ret = 0; 1968 int ret = 0;
1969 1969
1970 local_irq_save(flags); 1970 local_irq_save(flags);
1971 latest_ns = this_cpu_read(cpu_hardirq_time); 1971 latest_ns = this_cpu_read(cpu_hardirq_time);
1972 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 1972 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1973 ret = 1; 1973 ret = 1;
1974 local_irq_restore(flags); 1974 local_irq_restore(flags);
1975 return ret; 1975 return ret;
1976 } 1976 }
1977 1977
1978 static int irqtime_account_si_update(void) 1978 static int irqtime_account_si_update(void)
1979 { 1979 {
1980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 1980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1981 unsigned long flags; 1981 unsigned long flags;
1982 u64 latest_ns; 1982 u64 latest_ns;
1983 int ret = 0; 1983 int ret = 0;
1984 1984
1985 local_irq_save(flags); 1985 local_irq_save(flags);
1986 latest_ns = this_cpu_read(cpu_softirq_time); 1986 latest_ns = this_cpu_read(cpu_softirq_time);
1987 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 1987 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
1988 ret = 1; 1988 ret = 1;
1989 local_irq_restore(flags); 1989 local_irq_restore(flags);
1990 return ret; 1990 return ret;
1991 } 1991 }
1992 1992
1993 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 1993 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
1994 1994
1995 #define sched_clock_irqtime (0) 1995 #define sched_clock_irqtime (0)
1996 1996
1997 static void update_rq_clock_task(struct rq *rq, s64 delta) 1997 static void update_rq_clock_task(struct rq *rq, s64 delta)
1998 { 1998 {
1999 rq->clock_task += delta; 1999 rq->clock_task += delta;
2000 } 2000 }
2001 2001
2002 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 2002 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2003 2003
2004 #include "sched_idletask.c" 2004 #include "sched_idletask.c"
2005 #include "sched_fair.c" 2005 #include "sched_fair.c"
2006 #include "sched_rt.c" 2006 #include "sched_rt.c"
2007 #include "sched_autogroup.c" 2007 #include "sched_autogroup.c"
2008 #include "sched_stoptask.c" 2008 #include "sched_stoptask.c"
2009 #ifdef CONFIG_SCHED_DEBUG 2009 #ifdef CONFIG_SCHED_DEBUG
2010 # include "sched_debug.c" 2010 # include "sched_debug.c"
2011 #endif 2011 #endif
2012 2012
2013 void sched_set_stop_task(int cpu, struct task_struct *stop) 2013 void sched_set_stop_task(int cpu, struct task_struct *stop)
2014 { 2014 {
2015 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 2015 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2016 struct task_struct *old_stop = cpu_rq(cpu)->stop; 2016 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2017 2017
2018 if (stop) { 2018 if (stop) {
2019 /* 2019 /*
2020 * Make it appear like a SCHED_FIFO task, it's something 2020 * Make it appear like a SCHED_FIFO task, it's something
2021 * userspace knows about and won't get confused about. 2021 * userspace knows about and won't get confused about.
2022 * 2022 *
2023 * Also, it will make PI more or less work without too 2023 * Also, it will make PI more or less work without too
2024 * much confusion -- but then, stop work should not 2024 * much confusion -- but then, stop work should not
2025 * rely on PI working anyway. 2025 * rely on PI working anyway.
2026 */ 2026 */
2027 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 2027 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2028 2028
2029 stop->sched_class = &stop_sched_class; 2029 stop->sched_class = &stop_sched_class;
2030 } 2030 }
2031 2031
2032 cpu_rq(cpu)->stop = stop; 2032 cpu_rq(cpu)->stop = stop;
2033 2033
2034 if (old_stop) { 2034 if (old_stop) {
2035 /* 2035 /*
2036 * Reset it back to a normal scheduling class so that 2036 * Reset it back to a normal scheduling class so that
2037 * it can die in pieces. 2037 * it can die in pieces.
2038 */ 2038 */
2039 old_stop->sched_class = &rt_sched_class; 2039 old_stop->sched_class = &rt_sched_class;
2040 } 2040 }
2041 } 2041 }
2042 2042
2043 /* 2043 /*
2044 * __normal_prio - return the priority that is based on the static prio 2044 * __normal_prio - return the priority that is based on the static prio
2045 */ 2045 */
2046 static inline int __normal_prio(struct task_struct *p) 2046 static inline int __normal_prio(struct task_struct *p)
2047 { 2047 {
2048 return p->static_prio; 2048 return p->static_prio;
2049 } 2049 }
2050 2050
2051 /* 2051 /*
2052 * Calculate the expected normal priority: i.e. priority 2052 * Calculate the expected normal priority: i.e. priority
2053 * without taking RT-inheritance into account. Might be 2053 * without taking RT-inheritance into account. Might be
2054 * boosted by interactivity modifiers. Changes upon fork, 2054 * boosted by interactivity modifiers. Changes upon fork,
2055 * setprio syscalls, and whenever the interactivity 2055 * setprio syscalls, and whenever the interactivity
2056 * estimator recalculates. 2056 * estimator recalculates.
2057 */ 2057 */
2058 static inline int normal_prio(struct task_struct *p) 2058 static inline int normal_prio(struct task_struct *p)
2059 { 2059 {
2060 int prio; 2060 int prio;
2061 2061
2062 if (task_has_rt_policy(p)) 2062 if (task_has_rt_policy(p))
2063 prio = MAX_RT_PRIO-1 - p->rt_priority; 2063 prio = MAX_RT_PRIO-1 - p->rt_priority;
2064 else 2064 else
2065 prio = __normal_prio(p); 2065 prio = __normal_prio(p);
2066 return prio; 2066 return prio;
2067 } 2067 }
2068 2068
2069 /* 2069 /*
2070 * Calculate the current priority, i.e. the priority 2070 * Calculate the current priority, i.e. the priority
2071 * taken into account by the scheduler. This value might 2071 * taken into account by the scheduler. This value might
2072 * be boosted by RT tasks, or might be boosted by 2072 * be boosted by RT tasks, or might be boosted by
2073 * interactivity modifiers. Will be RT if the task got 2073 * interactivity modifiers. Will be RT if the task got
2074 * RT-boosted. If not then it returns p->normal_prio. 2074 * RT-boosted. If not then it returns p->normal_prio.
2075 */ 2075 */
2076 static int effective_prio(struct task_struct *p) 2076 static int effective_prio(struct task_struct *p)
2077 { 2077 {
2078 p->normal_prio = normal_prio(p); 2078 p->normal_prio = normal_prio(p);
2079 /* 2079 /*
2080 * If we are RT tasks or we were boosted to RT priority, 2080 * If we are RT tasks or we were boosted to RT priority,
2081 * keep the priority unchanged. Otherwise, update priority 2081 * keep the priority unchanged. Otherwise, update priority
2082 * to the normal priority: 2082 * to the normal priority:
2083 */ 2083 */
2084 if (!rt_prio(p->prio)) 2084 if (!rt_prio(p->prio))
2085 return p->normal_prio; 2085 return p->normal_prio;
2086 return p->prio; 2086 return p->prio;
2087 } 2087 }
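For example, with the usual MAX_RT_PRIO of 100, a SCHED_FIFO task with rt_priority 50 gets normal_prio = 99 - 50 = 49 (lower numbers are more important internally), while a SCHED_NORMAL task at nice 0 simply keeps its static_prio of 120; effective_prio() only deviates from normal_prio while the task is PI-boosted into the RT range.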
2088 2088
2089 /** 2089 /**
2090 * task_curr - is this task currently executing on a CPU? 2090 * task_curr - is this task currently executing on a CPU?
2091 * @p: the task in question. 2091 * @p: the task in question.
2092 */ 2092 */
2093 inline int task_curr(const struct task_struct *p) 2093 inline int task_curr(const struct task_struct *p)
2094 { 2094 {
2095 return cpu_curr(task_cpu(p)) == p; 2095 return cpu_curr(task_cpu(p)) == p;
2096 } 2096 }
2097 2097
2098 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2098 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2099 const struct sched_class *prev_class, 2099 const struct sched_class *prev_class,
2100 int oldprio) 2100 int oldprio)
2101 { 2101 {
2102 if (prev_class != p->sched_class) { 2102 if (prev_class != p->sched_class) {
2103 if (prev_class->switched_from) 2103 if (prev_class->switched_from)
2104 prev_class->switched_from(rq, p); 2104 prev_class->switched_from(rq, p);
2105 p->sched_class->switched_to(rq, p); 2105 p->sched_class->switched_to(rq, p);
2106 } else if (oldprio != p->prio) 2106 } else if (oldprio != p->prio)
2107 p->sched_class->prio_changed(rq, p, oldprio); 2107 p->sched_class->prio_changed(rq, p, oldprio);
2108 } 2108 }
2109 2109
2110 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2110 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2111 { 2111 {
2112 const struct sched_class *class; 2112 const struct sched_class *class;
2113 2113
2114 if (p->sched_class == rq->curr->sched_class) { 2114 if (p->sched_class == rq->curr->sched_class) {
2115 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 2115 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2116 } else { 2116 } else {
2117 for_each_class(class) { 2117 for_each_class(class) {
2118 if (class == rq->curr->sched_class) 2118 if (class == rq->curr->sched_class)
2119 break; 2119 break;
2120 if (class == p->sched_class) { 2120 if (class == p->sched_class) {
2121 resched_task(rq->curr); 2121 resched_task(rq->curr);
2122 break; 2122 break;
2123 } 2123 }
2124 } 2124 }
2125 } 2125 }
2126 2126
2127 /* 2127 /*
2128 * A queue event has occurred, and we're going to schedule. In 2128 * A queue event has occurred, and we're going to schedule. In
2129 * this case, we can save a useless back to back clock update. 2129 * this case, we can save a useless back to back clock update.
2130 */ 2130 */
2131 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 2131 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2132 rq->skip_clock_update = 1; 2132 rq->skip_clock_update = 1;
2133 } 2133 }
2134 2134
2135 #ifdef CONFIG_SMP 2135 #ifdef CONFIG_SMP
2136 /* 2136 /*
2137 * Is this task likely cache-hot: 2137 * Is this task likely cache-hot:
2138 */ 2138 */
2139 static int 2139 static int
2140 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 2140 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2141 { 2141 {
2142 s64 delta; 2142 s64 delta;
2143 2143
2144 if (p->sched_class != &fair_sched_class) 2144 if (p->sched_class != &fair_sched_class)
2145 return 0; 2145 return 0;
2146 2146
2147 if (unlikely(p->policy == SCHED_IDLE)) 2147 if (unlikely(p->policy == SCHED_IDLE))
2148 return 0; 2148 return 0;
2149 2149
2150 /* 2150 /*
2151 * Buddy candidates are cache hot: 2151 * Buddy candidates are cache hot:
2152 */ 2152 */
2153 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 2153 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2154 (&p->se == cfs_rq_of(&p->se)->next || 2154 (&p->se == cfs_rq_of(&p->se)->next ||
2155 &p->se == cfs_rq_of(&p->se)->last)) 2155 &p->se == cfs_rq_of(&p->se)->last))
2156 return 1; 2156 return 1;
2157 2157
2158 if (sysctl_sched_migration_cost == -1) 2158 if (sysctl_sched_migration_cost == -1)
2159 return 1; 2159 return 1;
2160 if (sysctl_sched_migration_cost == 0) 2160 if (sysctl_sched_migration_cost == 0)
2161 return 0; 2161 return 0;
2162 2162
2163 delta = now - p->se.exec_start; 2163 delta = now - p->se.exec_start;
2164 2164
2165 return delta < (s64)sysctl_sched_migration_cost; 2165 return delta < (s64)sysctl_sched_migration_cost;
2166 } 2166 }
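For example, assuming the default sysctl_sched_migration_cost of 500000 ns (0.5 ms; the tunable itself is not in this hunk), a fair-class task whose se.exec_start was 200000 ns ago is still considered cache-hot and the balancer will prefer not to migrate it, while one that last started running 2 ms ago is fair game; setting the tunable to -1 marks every task hot and 0 marks none.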
2167 2167
2168 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2168 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2169 { 2169 {
2170 #ifdef CONFIG_SCHED_DEBUG 2170 #ifdef CONFIG_SCHED_DEBUG
2171 /* 2171 /*
2172 * We should never call set_task_cpu() on a blocked task, 2172 * We should never call set_task_cpu() on a blocked task,
2173 * ttwu() will sort out the placement. 2173 * ttwu() will sort out the placement.
2174 */ 2174 */
2175 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2175 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2176 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2176 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2177 2177
2178 #ifdef CONFIG_LOCKDEP 2178 #ifdef CONFIG_LOCKDEP
2179 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 2179 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2180 lockdep_is_held(&task_rq(p)->lock))); 2180 lockdep_is_held(&task_rq(p)->lock)));
2181 #endif 2181 #endif
2182 #endif 2182 #endif
2183 2183
2184 trace_sched_migrate_task(p, new_cpu); 2184 trace_sched_migrate_task(p, new_cpu);
2185 2185
2186 if (task_cpu(p) != new_cpu) { 2186 if (task_cpu(p) != new_cpu) {
2187 p->se.nr_migrations++; 2187 p->se.nr_migrations++;
2188 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2188 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2189 } 2189 }
2190 2190
2191 __set_task_cpu(p, new_cpu); 2191 __set_task_cpu(p, new_cpu);
2192 } 2192 }
2193 2193
2194 struct migration_arg { 2194 struct migration_arg {
2195 struct task_struct *task; 2195 struct task_struct *task;
2196 int dest_cpu; 2196 int dest_cpu;
2197 }; 2197 };
2198 2198
2199 static int migration_cpu_stop(void *data); 2199 static int migration_cpu_stop(void *data);
2200 2200
2201 /* 2201 /*
2202 * The task's runqueue lock must be held. 2202 * The task's runqueue lock must be held.
2203 * Returns true if you have to wait for migration thread. 2203 * Returns true if you have to wait for migration thread.
2204 */ 2204 */
2205 static bool need_migrate_task(struct task_struct *p) 2205 static bool need_migrate_task(struct task_struct *p)
2206 { 2206 {
2207 /* 2207 /*
2208 * If the task is not on a runqueue (and not running), then 2208 * If the task is not on a runqueue (and not running), then
2209 * the next wake-up will properly place the task. 2209 * the next wake-up will properly place the task.
2210 */ 2210 */
2211 bool running = p->on_rq || p->on_cpu; 2211 bool running = p->on_rq || p->on_cpu;
2212 smp_rmb(); /* finish_lock_switch() */ 2212 smp_rmb(); /* finish_lock_switch() */
2213 return running; 2213 return running;
2214 } 2214 }
2215 2215
2216 /* 2216 /*
2217 * wait_task_inactive - wait for a thread to unschedule. 2217 * wait_task_inactive - wait for a thread to unschedule.
2218 * 2218 *
2219 * If @match_state is nonzero, it's the @p->state value just checked and 2219 * If @match_state is nonzero, it's the @p->state value just checked and
2220 * not expected to change. If it changes, i.e. @p might have woken up, 2220 * not expected to change. If it changes, i.e. @p might have woken up,
2221 * then return zero. When we succeed in waiting for @p to be off its CPU, 2221 * then return zero. When we succeed in waiting for @p to be off its CPU,
2222 * we return a positive number (its total switch count). If a second call 2222 * we return a positive number (its total switch count). If a second call
2223 * a short while later returns the same number, the caller can be sure that 2223 * a short while later returns the same number, the caller can be sure that
2224 * @p has remained unscheduled the whole time. 2224 * @p has remained unscheduled the whole time.
2225 * 2225 *
2226 * The caller must ensure that the task *will* unschedule sometime soon, 2226 * The caller must ensure that the task *will* unschedule sometime soon,
2227 * else this function might spin for a *long* time. This function can't 2227 * else this function might spin for a *long* time. This function can't
2228 * be called with interrupts off, or it may introduce deadlock with 2228 * be called with interrupts off, or it may introduce deadlock with
2229 * smp_call_function() if an IPI is sent by the same process we are 2229 * smp_call_function() if an IPI is sent by the same process we are
2230 * waiting to become inactive. 2230 * waiting to become inactive.
2231 */ 2231 */
2232 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 2232 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2233 { 2233 {
2234 unsigned long flags; 2234 unsigned long flags;
2235 int running, on_rq; 2235 int running, on_rq;
2236 unsigned long ncsw; 2236 unsigned long ncsw;
2237 struct rq *rq; 2237 struct rq *rq;
2238 2238
2239 for (;;) { 2239 for (;;) {
2240 /* 2240 /*
2241 * We do the initial early heuristics without holding 2241 * We do the initial early heuristics without holding
2242 * any task-queue locks at all. We'll only try to get 2242 * any task-queue locks at all. We'll only try to get
2243 * the runqueue lock when things look like they will 2243 * the runqueue lock when things look like they will
2244 * work out! 2244 * work out!
2245 */ 2245 */
2246 rq = task_rq(p); 2246 rq = task_rq(p);
2247 2247
2248 /* 2248 /*
2249 * If the task is actively running on another CPU 2249 * If the task is actively running on another CPU
2250 * still, just relax and busy-wait without holding 2250 * still, just relax and busy-wait without holding
2251 * any locks. 2251 * any locks.
2252 * 2252 *
2253 * NOTE! Since we don't hold any locks, it's not 2253 * NOTE! Since we don't hold any locks, it's not
2254 * even guaranteed that "rq" is still the right runqueue! 2254 * even guaranteed that "rq" is still the right runqueue!
2255 * But we don't care, since "task_running()" will 2255 * But we don't care, since "task_running()" will
2256 * return false if the runqueue has changed and p 2256 * return false if the runqueue has changed and p
2257 * is actually now running somewhere else! 2257 * is actually now running somewhere else!
2258 */ 2258 */
2259 while (task_running(rq, p)) { 2259 while (task_running(rq, p)) {
2260 if (match_state && unlikely(p->state != match_state)) 2260 if (match_state && unlikely(p->state != match_state))
2261 return 0; 2261 return 0;
2262 cpu_relax(); 2262 cpu_relax();
2263 } 2263 }
2264 2264
2265 /* 2265 /*
2266 * Ok, time to look more closely! We need the rq 2266 * Ok, time to look more closely! We need the rq
2267 * lock now, to be *sure*. If we're wrong, we'll 2267 * lock now, to be *sure*. If we're wrong, we'll
2268 * just go back and repeat. 2268 * just go back and repeat.
2269 */ 2269 */
2270 rq = task_rq_lock(p, &flags); 2270 rq = task_rq_lock(p, &flags);
2271 trace_sched_wait_task(p); 2271 trace_sched_wait_task(p);
2272 running = task_running(rq, p); 2272 running = task_running(rq, p);
2273 on_rq = p->on_rq; 2273 on_rq = p->on_rq;
2274 ncsw = 0; 2274 ncsw = 0;
2275 if (!match_state || p->state == match_state) 2275 if (!match_state || p->state == match_state)
2276 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2276 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2277 task_rq_unlock(rq, p, &flags); 2277 task_rq_unlock(rq, p, &flags);
2278 2278
2279 /* 2279 /*
2280 * If it changed from the expected state, bail out now. 2280 * If it changed from the expected state, bail out now.
2281 */ 2281 */
2282 if (unlikely(!ncsw)) 2282 if (unlikely(!ncsw))
2283 break; 2283 break;
2284 2284
2285 /* 2285 /*
2286 * Was it really running after all now that we 2286 * Was it really running after all now that we
2287 * checked with the proper locks actually held? 2287 * checked with the proper locks actually held?
2288 * 2288 *
2289 * Oops. Go back and try again. 2289 * Oops. Go back and try again.
2290 */ 2290 */
2291 if (unlikely(running)) { 2291 if (unlikely(running)) {
2292 cpu_relax(); 2292 cpu_relax();
2293 continue; 2293 continue;
2294 } 2294 }
2295 2295
2296 /* 2296 /*
2297 * It's not enough that it's not actively running, 2297 * It's not enough that it's not actively running,
2298 * it must be off the runqueue _entirely_, and not 2298 * it must be off the runqueue _entirely_, and not
2299 * preempted! 2299 * preempted!
2300 * 2300 *
2301 * So if it was still runnable (but just not actively 2301 * So if it was still runnable (but just not actively
2302 * running right now), it's preempted, and we should 2302 * running right now), it's preempted, and we should
2303 * yield - it could be a while. 2303 * yield - it could be a while.
2304 */ 2304 */
2305 if (unlikely(on_rq)) { 2305 if (unlikely(on_rq)) {
2306 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 2306 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2307 2307
2308 set_current_state(TASK_UNINTERRUPTIBLE); 2308 set_current_state(TASK_UNINTERRUPTIBLE);
2309 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 2309 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2310 continue; 2310 continue;
2311 } 2311 }
2312 2312
2313 /* 2313 /*
2314 * Ahh, all good. It wasn't running, and it wasn't 2314 * Ahh, all good. It wasn't running, and it wasn't
2315 * runnable, which means that it will never become 2315 * runnable, which means that it will never become
2316 * running in the future either. We're all done! 2316 * running in the future either. We're all done!
2317 */ 2317 */
2318 break; 2318 break;
2319 } 2319 }
2320 2320
2321 return ncsw; 2321 return ncsw;
2322 } 2322 }
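The return convention above ("sets MSB") deserves a concrete look: OR-ing in LONG_MIN guarantees a non-zero return even when nvcsw is still 0, so 0 can be reserved for "the task's state no longer matched". A tiny illustrative program:

#include <stdio.h>
#include <limits.h>

int main(void)
{
        /* Even a zero switch count becomes non-zero once the MSB is set. */
        unsigned long ncsw0 = 0UL | LONG_MIN;
        unsigned long ncsw5 = 5UL | LONG_MIN;

        printf("%#lx %#lx equal=%d\n", ncsw0, ncsw5, ncsw0 == ncsw5);
        return 0;
}

A caller that gets the same non-zero value from two calls a short while apart knows the task stayed off the runqueue in between, since the MSB cancels out in the comparison.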
2323 2323
2324 /*** 2324 /***
2325 * kick_process - kick a running thread to enter/exit the kernel 2325 * kick_process - kick a running thread to enter/exit the kernel
2326 * @p: the to-be-kicked thread 2326 * @p: the to-be-kicked thread
2327 * 2327 *
2328 * Cause a process which is running on another CPU to enter 2328 * Cause a process which is running on another CPU to enter
2329 * kernel-mode, without any delay. (to get signals handled.) 2329 * kernel-mode, without any delay. (to get signals handled.)
2330 * 2330 *
2331 * NOTE: this function doesn't have to take the runqueue lock, 2331 * NOTE: this function doesn't have to take the runqueue lock,
2332 * because all it wants to ensure is that the remote task enters 2332 * because all it wants to ensure is that the remote task enters
2333 * the kernel. If the IPI races and the task has been migrated 2333 * the kernel. If the IPI races and the task has been migrated
2334 * to another CPU then no harm is done and the purpose has been 2334 * to another CPU then no harm is done and the purpose has been
2335 * achieved as well. 2335 * achieved as well.
2336 */ 2336 */
2337 void kick_process(struct task_struct *p) 2337 void kick_process(struct task_struct *p)
2338 { 2338 {
2339 int cpu; 2339 int cpu;
2340 2340
2341 preempt_disable(); 2341 preempt_disable();
2342 cpu = task_cpu(p); 2342 cpu = task_cpu(p);
2343 if ((cpu != smp_processor_id()) && task_curr(p)) 2343 if ((cpu != smp_processor_id()) && task_curr(p))
2344 smp_send_reschedule(cpu); 2344 smp_send_reschedule(cpu);
2345 preempt_enable(); 2345 preempt_enable();
2346 } 2346 }
2347 EXPORT_SYMBOL_GPL(kick_process); 2347 EXPORT_SYMBOL_GPL(kick_process);
2348 #endif /* CONFIG_SMP */ 2348 #endif /* CONFIG_SMP */
2349 2349
2350 #ifdef CONFIG_SMP 2350 #ifdef CONFIG_SMP
2351 /* 2351 /*
2352 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 2352 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2353 */ 2353 */
2354 static int select_fallback_rq(int cpu, struct task_struct *p) 2354 static int select_fallback_rq(int cpu, struct task_struct *p)
2355 { 2355 {
2356 int dest_cpu; 2356 int dest_cpu;
2357 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 2357 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2358 2358
2359 /* Look for allowed, online CPU in same node. */ 2359 /* Look for allowed, online CPU in same node. */
2360 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2360 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2361 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2361 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2362 return dest_cpu; 2362 return dest_cpu;
2363 2363
2364 /* Any allowed, online CPU? */ 2364 /* Any allowed, online CPU? */
2365 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2365 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2366 if (dest_cpu < nr_cpu_ids) 2366 if (dest_cpu < nr_cpu_ids)
2367 return dest_cpu; 2367 return dest_cpu;
2368 2368
2369 /* No more Mr. Nice Guy. */ 2369 /* No more Mr. Nice Guy. */
2370 dest_cpu = cpuset_cpus_allowed_fallback(p); 2370 dest_cpu = cpuset_cpus_allowed_fallback(p);
2371 /* 2371 /*
2372 * Don't tell them about moving exiting tasks or 2372 * Don't tell them about moving exiting tasks or
2373 * kernel threads (both mm NULL), since they never 2373 * kernel threads (both mm NULL), since they never
2374 * leave the kernel. 2374 * leave the kernel.
2375 */ 2375 */
2376 if (p->mm && printk_ratelimit()) { 2376 if (p->mm && printk_ratelimit()) {
2377 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", 2377 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2378 task_pid_nr(p), p->comm, cpu); 2378 task_pid_nr(p), p->comm, cpu);
2379 } 2379 }
2380 2380
2381 return dest_cpu; 2381 return dest_cpu;
2382 } 2382 }
2383 2383
2384 /* 2384 /*
2385 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 2385 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2386 */ 2386 */
2387 static inline 2387 static inline
2388 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2388 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2389 { 2389 {
2390 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2390 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2391 2391
2392 /* 2392 /*
2393 * In order not to call set_task_cpu() on a blocking task we need 2393 * In order not to call set_task_cpu() on a blocking task we need
2394 * to rely on ttwu() to place the task on a valid ->cpus_allowed 2394 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2395 * cpu. 2395 * cpu.
2396 * 2396 *
2397 * Since this is common to all placement strategies, this lives here. 2397 * Since this is common to all placement strategies, this lives here.
2398 * 2398 *
2399 * [ this allows ->select_task_rq() to simply return task_cpu(p) and 2399 * [ this allows ->select_task_rq() to simply return task_cpu(p) and
2400 * not worry about this generic constraint ] 2400 * not worry about this generic constraint ]
2401 */ 2401 */
2402 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2402 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2403 !cpu_online(cpu))) 2403 !cpu_online(cpu)))
2404 cpu = select_fallback_rq(task_cpu(p), p); 2404 cpu = select_fallback_rq(task_cpu(p), p);
2405 2405
2406 return cpu; 2406 return cpu;
2407 } 2407 }
2408 2408
2409 static void update_avg(u64 *avg, u64 sample) 2409 static void update_avg(u64 *avg, u64 sample)
2410 { 2410 {
2411 s64 diff = sample - *avg; 2411 s64 diff = sample - *avg;
2412 *avg += diff >> 3; 2412 *avg += diff >> 3;
2413 } 2413 }
2414 #endif 2414 #endif
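
update_avg() is a fixed-weight exponential moving average: each sample pulls the average 1/8 of the way toward the new value (the diff >> 3). ttwu_do_wakeup() feeds it the measured idle period of the waking CPU, clamped to 2*sysctl_sched_migration_cost so that one very long idle stretch cannot inflate rq->avg_idle. A minimal userspace sketch of the same filter, with made-up sample values and an arithmetic right shift assumed for negative differences (as the kernel relies on):

    #include <stdint.h>
    #include <stdio.h>

    /* Same filter as update_avg(): avg += (sample - avg) / 8. */
    static void update_avg(uint64_t *avg, uint64_t sample)
    {
            int64_t diff = (int64_t)(sample - *avg);

            *avg += diff >> 3;      /* arithmetic shift assumed for negative diff */
    }

    int main(void)
    {
            /* Hypothetical idle periods, in nanoseconds. */
            uint64_t samples[] = { 500000, 500000, 100000, 100000, 100000 };
            uint64_t avg = 0;

            for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                    update_avg(&avg, samples[i]);
                    printf("sample=%8llu  avg=%8llu\n",
                           (unsigned long long)samples[i], (unsigned long long)avg);
            }
            return 0;
    }

With weight 1/8 the last handful of samples dominate, so rq->avg_idle tracks recent idle behaviour while smoothing out single outliers.
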
2415 2415
2416 static void 2416 static void
2417 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 2417 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2418 { 2418 {
2419 #ifdef CONFIG_SCHEDSTATS 2419 #ifdef CONFIG_SCHEDSTATS
2420 struct rq *rq = this_rq(); 2420 struct rq *rq = this_rq();
2421 2421
2422 #ifdef CONFIG_SMP 2422 #ifdef CONFIG_SMP
2423 int this_cpu = smp_processor_id(); 2423 int this_cpu = smp_processor_id();
2424 2424
2425 if (cpu == this_cpu) { 2425 if (cpu == this_cpu) {
2426 schedstat_inc(rq, ttwu_local); 2426 schedstat_inc(rq, ttwu_local);
2427 schedstat_inc(p, se.statistics.nr_wakeups_local); 2427 schedstat_inc(p, se.statistics.nr_wakeups_local);
2428 } else { 2428 } else {
2429 struct sched_domain *sd; 2429 struct sched_domain *sd;
2430 2430
2431 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2431 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2432 for_each_domain(this_cpu, sd) { 2432 for_each_domain(this_cpu, sd) {
2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2434 schedstat_inc(sd, ttwu_wake_remote); 2434 schedstat_inc(sd, ttwu_wake_remote);
2435 break; 2435 break;
2436 } 2436 }
2437 } 2437 }
2438 } 2438 }
2439 #endif /* CONFIG_SMP */ 2439 #endif /* CONFIG_SMP */
2440 2440
2441 schedstat_inc(rq, ttwu_count); 2441 schedstat_inc(rq, ttwu_count);
2442 schedstat_inc(p, se.statistics.nr_wakeups); 2442 schedstat_inc(p, se.statistics.nr_wakeups);
2443 2443
2444 if (wake_flags & WF_SYNC) 2444 if (wake_flags & WF_SYNC)
2445 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2445 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2446 2446
2447 if (cpu != task_cpu(p)) 2447 if (cpu != task_cpu(p))
2448 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2448 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2449 2449
2450 #endif /* CONFIG_SCHEDSTATS */ 2450 #endif /* CONFIG_SCHEDSTATS */
2451 } 2451 }
2452 2452
2453 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 2453 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2454 { 2454 {
2455 activate_task(rq, p, en_flags); 2455 activate_task(rq, p, en_flags);
2456 p->on_rq = 1; 2456 p->on_rq = 1;
2457 2457
2458 /* if a worker is waking up, notify workqueue */ 2458 /* if a worker is waking up, notify workqueue */
2459 if (p->flags & PF_WQ_WORKER) 2459 if (p->flags & PF_WQ_WORKER)
2460 wq_worker_waking_up(p, cpu_of(rq)); 2460 wq_worker_waking_up(p, cpu_of(rq));
2461 } 2461 }
2462 2462
2463 /* 2463 /*
2464 * Mark the task runnable and perform wakeup-preemption. 2464 * Mark the task runnable and perform wakeup-preemption.
2465 */ 2465 */
2466 static void 2466 static void
2467 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 2467 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2468 { 2468 {
2469 trace_sched_wakeup(p, true); 2469 trace_sched_wakeup(p, true);
2470 check_preempt_curr(rq, p, wake_flags); 2470 check_preempt_curr(rq, p, wake_flags);
2471 2471
2472 p->state = TASK_RUNNING; 2472 p->state = TASK_RUNNING;
2473 #ifdef CONFIG_SMP 2473 #ifdef CONFIG_SMP
2474 if (p->sched_class->task_woken) 2474 if (p->sched_class->task_woken)
2475 p->sched_class->task_woken(rq, p); 2475 p->sched_class->task_woken(rq, p);
2476 2476
2477 if (unlikely(rq->idle_stamp)) { 2477 if (unlikely(rq->idle_stamp)) {
2478 u64 delta = rq->clock - rq->idle_stamp; 2478 u64 delta = rq->clock - rq->idle_stamp;
2479 u64 max = 2*sysctl_sched_migration_cost; 2479 u64 max = 2*sysctl_sched_migration_cost;
2480 2480
2481 if (delta > max) 2481 if (delta > max)
2482 rq->avg_idle = max; 2482 rq->avg_idle = max;
2483 else 2483 else
2484 update_avg(&rq->avg_idle, delta); 2484 update_avg(&rq->avg_idle, delta);
2485 rq->idle_stamp = 0; 2485 rq->idle_stamp = 0;
2486 } 2486 }
2487 #endif 2487 #endif
2488 } 2488 }
2489 2489
2490 static void 2490 static void
2491 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 2491 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2492 { 2492 {
2493 #ifdef CONFIG_SMP 2493 #ifdef CONFIG_SMP
2494 if (p->sched_contributes_to_load) 2494 if (p->sched_contributes_to_load)
2495 rq->nr_uninterruptible--; 2495 rq->nr_uninterruptible--;
2496 #endif 2496 #endif
2497 2497
2498 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 2498 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2499 ttwu_do_wakeup(rq, p, wake_flags); 2499 ttwu_do_wakeup(rq, p, wake_flags);
2500 } 2500 }
2501 2501
2502 /* 2502 /*
2503 * Called in case the task @p isn't fully descheduled from its runqueue, 2503 * Called in case the task @p isn't fully descheduled from its runqueue,
2504 * in this case we must do a remote wakeup. It's a 'light' wakeup though, 2504 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2505 * since all we need to do is flip p->state to TASK_RUNNING; 2505 * since all we need to do is flip p->state to TASK_RUNNING;
2506 * the task is still ->on_rq. 2506 * the task is still ->on_rq.
2507 */ 2507 */
2508 static int ttwu_remote(struct task_struct *p, int wake_flags) 2508 static int ttwu_remote(struct task_struct *p, int wake_flags)
2509 { 2509 {
2510 struct rq *rq; 2510 struct rq *rq;
2511 int ret = 0; 2511 int ret = 0;
2512 2512
2513 rq = __task_rq_lock(p); 2513 rq = __task_rq_lock(p);
2514 if (p->on_rq) { 2514 if (p->on_rq) {
2515 ttwu_do_wakeup(rq, p, wake_flags); 2515 ttwu_do_wakeup(rq, p, wake_flags);
2516 ret = 1; 2516 ret = 1;
2517 } 2517 }
2518 __task_rq_unlock(rq); 2518 __task_rq_unlock(rq);
2519 2519
2520 return ret; 2520 return ret;
2521 } 2521 }
2522 2522
2523 #ifdef CONFIG_SMP 2523 #ifdef CONFIG_SMP
2524 static void sched_ttwu_pending(void) 2524 static void sched_ttwu_pending(void)
2525 { 2525 {
2526 struct rq *rq = this_rq(); 2526 struct rq *rq = this_rq();
2527 struct task_struct *list = xchg(&rq->wake_list, NULL); 2527 struct task_struct *list = xchg(&rq->wake_list, NULL);
2528 2528
2529 if (!list) 2529 if (!list)
2530 return; 2530 return;
2531 2531
2532 raw_spin_lock(&rq->lock); 2532 raw_spin_lock(&rq->lock);
2533 2533
2534 while (list) { 2534 while (list) {
2535 struct task_struct *p = list; 2535 struct task_struct *p = list;
2536 list = list->wake_entry; 2536 list = list->wake_entry;
2537 ttwu_do_activate(rq, p, 0); 2537 ttwu_do_activate(rq, p, 0);
2538 } 2538 }
2539 2539
2540 raw_spin_unlock(&rq->lock); 2540 raw_spin_unlock(&rq->lock);
2541 } 2541 }
2542 2542
2543 void scheduler_ipi(void) 2543 void scheduler_ipi(void)
2544 { 2544 {
2545 sched_ttwu_pending(); 2545 sched_ttwu_pending();
2546 } 2546 }
2547 2547
2548 static void ttwu_queue_remote(struct task_struct *p, int cpu) 2548 static void ttwu_queue_remote(struct task_struct *p, int cpu)
2549 { 2549 {
2550 struct rq *rq = cpu_rq(cpu); 2550 struct rq *rq = cpu_rq(cpu);
2551 struct task_struct *next = rq->wake_list; 2551 struct task_struct *next = rq->wake_list;
2552 2552
2553 for (;;) { 2553 for (;;) {
2554 struct task_struct *old = next; 2554 struct task_struct *old = next;
2555 2555
2556 p->wake_entry = next; 2556 p->wake_entry = next;
2557 next = cmpxchg(&rq->wake_list, old, p); 2557 next = cmpxchg(&rq->wake_list, old, p);
2558 if (next == old) 2558 if (next == old)
2559 break; 2559 break;
2560 } 2560 }
2561 2561
2562 if (!next) 2562 if (!next)
2563 smp_send_reschedule(cpu); 2563 smp_send_reschedule(cpu);
2564 } 2564 }
2565 #endif 2565 #endif
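
The wake_list machinery above is a lock-free multi-producer/single-consumer list: ttwu_queue_remote() pushes the task onto rq->wake_list with cmpxchg() and raises the reschedule IPI only when the list goes from empty to non-empty, while scheduler_ipi() drains the whole chain with a single xchg() in sched_ttwu_pending(). A self-contained sketch of the same push/drain pattern using the GCC/Clang __atomic builtins; the node type and function names are illustrative, not the kernel's:

    #include <stdbool.h>
    #include <stddef.h>

    struct node {
            struct node *next;
            int payload;
    };

    static struct node *wake_list;  /* head of the pending list, NULL when empty */

    /* Push one node; returns true if the list was empty (caller "sends the IPI"). */
    static bool push(struct node *n)
    {
            struct node *old = __atomic_load_n(&wake_list, __ATOMIC_RELAXED);

            do {
                    n->next = old;  /* old is refreshed on a failed cmpxchg */
            } while (!__atomic_compare_exchange_n(&wake_list, &old, n, false,
                                                  __ATOMIC_RELEASE, __ATOMIC_RELAXED));
            return old == NULL;
    }

    /* Drain everything queued so far; this is the "IPI handler" side. */
    static void drain(void (*activate)(struct node *))
    {
            struct node *n = __atomic_exchange_n(&wake_list, NULL, __ATOMIC_ACQUIRE);

            while (n) {
                    struct node *next = n->next;

                    activate(n);
                    n = next;
            }
    }

Because only the empty-to-non-empty transition sends smp_send_reschedule(), a burst of remote wakeups aimed at one CPU costs a single interrupt, and sched_ttwu_pending() then activates every queued task under one rq->lock acquisition.
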
2566 2566
2567 static void ttwu_queue(struct task_struct *p, int cpu) 2567 static void ttwu_queue(struct task_struct *p, int cpu)
2568 { 2568 {
2569 struct rq *rq = cpu_rq(cpu); 2569 struct rq *rq = cpu_rq(cpu);
2570 2570
2571 #if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) 2571 #if defined(CONFIG_SMP)
2572 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 2572 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2573 ttwu_queue_remote(p, cpu); 2573 ttwu_queue_remote(p, cpu);
2574 return; 2574 return;
2575 } 2575 }
2576 #endif 2576 #endif
2577 2577
2578 raw_spin_lock(&rq->lock); 2578 raw_spin_lock(&rq->lock);
2579 ttwu_do_activate(rq, p, 0); 2579 ttwu_do_activate(rq, p, 0);
2580 raw_spin_unlock(&rq->lock); 2580 raw_spin_unlock(&rq->lock);
2581 } 2581 }
2582 2582
2583 /** 2583 /**
2584 * try_to_wake_up - wake up a thread 2584 * try_to_wake_up - wake up a thread
2585 * @p: the thread to be awakened 2585 * @p: the thread to be awakened
2586 * @state: the mask of task states that can be woken 2586 * @state: the mask of task states that can be woken
2587 * @wake_flags: wake modifier flags (WF_*) 2587 * @wake_flags: wake modifier flags (WF_*)
2588 * 2588 *
2589 * Put it on the run-queue if it's not already there. The "current" 2589 * Put it on the run-queue if it's not already there. The "current"
2590 * thread is always on the run-queue (except when the actual 2590 * thread is always on the run-queue (except when the actual
2591 * re-schedule is in progress), and as such you're allowed to do 2591 * re-schedule is in progress), and as such you're allowed to do
2592 * the simpler "current->state = TASK_RUNNING" to mark yourself 2592 * the simpler "current->state = TASK_RUNNING" to mark yourself
2593 * runnable without the overhead of this. 2593 * runnable without the overhead of this.
2594 * 2594 *
2595 * Returns %true if @p was woken up, %false if it was already running 2595 * Returns %true if @p was woken up, %false if it was already running
2596 * or @state didn't match @p's state. 2596 * or @state didn't match @p's state.
2597 */ 2597 */
2598 static int 2598 static int
2599 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 2599 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2600 { 2600 {
2601 unsigned long flags; 2601 unsigned long flags;
2602 int cpu, success = 0; 2602 int cpu, success = 0;
2603 2603
2604 smp_wmb(); 2604 smp_wmb();
2605 raw_spin_lock_irqsave(&p->pi_lock, flags); 2605 raw_spin_lock_irqsave(&p->pi_lock, flags);
2606 if (!(p->state & state)) 2606 if (!(p->state & state))
2607 goto out; 2607 goto out;
2608 2608
2609 success = 1; /* we're going to change ->state */ 2609 success = 1; /* we're going to change ->state */
2610 cpu = task_cpu(p); 2610 cpu = task_cpu(p);
2611 2611
2612 if (p->on_rq && ttwu_remote(p, wake_flags)) 2612 if (p->on_rq && ttwu_remote(p, wake_flags))
2613 goto stat; 2613 goto stat;
2614 2614
2615 #ifdef CONFIG_SMP 2615 #ifdef CONFIG_SMP
2616 /* 2616 /*
2617 * If the owning (remote) cpu is still in the middle of schedule() with 2617 * If the owning (remote) cpu is still in the middle of schedule() with
2618 * this task as prev, wait until it's done referencing the task. 2618 * this task as prev, wait until it's done referencing the task.
2619 */ 2619 */
2620 while (p->on_cpu) { 2620 while (p->on_cpu) {
2621 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2621 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2622 /* 2622 /*
2623 * If called from interrupt context we could have landed in the 2623 * If called from interrupt context we could have landed in the
2624 * middle of schedule(); in this case we should take care not 2624 * middle of schedule(); in this case we should take care not
2625 * to spin on ->on_cpu if p is current, since that would 2625 * to spin on ->on_cpu if p is current, since that would
2626 * deadlock. 2626 * deadlock.
2627 */ 2627 */
2628 if (p == current) { 2628 if (p == current) {
2629 ttwu_queue(p, cpu); 2629 ttwu_queue(p, cpu);
2630 goto stat; 2630 goto stat;
2631 } 2631 }
2632 #endif 2632 #endif
2633 cpu_relax(); 2633 cpu_relax();
2634 } 2634 }
2635 /* 2635 /*
2636 * Pairs with the smp_wmb() in finish_lock_switch(). 2636 * Pairs with the smp_wmb() in finish_lock_switch().
2637 */ 2637 */
2638 smp_rmb(); 2638 smp_rmb();
2639 2639
2640 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2640 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2641 p->state = TASK_WAKING; 2641 p->state = TASK_WAKING;
2642 2642
2643 if (p->sched_class->task_waking) 2643 if (p->sched_class->task_waking)
2644 p->sched_class->task_waking(p); 2644 p->sched_class->task_waking(p);
2645 2645
2646 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2646 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2647 if (task_cpu(p) != cpu) 2647 if (task_cpu(p) != cpu)
2648 set_task_cpu(p, cpu); 2648 set_task_cpu(p, cpu);
2649 #endif /* CONFIG_SMP */ 2649 #endif /* CONFIG_SMP */
2650 2650
2651 ttwu_queue(p, cpu); 2651 ttwu_queue(p, cpu);
2652 stat: 2652 stat:
2653 ttwu_stat(p, cpu, wake_flags); 2653 ttwu_stat(p, cpu, wake_flags);
2654 out: 2654 out:
2655 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2655 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2656 2656
2657 return success; 2657 return success;
2658 } 2658 }
2659 2659
2660 /** 2660 /**
2661 * try_to_wake_up_local - try to wake up a local task with rq lock held 2661 * try_to_wake_up_local - try to wake up a local task with rq lock held
2662 * @p: the thread to be awakened 2662 * @p: the thread to be awakened
2663 * 2663 *
2664 * Put @p on the run-queue if it's not already there. The caller must 2664 * Put @p on the run-queue if it's not already there. The caller must
2665 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2665 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2666 * the current task. 2666 * the current task.
2667 */ 2667 */
2668 static void try_to_wake_up_local(struct task_struct *p) 2668 static void try_to_wake_up_local(struct task_struct *p)
2669 { 2669 {
2670 struct rq *rq = task_rq(p); 2670 struct rq *rq = task_rq(p);
2671 2671
2672 BUG_ON(rq != this_rq()); 2672 BUG_ON(rq != this_rq());
2673 BUG_ON(p == current); 2673 BUG_ON(p == current);
2674 lockdep_assert_held(&rq->lock); 2674 lockdep_assert_held(&rq->lock);
2675 2675
2676 if (!raw_spin_trylock(&p->pi_lock)) { 2676 if (!raw_spin_trylock(&p->pi_lock)) {
2677 raw_spin_unlock(&rq->lock); 2677 raw_spin_unlock(&rq->lock);
2678 raw_spin_lock(&p->pi_lock); 2678 raw_spin_lock(&p->pi_lock);
2679 raw_spin_lock(&rq->lock); 2679 raw_spin_lock(&rq->lock);
2680 } 2680 }
2681 2681
2682 if (!(p->state & TASK_NORMAL)) 2682 if (!(p->state & TASK_NORMAL))
2683 goto out; 2683 goto out;
2684 2684
2685 if (!p->on_rq) 2685 if (!p->on_rq)
2686 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2686 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2687 2687
2688 ttwu_do_wakeup(rq, p, 0); 2688 ttwu_do_wakeup(rq, p, 0);
2689 ttwu_stat(p, smp_processor_id(), 0); 2689 ttwu_stat(p, smp_processor_id(), 0);
2690 out: 2690 out:
2691 raw_spin_unlock(&p->pi_lock); 2691 raw_spin_unlock(&p->pi_lock);
2692 } 2692 }
2693 2693
2694 /** 2694 /**
2695 * wake_up_process - Wake up a specific process 2695 * wake_up_process - Wake up a specific process
2696 * @p: The process to be woken up. 2696 * @p: The process to be woken up.
2697 * 2697 *
2698 * Attempt to wake up the nominated process and move it to the set of runnable 2698 * Attempt to wake up the nominated process and move it to the set of runnable
2699 * processes. Returns 1 if the process was woken up, 0 if it was already 2699 * processes. Returns 1 if the process was woken up, 0 if it was already
2700 * running. 2700 * running.
2701 * 2701 *
2702 * It may be assumed that this function implies a write memory barrier before 2702 * It may be assumed that this function implies a write memory barrier before
2703 * changing the task state if and only if any tasks are woken up. 2703 * changing the task state if and only if any tasks are woken up.
2704 */ 2704 */
2705 int wake_up_process(struct task_struct *p) 2705 int wake_up_process(struct task_struct *p)
2706 { 2706 {
2707 return try_to_wake_up(p, TASK_ALL, 0); 2707 return try_to_wake_up(p, TASK_ALL, 0);
2708 } 2708 }
2709 EXPORT_SYMBOL(wake_up_process); 2709 EXPORT_SYMBOL(wake_up_process);
2710 2710
2711 int wake_up_state(struct task_struct *p, unsigned int state) 2711 int wake_up_state(struct task_struct *p, unsigned int state)
2712 { 2712 {
2713 return try_to_wake_up(p, state, 0); 2713 return try_to_wake_up(p, state, 0);
2714 } 2714 }
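
As a usage reference, the canonical sleeper/waker pairing built on wake_up_process() looks like the sketch below: the sleeper publishes its state with set_current_state() before testing the condition, and the waker sets the condition before waking. The context struct, its fields and do_work() are hypothetical; only the scheduler calls are real, and a real driver would protect work_pending properly rather than rely on a plain bool.

    #include <linux/kthread.h>
    #include <linux/sched.h>

    struct my_ctx {                         /* hypothetical example state */
            struct task_struct *worker;
            bool work_pending;
    };

    static void do_work(struct my_ctx *ctx)
    {
            /* ... consume the work (hypothetical) ... */
    }

    /* Sleeper: a kthread that waits for work_pending to become true. */
    static int worker_fn(void *data)
    {
            struct my_ctx *ctx = data;

            while (!kthread_should_stop()) {
                    set_current_state(TASK_INTERRUPTIBLE);
                    if (!ctx->work_pending)
                            schedule();             /* sleep until woken */
                    __set_current_state(TASK_RUNNING);

                    if (ctx->work_pending) {
                            ctx->work_pending = false;
                            do_work(ctx);
                    }
            }
            return 0;
    }

    /* Waker: set the condition, then wake the thread. */
    static void kick_worker(struct my_ctx *ctx)
    {
            ctx->work_pending = true;
            wake_up_process(ctx->worker);
    }

Because set_current_state() contains a memory barrier and try_to_wake_up() re-checks p->state under p->pi_lock, either the sleeper sees work_pending before it sleeps or the waker sees a sleeping state and wakes it; the wakeup cannot be lost.
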
2715 2715
2716 /* 2716 /*
2717 * Perform scheduler related setup for a newly forked process p. 2717 * Perform scheduler related setup for a newly forked process p.
2718 * p is forked by current. 2718 * p is forked by current.
2719 * 2719 *
2720 * __sched_fork() is basic setup used by init_idle() too: 2720 * __sched_fork() is basic setup used by init_idle() too:
2721 */ 2721 */
2722 static void __sched_fork(struct task_struct *p) 2722 static void __sched_fork(struct task_struct *p)
2723 { 2723 {
2724 p->on_rq = 0; 2724 p->on_rq = 0;
2725 2725
2726 p->se.on_rq = 0; 2726 p->se.on_rq = 0;
2727 p->se.exec_start = 0; 2727 p->se.exec_start = 0;
2728 p->se.sum_exec_runtime = 0; 2728 p->se.sum_exec_runtime = 0;
2729 p->se.prev_sum_exec_runtime = 0; 2729 p->se.prev_sum_exec_runtime = 0;
2730 p->se.nr_migrations = 0; 2730 p->se.nr_migrations = 0;
2731 p->se.vruntime = 0; 2731 p->se.vruntime = 0;
2732 INIT_LIST_HEAD(&p->se.group_node); 2732 INIT_LIST_HEAD(&p->se.group_node);
2733 2733
2734 #ifdef CONFIG_SCHEDSTATS 2734 #ifdef CONFIG_SCHEDSTATS
2735 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2735 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2736 #endif 2736 #endif
2737 2737
2738 INIT_LIST_HEAD(&p->rt.run_list); 2738 INIT_LIST_HEAD(&p->rt.run_list);
2739 2739
2740 #ifdef CONFIG_PREEMPT_NOTIFIERS 2740 #ifdef CONFIG_PREEMPT_NOTIFIERS
2741 INIT_HLIST_HEAD(&p->preempt_notifiers); 2741 INIT_HLIST_HEAD(&p->preempt_notifiers);
2742 #endif 2742 #endif
2743 } 2743 }
2744 2744
2745 /* 2745 /*
2746 * fork()/clone()-time setup: 2746 * fork()/clone()-time setup:
2747 */ 2747 */
2748 void sched_fork(struct task_struct *p, int clone_flags) 2748 void sched_fork(struct task_struct *p, int clone_flags)
2749 { 2749 {
2750 unsigned long flags; 2750 unsigned long flags;
2751 int cpu = get_cpu(); 2751 int cpu = get_cpu();
2752 2752
2753 __sched_fork(p); 2753 __sched_fork(p);
2754 /* 2754 /*
2755 * We mark the process as running here. This guarantees that 2755 * We mark the process as running here. This guarantees that
2756 * nobody will actually run it, and a signal or other external 2756 * nobody will actually run it, and a signal or other external
2757 * event cannot wake it up and insert it on the runqueue either. 2757 * event cannot wake it up and insert it on the runqueue either.
2758 */ 2758 */
2759 p->state = TASK_RUNNING; 2759 p->state = TASK_RUNNING;
2760 2760
2761 /* 2761 /*
2762 * Revert to default priority/policy on fork if requested. 2762 * Revert to default priority/policy on fork if requested.
2763 */ 2763 */
2764 if (unlikely(p->sched_reset_on_fork)) { 2764 if (unlikely(p->sched_reset_on_fork)) {
2765 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2765 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2766 p->policy = SCHED_NORMAL; 2766 p->policy = SCHED_NORMAL;
2767 p->normal_prio = p->static_prio; 2767 p->normal_prio = p->static_prio;
2768 } 2768 }
2769 2769
2770 if (PRIO_TO_NICE(p->static_prio) < 0) { 2770 if (PRIO_TO_NICE(p->static_prio) < 0) {
2771 p->static_prio = NICE_TO_PRIO(0); 2771 p->static_prio = NICE_TO_PRIO(0);
2772 p->normal_prio = p->static_prio; 2772 p->normal_prio = p->static_prio;
2773 set_load_weight(p); 2773 set_load_weight(p);
2774 } 2774 }
2775 2775
2776 /* 2776 /*
2777 * We don't need the reset flag anymore after the fork. It has 2777 * We don't need the reset flag anymore after the fork. It has
2778 * fulfilled its duty: 2778 * fulfilled its duty:
2779 */ 2779 */
2780 p->sched_reset_on_fork = 0; 2780 p->sched_reset_on_fork = 0;
2781 } 2781 }
2782 2782
2783 /* 2783 /*
2784 * Make sure we do not leak PI boosting priority to the child. 2784 * Make sure we do not leak PI boosting priority to the child.
2785 */ 2785 */
2786 p->prio = current->normal_prio; 2786 p->prio = current->normal_prio;
2787 2787
2788 if (!rt_prio(p->prio)) 2788 if (!rt_prio(p->prio))
2789 p->sched_class = &fair_sched_class; 2789 p->sched_class = &fair_sched_class;
2790 2790
2791 if (p->sched_class->task_fork) 2791 if (p->sched_class->task_fork)
2792 p->sched_class->task_fork(p); 2792 p->sched_class->task_fork(p);
2793 2793
2794 /* 2794 /*
2795 * The child is not yet in the pid-hash so no cgroup attach races, 2795 * The child is not yet in the pid-hash so no cgroup attach races,
2796 * and the cgroup is pinned to this child because cgroup_fork() 2796 * and the cgroup is pinned to this child because cgroup_fork()
2797 * is run before sched_fork(). 2797 * is run before sched_fork().
2798 * 2798 *
2799 * Silence PROVE_RCU. 2799 * Silence PROVE_RCU.
2800 */ 2800 */
2801 raw_spin_lock_irqsave(&p->pi_lock, flags); 2801 raw_spin_lock_irqsave(&p->pi_lock, flags);
2802 set_task_cpu(p, cpu); 2802 set_task_cpu(p, cpu);
2803 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2803 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2804 2804
2805 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2805 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2806 if (likely(sched_info_on())) 2806 if (likely(sched_info_on()))
2807 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2807 memset(&p->sched_info, 0, sizeof(p->sched_info));
2808 #endif 2808 #endif
2809 #if defined(CONFIG_SMP) 2809 #if defined(CONFIG_SMP)
2810 p->on_cpu = 0; 2810 p->on_cpu = 0;
2811 #endif 2811 #endif
2812 #ifdef CONFIG_PREEMPT 2812 #ifdef CONFIG_PREEMPT
2813 /* Want to start with kernel preemption disabled. */ 2813 /* Want to start with kernel preemption disabled. */
2814 task_thread_info(p)->preempt_count = 1; 2814 task_thread_info(p)->preempt_count = 1;
2815 #endif 2815 #endif
2816 #ifdef CONFIG_SMP 2816 #ifdef CONFIG_SMP
2817 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2817 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2818 #endif 2818 #endif
2819 2819
2820 put_cpu(); 2820 put_cpu();
2821 } 2821 }
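
The sched_reset_on_fork branch above leans on the standard nice-to-priority mapping; the macros are reproduced here only to make the PRIO_TO_NICE(p->static_prio) < 0 test concrete (values as defined in this kernel's scheduler headers):

    #define MAX_RT_PRIO             100
    #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
    #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)

    /*
     * Nice -20..19 maps to static_prio 100..139, with nice 0 at 120.
     * PRIO_TO_NICE(p->static_prio) < 0 therefore means "the parent ran at a
     * raised (negative-nice) priority", and the reset clamps the child back
     * to NICE_TO_PRIO(0) == 120, the default time-share priority.
     */
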
2822 2822
2823 /* 2823 /*
2824 * wake_up_new_task - wake up a newly created task for the first time. 2824 * wake_up_new_task - wake up a newly created task for the first time.
2825 * 2825 *
2826 * This function will do some initial scheduler statistics housekeeping 2826 * This function will do some initial scheduler statistics housekeeping
2827 * that must be done for every newly created context, then puts the task 2827 * that must be done for every newly created context, then puts the task
2828 * on the runqueue and wakes it. 2828 * on the runqueue and wakes it.
2829 */ 2829 */
2830 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2830 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2831 { 2831 {
2832 unsigned long flags; 2832 unsigned long flags;
2833 struct rq *rq; 2833 struct rq *rq;
2834 2834
2835 raw_spin_lock_irqsave(&p->pi_lock, flags); 2835 raw_spin_lock_irqsave(&p->pi_lock, flags);
2836 #ifdef CONFIG_SMP 2836 #ifdef CONFIG_SMP
2837 /* 2837 /*
2838 * Fork balancing, do it here and not earlier because: 2838 * Fork balancing, do it here and not earlier because:
2839 * - cpus_allowed can change in the fork path 2839 * - cpus_allowed can change in the fork path
2840 * - any previously selected cpu might disappear through hotplug 2840 * - any previously selected cpu might disappear through hotplug
2841 */ 2841 */
2842 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 2842 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2843 #endif 2843 #endif
2844 2844
2845 rq = __task_rq_lock(p); 2845 rq = __task_rq_lock(p);
2846 activate_task(rq, p, 0); 2846 activate_task(rq, p, 0);
2847 p->on_rq = 1; 2847 p->on_rq = 1;
2848 trace_sched_wakeup_new(p, true); 2848 trace_sched_wakeup_new(p, true);
2849 check_preempt_curr(rq, p, WF_FORK); 2849 check_preempt_curr(rq, p, WF_FORK);
2850 #ifdef CONFIG_SMP 2850 #ifdef CONFIG_SMP
2851 if (p->sched_class->task_woken) 2851 if (p->sched_class->task_woken)
2852 p->sched_class->task_woken(rq, p); 2852 p->sched_class->task_woken(rq, p);
2853 #endif 2853 #endif
2854 task_rq_unlock(rq, p, &flags); 2854 task_rq_unlock(rq, p, &flags);
2855 } 2855 }
2856 2856
2857 #ifdef CONFIG_PREEMPT_NOTIFIERS 2857 #ifdef CONFIG_PREEMPT_NOTIFIERS
2858 2858
2859 /** 2859 /**
2860 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2860 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2861 * @notifier: notifier struct to register 2861 * @notifier: notifier struct to register
2862 */ 2862 */
2863 void preempt_notifier_register(struct preempt_notifier *notifier) 2863 void preempt_notifier_register(struct preempt_notifier *notifier)
2864 { 2864 {
2865 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2865 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2866 } 2866 }
2867 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2867 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2868 2868
2869 /** 2869 /**
2870 * preempt_notifier_unregister - no longer interested in preemption notifications 2870 * preempt_notifier_unregister - no longer interested in preemption notifications
2871 * @notifier: notifier struct to unregister 2871 * @notifier: notifier struct to unregister
2872 * 2872 *
2873 * This is safe to call from within a preemption notifier. 2873 * This is safe to call from within a preemption notifier.
2874 */ 2874 */
2875 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2875 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2876 { 2876 {
2877 hlist_del(&notifier->link); 2877 hlist_del(&notifier->link);
2878 } 2878 }
2879 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2879 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2880 2880
2881 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2881 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2882 { 2882 {
2883 struct preempt_notifier *notifier; 2883 struct preempt_notifier *notifier;
2884 struct hlist_node *node; 2884 struct hlist_node *node;
2885 2885
2886 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2886 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2887 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2887 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2888 } 2888 }
2889 2889
2890 static void 2890 static void
2891 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2891 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2892 struct task_struct *next) 2892 struct task_struct *next)
2893 { 2893 {
2894 struct preempt_notifier *notifier; 2894 struct preempt_notifier *notifier;
2895 struct hlist_node *node; 2895 struct hlist_node *node;
2896 2896
2897 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2897 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2898 notifier->ops->sched_out(notifier, next); 2898 notifier->ops->sched_out(notifier, next);
2899 } 2899 }
2900 2900
2901 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2901 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2902 2902
2903 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2903 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2904 { 2904 {
2905 } 2905 }
2906 2906
2907 static void 2907 static void
2908 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2908 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2909 struct task_struct *next) 2909 struct task_struct *next)
2910 { 2910 {
2911 } 2911 }
2912 2912
2913 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2913 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2914 2914
2915 /** 2915 /**
2916 * prepare_task_switch - prepare to switch tasks 2916 * prepare_task_switch - prepare to switch tasks
2917 * @rq: the runqueue preparing to switch 2917 * @rq: the runqueue preparing to switch
2918 * @prev: the current task that is being switched out 2918 * @prev: the current task that is being switched out
2919 * @next: the task we are going to switch to. 2919 * @next: the task we are going to switch to.
2920 * 2920 *
2921 * This is called with the rq lock held and interrupts off. It must 2921 * This is called with the rq lock held and interrupts off. It must
2922 * be paired with a subsequent finish_task_switch after the context 2922 * be paired with a subsequent finish_task_switch after the context
2923 * switch. 2923 * switch.
2924 * 2924 *
2925 * prepare_task_switch sets up locking and calls architecture specific 2925 * prepare_task_switch sets up locking and calls architecture specific
2926 * hooks. 2926 * hooks.
2927 */ 2927 */
2928 static inline void 2928 static inline void
2929 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2929 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2930 struct task_struct *next) 2930 struct task_struct *next)
2931 { 2931 {
2932 sched_info_switch(prev, next); 2932 sched_info_switch(prev, next);
2933 perf_event_task_sched_out(prev, next); 2933 perf_event_task_sched_out(prev, next);
2934 fire_sched_out_preempt_notifiers(prev, next); 2934 fire_sched_out_preempt_notifiers(prev, next);
2935 prepare_lock_switch(rq, next); 2935 prepare_lock_switch(rq, next);
2936 prepare_arch_switch(next); 2936 prepare_arch_switch(next);
2937 trace_sched_switch(prev, next); 2937 trace_sched_switch(prev, next);
2938 } 2938 }
2939 2939
2940 /** 2940 /**
2941 * finish_task_switch - clean up after a task-switch 2941 * finish_task_switch - clean up after a task-switch
2942 * @rq: runqueue associated with task-switch 2942 * @rq: runqueue associated with task-switch
2943 * @prev: the thread we just switched away from. 2943 * @prev: the thread we just switched away from.
2944 * 2944 *
2945 * finish_task_switch must be called after the context switch, paired 2945 * finish_task_switch must be called after the context switch, paired
2946 * with a prepare_task_switch call before the context switch. 2946 * with a prepare_task_switch call before the context switch.
2947 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2947 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2948 * and do any other architecture-specific cleanup actions. 2948 * and do any other architecture-specific cleanup actions.
2949 * 2949 *
2950 * Note that we may have delayed dropping an mm in context_switch(). If 2950 * Note that we may have delayed dropping an mm in context_switch(). If
2951 * so, we finish that here outside of the runqueue lock. (Doing it 2951 * so, we finish that here outside of the runqueue lock. (Doing it
2952 * with the lock held can cause deadlocks; see schedule() for 2952 * with the lock held can cause deadlocks; see schedule() for
2953 * details.) 2953 * details.)
2954 */ 2954 */
2955 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2955 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2956 __releases(rq->lock) 2956 __releases(rq->lock)
2957 { 2957 {
2958 struct mm_struct *mm = rq->prev_mm; 2958 struct mm_struct *mm = rq->prev_mm;
2959 long prev_state; 2959 long prev_state;
2960 2960
2961 rq->prev_mm = NULL; 2961 rq->prev_mm = NULL;
2962 2962
2963 /* 2963 /*
2964 * A task struct has one reference for its use as "current". 2964 * A task struct has one reference for its use as "current".
2965 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2965 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2966 * schedule one last time. The schedule call will never return, and 2966 * schedule one last time. The schedule call will never return, and
2967 * the scheduled task must drop that reference. 2967 * the scheduled task must drop that reference.
2968 * The test for TASK_DEAD must occur while the runqueue locks are 2968 * The test for TASK_DEAD must occur while the runqueue locks are
2969 * still held, otherwise prev could be scheduled on another cpu, die 2969 * still held, otherwise prev could be scheduled on another cpu, die
2970 * there before we look at prev->state, and then the reference would 2970 * there before we look at prev->state, and then the reference would
2971 * be dropped twice. 2971 * be dropped twice.
2972 * Manfred Spraul <manfred@colorfullife.com> 2972 * Manfred Spraul <manfred@colorfullife.com>
2973 */ 2973 */
2974 prev_state = prev->state; 2974 prev_state = prev->state;
2975 finish_arch_switch(prev); 2975 finish_arch_switch(prev);
2976 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2976 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2977 local_irq_disable(); 2977 local_irq_disable();
2978 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2978 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2979 perf_event_task_sched_in(current); 2979 perf_event_task_sched_in(current);
2980 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2980 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2981 local_irq_enable(); 2981 local_irq_enable();
2982 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2982 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2983 finish_lock_switch(rq, prev); 2983 finish_lock_switch(rq, prev);
2984 2984
2985 fire_sched_in_preempt_notifiers(current); 2985 fire_sched_in_preempt_notifiers(current);
2986 if (mm) 2986 if (mm)
2987 mmdrop(mm); 2987 mmdrop(mm);
2988 if (unlikely(prev_state == TASK_DEAD)) { 2988 if (unlikely(prev_state == TASK_DEAD)) {
2989 /* 2989 /*
2990 * Remove function-return probe instances associated with this 2990 * Remove function-return probe instances associated with this
2991 * task and put them back on the free list. 2991 * task and put them back on the free list.
2992 */ 2992 */
2993 kprobe_flush_task(prev); 2993 kprobe_flush_task(prev);
2994 put_task_struct(prev); 2994 put_task_struct(prev);
2995 } 2995 }
2996 } 2996 }
2997 2997
2998 #ifdef CONFIG_SMP 2998 #ifdef CONFIG_SMP
2999 2999
3000 /* assumes rq->lock is held */ 3000 /* assumes rq->lock is held */
3001 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 3001 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
3002 { 3002 {
3003 if (prev->sched_class->pre_schedule) 3003 if (prev->sched_class->pre_schedule)
3004 prev->sched_class->pre_schedule(rq, prev); 3004 prev->sched_class->pre_schedule(rq, prev);
3005 } 3005 }
3006 3006
3007 /* rq->lock is NOT held, but preemption is disabled */ 3007 /* rq->lock is NOT held, but preemption is disabled */
3008 static inline void post_schedule(struct rq *rq) 3008 static inline void post_schedule(struct rq *rq)
3009 { 3009 {
3010 if (rq->post_schedule) { 3010 if (rq->post_schedule) {
3011 unsigned long flags; 3011 unsigned long flags;
3012 3012
3013 raw_spin_lock_irqsave(&rq->lock, flags); 3013 raw_spin_lock_irqsave(&rq->lock, flags);
3014 if (rq->curr->sched_class->post_schedule) 3014 if (rq->curr->sched_class->post_schedule)
3015 rq->curr->sched_class->post_schedule(rq); 3015 rq->curr->sched_class->post_schedule(rq);
3016 raw_spin_unlock_irqrestore(&rq->lock, flags); 3016 raw_spin_unlock_irqrestore(&rq->lock, flags);
3017 3017
3018 rq->post_schedule = 0; 3018 rq->post_schedule = 0;
3019 } 3019 }
3020 } 3020 }
3021 3021
3022 #else 3022 #else
3023 3023
3024 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 3024 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
3025 { 3025 {
3026 } 3026 }
3027 3027
3028 static inline void post_schedule(struct rq *rq) 3028 static inline void post_schedule(struct rq *rq)
3029 { 3029 {
3030 } 3030 }
3031 3031
3032 #endif 3032 #endif
3033 3033
3034 /** 3034 /**
3035 * schedule_tail - first thing a freshly forked thread must call. 3035 * schedule_tail - first thing a freshly forked thread must call.
3036 * @prev: the thread we just switched away from. 3036 * @prev: the thread we just switched away from.
3037 */ 3037 */
3038 asmlinkage void schedule_tail(struct task_struct *prev) 3038 asmlinkage void schedule_tail(struct task_struct *prev)
3039 __releases(rq->lock) 3039 __releases(rq->lock)
3040 { 3040 {
3041 struct rq *rq = this_rq(); 3041 struct rq *rq = this_rq();
3042 3042
3043 finish_task_switch(rq, prev); 3043 finish_task_switch(rq, prev);
3044 3044
3045 /* 3045 /*
3046 * FIXME: do we need to worry about rq being invalidated by the 3046 * FIXME: do we need to worry about rq being invalidated by the
3047 * task_switch? 3047 * task_switch?
3048 */ 3048 */
3049 post_schedule(rq); 3049 post_schedule(rq);
3050 3050
3051 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 3051 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
3052 /* In this case, finish_task_switch does not reenable preemption */ 3052 /* In this case, finish_task_switch does not reenable preemption */
3053 preempt_enable(); 3053 preempt_enable();
3054 #endif 3054 #endif
3055 if (current->set_child_tid) 3055 if (current->set_child_tid)
3056 put_user(task_pid_vnr(current), current->set_child_tid); 3056 put_user(task_pid_vnr(current), current->set_child_tid);
3057 } 3057 }
3058 3058
3059 /* 3059 /*
3060 * context_switch - switch to the new MM and the new 3060 * context_switch - switch to the new MM and the new
3061 * thread's register state. 3061 * thread's register state.
3062 */ 3062 */
3063 static inline void 3063 static inline void
3064 context_switch(struct rq *rq, struct task_struct *prev, 3064 context_switch(struct rq *rq, struct task_struct *prev,
3065 struct task_struct *next) 3065 struct task_struct *next)
3066 { 3066 {
3067 struct mm_struct *mm, *oldmm; 3067 struct mm_struct *mm, *oldmm;
3068 3068
3069 prepare_task_switch(rq, prev, next); 3069 prepare_task_switch(rq, prev, next);
3070 3070
3071 mm = next->mm; 3071 mm = next->mm;
3072 oldmm = prev->active_mm; 3072 oldmm = prev->active_mm;
3073 /* 3073 /*
3074 * For paravirt, this is coupled with an exit in switch_to to 3074 * For paravirt, this is coupled with an exit in switch_to to
3075 * combine the page table reload and the switch backend into 3075 * combine the page table reload and the switch backend into
3076 * one hypercall. 3076 * one hypercall.
3077 */ 3077 */
3078 arch_start_context_switch(prev); 3078 arch_start_context_switch(prev);
3079 3079
3080 if (!mm) { 3080 if (!mm) {
3081 next->active_mm = oldmm; 3081 next->active_mm = oldmm;
3082 atomic_inc(&oldmm->mm_count); 3082 atomic_inc(&oldmm->mm_count);
3083 enter_lazy_tlb(oldmm, next); 3083 enter_lazy_tlb(oldmm, next);
3084 } else 3084 } else
3085 switch_mm(oldmm, mm, next); 3085 switch_mm(oldmm, mm, next);
3086 3086
3087 if (!prev->mm) { 3087 if (!prev->mm) {
3088 prev->active_mm = NULL; 3088 prev->active_mm = NULL;
3089 rq->prev_mm = oldmm; 3089 rq->prev_mm = oldmm;
3090 } 3090 }
3091 /* 3091 /*
3092 * Since the runqueue lock will be released by the next 3092 * Since the runqueue lock will be released by the next
3093 * task (which is an invalid locking op but in the case 3093 * task (which is an invalid locking op but in the case
3094 * of the scheduler it's an obvious special case), we 3094 * of the scheduler it's an obvious special case), we
3095 * do an early lockdep release here: 3095 * do an early lockdep release here:
3096 */ 3096 */
3097 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 3097 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
3098 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 3098 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3099 #endif 3099 #endif
3100 3100
3101 /* Here we just switch the register state and the stack. */ 3101 /* Here we just switch the register state and the stack. */
3102 switch_to(prev, next, prev); 3102 switch_to(prev, next, prev);
3103 3103
3104 barrier(); 3104 barrier();
3105 /* 3105 /*
3106 * this_rq must be evaluated again because prev may have moved 3106 * this_rq must be evaluated again because prev may have moved
3107 * CPUs since it called schedule(), thus the 'rq' on its stack 3107 * CPUs since it called schedule(), thus the 'rq' on its stack
3108 * frame will be invalid. 3108 * frame will be invalid.
3109 */ 3109 */
3110 finish_task_switch(this_rq(), prev); 3110 finish_task_switch(this_rq(), prev);
3111 } 3111 }
3112 3112
3113 /* 3113 /*
3114 * nr_running, nr_uninterruptible and nr_context_switches: 3114 * nr_running, nr_uninterruptible and nr_context_switches:
3115 * 3115 *
3116 * externally visible scheduler statistics: current number of runnable 3116 * externally visible scheduler statistics: current number of runnable
3117 * threads, current number of uninterruptible-sleeping threads, total 3117 * threads, current number of uninterruptible-sleeping threads, total
3118 * number of context switches performed since bootup. 3118 * number of context switches performed since bootup.
3119 */ 3119 */
3120 unsigned long nr_running(void) 3120 unsigned long nr_running(void)
3121 { 3121 {
3122 unsigned long i, sum = 0; 3122 unsigned long i, sum = 0;
3123 3123
3124 for_each_online_cpu(i) 3124 for_each_online_cpu(i)
3125 sum += cpu_rq(i)->nr_running; 3125 sum += cpu_rq(i)->nr_running;
3126 3126
3127 return sum; 3127 return sum;
3128 } 3128 }
3129 3129
3130 unsigned long nr_uninterruptible(void) 3130 unsigned long nr_uninterruptible(void)
3131 { 3131 {
3132 unsigned long i, sum = 0; 3132 unsigned long i, sum = 0;
3133 3133
3134 for_each_possible_cpu(i) 3134 for_each_possible_cpu(i)
3135 sum += cpu_rq(i)->nr_uninterruptible; 3135 sum += cpu_rq(i)->nr_uninterruptible;
3136 3136
3137 /* 3137 /*
3138 * Since we read the counters lockless, it might be slightly 3138 * Since we read the counters lockless, it might be slightly
3139 * inaccurate. Do not allow it to go below zero though: 3139 * inaccurate. Do not allow it to go below zero though:
3140 */ 3140 */
3141 if (unlikely((long)sum < 0)) 3141 if (unlikely((long)sum < 0))
3142 sum = 0; 3142 sum = 0;
3143 3143
3144 return sum; 3144 return sum;
3145 } 3145 }
3146 3146
3147 unsigned long long nr_context_switches(void) 3147 unsigned long long nr_context_switches(void)
3148 { 3148 {
3149 int i; 3149 int i;
3150 unsigned long long sum = 0; 3150 unsigned long long sum = 0;
3151 3151
3152 for_each_possible_cpu(i) 3152 for_each_possible_cpu(i)
3153 sum += cpu_rq(i)->nr_switches; 3153 sum += cpu_rq(i)->nr_switches;
3154 3154
3155 return sum; 3155 return sum;
3156 } 3156 }
3157 3157
3158 unsigned long nr_iowait(void) 3158 unsigned long nr_iowait(void)
3159 { 3159 {
3160 unsigned long i, sum = 0; 3160 unsigned long i, sum = 0;
3161 3161
3162 for_each_possible_cpu(i) 3162 for_each_possible_cpu(i)
3163 sum += atomic_read(&cpu_rq(i)->nr_iowait); 3163 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3164 3164
3165 return sum; 3165 return sum;
3166 } 3166 }
3167 3167
3168 unsigned long nr_iowait_cpu(int cpu) 3168 unsigned long nr_iowait_cpu(int cpu)
3169 { 3169 {
3170 struct rq *this = cpu_rq(cpu); 3170 struct rq *this = cpu_rq(cpu);
3171 return atomic_read(&this->nr_iowait); 3171 return atomic_read(&this->nr_iowait);
3172 } 3172 }
3173 3173
3174 unsigned long this_cpu_load(void) 3174 unsigned long this_cpu_load(void)
3175 { 3175 {
3176 struct rq *this = this_rq(); 3176 struct rq *this = this_rq();
3177 return this->cpu_load[0]; 3177 return this->cpu_load[0];
3178 } 3178 }
3179 3179
3180 3180
3181 /* Variables and functions for calc_load */ 3181 /* Variables and functions for calc_load */
3182 static atomic_long_t calc_load_tasks; 3182 static atomic_long_t calc_load_tasks;
3183 static unsigned long calc_load_update; 3183 static unsigned long calc_load_update;
3184 unsigned long avenrun[3]; 3184 unsigned long avenrun[3];
3185 EXPORT_SYMBOL(avenrun); 3185 EXPORT_SYMBOL(avenrun);
3186 3186
3187 static long calc_load_fold_active(struct rq *this_rq) 3187 static long calc_load_fold_active(struct rq *this_rq)
3188 { 3188 {
3189 long nr_active, delta = 0; 3189 long nr_active, delta = 0;
3190 3190
3191 nr_active = this_rq->nr_running; 3191 nr_active = this_rq->nr_running;
3192 nr_active += (long) this_rq->nr_uninterruptible; 3192 nr_active += (long) this_rq->nr_uninterruptible;
3193 3193
3194 if (nr_active != this_rq->calc_load_active) { 3194 if (nr_active != this_rq->calc_load_active) {
3195 delta = nr_active - this_rq->calc_load_active; 3195 delta = nr_active - this_rq->calc_load_active;
3196 this_rq->calc_load_active = nr_active; 3196 this_rq->calc_load_active = nr_active;
3197 } 3197 }
3198 3198
3199 return delta; 3199 return delta;
3200 } 3200 }
3201 3201
3202 static unsigned long 3202 static unsigned long
3203 calc_load(unsigned long load, unsigned long exp, unsigned long active) 3203 calc_load(unsigned long load, unsigned long exp, unsigned long active)
3204 { 3204 {
3205 load *= exp; 3205 load *= exp;
3206 load += active * (FIXED_1 - exp); 3206 load += active * (FIXED_1 - exp);
3207 load += 1UL << (FSHIFT - 1); 3207 load += 1UL << (FSHIFT - 1);
3208 return load >> FSHIFT; 3208 return load >> FSHIFT;
3209 } 3209 }
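
calc_load() is the fixed-point exponential decay behind the 1/5/15-minute load averages: the old value is weighted by exp, the current active count (pre-scaled by FIXED_1 in the caller, see calc_global_nohz() below) by FIXED_1 - exp, and the 1UL << (FSHIFT - 1) term rounds to nearest. A standalone sketch with the long-standing constants FSHIFT = 11 and EXP_1 = 1884 shows the 1-minute average creeping toward a steady load of 3 runnable tasks:

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1 << FSHIFT)           /* 2048 */
    #define EXP_1   1884                    /* 1/exp(5sec/1min) in fixed point */

    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            load += 1UL << (FSHIFT - 1);    /* round to nearest */
            return load >> FSHIFT;
    }

    int main(void)
    {
            unsigned long avenrun0 = 0;
            unsigned long active = 3 * FIXED_1;     /* 3 runnable tasks, pre-scaled */

            /* Four consecutive LOAD_FREQ (~5 second) periods. */
            for (int i = 1; i <= 4; i++) {
                    avenrun0 = calc_load(avenrun0, EXP_1, active);
                    printf("period %d: avenrun[0] = %4lu  -> %lu.%02lu\n", i,
                           avenrun0, avenrun0 >> FSHIFT,
                           ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
            }
            /* Prints roughly 0.24, 0.46, 0.66, 0.85 on the way toward 3.00. */
            return 0;
    }
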
3210 3210
3211 #ifdef CONFIG_NO_HZ 3211 #ifdef CONFIG_NO_HZ
3212 /* 3212 /*
3213 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3213 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
3214 * 3214 *
3215 * When making the ILB scale, we should try to pull this in as well. 3215 * When making the ILB scale, we should try to pull this in as well.
3216 */ 3216 */
3217 static atomic_long_t calc_load_tasks_idle; 3217 static atomic_long_t calc_load_tasks_idle;
3218 3218
3219 static void calc_load_account_idle(struct rq *this_rq) 3219 static void calc_load_account_idle(struct rq *this_rq)
3220 { 3220 {
3221 long delta; 3221 long delta;
3222 3222
3223 delta = calc_load_fold_active(this_rq); 3223 delta = calc_load_fold_active(this_rq);
3224 if (delta) 3224 if (delta)
3225 atomic_long_add(delta, &calc_load_tasks_idle); 3225 atomic_long_add(delta, &calc_load_tasks_idle);
3226 } 3226 }
3227 3227
3228 static long calc_load_fold_idle(void) 3228 static long calc_load_fold_idle(void)
3229 { 3229 {
3230 long delta = 0; 3230 long delta = 0;
3231 3231
3232 /* 3232 /*
3233 * It's got a race, we don't care... 3233 * It's got a race, we don't care...
3234 */ 3234 */
3235 if (atomic_long_read(&calc_load_tasks_idle)) 3235 if (atomic_long_read(&calc_load_tasks_idle))
3236 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 3236 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3237 3237
3238 return delta; 3238 return delta;
3239 } 3239 }
3240 3240
3241 /** 3241 /**
3242 * fixed_power_int - compute: x^n, in O(log n) time 3242 * fixed_power_int - compute: x^n, in O(log n) time
3243 * 3243 *
3244 * @x: base of the power 3244 * @x: base of the power
3245 * @frac_bits: fractional bits of @x 3245 * @frac_bits: fractional bits of @x
3246 * @n: power to raise @x to. 3246 * @n: power to raise @x to.
3247 * 3247 *
3248 * By exploiting the relation between the definition of the natural power 3248 * By exploiting the relation between the definition of the natural power
3249 * function: x^n := x*x*...*x (x multiplied by itself n times), and 3249 * function: x^n := x*x*...*x (x multiplied by itself n times), and
3250 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 3250 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3251 * (where: n_i \elem {0, 1}, the binary vector representing n), 3251 * (where: n_i \elem {0, 1}, the binary vector representing n),
3252 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 3252 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3253 * of course trivially computable in O(log_2 n), the length of our binary 3253 * of course trivially computable in O(log_2 n), the length of our binary
3254 * vector. 3254 * vector.
3255 */ 3255 */
3256 static unsigned long 3256 static unsigned long
3257 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) 3257 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3258 { 3258 {
3259 unsigned long result = 1UL << frac_bits; 3259 unsigned long result = 1UL << frac_bits;
3260 3260
3261 if (n) for (;;) { 3261 if (n) for (;;) {
3262 if (n & 1) { 3262 if (n & 1) {
3263 result *= x; 3263 result *= x;
3264 result += 1UL << (frac_bits - 1); 3264 result += 1UL << (frac_bits - 1);
3265 result >>= frac_bits; 3265 result >>= frac_bits;
3266 } 3266 }
3267 n >>= 1; 3267 n >>= 1;
3268 if (!n) 3268 if (!n)
3269 break; 3269 break;
3270 x *= x; 3270 x *= x;
3271 x += 1UL << (frac_bits - 1); 3271 x += 1UL << (frac_bits - 1);
3272 x >>= frac_bits; 3272 x >>= frac_bits;
3273 } 3273 }
3274 3274
3275 return result; 3275 return result;
3276 } 3276 }
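
fixed_power_int() is ordinary binary exponentiation (square-and-multiply) carried out in frac_bits fixed point, with a round-and-renormalise step after every multiply. With the fixed-point bookkeeping stripped away, the control flow is the plain integer version below; calc_load_n() calls it with exp = EXP_1/EXP_5/EXP_15 and n = the number of missed LOAD_FREQ periods:

    /* Plain integer x^n by square-and-multiply, O(log2 n) multiplications. */
    static unsigned long ipow(unsigned long x, unsigned int n)
    {
            unsigned long result = 1;

            while (n) {
                    if (n & 1)              /* this set bit contributes x^(2^i) */
                            result *= x;
                    n >>= 1;
                    x *= x;                 /* x, x^2, x^4, x^8, ... */
            }
            return result;
    }

In the fixed-point variant every multiply is followed by += 1UL << (frac_bits - 1) and >>= frac_bits, so intermediate values stay in fixed-point scale and each step is rounded rather than truncated.
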
3277 3277
3278 /* 3278 /*
3279 * a1 = a0 * e + a * (1 - e) 3279 * a1 = a0 * e + a * (1 - e)
3280 * 3280 *
3281 * a2 = a1 * e + a * (1 - e) 3281 * a2 = a1 * e + a * (1 - e)
3282 * = (a0 * e + a * (1 - e)) * e + a * (1 - e) 3282 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3283 * = a0 * e^2 + a * (1 - e) * (1 + e) 3283 * = a0 * e^2 + a * (1 - e) * (1 + e)
3284 * 3284 *
3285 * a3 = a2 * e + a * (1 - e) 3285 * a3 = a2 * e + a * (1 - e)
3286 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) 3286 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3287 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) 3287 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3288 * 3288 *
3289 * ... 3289 * ...
3290 * 3290 *
3291 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] 3291 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3292 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) 3292 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3293 * = a0 * e^n + a * (1 - e^n) 3293 * = a0 * e^n + a * (1 - e^n)
3294 * 3294 *
3295 * [1] application of the geometric series: 3295 * [1] application of the geometric series:
3296 * 3296 *
3297 * n 1 - x^(n+1) 3297 * n 1 - x^(n+1)
3298 * S_n := \Sum x^i = ------------- 3298 * S_n := \Sum x^i = -------------
3299 * i=0 1 - x 3299 * i=0 1 - x
3300 */ 3300 */
3301 static unsigned long 3301 static unsigned long
3302 calc_load_n(unsigned long load, unsigned long exp, 3302 calc_load_n(unsigned long load, unsigned long exp,
3303 unsigned long active, unsigned int n) 3303 unsigned long active, unsigned int n)
3304 { 3304 {
3305 3305
3306 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 3306 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3307 } 3307 }
3308 3308
3309 /* 3309 /*
3310 * NO_HZ can leave us missing all per-cpu ticks calling 3310 * NO_HZ can leave us missing all per-cpu ticks calling
3311 * calc_load_account_active(), but since an idle CPU folds its delta into 3311 * calc_load_account_active(), but since an idle CPU folds its delta into
3312 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold 3312 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3313 * in the pending idle delta if our idle period crossed a load cycle boundary. 3313 * in the pending idle delta if our idle period crossed a load cycle boundary.
3314 * 3314 *
3315 * Once we've updated the global active value, we need to apply the exponential 3315 * Once we've updated the global active value, we need to apply the exponential
3316 * weights adjusted to the number of cycles missed. 3316 * weights adjusted to the number of cycles missed.
3317 */ 3317 */
3318 static void calc_global_nohz(unsigned long ticks) 3318 static void calc_global_nohz(unsigned long ticks)
3319 { 3319 {
3320 long delta, active, n; 3320 long delta, active, n;
3321 3321
3322 if (time_before(jiffies, calc_load_update)) 3322 if (time_before(jiffies, calc_load_update))
3323 return; 3323 return;
3324 3324
3325 /* 3325 /*
3326 * If we crossed a calc_load_update boundary, make sure to fold 3326 * If we crossed a calc_load_update boundary, make sure to fold
3327 * any pending idle changes, the respective CPUs might have 3327 * any pending idle changes, the respective CPUs might have
3328 * missed the tick driven calc_load_account_active() update 3328 * missed the tick driven calc_load_account_active() update
3329 * due to NO_HZ. 3329 * due to NO_HZ.
3330 */ 3330 */
3331 delta = calc_load_fold_idle(); 3331 delta = calc_load_fold_idle();
3332 if (delta) 3332 if (delta)
3333 atomic_long_add(delta, &calc_load_tasks); 3333 atomic_long_add(delta, &calc_load_tasks);
3334 3334
3335 /* 3335 /*
3336 * If we were idle for multiple load cycles, apply them. 3336 * If we were idle for multiple load cycles, apply them.
3337 */ 3337 */
3338 if (ticks >= LOAD_FREQ) { 3338 if (ticks >= LOAD_FREQ) {
3339 n = ticks / LOAD_FREQ; 3339 n = ticks / LOAD_FREQ;
3340 3340
3341 active = atomic_long_read(&calc_load_tasks); 3341 active = atomic_long_read(&calc_load_tasks);
3342 active = active > 0 ? active * FIXED_1 : 0; 3342 active = active > 0 ? active * FIXED_1 : 0;
3343 3343
3344 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 3344 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3345 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 3345 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3346 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 3346 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3347 3347
3348 calc_load_update += n * LOAD_FREQ; 3348 calc_load_update += n * LOAD_FREQ;
3349 } 3349 }
3350 3350
3351 /* 3351 /*
3352 * It's possible that the remainder of the above division also crosses 3352 * It's possible that the remainder of the above division also crosses
3353 * a LOAD_FREQ period; the regular check in calc_global_load(), 3353 * a LOAD_FREQ period; the regular check in calc_global_load(),
3354 * which comes after this, will take care of that. 3354 * which comes after this, will take care of that.
3355 * 3355 *
3356 * Consider us being 11 ticks before a cycle completion, and us 3356 * Consider us being 11 ticks before a cycle completion, and us
3357 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will 3357 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3358 * age us 4 cycles, and the test in calc_global_load() will 3358 * age us 4 cycles, and the test in calc_global_load() will
3359 * pick up the final one. 3359 * pick up the final one.
3360 */ 3360 */
3361 } 3361 }
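A quick standalone check of the numbers used in the comment above (not kernel code; it assumes HZ = 100 and the kernel's LOAD_FREQ of 5*HZ + 1 = 501 ticks):

#include <stdio.h>

int main(void)
{
	unsigned long hz = 100;
	unsigned long load_freq = 5 * hz + 1;	/* mirrors LOAD_FREQ */
	unsigned long ticks = 4 * load_freq + 22;

	/* aged here by the loop in calc_global_nohz() */
	printf("cycles aged here: %lu\n", ticks / load_freq);	/* 4 */
	/* the 22-tick remainder still crosses the boundary that was only
	 * 11 ticks away, so calc_global_load() picks up one more cycle */
	printf("remainder: %lu ticks\n", ticks % load_freq);	/* 22 */
	return 0;
}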
3362 #else 3362 #else
3363 static void calc_load_account_idle(struct rq *this_rq) 3363 static void calc_load_account_idle(struct rq *this_rq)
3364 { 3364 {
3365 } 3365 }
3366 3366
3367 static inline long calc_load_fold_idle(void) 3367 static inline long calc_load_fold_idle(void)
3368 { 3368 {
3369 return 0; 3369 return 0;
3370 } 3370 }
3371 3371
3372 static void calc_global_nohz(unsigned long ticks) 3372 static void calc_global_nohz(unsigned long ticks)
3373 { 3373 {
3374 } 3374 }
3375 #endif 3375 #endif
3376 3376
3377 /** 3377 /**
3378 * get_avenrun - get the load average array 3378 * get_avenrun - get the load average array
3379 * @loads: pointer to dest load array 3379 * @loads: pointer to dest load array
3380 * @offset: offset to add 3380 * @offset: offset to add
3381 * @shift: shift count to shift the result left 3381 * @shift: shift count to shift the result left
3382 * 3382 *
3383 * These values are estimates at best, so no need for locking. 3383 * These values are estimates at best, so no need for locking.
3384 */ 3384 */
3385 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 3385 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3386 { 3386 {
3387 loads[0] = (avenrun[0] + offset) << shift; 3387 loads[0] = (avenrun[0] + offset) << shift;
3388 loads[1] = (avenrun[1] + offset) << shift; 3388 loads[1] = (avenrun[1] + offset) << shift;
3389 loads[2] = (avenrun[2] + offset) << shift; 3389 loads[2] = (avenrun[2] + offset) << shift;
3390 } 3390 }
3391 3391
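For context, a user-space sketch of how a reader such as /proc/loadavg turns these fixed-point values into the familiar "X.YY" numbers. The sample avenrun values, the FIXED_1/200 rounding offset, and the LOAD_INT/LOAD_FRAC decode macros are written out locally here rather than pulled from kernel headers.

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)

/* integer and two-digit fractional part of an FSHIFT fixed-point load */
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	/* pretend get_avenrun() filled this in: 0.05, 1.50, 2.25 */
	unsigned long avnrun[3] = { 102, 3072, 4608 };
	int i;

	for (i = 0; i < 3; i++) {
		/* the FIXED_1/200 offset rounds to the nearest 1/100 */
		unsigned long v = avnrun[i] + FIXED_1 / 200;

		printf("%lu.%02lu ", LOAD_INT(v), LOAD_FRAC(v));
	}
	printf("\n");
	return 0;
}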
3392 /* 3392 /*
3393 * calc_global_load - update the avenrun load estimates 10 ticks after the 3393 * calc_global_load - update the avenrun load estimates 10 ticks after the
3394 * CPUs have updated calc_load_tasks. 3394 * CPUs have updated calc_load_tasks.
3395 */ 3395 */
3396 void calc_global_load(unsigned long ticks) 3396 void calc_global_load(unsigned long ticks)
3397 { 3397 {
3398 long active; 3398 long active;
3399 3399
3400 calc_global_nohz(ticks); 3400 calc_global_nohz(ticks);
3401 3401
3402 if (time_before(jiffies, calc_load_update + 10)) 3402 if (time_before(jiffies, calc_load_update + 10))
3403 return; 3403 return;
3404 3404
3405 active = atomic_long_read(&calc_load_tasks); 3405 active = atomic_long_read(&calc_load_tasks);
3406 active = active > 0 ? active * FIXED_1 : 0; 3406 active = active > 0 ? active * FIXED_1 : 0;
3407 3407
3408 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 3408 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3409 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 3409 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3410 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 3410 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3411 3411
3412 calc_load_update += LOAD_FREQ; 3412 calc_load_update += LOAD_FREQ;
3413 } 3413 }
3414 3414
3415 /* 3415 /*
3416 * Called from update_cpu_load() to periodically update this CPU's 3416 * Called from update_cpu_load() to periodically update this CPU's
3417 * active count. 3417 * active count.
3418 */ 3418 */
3419 static void calc_load_account_active(struct rq *this_rq) 3419 static void calc_load_account_active(struct rq *this_rq)
3420 { 3420 {
3421 long delta; 3421 long delta;
3422 3422
3423 if (time_before(jiffies, this_rq->calc_load_update)) 3423 if (time_before(jiffies, this_rq->calc_load_update))
3424 return; 3424 return;
3425 3425
3426 delta = calc_load_fold_active(this_rq); 3426 delta = calc_load_fold_active(this_rq);
3427 delta += calc_load_fold_idle(); 3427 delta += calc_load_fold_idle();
3428 if (delta) 3428 if (delta)
3429 atomic_long_add(delta, &calc_load_tasks); 3429 atomic_long_add(delta, &calc_load_tasks);
3430 3430
3431 this_rq->calc_load_update += LOAD_FREQ; 3431 this_rq->calc_load_update += LOAD_FREQ;
3432 } 3432 }
3433 3433
3434 /* 3434 /*
3435 * The exact cpuload at various idx values, calculated at every tick would be 3435 * The exact cpuload at various idx values, calculated at every tick would be
3436 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 3436 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3437 * 3437 *
3438 * If a cpu misses updates for n-1 ticks (because it was idle) and the update gets called 3438 * If a cpu misses updates for n-1 ticks (because it was idle) and the update gets called
3439 * on the nth tick, when the cpu may be busy, then we have: 3439 * on the nth tick, when the cpu may be busy, then we have:
3440 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 3440 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3441 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load 3441 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3442 * 3442 *
3443 * decay_load_missed() below does efficient calculation of 3443 * decay_load_missed() below does efficient calculation of
3444 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 3444 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3445 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load 3445 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3446 * 3446 *
3447 * The calculation is approximated on a 128 point scale. 3447 * The calculation is approximated on a 128 point scale.
3448 * degrade_zero_ticks is the number of ticks after which load at any 3448 * degrade_zero_ticks is the number of ticks after which load at any
3449 * particular idx is approximated to be zero. 3449 * particular idx is approximated to be zero.
3450 * degrade_factor is a precomputed table, a row for each load idx. 3450 * degrade_factor is a precomputed table, a row for each load idx.
3451 * Each column corresponds to the degradation factor for a power-of-two number 3451 * Each column corresponds to the degradation factor for a power-of-two number
3452 * of ticks, based on the 128 point scale. 3452 * of ticks, based on the 128 point scale.
3453 * Example: 3453 * Example:
3454 * row 2, col 3 (=12) says that the degradation at load idx 2 after 3454 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3455 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). 3455 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3456 * 3456 *
3457 * With these power-of-2 load factors, we can degrade the load n times 3457 * With these power-of-2 load factors, we can degrade the load n times
3458 * by looking at the 1 bits in n and doing that many mult/shifts instead of 3458 * by looking at the 1 bits in n and doing that many mult/shifts instead of
3459 * the n mult/shifts needed by the exact degradation. 3459 * the n mult/shifts needed by the exact degradation.
3460 */ 3460 */
3461 #define DEGRADE_SHIFT 7 3461 #define DEGRADE_SHIFT 7
3462 static const unsigned char 3462 static const unsigned char
3463 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 3463 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3464 static const unsigned char 3464 static const unsigned char
3465 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 3465 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3466 {0, 0, 0, 0, 0, 0, 0, 0}, 3466 {0, 0, 0, 0, 0, 0, 0, 0},
3467 {64, 32, 8, 0, 0, 0, 0, 0}, 3467 {64, 32, 8, 0, 0, 0, 0, 0},
3468 {96, 72, 40, 12, 1, 0, 0}, 3468 {96, 72, 40, 12, 1, 0, 0},
3469 {112, 98, 75, 43, 15, 1, 0}, 3469 {112, 98, 75, 43, 15, 1, 0},
3470 {120, 112, 98, 76, 45, 16, 2} }; 3470 {120, 112, 98, 76, 45, 16, 2} };
3471 3471
3472 /* 3472 /*
3473 * Update cpu_load for any missed ticks due to tickless idle. The backlog 3473 * Update cpu_load for any missed ticks due to tickless idle. The backlog
3474 * builds up only while the CPU is idle, so we just decay the old load without 3474 * builds up only while the CPU is idle, so we just decay the old load without
3475 * adding any new load. 3475 * adding any new load.
3476 */ 3476 */
3477 static unsigned long 3477 static unsigned long
3478 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 3478 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3479 { 3479 {
3480 int j = 0; 3480 int j = 0;
3481 3481
3482 if (!missed_updates) 3482 if (!missed_updates)
3483 return load; 3483 return load;
3484 3484
3485 if (missed_updates >= degrade_zero_ticks[idx]) 3485 if (missed_updates >= degrade_zero_ticks[idx])
3486 return 0; 3486 return 0;
3487 3487
3488 if (idx == 1) 3488 if (idx == 1)
3489 return load >> missed_updates; 3489 return load >> missed_updates;
3490 3490
3491 while (missed_updates) { 3491 while (missed_updates) {
3492 if (missed_updates % 2) 3492 if (missed_updates % 2)
3493 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 3493 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3494 3494
3495 missed_updates >>= 1; 3495 missed_updates >>= 1;
3496 j++; 3496 j++;
3497 } 3497 }
3498 return load; 3498 return load;
3499 } 3499 }
3500 3500
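As a sanity check on the table and helper above, a small standalone program (not kernel code, link with -lm) compares one table entry against the exact factor from the formula; it mirrors the "row 2, col 3" case called out in the comment.

#include <stdio.h>
#include <math.h>

int main(void)
{
	int idx = 2, col = 3;
	int ticks = 1 << col;				/* 8 missed ticks */
	double exact = pow((double)((1 << idx) - 1) / (1 << idx), ticks);
	double table = 12.0 / 128.0;	/* degrade_factor[2][3] on the 128 point scale */

	printf("exact (3/4)^%d = %.5f, table = %.5f\n", ticks, exact, table);
	return 0;
}

The exact factor is about 0.10011 and the table gives 0.09375, i.e. the 7-bit approximation is within one point on the 128 point scale.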
3501 /* 3501 /*
3502 * Update rq->cpu_load[] statistics. This function is usually called every 3502 * Update rq->cpu_load[] statistics. This function is usually called every
3503 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 3503 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3504 * every tick. We fix it up based on jiffies. 3504 * every tick. We fix it up based on jiffies.
3505 */ 3505 */
3506 static void update_cpu_load(struct rq *this_rq) 3506 static void update_cpu_load(struct rq *this_rq)
3507 { 3507 {
3508 unsigned long this_load = this_rq->load.weight; 3508 unsigned long this_load = this_rq->load.weight;
3509 unsigned long curr_jiffies = jiffies; 3509 unsigned long curr_jiffies = jiffies;
3510 unsigned long pending_updates; 3510 unsigned long pending_updates;
3511 int i, scale; 3511 int i, scale;
3512 3512
3513 this_rq->nr_load_updates++; 3513 this_rq->nr_load_updates++;
3514 3514
3515 /* Avoid repeated calls on same jiffy, when moving in and out of idle */ 3515 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3516 if (curr_jiffies == this_rq->last_load_update_tick) 3516 if (curr_jiffies == this_rq->last_load_update_tick)
3517 return; 3517 return;
3518 3518
3519 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 3519 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3520 this_rq->last_load_update_tick = curr_jiffies; 3520 this_rq->last_load_update_tick = curr_jiffies;
3521 3521
3522 /* Update our load: */ 3522 /* Update our load: */
3523 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 3523 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3524 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3524 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3525 unsigned long old_load, new_load; 3525 unsigned long old_load, new_load;
3526 3526
3527 /* scale is effectively 1 << i now, and >> i divides by scale */ 3527 /* scale is effectively 1 << i now, and >> i divides by scale */
3528 3528
3529 old_load = this_rq->cpu_load[i]; 3529 old_load = this_rq->cpu_load[i];
3530 old_load = decay_load_missed(old_load, pending_updates - 1, i); 3530 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3531 new_load = this_load; 3531 new_load = this_load;
3532 /* 3532 /*
3533 * Round up the averaging division if load is increasing. This 3533 * Round up the averaging division if load is increasing. This
3534 * prevents us from getting stuck on 9 if the load is 10, for 3534 * prevents us from getting stuck on 9 if the load is 10, for
3535 * example. 3535 * example.
3536 */ 3536 */
3537 if (new_load > old_load) 3537 if (new_load > old_load)
3538 new_load += scale - 1; 3538 new_load += scale - 1;
3539 3539
3540 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 3540 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3541 } 3541 }
3542 3542
3543 sched_avg_update(this_rq); 3543 sched_avg_update(this_rq);
3544 } 3544 }
3545 3545
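A minimal standalone demo (not kernel code) of the rounding applied in the loop above: without bumping the incoming load by scale - 1 when it is increasing, an old value of 9 would never reach a steady load of 10, exactly the situation the in-line comment describes.

#include <stdio.h>

int main(void)
{
	unsigned long old_load = 9, new_load = 10;
	int i = 2, scale = 1 << i;	/* cpu_load index 2, scale = 4 */
	unsigned long plain  = (old_load * (scale - 1) + new_load) >> i;
	unsigned long bumped = (old_load * (scale - 1) + new_load + scale - 1) >> i;

	printf("without round-up: %lu, with round-up: %lu\n", plain, bumped);	/* 9 vs 10 */
	return 0;
}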
3546 static void update_cpu_load_active(struct rq *this_rq) 3546 static void update_cpu_load_active(struct rq *this_rq)
3547 { 3547 {
3548 update_cpu_load(this_rq); 3548 update_cpu_load(this_rq);
3549 3549
3550 calc_load_account_active(this_rq); 3550 calc_load_account_active(this_rq);
3551 } 3551 }
3552 3552
3553 #ifdef CONFIG_SMP 3553 #ifdef CONFIG_SMP
3554 3554
3555 /* 3555 /*
3556 * sched_exec - execve() is a valuable balancing opportunity, because at 3556 * sched_exec - execve() is a valuable balancing opportunity, because at
3557 * this point the task has the smallest effective memory and cache footprint. 3557 * this point the task has the smallest effective memory and cache footprint.
3558 */ 3558 */
3559 void sched_exec(void) 3559 void sched_exec(void)
3560 { 3560 {
3561 struct task_struct *p = current; 3561 struct task_struct *p = current;
3562 unsigned long flags; 3562 unsigned long flags;
3563 int dest_cpu; 3563 int dest_cpu;
3564 3564
3565 raw_spin_lock_irqsave(&p->pi_lock, flags); 3565 raw_spin_lock_irqsave(&p->pi_lock, flags);
3566 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 3566 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3567 if (dest_cpu == smp_processor_id()) 3567 if (dest_cpu == smp_processor_id())
3568 goto unlock; 3568 goto unlock;
3569 3569
3570 if (likely(cpu_active(dest_cpu))) { 3570 if (likely(cpu_active(dest_cpu))) {
3571 struct migration_arg arg = { p, dest_cpu }; 3571 struct migration_arg arg = { p, dest_cpu };
3572 3572
3573 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3573 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3574 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 3574 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3575 return; 3575 return;
3576 } 3576 }
3577 unlock: 3577 unlock:
3578 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3578 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3579 } 3579 }
3580 3580
3581 #endif 3581 #endif
3582 3582
3583 DEFINE_PER_CPU(struct kernel_stat, kstat); 3583 DEFINE_PER_CPU(struct kernel_stat, kstat);
3584 3584
3585 EXPORT_PER_CPU_SYMBOL(kstat); 3585 EXPORT_PER_CPU_SYMBOL(kstat);
3586 3586
3587 /* 3587 /*
3588 * Return any ns on the sched_clock that have not yet been accounted in 3588 * Return any ns on the sched_clock that have not yet been accounted in
3589 * @p in case that task is currently running. 3589 * @p in case that task is currently running.
3590 * 3590 *
3591 * Called with task_rq_lock() held on @rq. 3591 * Called with task_rq_lock() held on @rq.
3592 */ 3592 */
3593 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 3593 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3594 { 3594 {
3595 u64 ns = 0; 3595 u64 ns = 0;
3596 3596
3597 if (task_current(rq, p)) { 3597 if (task_current(rq, p)) {
3598 update_rq_clock(rq); 3598 update_rq_clock(rq);
3599 ns = rq->clock_task - p->se.exec_start; 3599 ns = rq->clock_task - p->se.exec_start;
3600 if ((s64)ns < 0) 3600 if ((s64)ns < 0)
3601 ns = 0; 3601 ns = 0;
3602 } 3602 }
3603 3603
3604 return ns; 3604 return ns;
3605 } 3605 }
3606 3606
3607 unsigned long long task_delta_exec(struct task_struct *p) 3607 unsigned long long task_delta_exec(struct task_struct *p)
3608 { 3608 {
3609 unsigned long flags; 3609 unsigned long flags;
3610 struct rq *rq; 3610 struct rq *rq;
3611 u64 ns = 0; 3611 u64 ns = 0;
3612 3612
3613 rq = task_rq_lock(p, &flags); 3613 rq = task_rq_lock(p, &flags);
3614 ns = do_task_delta_exec(p, rq); 3614 ns = do_task_delta_exec(p, rq);
3615 task_rq_unlock(rq, p, &flags); 3615 task_rq_unlock(rq, p, &flags);
3616 3616
3617 return ns; 3617 return ns;
3618 } 3618 }
3619 3619
3620 /* 3620 /*
3621 * Return accounted runtime for the task. 3621 * Return accounted runtime for the task.
3622 * In case the task is currently running, return the runtime plus current's 3622 * In case the task is currently running, return the runtime plus current's
3623 * pending runtime that have not been accounted yet. 3623 * pending runtime that have not been accounted yet.
3624 */ 3624 */
3625 unsigned long long task_sched_runtime(struct task_struct *p) 3625 unsigned long long task_sched_runtime(struct task_struct *p)
3626 { 3626 {
3627 unsigned long flags; 3627 unsigned long flags;
3628 struct rq *rq; 3628 struct rq *rq;
3629 u64 ns = 0; 3629 u64 ns = 0;
3630 3630
3631 rq = task_rq_lock(p, &flags); 3631 rq = task_rq_lock(p, &flags);
3632 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3632 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3633 task_rq_unlock(rq, p, &flags); 3633 task_rq_unlock(rq, p, &flags);
3634 3634
3635 return ns; 3635 return ns;
3636 } 3636 }
3637 3637
3638 /* 3638 /*
3639 * Return sum_exec_runtime for the thread group. 3639 * Return sum_exec_runtime for the thread group.
3640 * In case the task is currently running, return the sum plus current's 3640 * In case the task is currently running, return the sum plus current's
3641 * pending runtime that have not been accounted yet. 3641 * pending runtime that have not been accounted yet.
3642 * 3642 *
3643 * Note that the thread group might have other running tasks as well, 3643 * Note that the thread group might have other running tasks as well,
3644 * so the return value does not include other pending runtime that other 3644 * so the return value does not include other pending runtime that other
3645 * running tasks might have. 3645 * running tasks might have.
3646 */ 3646 */
3647 unsigned long long thread_group_sched_runtime(struct task_struct *p) 3647 unsigned long long thread_group_sched_runtime(struct task_struct *p)
3648 { 3648 {
3649 struct task_cputime totals; 3649 struct task_cputime totals;
3650 unsigned long flags; 3650 unsigned long flags;
3651 struct rq *rq; 3651 struct rq *rq;
3652 u64 ns; 3652 u64 ns;
3653 3653
3654 rq = task_rq_lock(p, &flags); 3654 rq = task_rq_lock(p, &flags);
3655 thread_group_cputime(p, &totals); 3655 thread_group_cputime(p, &totals);
3656 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3656 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3657 task_rq_unlock(rq, p, &flags); 3657 task_rq_unlock(rq, p, &flags);
3658 3658
3659 return ns; 3659 return ns;
3660 } 3660 }
3661 3661
3662 /* 3662 /*
3663 * Account user cpu time to a process. 3663 * Account user cpu time to a process.
3664 * @p: the process that the cpu time gets accounted to 3664 * @p: the process that the cpu time gets accounted to
3665 * @cputime: the cpu time spent in user space since the last update 3665 * @cputime: the cpu time spent in user space since the last update
3666 * @cputime_scaled: cputime scaled by cpu frequency 3666 * @cputime_scaled: cputime scaled by cpu frequency
3667 */ 3667 */
3668 void account_user_time(struct task_struct *p, cputime_t cputime, 3668 void account_user_time(struct task_struct *p, cputime_t cputime,
3669 cputime_t cputime_scaled) 3669 cputime_t cputime_scaled)
3670 { 3670 {
3671 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3671 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3672 cputime64_t tmp; 3672 cputime64_t tmp;
3673 3673
3674 /* Add user time to process. */ 3674 /* Add user time to process. */
3675 p->utime = cputime_add(p->utime, cputime); 3675 p->utime = cputime_add(p->utime, cputime);
3676 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3676 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3677 account_group_user_time(p, cputime); 3677 account_group_user_time(p, cputime);
3678 3678
3679 /* Add user time to cpustat. */ 3679 /* Add user time to cpustat. */
3680 tmp = cputime_to_cputime64(cputime); 3680 tmp = cputime_to_cputime64(cputime);
3681 if (TASK_NICE(p) > 0) 3681 if (TASK_NICE(p) > 0)
3682 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3682 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3683 else 3683 else
3684 cpustat->user = cputime64_add(cpustat->user, tmp); 3684 cpustat->user = cputime64_add(cpustat->user, tmp);
3685 3685
3686 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 3686 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3687 /* Account for user time used */ 3687 /* Account for user time used */
3688 acct_update_integrals(p); 3688 acct_update_integrals(p);
3689 } 3689 }
3690 3690
3691 /* 3691 /*
3692 * Account guest cpu time to a process. 3692 * Account guest cpu time to a process.
3693 * @p: the process that the cpu time gets accounted to 3693 * @p: the process that the cpu time gets accounted to
3694 * @cputime: the cpu time spent in virtual machine since the last update 3694 * @cputime: the cpu time spent in virtual machine since the last update
3695 * @cputime_scaled: cputime scaled by cpu frequency 3695 * @cputime_scaled: cputime scaled by cpu frequency
3696 */ 3696 */
3697 static void account_guest_time(struct task_struct *p, cputime_t cputime, 3697 static void account_guest_time(struct task_struct *p, cputime_t cputime,
3698 cputime_t cputime_scaled) 3698 cputime_t cputime_scaled)
3699 { 3699 {
3700 cputime64_t tmp; 3700 cputime64_t tmp;
3701 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3701 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3702 3702
3703 tmp = cputime_to_cputime64(cputime); 3703 tmp = cputime_to_cputime64(cputime);
3704 3704
3705 /* Add guest time to process. */ 3705 /* Add guest time to process. */
3706 p->utime = cputime_add(p->utime, cputime); 3706 p->utime = cputime_add(p->utime, cputime);
3707 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3707 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3708 account_group_user_time(p, cputime); 3708 account_group_user_time(p, cputime);
3709 p->gtime = cputime_add(p->gtime, cputime); 3709 p->gtime = cputime_add(p->gtime, cputime);
3710 3710
3711 /* Add guest time to cpustat. */ 3711 /* Add guest time to cpustat. */
3712 if (TASK_NICE(p) > 0) { 3712 if (TASK_NICE(p) > 0) {
3713 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3713 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3714 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 3714 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3715 } else { 3715 } else {
3716 cpustat->user = cputime64_add(cpustat->user, tmp); 3716 cpustat->user = cputime64_add(cpustat->user, tmp);
3717 cpustat->guest = cputime64_add(cpustat->guest, tmp); 3717 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3718 } 3718 }
3719 } 3719 }
3720 3720
3721 /* 3721 /*
3722 * Account system cpu time to a process and desired cpustat field 3722 * Account system cpu time to a process and desired cpustat field
3723 * @p: the process that the cpu time gets accounted to 3723 * @p: the process that the cpu time gets accounted to
3724 * @cputime: the cpu time spent in kernel space since the last update 3724 * @cputime: the cpu time spent in kernel space since the last update
3725 * @cputime_scaled: cputime scaled by cpu frequency 3725 * @cputime_scaled: cputime scaled by cpu frequency
3726 * @target_cputime64: pointer to cpustat field that has to be updated 3726 * @target_cputime64: pointer to cpustat field that has to be updated
3727 */ 3727 */
3728 static inline 3728 static inline
3729 void __account_system_time(struct task_struct *p, cputime_t cputime, 3729 void __account_system_time(struct task_struct *p, cputime_t cputime,
3730 cputime_t cputime_scaled, cputime64_t *target_cputime64) 3730 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3731 { 3731 {
3732 cputime64_t tmp = cputime_to_cputime64(cputime); 3732 cputime64_t tmp = cputime_to_cputime64(cputime);
3733 3733
3734 /* Add system time to process. */ 3734 /* Add system time to process. */
3735 p->stime = cputime_add(p->stime, cputime); 3735 p->stime = cputime_add(p->stime, cputime);
3736 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 3736 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3737 account_group_system_time(p, cputime); 3737 account_group_system_time(p, cputime);
3738 3738
3739 /* Add system time to cpustat. */ 3739 /* Add system time to cpustat. */
3740 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 3740 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3741 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3741 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3742 3742
3743 /* Account for system time used */ 3743 /* Account for system time used */
3744 acct_update_integrals(p); 3744 acct_update_integrals(p);
3745 } 3745 }
3746 3746
3747 /* 3747 /*
3748 * Account system cpu time to a process. 3748 * Account system cpu time to a process.
3749 * @p: the process that the cpu time gets accounted to 3749 * @p: the process that the cpu time gets accounted to
3750 * @hardirq_offset: the offset to subtract from hardirq_count() 3750 * @hardirq_offset: the offset to subtract from hardirq_count()
3751 * @cputime: the cpu time spent in kernel space since the last update 3751 * @cputime: the cpu time spent in kernel space since the last update
3752 * @cputime_scaled: cputime scaled by cpu frequency 3752 * @cputime_scaled: cputime scaled by cpu frequency
3753 */ 3753 */
3754 void account_system_time(struct task_struct *p, int hardirq_offset, 3754 void account_system_time(struct task_struct *p, int hardirq_offset,
3755 cputime_t cputime, cputime_t cputime_scaled) 3755 cputime_t cputime, cputime_t cputime_scaled)
3756 { 3756 {
3757 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3757 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3758 cputime64_t *target_cputime64; 3758 cputime64_t *target_cputime64;
3759 3759
3760 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3760 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3761 account_guest_time(p, cputime, cputime_scaled); 3761 account_guest_time(p, cputime, cputime_scaled);
3762 return; 3762 return;
3763 } 3763 }
3764 3764
3765 if (hardirq_count() - hardirq_offset) 3765 if (hardirq_count() - hardirq_offset)
3766 target_cputime64 = &cpustat->irq; 3766 target_cputime64 = &cpustat->irq;
3767 else if (in_serving_softirq()) 3767 else if (in_serving_softirq())
3768 target_cputime64 = &cpustat->softirq; 3768 target_cputime64 = &cpustat->softirq;
3769 else 3769 else
3770 target_cputime64 = &cpustat->system; 3770 target_cputime64 = &cpustat->system;
3771 3771
3772 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 3772 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3773 } 3773 }
3774 3774
3775 /* 3775 /*
3776 * Account for involuntary wait time. 3776 * Account for involuntary wait time.
3777 * @cputime: the cpu time spent in involuntary wait 3777 * @cputime: the cpu time spent in involuntary wait
3778 */ 3778 */
3779 void account_steal_time(cputime_t cputime) 3779 void account_steal_time(cputime_t cputime)
3780 { 3780 {
3781 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3781 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3782 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3782 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3783 3783
3784 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 3784 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3785 } 3785 }
3786 3786
3787 /* 3787 /*
3788 * Account for idle time. 3788 * Account for idle time.
3789 * @cputime: the cpu time spent in idle wait 3789 * @cputime: the cpu time spent in idle wait
3790 */ 3790 */
3791 void account_idle_time(cputime_t cputime) 3791 void account_idle_time(cputime_t cputime)
3792 { 3792 {
3793 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3793 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3794 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3794 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3795 struct rq *rq = this_rq(); 3795 struct rq *rq = this_rq();
3796 3796
3797 if (atomic_read(&rq->nr_iowait) > 0) 3797 if (atomic_read(&rq->nr_iowait) > 0)
3798 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 3798 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3799 else 3799 else
3800 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3800 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3801 } 3801 }
3802 3802
3803 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 3803 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
3804 3804
3805 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 3805 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
3806 /* 3806 /*
3807 * Account a tick to a process and cpustat 3807 * Account a tick to a process and cpustat
3808 * @p: the process that the cpu time gets accounted to 3808 * @p: the process that the cpu time gets accounted to
3809 * @user_tick: is the tick from userspace 3809 * @user_tick: is the tick from userspace
3810 * @rq: the pointer to rq 3810 * @rq: the pointer to rq
3811 * 3811 *
3812 * Tick demultiplexing follows the order 3812 * Tick demultiplexing follows the order
3813 * - pending hardirq update 3813 * - pending hardirq update
3814 * - pending softirq update 3814 * - pending softirq update
3815 * - user_time 3815 * - user_time
3816 * - idle_time 3816 * - idle_time
3817 * - system time 3817 * - system time
3818 * - check for guest_time 3818 * - check for guest_time
3819 * - else account as system_time 3819 * - else account as system_time
3820 * 3820 *
3821 * The check for hardirq is done for both system and user time, as there is 3821 * The check for hardirq is done for both system and user time, as there is
3822 * no timer going off while we are in a hardirq and hence we may never get an 3822 * no timer going off while we are in a hardirq and hence we may never get an
3823 * opportunity to update it solely in system time. 3823 * opportunity to update it solely in system time.
3824 * p->stime and friends are only updated on system time and not on irq or 3824 * p->stime and friends are only updated on system time and not on irq or
3825 * softirq time, as those no longer count in task exec_runtime. 3825 * softirq time, as those no longer count in task exec_runtime.
3826 */ 3826 */
3827 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 3827 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3828 struct rq *rq) 3828 struct rq *rq)
3829 { 3829 {
3830 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3830 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3831 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3831 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3832 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3832 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3833 3833
3834 if (irqtime_account_hi_update()) { 3834 if (irqtime_account_hi_update()) {
3835 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3835 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3836 } else if (irqtime_account_si_update()) { 3836 } else if (irqtime_account_si_update()) {
3837 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3837 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3838 } else if (this_cpu_ksoftirqd() == p) { 3838 } else if (this_cpu_ksoftirqd() == p) {
3839 /* 3839 /*
3840 * ksoftirqd time does not get accounted in cpu_softirq_time. 3840 * ksoftirqd time does not get accounted in cpu_softirq_time.
3841 * So, we have to handle it separately here. 3841 * So, we have to handle it separately here.
3842 * Also, p->stime needs to be updated for ksoftirqd. 3842 * Also, p->stime needs to be updated for ksoftirqd.
3843 */ 3843 */
3844 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 3844 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3845 &cpustat->softirq); 3845 &cpustat->softirq);
3846 } else if (user_tick) { 3846 } else if (user_tick) {
3847 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3847 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3848 } else if (p == rq->idle) { 3848 } else if (p == rq->idle) {
3849 account_idle_time(cputime_one_jiffy); 3849 account_idle_time(cputime_one_jiffy);
3850 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 3850 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3851 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 3851 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3852 } else { 3852 } else {
3853 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 3853 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3854 &cpustat->system); 3854 &cpustat->system);
3855 } 3855 }
3856 } 3856 }
3857 3857
3858 static void irqtime_account_idle_ticks(int ticks) 3858 static void irqtime_account_idle_ticks(int ticks)
3859 { 3859 {
3860 int i; 3860 int i;
3861 struct rq *rq = this_rq(); 3861 struct rq *rq = this_rq();
3862 3862
3863 for (i = 0; i < ticks; i++) 3863 for (i = 0; i < ticks; i++)
3864 irqtime_account_process_tick(current, 0, rq); 3864 irqtime_account_process_tick(current, 0, rq);
3865 } 3865 }
3866 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 3866 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
3867 static void irqtime_account_idle_ticks(int ticks) {} 3867 static void irqtime_account_idle_ticks(int ticks) {}
3868 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 3868 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3869 struct rq *rq) {} 3869 struct rq *rq) {}
3870 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 3870 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3871 3871
3872 /* 3872 /*
3873 * Account a single tick of cpu time. 3873 * Account a single tick of cpu time.
3874 * @p: the process that the cpu time gets accounted to 3874 * @p: the process that the cpu time gets accounted to
3875 * @user_tick: indicates if the tick is a user or a system tick 3875 * @user_tick: indicates if the tick is a user or a system tick
3876 */ 3876 */
3877 void account_process_tick(struct task_struct *p, int user_tick) 3877 void account_process_tick(struct task_struct *p, int user_tick)
3878 { 3878 {
3879 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3879 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3880 struct rq *rq = this_rq(); 3880 struct rq *rq = this_rq();
3881 3881
3882 if (sched_clock_irqtime) { 3882 if (sched_clock_irqtime) {
3883 irqtime_account_process_tick(p, user_tick, rq); 3883 irqtime_account_process_tick(p, user_tick, rq);
3884 return; 3884 return;
3885 } 3885 }
3886 3886
3887 if (user_tick) 3887 if (user_tick)
3888 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3888 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3889 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3889 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3890 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 3890 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3891 one_jiffy_scaled); 3891 one_jiffy_scaled);
3892 else 3892 else
3893 account_idle_time(cputime_one_jiffy); 3893 account_idle_time(cputime_one_jiffy);
3894 } 3894 }
3895 3895
3896 /* 3896 /*
3897 * Account multiple ticks of steal time. 3897 * Account multiple ticks of steal time.
3898 * @p: the process from which the cpu time has been stolen 3898 * @p: the process from which the cpu time has been stolen
3899 * @ticks: number of stolen ticks 3899 * @ticks: number of stolen ticks
3900 */ 3900 */
3901 void account_steal_ticks(unsigned long ticks) 3901 void account_steal_ticks(unsigned long ticks)
3902 { 3902 {
3903 account_steal_time(jiffies_to_cputime(ticks)); 3903 account_steal_time(jiffies_to_cputime(ticks));
3904 } 3904 }
3905 3905
3906 /* 3906 /*
3907 * Account multiple ticks of idle time. 3907 * Account multiple ticks of idle time.
3908 * @ticks: number of ticks the cpu was idle 3908 * @ticks: number of ticks the cpu was idle
3909 */ 3909 */
3910 void account_idle_ticks(unsigned long ticks) 3910 void account_idle_ticks(unsigned long ticks)
3911 { 3911 {
3912 3912
3913 if (sched_clock_irqtime) { 3913 if (sched_clock_irqtime) {
3914 irqtime_account_idle_ticks(ticks); 3914 irqtime_account_idle_ticks(ticks);
3915 return; 3915 return;
3916 } 3916 }
3917 3917
3918 account_idle_time(jiffies_to_cputime(ticks)); 3918 account_idle_time(jiffies_to_cputime(ticks));
3919 } 3919 }
3920 3920
3921 #endif 3921 #endif
3922 3922
3923 /* 3923 /*
3924 * Use precise platform statistics if available: 3924 * Use precise platform statistics if available:
3925 */ 3925 */
3926 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 3926 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
3927 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3927 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3928 { 3928 {
3929 *ut = p->utime; 3929 *ut = p->utime;
3930 *st = p->stime; 3930 *st = p->stime;
3931 } 3931 }
3932 3932
3933 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3933 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3934 { 3934 {
3935 struct task_cputime cputime; 3935 struct task_cputime cputime;
3936 3936
3937 thread_group_cputime(p, &cputime); 3937 thread_group_cputime(p, &cputime);
3938 3938
3939 *ut = cputime.utime; 3939 *ut = cputime.utime;
3940 *st = cputime.stime; 3940 *st = cputime.stime;
3941 } 3941 }
3942 #else 3942 #else
3943 3943
3944 #ifndef nsecs_to_cputime 3944 #ifndef nsecs_to_cputime
3945 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3945 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3946 #endif 3946 #endif
3947 3947
3948 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3948 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3949 { 3949 {
3950 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 3950 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3951 3951
3952 /* 3952 /*
3953 * Use CFS's precise accounting: 3953 * Use CFS's precise accounting:
3954 */ 3954 */
3955 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3955 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3956 3956
3957 if (total) { 3957 if (total) {
3958 u64 temp = rtime; 3958 u64 temp = rtime;
3959 3959
3960 temp *= utime; 3960 temp *= utime;
3961 do_div(temp, total); 3961 do_div(temp, total);
3962 utime = (cputime_t)temp; 3962 utime = (cputime_t)temp;
3963 } else 3963 } else
3964 utime = rtime; 3964 utime = rtime;
3965 3965
3966 /* 3966 /*
3967 * Compare with previous values, to keep monotonicity: 3967 * Compare with previous values, to keep monotonicity:
3968 */ 3968 */
3969 p->prev_utime = max(p->prev_utime, utime); 3969 p->prev_utime = max(p->prev_utime, utime);
3970 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 3970 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
3971 3971
3972 *ut = p->prev_utime; 3972 *ut = p->prev_utime;
3973 *st = p->prev_stime; 3973 *st = p->prev_stime;
3974 } 3974 }
3975 3975
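A standalone arithmetic sketch (not kernel code) of the split performed above: the precise CFS runtime rtime is divided between user and system time in the same ratio as the tick-sampled utime/stime, before the max() clamps enforce monotonicity. All of the numbers here are invented for illustration.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t utime = 30, stime = 10;	/* tick-sampled, same units as rtime */
	uint64_t total = utime + stime;
	uint64_t rtime = 48;			/* precise sum_exec_runtime, converted */
	uint64_t scaled_utime = rtime * utime / total;	/* 48 * 30 / 40 = 36 */
	uint64_t scaled_stime = rtime - scaled_utime;	/* 12 */

	printf("utime %llu, stime %llu\n",
	       (unsigned long long)scaled_utime,
	       (unsigned long long)scaled_stime);
	return 0;
}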
3976 /* 3976 /*
3977 * Must be called with siglock held. 3977 * Must be called with siglock held.
3978 */ 3978 */
3979 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3979 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3980 { 3980 {
3981 struct signal_struct *sig = p->signal; 3981 struct signal_struct *sig = p->signal;
3982 struct task_cputime cputime; 3982 struct task_cputime cputime;
3983 cputime_t rtime, utime, total; 3983 cputime_t rtime, utime, total;
3984 3984
3985 thread_group_cputime(p, &cputime); 3985 thread_group_cputime(p, &cputime);
3986 3986
3987 total = cputime_add(cputime.utime, cputime.stime); 3987 total = cputime_add(cputime.utime, cputime.stime);
3988 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3988 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3989 3989
3990 if (total) { 3990 if (total) {
3991 u64 temp = rtime; 3991 u64 temp = rtime;
3992 3992
3993 temp *= cputime.utime; 3993 temp *= cputime.utime;
3994 do_div(temp, total); 3994 do_div(temp, total);
3995 utime = (cputime_t)temp; 3995 utime = (cputime_t)temp;
3996 } else 3996 } else
3997 utime = rtime; 3997 utime = rtime;
3998 3998
3999 sig->prev_utime = max(sig->prev_utime, utime); 3999 sig->prev_utime = max(sig->prev_utime, utime);
4000 sig->prev_stime = max(sig->prev_stime, 4000 sig->prev_stime = max(sig->prev_stime,
4001 cputime_sub(rtime, sig->prev_utime)); 4001 cputime_sub(rtime, sig->prev_utime));
4002 4002
4003 *ut = sig->prev_utime; 4003 *ut = sig->prev_utime;
4004 *st = sig->prev_stime; 4004 *st = sig->prev_stime;
4005 } 4005 }
4006 #endif 4006 #endif
4007 4007
4008 /* 4008 /*
4009 * This function gets called by the timer code, with HZ frequency. 4009 * This function gets called by the timer code, with HZ frequency.
4010 * We call it with interrupts disabled. 4010 * We call it with interrupts disabled.
4011 * 4011 *
4012 * It also gets called by the fork code, when changing the parent's 4012 * It also gets called by the fork code, when changing the parent's
4013 * timeslices. 4013 * timeslices.
4014 */ 4014 */
4015 void scheduler_tick(void) 4015 void scheduler_tick(void)
4016 { 4016 {
4017 int cpu = smp_processor_id(); 4017 int cpu = smp_processor_id();
4018 struct rq *rq = cpu_rq(cpu); 4018 struct rq *rq = cpu_rq(cpu);
4019 struct task_struct *curr = rq->curr; 4019 struct task_struct *curr = rq->curr;
4020 4020
4021 sched_clock_tick(); 4021 sched_clock_tick();
4022 4022
4023 raw_spin_lock(&rq->lock); 4023 raw_spin_lock(&rq->lock);
4024 update_rq_clock(rq); 4024 update_rq_clock(rq);
4025 update_cpu_load_active(rq); 4025 update_cpu_load_active(rq);
4026 curr->sched_class->task_tick(rq, curr, 0); 4026 curr->sched_class->task_tick(rq, curr, 0);
4027 raw_spin_unlock(&rq->lock); 4027 raw_spin_unlock(&rq->lock);
4028 4028
4029 perf_event_task_tick(); 4029 perf_event_task_tick();
4030 4030
4031 #ifdef CONFIG_SMP 4031 #ifdef CONFIG_SMP
4032 rq->idle_at_tick = idle_cpu(cpu); 4032 rq->idle_at_tick = idle_cpu(cpu);
4033 trigger_load_balance(rq, cpu); 4033 trigger_load_balance(rq, cpu);
4034 #endif 4034 #endif
4035 } 4035 }
4036 4036
4037 notrace unsigned long get_parent_ip(unsigned long addr) 4037 notrace unsigned long get_parent_ip(unsigned long addr)
4038 { 4038 {
4039 if (in_lock_functions(addr)) { 4039 if (in_lock_functions(addr)) {
4040 addr = CALLER_ADDR2; 4040 addr = CALLER_ADDR2;
4041 if (in_lock_functions(addr)) 4041 if (in_lock_functions(addr))
4042 addr = CALLER_ADDR3; 4042 addr = CALLER_ADDR3;
4043 } 4043 }
4044 return addr; 4044 return addr;
4045 } 4045 }
4046 4046
4047 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4047 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4048 defined(CONFIG_PREEMPT_TRACER)) 4048 defined(CONFIG_PREEMPT_TRACER))
4049 4049
4050 void __kprobes add_preempt_count(int val) 4050 void __kprobes add_preempt_count(int val)
4051 { 4051 {
4052 #ifdef CONFIG_DEBUG_PREEMPT 4052 #ifdef CONFIG_DEBUG_PREEMPT
4053 /* 4053 /*
4054 * Underflow? 4054 * Underflow?
4055 */ 4055 */
4056 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4056 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4057 return; 4057 return;
4058 #endif 4058 #endif
4059 preempt_count() += val; 4059 preempt_count() += val;
4060 #ifdef CONFIG_DEBUG_PREEMPT 4060 #ifdef CONFIG_DEBUG_PREEMPT
4061 /* 4061 /*
4062 * Spinlock count overflowing soon? 4062 * Spinlock count overflowing soon?
4063 */ 4063 */
4064 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4064 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4065 PREEMPT_MASK - 10); 4065 PREEMPT_MASK - 10);
4066 #endif 4066 #endif
4067 if (preempt_count() == val) 4067 if (preempt_count() == val)
4068 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 4068 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4069 } 4069 }
4070 EXPORT_SYMBOL(add_preempt_count); 4070 EXPORT_SYMBOL(add_preempt_count);
4071 4071
4072 void __kprobes sub_preempt_count(int val) 4072 void __kprobes sub_preempt_count(int val)
4073 { 4073 {
4074 #ifdef CONFIG_DEBUG_PREEMPT 4074 #ifdef CONFIG_DEBUG_PREEMPT
4075 /* 4075 /*
4076 * Underflow? 4076 * Underflow?
4077 */ 4077 */
4078 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4078 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4079 return; 4079 return;
4080 /* 4080 /*
4081 * Is the spinlock portion underflowing? 4081 * Is the spinlock portion underflowing?
4082 */ 4082 */
4083 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4083 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4084 !(preempt_count() & PREEMPT_MASK))) 4084 !(preempt_count() & PREEMPT_MASK)))
4085 return; 4085 return;
4086 #endif 4086 #endif
4087 4087
4088 if (preempt_count() == val) 4088 if (preempt_count() == val)
4089 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 4089 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4090 preempt_count() -= val; 4090 preempt_count() -= val;
4091 } 4091 }
4092 EXPORT_SYMBOL(sub_preempt_count); 4092 EXPORT_SYMBOL(sub_preempt_count);
4093 4093
4094 #endif 4094 #endif
4095 4095
4096 /* 4096 /*
4097 * Print scheduling while atomic bug: 4097 * Print scheduling while atomic bug:
4098 */ 4098 */
4099 static noinline void __schedule_bug(struct task_struct *prev) 4099 static noinline void __schedule_bug(struct task_struct *prev)
4100 { 4100 {
4101 struct pt_regs *regs = get_irq_regs(); 4101 struct pt_regs *regs = get_irq_regs();
4102 4102
4103 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 4103 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4104 prev->comm, prev->pid, preempt_count()); 4104 prev->comm, prev->pid, preempt_count());
4105 4105
4106 debug_show_held_locks(prev); 4106 debug_show_held_locks(prev);
4107 print_modules(); 4107 print_modules();
4108 if (irqs_disabled()) 4108 if (irqs_disabled())
4109 print_irqtrace_events(prev); 4109 print_irqtrace_events(prev);
4110 4110
4111 if (regs) 4111 if (regs)
4112 show_regs(regs); 4112 show_regs(regs);
4113 else 4113 else
4114 dump_stack(); 4114 dump_stack();
4115 } 4115 }
4116 4116
4117 /* 4117 /*
4118 * Various schedule()-time debugging checks and statistics: 4118 * Various schedule()-time debugging checks and statistics:
4119 */ 4119 */
4120 static inline void schedule_debug(struct task_struct *prev) 4120 static inline void schedule_debug(struct task_struct *prev)
4121 { 4121 {
4122 /* 4122 /*
4123 * Test if we are atomic. Since do_exit() needs to call into 4123 * Test if we are atomic. Since do_exit() needs to call into
4124 * schedule() atomically, we ignore that path for now. 4124 * schedule() atomically, we ignore that path for now.
4125 * Otherwise, whine if we are scheduling when we should not be. 4125 * Otherwise, whine if we are scheduling when we should not be.
4126 */ 4126 */
4127 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 4127 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4128 __schedule_bug(prev); 4128 __schedule_bug(prev);
4129 4129
4130 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4130 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4131 4131
4132 schedstat_inc(this_rq(), sched_count); 4132 schedstat_inc(this_rq(), sched_count);
4133 #ifdef CONFIG_SCHEDSTATS 4133 #ifdef CONFIG_SCHEDSTATS
4134 if (unlikely(prev->lock_depth >= 0)) { 4134 if (unlikely(prev->lock_depth >= 0)) {
4135 schedstat_inc(this_rq(), rq_sched_info.bkl_count); 4135 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4136 schedstat_inc(prev, sched_info.bkl_count); 4136 schedstat_inc(prev, sched_info.bkl_count);
4137 } 4137 }
4138 #endif 4138 #endif
4139 } 4139 }
4140 4140
4141 static void put_prev_task(struct rq *rq, struct task_struct *prev) 4141 static void put_prev_task(struct rq *rq, struct task_struct *prev)
4142 { 4142 {
4143 if (prev->on_rq) 4143 if (prev->on_rq)
4144 update_rq_clock(rq); 4144 update_rq_clock(rq);
4145 prev->sched_class->put_prev_task(rq, prev); 4145 prev->sched_class->put_prev_task(rq, prev);
4146 } 4146 }
4147 4147
4148 /* 4148 /*
4149 * Pick up the highest-prio task: 4149 * Pick up the highest-prio task:
4150 */ 4150 */
4151 static inline struct task_struct * 4151 static inline struct task_struct *
4152 pick_next_task(struct rq *rq) 4152 pick_next_task(struct rq *rq)
4153 { 4153 {
4154 const struct sched_class *class; 4154 const struct sched_class *class;
4155 struct task_struct *p; 4155 struct task_struct *p;
4156 4156
4157 /* 4157 /*
4158 * Optimization: we know that if all tasks are in 4158 * Optimization: we know that if all tasks are in
4159 * the fair class we can call that function directly: 4159 * the fair class we can call that function directly:
4160 */ 4160 */
4161 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4161 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4162 p = fair_sched_class.pick_next_task(rq); 4162 p = fair_sched_class.pick_next_task(rq);
4163 if (likely(p)) 4163 if (likely(p))
4164 return p; 4164 return p;
4165 } 4165 }
4166 4166
4167 for_each_class(class) { 4167 for_each_class(class) {
4168 p = class->pick_next_task(rq); 4168 p = class->pick_next_task(rq);
4169 if (p) 4169 if (p)
4170 return p; 4170 return p;
4171 } 4171 }
4172 4172
4173 BUG(); /* the idle class will always have a runnable task */ 4173 BUG(); /* the idle class will always have a runnable task */
4174 } 4174 }
4175 4175
4176 /* 4176 /*
4177 * schedule() is the main scheduler function. 4177 * schedule() is the main scheduler function.
4178 */ 4178 */
4179 asmlinkage void __sched schedule(void) 4179 asmlinkage void __sched schedule(void)
4180 { 4180 {
4181 struct task_struct *prev, *next; 4181 struct task_struct *prev, *next;
4182 unsigned long *switch_count; 4182 unsigned long *switch_count;
4183 struct rq *rq; 4183 struct rq *rq;
4184 int cpu; 4184 int cpu;
4185 4185
4186 need_resched: 4186 need_resched:
4187 preempt_disable(); 4187 preempt_disable();
4188 cpu = smp_processor_id(); 4188 cpu = smp_processor_id();
4189 rq = cpu_rq(cpu); 4189 rq = cpu_rq(cpu);
4190 rcu_note_context_switch(cpu); 4190 rcu_note_context_switch(cpu);
4191 prev = rq->curr; 4191 prev = rq->curr;
4192 4192
4193 schedule_debug(prev); 4193 schedule_debug(prev);
4194 4194
4195 if (sched_feat(HRTICK)) 4195 if (sched_feat(HRTICK))
4196 hrtick_clear(rq); 4196 hrtick_clear(rq);
4197 4197
4198 raw_spin_lock_irq(&rq->lock); 4198 raw_spin_lock_irq(&rq->lock);
4199 4199
4200 switch_count = &prev->nivcsw; 4200 switch_count = &prev->nivcsw;
4201 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4201 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4202 if (unlikely(signal_pending_state(prev->state, prev))) { 4202 if (unlikely(signal_pending_state(prev->state, prev))) {
4203 prev->state = TASK_RUNNING; 4203 prev->state = TASK_RUNNING;
4204 } else { 4204 } else {
4205 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4205 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4206 prev->on_rq = 0; 4206 prev->on_rq = 0;
4207 4207
4208 /* 4208 /*
4209 * If a worker went to sleep, notify and ask workqueue 4209 * If a worker went to sleep, notify and ask workqueue
4210 * whether it wants to wake up a task to maintain 4210 * whether it wants to wake up a task to maintain
4211 * concurrency. 4211 * concurrency.
4212 */ 4212 */
4213 if (prev->flags & PF_WQ_WORKER) { 4213 if (prev->flags & PF_WQ_WORKER) {
4214 struct task_struct *to_wakeup; 4214 struct task_struct *to_wakeup;
4215 4215
4216 to_wakeup = wq_worker_sleeping(prev, cpu); 4216 to_wakeup = wq_worker_sleeping(prev, cpu);
4217 if (to_wakeup) 4217 if (to_wakeup)
4218 try_to_wake_up_local(to_wakeup); 4218 try_to_wake_up_local(to_wakeup);
4219 } 4219 }
4220 4220
4221 /* 4221 /*
4222 * If we are going to sleep and we have plugged IO 4222 * If we are going to sleep and we have plugged IO
4223 * queued, make sure to submit it to avoid deadlocks. 4223 * queued, make sure to submit it to avoid deadlocks.
4224 */ 4224 */
4225 if (blk_needs_flush_plug(prev)) { 4225 if (blk_needs_flush_plug(prev)) {
4226 raw_spin_unlock(&rq->lock); 4226 raw_spin_unlock(&rq->lock);
4227 blk_schedule_flush_plug(prev); 4227 blk_schedule_flush_plug(prev);
4228 raw_spin_lock(&rq->lock); 4228 raw_spin_lock(&rq->lock);
4229 } 4229 }
4230 } 4230 }
4231 switch_count = &prev->nvcsw; 4231 switch_count = &prev->nvcsw;
4232 } 4232 }
4233 4233
4234 pre_schedule(rq, prev); 4234 pre_schedule(rq, prev);
4235 4235
4236 if (unlikely(!rq->nr_running)) 4236 if (unlikely(!rq->nr_running))
4237 idle_balance(cpu, rq); 4237 idle_balance(cpu, rq);
4238 4238
4239 put_prev_task(rq, prev); 4239 put_prev_task(rq, prev);
4240 next = pick_next_task(rq); 4240 next = pick_next_task(rq);
4241 clear_tsk_need_resched(prev); 4241 clear_tsk_need_resched(prev);
4242 rq->skip_clock_update = 0; 4242 rq->skip_clock_update = 0;
4243 4243
4244 if (likely(prev != next)) { 4244 if (likely(prev != next)) {
4245 rq->nr_switches++; 4245 rq->nr_switches++;
4246 rq->curr = next; 4246 rq->curr = next;
4247 ++*switch_count; 4247 ++*switch_count;
4248 4248
4249 context_switch(rq, prev, next); /* unlocks the rq */ 4249 context_switch(rq, prev, next); /* unlocks the rq */
4250 /* 4250 /*
4251 * The context switch has flipped the stack from under us 4251 * The context switch has flipped the stack from under us
4252 * and restored the local variables which were saved when 4252 * and restored the local variables which were saved when
4253 * this task called schedule() in the past. prev == current 4253 * this task called schedule() in the past. prev == current
4254 * is still correct, but it can be moved to another cpu/rq. 4254 * is still correct, but it can be moved to another cpu/rq.
4255 */ 4255 */
4256 cpu = smp_processor_id(); 4256 cpu = smp_processor_id();
4257 rq = cpu_rq(cpu); 4257 rq = cpu_rq(cpu);
4258 } else 4258 } else
4259 raw_spin_unlock_irq(&rq->lock); 4259 raw_spin_unlock_irq(&rq->lock);
4260 4260
4261 post_schedule(rq); 4261 post_schedule(rq);
4262 4262
4263 preempt_enable_no_resched(); 4263 preempt_enable_no_resched();
4264 if (need_resched()) 4264 if (need_resched())
4265 goto need_resched; 4265 goto need_resched;
4266 } 4266 }
4267 EXPORT_SYMBOL(schedule); 4267 EXPORT_SYMBOL(schedule);
4268 4268
4269 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4269 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4270 4270
4271 static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 4271 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4272 { 4272 {
4273 bool ret = false; 4273 bool ret = false;
4274 4274
4275 rcu_read_lock(); 4275 rcu_read_lock();
4276 if (lock->owner != owner) 4276 if (lock->owner != owner)
4277 goto fail; 4277 goto fail;
4278 4278
4279 /* 4279 /*
4280 * Ensure we emit the owner->on_cpu dereference _after_ checking that 4280 * Ensure we emit the owner->on_cpu dereference _after_ checking that
4281 * lock->owner still matches owner. If that fails, owner might 4281 * lock->owner still matches owner. If that fails, owner might
4282 * point to free()d memory; if it still matches, the rcu_read_lock() 4282 * point to free()d memory; if it still matches, the rcu_read_lock()
4283 * ensures the memory stays valid. 4283 * ensures the memory stays valid.
4284 */ 4284 */
4285 barrier(); 4285 barrier();
4286 4286
4287 ret = owner->on_cpu; 4287 ret = owner->on_cpu;
4288 fail: 4288 fail:
4289 rcu_read_unlock(); 4289 rcu_read_unlock();
4290 4290
4291 return ret; 4291 return ret;
4292 } 4292 }
4293 4293
4294 /* 4294 /*
4295 * Look out! "owner" is an entirely speculative pointer 4295 * Look out! "owner" is an entirely speculative pointer
4296 * access and not reliable. 4296 * access and not reliable.
4297 */ 4297 */
4298 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 4298 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4299 { 4299 {
4300 if (!sched_feat(OWNER_SPIN)) 4300 if (!sched_feat(OWNER_SPIN))
4301 return 0; 4301 return 0;
4302 4302
4303 while (owner_running(lock, owner)) { 4303 while (owner_running(lock, owner)) {
4304 if (need_resched()) 4304 if (need_resched())
4305 return 0; 4305 return 0;
4306 4306
4307 arch_mutex_cpu_relax(); 4307 arch_mutex_cpu_relax();
4308 } 4308 }
4309 4309
4310 /* 4310 /*
4311 * If the owner changed to another task there is likely 4311 * If the owner changed to another task there is likely
4312 * heavy contention, stop spinning. 4312 * heavy contention, stop spinning.
4313 */ 4313 */
4314 if (lock->owner) 4314 if (lock->owner)
4315 return 0; 4315 return 0;
4316 4316
4317 return 1; 4317 return 1;
4318 } 4318 }
4319 #endif 4319 #endif
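For orientation, here is a heavily simplified sketch of how an optimistic-spinning caller could use mutex_spin_on_owner(): keep spinning while the current owner stays on a CPU, and give up (and sleep) once the owner is scheduled out. demo_try_acquire() is a hypothetical stand-in for the real acquire attempt; the actual mutex slowpath in kernel/mutex.c is considerably more careful than this fragment.

    /*
     * Illustrative fragment only: 'lock' is a contended struct mutex and
     * demo_try_acquire() is a made-up stand-in for the real acquire attempt.
     */
    for (;;) {
    	struct task_struct *owner = ACCESS_ONCE(lock->owner);

    	if (owner && !mutex_spin_on_owner(lock, owner))
    		break;			/* owner went to sleep: stop spinning */

    	if (demo_try_acquire(lock))
    		return 0;		/* lock became free while we spun */

    	arch_mutex_cpu_relax();
    }
    /* ... fall back to the blocking slowpath ... */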
4320 4320
4321 #ifdef CONFIG_PREEMPT 4321 #ifdef CONFIG_PREEMPT
4322 /* 4322 /*
4323 * this is the entry point to schedule() from in-kernel preemption 4323 * this is the entry point to schedule() from in-kernel preemption
4324 * off of preempt_enable. Kernel preemptions off of the 4324 * off of preempt_enable. Kernel preemptions off of the
4325 * return-from-interrupt path occur there and call schedule() directly. 4325 * return-from-interrupt path occur there and call schedule() directly.
4326 */ 4326 */
4327 asmlinkage void __sched notrace preempt_schedule(void) 4327 asmlinkage void __sched notrace preempt_schedule(void)
4328 { 4328 {
4329 struct thread_info *ti = current_thread_info(); 4329 struct thread_info *ti = current_thread_info();
4330 4330
4331 /* 4331 /*
4332 * If there is a non-zero preempt_count or interrupts are disabled, 4332 * If there is a non-zero preempt_count or interrupts are disabled,
4333 * we do not want to preempt the current task. Just return. 4333 * we do not want to preempt the current task. Just return.
4334 */ 4334 */
4335 if (likely(ti->preempt_count || irqs_disabled())) 4335 if (likely(ti->preempt_count || irqs_disabled()))
4336 return; 4336 return;
4337 4337
4338 do { 4338 do {
4339 add_preempt_count_notrace(PREEMPT_ACTIVE); 4339 add_preempt_count_notrace(PREEMPT_ACTIVE);
4340 schedule(); 4340 schedule();
4341 sub_preempt_count_notrace(PREEMPT_ACTIVE); 4341 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4342 4342
4343 /* 4343 /*
4344 * Check again in case we missed a preemption opportunity 4344 * Check again in case we missed a preemption opportunity
4345 * between schedule and now. 4345 * between schedule and now.
4346 */ 4346 */
4347 barrier(); 4347 barrier();
4348 } while (need_resched()); 4348 } while (need_resched());
4349 } 4349 }
4350 EXPORT_SYMBOL(preempt_schedule); 4350 EXPORT_SYMBOL(preempt_schedule);
4351 4351
4352 /* 4352 /*
4353 * this is the entry point to schedule() from kernel preemption 4353 * this is the entry point to schedule() from kernel preemption
4354 * off of irq context. 4354 * off of irq context.
4355 * Note that this is called and returns with irqs disabled. This will 4355 * Note that this is called and returns with irqs disabled. This will
4356 * protect us against recursive calling from irq. 4356 * protect us against recursive calling from irq.
4357 */ 4357 */
4358 asmlinkage void __sched preempt_schedule_irq(void) 4358 asmlinkage void __sched preempt_schedule_irq(void)
4359 { 4359 {
4360 struct thread_info *ti = current_thread_info(); 4360 struct thread_info *ti = current_thread_info();
4361 4361
4362 /* Catch callers which need to be fixed */ 4362 /* Catch callers which need to be fixed */
4363 BUG_ON(ti->preempt_count || !irqs_disabled()); 4363 BUG_ON(ti->preempt_count || !irqs_disabled());
4364 4364
4365 do { 4365 do {
4366 add_preempt_count(PREEMPT_ACTIVE); 4366 add_preempt_count(PREEMPT_ACTIVE);
4367 local_irq_enable(); 4367 local_irq_enable();
4368 schedule(); 4368 schedule();
4369 local_irq_disable(); 4369 local_irq_disable();
4370 sub_preempt_count(PREEMPT_ACTIVE); 4370 sub_preempt_count(PREEMPT_ACTIVE);
4371 4371
4372 /* 4372 /*
4373 * Check again in case we missed a preemption opportunity 4373 * Check again in case we missed a preemption opportunity
4374 * between schedule and now. 4374 * between schedule and now.
4375 */ 4375 */
4376 barrier(); 4376 barrier();
4377 } while (need_resched()); 4377 } while (need_resched());
4378 } 4378 }
4379 4379
4380 #endif /* CONFIG_PREEMPT */ 4380 #endif /* CONFIG_PREEMPT */
4381 4381
4382 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 4382 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
4383 void *key) 4383 void *key)
4384 { 4384 {
4385 return try_to_wake_up(curr->private, mode, wake_flags); 4385 return try_to_wake_up(curr->private, mode, wake_flags);
4386 } 4386 }
4387 EXPORT_SYMBOL(default_wake_function); 4387 EXPORT_SYMBOL(default_wake_function);
4388 4388
4389 /* 4389 /*
4390 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 4390 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4391 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 4391 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
4392 * number) then we wake all the non-exclusive tasks and one exclusive task. 4392 * number) then we wake all the non-exclusive tasks and one exclusive task.
4393 * 4393 *
4394 * There are circumstances in which we can try to wake a task which has already 4394 * There are circumstances in which we can try to wake a task which has already
4395 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 4395 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4396 * zero in this (rare) case, and we handle it by continuing to scan the queue. 4396 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4397 */ 4397 */
4398 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 4398 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4399 int nr_exclusive, int wake_flags, void *key) 4399 int nr_exclusive, int wake_flags, void *key)
4400 { 4400 {
4401 wait_queue_t *curr, *next; 4401 wait_queue_t *curr, *next;
4402 4402
4403 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 4403 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4404 unsigned flags = curr->flags; 4404 unsigned flags = curr->flags;
4405 4405
4406 if (curr->func(curr, mode, wake_flags, key) && 4406 if (curr->func(curr, mode, wake_flags, key) &&
4407 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 4407 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4408 break; 4408 break;
4409 } 4409 }
4410 } 4410 }
4411 4411
4412 /** 4412 /**
4413 * __wake_up - wake up threads blocked on a waitqueue. 4413 * __wake_up - wake up threads blocked on a waitqueue.
4414 * @q: the waitqueue 4414 * @q: the waitqueue
4415 * @mode: which threads 4415 * @mode: which threads
4416 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4416 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4417 * @key: is directly passed to the wakeup function 4417 * @key: is directly passed to the wakeup function
4418 * 4418 *
4419 * It may be assumed that this function implies a write memory barrier before 4419 * It may be assumed that this function implies a write memory barrier before
4420 * changing the task state if and only if any tasks are woken up. 4420 * changing the task state if and only if any tasks are woken up.
4421 */ 4421 */
4422 void __wake_up(wait_queue_head_t *q, unsigned int mode, 4422 void __wake_up(wait_queue_head_t *q, unsigned int mode,
4423 int nr_exclusive, void *key) 4423 int nr_exclusive, void *key)
4424 { 4424 {
4425 unsigned long flags; 4425 unsigned long flags;
4426 4426
4427 spin_lock_irqsave(&q->lock, flags); 4427 spin_lock_irqsave(&q->lock, flags);
4428 __wake_up_common(q, mode, nr_exclusive, 0, key); 4428 __wake_up_common(q, mode, nr_exclusive, 0, key);
4429 spin_unlock_irqrestore(&q->lock, flags); 4429 spin_unlock_irqrestore(&q->lock, flags);
4430 } 4430 }
4431 EXPORT_SYMBOL(__wake_up); 4431 EXPORT_SYMBOL(__wake_up);
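Callers rarely invoke __wake_up() directly; the wake_up*() helpers in <linux/wait.h> wrap it with TASK_NORMAL and nr_exclusive == 1. A minimal sleep/wake pairing might look like the sketch below, with data_wq and data_ready invented for the example.

    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(data_wq);	/* hypothetical wait queue */
    static int data_ready;

    /* consumer side: sleep until data_ready is set (or a signal arrives) */
    static int wait_for_data(void)
    {
    	return wait_event_interruptible(data_wq, data_ready);
    }

    /* producer side: publish the condition, then wake the waiters */
    static void publish_data(void)
    {
    	data_ready = 1;
    	wake_up(&data_wq);	/* wraps __wake_up(&data_wq, TASK_NORMAL, 1, NULL) */
    }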
4432 4432
4433 /* 4433 /*
4434 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 4434 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4435 */ 4435 */
4436 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 4436 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4437 { 4437 {
4438 __wake_up_common(q, mode, 1, 0, NULL); 4438 __wake_up_common(q, mode, 1, 0, NULL);
4439 } 4439 }
4440 EXPORT_SYMBOL_GPL(__wake_up_locked); 4440 EXPORT_SYMBOL_GPL(__wake_up_locked);
4441 4441
4442 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 4442 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4443 { 4443 {
4444 __wake_up_common(q, mode, 1, 0, key); 4444 __wake_up_common(q, mode, 1, 0, key);
4445 } 4445 }
4446 EXPORT_SYMBOL_GPL(__wake_up_locked_key); 4446 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4447 4447
4448 /** 4448 /**
4449 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4449 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4450 * @q: the waitqueue 4450 * @q: the waitqueue
4451 * @mode: which threads 4451 * @mode: which threads
4452 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4452 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4453 * @key: opaque value to be passed to wakeup targets 4453 * @key: opaque value to be passed to wakeup targets
4454 * 4454 *
4455 * The sync wakeup differs in that the waker knows that it will schedule 4455 * The sync wakeup differs in that the waker knows that it will schedule
4456 * away soon, so while the target thread will be woken up, it will not 4456 * away soon, so while the target thread will be woken up, it will not
4457 * be migrated to another CPU - ie. the two threads are 'synchronized' 4457 * be migrated to another CPU - ie. the two threads are 'synchronized'
4458 * with each other. This can prevent needless bouncing between CPUs. 4458 * with each other. This can prevent needless bouncing between CPUs.
4459 * 4459 *
4460 * On UP it can prevent extra preemption. 4460 * On UP it can prevent extra preemption.
4461 * 4461 *
4462 * It may be assumed that this function implies a write memory barrier before 4462 * It may be assumed that this function implies a write memory barrier before
4463 * changing the task state if and only if any tasks are woken up. 4463 * changing the task state if and only if any tasks are woken up.
4464 */ 4464 */
4465 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 4465 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4466 int nr_exclusive, void *key) 4466 int nr_exclusive, void *key)
4467 { 4467 {
4468 unsigned long flags; 4468 unsigned long flags;
4469 int wake_flags = WF_SYNC; 4469 int wake_flags = WF_SYNC;
4470 4470
4471 if (unlikely(!q)) 4471 if (unlikely(!q))
4472 return; 4472 return;
4473 4473
4474 if (unlikely(!nr_exclusive)) 4474 if (unlikely(!nr_exclusive))
4475 wake_flags = 0; 4475 wake_flags = 0;
4476 4476
4477 spin_lock_irqsave(&q->lock, flags); 4477 spin_lock_irqsave(&q->lock, flags);
4478 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 4478 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
4479 spin_unlock_irqrestore(&q->lock, flags); 4479 spin_unlock_irqrestore(&q->lock, flags);
4480 } 4480 }
4481 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 4481 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4482 4482
4483 /* 4483 /*
4484 * __wake_up_sync - see __wake_up_sync_key() 4484 * __wake_up_sync - see __wake_up_sync_key()
4485 */ 4485 */
4486 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 4486 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4487 { 4487 {
4488 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 4488 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4489 } 4489 }
4490 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4490 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4491 4491
4492 /** 4492 /**
4493 * complete: - signals a single thread waiting on this completion 4493 * complete: - signals a single thread waiting on this completion
4494 * @x: holds the state of this particular completion 4494 * @x: holds the state of this particular completion
4495 * 4495 *
4496 * This will wake up a single thread waiting on this completion. Threads will be 4496 * This will wake up a single thread waiting on this completion. Threads will be
4497 * awakened in the same order in which they were queued. 4497 * awakened in the same order in which they were queued.
4498 * 4498 *
4499 * See also complete_all(), wait_for_completion() and related routines. 4499 * See also complete_all(), wait_for_completion() and related routines.
4500 * 4500 *
4501 * It may be assumed that this function implies a write memory barrier before 4501 * It may be assumed that this function implies a write memory barrier before
4502 * changing the task state if and only if any tasks are woken up. 4502 * changing the task state if and only if any tasks are woken up.
4503 */ 4503 */
4504 void complete(struct completion *x) 4504 void complete(struct completion *x)
4505 { 4505 {
4506 unsigned long flags; 4506 unsigned long flags;
4507 4507
4508 spin_lock_irqsave(&x->wait.lock, flags); 4508 spin_lock_irqsave(&x->wait.lock, flags);
4509 x->done++; 4509 x->done++;
4510 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 4510 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4511 spin_unlock_irqrestore(&x->wait.lock, flags); 4511 spin_unlock_irqrestore(&x->wait.lock, flags);
4512 } 4512 }
4513 EXPORT_SYMBOL(complete); 4513 EXPORT_SYMBOL(complete);
4514 4514
4515 /** 4515 /**
4516 * complete_all: - signals all threads waiting on this completion 4516 * complete_all: - signals all threads waiting on this completion
4517 * @x: holds the state of this particular completion 4517 * @x: holds the state of this particular completion
4518 * 4518 *
4519 * This will wake up all threads waiting on this particular completion event. 4519 * This will wake up all threads waiting on this particular completion event.
4520 * 4520 *
4521 * It may be assumed that this function implies a write memory barrier before 4521 * It may be assumed that this function implies a write memory barrier before
4522 * changing the task state if and only if any tasks are woken up. 4522 * changing the task state if and only if any tasks are woken up.
4523 */ 4523 */
4524 void complete_all(struct completion *x) 4524 void complete_all(struct completion *x)
4525 { 4525 {
4526 unsigned long flags; 4526 unsigned long flags;
4527 4527
4528 spin_lock_irqsave(&x->wait.lock, flags); 4528 spin_lock_irqsave(&x->wait.lock, flags);
4529 x->done += UINT_MAX/2; 4529 x->done += UINT_MAX/2;
4530 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 4530 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4531 spin_unlock_irqrestore(&x->wait.lock, flags); 4531 spin_unlock_irqrestore(&x->wait.lock, flags);
4532 } 4532 }
4533 EXPORT_SYMBOL(complete_all); 4533 EXPORT_SYMBOL(complete_all);
4534 4534
4535 static inline long __sched 4535 static inline long __sched
4536 do_wait_for_common(struct completion *x, long timeout, int state) 4536 do_wait_for_common(struct completion *x, long timeout, int state)
4537 { 4537 {
4538 if (!x->done) { 4538 if (!x->done) {
4539 DECLARE_WAITQUEUE(wait, current); 4539 DECLARE_WAITQUEUE(wait, current);
4540 4540
4541 __add_wait_queue_tail_exclusive(&x->wait, &wait); 4541 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4542 do { 4542 do {
4543 if (signal_pending_state(state, current)) { 4543 if (signal_pending_state(state, current)) {
4544 timeout = -ERESTARTSYS; 4544 timeout = -ERESTARTSYS;
4545 break; 4545 break;
4546 } 4546 }
4547 __set_current_state(state); 4547 __set_current_state(state);
4548 spin_unlock_irq(&x->wait.lock); 4548 spin_unlock_irq(&x->wait.lock);
4549 timeout = schedule_timeout(timeout); 4549 timeout = schedule_timeout(timeout);
4550 spin_lock_irq(&x->wait.lock); 4550 spin_lock_irq(&x->wait.lock);
4551 } while (!x->done && timeout); 4551 } while (!x->done && timeout);
4552 __remove_wait_queue(&x->wait, &wait); 4552 __remove_wait_queue(&x->wait, &wait);
4553 if (!x->done) 4553 if (!x->done)
4554 return timeout; 4554 return timeout;
4555 } 4555 }
4556 x->done--; 4556 x->done--;
4557 return timeout ?: 1; 4557 return timeout ?: 1;
4558 } 4558 }
4559 4559
4560 static long __sched 4560 static long __sched
4561 wait_for_common(struct completion *x, long timeout, int state) 4561 wait_for_common(struct completion *x, long timeout, int state)
4562 { 4562 {
4563 might_sleep(); 4563 might_sleep();
4564 4564
4565 spin_lock_irq(&x->wait.lock); 4565 spin_lock_irq(&x->wait.lock);
4566 timeout = do_wait_for_common(x, timeout, state); 4566 timeout = do_wait_for_common(x, timeout, state);
4567 spin_unlock_irq(&x->wait.lock); 4567 spin_unlock_irq(&x->wait.lock);
4568 return timeout; 4568 return timeout;
4569 } 4569 }
4570 4570
4571 /** 4571 /**
4572 * wait_for_completion: - waits for completion of a task 4572 * wait_for_completion: - waits for completion of a task
4573 * @x: holds the state of this particular completion 4573 * @x: holds the state of this particular completion
4574 * 4574 *
4575 * This waits to be signaled for completion of a specific task. It is NOT 4575 * This waits to be signaled for completion of a specific task. It is NOT
4576 * interruptible and there is no timeout. 4576 * interruptible and there is no timeout.
4577 * 4577 *
4578 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout 4578 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4579 * and interrupt capability. Also see complete(). 4579 * and interrupt capability. Also see complete().
4580 */ 4580 */
4581 void __sched wait_for_completion(struct completion *x) 4581 void __sched wait_for_completion(struct completion *x)
4582 { 4582 {
4583 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4583 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4584 } 4584 }
4585 EXPORT_SYMBOL(wait_for_completion); 4585 EXPORT_SYMBOL(wait_for_completion);
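A typical pairing of complete() with wait_for_completion(), sketched with invented names (setup_done, worker_fn); error handling for kthread_run() is omitted for brevity.

    #include <linux/completion.h>
    #include <linux/kthread.h>

    static struct completion setup_done;	/* hypothetical completion */

    static int worker_fn(void *unused)
    {
    	/* ... one-time initialisation ... */
    	complete(&setup_done);		/* releases exactly one waiter */
    	return 0;
    }

    static void start_and_wait(void)
    {
    	init_completion(&setup_done);
    	kthread_run(worker_fn, NULL, "demo-worker");	/* error handling omitted */
    	wait_for_completion(&setup_done);	/* uninterruptible, no timeout */
    }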
4586 4586
4587 /** 4587 /**
4588 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 4588 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4589 * @x: holds the state of this particular completion 4589 * @x: holds the state of this particular completion
4590 * @timeout: timeout value in jiffies 4590 * @timeout: timeout value in jiffies
4591 * 4591 *
4592 * This waits for either a completion of a specific task to be signaled or for a 4592 * This waits for either a completion of a specific task to be signaled or for a
4593 * specified timeout to expire. The timeout is in jiffies. It is not 4593 * specified timeout to expire. The timeout is in jiffies. It is not
4594 * interruptible. 4594 * interruptible.
4595 */ 4595 */
4596 unsigned long __sched 4596 unsigned long __sched
4597 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4597 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4598 { 4598 {
4599 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 4599 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4600 } 4600 }
4601 EXPORT_SYMBOL(wait_for_completion_timeout); 4601 EXPORT_SYMBOL(wait_for_completion_timeout);
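The return value encodes both outcomes: 0 means the timeout expired, and any non-zero value is the number of jiffies left (at least 1, per the "timeout ?: 1" in do_wait_for_common() above). A small sketch, reusing the hypothetical setup_done completion from the previous example:

    unsigned long left;

    left = wait_for_completion_timeout(&setup_done, msecs_to_jiffies(500));
    if (!left)
    	pr_warn("demo: timed out waiting for setup\n");
    else
    	pr_info("demo: completed with %lu jiffies to spare\n", left);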
4602 4602
4603 /** 4603 /**
4604 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 4604 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4605 * @x: holds the state of this particular completion 4605 * @x: holds the state of this particular completion
4606 * 4606 *
4607 * This waits for completion of a specific task to be signaled. It is 4607 * This waits for completion of a specific task to be signaled. It is
4608 * interruptible. 4608 * interruptible.
4609 */ 4609 */
4610 int __sched wait_for_completion_interruptible(struct completion *x) 4610 int __sched wait_for_completion_interruptible(struct completion *x)
4611 { 4611 {
4612 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4612 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4613 if (t == -ERESTARTSYS) 4613 if (t == -ERESTARTSYS)
4614 return t; 4614 return t;
4615 return 0; 4615 return 0;
4616 } 4616 }
4617 EXPORT_SYMBOL(wait_for_completion_interruptible); 4617 EXPORT_SYMBOL(wait_for_completion_interruptible);
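The interruptible variant returns -ERESTARTSYS when a signal cuts the wait short and 0 on success, so callers in syscall context usually just propagate the error, as in this fragment (again using the hypothetical setup_done):

    int err = wait_for_completion_interruptible(&setup_done);
    if (err)	/* -ERESTARTSYS: a signal arrived before completion */
    	return err;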
4618 4618
4619 /** 4619 /**
4620 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 4620 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4621 * @x: holds the state of this particular completion 4621 * @x: holds the state of this particular completion
4622 * @timeout: timeout value in jiffies 4622 * @timeout: timeout value in jiffies
4623 * 4623 *
4624 * This waits for either a completion of a specific task to be signaled or for a 4624 * This waits for either a completion of a specific task to be signaled or for a
4625 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4625 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4626 */ 4626 */
4627 long __sched 4627 long __sched
4628 wait_for_completion_interruptible_timeout(struct completion *x, 4628 wait_for_completion_interruptible_timeout(struct completion *x,
4629 unsigned long timeout) 4629 unsigned long timeout)
4630 { 4630 {
4631 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 4631 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4632 } 4632 }
4633 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4633 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4634 4634
4635 /** 4635 /**
4636 * wait_for_completion_killable: - waits for completion of a task (killable) 4636 * wait_for_completion_killable: - waits for completion of a task (killable)
4637 * @x: holds the state of this particular completion 4637 * @x: holds the state of this particular completion
4638 * 4638 *
4639 * This waits to be signaled for completion of a specific task. It can be 4639 * This waits to be signaled for completion of a specific task. It can be
4640 * interrupted by a kill signal. 4640 * interrupted by a kill signal.
4641 */ 4641 */
4642 int __sched wait_for_completion_killable(struct completion *x) 4642 int __sched wait_for_completion_killable(struct completion *x)
4643 { 4643 {
4644 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4644 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4645 if (t == -ERESTARTSYS) 4645 if (t == -ERESTARTSYS)
4646 return t; 4646 return t;
4647 return 0; 4647 return 0;
4648 } 4648 }
4649 EXPORT_SYMBOL(wait_for_completion_killable); 4649 EXPORT_SYMBOL(wait_for_completion_killable);
4650 4650
4651 /** 4651 /**
4652 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 4652 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4653 * @x: holds the state of this particular completion 4653 * @x: holds the state of this particular completion
4654 * @timeout: timeout value in jiffies 4654 * @timeout: timeout value in jiffies
4655 * 4655 *
4656 * This waits for either a completion of a specific task to be 4656 * This waits for either a completion of a specific task to be
4657 * signaled or for a specified timeout to expire. It can be 4657 * signaled or for a specified timeout to expire. It can be
4658 * interrupted by a kill signal. The timeout is in jiffies. 4658 * interrupted by a kill signal. The timeout is in jiffies.
4659 */ 4659 */
4660 long __sched 4660 long __sched
4661 wait_for_completion_killable_timeout(struct completion *x, 4661 wait_for_completion_killable_timeout(struct completion *x,
4662 unsigned long timeout) 4662 unsigned long timeout)
4663 { 4663 {
4664 return wait_for_common(x, timeout, TASK_KILLABLE); 4664 return wait_for_common(x, timeout, TASK_KILLABLE);
4665 } 4665 }
4666 EXPORT_SYMBOL(wait_for_completion_killable_timeout); 4666 EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4667 4667
4668 /** 4668 /**
4669 * try_wait_for_completion - try to decrement a completion without blocking 4669 * try_wait_for_completion - try to decrement a completion without blocking
4670 * @x: completion structure 4670 * @x: completion structure
4671 * 4671 *
4672 * Returns: 0 if a decrement cannot be done without blocking 4672 * Returns: 0 if a decrement cannot be done without blocking
4673 * 1 if a decrement succeeded. 4673 * 1 if a decrement succeeded.
4674 * 4674 *
4675 * If a completion is being used as a counting completion, 4675 * If a completion is being used as a counting completion,
4676 * attempt to decrement the counter without blocking. This 4676 * attempt to decrement the counter without blocking. This
4677 * enables us to avoid waiting if the resource the completion 4677 * enables us to avoid waiting if the resource the completion
4678 * is protecting is not available. 4678 * is protecting is not available.
4679 */ 4679 */
4680 bool try_wait_for_completion(struct completion *x) 4680 bool try_wait_for_completion(struct completion *x)
4681 { 4681 {
4682 unsigned long flags; 4682 unsigned long flags;
4683 int ret = 1; 4683 int ret = 1;
4684 4684
4685 spin_lock_irqsave(&x->wait.lock, flags); 4685 spin_lock_irqsave(&x->wait.lock, flags);
4686 if (!x->done) 4686 if (!x->done)
4687 ret = 0; 4687 ret = 0;
4688 else 4688 else
4689 x->done--; 4689 x->done--;
4690 spin_unlock_irqrestore(&x->wait.lock, flags); 4690 spin_unlock_irqrestore(&x->wait.lock, flags);
4691 return ret; 4691 return ret;
4692 } 4692 }
4693 EXPORT_SYMBOL(try_wait_for_completion); 4693 EXPORT_SYMBOL(try_wait_for_completion);
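A sketch of the counting-completion pattern the comment above describes, with a hypothetical completion 'slots' (primed elsewhere with one complete() per free slot) and a made-up demo_use_slot() helper:

    if (try_wait_for_completion(&slots)) {
    	demo_use_slot();		/* got a slot without sleeping */
    } else {
    	/* nothing free right now; this caller chooses to block instead */
    	wait_for_completion(&slots);
    	demo_use_slot();
    }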
4694 4694
4695 /** 4695 /**
4696 * completion_done - Test to see if a completion has any waiters 4696 * completion_done - Test to see if a completion has any waiters
4697 * @x: completion structure 4697 * @x: completion structure
4698 * 4698 *
4699 * Returns: 0 if there are waiters (wait_for_completion() in progress) 4699 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4700 * 1 if there are no waiters. 4700 * 1 if there are no waiters.
4701 * 4701 *
4702 */ 4702 */
4703 bool completion_done(struct completion *x) 4703 bool completion_done(struct completion *x)
4704 { 4704 {
4705 unsigned long flags; 4705 unsigned long flags;
4706 int ret = 1; 4706 int ret = 1;
4707 4707
4708 spin_lock_irqsave(&x->wait.lock, flags); 4708 spin_lock_irqsave(&x->wait.lock, flags);
4709 if (!x->done) 4709 if (!x->done)
4710 ret = 0; 4710 ret = 0;
4711 spin_unlock_irqrestore(&x->wait.lock, flags); 4711 spin_unlock_irqrestore(&x->wait.lock, flags);
4712 return ret; 4712 return ret;
4713 } 4713 }
4714 EXPORT_SYMBOL(completion_done); 4714 EXPORT_SYMBOL(completion_done);
4715 4715
4716 static long __sched 4716 static long __sched
4717 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4717 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4718 { 4718 {
4719 unsigned long flags; 4719 unsigned long flags;
4720 wait_queue_t wait; 4720 wait_queue_t wait;
4721 4721
4722 init_waitqueue_entry(&wait, current); 4722 init_waitqueue_entry(&wait, current);
4723 4723
4724 __set_current_state(state); 4724 __set_current_state(state);
4725 4725
4726 spin_lock_irqsave(&q->lock, flags); 4726 spin_lock_irqsave(&q->lock, flags);
4727 __add_wait_queue(q, &wait); 4727 __add_wait_queue(q, &wait);
4728 spin_unlock(&q->lock); 4728 spin_unlock(&q->lock);
4729 timeout = schedule_timeout(timeout); 4729 timeout = schedule_timeout(timeout);
4730 spin_lock_irq(&q->lock); 4730 spin_lock_irq(&q->lock);
4731 __remove_wait_queue(q, &wait); 4731 __remove_wait_queue(q, &wait);
4732 spin_unlock_irqrestore(&q->lock, flags); 4732 spin_unlock_irqrestore(&q->lock, flags);
4733 4733
4734 return timeout; 4734 return timeout;
4735 } 4735 }
4736 4736
4737 void __sched interruptible_sleep_on(wait_queue_head_t *q) 4737 void __sched interruptible_sleep_on(wait_queue_head_t *q)
4738 { 4738 {
4739 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4739 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4740 } 4740 }
4741 EXPORT_SYMBOL(interruptible_sleep_on); 4741 EXPORT_SYMBOL(interruptible_sleep_on);
4742 4742
4743 long __sched 4743 long __sched
4744 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 4744 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4745 { 4745 {
4746 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 4746 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4747 } 4747 }
4748 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 4748 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4749 4749
4750 void __sched sleep_on(wait_queue_head_t *q) 4750 void __sched sleep_on(wait_queue_head_t *q)
4751 { 4751 {
4752 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4752 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4753 } 4753 }
4754 EXPORT_SYMBOL(sleep_on); 4754 EXPORT_SYMBOL(sleep_on);
4755 4755
4756 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 4756 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4757 { 4757 {
4758 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 4758 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4759 } 4759 }
4760 EXPORT_SYMBOL(sleep_on_timeout); 4760 EXPORT_SYMBOL(sleep_on_timeout);
4761 4761
4762 #ifdef CONFIG_RT_MUTEXES 4762 #ifdef CONFIG_RT_MUTEXES
4763 4763
4764 /* 4764 /*
4765 * rt_mutex_setprio - set the current priority of a task 4765 * rt_mutex_setprio - set the current priority of a task
4766 * @p: task 4766 * @p: task
4767 * @prio: prio value (kernel-internal form) 4767 * @prio: prio value (kernel-internal form)
4768 * 4768 *
4769 * This function changes the 'effective' priority of a task. It does 4769 * This function changes the 'effective' priority of a task. It does
4770 * not touch ->normal_prio like __setscheduler(). 4770 * not touch ->normal_prio like __setscheduler().
4771 * 4771 *
4772 * Used by the rt_mutex code to implement priority inheritance logic. 4772 * Used by the rt_mutex code to implement priority inheritance logic.
4773 */ 4773 */
4774 void rt_mutex_setprio(struct task_struct *p, int prio) 4774 void rt_mutex_setprio(struct task_struct *p, int prio)
4775 { 4775 {
4776 int oldprio, on_rq, running; 4776 int oldprio, on_rq, running;
4777 struct rq *rq; 4777 struct rq *rq;
4778 const struct sched_class *prev_class; 4778 const struct sched_class *prev_class;
4779 4779
4780 BUG_ON(prio < 0 || prio > MAX_PRIO); 4780 BUG_ON(prio < 0 || prio > MAX_PRIO);
4781 4781
4782 rq = __task_rq_lock(p); 4782 rq = __task_rq_lock(p);
4783 4783
4784 trace_sched_pi_setprio(p, prio); 4784 trace_sched_pi_setprio(p, prio);
4785 oldprio = p->prio; 4785 oldprio = p->prio;
4786 prev_class = p->sched_class; 4786 prev_class = p->sched_class;
4787 on_rq = p->on_rq; 4787 on_rq = p->on_rq;
4788 running = task_current(rq, p); 4788 running = task_current(rq, p);
4789 if (on_rq) 4789 if (on_rq)
4790 dequeue_task(rq, p, 0); 4790 dequeue_task(rq, p, 0);
4791 if (running) 4791 if (running)
4792 p->sched_class->put_prev_task(rq, p); 4792 p->sched_class->put_prev_task(rq, p);
4793 4793
4794 if (rt_prio(prio)) 4794 if (rt_prio(prio))
4795 p->sched_class = &rt_sched_class; 4795 p->sched_class = &rt_sched_class;
4796 else 4796 else
4797 p->sched_class = &fair_sched_class; 4797 p->sched_class = &fair_sched_class;
4798 4798
4799 p->prio = prio; 4799 p->prio = prio;
4800 4800
4801 if (running) 4801 if (running)
4802 p->sched_class->set_curr_task(rq); 4802 p->sched_class->set_curr_task(rq);
4803 if (on_rq) 4803 if (on_rq)
4804 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4804 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4805 4805
4806 check_class_changed(rq, p, prev_class, oldprio); 4806 check_class_changed(rq, p, prev_class, oldprio);
4807 __task_rq_unlock(rq); 4807 __task_rq_unlock(rq);
4808 } 4808 }
4809 4809
4810 #endif 4810 #endif
4811 4811
4812 void set_user_nice(struct task_struct *p, long nice) 4812 void set_user_nice(struct task_struct *p, long nice)
4813 { 4813 {
4814 int old_prio, delta, on_rq; 4814 int old_prio, delta, on_rq;
4815 unsigned long flags; 4815 unsigned long flags;
4816 struct rq *rq; 4816 struct rq *rq;
4817 4817
4818 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 4818 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4819 return; 4819 return;
4820 /* 4820 /*
4821 * We have to be careful: if called from sys_setpriority(), 4821 * We have to be careful: if called from sys_setpriority(),
4822 * the task might be in the middle of scheduling on another CPU. 4822 * the task might be in the middle of scheduling on another CPU.
4823 */ 4823 */
4824 rq = task_rq_lock(p, &flags); 4824 rq = task_rq_lock(p, &flags);
4825 /* 4825 /*
4826 * The RT priorities are set via sched_setscheduler(), but we still 4826 * The RT priorities are set via sched_setscheduler(), but we still
4827 * allow the 'normal' nice value to be set - but as expected 4827 * allow the 'normal' nice value to be set - but as expected
4828 * it won't have any effect on scheduling until the task is 4828 * it won't have any effect on scheduling until the task is
4829 * SCHED_FIFO/SCHED_RR: 4829 * SCHED_FIFO/SCHED_RR:
4830 */ 4830 */
4831 if (task_has_rt_policy(p)) { 4831 if (task_has_rt_policy(p)) {
4832 p->static_prio = NICE_TO_PRIO(nice); 4832 p->static_prio = NICE_TO_PRIO(nice);
4833 goto out_unlock; 4833 goto out_unlock;
4834 } 4834 }
4835 on_rq = p->on_rq; 4835 on_rq = p->on_rq;
4836 if (on_rq) 4836 if (on_rq)
4837 dequeue_task(rq, p, 0); 4837 dequeue_task(rq, p, 0);
4838 4838
4839 p->static_prio = NICE_TO_PRIO(nice); 4839 p->static_prio = NICE_TO_PRIO(nice);
4840 set_load_weight(p); 4840 set_load_weight(p);
4841 old_prio = p->prio; 4841 old_prio = p->prio;
4842 p->prio = effective_prio(p); 4842 p->prio = effective_prio(p);
4843 delta = p->prio - old_prio; 4843 delta = p->prio - old_prio;
4844 4844
4845 if (on_rq) { 4845 if (on_rq) {
4846 enqueue_task(rq, p, 0); 4846 enqueue_task(rq, p, 0);
4847 /* 4847 /*
4848 * If the task increased its priority or is running and 4848 * If the task increased its priority or is running and
4849 * lowered its priority, then reschedule its CPU: 4849 * lowered its priority, then reschedule its CPU:
4850 */ 4850 */
4851 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4851 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4852 resched_task(rq->curr); 4852 resched_task(rq->curr);
4853 } 4853 }
4854 out_unlock: 4854 out_unlock:
4855 task_rq_unlock(rq, p, &flags); 4855 task_rq_unlock(rq, p, &flags);
4856 } 4856 }
4857 EXPORT_SYMBOL(set_user_nice); 4857 EXPORT_SYMBOL(set_user_nice);
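In-kernel callers typically use set_user_nice() on threads they own, for example to push background work below the default weight; the thread below is an invented example (nice values outside [-20, 19] are rejected by the check above).

    #include <linux/sched.h>

    static int housekeeping_thread(void *unused)
    {
    	set_user_nice(current, 10);	/* run well below the default nice 0 */
    	/* ... periodic low-priority work ... */
    	return 0;
    }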
4858 4858
4859 /* 4859 /*
4860 * can_nice - check if a task can reduce its nice value 4860 * can_nice - check if a task can reduce its nice value
4861 * @p: task 4861 * @p: task
4862 * @nice: nice value 4862 * @nice: nice value
4863 */ 4863 */
4864 int can_nice(const struct task_struct *p, const int nice) 4864 int can_nice(const struct task_struct *p, const int nice)
4865 { 4865 {
4866 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4866 /* convert nice value [19,-20] to rlimit style value [1,40] */
4867 int nice_rlim = 20 - nice; 4867 int nice_rlim = 20 - nice;
4868 4868
4869 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 4869 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4870 capable(CAP_SYS_NICE)); 4870 capable(CAP_SYS_NICE));
4871 } 4871 }
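A quick worked example of the mapping above: nice 19 gives nice_rlim 1, nice 0 gives 20, and nice -20 gives 40, so a request is allowed only when that value does not exceed RLIMIT_NICE or the caller holds CAP_SYS_NICE.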
4872 4872
4873 #ifdef __ARCH_WANT_SYS_NICE 4873 #ifdef __ARCH_WANT_SYS_NICE
4874 4874
4875 /* 4875 /*
4876 * sys_nice - change the priority of the current process. 4876 * sys_nice - change the priority of the current process.
4877 * @increment: priority increment 4877 * @increment: priority increment
4878 * 4878 *
4879 * sys_setpriority is a more generic, but much slower function that 4879 * sys_setpriority is a more generic, but much slower function that
4880 * does similar things. 4880 * does similar things.
4881 */ 4881 */
4882 SYSCALL_DEFINE1(nice, int, increment) 4882 SYSCALL_DEFINE1(nice, int, increment)
4883 { 4883 {
4884 long nice, retval; 4884 long nice, retval;
4885 4885
4886 /* 4886 /*
4887 * Setpriority might change our priority at the same moment. 4887 * Setpriority might change our priority at the same moment.
4888 * We don't have to worry. Conceptually one call occurs first 4888 * We don't have to worry. Conceptually one call occurs first
4889 * and we have a single winner. 4889 * and we have a single winner.
4890 */ 4890 */
4891 if (increment < -40) 4891 if (increment < -40)
4892 increment = -40; 4892 increment = -40;
4893 if (increment > 40) 4893 if (increment > 40)
4894 increment = 40; 4894 increment = 40;
4895 4895
4896 nice = TASK_NICE(current) + increment; 4896 nice = TASK_NICE(current) + increment;
4897 if (nice < -20) 4897 if (nice < -20)
4898 nice = -20; 4898 nice = -20;
4899 if (nice > 19) 4899 if (nice > 19)
4900 nice = 19; 4900 nice = 19;
4901 4901
4902 if (increment < 0 && !can_nice(current, nice)) 4902 if (increment < 0 && !can_nice(current, nice))
4903 return -EPERM; 4903 return -EPERM;
4904 4904
4905 retval = security_task_setnice(current, nice); 4905 retval = security_task_setnice(current, nice);
4906 if (retval) 4906 if (retval)
4907 return retval; 4907 return retval;
4908 4908
4909 set_user_nice(current, nice); 4909 set_user_nice(current, nice);
4910 return 0; 4910 return 0;
4911 } 4911 }
4912 4912
4913 #endif 4913 #endif
4914 4914
4915 /** 4915 /**
4916 * task_prio - return the priority value of a given task. 4916 * task_prio - return the priority value of a given task.
4917 * @p: the task in question. 4917 * @p: the task in question.
4918 * 4918 *
4919 * This is the priority value as seen by users in /proc. 4919 * This is the priority value as seen by users in /proc.
4920 * RT tasks are offset by -200. Normal tasks are centered 4920 * RT tasks are offset by -200. Normal tasks are centered
4921 * around 0, value goes from -16 to +15. 4921 * around 0, value goes from -16 to +15.
4922 */ 4922 */
4923 int task_prio(const struct task_struct *p) 4923 int task_prio(const struct task_struct *p)
4924 { 4924 {
4925 return p->prio - MAX_RT_PRIO; 4925 return p->prio - MAX_RT_PRIO;
4926 } 4926 }
4927 4927
4928 /** 4928 /**
4929 * task_nice - return the nice value of a given task. 4929 * task_nice - return the nice value of a given task.
4930 * @p: the task in question. 4930 * @p: the task in question.
4931 */ 4931 */
4932 int task_nice(const struct task_struct *p) 4932 int task_nice(const struct task_struct *p)
4933 { 4933 {
4934 return TASK_NICE(p); 4934 return TASK_NICE(p);
4935 } 4935 }
4936 EXPORT_SYMBOL(task_nice); 4936 EXPORT_SYMBOL(task_nice);
4937 4937
4938 /** 4938 /**
4939 * idle_cpu - is a given cpu idle currently? 4939 * idle_cpu - is a given cpu idle currently?
4940 * @cpu: the processor in question. 4940 * @cpu: the processor in question.
4941 */ 4941 */
4942 int idle_cpu(int cpu) 4942 int idle_cpu(int cpu)
4943 { 4943 {
4944 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4944 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4945 } 4945 }
4946 4946
4947 /** 4947 /**
4948 * idle_task - return the idle task for a given cpu. 4948 * idle_task - return the idle task for a given cpu.
4949 * @cpu: the processor in question. 4949 * @cpu: the processor in question.
4950 */ 4950 */
4951 struct task_struct *idle_task(int cpu) 4951 struct task_struct *idle_task(int cpu)
4952 { 4952 {
4953 return cpu_rq(cpu)->idle; 4953 return cpu_rq(cpu)->idle;
4954 } 4954 }
4955 4955
4956 /** 4956 /**
4957 * find_process_by_pid - find a process with a matching PID value. 4957 * find_process_by_pid - find a process with a matching PID value.
4958 * @pid: the pid in question. 4958 * @pid: the pid in question.
4959 */ 4959 */
4960 static struct task_struct *find_process_by_pid(pid_t pid) 4960 static struct task_struct *find_process_by_pid(pid_t pid)
4961 { 4961 {
4962 return pid ? find_task_by_vpid(pid) : current; 4962 return pid ? find_task_by_vpid(pid) : current;
4963 } 4963 }
4964 4964
4965 /* Actually do priority change: must hold rq lock. */ 4965 /* Actually do priority change: must hold rq lock. */
4966 static void 4966 static void
4967 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4967 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4968 { 4968 {
4969 p->policy = policy; 4969 p->policy = policy;
4970 p->rt_priority = prio; 4970 p->rt_priority = prio;
4971 p->normal_prio = normal_prio(p); 4971 p->normal_prio = normal_prio(p);
4972 /* we are holding p->pi_lock already */ 4972 /* we are holding p->pi_lock already */
4973 p->prio = rt_mutex_getprio(p); 4973 p->prio = rt_mutex_getprio(p);
4974 if (rt_prio(p->prio)) 4974 if (rt_prio(p->prio))
4975 p->sched_class = &rt_sched_class; 4975 p->sched_class = &rt_sched_class;
4976 else 4976 else
4977 p->sched_class = &fair_sched_class; 4977 p->sched_class = &fair_sched_class;
4978 set_load_weight(p); 4978 set_load_weight(p);
4979 } 4979 }
4980 4980
4981 /* 4981 /*
4982 * check the target process has a UID that matches the current process's 4982 * check the target process has a UID that matches the current process's
4983 */ 4983 */
4984 static bool check_same_owner(struct task_struct *p) 4984 static bool check_same_owner(struct task_struct *p)
4985 { 4985 {
4986 const struct cred *cred = current_cred(), *pcred; 4986 const struct cred *cred = current_cred(), *pcred;
4987 bool match; 4987 bool match;
4988 4988
4989 rcu_read_lock(); 4989 rcu_read_lock();
4990 pcred = __task_cred(p); 4990 pcred = __task_cred(p);
4991 if (cred->user->user_ns == pcred->user->user_ns) 4991 if (cred->user->user_ns == pcred->user->user_ns)
4992 match = (cred->euid == pcred->euid || 4992 match = (cred->euid == pcred->euid ||
4993 cred->euid == pcred->uid); 4993 cred->euid == pcred->uid);
4994 else 4994 else
4995 match = false; 4995 match = false;
4996 rcu_read_unlock(); 4996 rcu_read_unlock();
4997 return match; 4997 return match;
4998 } 4998 }
4999 4999
5000 static int __sched_setscheduler(struct task_struct *p, int policy, 5000 static int __sched_setscheduler(struct task_struct *p, int policy,
5001 const struct sched_param *param, bool user) 5001 const struct sched_param *param, bool user)
5002 { 5002 {
5003 int retval, oldprio, oldpolicy = -1, on_rq, running; 5003 int retval, oldprio, oldpolicy = -1, on_rq, running;
5004 unsigned long flags; 5004 unsigned long flags;
5005 const struct sched_class *prev_class; 5005 const struct sched_class *prev_class;
5006 struct rq *rq; 5006 struct rq *rq;
5007 int reset_on_fork; 5007 int reset_on_fork;
5008 5008
5009 /* may grab non-irq protected spin_locks */ 5009 /* may grab non-irq protected spin_locks */
5010 BUG_ON(in_interrupt()); 5010 BUG_ON(in_interrupt());
5011 recheck: 5011 recheck:
5012 /* double check policy once rq lock held */ 5012 /* double check policy once rq lock held */
5013 if (policy < 0) { 5013 if (policy < 0) {
5014 reset_on_fork = p->sched_reset_on_fork; 5014 reset_on_fork = p->sched_reset_on_fork;
5015 policy = oldpolicy = p->policy; 5015 policy = oldpolicy = p->policy;
5016 } else { 5016 } else {
5017 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 5017 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
5018 policy &= ~SCHED_RESET_ON_FORK; 5018 policy &= ~SCHED_RESET_ON_FORK;
5019 5019
5020 if (policy != SCHED_FIFO && policy != SCHED_RR && 5020 if (policy != SCHED_FIFO && policy != SCHED_RR &&
5021 policy != SCHED_NORMAL && policy != SCHED_BATCH && 5021 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5022 policy != SCHED_IDLE) 5022 policy != SCHED_IDLE)
5023 return -EINVAL; 5023 return -EINVAL;
5024 } 5024 }
5025 5025
5026 /* 5026 /*
5027 * Valid priorities for SCHED_FIFO and SCHED_RR are 5027 * Valid priorities for SCHED_FIFO and SCHED_RR are
5028 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 5028 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5029 * SCHED_BATCH and SCHED_IDLE is 0. 5029 * SCHED_BATCH and SCHED_IDLE is 0.
5030 */ 5030 */
5031 if (param->sched_priority < 0 || 5031 if (param->sched_priority < 0 ||
5032 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 5032 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5033 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 5033 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5034 return -EINVAL; 5034 return -EINVAL;
5035 if (rt_policy(policy) != (param->sched_priority != 0)) 5035 if (rt_policy(policy) != (param->sched_priority != 0))
5036 return -EINVAL; 5036 return -EINVAL;
5037 5037
5038 /* 5038 /*
5039 * Allow unprivileged RT tasks to decrease priority: 5039 * Allow unprivileged RT tasks to decrease priority:
5040 */ 5040 */
5041 if (user && !capable(CAP_SYS_NICE)) { 5041 if (user && !capable(CAP_SYS_NICE)) {
5042 if (rt_policy(policy)) { 5042 if (rt_policy(policy)) {
5043 unsigned long rlim_rtprio = 5043 unsigned long rlim_rtprio =
5044 task_rlimit(p, RLIMIT_RTPRIO); 5044 task_rlimit(p, RLIMIT_RTPRIO);
5045 5045
5046 /* can't set/change the rt policy */ 5046 /* can't set/change the rt policy */
5047 if (policy != p->policy && !rlim_rtprio) 5047 if (policy != p->policy && !rlim_rtprio)
5048 return -EPERM; 5048 return -EPERM;
5049 5049
5050 /* can't increase priority */ 5050 /* can't increase priority */
5051 if (param->sched_priority > p->rt_priority && 5051 if (param->sched_priority > p->rt_priority &&
5052 param->sched_priority > rlim_rtprio) 5052 param->sched_priority > rlim_rtprio)
5053 return -EPERM; 5053 return -EPERM;
5054 } 5054 }
5055 5055
5056 /* 5056 /*
5057 * Treat SCHED_IDLE as nice 20. Only allow a switch to 5057 * Treat SCHED_IDLE as nice 20. Only allow a switch to
5058 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 5058 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
5059 */ 5059 */
5060 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 5060 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
5061 if (!can_nice(p, TASK_NICE(p))) 5061 if (!can_nice(p, TASK_NICE(p)))
5062 return -EPERM; 5062 return -EPERM;
5063 } 5063 }
5064 5064
5065 /* can't change other user's priorities */ 5065 /* can't change other user's priorities */
5066 if (!check_same_owner(p)) 5066 if (!check_same_owner(p))
5067 return -EPERM; 5067 return -EPERM;
5068 5068
5069 /* Normal users shall not reset the sched_reset_on_fork flag */ 5069 /* Normal users shall not reset the sched_reset_on_fork flag */
5070 if (p->sched_reset_on_fork && !reset_on_fork) 5070 if (p->sched_reset_on_fork && !reset_on_fork)
5071 return -EPERM; 5071 return -EPERM;
5072 } 5072 }
5073 5073
5074 if (user) { 5074 if (user) {
5075 retval = security_task_setscheduler(p); 5075 retval = security_task_setscheduler(p);
5076 if (retval) 5076 if (retval)
5077 return retval; 5077 return retval;
5078 } 5078 }
5079 5079
5080 /* 5080 /*
5081 * make sure no PI-waiters arrive (or leave) while we are 5081 * make sure no PI-waiters arrive (or leave) while we are
5082 * changing the priority of the task: 5082 * changing the priority of the task:
5083 * 5083 *
5084 * To be able to change p->policy safely, the appropriate 5084 * To be able to change p->policy safely, the appropriate
5085 * runqueue lock must be held. 5085 * runqueue lock must be held.
5086 */ 5086 */
5087 rq = task_rq_lock(p, &flags); 5087 rq = task_rq_lock(p, &flags);
5088 5088
5089 /* 5089 /*
5090 * Changing the policy of the stop threads is a very bad idea 5090 * Changing the policy of the stop threads is a very bad idea
5091 */ 5091 */
5092 if (p == rq->stop) { 5092 if (p == rq->stop) {
5093 task_rq_unlock(rq, p, &flags); 5093 task_rq_unlock(rq, p, &flags);
5094 return -EINVAL; 5094 return -EINVAL;
5095 } 5095 }
5096 5096
5097 /* 5097 /*
5098 * If not changing anything there's no need to proceed further: 5098 * If not changing anything there's no need to proceed further:
5099 */ 5099 */
5100 if (unlikely(policy == p->policy && (!rt_policy(policy) || 5100 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5101 param->sched_priority == p->rt_priority))) { 5101 param->sched_priority == p->rt_priority))) {
5102 5102
5103 __task_rq_unlock(rq); 5103 __task_rq_unlock(rq);
5104 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5104 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5105 return 0; 5105 return 0;
5106 } 5106 }
5107 5107
5108 #ifdef CONFIG_RT_GROUP_SCHED 5108 #ifdef CONFIG_RT_GROUP_SCHED
5109 if (user) { 5109 if (user) {
5110 /* 5110 /*
5111 * Do not allow realtime tasks into groups that have no runtime 5111 * Do not allow realtime tasks into groups that have no runtime
5112 * assigned. 5112 * assigned.
5113 */ 5113 */
5114 if (rt_bandwidth_enabled() && rt_policy(policy) && 5114 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5115 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5115 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5116 !task_group_is_autogroup(task_group(p))) { 5116 !task_group_is_autogroup(task_group(p))) {
5117 task_rq_unlock(rq, p, &flags); 5117 task_rq_unlock(rq, p, &flags);
5118 return -EPERM; 5118 return -EPERM;
5119 } 5119 }
5120 } 5120 }
5121 #endif 5121 #endif
5122 5122
5123 /* recheck policy now with rq lock held */ 5123 /* recheck policy now with rq lock held */
5124 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5124 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5125 policy = oldpolicy = -1; 5125 policy = oldpolicy = -1;
5126 task_rq_unlock(rq, p, &flags); 5126 task_rq_unlock(rq, p, &flags);
5127 goto recheck; 5127 goto recheck;
5128 } 5128 }
5129 on_rq = p->on_rq; 5129 on_rq = p->on_rq;
5130 running = task_current(rq, p); 5130 running = task_current(rq, p);
5131 if (on_rq) 5131 if (on_rq)
5132 deactivate_task(rq, p, 0); 5132 deactivate_task(rq, p, 0);
5133 if (running) 5133 if (running)
5134 p->sched_class->put_prev_task(rq, p); 5134 p->sched_class->put_prev_task(rq, p);
5135 5135
5136 p->sched_reset_on_fork = reset_on_fork; 5136 p->sched_reset_on_fork = reset_on_fork;
5137 5137
5138 oldprio = p->prio; 5138 oldprio = p->prio;
5139 prev_class = p->sched_class; 5139 prev_class = p->sched_class;
5140 __setscheduler(rq, p, policy, param->sched_priority); 5140 __setscheduler(rq, p, policy, param->sched_priority);
5141 5141
5142 if (running) 5142 if (running)
5143 p->sched_class->set_curr_task(rq); 5143 p->sched_class->set_curr_task(rq);
5144 if (on_rq) 5144 if (on_rq)
5145 activate_task(rq, p, 0); 5145 activate_task(rq, p, 0);
5146 5146
5147 check_class_changed(rq, p, prev_class, oldprio); 5147 check_class_changed(rq, p, prev_class, oldprio);
5148 task_rq_unlock(rq, p, &flags); 5148 task_rq_unlock(rq, p, &flags);
5149 5149
5150 rt_mutex_adjust_pi(p); 5150 rt_mutex_adjust_pi(p);
5151 5151
5152 return 0; 5152 return 0;
5153 } 5153 }
5154 5154
5155 /** 5155 /**
5156 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 5156 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5157 * @p: the task in question. 5157 * @p: the task in question.
5158 * @policy: new policy. 5158 * @policy: new policy.
5159 * @param: structure containing the new RT priority. 5159 * @param: structure containing the new RT priority.
5160 * 5160 *
5161 * NOTE that the task may be already dead. 5161 * NOTE that the task may be already dead.
5162 */ 5162 */
5163 int sched_setscheduler(struct task_struct *p, int policy, 5163 int sched_setscheduler(struct task_struct *p, int policy,
5164 const struct sched_param *param) 5164 const struct sched_param *param)
5165 { 5165 {
5166 return __sched_setscheduler(p, policy, param, true); 5166 return __sched_setscheduler(p, policy, param, true);
5167 } 5167 }
5168 EXPORT_SYMBOL_GPL(sched_setscheduler); 5168 EXPORT_SYMBOL_GPL(sched_setscheduler);
5169 5169
5170 /** 5170 /**
5171 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 5171 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5172 * @p: the task in question. 5172 * @p: the task in question.
5173 * @policy: new policy. 5173 * @policy: new policy.
5174 * @param: structure containing the new RT priority. 5174 * @param: structure containing the new RT priority.
5175 * 5175 *
5176 * Just like sched_setscheduler, only don't bother checking if the 5176 * Just like sched_setscheduler, only don't bother checking if the
5177 * current context has permission. For example, this is needed in 5177 * current context has permission. For example, this is needed in
5178 * stop_machine(): we create temporary high priority worker threads, 5178 * stop_machine(): we create temporary high priority worker threads,
5179 * but our caller might not have that capability. 5179 * but our caller might not have that capability.
5180 */ 5180 */
5181 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 5181 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5182 const struct sched_param *param) 5182 const struct sched_param *param)
5183 { 5183 {
5184 return __sched_setscheduler(p, policy, param, false); 5184 return __sched_setscheduler(p, policy, param, false);
5185 } 5185 }
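A sketch of the kernel-internal use the comment describes: promoting a kernel-created worker thread to SCHED_FIFO without a permission check. make_worker_fifo() is an invented helper and the priority choice is illustrative.

    #include <linux/sched.h>

    static void make_worker_fifo(struct task_struct *worker)
    {
    	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

    	/* no permission check: meant for threads the kernel itself created */
    	if (sched_setscheduler_nocheck(worker, SCHED_FIFO, &param))
    		pr_warn("demo: failed to switch worker to SCHED_FIFO\n");
    }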
5186 5186
5187 static int 5187 static int
5188 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5188 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5189 { 5189 {
5190 struct sched_param lparam; 5190 struct sched_param lparam;
5191 struct task_struct *p; 5191 struct task_struct *p;
5192 int retval; 5192 int retval;
5193 5193
5194 if (!param || pid < 0) 5194 if (!param || pid < 0)
5195 return -EINVAL; 5195 return -EINVAL;
5196 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 5196 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5197 return -EFAULT; 5197 return -EFAULT;
5198 5198
5199 rcu_read_lock(); 5199 rcu_read_lock();
5200 retval = -ESRCH; 5200 retval = -ESRCH;
5201 p = find_process_by_pid(pid); 5201 p = find_process_by_pid(pid);
5202 if (p != NULL) 5202 if (p != NULL)
5203 retval = sched_setscheduler(p, policy, &lparam); 5203 retval = sched_setscheduler(p, policy, &lparam);
5204 rcu_read_unlock(); 5204 rcu_read_unlock();
5205 5205
5206 return retval; 5206 return retval;
5207 } 5207 }
5208 5208
5209 /** 5209 /**
5210 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 5210 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5211 * @pid: the pid in question. 5211 * @pid: the pid in question.
5212 * @policy: new policy. 5212 * @policy: new policy.
5213 * @param: structure containing the new RT priority. 5213 * @param: structure containing the new RT priority.
5214 */ 5214 */
5215 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 5215 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5216 struct sched_param __user *, param) 5216 struct sched_param __user *, param)
5217 { 5217 {
5218 /* negative values for policy are not valid */ 5218 /* negative values for policy are not valid */
5219 if (policy < 0) 5219 if (policy < 0)
5220 return -EINVAL; 5220 return -EINVAL;
5221 5221
5222 return do_sched_setscheduler(pid, policy, param); 5222 return do_sched_setscheduler(pid, policy, param);
5223 } 5223 }
5224 5224
5225 /** 5225 /**
5226 * sys_sched_setparam - set/change the RT priority of a thread 5226 * sys_sched_setparam - set/change the RT priority of a thread
5227 * @pid: the pid in question. 5227 * @pid: the pid in question.
5228 * @param: structure containing the new RT priority. 5228 * @param: structure containing the new RT priority.
5229 */ 5229 */
5230 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 5230 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5231 { 5231 {
5232 return do_sched_setscheduler(pid, -1, param); 5232 return do_sched_setscheduler(pid, -1, param);
5233 } 5233 }
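From userspace these syscalls are normally reached through the C library wrappers declared in <sched.h>; a minimal (hypothetical) example that moves the calling thread to SCHED_FIFO:

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
    	struct sched_param sp = { .sched_priority = 10 };

    	/* pid 0 means the calling thread; needs CAP_SYS_NICE or a sufficient RLIMIT_RTPRIO */
    	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
    		perror("sched_setscheduler");
    		return 1;
    	}
    	printf("now running SCHED_FIFO at priority %d\n", sp.sched_priority);
    	return 0;
    }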
5234 5234
5235 /** 5235 /**
5236 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 5236 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5237 * @pid: the pid in question. 5237 * @pid: the pid in question.
5238 */ 5238 */
5239 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 5239 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5240 { 5240 {
5241 struct task_struct *p; 5241 struct task_struct *p;
5242 int retval; 5242 int retval;
5243 5243
5244 if (pid < 0) 5244 if (pid < 0)
5245 return -EINVAL; 5245 return -EINVAL;
5246 5246
5247 retval = -ESRCH; 5247 retval = -ESRCH;
5248 rcu_read_lock(); 5248 rcu_read_lock();
5249 p = find_process_by_pid(pid); 5249 p = find_process_by_pid(pid);
5250 if (p) { 5250 if (p) {
5251 retval = security_task_getscheduler(p); 5251 retval = security_task_getscheduler(p);
5252 if (!retval) 5252 if (!retval)
5253 retval = p->policy 5253 retval = p->policy
5254 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 5254 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5255 } 5255 }
5256 rcu_read_unlock(); 5256 rcu_read_unlock();
5257 return retval; 5257 return retval;
5258 } 5258 }
5259 5259
5260 /** 5260 /**
5261 * sys_sched_getparam - get the RT priority of a thread 5261 * sys_sched_getparam - get the RT priority of a thread
5262 * @pid: the pid in question. 5262 * @pid: the pid in question.
5263 * @param: structure containing the RT priority. 5263 * @param: structure containing the RT priority.
5264 */ 5264 */
5265 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 5265 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5266 { 5266 {
5267 struct sched_param lp; 5267 struct sched_param lp;
5268 struct task_struct *p; 5268 struct task_struct *p;
5269 int retval; 5269 int retval;
5270 5270
5271 if (!param || pid < 0) 5271 if (!param || pid < 0)
5272 return -EINVAL; 5272 return -EINVAL;
5273 5273
5274 rcu_read_lock(); 5274 rcu_read_lock();
5275 p = find_process_by_pid(pid); 5275 p = find_process_by_pid(pid);
5276 retval = -ESRCH; 5276 retval = -ESRCH;
5277 if (!p) 5277 if (!p)
5278 goto out_unlock; 5278 goto out_unlock;
5279 5279
5280 retval = security_task_getscheduler(p); 5280 retval = security_task_getscheduler(p);
5281 if (retval) 5281 if (retval)
5282 goto out_unlock; 5282 goto out_unlock;
5283 5283
5284 lp.sched_priority = p->rt_priority; 5284 lp.sched_priority = p->rt_priority;
5285 rcu_read_unlock(); 5285 rcu_read_unlock();
5286 5286
5287 /* 5287 /*
5288 * This one might sleep, we cannot do it with a spinlock held ... 5288 * This one might sleep, we cannot do it with a spinlock held ...
5289 */ 5289 */
5290 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 5290 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5291 5291
5292 return retval; 5292 return retval;
5293 5293
5294 out_unlock: 5294 out_unlock:
5295 rcu_read_unlock(); 5295 rcu_read_unlock();
5296 return retval; 5296 return retval;
5297 } 5297 }
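The read side can be exercised the same way. A small sketch (assuming the usual glibc wrappers) that prints the policy and RT priority of a pid given on the command line; note that, as the code above shows, the value returned by sched_getscheduler() may have SCHED_RESET_ON_FORK ORed into it:

#include <sys/types.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 0;	/* 0 == caller */
	struct sched_param sp;
	int policy;

	policy = sched_getscheduler(pid);
	if (policy == -1 || sched_getparam(pid, &sp) == -1) {
		perror("sched_get*");
		return 1;
	}
	printf("pid %d: policy %d, rt_priority %d\n",
	       (int)pid, policy, sp.sched_priority);
	return 0;
}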
5298 5298
5299 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 5299 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5300 { 5300 {
5301 cpumask_var_t cpus_allowed, new_mask; 5301 cpumask_var_t cpus_allowed, new_mask;
5302 struct task_struct *p; 5302 struct task_struct *p;
5303 int retval; 5303 int retval;
5304 5304
5305 get_online_cpus(); 5305 get_online_cpus();
5306 rcu_read_lock(); 5306 rcu_read_lock();
5307 5307
5308 p = find_process_by_pid(pid); 5308 p = find_process_by_pid(pid);
5309 if (!p) { 5309 if (!p) {
5310 rcu_read_unlock(); 5310 rcu_read_unlock();
5311 put_online_cpus(); 5311 put_online_cpus();
5312 return -ESRCH; 5312 return -ESRCH;
5313 } 5313 }
5314 5314
5315 /* Prevent p going away */ 5315 /* Prevent p going away */
5316 get_task_struct(p); 5316 get_task_struct(p);
5317 rcu_read_unlock(); 5317 rcu_read_unlock();
5318 5318
5319 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 5319 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5320 retval = -ENOMEM; 5320 retval = -ENOMEM;
5321 goto out_put_task; 5321 goto out_put_task;
5322 } 5322 }
5323 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 5323 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5324 retval = -ENOMEM; 5324 retval = -ENOMEM;
5325 goto out_free_cpus_allowed; 5325 goto out_free_cpus_allowed;
5326 } 5326 }
5327 retval = -EPERM; 5327 retval = -EPERM;
5328 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) 5328 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5329 goto out_unlock; 5329 goto out_unlock;
5330 5330
5331 retval = security_task_setscheduler(p); 5331 retval = security_task_setscheduler(p);
5332 if (retval) 5332 if (retval)
5333 goto out_unlock; 5333 goto out_unlock;
5334 5334
5335 cpuset_cpus_allowed(p, cpus_allowed); 5335 cpuset_cpus_allowed(p, cpus_allowed);
5336 cpumask_and(new_mask, in_mask, cpus_allowed); 5336 cpumask_and(new_mask, in_mask, cpus_allowed);
5337 again: 5337 again:
5338 retval = set_cpus_allowed_ptr(p, new_mask); 5338 retval = set_cpus_allowed_ptr(p, new_mask);
5339 5339
5340 if (!retval) { 5340 if (!retval) {
5341 cpuset_cpus_allowed(p, cpus_allowed); 5341 cpuset_cpus_allowed(p, cpus_allowed);
5342 if (!cpumask_subset(new_mask, cpus_allowed)) { 5342 if (!cpumask_subset(new_mask, cpus_allowed)) {
5343 /* 5343 /*
5344 * We must have raced with a concurrent cpuset 5344 * We must have raced with a concurrent cpuset
5345 * update. Just reset the cpus_allowed to the 5345 * update. Just reset the cpus_allowed to the
5346 * cpuset's cpus_allowed 5346 * cpuset's cpus_allowed
5347 */ 5347 */
5348 cpumask_copy(new_mask, cpus_allowed); 5348 cpumask_copy(new_mask, cpus_allowed);
5349 goto again; 5349 goto again;
5350 } 5350 }
5351 } 5351 }
5352 out_unlock: 5352 out_unlock:
5353 free_cpumask_var(new_mask); 5353 free_cpumask_var(new_mask);
5354 out_free_cpus_allowed: 5354 out_free_cpus_allowed:
5355 free_cpumask_var(cpus_allowed); 5355 free_cpumask_var(cpus_allowed);
5356 out_put_task: 5356 out_put_task:
5357 put_task_struct(p); 5357 put_task_struct(p);
5358 put_online_cpus(); 5358 put_online_cpus();
5359 return retval; 5359 return retval;
5360 } 5360 }
5361 5361
5362 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5362 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5363 struct cpumask *new_mask) 5363 struct cpumask *new_mask)
5364 { 5364 {
5365 if (len < cpumask_size()) 5365 if (len < cpumask_size())
5366 cpumask_clear(new_mask); 5366 cpumask_clear(new_mask);
5367 else if (len > cpumask_size()) 5367 else if (len > cpumask_size())
5368 len = cpumask_size(); 5368 len = cpumask_size();
5369 5369
5370 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5370 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5371 } 5371 }
5372 5372
5373 /** 5373 /**
5374 * sys_sched_setaffinity - set the cpu affinity of a process 5374 * sys_sched_setaffinity - set the cpu affinity of a process
5375 * @pid: pid of the process 5375 * @pid: pid of the process
5376 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5376 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5377 * @user_mask_ptr: user-space pointer to the new cpu mask 5377 * @user_mask_ptr: user-space pointer to the new cpu mask
5378 */ 5378 */
5379 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 5379 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5380 unsigned long __user *, user_mask_ptr) 5380 unsigned long __user *, user_mask_ptr)
5381 { 5381 {
5382 cpumask_var_t new_mask; 5382 cpumask_var_t new_mask;
5383 int retval; 5383 int retval;
5384 5384
5385 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 5385 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5386 return -ENOMEM; 5386 return -ENOMEM;
5387 5387
5388 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 5388 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5389 if (retval == 0) 5389 if (retval == 0)
5390 retval = sched_setaffinity(pid, new_mask); 5390 retval = sched_setaffinity(pid, new_mask);
5391 free_cpumask_var(new_mask); 5391 free_cpumask_var(new_mask);
5392 return retval; 5392 return retval;
5393 } 5393 }
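A user-space sketch of the matching libc call (not part of the commit), pinning the caller to CPU 0; sizeof(cpu_set_t) is the len that get_user_cpu_mask() above truncates or zero-pads against the kernel's cpumask size:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);			/* allow CPU 0 only */

	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return 1;
	}
	puts("pinned to CPU 0");
	return 0;
}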
5394 5394
5395 long sched_getaffinity(pid_t pid, struct cpumask *mask) 5395 long sched_getaffinity(pid_t pid, struct cpumask *mask)
5396 { 5396 {
5397 struct task_struct *p; 5397 struct task_struct *p;
5398 unsigned long flags; 5398 unsigned long flags;
5399 int retval; 5399 int retval;
5400 5400
5401 get_online_cpus(); 5401 get_online_cpus();
5402 rcu_read_lock(); 5402 rcu_read_lock();
5403 5403
5404 retval = -ESRCH; 5404 retval = -ESRCH;
5405 p = find_process_by_pid(pid); 5405 p = find_process_by_pid(pid);
5406 if (!p) 5406 if (!p)
5407 goto out_unlock; 5407 goto out_unlock;
5408 5408
5409 retval = security_task_getscheduler(p); 5409 retval = security_task_getscheduler(p);
5410 if (retval) 5410 if (retval)
5411 goto out_unlock; 5411 goto out_unlock;
5412 5412
5413 raw_spin_lock_irqsave(&p->pi_lock, flags); 5413 raw_spin_lock_irqsave(&p->pi_lock, flags);
5414 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5414 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5415 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5415 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5416 5416
5417 out_unlock: 5417 out_unlock:
5418 rcu_read_unlock(); 5418 rcu_read_unlock();
5419 put_online_cpus(); 5419 put_online_cpus();
5420 5420
5421 return retval; 5421 return retval;
5422 } 5422 }
5423 5423
5424 /** 5424 /**
5425 * sys_sched_getaffinity - get the cpu affinity of a process 5425 * sys_sched_getaffinity - get the cpu affinity of a process
5426 * @pid: pid of the process 5426 * @pid: pid of the process
5427 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5427 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5428 * @user_mask_ptr: user-space pointer to hold the current cpu mask 5428 * @user_mask_ptr: user-space pointer to hold the current cpu mask
5429 */ 5429 */
5430 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 5430 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5431 unsigned long __user *, user_mask_ptr) 5431 unsigned long __user *, user_mask_ptr)
5432 { 5432 {
5433 int ret; 5433 int ret;
5434 cpumask_var_t mask; 5434 cpumask_var_t mask;
5435 5435
5436 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 5436 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5437 return -EINVAL; 5437 return -EINVAL;
5438 if (len & (sizeof(unsigned long)-1)) 5438 if (len & (sizeof(unsigned long)-1))
5439 return -EINVAL; 5439 return -EINVAL;
5440 5440
5441 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 5441 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5442 return -ENOMEM; 5442 return -ENOMEM;
5443 5443
5444 ret = sched_getaffinity(pid, mask); 5444 ret = sched_getaffinity(pid, mask);
5445 if (ret == 0) { 5445 if (ret == 0) {
5446 size_t retlen = min_t(size_t, len, cpumask_size()); 5446 size_t retlen = min_t(size_t, len, cpumask_size());
5447 5447
5448 if (copy_to_user(user_mask_ptr, mask, retlen)) 5448 if (copy_to_user(user_mask_ptr, mask, retlen))
5449 ret = -EFAULT; 5449 ret = -EFAULT;
5450 else 5450 else
5451 ret = retlen; 5451 ret = retlen;
5452 } 5452 }
5453 free_cpumask_var(mask); 5453 free_cpumask_var(mask);
5454 5454
5455 return ret; 5455 return ret;
5456 } 5456 }
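Reading the mask back; the raw syscall returns the number of bytes copied (retlen above), while the glibc wrapper used in this sketch returns 0 on success:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;
	int cpu;

	if (sched_getaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_getaffinity");
		return 1;
	}
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &set))
			printf("CPU %d allowed\n", cpu);
	return 0;
}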
5457 5457
5458 /** 5458 /**
5459 * sys_sched_yield - yield the current processor to other threads. 5459 * sys_sched_yield - yield the current processor to other threads.
5460 * 5460 *
5461 * This function yields the current CPU to other tasks. If there are no 5461 * This function yields the current CPU to other tasks. If there are no
5462 * other threads running on this CPU then this function will return. 5462 * other threads running on this CPU then this function will return.
5463 */ 5463 */
5464 SYSCALL_DEFINE0(sched_yield) 5464 SYSCALL_DEFINE0(sched_yield)
5465 { 5465 {
5466 struct rq *rq = this_rq_lock(); 5466 struct rq *rq = this_rq_lock();
5467 5467
5468 schedstat_inc(rq, yld_count); 5468 schedstat_inc(rq, yld_count);
5469 current->sched_class->yield_task(rq); 5469 current->sched_class->yield_task(rq);
5470 5470
5471 /* 5471 /*
5472 * Since we are going to call schedule() anyway, there's 5472 * Since we are going to call schedule() anyway, there's
5473 * no need to preempt or enable interrupts: 5473 * no need to preempt or enable interrupts:
5474 */ 5474 */
5475 __release(rq->lock); 5475 __release(rq->lock);
5476 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 5476 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5477 do_raw_spin_unlock(&rq->lock); 5477 do_raw_spin_unlock(&rq->lock);
5478 preempt_enable_no_resched(); 5478 preempt_enable_no_resched();
5479 5479
5480 schedule(); 5480 schedule();
5481 5481
5482 return 0; 5482 return 0;
5483 } 5483 }
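A trivial user-space counterpart, sketched here with a hypothetical flag that a producer thread would normally set: the waiter spins but gives the CPU away on every pass through sys_sched_yield():

#include <sched.h>
#include <stdatomic.h>

static atomic_int data_ready;	/* hypothetical; normally set by another thread */

static void wait_for_data(void)
{
	while (!atomic_load(&data_ready))
		sched_yield();		/* lands in sys_sched_yield() above */
}

int main(void)
{
	atomic_store(&data_ready, 1);	/* stand-in for the real producer */
	wait_for_data();
	return 0;
}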
5484 5484
5485 static inline int should_resched(void) 5485 static inline int should_resched(void)
5486 { 5486 {
5487 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 5487 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5488 } 5488 }
5489 5489
5490 static void __cond_resched(void) 5490 static void __cond_resched(void)
5491 { 5491 {
5492 add_preempt_count(PREEMPT_ACTIVE); 5492 add_preempt_count(PREEMPT_ACTIVE);
5493 schedule(); 5493 schedule();
5494 sub_preempt_count(PREEMPT_ACTIVE); 5494 sub_preempt_count(PREEMPT_ACTIVE);
5495 } 5495 }
5496 5496
5497 int __sched _cond_resched(void) 5497 int __sched _cond_resched(void)
5498 { 5498 {
5499 if (should_resched()) { 5499 if (should_resched()) {
5500 __cond_resched(); 5500 __cond_resched();
5501 return 1; 5501 return 1;
5502 } 5502 }
5503 return 0; 5503 return 0;
5504 } 5504 }
5505 EXPORT_SYMBOL(_cond_resched); 5505 EXPORT_SYMBOL(_cond_resched);
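In kernel code _cond_resched() is almost always reached through the cond_resched() wrapper. A hedged sketch of a hypothetical long-running loop that stays preemption-friendly on voluntary-preemption kernels (scrub_page() and nr_pages are made up for the example):

/* Sketch only: scrub_page() is hypothetical. */
static void scrub_all_pages(unsigned long nr_pages)
{
	unsigned long i;

	for (i = 0; i < nr_pages; i++) {
		scrub_page(i);		/* hypothetical unit of work */
		cond_resched();		/* reschedules only if should_resched() */
	}
}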
5506 5506
5507 /* 5507 /*
5508 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 5508 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
5509 * call schedule, and on return reacquire the lock. 5509 * call schedule, and on return reacquire the lock.
5510 * 5510 *
5511 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 5511 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
5512 * operations here to prevent schedule() from being called twice (once via 5512 * operations here to prevent schedule() from being called twice (once via
5513 * spin_unlock(), once by hand). 5513 * spin_unlock(), once by hand).
5514 */ 5514 */
5515 int __cond_resched_lock(spinlock_t *lock) 5515 int __cond_resched_lock(spinlock_t *lock)
5516 { 5516 {
5517 int resched = should_resched(); 5517 int resched = should_resched();
5518 int ret = 0; 5518 int ret = 0;
5519 5519
5520 lockdep_assert_held(lock); 5520 lockdep_assert_held(lock);
5521 5521
5522 if (spin_needbreak(lock) || resched) { 5522 if (spin_needbreak(lock) || resched) {
5523 spin_unlock(lock); 5523 spin_unlock(lock);
5524 if (resched) 5524 if (resched)
5525 __cond_resched(); 5525 __cond_resched();
5526 else 5526 else
5527 cpu_relax(); 5527 cpu_relax();
5528 ret = 1; 5528 ret = 1;
5529 spin_lock(lock); 5529 spin_lock(lock);
5530 } 5530 }
5531 return ret; 5531 return ret;
5532 } 5532 }
5533 EXPORT_SYMBOL(__cond_resched_lock); 5533 EXPORT_SYMBOL(__cond_resched_lock);
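cond_resched_lock() is the usual entry point for __cond_resched_lock(). A hedged sketch of a long walk that holds a spinlock but is willing to drop it when the lock is contended or a reschedule is due (the table, its lock and flush_slot() are hypothetical):

/* Sketch only: struct table and flush_slot() are hypothetical. */
static void flush_table(struct table *t)
{
	unsigned int i;

	spin_lock(&t->lock);
	for (i = 0; i < t->nr_slots; i++) {
		flush_slot(t, i);
		/*
		 * May drop t->lock, schedule (or just back off under
		 * contention) and retake it; the index-based loop is
		 * safe across the unlock.
		 */
		cond_resched_lock(&t->lock);
	}
	spin_unlock(&t->lock);
}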
5534 5534
5535 int __sched __cond_resched_softirq(void) 5535 int __sched __cond_resched_softirq(void)
5536 { 5536 {
5537 BUG_ON(!in_softirq()); 5537 BUG_ON(!in_softirq());
5538 5538
5539 if (should_resched()) { 5539 if (should_resched()) {
5540 local_bh_enable(); 5540 local_bh_enable();
5541 __cond_resched(); 5541 __cond_resched();
5542 local_bh_disable(); 5542 local_bh_disable();
5543 return 1; 5543 return 1;
5544 } 5544 }
5545 return 0; 5545 return 0;
5546 } 5546 }
5547 EXPORT_SYMBOL(__cond_resched_softirq); 5547 EXPORT_SYMBOL(__cond_resched_softirq);
5548 5548
5549 /** 5549 /**
5550 * yield - yield the current processor to other threads. 5550 * yield - yield the current processor to other threads.
5551 * 5551 *
5552 * This is a shortcut for kernel-space yielding - it marks the 5552 * This is a shortcut for kernel-space yielding - it marks the
5553 * thread runnable and calls sys_sched_yield(). 5553 * thread runnable and calls sys_sched_yield().
5554 */ 5554 */
5555 void __sched yield(void) 5555 void __sched yield(void)
5556 { 5556 {
5557 set_current_state(TASK_RUNNING); 5557 set_current_state(TASK_RUNNING);
5558 sys_sched_yield(); 5558 sys_sched_yield();
5559 } 5559 }
5560 EXPORT_SYMBOL(yield); 5560 EXPORT_SYMBOL(yield);
5561 5561
5562 /** 5562 /**
5563 * yield_to - yield the current processor to another thread in 5563 * yield_to - yield the current processor to another thread in
5564 * your thread group, or accelerate that thread toward the 5564 * your thread group, or accelerate that thread toward the
5565 * processor it's on. 5565 * processor it's on.
5566 * @p: target task 5566 * @p: target task
5567 * @preempt: whether task preemption is allowed or not 5567 * @preempt: whether task preemption is allowed or not
5568 * 5568 *
5569 * It's the caller's job to ensure that the target task struct 5569 * It's the caller's job to ensure that the target task struct
5570 * can't go away on us before we can do any checks. 5570 * can't go away on us before we can do any checks.
5571 * 5571 *
5572 * Returns true if we indeed boosted the target task. 5572 * Returns true if we indeed boosted the target task.
5573 */ 5573 */
5574 bool __sched yield_to(struct task_struct *p, bool preempt) 5574 bool __sched yield_to(struct task_struct *p, bool preempt)
5575 { 5575 {
5576 struct task_struct *curr = current; 5576 struct task_struct *curr = current;
5577 struct rq *rq, *p_rq; 5577 struct rq *rq, *p_rq;
5578 unsigned long flags; 5578 unsigned long flags;
5579 bool yielded = 0; 5579 bool yielded = 0;
5580 5580
5581 local_irq_save(flags); 5581 local_irq_save(flags);
5582 rq = this_rq(); 5582 rq = this_rq();
5583 5583
5584 again: 5584 again:
5585 p_rq = task_rq(p); 5585 p_rq = task_rq(p);
5586 double_rq_lock(rq, p_rq); 5586 double_rq_lock(rq, p_rq);
5587 while (task_rq(p) != p_rq) { 5587 while (task_rq(p) != p_rq) {
5588 double_rq_unlock(rq, p_rq); 5588 double_rq_unlock(rq, p_rq);
5589 goto again; 5589 goto again;
5590 } 5590 }
5591 5591
5592 if (!curr->sched_class->yield_to_task) 5592 if (!curr->sched_class->yield_to_task)
5593 goto out; 5593 goto out;
5594 5594
5595 if (curr->sched_class != p->sched_class) 5595 if (curr->sched_class != p->sched_class)
5596 goto out; 5596 goto out;
5597 5597
5598 if (task_running(p_rq, p) || p->state) 5598 if (task_running(p_rq, p) || p->state)
5599 goto out; 5599 goto out;
5600 5600
5601 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 5601 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5602 if (yielded) { 5602 if (yielded) {
5603 schedstat_inc(rq, yld_count); 5603 schedstat_inc(rq, yld_count);
5604 /* 5604 /*
5605 * Make p's CPU reschedule; pick_next_entity takes care of 5605 * Make p's CPU reschedule; pick_next_entity takes care of
5606 * fairness. 5606 * fairness.
5607 */ 5607 */
5608 if (preempt && rq != p_rq) 5608 if (preempt && rq != p_rq)
5609 resched_task(p_rq->curr); 5609 resched_task(p_rq->curr);
5610 } 5610 }
5611 5611
5612 out: 5612 out:
5613 double_rq_unlock(rq, p_rq); 5613 double_rq_unlock(rq, p_rq);
5614 local_irq_restore(flags); 5614 local_irq_restore(flags);
5615 5615
5616 if (yielded) 5616 if (yielded)
5617 schedule(); 5617 schedule();
5618 5618
5619 return yielded; 5619 return yielded;
5620 } 5620 }
5621 EXPORT_SYMBOL_GPL(yield_to); 5621 EXPORT_SYMBOL_GPL(yield_to);
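A heavily hedged sketch of a directed yield: a caller that already holds a reference to a sibling task believed to hold a contended resource boosts that task instead of spinning. Picking the target and keeping it from going away are the caller's problem, exactly as the comment above says:

/* Sketch only: how @holder is chosen is entirely hypothetical. */
static void boost_lock_holder(struct task_struct *holder)
{
	/* Returns true only if the scheduler really boosted @holder. */
	if (!yield_to(holder, true))
		cpu_relax();	/* fall back to a plain busy-wait hint */
}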
5622 5622
5623 /* 5623 /*
5624 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5624 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5625 * that process accounting knows that this is a task in IO wait state. 5625 * that process accounting knows that this is a task in IO wait state.
5626 */ 5626 */
5627 void __sched io_schedule(void) 5627 void __sched io_schedule(void)
5628 { 5628 {
5629 struct rq *rq = raw_rq(); 5629 struct rq *rq = raw_rq();
5630 5630
5631 delayacct_blkio_start(); 5631 delayacct_blkio_start();
5632 atomic_inc(&rq->nr_iowait); 5632 atomic_inc(&rq->nr_iowait);
5633 blk_flush_plug(current); 5633 blk_flush_plug(current);
5634 current->in_iowait = 1; 5634 current->in_iowait = 1;
5635 schedule(); 5635 schedule();
5636 current->in_iowait = 0; 5636 current->in_iowait = 0;
5637 atomic_dec(&rq->nr_iowait); 5637 atomic_dec(&rq->nr_iowait);
5638 delayacct_blkio_end(); 5638 delayacct_blkio_end();
5639 } 5639 }
5640 EXPORT_SYMBOL(io_schedule); 5640 EXPORT_SYMBOL(io_schedule);
5641 5641
5642 long __sched io_schedule_timeout(long timeout) 5642 long __sched io_schedule_timeout(long timeout)
5643 { 5643 {
5644 struct rq *rq = raw_rq(); 5644 struct rq *rq = raw_rq();
5645 long ret; 5645 long ret;
5646 5646
5647 delayacct_blkio_start(); 5647 delayacct_blkio_start();
5648 atomic_inc(&rq->nr_iowait); 5648 atomic_inc(&rq->nr_iowait);
5649 blk_flush_plug(current); 5649 blk_flush_plug(current);
5650 current->in_iowait = 1; 5650 current->in_iowait = 1;
5651 ret = schedule_timeout(timeout); 5651 ret = schedule_timeout(timeout);
5652 current->in_iowait = 0; 5652 current->in_iowait = 0;
5653 atomic_dec(&rq->nr_iowait); 5653 atomic_dec(&rq->nr_iowait);
5654 delayacct_blkio_end(); 5654 delayacct_blkio_end();
5655 return ret; 5655 return ret;
5656 } 5656 }
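Block-layer style callers use these instead of plain schedule()/schedule_timeout() so the sleep is charged to iowait. A hedged sketch with a hypothetical request structure and ->done flag:

/* Sketch only: struct my_request and its ->done flag are hypothetical. */
static void wait_for_my_request(struct my_request *req)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (req->done)
			break;
		io_schedule();	/* sleeps with rq->nr_iowait raised */
	}
	__set_current_state(TASK_RUNNING);
}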
5657 5657
5658 /** 5658 /**
5659 * sys_sched_get_priority_max - return maximum RT priority. 5659 * sys_sched_get_priority_max - return maximum RT priority.
5660 * @policy: scheduling class. 5660 * @policy: scheduling class.
5661 * 5661 *
5662 * this syscall returns the maximum rt_priority that can be used 5662 * this syscall returns the maximum rt_priority that can be used
5663 * by a given scheduling class. 5663 * by a given scheduling class.
5664 */ 5664 */
5665 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5665 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5666 { 5666 {
5667 int ret = -EINVAL; 5667 int ret = -EINVAL;
5668 5668
5669 switch (policy) { 5669 switch (policy) {
5670 case SCHED_FIFO: 5670 case SCHED_FIFO:
5671 case SCHED_RR: 5671 case SCHED_RR:
5672 ret = MAX_USER_RT_PRIO-1; 5672 ret = MAX_USER_RT_PRIO-1;
5673 break; 5673 break;
5674 case SCHED_NORMAL: 5674 case SCHED_NORMAL:
5675 case SCHED_BATCH: 5675 case SCHED_BATCH:
5676 case SCHED_IDLE: 5676 case SCHED_IDLE:
5677 ret = 0; 5677 ret = 0;
5678 break; 5678 break;
5679 } 5679 }
5680 return ret; 5680 return ret;
5681 } 5681 }
5682 5682
5683 /** 5683 /**
5684 * sys_sched_get_priority_min - return minimum RT priority. 5684 * sys_sched_get_priority_min - return minimum RT priority.
5685 * @policy: scheduling class. 5685 * @policy: scheduling class.
5686 * 5686 *
5687 * this syscall returns the minimum rt_priority that can be used 5687 * this syscall returns the minimum rt_priority that can be used
5688 * by a given scheduling class. 5688 * by a given scheduling class.
5689 */ 5689 */
5690 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5690 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5691 { 5691 {
5692 int ret = -EINVAL; 5692 int ret = -EINVAL;
5693 5693
5694 switch (policy) { 5694 switch (policy) {
5695 case SCHED_FIFO: 5695 case SCHED_FIFO:
5696 case SCHED_RR: 5696 case SCHED_RR:
5697 ret = 1; 5697 ret = 1;
5698 break; 5698 break;
5699 case SCHED_NORMAL: 5699 case SCHED_NORMAL:
5700 case SCHED_BATCH: 5700 case SCHED_BATCH:
5701 case SCHED_IDLE: 5701 case SCHED_IDLE:
5702 ret = 0; 5702 ret = 0;
5703 } 5703 }
5704 return ret; 5704 return ret;
5705 } 5705 }
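User space typically calls this pair to size the valid priority range before sched_setscheduler(); a minimal sketch (not part of the commit):

#include <sched.h>
#include <stdio.h>

int main(void)
{
	int lo = sched_get_priority_min(SCHED_FIFO);
	int hi = sched_get_priority_max(SCHED_FIFO);

	if (lo == -1 || hi == -1) {
		perror("sched_get_priority_*");
		return 1;
	}
	/* On Linux this prints 1..99, i.e. MAX_USER_RT_PRIO - 1. */
	printf("SCHED_FIFO priorities: %d..%d\n", lo, hi);
	return 0;
}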
5706 5706
5707 /** 5707 /**
5708 * sys_sched_rr_get_interval - return the default timeslice of a process. 5708 * sys_sched_rr_get_interval - return the default timeslice of a process.
5709 * @pid: pid of the process. 5709 * @pid: pid of the process.
5710 * @interval: userspace pointer to the timeslice value. 5710 * @interval: userspace pointer to the timeslice value.
5711 * 5711 *
5712 * this syscall writes the default timeslice value of a given process 5712 * this syscall writes the default timeslice value of a given process
5713 * into the user-space timespec buffer. A value of '0' means infinity. 5713 * into the user-space timespec buffer. A value of '0' means infinity.
5714 */ 5714 */
5715 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5715 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5716 struct timespec __user *, interval) 5716 struct timespec __user *, interval)
5717 { 5717 {
5718 struct task_struct *p; 5718 struct task_struct *p;
5719 unsigned int time_slice; 5719 unsigned int time_slice;
5720 unsigned long flags; 5720 unsigned long flags;
5721 struct rq *rq; 5721 struct rq *rq;
5722 int retval; 5722 int retval;
5723 struct timespec t; 5723 struct timespec t;
5724 5724
5725 if (pid < 0) 5725 if (pid < 0)
5726 return -EINVAL; 5726 return -EINVAL;
5727 5727
5728 retval = -ESRCH; 5728 retval = -ESRCH;
5729 rcu_read_lock(); 5729 rcu_read_lock();
5730 p = find_process_by_pid(pid); 5730 p = find_process_by_pid(pid);
5731 if (!p) 5731 if (!p)
5732 goto out_unlock; 5732 goto out_unlock;
5733 5733
5734 retval = security_task_getscheduler(p); 5734 retval = security_task_getscheduler(p);
5735 if (retval) 5735 if (retval)
5736 goto out_unlock; 5736 goto out_unlock;
5737 5737
5738 rq = task_rq_lock(p, &flags); 5738 rq = task_rq_lock(p, &flags);
5739 time_slice = p->sched_class->get_rr_interval(rq, p); 5739 time_slice = p->sched_class->get_rr_interval(rq, p);
5740 task_rq_unlock(rq, p, &flags); 5740 task_rq_unlock(rq, p, &flags);
5741 5741
5742 rcu_read_unlock(); 5742 rcu_read_unlock();
5743 jiffies_to_timespec(time_slice, &t); 5743 jiffies_to_timespec(time_slice, &t);
5744 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5744 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5745 return retval; 5745 return retval;
5746 5746
5747 out_unlock: 5747 out_unlock:
5748 rcu_read_unlock(); 5748 rcu_read_unlock();
5749 return retval; 5749 return retval;
5750 } 5750 }
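A user-space sketch querying the caller's own round-robin timeslice (pid 0 again means the calling process):

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == -1) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}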
5751 5751
5752 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 5752 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5753 5753
5754 void sched_show_task(struct task_struct *p) 5754 void sched_show_task(struct task_struct *p)
5755 { 5755 {
5756 unsigned long free = 0; 5756 unsigned long free = 0;
5757 unsigned state; 5757 unsigned state;
5758 5758
5759 state = p->state ? __ffs(p->state) + 1 : 0; 5759 state = p->state ? __ffs(p->state) + 1 : 0;
5760 printk(KERN_INFO "%-15.15s %c", p->comm, 5760 printk(KERN_INFO "%-15.15s %c", p->comm,
5761 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5761 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5762 #if BITS_PER_LONG == 32 5762 #if BITS_PER_LONG == 32
5763 if (state == TASK_RUNNING) 5763 if (state == TASK_RUNNING)
5764 printk(KERN_CONT " running "); 5764 printk(KERN_CONT " running ");
5765 else 5765 else
5766 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 5766 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5767 #else 5767 #else
5768 if (state == TASK_RUNNING) 5768 if (state == TASK_RUNNING)
5769 printk(KERN_CONT " running task "); 5769 printk(KERN_CONT " running task ");
5770 else 5770 else
5771 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5771 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5772 #endif 5772 #endif
5773 #ifdef CONFIG_DEBUG_STACK_USAGE 5773 #ifdef CONFIG_DEBUG_STACK_USAGE
5774 free = stack_not_used(p); 5774 free = stack_not_used(p);
5775 #endif 5775 #endif
5776 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5776 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5777 task_pid_nr(p), task_pid_nr(p->real_parent), 5777 task_pid_nr(p), task_pid_nr(p->real_parent),
5778 (unsigned long)task_thread_info(p)->flags); 5778 (unsigned long)task_thread_info(p)->flags);
5779 5779
5780 show_stack(p, NULL); 5780 show_stack(p, NULL);
5781 } 5781 }
5782 5782
5783 void show_state_filter(unsigned long state_filter) 5783 void show_state_filter(unsigned long state_filter)
5784 { 5784 {
5785 struct task_struct *g, *p; 5785 struct task_struct *g, *p;
5786 5786
5787 #if BITS_PER_LONG == 32 5787 #if BITS_PER_LONG == 32
5788 printk(KERN_INFO 5788 printk(KERN_INFO
5789 " task PC stack pid father\n"); 5789 " task PC stack pid father\n");
5790 #else 5790 #else
5791 printk(KERN_INFO 5791 printk(KERN_INFO
5792 " task PC stack pid father\n"); 5792 " task PC stack pid father\n");
5793 #endif 5793 #endif
5794 read_lock(&tasklist_lock); 5794 read_lock(&tasklist_lock);
5795 do_each_thread(g, p) { 5795 do_each_thread(g, p) {
5796 /* 5796 /*
5797 * reset the NMI-timeout, listing all files on a slow 5797 * reset the NMI-timeout, listing all files on a slow
5798 * console might take a lot of time: 5798 * console might take a lot of time:
5799 */ 5799 */
5800 touch_nmi_watchdog(); 5800 touch_nmi_watchdog();
5801 if (!state_filter || (p->state & state_filter)) 5801 if (!state_filter || (p->state & state_filter))
5802 sched_show_task(p); 5802 sched_show_task(p);
5803 } while_each_thread(g, p); 5803 } while_each_thread(g, p);
5804 5804
5805 touch_all_softlockup_watchdogs(); 5805 touch_all_softlockup_watchdogs();
5806 5806
5807 #ifdef CONFIG_SCHED_DEBUG 5807 #ifdef CONFIG_SCHED_DEBUG
5808 sysrq_sched_debug_show(); 5808 sysrq_sched_debug_show();
5809 #endif 5809 #endif
5810 read_unlock(&tasklist_lock); 5810 read_unlock(&tasklist_lock);
5811 /* 5811 /*
5812 * Only show locks if all tasks are dumped: 5812 * Only show locks if all tasks are dumped:
5813 */ 5813 */
5814 if (!state_filter) 5814 if (!state_filter)
5815 debug_show_all_locks(); 5815 debug_show_all_locks();
5816 } 5816 }
5817 5817
5818 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 5818 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5819 { 5819 {
5820 idle->sched_class = &idle_sched_class; 5820 idle->sched_class = &idle_sched_class;
5821 } 5821 }
5822 5822
5823 /** 5823 /**
5824 * init_idle - set up an idle thread for a given CPU 5824 * init_idle - set up an idle thread for a given CPU
5825 * @idle: task in question 5825 * @idle: task in question
5826 * @cpu: cpu the idle task belongs to 5826 * @cpu: cpu the idle task belongs to
5827 * 5827 *
5828 * NOTE: this function does not set the idle thread's NEED_RESCHED 5828 * NOTE: this function does not set the idle thread's NEED_RESCHED
5829 * flag, to make booting more robust. 5829 * flag, to make booting more robust.
5830 */ 5830 */
5831 void __cpuinit init_idle(struct task_struct *idle, int cpu) 5831 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5832 { 5832 {
5833 struct rq *rq = cpu_rq(cpu); 5833 struct rq *rq = cpu_rq(cpu);
5834 unsigned long flags; 5834 unsigned long flags;
5835 5835
5836 raw_spin_lock_irqsave(&rq->lock, flags); 5836 raw_spin_lock_irqsave(&rq->lock, flags);
5837 5837
5838 __sched_fork(idle); 5838 __sched_fork(idle);
5839 idle->state = TASK_RUNNING; 5839 idle->state = TASK_RUNNING;
5840 idle->se.exec_start = sched_clock(); 5840 idle->se.exec_start = sched_clock();
5841 5841
5842 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5842 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5843 /* 5843 /*
5844 * We're having a chicken and egg problem, even though we are 5844 * We're having a chicken and egg problem, even though we are
5845 * holding rq->lock, the cpu isn't yet set to this cpu so the 5845 * holding rq->lock, the cpu isn't yet set to this cpu so the
5846 * lockdep check in task_group() will fail. 5846 * lockdep check in task_group() will fail.
5847 * 5847 *
5848 * Similar case to sched_fork(). / Alternatively we could 5848 * Similar case to sched_fork(). / Alternatively we could
5849 * use task_rq_lock() here and obtain the other rq->lock. 5849 * use task_rq_lock() here and obtain the other rq->lock.
5850 * 5850 *
5851 * Silence PROVE_RCU 5851 * Silence PROVE_RCU
5852 */ 5852 */
5853 rcu_read_lock(); 5853 rcu_read_lock();
5854 __set_task_cpu(idle, cpu); 5854 __set_task_cpu(idle, cpu);
5855 rcu_read_unlock(); 5855 rcu_read_unlock();
5856 5856
5857 rq->curr = rq->idle = idle; 5857 rq->curr = rq->idle = idle;
5858 #if defined(CONFIG_SMP) 5858 #if defined(CONFIG_SMP)
5859 idle->on_cpu = 1; 5859 idle->on_cpu = 1;
5860 #endif 5860 #endif
5861 raw_spin_unlock_irqrestore(&rq->lock, flags); 5861 raw_spin_unlock_irqrestore(&rq->lock, flags);
5862 5862
5863 /* Set the preempt count _outside_ the spinlocks! */ 5863 /* Set the preempt count _outside_ the spinlocks! */
5864 #if defined(CONFIG_PREEMPT) 5864 #if defined(CONFIG_PREEMPT)
5865 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 5865 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5866 #else 5866 #else
5867 task_thread_info(idle)->preempt_count = 0; 5867 task_thread_info(idle)->preempt_count = 0;
5868 #endif 5868 #endif
5869 /* 5869 /*
5870 * The idle tasks have their own, simple scheduling class: 5870 * The idle tasks have their own, simple scheduling class:
5871 */ 5871 */
5872 idle->sched_class = &idle_sched_class; 5872 idle->sched_class = &idle_sched_class;
5873 ftrace_graph_init_idle_task(idle, cpu); 5873 ftrace_graph_init_idle_task(idle, cpu);
5874 } 5874 }
5875 5875
5876 /* 5876 /*
5877 * In a system that switches off the HZ timer nohz_cpu_mask 5877 * In a system that switches off the HZ timer nohz_cpu_mask
5878 * indicates which cpus entered this state. This is used 5878 * indicates which cpus entered this state. This is used
5879 * in the rcu update to wait only for active cpus. For systems 5879 * in the rcu update to wait only for active cpus. For systems
5880 * which do not switch off the HZ timer nohz_cpu_mask should 5880 * which do not switch off the HZ timer nohz_cpu_mask should
5881 * always be CPU_BITS_NONE. 5881 * always be CPU_BITS_NONE.
5882 */ 5882 */
5883 cpumask_var_t nohz_cpu_mask; 5883 cpumask_var_t nohz_cpu_mask;
5884 5884
5885 /* 5885 /*
5886 * Increase the granularity value when there are more CPUs, 5886 * Increase the granularity value when there are more CPUs,
5887 * because with more CPUs the 'effective latency' as visible 5887 * because with more CPUs the 'effective latency' as visible
5888 * to users decreases. But the relationship is not linear, 5888 * to users decreases. But the relationship is not linear,
5889 * so pick a second-best guess by going with the log2 of the 5889 * so pick a second-best guess by going with the log2 of the
5890 * number of CPUs. 5890 * number of CPUs.
5891 * 5891 *
5892 * This idea comes from the SD scheduler of Con Kolivas: 5892 * This idea comes from the SD scheduler of Con Kolivas:
5893 */ 5893 */
5894 static int get_update_sysctl_factor(void) 5894 static int get_update_sysctl_factor(void)
5895 { 5895 {
5896 unsigned int cpus = min_t(int, num_online_cpus(), 8); 5896 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5897 unsigned int factor; 5897 unsigned int factor;
5898 5898
5899 switch (sysctl_sched_tunable_scaling) { 5899 switch (sysctl_sched_tunable_scaling) {
5900 case SCHED_TUNABLESCALING_NONE: 5900 case SCHED_TUNABLESCALING_NONE:
5901 factor = 1; 5901 factor = 1;
5902 break; 5902 break;
5903 case SCHED_TUNABLESCALING_LINEAR: 5903 case SCHED_TUNABLESCALING_LINEAR:
5904 factor = cpus; 5904 factor = cpus;
5905 break; 5905 break;
5906 case SCHED_TUNABLESCALING_LOG: 5906 case SCHED_TUNABLESCALING_LOG:
5907 default: 5907 default:
5908 factor = 1 + ilog2(cpus); 5908 factor = 1 + ilog2(cpus);
5909 break; 5909 break;
5910 } 5910 }
5911 5911
5912 return factor; 5912 return factor;
5913 } 5913 }
5914 5914
5915 static void update_sysctl(void) 5915 static void update_sysctl(void)
5916 { 5916 {
5917 unsigned int factor = get_update_sysctl_factor(); 5917 unsigned int factor = get_update_sysctl_factor();
5918 5918
5919 #define SET_SYSCTL(name) \ 5919 #define SET_SYSCTL(name) \
5920 (sysctl_##name = (factor) * normalized_sysctl_##name) 5920 (sysctl_##name = (factor) * normalized_sysctl_##name)
5921 SET_SYSCTL(sched_min_granularity); 5921 SET_SYSCTL(sched_min_granularity);
5922 SET_SYSCTL(sched_latency); 5922 SET_SYSCTL(sched_latency);
5923 SET_SYSCTL(sched_wakeup_granularity); 5923 SET_SYSCTL(sched_wakeup_granularity);
5924 #undef SET_SYSCTL 5924 #undef SET_SYSCTL
5925 } 5925 }
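As a worked example with assumed numbers (not taken from this commit): with 4 online CPUs and the default logarithmic scaling, factor = 1 + ilog2(4) = 3, so a normalized latency of 6 ms becomes an effective 18 ms. A small user-space rendition of the same arithmetic:

#include <stdio.h>

/* User-space stand-in for the kernel's ilog2() on a non-zero value. */
static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int online = 4;			/* assumed CPU count */
	unsigned int cpus = online < 8 ? online : 8;	/* min_t(int, ..., 8) */
	unsigned int factor = 1 + ilog2_u(cpus);	/* SCHED_TUNABLESCALING_LOG */
	unsigned int normalized_latency_ns = 6000000;	/* assumed 6 ms */

	printf("factor = %u, effective sched_latency = %u ns\n",
	       factor, factor * normalized_latency_ns);
	return 0;
}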
5926 5926
5927 static inline void sched_init_granularity(void) 5927 static inline void sched_init_granularity(void)
5928 { 5928 {
5929 update_sysctl(); 5929 update_sysctl();
5930 } 5930 }
5931 5931
5932 #ifdef CONFIG_SMP 5932 #ifdef CONFIG_SMP
5933 /* 5933 /*
5934 * This is how migration works: 5934 * This is how migration works:
5935 * 5935 *
5936 * 1) we invoke migration_cpu_stop() on the target CPU using 5936 * 1) we invoke migration_cpu_stop() on the target CPU using
5937 * stop_one_cpu(). 5937 * stop_one_cpu().
5938 * 2) stopper starts to run (implicitly forcing the migrated thread 5938 * 2) stopper starts to run (implicitly forcing the migrated thread
5939 * off the CPU) 5939 * off the CPU)
5940 * 3) it checks whether the migrated task is still in the wrong runqueue. 5940 * 3) it checks whether the migrated task is still in the wrong runqueue.
5941 * 4) if it's in the wrong runqueue then the migration thread removes 5941 * 4) if it's in the wrong runqueue then the migration thread removes
5942 * it and puts it into the right queue. 5942 * it and puts it into the right queue.
5943 * 5) stopper completes and stop_one_cpu() returns and the migration 5943 * 5) stopper completes and stop_one_cpu() returns and the migration
5944 * is done. 5944 * is done.
5945 */ 5945 */
5946 5946
5947 /* 5947 /*
5948 * Change a given task's CPU affinity. Migrate the thread to a 5948 * Change a given task's CPU affinity. Migrate the thread to a
5949 * proper CPU and schedule it away if the CPU it's executing on 5949 * proper CPU and schedule it away if the CPU it's executing on
5950 * is removed from the allowed bitmask. 5950 * is removed from the allowed bitmask.
5951 * 5951 *
5952 * NOTE: the caller must have a valid reference to the task, the 5952 * NOTE: the caller must have a valid reference to the task, the
5953 * task must not exit() & deallocate itself prematurely. The 5953 * task must not exit() & deallocate itself prematurely. The
5954 * call is not atomic; no spinlocks may be held. 5954 * call is not atomic; no spinlocks may be held.
5955 */ 5955 */
5956 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5956 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5957 { 5957 {
5958 unsigned long flags; 5958 unsigned long flags;
5959 struct rq *rq; 5959 struct rq *rq;
5960 unsigned int dest_cpu; 5960 unsigned int dest_cpu;
5961 int ret = 0; 5961 int ret = 0;
5962 5962
5963 rq = task_rq_lock(p, &flags); 5963 rq = task_rq_lock(p, &flags);
5964 5964
5965 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5965 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5966 ret = -EINVAL; 5966 ret = -EINVAL;
5967 goto out; 5967 goto out;
5968 } 5968 }
5969 5969
5970 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5970 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5971 !cpumask_equal(&p->cpus_allowed, new_mask))) { 5971 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5972 ret = -EINVAL; 5972 ret = -EINVAL;
5973 goto out; 5973 goto out;
5974 } 5974 }
5975 5975
5976 if (p->sched_class->set_cpus_allowed) 5976 if (p->sched_class->set_cpus_allowed)
5977 p->sched_class->set_cpus_allowed(p, new_mask); 5977 p->sched_class->set_cpus_allowed(p, new_mask);
5978 else { 5978 else {
5979 cpumask_copy(&p->cpus_allowed, new_mask); 5979 cpumask_copy(&p->cpus_allowed, new_mask);
5980 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5980 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5981 } 5981 }
5982 5982
5983 /* Can the task run on the task's current CPU? If so, we're done */ 5983 /* Can the task run on the task's current CPU? If so, we're done */
5984 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5984 if (cpumask_test_cpu(task_cpu(p), new_mask))
5985 goto out; 5985 goto out;
5986 5986
5987 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5987 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5988 if (need_migrate_task(p)) { 5988 if (need_migrate_task(p)) {
5989 struct migration_arg arg = { p, dest_cpu }; 5989 struct migration_arg arg = { p, dest_cpu };
5990 /* Need help from migration thread: drop lock and wait. */ 5990 /* Need help from migration thread: drop lock and wait. */
5991 task_rq_unlock(rq, p, &flags); 5991 task_rq_unlock(rq, p, &flags);
5992 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5992 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5993 tlb_migrate_finish(p->mm); 5993 tlb_migrate_finish(p->mm);
5994 return 0; 5994 return 0;
5995 } 5995 }
5996 out: 5996 out:
5997 task_rq_unlock(rq, p, &flags); 5997 task_rq_unlock(rq, p, &flags);
5998 5998
5999 return ret; 5999 return ret;
6000 } 6000 }
6001 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 6001 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
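A hedged kernel-side sketch: creating a worker kthread and restricting it to one CPU before waking it (my_thread_fn and the choice of CPU are hypothetical; kthread_bind() would be the more common way to do this for a not-yet-running thread):

/* Sketch only: my_thread_fn() is hypothetical. */
static struct task_struct *start_pinned_worker(int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(my_thread_fn, NULL, "pinned/%d", cpu);
	if (IS_ERR(tsk))
		return tsk;

	/* Shrink the affinity mask; migrates the task if it has to. */
	set_cpus_allowed_ptr(tsk, cpumask_of(cpu));
	wake_up_process(tsk);
	return tsk;
}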
6002 6002
6003 /* 6003 /*
6004 * Move (not current) task off this cpu, onto dest cpu. We're doing 6004 * Move (not current) task off this cpu, onto dest cpu. We're doing
6005 * this because either it can't run here any more (set_cpus_allowed() 6005 * this because either it can't run here any more (set_cpus_allowed()
6006 * away from this CPU, or CPU going down), or because we're 6006 * away from this CPU, or CPU going down), or because we're
6007 * attempting to rebalance this task on exec (sched_exec). 6007 * attempting to rebalance this task on exec (sched_exec).
6008 * 6008 *
6009 * So we race with normal scheduler movements, but that's OK, as long 6009 * So we race with normal scheduler movements, but that's OK, as long
6010 * as the task is no longer on this CPU. 6010 * as the task is no longer on this CPU.
6011 * 6011 *
6012 * Returns non-zero if task was successfully migrated. 6012 * Returns non-zero if task was successfully migrated.
6013 */ 6013 */
6014 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 6014 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6015 { 6015 {
6016 struct rq *rq_dest, *rq_src; 6016 struct rq *rq_dest, *rq_src;
6017 int ret = 0; 6017 int ret = 0;
6018 6018
6019 if (unlikely(!cpu_active(dest_cpu))) 6019 if (unlikely(!cpu_active(dest_cpu)))
6020 return ret; 6020 return ret;
6021 6021
6022 rq_src = cpu_rq(src_cpu); 6022 rq_src = cpu_rq(src_cpu);
6023 rq_dest = cpu_rq(dest_cpu); 6023 rq_dest = cpu_rq(dest_cpu);
6024 6024
6025 raw_spin_lock(&p->pi_lock); 6025 raw_spin_lock(&p->pi_lock);
6026 double_rq_lock(rq_src, rq_dest); 6026 double_rq_lock(rq_src, rq_dest);
6027 /* Already moved. */ 6027 /* Already moved. */
6028 if (task_cpu(p) != src_cpu) 6028 if (task_cpu(p) != src_cpu)
6029 goto done; 6029 goto done;
6030 /* Affinity changed (again). */ 6030 /* Affinity changed (again). */
6031 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 6031 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6032 goto fail; 6032 goto fail;
6033 6033
6034 /* 6034 /*
6035 * If we're not on a rq, the next wake-up will ensure we're 6035 * If we're not on a rq, the next wake-up will ensure we're
6036 * placed properly. 6036 * placed properly.
6037 */ 6037 */
6038 if (p->on_rq) { 6038 if (p->on_rq) {
6039 deactivate_task(rq_src, p, 0); 6039 deactivate_task(rq_src, p, 0);
6040 set_task_cpu(p, dest_cpu); 6040 set_task_cpu(p, dest_cpu);
6041 activate_task(rq_dest, p, 0); 6041 activate_task(rq_dest, p, 0);
6042 check_preempt_curr(rq_dest, p, 0); 6042 check_preempt_curr(rq_dest, p, 0);
6043 } 6043 }
6044 done: 6044 done:
6045 ret = 1; 6045 ret = 1;
6046 fail: 6046 fail:
6047 double_rq_unlock(rq_src, rq_dest); 6047 double_rq_unlock(rq_src, rq_dest);
6048 raw_spin_unlock(&p->pi_lock); 6048 raw_spin_unlock(&p->pi_lock);
6049 return ret; 6049 return ret;
6050 } 6050 }
6051 6051
6052 /* 6052 /*
6053 * migration_cpu_stop - this will be executed by a highprio stopper thread 6053 * migration_cpu_stop - this will be executed by a highprio stopper thread
6054 * and performs thread migration by bumping thread off CPU then 6054 * and performs thread migration by bumping thread off CPU then
6055 * 'pushing' onto another runqueue. 6055 * 'pushing' onto another runqueue.
6056 */ 6056 */
6057 static int migration_cpu_stop(void *data) 6057 static int migration_cpu_stop(void *data)
6058 { 6058 {
6059 struct migration_arg *arg = data; 6059 struct migration_arg *arg = data;
6060 6060
6061 /* 6061 /*
6062 * The original target cpu might have gone down and we might 6062 * The original target cpu might have gone down and we might
6063 * be on another cpu but it doesn't matter. 6063 * be on another cpu but it doesn't matter.
6064 */ 6064 */
6065 local_irq_disable(); 6065 local_irq_disable();
6066 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 6066 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
6067 local_irq_enable(); 6067 local_irq_enable();
6068 return 0; 6068 return 0;
6069 } 6069 }
6070 6070
6071 #ifdef CONFIG_HOTPLUG_CPU 6071 #ifdef CONFIG_HOTPLUG_CPU
6072 6072
6073 /* 6073 /*
6074 * Ensures that the idle task is using init_mm right before its cpu goes 6074 * Ensures that the idle task is using init_mm right before its cpu goes
6075 * offline. 6075 * offline.
6076 */ 6076 */
6077 void idle_task_exit(void) 6077 void idle_task_exit(void)
6078 { 6078 {
6079 struct mm_struct *mm = current->active_mm; 6079 struct mm_struct *mm = current->active_mm;
6080 6080
6081 BUG_ON(cpu_online(smp_processor_id())); 6081 BUG_ON(cpu_online(smp_processor_id()));
6082 6082
6083 if (mm != &init_mm) 6083 if (mm != &init_mm)
6084 switch_mm(mm, &init_mm, current); 6084 switch_mm(mm, &init_mm, current);
6085 mmdrop(mm); 6085 mmdrop(mm);
6086 } 6086 }
6087 6087
6088 /* 6088 /*
6089 * While a dead CPU has no uninterruptible tasks queued at this point, 6089 * While a dead CPU has no uninterruptible tasks queued at this point,
6090 * it might still have a nonzero ->nr_uninterruptible counter, because 6090 * it might still have a nonzero ->nr_uninterruptible counter, because
6091 * for performance reasons the counter is not strictly tracking tasks to 6091 * for performance reasons the counter is not strictly tracking tasks to
6092 * their home CPUs. So we just add the counter to another CPU's counter, 6092 * their home CPUs. So we just add the counter to another CPU's counter,
6093 * to keep the global sum constant after CPU-down: 6093 * to keep the global sum constant after CPU-down:
6094 */ 6094 */
6095 static void migrate_nr_uninterruptible(struct rq *rq_src) 6095 static void migrate_nr_uninterruptible(struct rq *rq_src)
6096 { 6096 {
6097 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 6097 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
6098 6098
6099 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 6099 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6100 rq_src->nr_uninterruptible = 0; 6100 rq_src->nr_uninterruptible = 0;
6101 } 6101 }
6102 6102
6103 /* 6103 /*
6104 * remove the tasks which were accounted by rq from calc_load_tasks. 6104 * remove the tasks which were accounted by rq from calc_load_tasks.
6105 */ 6105 */
6106 static void calc_global_load_remove(struct rq *rq) 6106 static void calc_global_load_remove(struct rq *rq)
6107 { 6107 {
6108 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 6108 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6109 rq->calc_load_active = 0; 6109 rq->calc_load_active = 0;
6110 } 6110 }
6111 6111
6112 /* 6112 /*
6113 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6113 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6114 * try_to_wake_up()->select_task_rq(). 6114 * try_to_wake_up()->select_task_rq().
6115 * 6115 *
6116 * Called with rq->lock held even though we're in stop_machine() and 6116 * Called with rq->lock held even though we're in stop_machine() and
6117 * there's no concurrency possible, we hold the required locks anyway 6117 * there's no concurrency possible, we hold the required locks anyway
6118 * because of lock validation efforts. 6118 * because of lock validation efforts.
6119 */ 6119 */
6120 static void migrate_tasks(unsigned int dead_cpu) 6120 static void migrate_tasks(unsigned int dead_cpu)
6121 { 6121 {
6122 struct rq *rq = cpu_rq(dead_cpu); 6122 struct rq *rq = cpu_rq(dead_cpu);
6123 struct task_struct *next, *stop = rq->stop; 6123 struct task_struct *next, *stop = rq->stop;
6124 int dest_cpu; 6124 int dest_cpu;
6125 6125
6126 /* 6126 /*
6127 * Fudge the rq selection such that the below task selection loop 6127 * Fudge the rq selection such that the below task selection loop
6128 * doesn't get stuck on the currently eligible stop task. 6128 * doesn't get stuck on the currently eligible stop task.
6129 * 6129 *
6130 * We're currently inside stop_machine() and the rq is either stuck 6130 * We're currently inside stop_machine() and the rq is either stuck
6131 * in the stop_machine_cpu_stop() loop, or we're executing this code, 6131 * in the stop_machine_cpu_stop() loop, or we're executing this code,
6132 * either way we should never end up calling schedule() until we're 6132 * either way we should never end up calling schedule() until we're
6133 * done here. 6133 * done here.
6134 */ 6134 */
6135 rq->stop = NULL; 6135 rq->stop = NULL;
6136 6136
6137 for ( ; ; ) { 6137 for ( ; ; ) {
6138 /* 6138 /*
6139 * There's this thread running, bail when that's the only 6139 * There's this thread running, bail when that's the only
6140 * remaining thread. 6140 * remaining thread.
6141 */ 6141 */
6142 if (rq->nr_running == 1) 6142 if (rq->nr_running == 1)
6143 break; 6143 break;
6144 6144
6145 next = pick_next_task(rq); 6145 next = pick_next_task(rq);
6146 BUG_ON(!next); 6146 BUG_ON(!next);
6147 next->sched_class->put_prev_task(rq, next); 6147 next->sched_class->put_prev_task(rq, next);
6148 6148
6149 /* Find suitable destination for @next, with force if needed. */ 6149 /* Find suitable destination for @next, with force if needed. */
6150 dest_cpu = select_fallback_rq(dead_cpu, next); 6150 dest_cpu = select_fallback_rq(dead_cpu, next);
6151 raw_spin_unlock(&rq->lock); 6151 raw_spin_unlock(&rq->lock);
6152 6152
6153 __migrate_task(next, dead_cpu, dest_cpu); 6153 __migrate_task(next, dead_cpu, dest_cpu);
6154 6154
6155 raw_spin_lock(&rq->lock); 6155 raw_spin_lock(&rq->lock);
6156 } 6156 }
6157 6157
6158 rq->stop = stop; 6158 rq->stop = stop;
6159 } 6159 }
6160 6160
6161 #endif /* CONFIG_HOTPLUG_CPU */ 6161 #endif /* CONFIG_HOTPLUG_CPU */
6162 6162
6163 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 6163 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6164 6164
6165 static struct ctl_table sd_ctl_dir[] = { 6165 static struct ctl_table sd_ctl_dir[] = {
6166 { 6166 {
6167 .procname = "sched_domain", 6167 .procname = "sched_domain",
6168 .mode = 0555, 6168 .mode = 0555,
6169 }, 6169 },
6170 {} 6170 {}
6171 }; 6171 };
6172 6172
6173 static struct ctl_table sd_ctl_root[] = { 6173 static struct ctl_table sd_ctl_root[] = {
6174 { 6174 {
6175 .procname = "kernel", 6175 .procname = "kernel",
6176 .mode = 0555, 6176 .mode = 0555,
6177 .child = sd_ctl_dir, 6177 .child = sd_ctl_dir,
6178 }, 6178 },
6179 {} 6179 {}
6180 }; 6180 };
6181 6181
6182 static struct ctl_table *sd_alloc_ctl_entry(int n) 6182 static struct ctl_table *sd_alloc_ctl_entry(int n)
6183 { 6183 {
6184 struct ctl_table *entry = 6184 struct ctl_table *entry =
6185 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 6185 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6186 6186
6187 return entry; 6187 return entry;
6188 } 6188 }
6189 6189
6190 static void sd_free_ctl_entry(struct ctl_table **tablep) 6190 static void sd_free_ctl_entry(struct ctl_table **tablep)
6191 { 6191 {
6192 struct ctl_table *entry; 6192 struct ctl_table *entry;
6193 6193
6194 /* 6194 /*
6195 * In the intermediate directories, both the child directory and 6195 * In the intermediate directories, both the child directory and
6196 * procname are dynamically allocated and could fail but the mode 6196 * procname are dynamically allocated and could fail but the mode
6197 * will always be set. In the lowest directory the names are 6197 * will always be set. In the lowest directory the names are
6198 * static strings and all have proc handlers. 6198 * static strings and all have proc handlers.
6199 */ 6199 */
6200 for (entry = *tablep; entry->mode; entry++) { 6200 for (entry = *tablep; entry->mode; entry++) {
6201 if (entry->child) 6201 if (entry->child)
6202 sd_free_ctl_entry(&entry->child); 6202 sd_free_ctl_entry(&entry->child);
6203 if (entry->proc_handler == NULL) 6203 if (entry->proc_handler == NULL)
6204 kfree(entry->procname); 6204 kfree(entry->procname);
6205 } 6205 }
6206 6206
6207 kfree(*tablep); 6207 kfree(*tablep);
6208 *tablep = NULL; 6208 *tablep = NULL;
6209 } 6209 }
6210 6210
6211 static void 6211 static void
6212 set_table_entry(struct ctl_table *entry, 6212 set_table_entry(struct ctl_table *entry,
6213 const char *procname, void *data, int maxlen, 6213 const char *procname, void *data, int maxlen,
6214 mode_t mode, proc_handler *proc_handler) 6214 mode_t mode, proc_handler *proc_handler)
6215 { 6215 {
6216 entry->procname = procname; 6216 entry->procname = procname;
6217 entry->data = data; 6217 entry->data = data;
6218 entry->maxlen = maxlen; 6218 entry->maxlen = maxlen;
6219 entry->mode = mode; 6219 entry->mode = mode;
6220 entry->proc_handler = proc_handler; 6220 entry->proc_handler = proc_handler;
6221 } 6221 }
6222 6222
6223 static struct ctl_table * 6223 static struct ctl_table *
6224 sd_alloc_ctl_domain_table(struct sched_domain *sd) 6224 sd_alloc_ctl_domain_table(struct sched_domain *sd)
6225 { 6225 {
6226 struct ctl_table *table = sd_alloc_ctl_entry(13); 6226 struct ctl_table *table = sd_alloc_ctl_entry(13);
6227 6227
6228 if (table == NULL) 6228 if (table == NULL)
6229 return NULL; 6229 return NULL;
6230 6230
6231 set_table_entry(&table[0], "min_interval", &sd->min_interval, 6231 set_table_entry(&table[0], "min_interval", &sd->min_interval,
6232 sizeof(long), 0644, proc_doulongvec_minmax); 6232 sizeof(long), 0644, proc_doulongvec_minmax);
6233 set_table_entry(&table[1], "max_interval", &sd->max_interval, 6233 set_table_entry(&table[1], "max_interval", &sd->max_interval,
6234 sizeof(long), 0644, proc_doulongvec_minmax); 6234 sizeof(long), 0644, proc_doulongvec_minmax);
6235 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 6235 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6236 sizeof(int), 0644, proc_dointvec_minmax); 6236 sizeof(int), 0644, proc_dointvec_minmax);
6237 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 6237 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6238 sizeof(int), 0644, proc_dointvec_minmax); 6238 sizeof(int), 0644, proc_dointvec_minmax);
6239 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 6239 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6240 sizeof(int), 0644, proc_dointvec_minmax); 6240 sizeof(int), 0644, proc_dointvec_minmax);
6241 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 6241 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6242 sizeof(int), 0644, proc_dointvec_minmax); 6242 sizeof(int), 0644, proc_dointvec_minmax);
6243 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 6243 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6244 sizeof(int), 0644, proc_dointvec_minmax); 6244 sizeof(int), 0644, proc_dointvec_minmax);
6245 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 6245 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6246 sizeof(int), 0644, proc_dointvec_minmax); 6246 sizeof(int), 0644, proc_dointvec_minmax);
6247 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 6247 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6248 sizeof(int), 0644, proc_dointvec_minmax); 6248 sizeof(int), 0644, proc_dointvec_minmax);
6249 set_table_entry(&table[9], "cache_nice_tries", 6249 set_table_entry(&table[9], "cache_nice_tries",
6250 &sd->cache_nice_tries, 6250 &sd->cache_nice_tries,
6251 sizeof(int), 0644, proc_dointvec_minmax); 6251 sizeof(int), 0644, proc_dointvec_minmax);
6252 set_table_entry(&table[10], "flags", &sd->flags, 6252 set_table_entry(&table[10], "flags", &sd->flags,
6253 sizeof(int), 0644, proc_dointvec_minmax); 6253 sizeof(int), 0644, proc_dointvec_minmax);
6254 set_table_entry(&table[11], "name", sd->name, 6254 set_table_entry(&table[11], "name", sd->name,
6255 CORENAME_MAX_SIZE, 0444, proc_dostring); 6255 CORENAME_MAX_SIZE, 0444, proc_dostring);
6256 /* &table[12] is terminator */ 6256 /* &table[12] is terminator */
6257 6257
6258 return table; 6258 return table;
6259 } 6259 }
6260 6260
6261 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 6261 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6262 { 6262 {
6263 struct ctl_table *entry, *table; 6263 struct ctl_table *entry, *table;
6264 struct sched_domain *sd; 6264 struct sched_domain *sd;
6265 int domain_num = 0, i; 6265 int domain_num = 0, i;
6266 char buf[32]; 6266 char buf[32];
6267 6267
6268 for_each_domain(cpu, sd) 6268 for_each_domain(cpu, sd)
6269 domain_num++; 6269 domain_num++;
6270 entry = table = sd_alloc_ctl_entry(domain_num + 1); 6270 entry = table = sd_alloc_ctl_entry(domain_num + 1);
6271 if (table == NULL) 6271 if (table == NULL)
6272 return NULL; 6272 return NULL;
6273 6273
6274 i = 0; 6274 i = 0;
6275 for_each_domain(cpu, sd) { 6275 for_each_domain(cpu, sd) {
6276 snprintf(buf, 32, "domain%d", i); 6276 snprintf(buf, 32, "domain%d", i);
6277 entry->procname = kstrdup(buf, GFP_KERNEL); 6277 entry->procname = kstrdup(buf, GFP_KERNEL);
6278 entry->mode = 0555; 6278 entry->mode = 0555;
6279 entry->child = sd_alloc_ctl_domain_table(sd); 6279 entry->child = sd_alloc_ctl_domain_table(sd);
6280 entry++; 6280 entry++;
6281 i++; 6281 i++;
6282 } 6282 }
6283 return table; 6283 return table;
6284 } 6284 }
6285 6285
6286 static struct ctl_table_header *sd_sysctl_header; 6286 static struct ctl_table_header *sd_sysctl_header;
6287 static void register_sched_domain_sysctl(void) 6287 static void register_sched_domain_sysctl(void)
6288 { 6288 {
6289 int i, cpu_num = num_possible_cpus(); 6289 int i, cpu_num = num_possible_cpus();
6290 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 6290 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6291 char buf[32]; 6291 char buf[32];
6292 6292
6293 WARN_ON(sd_ctl_dir[0].child); 6293 WARN_ON(sd_ctl_dir[0].child);
6294 sd_ctl_dir[0].child = entry; 6294 sd_ctl_dir[0].child = entry;
6295 6295
6296 if (entry == NULL) 6296 if (entry == NULL)
6297 return; 6297 return;
6298 6298
6299 for_each_possible_cpu(i) { 6299 for_each_possible_cpu(i) {
6300 snprintf(buf, 32, "cpu%d", i); 6300 snprintf(buf, 32, "cpu%d", i);
6301 entry->procname = kstrdup(buf, GFP_KERNEL); 6301 entry->procname = kstrdup(buf, GFP_KERNEL);
6302 entry->mode = 0555; 6302 entry->mode = 0555;
6303 entry->child = sd_alloc_ctl_cpu_table(i); 6303 entry->child = sd_alloc_ctl_cpu_table(i);
6304 entry++; 6304 entry++;
6305 } 6305 }
6306 6306
6307 WARN_ON(sd_sysctl_header); 6307 WARN_ON(sd_sysctl_header);
6308 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 6308 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6309 } 6309 }
6310 6310
6311 	/* may be called multiple times per registration */		6311 	/* may be called multiple times per registration */
6312 static void unregister_sched_domain_sysctl(void) 6312 static void unregister_sched_domain_sysctl(void)
6313 { 6313 {
6314 if (sd_sysctl_header) 6314 if (sd_sysctl_header)
6315 unregister_sysctl_table(sd_sysctl_header); 6315 unregister_sysctl_table(sd_sysctl_header);
6316 sd_sysctl_header = NULL; 6316 sd_sysctl_header = NULL;
6317 if (sd_ctl_dir[0].child) 6317 if (sd_ctl_dir[0].child)
6318 sd_free_ctl_entry(&sd_ctl_dir[0].child); 6318 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6319 } 6319 }
6320 #else 6320 #else
6321 static void register_sched_domain_sysctl(void) 6321 static void register_sched_domain_sysctl(void)
6322 { 6322 {
6323 } 6323 }
6324 static void unregister_sched_domain_sysctl(void) 6324 static void unregister_sched_domain_sysctl(void)
6325 { 6325 {
6326 } 6326 }
6327 #endif 6327 #endif
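
The tables built above follow the usual sysctl convention: sd_alloc_ctl_entry(13) reserves 12 tunable slots plus one zeroed terminator (the "&table[12] is terminator" comment), and the proc code stops at the first entry whose procname is NULL. Below is a minimal userspace sketch of that NULL-terminated table pattern; the struct and helper names are invented stand-ins for ctl_table and sd_alloc_ctl_entry(), not kernel code.

    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for struct ctl_table: only the fields the sketch needs. */
    struct entry {
            const char *procname;   /* NULL marks the end of the table */
            unsigned int mode;
    };

    /* Like sd_alloc_ctl_entry(): calloc() leaves the terminator zeroed. */
    static struct entry *alloc_table(int n)
    {
            return calloc(n, sizeof(struct entry));
    }

    int main(void)
    {
            struct entry *table = alloc_table(3 + 1);   /* 3 tunables + terminator */

            table[0] = (struct entry){ "min_interval", 0644 };
            table[1] = (struct entry){ "max_interval", 0644 };
            table[2] = (struct entry){ "name",         0444 };
            /* table[3] stays all-zero: the walker below stops there. */

            for (struct entry *e = table; e->procname; e++)
                    printf("%s (mode %o)\n", e->procname, e->mode);

            free(table);
            return 0;
    }
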
6328 6328
6329 static void set_rq_online(struct rq *rq) 6329 static void set_rq_online(struct rq *rq)
6330 { 6330 {
6331 if (!rq->online) { 6331 if (!rq->online) {
6332 const struct sched_class *class; 6332 const struct sched_class *class;
6333 6333
6334 cpumask_set_cpu(rq->cpu, rq->rd->online); 6334 cpumask_set_cpu(rq->cpu, rq->rd->online);
6335 rq->online = 1; 6335 rq->online = 1;
6336 6336
6337 for_each_class(class) { 6337 for_each_class(class) {
6338 if (class->rq_online) 6338 if (class->rq_online)
6339 class->rq_online(rq); 6339 class->rq_online(rq);
6340 } 6340 }
6341 } 6341 }
6342 } 6342 }
6343 6343
6344 static void set_rq_offline(struct rq *rq) 6344 static void set_rq_offline(struct rq *rq)
6345 { 6345 {
6346 if (rq->online) { 6346 if (rq->online) {
6347 const struct sched_class *class; 6347 const struct sched_class *class;
6348 6348
6349 for_each_class(class) { 6349 for_each_class(class) {
6350 if (class->rq_offline) 6350 if (class->rq_offline)
6351 class->rq_offline(rq); 6351 class->rq_offline(rq);
6352 } 6352 }
6353 6353
6354 cpumask_clear_cpu(rq->cpu, rq->rd->online); 6354 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6355 rq->online = 0; 6355 rq->online = 0;
6356 } 6356 }
6357 } 6357 }
6358 6358
6359 /* 6359 /*
6360 * migration_call - callback that gets triggered when a CPU is added. 6360 * migration_call - callback that gets triggered when a CPU is added.
6361 * Here we can start up the necessary migration thread for the new CPU. 6361 * Here we can start up the necessary migration thread for the new CPU.
6362 */ 6362 */
6363 static int __cpuinit 6363 static int __cpuinit
6364 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 6364 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6365 { 6365 {
6366 int cpu = (long)hcpu; 6366 int cpu = (long)hcpu;
6367 unsigned long flags; 6367 unsigned long flags;
6368 struct rq *rq = cpu_rq(cpu); 6368 struct rq *rq = cpu_rq(cpu);
6369 6369
6370 switch (action & ~CPU_TASKS_FROZEN) { 6370 switch (action & ~CPU_TASKS_FROZEN) {
6371 6371
6372 case CPU_UP_PREPARE: 6372 case CPU_UP_PREPARE:
6373 rq->calc_load_update = calc_load_update; 6373 rq->calc_load_update = calc_load_update;
6374 break; 6374 break;
6375 6375
6376 case CPU_ONLINE: 6376 case CPU_ONLINE:
6377 /* Update our root-domain */ 6377 /* Update our root-domain */
6378 raw_spin_lock_irqsave(&rq->lock, flags); 6378 raw_spin_lock_irqsave(&rq->lock, flags);
6379 if (rq->rd) { 6379 if (rq->rd) {
6380 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6380 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6381 6381
6382 set_rq_online(rq); 6382 set_rq_online(rq);
6383 } 6383 }
6384 raw_spin_unlock_irqrestore(&rq->lock, flags); 6384 raw_spin_unlock_irqrestore(&rq->lock, flags);
6385 break; 6385 break;
6386 6386
6387 #ifdef CONFIG_HOTPLUG_CPU 6387 #ifdef CONFIG_HOTPLUG_CPU
6388 case CPU_DYING: 6388 case CPU_DYING:
6389 sched_ttwu_pending(); 6389 sched_ttwu_pending();
6390 /* Update our root-domain */ 6390 /* Update our root-domain */
6391 raw_spin_lock_irqsave(&rq->lock, flags); 6391 raw_spin_lock_irqsave(&rq->lock, flags);
6392 if (rq->rd) { 6392 if (rq->rd) {
6393 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6393 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6394 set_rq_offline(rq); 6394 set_rq_offline(rq);
6395 } 6395 }
6396 migrate_tasks(cpu); 6396 migrate_tasks(cpu);
6397 BUG_ON(rq->nr_running != 1); /* the migration thread */ 6397 BUG_ON(rq->nr_running != 1); /* the migration thread */
6398 raw_spin_unlock_irqrestore(&rq->lock, flags); 6398 raw_spin_unlock_irqrestore(&rq->lock, flags);
6399 6399
6400 migrate_nr_uninterruptible(rq); 6400 migrate_nr_uninterruptible(rq);
6401 calc_global_load_remove(rq); 6401 calc_global_load_remove(rq);
6402 break; 6402 break;
6403 #endif 6403 #endif
6404 } 6404 }
6405 6405
6406 update_max_interval(); 6406 update_max_interval();
6407 6407
6408 return NOTIFY_OK; 6408 return NOTIFY_OK;
6409 } 6409 }
6410 6410
6411 /* 6411 /*
6412 * Register at high priority so that task migration (migrate_all_tasks) 6412 * Register at high priority so that task migration (migrate_all_tasks)
6413 * happens before everything else. This has to be lower priority than 6413 * happens before everything else. This has to be lower priority than
6414 * the notifier in the perf_event subsystem, though. 6414 * the notifier in the perf_event subsystem, though.
6415 */ 6415 */
6416 static struct notifier_block __cpuinitdata migration_notifier = { 6416 static struct notifier_block __cpuinitdata migration_notifier = {
6417 .notifier_call = migration_call, 6417 .notifier_call = migration_call,
6418 .priority = CPU_PRI_MIGRATION, 6418 .priority = CPU_PRI_MIGRATION,
6419 }; 6419 };
6420 6420
6421 static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 6421 static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6422 unsigned long action, void *hcpu) 6422 unsigned long action, void *hcpu)
6423 { 6423 {
6424 switch (action & ~CPU_TASKS_FROZEN) { 6424 switch (action & ~CPU_TASKS_FROZEN) {
6425 case CPU_ONLINE: 6425 case CPU_ONLINE:
6426 case CPU_DOWN_FAILED: 6426 case CPU_DOWN_FAILED:
6427 set_cpu_active((long)hcpu, true); 6427 set_cpu_active((long)hcpu, true);
6428 return NOTIFY_OK; 6428 return NOTIFY_OK;
6429 default: 6429 default:
6430 return NOTIFY_DONE; 6430 return NOTIFY_DONE;
6431 } 6431 }
6432 } 6432 }
6433 6433
6434 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 6434 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6435 unsigned long action, void *hcpu) 6435 unsigned long action, void *hcpu)
6436 { 6436 {
6437 switch (action & ~CPU_TASKS_FROZEN) { 6437 switch (action & ~CPU_TASKS_FROZEN) {
6438 case CPU_DOWN_PREPARE: 6438 case CPU_DOWN_PREPARE:
6439 set_cpu_active((long)hcpu, false); 6439 set_cpu_active((long)hcpu, false);
6440 return NOTIFY_OK; 6440 return NOTIFY_OK;
6441 default: 6441 default:
6442 return NOTIFY_DONE; 6442 return NOTIFY_DONE;
6443 } 6443 }
6444 } 6444 }
6445 6445
6446 static int __init migration_init(void) 6446 static int __init migration_init(void)
6447 { 6447 {
6448 void *cpu = (void *)(long)smp_processor_id(); 6448 void *cpu = (void *)(long)smp_processor_id();
6449 int err; 6449 int err;
6450 6450
6451 /* Initialize migration for the boot CPU */ 6451 /* Initialize migration for the boot CPU */
6452 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6452 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6453 BUG_ON(err == NOTIFY_BAD); 6453 BUG_ON(err == NOTIFY_BAD);
6454 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6454 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6455 register_cpu_notifier(&migration_notifier); 6455 register_cpu_notifier(&migration_notifier);
6456 6456
6457 /* Register cpu active notifiers */ 6457 /* Register cpu active notifiers */
6458 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 6458 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6459 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 6459 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6460 6460
6461 return 0; 6461 return 0;
6462 } 6462 }
6463 early_initcall(migration_init); 6463 early_initcall(migration_init);
6464 #endif 6464 #endif
6465 6465
6466 #ifdef CONFIG_SMP 6466 #ifdef CONFIG_SMP
6467 6467
6468 #ifdef CONFIG_SCHED_DEBUG 6468 #ifdef CONFIG_SCHED_DEBUG
6469 6469
6470 static __read_mostly int sched_domain_debug_enabled; 6470 static __read_mostly int sched_domain_debug_enabled;
6471 6471
6472 static int __init sched_domain_debug_setup(char *str) 6472 static int __init sched_domain_debug_setup(char *str)
6473 { 6473 {
6474 sched_domain_debug_enabled = 1; 6474 sched_domain_debug_enabled = 1;
6475 6475
6476 return 0; 6476 return 0;
6477 } 6477 }
6478 early_param("sched_debug", sched_domain_debug_setup); 6478 early_param("sched_debug", sched_domain_debug_setup);
6479 6479
6480 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6480 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6481 struct cpumask *groupmask) 6481 struct cpumask *groupmask)
6482 { 6482 {
6483 struct sched_group *group = sd->groups; 6483 struct sched_group *group = sd->groups;
6484 char str[256]; 6484 char str[256];
6485 6485
6486 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 6486 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6487 cpumask_clear(groupmask); 6487 cpumask_clear(groupmask);
6488 6488
6489 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6489 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6490 6490
6491 if (!(sd->flags & SD_LOAD_BALANCE)) { 6491 if (!(sd->flags & SD_LOAD_BALANCE)) {
6492 printk("does not load-balance\n"); 6492 printk("does not load-balance\n");
6493 if (sd->parent) 6493 if (sd->parent)
6494 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 6494 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6495 " has parent"); 6495 " has parent");
6496 return -1; 6496 return -1;
6497 } 6497 }
6498 6498
6499 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6499 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6500 6500
6501 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 6501 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6502 printk(KERN_ERR "ERROR: domain->span does not contain " 6502 printk(KERN_ERR "ERROR: domain->span does not contain "
6503 "CPU%d\n", cpu); 6503 "CPU%d\n", cpu);
6504 } 6504 }
6505 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 6505 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6506 printk(KERN_ERR "ERROR: domain->groups does not contain" 6506 printk(KERN_ERR "ERROR: domain->groups does not contain"
6507 " CPU%d\n", cpu); 6507 " CPU%d\n", cpu);
6508 } 6508 }
6509 6509
6510 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 6510 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6511 do { 6511 do {
6512 if (!group) { 6512 if (!group) {
6513 printk("\n"); 6513 printk("\n");
6514 printk(KERN_ERR "ERROR: group is NULL\n"); 6514 printk(KERN_ERR "ERROR: group is NULL\n");
6515 break; 6515 break;
6516 } 6516 }
6517 6517
6518 if (!group->cpu_power) { 6518 if (!group->cpu_power) {
6519 printk(KERN_CONT "\n"); 6519 printk(KERN_CONT "\n");
6520 printk(KERN_ERR "ERROR: domain->cpu_power not " 6520 printk(KERN_ERR "ERROR: domain->cpu_power not "
6521 "set\n"); 6521 "set\n");
6522 break; 6522 break;
6523 } 6523 }
6524 6524
6525 if (!cpumask_weight(sched_group_cpus(group))) { 6525 if (!cpumask_weight(sched_group_cpus(group))) {
6526 printk(KERN_CONT "\n"); 6526 printk(KERN_CONT "\n");
6527 printk(KERN_ERR "ERROR: empty group\n"); 6527 printk(KERN_ERR "ERROR: empty group\n");
6528 break; 6528 break;
6529 } 6529 }
6530 6530
6531 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 6531 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6532 printk(KERN_CONT "\n"); 6532 printk(KERN_CONT "\n");
6533 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6533 printk(KERN_ERR "ERROR: repeated CPUs\n");
6534 break; 6534 break;
6535 } 6535 }
6536 6536
6537 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 6537 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6538 6538
6539 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6539 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6540 6540
6541 printk(KERN_CONT " %s", str); 6541 printk(KERN_CONT " %s", str);
6542 if (group->cpu_power != SCHED_LOAD_SCALE) { 6542 if (group->cpu_power != SCHED_LOAD_SCALE) {
6543 printk(KERN_CONT " (cpu_power = %d)", 6543 printk(KERN_CONT " (cpu_power = %d)",
6544 group->cpu_power); 6544 group->cpu_power);
6545 } 6545 }
6546 6546
6547 group = group->next; 6547 group = group->next;
6548 } while (group != sd->groups); 6548 } while (group != sd->groups);
6549 printk(KERN_CONT "\n"); 6549 printk(KERN_CONT "\n");
6550 6550
6551 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 6551 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6552 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6552 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6553 6553
6554 if (sd->parent && 6554 if (sd->parent &&
6555 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 6555 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6556 printk(KERN_ERR "ERROR: parent span is not a superset " 6556 printk(KERN_ERR "ERROR: parent span is not a superset "
6557 "of domain->span\n"); 6557 "of domain->span\n");
6558 return 0; 6558 return 0;
6559 } 6559 }
6560 6560
6561 static void sched_domain_debug(struct sched_domain *sd, int cpu) 6561 static void sched_domain_debug(struct sched_domain *sd, int cpu)
6562 { 6562 {
6563 cpumask_var_t groupmask; 6563 cpumask_var_t groupmask;
6564 int level = 0; 6564 int level = 0;
6565 6565
6566 if (!sched_domain_debug_enabled) 6566 if (!sched_domain_debug_enabled)
6567 return; 6567 return;
6568 6568
6569 if (!sd) { 6569 if (!sd) {
6570 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6570 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6571 return; 6571 return;
6572 } 6572 }
6573 6573
6574 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6574 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6575 6575
6576 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { 6576 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6577 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6577 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6578 return; 6578 return;
6579 } 6579 }
6580 6580
6581 for (;;) { 6581 for (;;) {
6582 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6582 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6583 break; 6583 break;
6584 level++; 6584 level++;
6585 sd = sd->parent; 6585 sd = sd->parent;
6586 if (!sd) 6586 if (!sd)
6587 break; 6587 break;
6588 } 6588 }
6589 free_cpumask_var(groupmask); 6589 free_cpumask_var(groupmask);
6590 } 6590 }
6591 #else /* !CONFIG_SCHED_DEBUG */ 6591 #else /* !CONFIG_SCHED_DEBUG */
6592 # define sched_domain_debug(sd, cpu) do { } while (0) 6592 # define sched_domain_debug(sd, cpu) do { } while (0)
6593 #endif /* CONFIG_SCHED_DEBUG */ 6593 #endif /* CONFIG_SCHED_DEBUG */
6594 6594
6595 static int sd_degenerate(struct sched_domain *sd) 6595 static int sd_degenerate(struct sched_domain *sd)
6596 { 6596 {
6597 if (cpumask_weight(sched_domain_span(sd)) == 1) 6597 if (cpumask_weight(sched_domain_span(sd)) == 1)
6598 return 1; 6598 return 1;
6599 6599
6600 /* Following flags need at least 2 groups */ 6600 /* Following flags need at least 2 groups */
6601 if (sd->flags & (SD_LOAD_BALANCE | 6601 if (sd->flags & (SD_LOAD_BALANCE |
6602 SD_BALANCE_NEWIDLE | 6602 SD_BALANCE_NEWIDLE |
6603 SD_BALANCE_FORK | 6603 SD_BALANCE_FORK |
6604 SD_BALANCE_EXEC | 6604 SD_BALANCE_EXEC |
6605 SD_SHARE_CPUPOWER | 6605 SD_SHARE_CPUPOWER |
6606 SD_SHARE_PKG_RESOURCES)) { 6606 SD_SHARE_PKG_RESOURCES)) {
6607 if (sd->groups != sd->groups->next) 6607 if (sd->groups != sd->groups->next)
6608 return 0; 6608 return 0;
6609 } 6609 }
6610 6610
6611 /* Following flags don't use groups */ 6611 /* Following flags don't use groups */
6612 if (sd->flags & (SD_WAKE_AFFINE)) 6612 if (sd->flags & (SD_WAKE_AFFINE))
6613 return 0; 6613 return 0;
6614 6614
6615 return 1; 6615 return 1;
6616 } 6616 }
6617 6617
6618 static int 6618 static int
6619 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 6619 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6620 { 6620 {
6621 unsigned long cflags = sd->flags, pflags = parent->flags; 6621 unsigned long cflags = sd->flags, pflags = parent->flags;
6622 6622
6623 if (sd_degenerate(parent)) 6623 if (sd_degenerate(parent))
6624 return 1; 6624 return 1;
6625 6625
6626 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 6626 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6627 return 0; 6627 return 0;
6628 6628
6629 /* Flags needing groups don't count if only 1 group in parent */ 6629 /* Flags needing groups don't count if only 1 group in parent */
6630 if (parent->groups == parent->groups->next) { 6630 if (parent->groups == parent->groups->next) {
6631 pflags &= ~(SD_LOAD_BALANCE | 6631 pflags &= ~(SD_LOAD_BALANCE |
6632 SD_BALANCE_NEWIDLE | 6632 SD_BALANCE_NEWIDLE |
6633 SD_BALANCE_FORK | 6633 SD_BALANCE_FORK |
6634 SD_BALANCE_EXEC | 6634 SD_BALANCE_EXEC |
6635 SD_SHARE_CPUPOWER | 6635 SD_SHARE_CPUPOWER |
6636 SD_SHARE_PKG_RESOURCES); 6636 SD_SHARE_PKG_RESOURCES);
6637 if (nr_node_ids == 1) 6637 if (nr_node_ids == 1)
6638 pflags &= ~SD_SERIALIZE; 6638 pflags &= ~SD_SERIALIZE;
6639 } 6639 }
6640 if (~cflags & pflags) 6640 if (~cflags & pflags)
6641 return 0; 6641 return 0;
6642 6642
6643 return 1; 6643 return 1;
6644 } 6644 }
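
The key test above, if (~cflags & pflags), asks whether the parent still carries any flag the child lacks once the single-group flags have been masked off; only when it does not is the parent considered redundant. A small standalone illustration of that subset check follows; the flag values are made up for the example, not the kernel's.

    #include <stdio.h>

    #define SD_LOAD_BALANCE   0x01  /* illustrative values, not the kernel's */
    #define SD_BALANCE_EXEC   0x02
    #define SD_WAKE_AFFINE    0x04

    int main(void)
    {
            unsigned long cflags = SD_LOAD_BALANCE | SD_WAKE_AFFINE;  /* child */
            unsigned long pflags = SD_LOAD_BALANCE;                   /* parent */

            /* Non-zero means the parent has a flag the child lacks, so keep it. */
            printf("parent adds flags: %s\n", (~cflags & pflags) ? "yes" : "no");

            pflags |= SD_BALANCE_EXEC;      /* now the parent does add behaviour */
            printf("parent adds flags: %s\n", (~cflags & pflags) ? "yes" : "no");
            return 0;
    }
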
6645 6645
6646 static void free_rootdomain(struct root_domain *rd) 6646 static void free_rootdomain(struct root_domain *rd)
6647 { 6647 {
6648 synchronize_sched(); 6648 synchronize_sched();
6649 6649
6650 cpupri_cleanup(&rd->cpupri); 6650 cpupri_cleanup(&rd->cpupri);
6651 6651
6652 free_cpumask_var(rd->rto_mask); 6652 free_cpumask_var(rd->rto_mask);
6653 free_cpumask_var(rd->online); 6653 free_cpumask_var(rd->online);
6654 free_cpumask_var(rd->span); 6654 free_cpumask_var(rd->span);
6655 kfree(rd); 6655 kfree(rd);
6656 } 6656 }
6657 6657
6658 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6658 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6659 { 6659 {
6660 struct root_domain *old_rd = NULL; 6660 struct root_domain *old_rd = NULL;
6661 unsigned long flags; 6661 unsigned long flags;
6662 6662
6663 raw_spin_lock_irqsave(&rq->lock, flags); 6663 raw_spin_lock_irqsave(&rq->lock, flags);
6664 6664
6665 if (rq->rd) { 6665 if (rq->rd) {
6666 old_rd = rq->rd; 6666 old_rd = rq->rd;
6667 6667
6668 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 6668 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6669 set_rq_offline(rq); 6669 set_rq_offline(rq);
6670 6670
6671 cpumask_clear_cpu(rq->cpu, old_rd->span); 6671 cpumask_clear_cpu(rq->cpu, old_rd->span);
6672 6672
6673 /* 6673 /*
6674 		 * If we don't want to free the old_rd yet then		6674 		 * If we don't want to free the old_rd yet then
6675 * set old_rd to NULL to skip the freeing later 6675 * set old_rd to NULL to skip the freeing later
6676 * in this function: 6676 * in this function:
6677 */ 6677 */
6678 if (!atomic_dec_and_test(&old_rd->refcount)) 6678 if (!atomic_dec_and_test(&old_rd->refcount))
6679 old_rd = NULL; 6679 old_rd = NULL;
6680 } 6680 }
6681 6681
6682 atomic_inc(&rd->refcount); 6682 atomic_inc(&rd->refcount);
6683 rq->rd = rd; 6683 rq->rd = rd;
6684 6684
6685 cpumask_set_cpu(rq->cpu, rd->span); 6685 cpumask_set_cpu(rq->cpu, rd->span);
6686 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 6686 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6687 set_rq_online(rq); 6687 set_rq_online(rq);
6688 6688
6689 raw_spin_unlock_irqrestore(&rq->lock, flags); 6689 raw_spin_unlock_irqrestore(&rq->lock, flags);
6690 6690
6691 if (old_rd) 6691 if (old_rd)
6692 free_rootdomain(old_rd); 6692 free_rootdomain(old_rd);
6693 } 6693 }
6694 6694
6695 static int init_rootdomain(struct root_domain *rd) 6695 static int init_rootdomain(struct root_domain *rd)
6696 { 6696 {
6697 memset(rd, 0, sizeof(*rd)); 6697 memset(rd, 0, sizeof(*rd));
6698 6698
6699 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 6699 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6700 goto out; 6700 goto out;
6701 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 6701 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6702 goto free_span; 6702 goto free_span;
6703 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 6703 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6704 goto free_online; 6704 goto free_online;
6705 6705
6706 if (cpupri_init(&rd->cpupri) != 0) 6706 if (cpupri_init(&rd->cpupri) != 0)
6707 goto free_rto_mask; 6707 goto free_rto_mask;
6708 return 0; 6708 return 0;
6709 6709
6710 free_rto_mask: 6710 free_rto_mask:
6711 free_cpumask_var(rd->rto_mask); 6711 free_cpumask_var(rd->rto_mask);
6712 free_online: 6712 free_online:
6713 free_cpumask_var(rd->online); 6713 free_cpumask_var(rd->online);
6714 free_span: 6714 free_span:
6715 free_cpumask_var(rd->span); 6715 free_cpumask_var(rd->span);
6716 out: 6716 out:
6717 return -ENOMEM; 6717 return -ENOMEM;
6718 } 6718 }
6719 6719
6720 static void init_defrootdomain(void) 6720 static void init_defrootdomain(void)
6721 { 6721 {
6722 init_rootdomain(&def_root_domain); 6722 init_rootdomain(&def_root_domain);
6723 6723
6724 atomic_set(&def_root_domain.refcount, 1); 6724 atomic_set(&def_root_domain.refcount, 1);
6725 } 6725 }
6726 6726
6727 static struct root_domain *alloc_rootdomain(void) 6727 static struct root_domain *alloc_rootdomain(void)
6728 { 6728 {
6729 struct root_domain *rd; 6729 struct root_domain *rd;
6730 6730
6731 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 6731 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6732 if (!rd) 6732 if (!rd)
6733 return NULL; 6733 return NULL;
6734 6734
6735 if (init_rootdomain(rd) != 0) { 6735 if (init_rootdomain(rd) != 0) {
6736 kfree(rd); 6736 kfree(rd);
6737 return NULL; 6737 return NULL;
6738 } 6738 }
6739 6739
6740 return rd; 6740 return rd;
6741 } 6741 }
6742 6742
6743 /* 6743 /*
6744 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6744 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6745 * hold the hotplug lock. 6745 * hold the hotplug lock.
6746 */ 6746 */
6747 static void 6747 static void
6748 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 6748 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6749 { 6749 {
6750 struct rq *rq = cpu_rq(cpu); 6750 struct rq *rq = cpu_rq(cpu);
6751 struct sched_domain *tmp; 6751 struct sched_domain *tmp;
6752 6752
6753 for (tmp = sd; tmp; tmp = tmp->parent) 6753 for (tmp = sd; tmp; tmp = tmp->parent)
6754 tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); 6754 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6755 6755
6756 /* Remove the sched domains which do not contribute to scheduling. */ 6756 /* Remove the sched domains which do not contribute to scheduling. */
6757 for (tmp = sd; tmp; ) { 6757 for (tmp = sd; tmp; ) {
6758 struct sched_domain *parent = tmp->parent; 6758 struct sched_domain *parent = tmp->parent;
6759 if (!parent) 6759 if (!parent)
6760 break; 6760 break;
6761 6761
6762 if (sd_parent_degenerate(tmp, parent)) { 6762 if (sd_parent_degenerate(tmp, parent)) {
6763 tmp->parent = parent->parent; 6763 tmp->parent = parent->parent;
6764 if (parent->parent) 6764 if (parent->parent)
6765 parent->parent->child = tmp; 6765 parent->parent->child = tmp;
6766 } else 6766 } else
6767 tmp = tmp->parent; 6767 tmp = tmp->parent;
6768 } 6768 }
6769 6769
6770 if (sd && sd_degenerate(sd)) { 6770 if (sd && sd_degenerate(sd)) {
6771 sd = sd->parent; 6771 sd = sd->parent;
6772 if (sd) 6772 if (sd)
6773 sd->child = NULL; 6773 sd->child = NULL;
6774 } 6774 }
6775 6775
6776 sched_domain_debug(sd, cpu); 6776 sched_domain_debug(sd, cpu);
6777 6777
6778 rq_attach_root(rq, rd); 6778 rq_attach_root(rq, rd);
6779 rcu_assign_pointer(rq->sd, sd); 6779 rcu_assign_pointer(rq->sd, sd);
6780 } 6780 }
6781 6781
6782 /* cpus with isolated domains */ 6782 /* cpus with isolated domains */
6783 static cpumask_var_t cpu_isolated_map; 6783 static cpumask_var_t cpu_isolated_map;
6784 6784
6785 /* Setup the mask of cpus configured for isolated domains */ 6785 /* Setup the mask of cpus configured for isolated domains */
6786 static int __init isolated_cpu_setup(char *str) 6786 static int __init isolated_cpu_setup(char *str)
6787 { 6787 {
6788 alloc_bootmem_cpumask_var(&cpu_isolated_map); 6788 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6789 cpulist_parse(str, cpu_isolated_map); 6789 cpulist_parse(str, cpu_isolated_map);
6790 return 1; 6790 return 1;
6791 } 6791 }
6792 6792
6793 __setup("isolcpus=", isolated_cpu_setup); 6793 __setup("isolcpus=", isolated_cpu_setup);
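
As a usage note, cpulist_parse() accepts the ordinary cpu-list syntax, so booting with something like isolcpus=1,3-5 should keep CPUs 1 and 3 through 5 out of the domains built below; tasks then land on those CPUs only through explicit CPU affinity.
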
6794 6794
6795 /* 6795 /*
6796 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6796 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6797 	 * to a function which identifies what group (along with sched group) a CPU	6797 	 * to a function which identifies what group (along with sched group) a CPU
6798 	 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids	6798 	 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6799 * (due to the fact that we keep track of groups covered with a struct cpumask). 6799 * (due to the fact that we keep track of groups covered with a struct cpumask).
6800 * 6800 *
6801 * init_sched_build_groups will build a circular linked list of the groups 6801 * init_sched_build_groups will build a circular linked list of the groups
6802 * covered by the given span, and will set each group's ->cpumask correctly, 6802 * covered by the given span, and will set each group's ->cpumask correctly,
6803 * and ->cpu_power to 0. 6803 * and ->cpu_power to 0.
6804 */ 6804 */
6805 static void 6805 static void
6806 init_sched_build_groups(const struct cpumask *span, 6806 init_sched_build_groups(const struct cpumask *span,
6807 const struct cpumask *cpu_map, 6807 const struct cpumask *cpu_map,
6808 int (*group_fn)(int cpu, const struct cpumask *cpu_map, 6808 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6809 struct sched_group **sg, 6809 struct sched_group **sg,
6810 struct cpumask *tmpmask), 6810 struct cpumask *tmpmask),
6811 struct cpumask *covered, struct cpumask *tmpmask) 6811 struct cpumask *covered, struct cpumask *tmpmask)
6812 { 6812 {
6813 struct sched_group *first = NULL, *last = NULL; 6813 struct sched_group *first = NULL, *last = NULL;
6814 int i; 6814 int i;
6815 6815
6816 cpumask_clear(covered); 6816 cpumask_clear(covered);
6817 6817
6818 for_each_cpu(i, span) { 6818 for_each_cpu(i, span) {
6819 struct sched_group *sg; 6819 struct sched_group *sg;
6820 int group = group_fn(i, cpu_map, &sg, tmpmask); 6820 int group = group_fn(i, cpu_map, &sg, tmpmask);
6821 int j; 6821 int j;
6822 6822
6823 if (cpumask_test_cpu(i, covered)) 6823 if (cpumask_test_cpu(i, covered))
6824 continue; 6824 continue;
6825 6825
6826 cpumask_clear(sched_group_cpus(sg)); 6826 cpumask_clear(sched_group_cpus(sg));
6827 sg->cpu_power = 0; 6827 sg->cpu_power = 0;
6828 6828
6829 for_each_cpu(j, span) { 6829 for_each_cpu(j, span) {
6830 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6830 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6831 continue; 6831 continue;
6832 6832
6833 cpumask_set_cpu(j, covered); 6833 cpumask_set_cpu(j, covered);
6834 cpumask_set_cpu(j, sched_group_cpus(sg)); 6834 cpumask_set_cpu(j, sched_group_cpus(sg));
6835 } 6835 }
6836 if (!first) 6836 if (!first)
6837 first = sg; 6837 first = sg;
6838 if (last) 6838 if (last)
6839 last->next = sg; 6839 last->next = sg;
6840 last = sg; 6840 last = sg;
6841 } 6841 }
6842 last->next = first; 6842 last->next = first;
6843 } 6843 }
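
Note that the groups created above are threaded onto a singly linked ring, closed by the final last->next = first, which is why code elsewhere in this file walks them with do { ... } while (group != sd->groups) rather than testing for NULL. A self-contained sketch of building and walking such a ring is below; the struct is an invented stand-in for sched_group.

    #include <stdio.h>
    #include <stdlib.h>

    struct group {                  /* stand-in for struct sched_group */
            int id;
            struct group *next;
    };

    int main(void)
    {
            struct group *first = NULL, *last = NULL;

            for (int i = 0; i < 3; i++) {           /* build three groups */
                    struct group *sg = calloc(1, sizeof(*sg));
                    sg->id = i;
                    if (!first)
                            first = sg;
                    if (last)
                            last->next = sg;
                    last = sg;
            }
            last->next = first;                     /* close the ring */

            struct group *g = first;                /* walk it exactly once */
            do {
                    printf("group %d\n", g->id);
                    g = g->next;
            } while (g != first);
            return 0;
    }
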
6844 6844
6845 #define SD_NODES_PER_DOMAIN 16 6845 #define SD_NODES_PER_DOMAIN 16
6846 6846
6847 #ifdef CONFIG_NUMA 6847 #ifdef CONFIG_NUMA
6848 6848
6849 /** 6849 /**
6850 * find_next_best_node - find the next node to include in a sched_domain 6850 * find_next_best_node - find the next node to include in a sched_domain
6851 * @node: node whose sched_domain we're building 6851 * @node: node whose sched_domain we're building
6852 * @used_nodes: nodes already in the sched_domain 6852 * @used_nodes: nodes already in the sched_domain
6853 * 6853 *
6854 * Find the next node to include in a given scheduling domain. Simply 6854 * Find the next node to include in a given scheduling domain. Simply
6855 * finds the closest node not already in the @used_nodes map. 6855 * finds the closest node not already in the @used_nodes map.
6856 * 6856 *
6857 * Should use nodemask_t. 6857 * Should use nodemask_t.
6858 */ 6858 */
6859 static int find_next_best_node(int node, nodemask_t *used_nodes) 6859 static int find_next_best_node(int node, nodemask_t *used_nodes)
6860 { 6860 {
6861 int i, n, val, min_val, best_node = 0; 6861 int i, n, val, min_val, best_node = 0;
6862 6862
6863 min_val = INT_MAX; 6863 min_val = INT_MAX;
6864 6864
6865 for (i = 0; i < nr_node_ids; i++) { 6865 for (i = 0; i < nr_node_ids; i++) {
6866 /* Start at @node */ 6866 /* Start at @node */
6867 n = (node + i) % nr_node_ids; 6867 n = (node + i) % nr_node_ids;
6868 6868
6869 if (!nr_cpus_node(n)) 6869 if (!nr_cpus_node(n))
6870 continue; 6870 continue;
6871 6871
6872 /* Skip already used nodes */ 6872 /* Skip already used nodes */
6873 if (node_isset(n, *used_nodes)) 6873 if (node_isset(n, *used_nodes))
6874 continue; 6874 continue;
6875 6875
6876 /* Simple min distance search */ 6876 /* Simple min distance search */
6877 val = node_distance(node, n); 6877 val = node_distance(node, n);
6878 6878
6879 if (val < min_val) { 6879 if (val < min_val) {
6880 min_val = val; 6880 min_val = val;
6881 best_node = n; 6881 best_node = n;
6882 } 6882 }
6883 } 6883 }
6884 6884
6885 node_set(best_node, *used_nodes); 6885 node_set(best_node, *used_nodes);
6886 return best_node; 6886 return best_node;
6887 } 6887 }
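
find_next_best_node() is a greedy nearest-neighbour step: each call scans the nodes starting at @node, skips the ones already used, and returns the one with the smallest node_distance(). A standalone sketch of the same selection over an invented three-node distance table follows; it omits the CPU-less-node check for brevity.

    #include <stdio.h>
    #include <limits.h>
    #include <stdbool.h>

    #define NR_NODES 3

    /* Invented symmetric distances; node_distance(i, j) in the kernel. */
    static const int dist[NR_NODES][NR_NODES] = {
            { 10, 20, 40 },
            { 20, 10, 30 },
            { 40, 30, 10 },
    };

    static int find_next_best_node(int node, bool used[NR_NODES])
    {
            int best = 0, min_val = INT_MAX;

            for (int i = 0; i < NR_NODES; i++) {
                    int n = (node + i) % NR_NODES;  /* start scanning at @node */

                    if (used[n])                    /* skip nodes already chosen */
                            continue;
                    if (dist[node][n] < min_val) {
                            min_val = dist[node][n];
                            best = n;
                    }
            }
            used[best] = true;
            return best;
    }

    int main(void)
    {
            bool used[NR_NODES] = { [0] = true };   /* domain starts with node 0 */

            printf("next: %d\n", find_next_best_node(0, used));  /* -> 1 (dist 20) */
            printf("next: %d\n", find_next_best_node(0, used));  /* -> 2 (dist 40) */
            return 0;
    }
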
6888 6888
6889 /** 6889 /**
6890 * sched_domain_node_span - get a cpumask for a node's sched_domain 6890 * sched_domain_node_span - get a cpumask for a node's sched_domain
6891 * @node: node whose cpumask we're constructing 6891 * @node: node whose cpumask we're constructing
6892 * @span: resulting cpumask 6892 * @span: resulting cpumask
6893 * 6893 *
6894 * Given a node, construct a good cpumask for its sched_domain to span. It 6894 * Given a node, construct a good cpumask for its sched_domain to span. It
6895 * should be one that prevents unnecessary balancing, but also spreads tasks 6895 * should be one that prevents unnecessary balancing, but also spreads tasks
6896 * out optimally. 6896 * out optimally.
6897 */ 6897 */
6898 static void sched_domain_node_span(int node, struct cpumask *span) 6898 static void sched_domain_node_span(int node, struct cpumask *span)
6899 { 6899 {
6900 nodemask_t used_nodes; 6900 nodemask_t used_nodes;
6901 int i; 6901 int i;
6902 6902
6903 cpumask_clear(span); 6903 cpumask_clear(span);
6904 nodes_clear(used_nodes); 6904 nodes_clear(used_nodes);
6905 6905
6906 cpumask_or(span, span, cpumask_of_node(node)); 6906 cpumask_or(span, span, cpumask_of_node(node));
6907 node_set(node, used_nodes); 6907 node_set(node, used_nodes);
6908 6908
6909 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6909 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6910 int next_node = find_next_best_node(node, &used_nodes); 6910 int next_node = find_next_best_node(node, &used_nodes);
6911 6911
6912 cpumask_or(span, span, cpumask_of_node(next_node)); 6912 cpumask_or(span, span, cpumask_of_node(next_node));
6913 } 6913 }
6914 } 6914 }
6915 #endif /* CONFIG_NUMA */ 6915 #endif /* CONFIG_NUMA */
6916 6916
6917 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6917 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6918 6918
6919 /* 6919 /*
6920 * The cpus mask in sched_group and sched_domain hangs off the end. 6920 * The cpus mask in sched_group and sched_domain hangs off the end.
6921 * 6921 *
6922 	 * ( See the comments in include/linux/sched.h:struct sched_group		6922 	 * ( See the comments in include/linux/sched.h:struct sched_group
6923 * and struct sched_domain. ) 6923 * and struct sched_domain. )
6924 */ 6924 */
6925 struct static_sched_group { 6925 struct static_sched_group {
6926 struct sched_group sg; 6926 struct sched_group sg;
6927 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); 6927 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6928 }; 6928 };
6929 6929
6930 struct static_sched_domain { 6930 struct static_sched_domain {
6931 struct sched_domain sd; 6931 struct sched_domain sd;
6932 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6932 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
6933 }; 6933 };
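
The "hangs off the end" layout the comment above describes is the reason the dynamic allocations later in this file use kmalloc_node(sizeof(struct sched_group) + cpumask_size(), ...): the mask occupies trailing storage rather than a separate allocation. A tiny userspace sketch of that pattern with a flexible array member, using an invented struct, not the kernel's:

    #include <stdio.h>
    #include <stdlib.h>

    /* Invented stand-in: the CPU mask lives in memory that trails the struct. */
    struct group {
            long cpu_power;
            unsigned long cpus[];   /* flexible array member */
    };

    int main(void)
    {
            size_t mask_bytes = sizeof(unsigned long);  /* room for 64 CPUs here */
            struct group *sg = calloc(1, sizeof(*sg) + mask_bytes);

            sg->cpus[0] |= 1UL << 3;        /* "set CPU 3" in the trailing mask */
            printf("mask word: %#lx\n", sg->cpus[0]);
            free(sg);
            return 0;
    }
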
6934 6934
6935 struct s_data { 6935 struct s_data {
6936 #ifdef CONFIG_NUMA 6936 #ifdef CONFIG_NUMA
6937 int sd_allnodes; 6937 int sd_allnodes;
6938 cpumask_var_t domainspan; 6938 cpumask_var_t domainspan;
6939 cpumask_var_t covered; 6939 cpumask_var_t covered;
6940 cpumask_var_t notcovered; 6940 cpumask_var_t notcovered;
6941 #endif 6941 #endif
6942 cpumask_var_t nodemask; 6942 cpumask_var_t nodemask;
6943 cpumask_var_t this_sibling_map; 6943 cpumask_var_t this_sibling_map;
6944 cpumask_var_t this_core_map; 6944 cpumask_var_t this_core_map;
6945 cpumask_var_t this_book_map; 6945 cpumask_var_t this_book_map;
6946 cpumask_var_t send_covered; 6946 cpumask_var_t send_covered;
6947 cpumask_var_t tmpmask; 6947 cpumask_var_t tmpmask;
6948 struct sched_group **sched_group_nodes; 6948 struct sched_group **sched_group_nodes;
6949 struct root_domain *rd; 6949 struct root_domain *rd;
6950 }; 6950 };
6951 6951
6952 enum s_alloc { 6952 enum s_alloc {
6953 sa_sched_groups = 0, 6953 sa_sched_groups = 0,
6954 sa_rootdomain, 6954 sa_rootdomain,
6955 sa_tmpmask, 6955 sa_tmpmask,
6956 sa_send_covered, 6956 sa_send_covered,
6957 sa_this_book_map, 6957 sa_this_book_map,
6958 sa_this_core_map, 6958 sa_this_core_map,
6959 sa_this_sibling_map, 6959 sa_this_sibling_map,
6960 sa_nodemask, 6960 sa_nodemask,
6961 sa_sched_group_nodes, 6961 sa_sched_group_nodes,
6962 #ifdef CONFIG_NUMA 6962 #ifdef CONFIG_NUMA
6963 sa_notcovered, 6963 sa_notcovered,
6964 sa_covered, 6964 sa_covered,
6965 sa_domainspan, 6965 sa_domainspan,
6966 #endif 6966 #endif
6967 sa_none, 6967 sa_none,
6968 }; 6968 };
6969 6969
6970 /* 6970 /*
6971 * SMT sched-domains: 6971 * SMT sched-domains:
6972 */ 6972 */
6973 #ifdef CONFIG_SCHED_SMT 6973 #ifdef CONFIG_SCHED_SMT
6974 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 6974 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6975 static DEFINE_PER_CPU(struct static_sched_group, sched_groups); 6975 static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6976 6976
6977 static int 6977 static int
6978 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6978 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6979 struct sched_group **sg, struct cpumask *unused) 6979 struct sched_group **sg, struct cpumask *unused)
6980 { 6980 {
6981 if (sg) 6981 if (sg)
6982 *sg = &per_cpu(sched_groups, cpu).sg; 6982 *sg = &per_cpu(sched_groups, cpu).sg;
6983 return cpu; 6983 return cpu;
6984 } 6984 }
6985 #endif /* CONFIG_SCHED_SMT */ 6985 #endif /* CONFIG_SCHED_SMT */
6986 6986
6987 /* 6987 /*
6988 * multi-core sched-domains: 6988 * multi-core sched-domains:
6989 */ 6989 */
6990 #ifdef CONFIG_SCHED_MC 6990 #ifdef CONFIG_SCHED_MC
6991 static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6991 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6992 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6992 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6993 6993
6994 static int 6994 static int
6995 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6995 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6996 struct sched_group **sg, struct cpumask *mask) 6996 struct sched_group **sg, struct cpumask *mask)
6997 { 6997 {
6998 int group; 6998 int group;
6999 #ifdef CONFIG_SCHED_SMT 6999 #ifdef CONFIG_SCHED_SMT
7000 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 7000 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7001 group = cpumask_first(mask); 7001 group = cpumask_first(mask);
7002 #else 7002 #else
7003 group = cpu; 7003 group = cpu;
7004 #endif 7004 #endif
7005 if (sg) 7005 if (sg)
7006 *sg = &per_cpu(sched_group_core, group).sg; 7006 *sg = &per_cpu(sched_group_core, group).sg;
7007 return group; 7007 return group;
7008 } 7008 }
7009 #endif /* CONFIG_SCHED_MC */ 7009 #endif /* CONFIG_SCHED_MC */
7010 7010
7011 /* 7011 /*
7012 * book sched-domains: 7012 * book sched-domains:
7013 */ 7013 */
7014 #ifdef CONFIG_SCHED_BOOK 7014 #ifdef CONFIG_SCHED_BOOK
7015 static DEFINE_PER_CPU(struct static_sched_domain, book_domains); 7015 static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
7016 static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); 7016 static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
7017 7017
7018 static int 7018 static int
7019 cpu_to_book_group(int cpu, const struct cpumask *cpu_map, 7019 cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
7020 struct sched_group **sg, struct cpumask *mask) 7020 struct sched_group **sg, struct cpumask *mask)
7021 { 7021 {
7022 int group = cpu; 7022 int group = cpu;
7023 #ifdef CONFIG_SCHED_MC 7023 #ifdef CONFIG_SCHED_MC
7024 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7024 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7025 group = cpumask_first(mask); 7025 group = cpumask_first(mask);
7026 #elif defined(CONFIG_SCHED_SMT) 7026 #elif defined(CONFIG_SCHED_SMT)
7027 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 7027 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7028 group = cpumask_first(mask); 7028 group = cpumask_first(mask);
7029 #endif 7029 #endif
7030 if (sg) 7030 if (sg)
7031 *sg = &per_cpu(sched_group_book, group).sg; 7031 *sg = &per_cpu(sched_group_book, group).sg;
7032 return group; 7032 return group;
7033 } 7033 }
7034 #endif /* CONFIG_SCHED_BOOK */ 7034 #endif /* CONFIG_SCHED_BOOK */
7035 7035
7036 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 7036 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
7037 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 7037 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
7038 7038
7039 static int 7039 static int
7040 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 7040 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7041 struct sched_group **sg, struct cpumask *mask) 7041 struct sched_group **sg, struct cpumask *mask)
7042 { 7042 {
7043 int group; 7043 int group;
7044 #ifdef CONFIG_SCHED_BOOK 7044 #ifdef CONFIG_SCHED_BOOK
7045 cpumask_and(mask, cpu_book_mask(cpu), cpu_map); 7045 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
7046 group = cpumask_first(mask); 7046 group = cpumask_first(mask);
7047 #elif defined(CONFIG_SCHED_MC) 7047 #elif defined(CONFIG_SCHED_MC)
7048 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7048 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7049 group = cpumask_first(mask); 7049 group = cpumask_first(mask);
7050 #elif defined(CONFIG_SCHED_SMT) 7050 #elif defined(CONFIG_SCHED_SMT)
7051 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 7051 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7052 group = cpumask_first(mask); 7052 group = cpumask_first(mask);
7053 #else 7053 #else
7054 group = cpu; 7054 group = cpu;
7055 #endif 7055 #endif
7056 if (sg) 7056 if (sg)
7057 *sg = &per_cpu(sched_group_phys, group).sg; 7057 *sg = &per_cpu(sched_group_phys, group).sg;
7058 return group; 7058 return group;
7059 } 7059 }
7060 7060
7061 #ifdef CONFIG_NUMA 7061 #ifdef CONFIG_NUMA
7062 /* 7062 /*
7063 * The init_sched_build_groups can't handle what we want to do with node 7063 * The init_sched_build_groups can't handle what we want to do with node
7064 * groups, so roll our own. Now each node has its own list of groups which 7064 * groups, so roll our own. Now each node has its own list of groups which
7065 * gets dynamically allocated. 7065 * gets dynamically allocated.
7066 */ 7066 */
7067 static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 7067 static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
7068 static struct sched_group ***sched_group_nodes_bycpu; 7068 static struct sched_group ***sched_group_nodes_bycpu;
7069 7069
7070 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 7070 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
7071 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 7071 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7072 7072
7073 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 7073 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7074 struct sched_group **sg, 7074 struct sched_group **sg,
7075 struct cpumask *nodemask) 7075 struct cpumask *nodemask)
7076 { 7076 {
7077 int group; 7077 int group;
7078 7078
7079 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 7079 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7080 group = cpumask_first(nodemask); 7080 group = cpumask_first(nodemask);
7081 7081
7082 if (sg) 7082 if (sg)
7083 *sg = &per_cpu(sched_group_allnodes, group).sg; 7083 *sg = &per_cpu(sched_group_allnodes, group).sg;
7084 return group; 7084 return group;
7085 } 7085 }
7086 7086
7087 static void init_numa_sched_groups_power(struct sched_group *group_head) 7087 static void init_numa_sched_groups_power(struct sched_group *group_head)
7088 { 7088 {
7089 struct sched_group *sg = group_head; 7089 struct sched_group *sg = group_head;
7090 int j; 7090 int j;
7091 7091
7092 if (!sg) 7092 if (!sg)
7093 return; 7093 return;
7094 do { 7094 do {
7095 for_each_cpu(j, sched_group_cpus(sg)) { 7095 for_each_cpu(j, sched_group_cpus(sg)) {
7096 struct sched_domain *sd; 7096 struct sched_domain *sd;
7097 7097
7098 sd = &per_cpu(phys_domains, j).sd; 7098 sd = &per_cpu(phys_domains, j).sd;
7099 if (j != group_first_cpu(sd->groups)) { 7099 if (j != group_first_cpu(sd->groups)) {
7100 /* 7100 /*
7101 * Only add "power" once for each 7101 * Only add "power" once for each
7102 * physical package. 7102 * physical package.
7103 */ 7103 */
7104 continue; 7104 continue;
7105 } 7105 }
7106 7106
7107 sg->cpu_power += sd->groups->cpu_power; 7107 sg->cpu_power += sd->groups->cpu_power;
7108 } 7108 }
7109 sg = sg->next; 7109 sg = sg->next;
7110 } while (sg != group_head); 7110 } while (sg != group_head);
7111 } 7111 }
7112 7112
7113 static int build_numa_sched_groups(struct s_data *d, 7113 static int build_numa_sched_groups(struct s_data *d,
7114 const struct cpumask *cpu_map, int num) 7114 const struct cpumask *cpu_map, int num)
7115 { 7115 {
7116 struct sched_domain *sd; 7116 struct sched_domain *sd;
7117 struct sched_group *sg, *prev; 7117 struct sched_group *sg, *prev;
7118 int n, j; 7118 int n, j;
7119 7119
7120 cpumask_clear(d->covered); 7120 cpumask_clear(d->covered);
7121 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 7121 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
7122 if (cpumask_empty(d->nodemask)) { 7122 if (cpumask_empty(d->nodemask)) {
7123 d->sched_group_nodes[num] = NULL; 7123 d->sched_group_nodes[num] = NULL;
7124 goto out; 7124 goto out;
7125 } 7125 }
7126 7126
7127 sched_domain_node_span(num, d->domainspan); 7127 sched_domain_node_span(num, d->domainspan);
7128 cpumask_and(d->domainspan, d->domainspan, cpu_map); 7128 cpumask_and(d->domainspan, d->domainspan, cpu_map);
7129 7129
7130 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7130 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7131 GFP_KERNEL, num); 7131 GFP_KERNEL, num);
7132 if (!sg) { 7132 if (!sg) {
7133 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 7133 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
7134 num); 7134 num);
7135 return -ENOMEM; 7135 return -ENOMEM;
7136 } 7136 }
7137 d->sched_group_nodes[num] = sg; 7137 d->sched_group_nodes[num] = sg;
7138 7138
7139 for_each_cpu(j, d->nodemask) { 7139 for_each_cpu(j, d->nodemask) {
7140 sd = &per_cpu(node_domains, j).sd; 7140 sd = &per_cpu(node_domains, j).sd;
7141 sd->groups = sg; 7141 sd->groups = sg;
7142 } 7142 }
7143 7143
7144 sg->cpu_power = 0; 7144 sg->cpu_power = 0;
7145 cpumask_copy(sched_group_cpus(sg), d->nodemask); 7145 cpumask_copy(sched_group_cpus(sg), d->nodemask);
7146 sg->next = sg; 7146 sg->next = sg;
7147 cpumask_or(d->covered, d->covered, d->nodemask); 7147 cpumask_or(d->covered, d->covered, d->nodemask);
7148 7148
7149 prev = sg; 7149 prev = sg;
7150 for (j = 0; j < nr_node_ids; j++) { 7150 for (j = 0; j < nr_node_ids; j++) {
7151 n = (num + j) % nr_node_ids; 7151 n = (num + j) % nr_node_ids;
7152 cpumask_complement(d->notcovered, d->covered); 7152 cpumask_complement(d->notcovered, d->covered);
7153 cpumask_and(d->tmpmask, d->notcovered, cpu_map); 7153 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7154 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); 7154 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7155 if (cpumask_empty(d->tmpmask)) 7155 if (cpumask_empty(d->tmpmask))
7156 break; 7156 break;
7157 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); 7157 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7158 if (cpumask_empty(d->tmpmask)) 7158 if (cpumask_empty(d->tmpmask))
7159 continue; 7159 continue;
7160 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7160 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7161 GFP_KERNEL, num); 7161 GFP_KERNEL, num);
7162 if (!sg) { 7162 if (!sg) {
7163 printk(KERN_WARNING 7163 printk(KERN_WARNING
7164 "Can not alloc domain group for node %d\n", j); 7164 "Can not alloc domain group for node %d\n", j);
7165 return -ENOMEM; 7165 return -ENOMEM;
7166 } 7166 }
7167 sg->cpu_power = 0; 7167 sg->cpu_power = 0;
7168 cpumask_copy(sched_group_cpus(sg), d->tmpmask); 7168 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7169 sg->next = prev->next; 7169 sg->next = prev->next;
7170 cpumask_or(d->covered, d->covered, d->tmpmask); 7170 cpumask_or(d->covered, d->covered, d->tmpmask);
7171 prev->next = sg; 7171 prev->next = sg;
7172 prev = sg; 7172 prev = sg;
7173 } 7173 }
7174 out: 7174 out:
7175 return 0; 7175 return 0;
7176 } 7176 }
7177 #endif /* CONFIG_NUMA */ 7177 #endif /* CONFIG_NUMA */
7178 7178
7179 #ifdef CONFIG_NUMA 7179 #ifdef CONFIG_NUMA
7180 /* Free memory allocated for various sched_group structures */ 7180 /* Free memory allocated for various sched_group structures */
7181 static void free_sched_groups(const struct cpumask *cpu_map, 7181 static void free_sched_groups(const struct cpumask *cpu_map,
7182 struct cpumask *nodemask) 7182 struct cpumask *nodemask)
7183 { 7183 {
7184 int cpu, i; 7184 int cpu, i;
7185 7185
7186 for_each_cpu(cpu, cpu_map) { 7186 for_each_cpu(cpu, cpu_map) {
7187 struct sched_group **sched_group_nodes 7187 struct sched_group **sched_group_nodes
7188 = sched_group_nodes_bycpu[cpu]; 7188 = sched_group_nodes_bycpu[cpu];
7189 7189
7190 if (!sched_group_nodes) 7190 if (!sched_group_nodes)
7191 continue; 7191 continue;
7192 7192
7193 for (i = 0; i < nr_node_ids; i++) { 7193 for (i = 0; i < nr_node_ids; i++) {
7194 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7194 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7195 7195
7196 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7196 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7197 if (cpumask_empty(nodemask)) 7197 if (cpumask_empty(nodemask))
7198 continue; 7198 continue;
7199 7199
7200 if (sg == NULL) 7200 if (sg == NULL)
7201 continue; 7201 continue;
7202 sg = sg->next; 7202 sg = sg->next;
7203 next_sg: 7203 next_sg:
7204 oldsg = sg; 7204 oldsg = sg;
7205 sg = sg->next; 7205 sg = sg->next;
7206 kfree(oldsg); 7206 kfree(oldsg);
7207 if (oldsg != sched_group_nodes[i]) 7207 if (oldsg != sched_group_nodes[i])
7208 goto next_sg; 7208 goto next_sg;
7209 } 7209 }
7210 kfree(sched_group_nodes); 7210 kfree(sched_group_nodes);
7211 sched_group_nodes_bycpu[cpu] = NULL; 7211 sched_group_nodes_bycpu[cpu] = NULL;
7212 } 7212 }
7213 } 7213 }
7214 #else /* !CONFIG_NUMA */ 7214 #else /* !CONFIG_NUMA */
7215 static void free_sched_groups(const struct cpumask *cpu_map, 7215 static void free_sched_groups(const struct cpumask *cpu_map,
7216 struct cpumask *nodemask) 7216 struct cpumask *nodemask)
7217 { 7217 {
7218 } 7218 }
7219 #endif /* CONFIG_NUMA */ 7219 #endif /* CONFIG_NUMA */
7220 7220
7221 /* 7221 /*
7222 * Initialize sched groups cpu_power. 7222 * Initialize sched groups cpu_power.
7223 * 7223 *
7224 * cpu_power indicates the capacity of sched group, which is used while 7224 * cpu_power indicates the capacity of sched group, which is used while
7225 * distributing the load between different sched groups in a sched domain. 7225 * distributing the load between different sched groups in a sched domain.
7226 	 * Typically cpu_power for all the groups in a sched domain will be the same unless	7226 	 * Typically cpu_power for all the groups in a sched domain will be the same unless
7227 * there are asymmetries in the topology. If there are asymmetries, group 7227 * there are asymmetries in the topology. If there are asymmetries, group
7228 	 * having more cpu_power will pick up more load compared to the group having	7228 	 * having more cpu_power will pick up more load compared to the group having
7229 * less cpu_power. 7229 * less cpu_power.
7230 */ 7230 */
7231 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7231 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7232 { 7232 {
7233 struct sched_domain *child; 7233 struct sched_domain *child;
7234 struct sched_group *group; 7234 struct sched_group *group;
7235 long power; 7235 long power;
7236 int weight; 7236 int weight;
7237 7237
7238 WARN_ON(!sd || !sd->groups); 7238 WARN_ON(!sd || !sd->groups);
7239 7239
7240 if (cpu != group_first_cpu(sd->groups)) 7240 if (cpu != group_first_cpu(sd->groups))
7241 return; 7241 return;
7242 7242
7243 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7243 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7244 7244
7245 child = sd->child; 7245 child = sd->child;
7246 7246
7247 sd->groups->cpu_power = 0; 7247 sd->groups->cpu_power = 0;
7248 7248
7249 if (!child) { 7249 if (!child) {
7250 power = SCHED_LOAD_SCALE; 7250 power = SCHED_LOAD_SCALE;
7251 weight = cpumask_weight(sched_domain_span(sd)); 7251 weight = cpumask_weight(sched_domain_span(sd));
7252 /* 7252 /*
7253 * SMT siblings share the power of a single core. 7253 * SMT siblings share the power of a single core.
7254 * Usually multiple threads get a better yield out of 7254 * Usually multiple threads get a better yield out of
7255 * that one core than a single thread would have, 7255 * that one core than a single thread would have,
7256 * reflect that in sd->smt_gain. 7256 * reflect that in sd->smt_gain.
7257 */ 7257 */
7258 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 7258 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7259 power *= sd->smt_gain; 7259 power *= sd->smt_gain;
7260 power /= weight; 7260 power /= weight;
7261 power >>= SCHED_LOAD_SHIFT; 7261 power >>= SCHED_LOAD_SHIFT;
7262 } 7262 }
7263 sd->groups->cpu_power += power; 7263 sd->groups->cpu_power += power;
7264 return; 7264 return;
7265 } 7265 }
7266 7266
7267 /* 7267 /*
7268 	 * Add cpu_power of each child group to this group's cpu_power.	7268 	 * Add cpu_power of each child group to this group's cpu_power.
7269 */ 7269 */
7270 group = child->groups; 7270 group = child->groups;
7271 do { 7271 do {
7272 sd->groups->cpu_power += group->cpu_power; 7272 sd->groups->cpu_power += group->cpu_power;
7273 group = group->next; 7273 group = group->next;
7274 } while (group != child->groups); 7274 } while (group != child->groups);
7275 } 7275 }
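
For the leaf SMT case above the arithmetic is power = SCHED_LOAD_SCALE * smt_gain / weight, shifted down by SCHED_LOAD_SHIFT. Assuming the usual constants of this era, SCHED_LOAD_SHIFT = 10 (so SCHED_LOAD_SCALE = 1024) and a default smt_gain of 1178, a core with two hardware threads leaves each sibling-level group with 589 units, so the core as a whole advertises about 1.15 CPUs of capacity rather than 2048. A sketch of the calculation, with those constants taken as assumptions:

    #include <stdio.h>

    /* Assumed constants from this kernel era (not taken from the diff). */
    #define SCHED_LOAD_SHIFT 10
    #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)   /* 1024 */
    #define SMT_GAIN 1178                               /* ~1.15 * 1024 */

    int main(void)
    {
            long power = SCHED_LOAD_SCALE;
            int weight = 2;                 /* two hardware threads per core */

            power *= SMT_GAIN;
            power /= weight;
            power >>= SCHED_LOAD_SHIFT;

            printf("per-sibling cpu_power: %ld\n", power);          /* 589 */
            printf("whole core (summed):   %ld\n", power * weight); /* 1178 */
            return 0;
    }
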
7276 7276
7277 /* 7277 /*
7278 * Initializers for schedule domains 7278 * Initializers for schedule domains
7279 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7279 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7280 */ 7280 */
7281 7281
7282 #ifdef CONFIG_SCHED_DEBUG 7282 #ifdef CONFIG_SCHED_DEBUG
7283 # define SD_INIT_NAME(sd, type) sd->name = #type 7283 # define SD_INIT_NAME(sd, type) sd->name = #type
7284 #else 7284 #else
7285 # define SD_INIT_NAME(sd, type) do { } while (0) 7285 # define SD_INIT_NAME(sd, type) do { } while (0)
7286 #endif 7286 #endif
7287 7287
7288 #define SD_INIT(sd, type) sd_init_##type(sd) 7288 #define SD_INIT(sd, type) sd_init_##type(sd)
7289 7289
7290 #define SD_INIT_FUNC(type) \ 7290 #define SD_INIT_FUNC(type) \
7291 static noinline void sd_init_##type(struct sched_domain *sd) \ 7291 static noinline void sd_init_##type(struct sched_domain *sd) \
7292 { \ 7292 { \
7293 memset(sd, 0, sizeof(*sd)); \ 7293 memset(sd, 0, sizeof(*sd)); \
7294 *sd = SD_##type##_INIT; \ 7294 *sd = SD_##type##_INIT; \
7295 sd->level = SD_LV_##type; \ 7295 sd->level = SD_LV_##type; \
7296 SD_INIT_NAME(sd, type); \ 7296 SD_INIT_NAME(sd, type); \
7297 } 7297 }
7298 7298
7299 SD_INIT_FUNC(CPU) 7299 SD_INIT_FUNC(CPU)
7300 #ifdef CONFIG_NUMA 7300 #ifdef CONFIG_NUMA
7301 SD_INIT_FUNC(ALLNODES) 7301 SD_INIT_FUNC(ALLNODES)
7302 SD_INIT_FUNC(NODE) 7302 SD_INIT_FUNC(NODE)
7303 #endif 7303 #endif
7304 #ifdef CONFIG_SCHED_SMT 7304 #ifdef CONFIG_SCHED_SMT
7305 SD_INIT_FUNC(SIBLING) 7305 SD_INIT_FUNC(SIBLING)
7306 #endif 7306 #endif
7307 #ifdef CONFIG_SCHED_MC 7307 #ifdef CONFIG_SCHED_MC
7308 SD_INIT_FUNC(MC) 7308 SD_INIT_FUNC(MC)
7309 #endif 7309 #endif
7310 #ifdef CONFIG_SCHED_BOOK 7310 #ifdef CONFIG_SCHED_BOOK
7311 SD_INIT_FUNC(BOOK) 7311 SD_INIT_FUNC(BOOK)
7312 #endif 7312 #endif
7313 7313
7314 static int default_relax_domain_level = -1; 7314 static int default_relax_domain_level = -1;
7315 7315
7316 static int __init setup_relax_domain_level(char *str) 7316 static int __init setup_relax_domain_level(char *str)
7317 { 7317 {
7318 unsigned long val; 7318 unsigned long val;
7319 7319
7320 val = simple_strtoul(str, NULL, 0); 7320 val = simple_strtoul(str, NULL, 0);
7321 if (val < SD_LV_MAX) 7321 if (val < SD_LV_MAX)
7322 default_relax_domain_level = val; 7322 default_relax_domain_level = val;
7323 7323
7324 return 1; 7324 return 1;
7325 } 7325 }
7326 __setup("relax_domain_level=", setup_relax_domain_level); 7326 __setup("relax_domain_level=", setup_relax_domain_level);
7327 7327
7328 static void set_domain_attribute(struct sched_domain *sd, 7328 static void set_domain_attribute(struct sched_domain *sd,
7329 struct sched_domain_attr *attr) 7329 struct sched_domain_attr *attr)
7330 { 7330 {
7331 int request; 7331 int request;
7332 7332
7333 if (!attr || attr->relax_domain_level < 0) { 7333 if (!attr || attr->relax_domain_level < 0) {
7334 if (default_relax_domain_level < 0) 7334 if (default_relax_domain_level < 0)
7335 return; 7335 return;
7336 else 7336 else
7337 request = default_relax_domain_level; 7337 request = default_relax_domain_level;
7338 } else 7338 } else
7339 request = attr->relax_domain_level; 7339 request = attr->relax_domain_level;
7340 if (request < sd->level) { 7340 if (request < sd->level) {
7341 /* turn off idle balance on this domain */ 7341 /* turn off idle balance on this domain */
7342 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 7342 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7343 } else { 7343 } else {
7344 /* turn on idle balance on this domain */ 7344 /* turn on idle balance on this domain */
7345 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 7345 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7346 } 7346 }
7347 } 7347 }
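/*
 * Example, assuming the boot command line "relax_domain_level=1": the
 * __setup() handler above sets default_relax_domain_level to 1, so
 * set_domain_attribute() enables SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE on
 * domains whose level is <= 1 and clears those flags on higher (wider)
 * levels, unless the passed-in attr supplies its own relax_domain_level.
 */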
7348 7348
7349 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7349 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7350 const struct cpumask *cpu_map) 7350 const struct cpumask *cpu_map)
7351 { 7351 {
7352 switch (what) { 7352 switch (what) {
7353 case sa_sched_groups: 7353 case sa_sched_groups:
7354 free_sched_groups(cpu_map, d->tmpmask); /* fall through */ 7354 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7355 d->sched_group_nodes = NULL; 7355 d->sched_group_nodes = NULL;
7356 case sa_rootdomain: 7356 case sa_rootdomain:
7357 free_rootdomain(d->rd); /* fall through */ 7357 free_rootdomain(d->rd); /* fall through */
7358 case sa_tmpmask: 7358 case sa_tmpmask:
7359 free_cpumask_var(d->tmpmask); /* fall through */ 7359 free_cpumask_var(d->tmpmask); /* fall through */
7360 case sa_send_covered: 7360 case sa_send_covered:
7361 free_cpumask_var(d->send_covered); /* fall through */ 7361 free_cpumask_var(d->send_covered); /* fall through */
7362 case sa_this_book_map: 7362 case sa_this_book_map:
7363 free_cpumask_var(d->this_book_map); /* fall through */ 7363 free_cpumask_var(d->this_book_map); /* fall through */
7364 case sa_this_core_map: 7364 case sa_this_core_map:
7365 free_cpumask_var(d->this_core_map); /* fall through */ 7365 free_cpumask_var(d->this_core_map); /* fall through */
7366 case sa_this_sibling_map: 7366 case sa_this_sibling_map:
7367 free_cpumask_var(d->this_sibling_map); /* fall through */ 7367 free_cpumask_var(d->this_sibling_map); /* fall through */
7368 case sa_nodemask: 7368 case sa_nodemask:
7369 free_cpumask_var(d->nodemask); /* fall through */ 7369 free_cpumask_var(d->nodemask); /* fall through */
7370 case sa_sched_group_nodes: 7370 case sa_sched_group_nodes:
7371 #ifdef CONFIG_NUMA 7371 #ifdef CONFIG_NUMA
7372 kfree(d->sched_group_nodes); /* fall through */ 7372 kfree(d->sched_group_nodes); /* fall through */
7373 case sa_notcovered: 7373 case sa_notcovered:
7374 free_cpumask_var(d->notcovered); /* fall through */ 7374 free_cpumask_var(d->notcovered); /* fall through */
7375 case sa_covered: 7375 case sa_covered:
7376 free_cpumask_var(d->covered); /* fall through */ 7376 free_cpumask_var(d->covered); /* fall through */
7377 case sa_domainspan: 7377 case sa_domainspan:
7378 free_cpumask_var(d->domainspan); /* fall through */ 7378 free_cpumask_var(d->domainspan); /* fall through */
7379 #endif 7379 #endif
7380 case sa_none: 7380 case sa_none:
7381 break; 7381 break;
7382 } 7382 }
7383 } 7383 }
7384 7384
7385 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7385 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7386 const struct cpumask *cpu_map) 7386 const struct cpumask *cpu_map)
7387 { 7387 {
7388 #ifdef CONFIG_NUMA 7388 #ifdef CONFIG_NUMA
7389 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7389 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
7390 return sa_none; 7390 return sa_none;
7391 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7391 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
7392 return sa_domainspan; 7392 return sa_domainspan;
7393 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7393 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
7394 return sa_covered; 7394 return sa_covered;
7395 /* Allocate the per-node list of sched groups */ 7395 /* Allocate the per-node list of sched groups */
7396 d->sched_group_nodes = kcalloc(nr_node_ids, 7396 d->sched_group_nodes = kcalloc(nr_node_ids,
7397 sizeof(struct sched_group *), GFP_KERNEL); 7397 sizeof(struct sched_group *), GFP_KERNEL);
7398 if (!d->sched_group_nodes) { 7398 if (!d->sched_group_nodes) {
7399 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7399 printk(KERN_WARNING "Can not alloc sched group node list\n");
7400 return sa_notcovered; 7400 return sa_notcovered;
7401 } 7401 }
7402 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; 7402 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7403 #endif 7403 #endif
7404 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) 7404 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7405 return sa_sched_group_nodes; 7405 return sa_sched_group_nodes;
7406 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) 7406 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7407 return sa_nodemask; 7407 return sa_nodemask;
7408 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7408 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7409 return sa_this_sibling_map; 7409 return sa_this_sibling_map;
7410 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) 7410 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7411 return sa_this_core_map; 7411 return sa_this_core_map;
7412 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7412 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7413 return sa_this_book_map; 7413 return sa_this_book_map;
7414 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7414 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7415 return sa_send_covered; 7415 return sa_send_covered;
7416 d->rd = alloc_rootdomain(); 7416 d->rd = alloc_rootdomain();
7417 if (!d->rd) { 7417 if (!d->rd) {
7418 printk(KERN_WARNING "Cannot alloc root domain\n"); 7418 printk(KERN_WARNING "Cannot alloc root domain\n");
7419 return sa_tmpmask; 7419 return sa_tmpmask;
7420 } 7420 }
7421 return sa_rootdomain; 7421 return sa_rootdomain;
7422 } 7422 }
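/*
 * The enum s_alloc value returned above names the first allocation that
 * failed; the fall-through switch in __free_domain_allocs() then releases
 * exactly what was allocated before that point. For example, if
 * alloc_cpumask_var(&d->tmpmask, ...) fails, sa_send_covered is returned
 * and __free_domain_allocs(d, sa_send_covered, cpu_map) frees
 * send_covered, this_book_map, this_core_map, this_sibling_map, nodemask
 * and (on NUMA) the node allocations, but not the never-allocated tmpmask.
 */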
7423 7423
7424 static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7424 static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
7425 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7425 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
7426 { 7426 {
7427 struct sched_domain *sd = NULL; 7427 struct sched_domain *sd = NULL;
7428 #ifdef CONFIG_NUMA 7428 #ifdef CONFIG_NUMA
7429 struct sched_domain *parent; 7429 struct sched_domain *parent;
7430 7430
7431 d->sd_allnodes = 0; 7431 d->sd_allnodes = 0;
7432 if (cpumask_weight(cpu_map) > 7432 if (cpumask_weight(cpu_map) >
7433 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { 7433 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7434 sd = &per_cpu(allnodes_domains, i).sd; 7434 sd = &per_cpu(allnodes_domains, i).sd;
7435 SD_INIT(sd, ALLNODES); 7435 SD_INIT(sd, ALLNODES);
7436 set_domain_attribute(sd, attr); 7436 set_domain_attribute(sd, attr);
7437 cpumask_copy(sched_domain_span(sd), cpu_map); 7437 cpumask_copy(sched_domain_span(sd), cpu_map);
7438 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); 7438 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7439 d->sd_allnodes = 1; 7439 d->sd_allnodes = 1;
7440 } 7440 }
7441 parent = sd; 7441 parent = sd;
7442 7442
7443 sd = &per_cpu(node_domains, i).sd; 7443 sd = &per_cpu(node_domains, i).sd;
7444 SD_INIT(sd, NODE); 7444 SD_INIT(sd, NODE);
7445 set_domain_attribute(sd, attr); 7445 set_domain_attribute(sd, attr);
7446 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 7446 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7447 sd->parent = parent; 7447 sd->parent = parent;
7448 if (parent) 7448 if (parent)
7449 parent->child = sd; 7449 parent->child = sd;
7450 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); 7450 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7451 #endif 7451 #endif
7452 return sd; 7452 return sd;
7453 } 7453 }
7454 7454
7455 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7455 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
7456 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7456 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7457 struct sched_domain *parent, int i) 7457 struct sched_domain *parent, int i)
7458 { 7458 {
7459 struct sched_domain *sd; 7459 struct sched_domain *sd;
7460 sd = &per_cpu(phys_domains, i).sd; 7460 sd = &per_cpu(phys_domains, i).sd;
7461 SD_INIT(sd, CPU); 7461 SD_INIT(sd, CPU);
7462 set_domain_attribute(sd, attr); 7462 set_domain_attribute(sd, attr);
7463 cpumask_copy(sched_domain_span(sd), d->nodemask); 7463 cpumask_copy(sched_domain_span(sd), d->nodemask);
7464 sd->parent = parent; 7464 sd->parent = parent;
7465 if (parent) 7465 if (parent)
7466 parent->child = sd; 7466 parent->child = sd;
7467 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); 7467 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7468 return sd; 7468 return sd;
7469 } 7469 }
7470 7470
7471 static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7471 static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7472 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7472 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7473 struct sched_domain *parent, int i) 7473 struct sched_domain *parent, int i)
7474 { 7474 {
7475 struct sched_domain *sd = parent; 7475 struct sched_domain *sd = parent;
7476 #ifdef CONFIG_SCHED_BOOK 7476 #ifdef CONFIG_SCHED_BOOK
7477 sd = &per_cpu(book_domains, i).sd; 7477 sd = &per_cpu(book_domains, i).sd;
7478 SD_INIT(sd, BOOK); 7478 SD_INIT(sd, BOOK);
7479 set_domain_attribute(sd, attr); 7479 set_domain_attribute(sd, attr);
7480 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); 7480 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7481 sd->parent = parent; 7481 sd->parent = parent;
7482 parent->child = sd; 7482 parent->child = sd;
7483 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); 7483 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7484 #endif 7484 #endif
7485 return sd; 7485 return sd;
7486 } 7486 }
7487 7487
7488 static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7488 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
7489 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7489 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7490 struct sched_domain *parent, int i) 7490 struct sched_domain *parent, int i)
7491 { 7491 {
7492 struct sched_domain *sd = parent; 7492 struct sched_domain *sd = parent;
7493 #ifdef CONFIG_SCHED_MC 7493 #ifdef CONFIG_SCHED_MC
7494 sd = &per_cpu(core_domains, i).sd; 7494 sd = &per_cpu(core_domains, i).sd;
7495 SD_INIT(sd, MC); 7495 SD_INIT(sd, MC);
7496 set_domain_attribute(sd, attr); 7496 set_domain_attribute(sd, attr);
7497 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); 7497 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7498 sd->parent = parent; 7498 sd->parent = parent;
7499 parent->child = sd; 7499 parent->child = sd;
7500 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); 7500 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7501 #endif 7501 #endif
7502 return sd; 7502 return sd;
7503 } 7503 }
7504 7504
7505 static struct sched_domain *__build_smt_sched_domain(struct s_data *d, 7505 static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7506 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7506 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7507 struct sched_domain *parent, int i) 7507 struct sched_domain *parent, int i)
7508 { 7508 {
7509 struct sched_domain *sd = parent; 7509 struct sched_domain *sd = parent;
7510 #ifdef CONFIG_SCHED_SMT 7510 #ifdef CONFIG_SCHED_SMT
7511 sd = &per_cpu(cpu_domains, i).sd; 7511 sd = &per_cpu(cpu_domains, i).sd;
7512 SD_INIT(sd, SIBLING); 7512 SD_INIT(sd, SIBLING);
7513 set_domain_attribute(sd, attr); 7513 set_domain_attribute(sd, attr);
7514 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); 7514 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7515 sd->parent = parent; 7515 sd->parent = parent;
7516 parent->child = sd; 7516 parent->child = sd;
7517 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); 7517 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7518 #endif 7518 #endif
7519 return sd; 7519 return sd;
7520 } 7520 }
7521 7521
7522 static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7522 static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7523 const struct cpumask *cpu_map, int cpu) 7523 const struct cpumask *cpu_map, int cpu)
7524 { 7524 {
7525 switch (l) { 7525 switch (l) {
7526 #ifdef CONFIG_SCHED_SMT 7526 #ifdef CONFIG_SCHED_SMT
7527 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7527 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
7528 cpumask_and(d->this_sibling_map, cpu_map, 7528 cpumask_and(d->this_sibling_map, cpu_map,
7529 topology_thread_cpumask(cpu)); 7529 topology_thread_cpumask(cpu));
7530 if (cpu == cpumask_first(d->this_sibling_map)) 7530 if (cpu == cpumask_first(d->this_sibling_map))
7531 init_sched_build_groups(d->this_sibling_map, cpu_map, 7531 init_sched_build_groups(d->this_sibling_map, cpu_map,
7532 &cpu_to_cpu_group, 7532 &cpu_to_cpu_group,
7533 d->send_covered, d->tmpmask); 7533 d->send_covered, d->tmpmask);
7534 break; 7534 break;
7535 #endif 7535 #endif
7536 #ifdef CONFIG_SCHED_MC 7536 #ifdef CONFIG_SCHED_MC
7537 case SD_LV_MC: /* set up multi-core groups */ 7537 case SD_LV_MC: /* set up multi-core groups */
7538 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); 7538 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7539 if (cpu == cpumask_first(d->this_core_map)) 7539 if (cpu == cpumask_first(d->this_core_map))
7540 init_sched_build_groups(d->this_core_map, cpu_map, 7540 init_sched_build_groups(d->this_core_map, cpu_map,
7541 &cpu_to_core_group, 7541 &cpu_to_core_group,
7542 d->send_covered, d->tmpmask); 7542 d->send_covered, d->tmpmask);
7543 break; 7543 break;
7544 #endif 7544 #endif
7545 #ifdef CONFIG_SCHED_BOOK 7545 #ifdef CONFIG_SCHED_BOOK
7546 case SD_LV_BOOK: /* set up book groups */ 7546 case SD_LV_BOOK: /* set up book groups */
7547 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); 7547 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7548 if (cpu == cpumask_first(d->this_book_map)) 7548 if (cpu == cpumask_first(d->this_book_map))
7549 init_sched_build_groups(d->this_book_map, cpu_map, 7549 init_sched_build_groups(d->this_book_map, cpu_map,
7550 &cpu_to_book_group, 7550 &cpu_to_book_group,
7551 d->send_covered, d->tmpmask); 7551 d->send_covered, d->tmpmask);
7552 break; 7552 break;
7553 #endif 7553 #endif
7554 case SD_LV_CPU: /* set up physical groups */ 7554 case SD_LV_CPU: /* set up physical groups */
7555 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7555 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7556 if (!cpumask_empty(d->nodemask)) 7556 if (!cpumask_empty(d->nodemask))
7557 init_sched_build_groups(d->nodemask, cpu_map, 7557 init_sched_build_groups(d->nodemask, cpu_map,
7558 &cpu_to_phys_group, 7558 &cpu_to_phys_group,
7559 d->send_covered, d->tmpmask); 7559 d->send_covered, d->tmpmask);
7560 break; 7560 break;
7561 #ifdef CONFIG_NUMA 7561 #ifdef CONFIG_NUMA
7562 case SD_LV_ALLNODES: 7562 case SD_LV_ALLNODES:
7563 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7563 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
7564 d->send_covered, d->tmpmask); 7564 d->send_covered, d->tmpmask);
7565 break; 7565 break;
7566 #endif 7566 #endif
7567 default: 7567 default:
7568 break; 7568 break;
7569 } 7569 }
7570 } 7570 }
7571 7571
7572 /* 7572 /*
7573 * Build sched domains for a given set of cpus and attach the sched domains 7573 * Build sched domains for a given set of cpus and attach the sched domains
7574 * to the individual cpus 7574 * to the individual cpus
7575 */ 7575 */
7576 static int __build_sched_domains(const struct cpumask *cpu_map, 7576 static int __build_sched_domains(const struct cpumask *cpu_map,
7577 struct sched_domain_attr *attr) 7577 struct sched_domain_attr *attr)
7578 { 7578 {
7579 enum s_alloc alloc_state = sa_none; 7579 enum s_alloc alloc_state = sa_none;
7580 struct s_data d; 7580 struct s_data d;
7581 struct sched_domain *sd; 7581 struct sched_domain *sd;
7582 int i; 7582 int i;
7583 #ifdef CONFIG_NUMA 7583 #ifdef CONFIG_NUMA
7584 d.sd_allnodes = 0; 7584 d.sd_allnodes = 0;
7585 #endif 7585 #endif
7586 7586
7587 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7587 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7588 if (alloc_state != sa_rootdomain) 7588 if (alloc_state != sa_rootdomain)
7589 goto error; 7589 goto error;
7590 alloc_state = sa_sched_groups; 7590 alloc_state = sa_sched_groups;
7591 7591
7592 /* 7592 /*
7593 * Set up domains for cpus specified by the cpu_map. 7593 * Set up domains for cpus specified by the cpu_map.
7594 */ 7594 */
7595 for_each_cpu(i, cpu_map) { 7595 for_each_cpu(i, cpu_map) {
7596 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7596 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7597 cpu_map); 7597 cpu_map);
7598 7598
7599 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7599 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7600 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7600 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7601 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); 7601 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7602 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7602 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7603 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7603 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7604 } 7604 }
7605 7605
7606 for_each_cpu(i, cpu_map) { 7606 for_each_cpu(i, cpu_map) {
7607 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7607 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7608 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); 7608 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7609 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7609 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7610 } 7610 }
7611 7611
7612 /* Set up physical groups */ 7612 /* Set up physical groups */
7613 for (i = 0; i < nr_node_ids; i++) 7613 for (i = 0; i < nr_node_ids; i++)
7614 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7614 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7615 7615
7616 #ifdef CONFIG_NUMA 7616 #ifdef CONFIG_NUMA
7617 /* Set up node groups */ 7617 /* Set up node groups */
7618 if (d.sd_allnodes) 7618 if (d.sd_allnodes)
7619 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); 7619 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7620 7620
7621 for (i = 0; i < nr_node_ids; i++) 7621 for (i = 0; i < nr_node_ids; i++)
7622 if (build_numa_sched_groups(&d, cpu_map, i)) 7622 if (build_numa_sched_groups(&d, cpu_map, i))
7623 goto error; 7623 goto error;
7624 #endif 7624 #endif
7625 7625
7626 /* Calculate CPU power for physical packages and nodes */ 7626 /* Calculate CPU power for physical packages and nodes */
7627 #ifdef CONFIG_SCHED_SMT 7627 #ifdef CONFIG_SCHED_SMT
7628 for_each_cpu(i, cpu_map) { 7628 for_each_cpu(i, cpu_map) {
7629 sd = &per_cpu(cpu_domains, i).sd; 7629 sd = &per_cpu(cpu_domains, i).sd;
7630 init_sched_groups_power(i, sd); 7630 init_sched_groups_power(i, sd);
7631 } 7631 }
7632 #endif 7632 #endif
7633 #ifdef CONFIG_SCHED_MC 7633 #ifdef CONFIG_SCHED_MC
7634 for_each_cpu(i, cpu_map) { 7634 for_each_cpu(i, cpu_map) {
7635 sd = &per_cpu(core_domains, i).sd; 7635 sd = &per_cpu(core_domains, i).sd;
7636 init_sched_groups_power(i, sd); 7636 init_sched_groups_power(i, sd);
7637 } 7637 }
7638 #endif 7638 #endif
7639 #ifdef CONFIG_SCHED_BOOK 7639 #ifdef CONFIG_SCHED_BOOK
7640 for_each_cpu(i, cpu_map) { 7640 for_each_cpu(i, cpu_map) {
7641 sd = &per_cpu(book_domains, i).sd; 7641 sd = &per_cpu(book_domains, i).sd;
7642 init_sched_groups_power(i, sd); 7642 init_sched_groups_power(i, sd);
7643 } 7643 }
7644 #endif 7644 #endif
7645 7645
7646 for_each_cpu(i, cpu_map) { 7646 for_each_cpu(i, cpu_map) {
7647 sd = &per_cpu(phys_domains, i).sd; 7647 sd = &per_cpu(phys_domains, i).sd;
7648 init_sched_groups_power(i, sd); 7648 init_sched_groups_power(i, sd);
7649 } 7649 }
7650 7650
7651 #ifdef CONFIG_NUMA 7651 #ifdef CONFIG_NUMA
7652 for (i = 0; i < nr_node_ids; i++) 7652 for (i = 0; i < nr_node_ids; i++)
7653 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7653 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7654 7654
7655 if (d.sd_allnodes) { 7655 if (d.sd_allnodes) {
7656 struct sched_group *sg; 7656 struct sched_group *sg;
7657 7657
7658 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7658 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7659 d.tmpmask); 7659 d.tmpmask);
7660 init_numa_sched_groups_power(sg); 7660 init_numa_sched_groups_power(sg);
7661 } 7661 }
7662 #endif 7662 #endif
7663 7663
7664 /* Attach the domains */ 7664 /* Attach the domains */
7665 for_each_cpu(i, cpu_map) { 7665 for_each_cpu(i, cpu_map) {
7666 #ifdef CONFIG_SCHED_SMT 7666 #ifdef CONFIG_SCHED_SMT
7667 sd = &per_cpu(cpu_domains, i).sd; 7667 sd = &per_cpu(cpu_domains, i).sd;
7668 #elif defined(CONFIG_SCHED_MC) 7668 #elif defined(CONFIG_SCHED_MC)
7669 sd = &per_cpu(core_domains, i).sd; 7669 sd = &per_cpu(core_domains, i).sd;
7670 #elif defined(CONFIG_SCHED_BOOK) 7670 #elif defined(CONFIG_SCHED_BOOK)
7671 sd = &per_cpu(book_domains, i).sd; 7671 sd = &per_cpu(book_domains, i).sd;
7672 #else 7672 #else
7673 sd = &per_cpu(phys_domains, i).sd; 7673 sd = &per_cpu(phys_domains, i).sd;
7674 #endif 7674 #endif
7675 cpu_attach_domain(sd, d.rd, i); 7675 cpu_attach_domain(sd, d.rd, i);
7676 } 7676 }
7677 7677
7678 d.sched_group_nodes = NULL; /* don't free this, we still need it */ 7678 d.sched_group_nodes = NULL; /* don't free this, we still need it */
7679 __free_domain_allocs(&d, sa_tmpmask, cpu_map); 7679 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7680 return 0; 7680 return 0;
7681 7681
7682 error: 7682 error:
7683 __free_domain_allocs(&d, alloc_state, cpu_map); 7683 __free_domain_allocs(&d, alloc_state, cpu_map);
7684 return -ENOMEM; 7684 return -ENOMEM;
7685 } 7685 }
7686 7686
7687 static int build_sched_domains(const struct cpumask *cpu_map) 7687 static int build_sched_domains(const struct cpumask *cpu_map)
7688 { 7688 {
7689 return __build_sched_domains(cpu_map, NULL); 7689 return __build_sched_domains(cpu_map, NULL);
7690 } 7690 }
7691 7691
7692 static cpumask_var_t *doms_cur; /* current sched domains */ 7692 static cpumask_var_t *doms_cur; /* current sched domains */
7693 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7693 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7694 static struct sched_domain_attr *dattr_cur; 7694 static struct sched_domain_attr *dattr_cur;
7695 /* attributes of custom domains in 'doms_cur' */ 7695 /* attributes of custom domains in 'doms_cur' */
7696 7696
7697 /* 7697 /*
7698 * Special case: If a kmalloc of a doms_cur partition (array of 7698 * Special case: If a kmalloc of a doms_cur partition (array of
7699 * cpumask) fails, then fall back to a single sched domain, 7699 * cpumask) fails, then fall back to a single sched domain,
7700 * as determined by the single cpumask fallback_doms. 7700 * as determined by the single cpumask fallback_doms.
7701 */ 7701 */
7702 static cpumask_var_t fallback_doms; 7702 static cpumask_var_t fallback_doms;
7703 7703
7704 /* 7704 /*
7705 * arch_update_cpu_topology lets virtualized architectures update the 7705 * arch_update_cpu_topology lets virtualized architectures update the
7706 * cpu core maps. It is supposed to return 1 if the topology changed 7706 * cpu core maps. It is supposed to return 1 if the topology changed
7707 * or 0 if it stayed the same. 7707 * or 0 if it stayed the same.
7708 */ 7708 */
7709 int __attribute__((weak)) arch_update_cpu_topology(void) 7709 int __attribute__((weak)) arch_update_cpu_topology(void)
7710 { 7710 {
7711 return 0; 7711 return 0;
7712 } 7712 }
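/*
 * A hypothetical architecture override (sketch only; the helper name is
 * assumed) would provide a non-weak definition reporting whether the
 * core maps changed:
 *
 *	int arch_update_cpu_topology(void)
 *	{
 *		return my_arch_refresh_core_maps() ? 1 : 0;
 *	}
 */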
7713 7713
7714 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 7714 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7715 { 7715 {
7716 int i; 7716 int i;
7717 cpumask_var_t *doms; 7717 cpumask_var_t *doms;
7718 7718
7719 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 7719 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7720 if (!doms) 7720 if (!doms)
7721 return NULL; 7721 return NULL;
7722 for (i = 0; i < ndoms; i++) { 7722 for (i = 0; i < ndoms; i++) {
7723 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 7723 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7724 free_sched_domains(doms, i); 7724 free_sched_domains(doms, i);
7725 return NULL; 7725 return NULL;
7726 } 7726 }
7727 } 7727 }
7728 return doms; 7728 return doms;
7729 } 7729 }
7730 7730
7731 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 7731 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7732 { 7732 {
7733 unsigned int i; 7733 unsigned int i;
7734 for (i = 0; i < ndoms; i++) 7734 for (i = 0; i < ndoms; i++)
7735 free_cpumask_var(doms[i]); 7735 free_cpumask_var(doms[i]);
7736 kfree(doms); 7736 kfree(doms);
7737 } 7737 }
7738 7738
7739 /* 7739 /*
7740 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7740 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7741 * For now this just excludes isolated cpus, but could be used to 7741 * For now this just excludes isolated cpus, but could be used to
7742 * exclude other special cases in the future. 7742 * exclude other special cases in the future.
7743 */ 7743 */
7744 static int arch_init_sched_domains(const struct cpumask *cpu_map) 7744 static int arch_init_sched_domains(const struct cpumask *cpu_map)
7745 { 7745 {
7746 int err; 7746 int err;
7747 7747
7748 arch_update_cpu_topology(); 7748 arch_update_cpu_topology();
7749 ndoms_cur = 1; 7749 ndoms_cur = 1;
7750 doms_cur = alloc_sched_domains(ndoms_cur); 7750 doms_cur = alloc_sched_domains(ndoms_cur);
7751 if (!doms_cur) 7751 if (!doms_cur)
7752 doms_cur = &fallback_doms; 7752 doms_cur = &fallback_doms;
7753 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7753 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7754 dattr_cur = NULL; 7754 dattr_cur = NULL;
7755 err = build_sched_domains(doms_cur[0]); 7755 err = build_sched_domains(doms_cur[0]);
7756 register_sched_domain_sysctl(); 7756 register_sched_domain_sysctl();
7757 7757
7758 return err; 7758 return err;
7759 } 7759 }
7760 7760
7761 static void arch_destroy_sched_domains(const struct cpumask *cpu_map, 7761 static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7762 struct cpumask *tmpmask) 7762 struct cpumask *tmpmask)
7763 { 7763 {
7764 free_sched_groups(cpu_map, tmpmask); 7764 free_sched_groups(cpu_map, tmpmask);
7765 } 7765 }
7766 7766
7767 /* 7767 /*
7768 * Detach sched domains from a group of cpus specified in cpu_map 7768 * Detach sched domains from a group of cpus specified in cpu_map
7769 * These cpus will now be attached to the NULL domain 7769 * These cpus will now be attached to the NULL domain
7770 */ 7770 */
7771 static void detach_destroy_domains(const struct cpumask *cpu_map) 7771 static void detach_destroy_domains(const struct cpumask *cpu_map)
7772 { 7772 {
7773 /* Static tmpmask is safe because the hotplug lock is held. */ 7773 /* Static tmpmask is safe because the hotplug lock is held. */
7774 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); 7774 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7775 int i; 7775 int i;
7776 7776
7777 for_each_cpu(i, cpu_map) 7777 for_each_cpu(i, cpu_map)
7778 cpu_attach_domain(NULL, &def_root_domain, i); 7778 cpu_attach_domain(NULL, &def_root_domain, i);
7779 synchronize_sched(); 7779 synchronize_sched();
7780 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); 7780 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7781 } 7781 }
7782 7782
7783 /* handle null as "default" */ 7783 /* handle null as "default" */
7784 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 7784 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7785 struct sched_domain_attr *new, int idx_new) 7785 struct sched_domain_attr *new, int idx_new)
7786 { 7786 {
7787 struct sched_domain_attr tmp; 7787 struct sched_domain_attr tmp;
7788 7788
7789 /* fast path */ 7789 /* fast path */
7790 if (!new && !cur) 7790 if (!new && !cur)
7791 return 1; 7791 return 1;
7792 7792
7793 tmp = SD_ATTR_INIT; 7793 tmp = SD_ATTR_INIT;
7794 return !memcmp(cur ? (cur + idx_cur) : &tmp, 7794 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7795 new ? (new + idx_new) : &tmp, 7795 new ? (new + idx_new) : &tmp,
7796 sizeof(struct sched_domain_attr)); 7796 sizeof(struct sched_domain_attr));
7797 } 7797 }
7798 7798
7799 /* 7799 /*
7800 * Partition sched domains as specified by the 'ndoms_new' 7800 * Partition sched domains as specified by the 'ndoms_new'
7801 * cpumasks in the array doms_new[]. This compares 7801 * cpumasks in the array doms_new[]. This compares
7802 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7802 * doms_new[] to the current sched domain partitioning, doms_cur[].
7803 * It destroys each deleted domain and builds each new domain. 7803 * It destroys each deleted domain and builds each new domain.
7804 * 7804 *
7805 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 7805 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
7806 * The masks don't intersect (don't overlap). We should set up one 7806 * The masks don't intersect (don't overlap). We should set up one
7807 * sched domain for each mask. CPUs not in any of the cpumasks will 7807 * sched domain for each mask. CPUs not in any of the cpumasks will
7808 * not be load balanced. If the same cpumask appears both in the 7808 * not be load balanced. If the same cpumask appears both in the
7809 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7809 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7810 * it as it is. 7810 * it as it is.
7811 * 7811 *
7812 * The passed in 'doms_new' should be allocated using 7812 * The passed in 'doms_new' should be allocated using
7813 * alloc_sched_domains. This routine takes ownership of it and will 7813 * alloc_sched_domains. This routine takes ownership of it and will
7814 * free_sched_domains it when done with it. If the caller failed the 7814 * free_sched_domains it when done with it. If the caller failed the
7815 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 7815 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7816 * and partition_sched_domains() will fall back to the single partition 7816 * and partition_sched_domains() will fall back to the single partition
7817 * 'fallback_doms'; this also forces the domains to be rebuilt. 7817 * 'fallback_doms'; this also forces the domains to be rebuilt.
7818 * 7818 *
7819 * If doms_new == NULL it will be replaced with cpu_online_mask. 7819 * If doms_new == NULL it will be replaced with cpu_online_mask.
7820 * ndoms_new == 0 is a special case for destroying existing domains, 7820 * ndoms_new == 0 is a special case for destroying existing domains,
7821 * and it will not create the default domain. 7821 * and it will not create the default domain.
7822 * 7822 *
7823 * Call with hotplug lock held 7823 * Call with hotplug lock held
7824 */ 7824 */
7825 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 7825 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7826 struct sched_domain_attr *dattr_new) 7826 struct sched_domain_attr *dattr_new)
7827 { 7827 {
7828 int i, j, n; 7828 int i, j, n;
7829 int new_topology; 7829 int new_topology;
7830 7830
7831 mutex_lock(&sched_domains_mutex); 7831 mutex_lock(&sched_domains_mutex);
7832 7832
7833 /* always unregister in case we don't destroy any domains */ 7833 /* always unregister in case we don't destroy any domains */
7834 unregister_sched_domain_sysctl(); 7834 unregister_sched_domain_sysctl();
7835 7835
7836 /* Let architecture update cpu core mappings. */ 7836 /* Let architecture update cpu core mappings. */
7837 new_topology = arch_update_cpu_topology(); 7837 new_topology = arch_update_cpu_topology();
7838 7838
7839 n = doms_new ? ndoms_new : 0; 7839 n = doms_new ? ndoms_new : 0;
7840 7840
7841 /* Destroy deleted domains */ 7841 /* Destroy deleted domains */
7842 for (i = 0; i < ndoms_cur; i++) { 7842 for (i = 0; i < ndoms_cur; i++) {
7843 for (j = 0; j < n && !new_topology; j++) { 7843 for (j = 0; j < n && !new_topology; j++) {
7844 if (cpumask_equal(doms_cur[i], doms_new[j]) 7844 if (cpumask_equal(doms_cur[i], doms_new[j])
7845 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7845 && dattrs_equal(dattr_cur, i, dattr_new, j))
7846 goto match1; 7846 goto match1;
7847 } 7847 }
7848 /* no match - a current sched domain not in new doms_new[] */ 7848 /* no match - a current sched domain not in new doms_new[] */
7849 detach_destroy_domains(doms_cur[i]); 7849 detach_destroy_domains(doms_cur[i]);
7850 match1: 7850 match1:
7851 ; 7851 ;
7852 } 7852 }
7853 7853
7854 if (doms_new == NULL) { 7854 if (doms_new == NULL) {
7855 ndoms_cur = 0; 7855 ndoms_cur = 0;
7856 doms_new = &fallback_doms; 7856 doms_new = &fallback_doms;
7857 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7857 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7858 WARN_ON_ONCE(dattr_new); 7858 WARN_ON_ONCE(dattr_new);
7859 } 7859 }
7860 7860
7861 /* Build new domains */ 7861 /* Build new domains */
7862 for (i = 0; i < ndoms_new; i++) { 7862 for (i = 0; i < ndoms_new; i++) {
7863 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7863 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7864 if (cpumask_equal(doms_new[i], doms_cur[j]) 7864 if (cpumask_equal(doms_new[i], doms_cur[j])
7865 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7865 && dattrs_equal(dattr_new, i, dattr_cur, j))
7866 goto match2; 7866 goto match2;
7867 } 7867 }
7868 /* no match - add a new doms_new */ 7868 /* no match - add a new doms_new */
7869 __build_sched_domains(doms_new[i], 7869 __build_sched_domains(doms_new[i],
7870 dattr_new ? dattr_new + i : NULL); 7870 dattr_new ? dattr_new + i : NULL);
7871 match2: 7871 match2:
7872 ; 7872 ;
7873 } 7873 }
7874 7874
7875 /* Remember the new sched domains */ 7875 /* Remember the new sched domains */
7876 if (doms_cur != &fallback_doms) 7876 if (doms_cur != &fallback_doms)
7877 free_sched_domains(doms_cur, ndoms_cur); 7877 free_sched_domains(doms_cur, ndoms_cur);
7878 kfree(dattr_cur); /* kfree(NULL) is safe */ 7878 kfree(dattr_cur); /* kfree(NULL) is safe */
7879 doms_cur = doms_new; 7879 doms_cur = doms_new;
7880 dattr_cur = dattr_new; 7880 dattr_cur = dattr_new;
7881 ndoms_cur = ndoms_new; 7881 ndoms_cur = ndoms_new;
7882 7882
7883 register_sched_domain_sysctl(); 7883 register_sched_domain_sysctl();
7884 7884
7885 mutex_unlock(&sched_domains_mutex); 7885 mutex_unlock(&sched_domains_mutex);
7886 } 7886 }
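/*
 * Minimal caller sketch (illustrative; 'my_span' is an assumed cpumask):
 * allocate one partition with alloc_sched_domains() and hand ownership
 * to partition_sched_domains() while holding the hotplug lock, as the
 * comment above requires:
 *
 *	cpumask_var_t *doms = alloc_sched_domains(1);
 *
 *	if (doms)
 *		cpumask_copy(doms[0], my_span);
 *	get_online_cpus();
 *	partition_sched_domains(1, doms, NULL);
 *	put_online_cpus();
 *
 * If the allocation failed (doms == NULL), this degenerates to the
 * documented fallback: a single partition built from 'fallback_doms'.
 */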
7887 7887
7888 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7888 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7889 static void arch_reinit_sched_domains(void) 7889 static void arch_reinit_sched_domains(void)
7890 { 7890 {
7891 get_online_cpus(); 7891 get_online_cpus();
7892 7892
7893 /* Destroy domains first to force the rebuild */ 7893 /* Destroy domains first to force the rebuild */
7894 partition_sched_domains(0, NULL, NULL); 7894 partition_sched_domains(0, NULL, NULL);
7895 7895
7896 rebuild_sched_domains(); 7896 rebuild_sched_domains();
7897 put_online_cpus(); 7897 put_online_cpus();
7898 } 7898 }
7899 7899
7900 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7900 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7901 { 7901 {
7902 unsigned int level = 0; 7902 unsigned int level = 0;
7903 7903
7904 if (sscanf(buf, "%u", &level) != 1) 7904 if (sscanf(buf, "%u", &level) != 1)
7905 return -EINVAL; 7905 return -EINVAL;
7906 7906
7907 /* 7907 /*
7908 * level is always positive, so don't check for 7908 * level is always positive, so don't check for
7909 * level < POWERSAVINGS_BALANCE_NONE which is 0 7909 * level < POWERSAVINGS_BALANCE_NONE which is 0
7910 * What happens on a 0 or 1 byte write? 7910 * What happens on a 0 or 1 byte write?
7911 * Do we need to check count as well? 7911 * Do we need to check count as well?
7912 */ 7912 */
7913 7913
7914 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 7914 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7915 return -EINVAL; 7915 return -EINVAL;
7916 7916
7917 if (smt) 7917 if (smt)
7918 sched_smt_power_savings = level; 7918 sched_smt_power_savings = level;
7919 else 7919 else
7920 sched_mc_power_savings = level; 7920 sched_mc_power_savings = level;
7921 7921
7922 arch_reinit_sched_domains(); 7922 arch_reinit_sched_domains();
7923 7923
7924 return count; 7924 return count;
7925 } 7925 }
7926 7926
7927 #ifdef CONFIG_SCHED_MC 7927 #ifdef CONFIG_SCHED_MC
7928 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7928 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7929 struct sysdev_class_attribute *attr, 7929 struct sysdev_class_attribute *attr,
7930 char *page) 7930 char *page)
7931 { 7931 {
7932 return sprintf(page, "%u\n", sched_mc_power_savings); 7932 return sprintf(page, "%u\n", sched_mc_power_savings);
7933 } 7933 }
7934 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7934 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7935 struct sysdev_class_attribute *attr, 7935 struct sysdev_class_attribute *attr,
7936 const char *buf, size_t count) 7936 const char *buf, size_t count)
7937 { 7937 {
7938 return sched_power_savings_store(buf, count, 0); 7938 return sched_power_savings_store(buf, count, 0);
7939 } 7939 }
7940 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 7940 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7941 sched_mc_power_savings_show, 7941 sched_mc_power_savings_show,
7942 sched_mc_power_savings_store); 7942 sched_mc_power_savings_store);
7943 #endif 7943 #endif
7944 7944
7945 #ifdef CONFIG_SCHED_SMT 7945 #ifdef CONFIG_SCHED_SMT
7946 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7946 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7947 struct sysdev_class_attribute *attr, 7947 struct sysdev_class_attribute *attr,
7948 char *page) 7948 char *page)
7949 { 7949 {
7950 return sprintf(page, "%u\n", sched_smt_power_savings); 7950 return sprintf(page, "%u\n", sched_smt_power_savings);
7951 } 7951 }
7952 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7952 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7953 struct sysdev_class_attribute *attr, 7953 struct sysdev_class_attribute *attr,
7954 const char *buf, size_t count) 7954 const char *buf, size_t count)
7955 { 7955 {
7956 return sched_power_savings_store(buf, count, 1); 7956 return sched_power_savings_store(buf, count, 1);
7957 } 7957 }
7958 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 7958 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7959 sched_smt_power_savings_show, 7959 sched_smt_power_savings_show,
7960 sched_smt_power_savings_store); 7960 sched_smt_power_savings_store);
7961 #endif 7961 #endif
7962 7962
7963 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 7963 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7964 { 7964 {
7965 int err = 0; 7965 int err = 0;
7966 7966
7967 #ifdef CONFIG_SCHED_SMT 7967 #ifdef CONFIG_SCHED_SMT
7968 if (smt_capable()) 7968 if (smt_capable())
7969 err = sysfs_create_file(&cls->kset.kobj, 7969 err = sysfs_create_file(&cls->kset.kobj,
7970 &attr_sched_smt_power_savings.attr); 7970 &attr_sched_smt_power_savings.attr);
7971 #endif 7971 #endif
7972 #ifdef CONFIG_SCHED_MC 7972 #ifdef CONFIG_SCHED_MC
7973 if (!err && mc_capable()) 7973 if (!err && mc_capable())
7974 err = sysfs_create_file(&cls->kset.kobj, 7974 err = sysfs_create_file(&cls->kset.kobj,
7975 &attr_sched_mc_power_savings.attr); 7975 &attr_sched_mc_power_savings.attr);
7976 #endif 7976 #endif
7977 return err; 7977 return err;
7978 } 7978 }
7979 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7979 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
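/*
 * Usage note (paths assumed): when these attributes are registered under
 * the cpu sysdev class they typically show up as
 * /sys/devices/system/cpu/sched_mc_power_savings and
 * /sys/devices/system/cpu/sched_smt_power_savings, so e.g.
 * "echo 1 > /sys/devices/system/cpu/sched_mc_power_savings" stores level 1
 * and triggers arch_reinit_sched_domains() to rebuild the domains.
 */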
7980 7980
7981 /* 7981 /*
7982 * Update cpusets according to cpu_active mask. If cpusets are 7982 * Update cpusets according to cpu_active mask. If cpusets are
7983 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7983 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7984 * around partition_sched_domains(). 7984 * around partition_sched_domains().
7985 */ 7985 */
7986 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7986 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7987 void *hcpu) 7987 void *hcpu)
7988 { 7988 {
7989 switch (action & ~CPU_TASKS_FROZEN) { 7989 switch (action & ~CPU_TASKS_FROZEN) {
7990 case CPU_ONLINE: 7990 case CPU_ONLINE:
7991 case CPU_DOWN_FAILED: 7991 case CPU_DOWN_FAILED:
7992 cpuset_update_active_cpus(); 7992 cpuset_update_active_cpus();
7993 return NOTIFY_OK; 7993 return NOTIFY_OK;
7994 default: 7994 default:
7995 return NOTIFY_DONE; 7995 return NOTIFY_DONE;
7996 } 7996 }
7997 } 7997 }
7998 7998
7999 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7999 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8000 void *hcpu) 8000 void *hcpu)
8001 { 8001 {
8002 switch (action & ~CPU_TASKS_FROZEN) { 8002 switch (action & ~CPU_TASKS_FROZEN) {
8003 case CPU_DOWN_PREPARE: 8003 case CPU_DOWN_PREPARE:
8004 cpuset_update_active_cpus(); 8004 cpuset_update_active_cpus();
8005 return NOTIFY_OK; 8005 return NOTIFY_OK;
8006 default: 8006 default:
8007 return NOTIFY_DONE; 8007 return NOTIFY_DONE;
8008 } 8008 }
8009 } 8009 }
8010 8010
8011 static int update_runtime(struct notifier_block *nfb, 8011 static int update_runtime(struct notifier_block *nfb,
8012 unsigned long action, void *hcpu) 8012 unsigned long action, void *hcpu)
8013 { 8013 {
8014 int cpu = (int)(long)hcpu; 8014 int cpu = (int)(long)hcpu;
8015 8015
8016 switch (action) { 8016 switch (action) {
8017 case CPU_DOWN_PREPARE: 8017 case CPU_DOWN_PREPARE:
8018 case CPU_DOWN_PREPARE_FROZEN: 8018 case CPU_DOWN_PREPARE_FROZEN:
8019 disable_runtime(cpu_rq(cpu)); 8019 disable_runtime(cpu_rq(cpu));
8020 return NOTIFY_OK; 8020 return NOTIFY_OK;
8021 8021
8022 case CPU_DOWN_FAILED: 8022 case CPU_DOWN_FAILED:
8023 case CPU_DOWN_FAILED_FROZEN: 8023 case CPU_DOWN_FAILED_FROZEN:
8024 case CPU_ONLINE: 8024 case CPU_ONLINE:
8025 case CPU_ONLINE_FROZEN: 8025 case CPU_ONLINE_FROZEN:
8026 enable_runtime(cpu_rq(cpu)); 8026 enable_runtime(cpu_rq(cpu));
8027 return NOTIFY_OK; 8027 return NOTIFY_OK;
8028 8028
8029 default: 8029 default:
8030 return NOTIFY_DONE; 8030 return NOTIFY_DONE;
8031 } 8031 }
8032 } 8032 }
8033 8033
8034 void __init sched_init_smp(void) 8034 void __init sched_init_smp(void)
8035 { 8035 {
8036 cpumask_var_t non_isolated_cpus; 8036 cpumask_var_t non_isolated_cpus;
8037 8037
8038 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 8038 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
8039 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 8039 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8040 8040
8041 #if defined(CONFIG_NUMA) 8041 #if defined(CONFIG_NUMA)
8042 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 8042 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
8043 GFP_KERNEL); 8043 GFP_KERNEL);
8044 BUG_ON(sched_group_nodes_bycpu == NULL); 8044 BUG_ON(sched_group_nodes_bycpu == NULL);
8045 #endif 8045 #endif
8046 get_online_cpus(); 8046 get_online_cpus();
8047 mutex_lock(&sched_domains_mutex); 8047 mutex_lock(&sched_domains_mutex);
8048 arch_init_sched_domains(cpu_active_mask); 8048 arch_init_sched_domains(cpu_active_mask);
8049 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 8049 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8050 if (cpumask_empty(non_isolated_cpus)) 8050 if (cpumask_empty(non_isolated_cpus))
8051 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 8051 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8052 mutex_unlock(&sched_domains_mutex); 8052 mutex_unlock(&sched_domains_mutex);
8053 put_online_cpus(); 8053 put_online_cpus();
8054 8054
8055 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 8055 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
8056 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 8056 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
8057 8057
8058 /* RT runtime code needs to handle some hotplug events */ 8058 /* RT runtime code needs to handle some hotplug events */
8059 hotcpu_notifier(update_runtime, 0); 8059 hotcpu_notifier(update_runtime, 0);
8060 8060
8061 init_hrtick(); 8061 init_hrtick();
8062 8062
8063 /* Move init over to a non-isolated CPU */ 8063 /* Move init over to a non-isolated CPU */
8064 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 8064 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8065 BUG(); 8065 BUG();
8066 sched_init_granularity(); 8066 sched_init_granularity();
8067 free_cpumask_var(non_isolated_cpus); 8067 free_cpumask_var(non_isolated_cpus);
8068 8068
8069 init_sched_rt_class(); 8069 init_sched_rt_class();
8070 } 8070 }
8071 #else 8071 #else
8072 void __init sched_init_smp(void) 8072 void __init sched_init_smp(void)
8073 { 8073 {
8074 sched_init_granularity(); 8074 sched_init_granularity();
8075 } 8075 }
8076 #endif /* CONFIG_SMP */ 8076 #endif /* CONFIG_SMP */
8077 8077
8078 const_debug unsigned int sysctl_timer_migration = 1; 8078 const_debug unsigned int sysctl_timer_migration = 1;
8079 8079
8080 int in_sched_functions(unsigned long addr) 8080 int in_sched_functions(unsigned long addr)
8081 { 8081 {
8082 return in_lock_functions(addr) || 8082 return in_lock_functions(addr) ||
8083 (addr >= (unsigned long)__sched_text_start 8083 (addr >= (unsigned long)__sched_text_start
8084 && addr < (unsigned long)__sched_text_end); 8084 && addr < (unsigned long)__sched_text_end);
8085 } 8085 }
8086 8086
8087 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 8087 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
8088 { 8088 {
8089 cfs_rq->tasks_timeline = RB_ROOT; 8089 cfs_rq->tasks_timeline = RB_ROOT;
8090 INIT_LIST_HEAD(&cfs_rq->tasks); 8090 INIT_LIST_HEAD(&cfs_rq->tasks);
8091 #ifdef CONFIG_FAIR_GROUP_SCHED 8091 #ifdef CONFIG_FAIR_GROUP_SCHED
8092 cfs_rq->rq = rq; 8092 cfs_rq->rq = rq;
8093 /* allow initial update_cfs_load() to truncate */ 8093 /* allow initial update_cfs_load() to truncate */
8094 #ifdef CONFIG_SMP 8094 #ifdef CONFIG_SMP
8095 cfs_rq->load_stamp = 1; 8095 cfs_rq->load_stamp = 1;
8096 #endif 8096 #endif
8097 #endif 8097 #endif
8098 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8098 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8099 } 8099 }
8100 8100
8101 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 8101 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8102 { 8102 {
8103 struct rt_prio_array *array; 8103 struct rt_prio_array *array;
8104 int i; 8104 int i;
8105 8105
8106 array = &rt_rq->active; 8106 array = &rt_rq->active;
8107 for (i = 0; i < MAX_RT_PRIO; i++) { 8107 for (i = 0; i < MAX_RT_PRIO; i++) {
8108 INIT_LIST_HEAD(array->queue + i); 8108 INIT_LIST_HEAD(array->queue + i);
8109 __clear_bit(i, array->bitmap); 8109 __clear_bit(i, array->bitmap);
8110 } 8110 }
8111 /* delimiter for bitsearch: */ 8111 /* delimiter for bitsearch: */
8112 __set_bit(MAX_RT_PRIO, array->bitmap); 8112 __set_bit(MAX_RT_PRIO, array->bitmap);
8113 8113
8114 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8114 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8115 rt_rq->highest_prio.curr = MAX_RT_PRIO; 8115 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8116 #ifdef CONFIG_SMP 8116 #ifdef CONFIG_SMP
8117 rt_rq->highest_prio.next = MAX_RT_PRIO; 8117 rt_rq->highest_prio.next = MAX_RT_PRIO;
8118 #endif 8118 #endif
8119 #endif 8119 #endif
8120 #ifdef CONFIG_SMP 8120 #ifdef CONFIG_SMP
8121 rt_rq->rt_nr_migratory = 0; 8121 rt_rq->rt_nr_migratory = 0;
8122 rt_rq->overloaded = 0; 8122 rt_rq->overloaded = 0;
8123 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 8123 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
8124 #endif 8124 #endif
8125 8125
8126 rt_rq->rt_time = 0; 8126 rt_rq->rt_time = 0;
8127 rt_rq->rt_throttled = 0; 8127 rt_rq->rt_throttled = 0;
8128 rt_rq->rt_runtime = 0; 8128 rt_rq->rt_runtime = 0;
8129 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 8129 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8130 8130
8131 #ifdef CONFIG_RT_GROUP_SCHED 8131 #ifdef CONFIG_RT_GROUP_SCHED
8132 rt_rq->rt_nr_boosted = 0; 8132 rt_rq->rt_nr_boosted = 0;
8133 rt_rq->rq = rq; 8133 rt_rq->rq = rq;
8134 #endif 8134 #endif
8135 } 8135 }
8136 8136
8137 #ifdef CONFIG_FAIR_GROUP_SCHED 8137 #ifdef CONFIG_FAIR_GROUP_SCHED
8138 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 8138 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8139 struct sched_entity *se, int cpu, 8139 struct sched_entity *se, int cpu,
8140 struct sched_entity *parent) 8140 struct sched_entity *parent)
8141 { 8141 {
8142 struct rq *rq = cpu_rq(cpu); 8142 struct rq *rq = cpu_rq(cpu);
8143 tg->cfs_rq[cpu] = cfs_rq; 8143 tg->cfs_rq[cpu] = cfs_rq;
8144 init_cfs_rq(cfs_rq, rq); 8144 init_cfs_rq(cfs_rq, rq);
8145 cfs_rq->tg = tg; 8145 cfs_rq->tg = tg;
8146 8146
8147 tg->se[cpu] = se; 8147 tg->se[cpu] = se;
8148 /* se could be NULL for root_task_group */ 8148 /* se could be NULL for root_task_group */
8149 if (!se) 8149 if (!se)
8150 return; 8150 return;
8151 8151
8152 if (!parent) 8152 if (!parent)
8153 se->cfs_rq = &rq->cfs; 8153 se->cfs_rq = &rq->cfs;
8154 else 8154 else
8155 se->cfs_rq = parent->my_q; 8155 se->cfs_rq = parent->my_q;
8156 8156
8157 se->my_q = cfs_rq; 8157 se->my_q = cfs_rq;
8158 update_load_set(&se->load, 0); 8158 update_load_set(&se->load, 0);
8159 se->parent = parent; 8159 se->parent = parent;
8160 } 8160 }
8161 #endif 8161 #endif
8162 8162
8163 #ifdef CONFIG_RT_GROUP_SCHED 8163 #ifdef CONFIG_RT_GROUP_SCHED
8164 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 8164 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8165 struct sched_rt_entity *rt_se, int cpu, 8165 struct sched_rt_entity *rt_se, int cpu,
8166 struct sched_rt_entity *parent) 8166 struct sched_rt_entity *parent)
8167 { 8167 {
8168 struct rq *rq = cpu_rq(cpu); 8168 struct rq *rq = cpu_rq(cpu);
8169 8169
8170 tg->rt_rq[cpu] = rt_rq; 8170 tg->rt_rq[cpu] = rt_rq;
8171 init_rt_rq(rt_rq, rq); 8171 init_rt_rq(rt_rq, rq);
8172 rt_rq->tg = tg; 8172 rt_rq->tg = tg;
8173 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 8173 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8174 8174
8175 tg->rt_se[cpu] = rt_se; 8175 tg->rt_se[cpu] = rt_se;
8176 if (!rt_se) 8176 if (!rt_se)
8177 return; 8177 return;
8178 8178
8179 if (!parent) 8179 if (!parent)
8180 rt_se->rt_rq = &rq->rt; 8180 rt_se->rt_rq = &rq->rt;
8181 else 8181 else
8182 rt_se->rt_rq = parent->my_q; 8182 rt_se->rt_rq = parent->my_q;
8183 8183
8184 rt_se->my_q = rt_rq; 8184 rt_se->my_q = rt_rq;
8185 rt_se->parent = parent; 8185 rt_se->parent = parent;
8186 INIT_LIST_HEAD(&rt_se->run_list); 8186 INIT_LIST_HEAD(&rt_se->run_list);
8187 } 8187 }
8188 #endif 8188 #endif
8189 8189
8190 void __init sched_init(void) 8190 void __init sched_init(void)
8191 { 8191 {
8192 int i, j; 8192 int i, j;
8193 unsigned long alloc_size = 0, ptr; 8193 unsigned long alloc_size = 0, ptr;
8194 8194
8195 #ifdef CONFIG_FAIR_GROUP_SCHED 8195 #ifdef CONFIG_FAIR_GROUP_SCHED
8196 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 8196 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8197 #endif 8197 #endif
8198 #ifdef CONFIG_RT_GROUP_SCHED 8198 #ifdef CONFIG_RT_GROUP_SCHED
8199 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 8199 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8200 #endif 8200 #endif
8201 #ifdef CONFIG_CPUMASK_OFFSTACK 8201 #ifdef CONFIG_CPUMASK_OFFSTACK
8202 alloc_size += num_possible_cpus() * cpumask_size(); 8202 alloc_size += num_possible_cpus() * cpumask_size();
8203 #endif 8203 #endif
8204 if (alloc_size) { 8204 if (alloc_size) {
8205 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 8205 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8206 8206
8207 #ifdef CONFIG_FAIR_GROUP_SCHED 8207 #ifdef CONFIG_FAIR_GROUP_SCHED
8208 root_task_group.se = (struct sched_entity **)ptr; 8208 root_task_group.se = (struct sched_entity **)ptr;
8209 ptr += nr_cpu_ids * sizeof(void **); 8209 ptr += nr_cpu_ids * sizeof(void **);
8210 8210
8211 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 8211 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8212 ptr += nr_cpu_ids * sizeof(void **); 8212 ptr += nr_cpu_ids * sizeof(void **);
8213 8213
8214 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8214 #endif /* CONFIG_FAIR_GROUP_SCHED */
8215 #ifdef CONFIG_RT_GROUP_SCHED 8215 #ifdef CONFIG_RT_GROUP_SCHED
8216 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 8216 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8217 ptr += nr_cpu_ids * sizeof(void **); 8217 ptr += nr_cpu_ids * sizeof(void **);
8218 8218
8219 root_task_group.rt_rq = (struct rt_rq **)ptr; 8219 root_task_group.rt_rq = (struct rt_rq **)ptr;
8220 ptr += nr_cpu_ids * sizeof(void **); 8220 ptr += nr_cpu_ids * sizeof(void **);
8221 8221
8222 #endif /* CONFIG_RT_GROUP_SCHED */ 8222 #endif /* CONFIG_RT_GROUP_SCHED */
8223 #ifdef CONFIG_CPUMASK_OFFSTACK 8223 #ifdef CONFIG_CPUMASK_OFFSTACK
8224 for_each_possible_cpu(i) { 8224 for_each_possible_cpu(i) {
8225 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 8225 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8226 ptr += cpumask_size(); 8226 ptr += cpumask_size();
8227 } 8227 }
8228 #endif /* CONFIG_CPUMASK_OFFSTACK */ 8228 #endif /* CONFIG_CPUMASK_OFFSTACK */
8229 } 8229 }
8230 8230
8231 #ifdef CONFIG_SMP 8231 #ifdef CONFIG_SMP
8232 init_defrootdomain(); 8232 init_defrootdomain();
8233 #endif 8233 #endif
8234 8234
8235 init_rt_bandwidth(&def_rt_bandwidth, 8235 init_rt_bandwidth(&def_rt_bandwidth,
8236 global_rt_period(), global_rt_runtime()); 8236 global_rt_period(), global_rt_runtime());
8237 8237
8238 #ifdef CONFIG_RT_GROUP_SCHED 8238 #ifdef CONFIG_RT_GROUP_SCHED
8239 init_rt_bandwidth(&root_task_group.rt_bandwidth, 8239 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8240 global_rt_period(), global_rt_runtime()); 8240 global_rt_period(), global_rt_runtime());
8241 #endif /* CONFIG_RT_GROUP_SCHED */ 8241 #endif /* CONFIG_RT_GROUP_SCHED */
8242 8242
8243 #ifdef CONFIG_CGROUP_SCHED 8243 #ifdef CONFIG_CGROUP_SCHED
8244 list_add(&root_task_group.list, &task_groups); 8244 list_add(&root_task_group.list, &task_groups);
8245 INIT_LIST_HEAD(&root_task_group.children); 8245 INIT_LIST_HEAD(&root_task_group.children);
8246 autogroup_init(&init_task); 8246 autogroup_init(&init_task);
8247 #endif /* CONFIG_CGROUP_SCHED */ 8247 #endif /* CONFIG_CGROUP_SCHED */
8248 8248
8249 for_each_possible_cpu(i) { 8249 for_each_possible_cpu(i) {
8250 struct rq *rq; 8250 struct rq *rq;
8251 8251
8252 rq = cpu_rq(i); 8252 rq = cpu_rq(i);
8253 raw_spin_lock_init(&rq->lock); 8253 raw_spin_lock_init(&rq->lock);
8254 rq->nr_running = 0; 8254 rq->nr_running = 0;
8255 rq->calc_load_active = 0; 8255 rq->calc_load_active = 0;
8256 rq->calc_load_update = jiffies + LOAD_FREQ; 8256 rq->calc_load_update = jiffies + LOAD_FREQ;
8257 init_cfs_rq(&rq->cfs, rq); 8257 init_cfs_rq(&rq->cfs, rq);
8258 init_rt_rq(&rq->rt, rq); 8258 init_rt_rq(&rq->rt, rq);
8259 #ifdef CONFIG_FAIR_GROUP_SCHED 8259 #ifdef CONFIG_FAIR_GROUP_SCHED
8260 root_task_group.shares = root_task_group_load; 8260 root_task_group.shares = root_task_group_load;
8261 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8261 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8262 /* 8262 /*
8263 * How much cpu bandwidth does root_task_group get? 8263 * How much cpu bandwidth does root_task_group get?
8264 * 8264 *
8265 * In case of task-groups formed through the cgroup filesystem, it 8265 * In case of task-groups formed through the cgroup filesystem, it
8266 * gets 100% of the cpu resources in the system. This overall 8266 * gets 100% of the cpu resources in the system. This overall
8267 * system cpu resource is divided among the tasks of 8267 * system cpu resource is divided among the tasks of
8268 * root_task_group and its child task-groups in a fair manner, 8268 * root_task_group and its child task-groups in a fair manner,
8269 * based on each entity's (task or task-group's) weight 8269 * based on each entity's (task or task-group's) weight
8270 * (se->load.weight). 8270 * (se->load.weight).
8271 * 8271 *
8272 * In other words, if root_task_group has 10 tasks of weight 8272 * In other words, if root_task_group has 10 tasks of weight
8273 * 1024 and two child groups A0 and A1 (of weight 1024 each), 8273 * 1024 and two child groups A0 and A1 (of weight 1024 each),
8274 * then A0's share of the cpu resource is: 8274 * then A0's share of the cpu resource is:
8275 * 8275 *
8276 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 8276 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8277 * 8277 *
8278 * We achieve this by letting root_task_group's tasks sit 8278 * We achieve this by letting root_task_group's tasks sit
8279 * directly in rq->cfs (i.e. root_task_group->se[] = NULL). 8279 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
8280 */ 8280 */
8281 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8281 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8282 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8282 #endif /* CONFIG_FAIR_GROUP_SCHED */
8283 8283
8284 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 8284 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8285 #ifdef CONFIG_RT_GROUP_SCHED 8285 #ifdef CONFIG_RT_GROUP_SCHED
8286 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8286 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8287 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 8287 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8288 #endif 8288 #endif
8289 8289
8290 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8290 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8291 rq->cpu_load[j] = 0; 8291 rq->cpu_load[j] = 0;
8292 8292
8293 rq->last_load_update_tick = jiffies; 8293 rq->last_load_update_tick = jiffies;
8294 8294
8295 #ifdef CONFIG_SMP 8295 #ifdef CONFIG_SMP
8296 rq->sd = NULL; 8296 rq->sd = NULL;
8297 rq->rd = NULL; 8297 rq->rd = NULL;
8298 rq->cpu_power = SCHED_LOAD_SCALE; 8298 rq->cpu_power = SCHED_LOAD_SCALE;
8299 rq->post_schedule = 0; 8299 rq->post_schedule = 0;
8300 rq->active_balance = 0; 8300 rq->active_balance = 0;
8301 rq->next_balance = jiffies; 8301 rq->next_balance = jiffies;
8302 rq->push_cpu = 0; 8302 rq->push_cpu = 0;
8303 rq->cpu = i; 8303 rq->cpu = i;
8304 rq->online = 0; 8304 rq->online = 0;
8305 rq->idle_stamp = 0; 8305 rq->idle_stamp = 0;
8306 rq->avg_idle = 2*sysctl_sched_migration_cost; 8306 rq->avg_idle = 2*sysctl_sched_migration_cost;
8307 rq_attach_root(rq, &def_root_domain); 8307 rq_attach_root(rq, &def_root_domain);
8308 #ifdef CONFIG_NO_HZ 8308 #ifdef CONFIG_NO_HZ
8309 rq->nohz_balance_kick = 0; 8309 rq->nohz_balance_kick = 0;
8310 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); 8310 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8311 #endif 8311 #endif
8312 #endif 8312 #endif
8313 init_rq_hrtick(rq); 8313 init_rq_hrtick(rq);
8314 atomic_set(&rq->nr_iowait, 0); 8314 atomic_set(&rq->nr_iowait, 0);
8315 } 8315 }
8316 8316
8317 set_load_weight(&init_task); 8317 set_load_weight(&init_task);
8318 8318
8319 #ifdef CONFIG_PREEMPT_NOTIFIERS 8319 #ifdef CONFIG_PREEMPT_NOTIFIERS
8320 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 8320 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8321 #endif 8321 #endif
8322 8322
8323 #ifdef CONFIG_SMP 8323 #ifdef CONFIG_SMP
8324 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 8324 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8325 #endif 8325 #endif
8326 8326
8327 #ifdef CONFIG_RT_MUTEXES 8327 #ifdef CONFIG_RT_MUTEXES
8328 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 8328 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
8329 #endif 8329 #endif
8330 8330
8331 /* 8331 /*
8332 * The boot idle thread does lazy MMU switching as well: 8332 * The boot idle thread does lazy MMU switching as well:
8333 */ 8333 */
8334 atomic_inc(&init_mm.mm_count); 8334 atomic_inc(&init_mm.mm_count);
8335 enter_lazy_tlb(&init_mm, current); 8335 enter_lazy_tlb(&init_mm, current);
8336 8336
8337 /* 8337 /*
8338 * Make us the idle thread. Technically, schedule() should not be 8338 * Make us the idle thread. Technically, schedule() should not be
8339 * called from this thread; however, somewhere below it might be, 8339 * called from this thread; however, somewhere below it might be,
8340 * but because we are the idle thread, we just pick up running again 8340 * but because we are the idle thread, we just pick up running again
8341 * when this runqueue becomes "idle". 8341 * when this runqueue becomes "idle".
8342 */ 8342 */
8343 init_idle(current, smp_processor_id()); 8343 init_idle(current, smp_processor_id());
8344 8344
8345 calc_load_update = jiffies + LOAD_FREQ; 8345 calc_load_update = jiffies + LOAD_FREQ;
8346 8346
8347 /* 8347 /*
8348 * During early bootup we pretend to be a normal task: 8348 * During early bootup we pretend to be a normal task:
8349 */ 8349 */
8350 current->sched_class = &fair_sched_class; 8350 current->sched_class = &fair_sched_class;
8351 8351
8352 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8352 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8353 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8353 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8354 #ifdef CONFIG_SMP 8354 #ifdef CONFIG_SMP
8355 #ifdef CONFIG_NO_HZ 8355 #ifdef CONFIG_NO_HZ
8356 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8356 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8357 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8357 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8358 atomic_set(&nohz.load_balancer, nr_cpu_ids); 8358 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8359 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); 8359 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8360 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); 8360 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8361 #endif 8361 #endif
8362 /* May be allocated at isolcpus cmdline parse time */ 8362 /* May be allocated at isolcpus cmdline parse time */
8363 if (cpu_isolated_map == NULL) 8363 if (cpu_isolated_map == NULL)
8364 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8364 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8365 #endif /* SMP */ 8365 #endif /* SMP */
8366 8366
8367 scheduler_running = 1; 8367 scheduler_running = 1;
8368 } 8368 }
8369 8369
8370 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8370 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
8371 static inline int preempt_count_equals(int preempt_offset) 8371 static inline int preempt_count_equals(int preempt_offset)
8372 { 8372 {
8373 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8373 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8374 8374
8375 return (nested == preempt_offset); 8375 return (nested == preempt_offset);
8376 } 8376 }
8377 8377
8378 void __might_sleep(const char *file, int line, int preempt_offset) 8378 void __might_sleep(const char *file, int line, int preempt_offset)
8379 { 8379 {
8380 #ifdef in_atomic 8380 #ifdef in_atomic
8381 static unsigned long prev_jiffy; /* ratelimiting */ 8381 static unsigned long prev_jiffy; /* ratelimiting */
8382 8382
8383 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8383 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8384 system_state != SYSTEM_RUNNING || oops_in_progress) 8384 system_state != SYSTEM_RUNNING || oops_in_progress)
8385 return; 8385 return;
8386 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8386 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8387 return; 8387 return;
8388 prev_jiffy = jiffies; 8388 prev_jiffy = jiffies;
8389 8389
8390 printk(KERN_ERR 8390 printk(KERN_ERR
8391 "BUG: sleeping function called from invalid context at %s:%d\n", 8391 "BUG: sleeping function called from invalid context at %s:%d\n",
8392 file, line); 8392 file, line);
8393 printk(KERN_ERR 8393 printk(KERN_ERR
8394 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 8394 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8395 in_atomic(), irqs_disabled(), 8395 in_atomic(), irqs_disabled(),
8396 current->pid, current->comm); 8396 current->pid, current->comm);
8397 8397
8398 debug_show_held_locks(current); 8398 debug_show_held_locks(current);
8399 if (irqs_disabled()) 8399 if (irqs_disabled())
8400 print_irqtrace_events(current); 8400 print_irqtrace_events(current);
8401 dump_stack(); 8401 dump_stack();
8402 #endif 8402 #endif
8403 } 8403 }
8404 EXPORT_SYMBOL(__might_sleep); 8404 EXPORT_SYMBOL(__might_sleep);
8405 #endif 8405 #endif
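
The check above is normally reached through the might_sleep() helper rather than being called directly; functions that may block place it at their entry so the warning fires as soon as they are entered from atomic context. A minimal usage sketch follows (the device structure and lock are hypothetical; only might_sleep() and the mutex API are assumed here, and might_sleep() roughly expands to __might_sleep(__FILE__, __LINE__, 0) when this config option is enabled):

/* Hedged sketch: a function that can block announces it up front. */
struct example_dev {			/* hypothetical driver state */
	struct mutex lock;
	int value;
};

static int example_update(struct example_dev *dev, int value)
{
	might_sleep();			/* triggers __might_sleep() if called in atomic context */
	mutex_lock(&dev->lock);		/* sleeping lock: only legal in process context */
	dev->value = value;
	mutex_unlock(&dev->lock);
	return 0;
}
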
8406 8406
8407 #ifdef CONFIG_MAGIC_SYSRQ 8407 #ifdef CONFIG_MAGIC_SYSRQ
8408 static void normalize_task(struct rq *rq, struct task_struct *p) 8408 static void normalize_task(struct rq *rq, struct task_struct *p)
8409 { 8409 {
8410 const struct sched_class *prev_class = p->sched_class; 8410 const struct sched_class *prev_class = p->sched_class;
8411 int old_prio = p->prio; 8411 int old_prio = p->prio;
8412 int on_rq; 8412 int on_rq;
8413 8413
8414 on_rq = p->on_rq; 8414 on_rq = p->on_rq;
8415 if (on_rq) 8415 if (on_rq)
8416 deactivate_task(rq, p, 0); 8416 deactivate_task(rq, p, 0);
8417 __setscheduler(rq, p, SCHED_NORMAL, 0); 8417 __setscheduler(rq, p, SCHED_NORMAL, 0);
8418 if (on_rq) { 8418 if (on_rq) {
8419 activate_task(rq, p, 0); 8419 activate_task(rq, p, 0);
8420 resched_task(rq->curr); 8420 resched_task(rq->curr);
8421 } 8421 }
8422 8422
8423 check_class_changed(rq, p, prev_class, old_prio); 8423 check_class_changed(rq, p, prev_class, old_prio);
8424 } 8424 }
8425 8425
8426 void normalize_rt_tasks(void) 8426 void normalize_rt_tasks(void)
8427 { 8427 {
8428 struct task_struct *g, *p; 8428 struct task_struct *g, *p;
8429 unsigned long flags; 8429 unsigned long flags;
8430 struct rq *rq; 8430 struct rq *rq;
8431 8431
8432 read_lock_irqsave(&tasklist_lock, flags); 8432 read_lock_irqsave(&tasklist_lock, flags);
8433 do_each_thread(g, p) { 8433 do_each_thread(g, p) {
8434 /* 8434 /*
8435 * Only normalize user tasks: 8435 * Only normalize user tasks:
8436 */ 8436 */
8437 if (!p->mm) 8437 if (!p->mm)
8438 continue; 8438 continue;
8439 8439
8440 p->se.exec_start = 0; 8440 p->se.exec_start = 0;
8441 #ifdef CONFIG_SCHEDSTATS 8441 #ifdef CONFIG_SCHEDSTATS
8442 p->se.statistics.wait_start = 0; 8442 p->se.statistics.wait_start = 0;
8443 p->se.statistics.sleep_start = 0; 8443 p->se.statistics.sleep_start = 0;
8444 p->se.statistics.block_start = 0; 8444 p->se.statistics.block_start = 0;
8445 #endif 8445 #endif
8446 8446
8447 if (!rt_task(p)) { 8447 if (!rt_task(p)) {
8448 /* 8448 /*
8449 * Renice negative nice level userspace 8449 * Renice negative nice level userspace
8450 * tasks back to 0: 8450 * tasks back to 0:
8451 */ 8451 */
8452 if (TASK_NICE(p) < 0 && p->mm) 8452 if (TASK_NICE(p) < 0 && p->mm)
8453 set_user_nice(p, 0); 8453 set_user_nice(p, 0);
8454 continue; 8454 continue;
8455 } 8455 }
8456 8456
8457 raw_spin_lock(&p->pi_lock); 8457 raw_spin_lock(&p->pi_lock);
8458 rq = __task_rq_lock(p); 8458 rq = __task_rq_lock(p);
8459 8459
8460 normalize_task(rq, p); 8460 normalize_task(rq, p);
8461 8461
8462 __task_rq_unlock(rq); 8462 __task_rq_unlock(rq);
8463 raw_spin_unlock(&p->pi_lock); 8463 raw_spin_unlock(&p->pi_lock);
8464 } while_each_thread(g, p); 8464 } while_each_thread(g, p);
8465 8465
8466 read_unlock_irqrestore(&tasklist_lock, flags); 8466 read_unlock_irqrestore(&tasklist_lock, flags);
8467 } 8467 }
8468 8468
8469 #endif /* CONFIG_MAGIC_SYSRQ */ 8469 #endif /* CONFIG_MAGIC_SYSRQ */
8470 8470
8471 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 8471 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8472 /* 8472 /*
8473 * These functions are only useful for the IA64 MCA handling, or kdb. 8473 * These functions are only useful for the IA64 MCA handling, or kdb.
8474 * 8474 *
8475 * They can only be called when the whole system has been 8475 * They can only be called when the whole system has been
8476 * stopped - every CPU needs to be quiescent, and no scheduling 8476 * stopped - every CPU needs to be quiescent, and no scheduling
8477 * activity can take place. Using them for anything else would 8477 * activity can take place. Using them for anything else would
8478 * be a serious bug, and as a result, they aren't even visible 8478 * be a serious bug, and as a result, they aren't even visible
8479 * under any other configuration. 8479 * under any other configuration.
8480 */ 8480 */
8481 8481
8482 /** 8482 /**
8483 * curr_task - return the current task for a given cpu. 8483 * curr_task - return the current task for a given cpu.
8484 * @cpu: the processor in question. 8484 * @cpu: the processor in question.
8485 * 8485 *
8486 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8486 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8487 */ 8487 */
8488 struct task_struct *curr_task(int cpu) 8488 struct task_struct *curr_task(int cpu)
8489 { 8489 {
8490 return cpu_curr(cpu); 8490 return cpu_curr(cpu);
8491 } 8491 }
8492 8492
8493 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 8493 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8494 8494
8495 #ifdef CONFIG_IA64 8495 #ifdef CONFIG_IA64
8496 /** 8496 /**
8497 * set_curr_task - set the current task for a given cpu. 8497 * set_curr_task - set the current task for a given cpu.
8498 * @cpu: the processor in question. 8498 * @cpu: the processor in question.
8499 * @p: the task pointer to set. 8499 * @p: the task pointer to set.
8500 * 8500 *
8501 * Description: This function must only be used when non-maskable interrupts 8501 * Description: This function must only be used when non-maskable interrupts
8502 * are serviced on a separate stack. It allows the architecture to switch the 8502 * are serviced on a separate stack. It allows the architecture to switch the
8503 * notion of the current task on a cpu in a non-blocking manner. This function 8503 * notion of the current task on a cpu in a non-blocking manner. This function
8504 * must be called with all CPUs synchronized and interrupts disabled, and 8504 * must be called with all CPUs synchronized and interrupts disabled, and
8505 * the caller must save the original value of the current task (see 8505 * the caller must save the original value of the current task (see
8506 * curr_task() above) and restore that value before reenabling interrupts and 8506 * curr_task() above) and restore that value before reenabling interrupts and
8507 * re-starting the system. 8507 * re-starting the system.
8508 * 8508 *
8509 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8509 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8510 */ 8510 */
8511 void set_curr_task(int cpu, struct task_struct *p) 8511 void set_curr_task(int cpu, struct task_struct *p)
8512 { 8512 {
8513 cpu_curr(cpu) = p; 8513 cpu_curr(cpu) = p;
8514 } 8514 }
8515 8515
8516 #endif 8516 #endif
8517 8517
8518 #ifdef CONFIG_FAIR_GROUP_SCHED 8518 #ifdef CONFIG_FAIR_GROUP_SCHED
8519 static void free_fair_sched_group(struct task_group *tg) 8519 static void free_fair_sched_group(struct task_group *tg)
8520 { 8520 {
8521 int i; 8521 int i;
8522 8522
8523 for_each_possible_cpu(i) { 8523 for_each_possible_cpu(i) {
8524 if (tg->cfs_rq) 8524 if (tg->cfs_rq)
8525 kfree(tg->cfs_rq[i]); 8525 kfree(tg->cfs_rq[i]);
8526 if (tg->se) 8526 if (tg->se)
8527 kfree(tg->se[i]); 8527 kfree(tg->se[i]);
8528 } 8528 }
8529 8529
8530 kfree(tg->cfs_rq); 8530 kfree(tg->cfs_rq);
8531 kfree(tg->se); 8531 kfree(tg->se);
8532 } 8532 }
8533 8533
8534 static 8534 static
8535 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8535 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8536 { 8536 {
8537 struct cfs_rq *cfs_rq; 8537 struct cfs_rq *cfs_rq;
8538 struct sched_entity *se; 8538 struct sched_entity *se;
8539 int i; 8539 int i;
8540 8540
8541 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8541 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8542 if (!tg->cfs_rq) 8542 if (!tg->cfs_rq)
8543 goto err; 8543 goto err;
8544 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 8544 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8545 if (!tg->se) 8545 if (!tg->se)
8546 goto err; 8546 goto err;
8547 8547
8548 tg->shares = NICE_0_LOAD; 8548 tg->shares = NICE_0_LOAD;
8549 8549
8550 for_each_possible_cpu(i) { 8550 for_each_possible_cpu(i) {
8551 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8551 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8552 GFP_KERNEL, cpu_to_node(i)); 8552 GFP_KERNEL, cpu_to_node(i));
8553 if (!cfs_rq) 8553 if (!cfs_rq)
8554 goto err; 8554 goto err;
8555 8555
8556 se = kzalloc_node(sizeof(struct sched_entity), 8556 se = kzalloc_node(sizeof(struct sched_entity),
8557 GFP_KERNEL, cpu_to_node(i)); 8557 GFP_KERNEL, cpu_to_node(i));
8558 if (!se) 8558 if (!se)
8559 goto err_free_rq; 8559 goto err_free_rq;
8560 8560
8561 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8561 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8562 } 8562 }
8563 8563
8564 return 1; 8564 return 1;
8565 8565
8566 err_free_rq: 8566 err_free_rq:
8567 kfree(cfs_rq); 8567 kfree(cfs_rq);
8568 err: 8568 err:
8569 return 0; 8569 return 0;
8570 } 8570 }
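
Note that sizeof(cfs_rq) and sizeof(se) above are the sizes of the local pointer variables, so the two kzalloc() calls allocate arrays of nr_cpu_ids pointers; the per-cpu objects themselves are then placed on the memory node of their CPU with kzalloc_node(). A hedged sketch of the same two-level idiom with a hypothetical payload struct:

/* Hedged sketch: per-cpu pointer table plus node-local elements. */
struct foo {				/* hypothetical payload */
	int val;
};

static struct foo **alloc_foo_table(void)
{
	struct foo **table;
	int cpu;

	/* outer array: one pointer per possible CPU, zeroed so cleanup is safe */
	table = kzalloc(sizeof(struct foo *) * nr_cpu_ids, GFP_KERNEL);
	if (!table)
		return NULL;

	for_each_possible_cpu(cpu) {
		/* element allocated on the node backing this CPU */
		table[cpu] = kzalloc_node(sizeof(struct foo),
					  GFP_KERNEL, cpu_to_node(cpu));
		if (!table[cpu])
			goto fail;
	}
	return table;

fail:
	for_each_possible_cpu(cpu)
		kfree(table[cpu]);	/* kfree(NULL) is a no-op */
	kfree(table);
	return NULL;
}
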
8571 8571
8572 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8572 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8573 { 8573 {
8574 struct rq *rq = cpu_rq(cpu); 8574 struct rq *rq = cpu_rq(cpu);
8575 unsigned long flags; 8575 unsigned long flags;
8576 8576
8577 /* 8577 /*
8578 * Only empty task groups can be destroyed, so we can speculatively 8578 * Only empty task groups can be destroyed, so we can speculatively
8579 * check on_list without danger of it being re-added. 8579 * check on_list without danger of it being re-added.
8580 */ 8580 */
8581 if (!tg->cfs_rq[cpu]->on_list) 8581 if (!tg->cfs_rq[cpu]->on_list)
8582 return; 8582 return;
8583 8583
8584 raw_spin_lock_irqsave(&rq->lock, flags); 8584 raw_spin_lock_irqsave(&rq->lock, flags);
8585 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8585 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8586 raw_spin_unlock_irqrestore(&rq->lock, flags); 8586 raw_spin_unlock_irqrestore(&rq->lock, flags);
8587 } 8587 }
8588 #else /* !CONFIG_FAIR_GROUP_SCHED */ 8588 #else /* !CONFIG_FAIR_GROUP_SCHED */
8589 static inline void free_fair_sched_group(struct task_group *tg) 8589 static inline void free_fair_sched_group(struct task_group *tg)
8590 { 8590 {
8591 } 8591 }
8592 8592
8593 static inline 8593 static inline
8594 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8594 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8595 { 8595 {
8596 return 1; 8596 return 1;
8597 } 8597 }
8598 8598
8599 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8599 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8600 { 8600 {
8601 } 8601 }
8602 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8602 #endif /* CONFIG_FAIR_GROUP_SCHED */
8603 8603
8604 #ifdef CONFIG_RT_GROUP_SCHED 8604 #ifdef CONFIG_RT_GROUP_SCHED
8605 static void free_rt_sched_group(struct task_group *tg) 8605 static void free_rt_sched_group(struct task_group *tg)
8606 { 8606 {
8607 int i; 8607 int i;
8608 8608
8609 destroy_rt_bandwidth(&tg->rt_bandwidth); 8609 destroy_rt_bandwidth(&tg->rt_bandwidth);
8610 8610
8611 for_each_possible_cpu(i) { 8611 for_each_possible_cpu(i) {
8612 if (tg->rt_rq) 8612 if (tg->rt_rq)
8613 kfree(tg->rt_rq[i]); 8613 kfree(tg->rt_rq[i]);
8614 if (tg->rt_se) 8614 if (tg->rt_se)
8615 kfree(tg->rt_se[i]); 8615 kfree(tg->rt_se[i]);
8616 } 8616 }
8617 8617
8618 kfree(tg->rt_rq); 8618 kfree(tg->rt_rq);
8619 kfree(tg->rt_se); 8619 kfree(tg->rt_se);
8620 } 8620 }
8621 8621
8622 static 8622 static
8623 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8623 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8624 { 8624 {
8625 struct rt_rq *rt_rq; 8625 struct rt_rq *rt_rq;
8626 struct sched_rt_entity *rt_se; 8626 struct sched_rt_entity *rt_se;
8627 struct rq *rq; 8627 struct rq *rq;
8628 int i; 8628 int i;
8629 8629
8630 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8630 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8631 if (!tg->rt_rq) 8631 if (!tg->rt_rq)
8632 goto err; 8632 goto err;
8633 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 8633 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8634 if (!tg->rt_se) 8634 if (!tg->rt_se)
8635 goto err; 8635 goto err;
8636 8636
8637 init_rt_bandwidth(&tg->rt_bandwidth, 8637 init_rt_bandwidth(&tg->rt_bandwidth,
8638 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8638 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8639 8639
8640 for_each_possible_cpu(i) { 8640 for_each_possible_cpu(i) {
8641 rq = cpu_rq(i); 8641 rq = cpu_rq(i);
8642 8642
8643 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8643 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8644 GFP_KERNEL, cpu_to_node(i)); 8644 GFP_KERNEL, cpu_to_node(i));
8645 if (!rt_rq) 8645 if (!rt_rq)
8646 goto err; 8646 goto err;
8647 8647
8648 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8648 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8649 GFP_KERNEL, cpu_to_node(i)); 8649 GFP_KERNEL, cpu_to_node(i));
8650 if (!rt_se) 8650 if (!rt_se)
8651 goto err_free_rq; 8651 goto err_free_rq;
8652 8652
8653 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 8653 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8654 } 8654 }
8655 8655
8656 return 1; 8656 return 1;
8657 8657
8658 err_free_rq: 8658 err_free_rq:
8659 kfree(rt_rq); 8659 kfree(rt_rq);
8660 err: 8660 err:
8661 return 0; 8661 return 0;
8662 } 8662 }
8663 #else /* !CONFIG_RT_GROUP_SCHED */ 8663 #else /* !CONFIG_RT_GROUP_SCHED */
8664 static inline void free_rt_sched_group(struct task_group *tg) 8664 static inline void free_rt_sched_group(struct task_group *tg)
8665 { 8665 {
8666 } 8666 }
8667 8667
8668 static inline 8668 static inline
8669 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8669 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8670 { 8670 {
8671 return 1; 8671 return 1;
8672 } 8672 }
8673 #endif /* CONFIG_RT_GROUP_SCHED */ 8673 #endif /* CONFIG_RT_GROUP_SCHED */
8674 8674
8675 #ifdef CONFIG_CGROUP_SCHED 8675 #ifdef CONFIG_CGROUP_SCHED
8676 static void free_sched_group(struct task_group *tg) 8676 static void free_sched_group(struct task_group *tg)
8677 { 8677 {
8678 free_fair_sched_group(tg); 8678 free_fair_sched_group(tg);
8679 free_rt_sched_group(tg); 8679 free_rt_sched_group(tg);
8680 autogroup_free(tg); 8680 autogroup_free(tg);
8681 kfree(tg); 8681 kfree(tg);
8682 } 8682 }
8683 8683
8684 /* allocate runqueue etc for a new task group */ 8684 /* allocate runqueue etc for a new task group */
8685 struct task_group *sched_create_group(struct task_group *parent) 8685 struct task_group *sched_create_group(struct task_group *parent)
8686 { 8686 {
8687 struct task_group *tg; 8687 struct task_group *tg;
8688 unsigned long flags; 8688 unsigned long flags;
8689 8689
8690 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8690 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8691 if (!tg) 8691 if (!tg)
8692 return ERR_PTR(-ENOMEM); 8692 return ERR_PTR(-ENOMEM);
8693 8693
8694 if (!alloc_fair_sched_group(tg, parent)) 8694 if (!alloc_fair_sched_group(tg, parent))
8695 goto err; 8695 goto err;
8696 8696
8697 if (!alloc_rt_sched_group(tg, parent)) 8697 if (!alloc_rt_sched_group(tg, parent))
8698 goto err; 8698 goto err;
8699 8699
8700 spin_lock_irqsave(&task_group_lock, flags); 8700 spin_lock_irqsave(&task_group_lock, flags);
8701 list_add_rcu(&tg->list, &task_groups); 8701 list_add_rcu(&tg->list, &task_groups);
8702 8702
8703 WARN_ON(!parent); /* root should already exist */ 8703 WARN_ON(!parent); /* root should already exist */
8704 8704
8705 tg->parent = parent; 8705 tg->parent = parent;
8706 INIT_LIST_HEAD(&tg->children); 8706 INIT_LIST_HEAD(&tg->children);
8707 list_add_rcu(&tg->siblings, &parent->children); 8707 list_add_rcu(&tg->siblings, &parent->children);
8708 spin_unlock_irqrestore(&task_group_lock, flags); 8708 spin_unlock_irqrestore(&task_group_lock, flags);
8709 8709
8710 return tg; 8710 return tg;
8711 8711
8712 err: 8712 err:
8713 free_sched_group(tg); 8713 free_sched_group(tg);
8714 return ERR_PTR(-ENOMEM); 8714 return ERR_PTR(-ENOMEM);
8715 } 8715 }
8716 8716
8717 /* rcu callback to free various structures associated with a task group */ 8717 /* rcu callback to free various structures associated with a task group */
8718 static void free_sched_group_rcu(struct rcu_head *rhp) 8718 static void free_sched_group_rcu(struct rcu_head *rhp)
8719 { 8719 {
8720 /* now it should be safe to free those cfs_rqs */ 8720 /* now it should be safe to free those cfs_rqs */
8721 free_sched_group(container_of(rhp, struct task_group, rcu)); 8721 free_sched_group(container_of(rhp, struct task_group, rcu));
8722 } 8722 }
8723 8723
8724 /* Destroy runqueue etc associated with a task group */ 8724 /* Destroy runqueue etc associated with a task group */
8725 void sched_destroy_group(struct task_group *tg) 8725 void sched_destroy_group(struct task_group *tg)
8726 { 8726 {
8727 unsigned long flags; 8727 unsigned long flags;
8728 int i; 8728 int i;
8729 8729
8730 /* end participation in shares distribution */ 8730 /* end participation in shares distribution */
8731 for_each_possible_cpu(i) 8731 for_each_possible_cpu(i)
8732 unregister_fair_sched_group(tg, i); 8732 unregister_fair_sched_group(tg, i);
8733 8733
8734 spin_lock_irqsave(&task_group_lock, flags); 8734 spin_lock_irqsave(&task_group_lock, flags);
8735 list_del_rcu(&tg->list); 8735 list_del_rcu(&tg->list);
8736 list_del_rcu(&tg->siblings); 8736 list_del_rcu(&tg->siblings);
8737 spin_unlock_irqrestore(&task_group_lock, flags); 8737 spin_unlock_irqrestore(&task_group_lock, flags);
8738 8738
8739 /* wait for possible concurrent references to cfs_rqs to complete */ 8739 /* wait for possible concurrent references to cfs_rqs to complete */
8740 call_rcu(&tg->rcu, free_sched_group_rcu); 8740 call_rcu(&tg->rcu, free_sched_group_rcu);
8741 } 8741 }
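
sched_destroy_group() only unlinks the group with list_del_rcu() and defers the actual kfree() through call_rcu(), so readers that walk task_groups under rcu_read_lock() never dereference freed memory. A minimal sketch of that unlink-then-defer-free pattern, using a hypothetical item type:

/* Hedged sketch of the RCU deferred-free pattern used above. */
struct item {
	struct list_head list;		/* linked on some RCU-protected list */
	struct rcu_head rcu;
};

static void item_free_rcu(struct rcu_head *rhp)
{
	/* runs after a grace period: no rcu_read_lock() reader can still see it */
	kfree(container_of(rhp, struct item, rcu));
}

static void item_destroy(struct item *it)
{
	list_del_rcu(&it->list);	/* unlink; existing readers may still traverse it */
	call_rcu(&it->rcu, item_free_rcu);
}
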
8742 8742
8743 /* Change a task's runqueue when it moves between groups. 8743 /* Change a task's runqueue when it moves between groups.
8744 * The caller of this function should have put the task in its new group 8744 * The caller of this function should have put the task in its new group
8745 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 8745 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8746 * reflect its new group. 8746 * reflect its new group.
8747 */ 8747 */
8748 void sched_move_task(struct task_struct *tsk) 8748 void sched_move_task(struct task_struct *tsk)
8749 { 8749 {
8750 int on_rq, running; 8750 int on_rq, running;
8751 unsigned long flags; 8751 unsigned long flags;
8752 struct rq *rq; 8752 struct rq *rq;
8753 8753
8754 rq = task_rq_lock(tsk, &flags); 8754 rq = task_rq_lock(tsk, &flags);
8755 8755
8756 running = task_current(rq, tsk); 8756 running = task_current(rq, tsk);
8757 on_rq = tsk->on_rq; 8757 on_rq = tsk->on_rq;
8758 8758
8759 if (on_rq) 8759 if (on_rq)
8760 dequeue_task(rq, tsk, 0); 8760 dequeue_task(rq, tsk, 0);
8761 if (unlikely(running)) 8761 if (unlikely(running))
8762 tsk->sched_class->put_prev_task(rq, tsk); 8762 tsk->sched_class->put_prev_task(rq, tsk);
8763 8763
8764 #ifdef CONFIG_FAIR_GROUP_SCHED 8764 #ifdef CONFIG_FAIR_GROUP_SCHED
8765 if (tsk->sched_class->task_move_group) 8765 if (tsk->sched_class->task_move_group)
8766 tsk->sched_class->task_move_group(tsk, on_rq); 8766 tsk->sched_class->task_move_group(tsk, on_rq);
8767 else 8767 else
8768 #endif 8768 #endif
8769 set_task_rq(tsk, task_cpu(tsk)); 8769 set_task_rq(tsk, task_cpu(tsk));
8770 8770
8771 if (unlikely(running)) 8771 if (unlikely(running))
8772 tsk->sched_class->set_curr_task(rq); 8772 tsk->sched_class->set_curr_task(rq);
8773 if (on_rq) 8773 if (on_rq)
8774 enqueue_task(rq, tsk, 0); 8774 enqueue_task(rq, tsk, 0);
8775 8775
8776 task_rq_unlock(rq, tsk, &flags); 8776 task_rq_unlock(rq, tsk, &flags);
8777 } 8777 }
8778 #endif /* CONFIG_CGROUP_SCHED */ 8778 #endif /* CONFIG_CGROUP_SCHED */
8779 8779
8780 #ifdef CONFIG_FAIR_GROUP_SCHED 8780 #ifdef CONFIG_FAIR_GROUP_SCHED
8781 static DEFINE_MUTEX(shares_mutex); 8781 static DEFINE_MUTEX(shares_mutex);
8782 8782
8783 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8783 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8784 { 8784 {
8785 int i; 8785 int i;
8786 unsigned long flags; 8786 unsigned long flags;
8787 8787
8788 /* 8788 /*
8789 * We can't change the weight of the root cgroup. 8789 * We can't change the weight of the root cgroup.
8790 */ 8790 */
8791 if (!tg->se[0]) 8791 if (!tg->se[0])
8792 return -EINVAL; 8792 return -EINVAL;
8793 8793
8794 if (shares < MIN_SHARES) 8794 if (shares < MIN_SHARES)
8795 shares = MIN_SHARES; 8795 shares = MIN_SHARES;
8796 else if (shares > MAX_SHARES) 8796 else if (shares > MAX_SHARES)
8797 shares = MAX_SHARES; 8797 shares = MAX_SHARES;
8798 8798
8799 mutex_lock(&shares_mutex); 8799 mutex_lock(&shares_mutex);
8800 if (tg->shares == shares) 8800 if (tg->shares == shares)
8801 goto done; 8801 goto done;
8802 8802
8803 tg->shares = shares; 8803 tg->shares = shares;
8804 for_each_possible_cpu(i) { 8804 for_each_possible_cpu(i) {
8805 struct rq *rq = cpu_rq(i); 8805 struct rq *rq = cpu_rq(i);
8806 struct sched_entity *se; 8806 struct sched_entity *se;
8807 8807
8808 se = tg->se[i]; 8808 se = tg->se[i];
8809 /* Propagate contribution to hierarchy */ 8809 /* Propagate contribution to hierarchy */
8810 raw_spin_lock_irqsave(&rq->lock, flags); 8810 raw_spin_lock_irqsave(&rq->lock, flags);
8811 for_each_sched_entity(se) 8811 for_each_sched_entity(se)
8812 update_cfs_shares(group_cfs_rq(se)); 8812 update_cfs_shares(group_cfs_rq(se));
8813 raw_spin_unlock_irqrestore(&rq->lock, flags); 8813 raw_spin_unlock_irqrestore(&rq->lock, flags);
8814 } 8814 }
8815 8815
8816 done: 8816 done:
8817 mutex_unlock(&shares_mutex); 8817 mutex_unlock(&shares_mutex);
8818 return 0; 8818 return 0;
8819 } 8819 }
8820 8820
8821 unsigned long sched_group_shares(struct task_group *tg) 8821 unsigned long sched_group_shares(struct task_group *tg)
8822 { 8822 {
8823 return tg->shares; 8823 return tg->shares;
8824 } 8824 }
8825 #endif 8825 #endif
8826 8826
8827 #ifdef CONFIG_RT_GROUP_SCHED 8827 #ifdef CONFIG_RT_GROUP_SCHED
8828 /* 8828 /*
8829 * Ensure that the real time constraints are schedulable. 8829 * Ensure that the real time constraints are schedulable.
8830 */ 8830 */
8831 static DEFINE_MUTEX(rt_constraints_mutex); 8831 static DEFINE_MUTEX(rt_constraints_mutex);
8832 8832
8833 static unsigned long to_ratio(u64 period, u64 runtime) 8833 static unsigned long to_ratio(u64 period, u64 runtime)
8834 { 8834 {
8835 if (runtime == RUNTIME_INF) 8835 if (runtime == RUNTIME_INF)
8836 return 1ULL << 20; 8836 return 1ULL << 20;
8837 8837
8838 return div64_u64(runtime << 20, period); 8838 return div64_u64(runtime << 20, period);
8839 } 8839 }
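
to_ratio() expresses runtime/period as a 20-bit fixed-point fraction, so 1 << 20 (1048576) stands for 100% and an infinite runtime simply maps to the full scale. A short worked example, assuming the usual 1 s period and 950 ms runtime defaults:

/*
 * Worked example (assumed default values):
 *   period  = 1 s    = 1000000000 ns
 *   runtime = 950 ms =  950000000 ns
 *
 *   to_ratio(period, runtime) = (950000000 << 20) / 1000000000
 *                             = 0.95 * 1048576
 *                             ~ 996147      (95% of the 2^20 scale)
 */
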
8840 8840
8841 /* Must be called with tasklist_lock held */ 8841 /* Must be called with tasklist_lock held */
8842 static inline int tg_has_rt_tasks(struct task_group *tg) 8842 static inline int tg_has_rt_tasks(struct task_group *tg)
8843 { 8843 {
8844 struct task_struct *g, *p; 8844 struct task_struct *g, *p;
8845 8845
8846 do_each_thread(g, p) { 8846 do_each_thread(g, p) {
8847 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8847 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8848 return 1; 8848 return 1;
8849 } while_each_thread(g, p); 8849 } while_each_thread(g, p);
8850 8850
8851 return 0; 8851 return 0;
8852 } 8852 }
8853 8853
8854 struct rt_schedulable_data { 8854 struct rt_schedulable_data {
8855 struct task_group *tg; 8855 struct task_group *tg;
8856 u64 rt_period; 8856 u64 rt_period;
8857 u64 rt_runtime; 8857 u64 rt_runtime;
8858 }; 8858 };
8859 8859
8860 static int tg_schedulable(struct task_group *tg, void *data) 8860 static int tg_schedulable(struct task_group *tg, void *data)
8861 { 8861 {
8862 struct rt_schedulable_data *d = data; 8862 struct rt_schedulable_data *d = data;
8863 struct task_group *child; 8863 struct task_group *child;
8864 unsigned long total, sum = 0; 8864 unsigned long total, sum = 0;
8865 u64 period, runtime; 8865 u64 period, runtime;
8866 8866
8867 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8867 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8868 runtime = tg->rt_bandwidth.rt_runtime; 8868 runtime = tg->rt_bandwidth.rt_runtime;
8869 8869
8870 if (tg == d->tg) { 8870 if (tg == d->tg) {
8871 period = d->rt_period; 8871 period = d->rt_period;
8872 runtime = d->rt_runtime; 8872 runtime = d->rt_runtime;
8873 } 8873 }
8874 8874
8875 /* 8875 /*
8876 * Cannot have more runtime than the period. 8876 * Cannot have more runtime than the period.
8877 */ 8877 */
8878 if (runtime > period && runtime != RUNTIME_INF) 8878 if (runtime > period && runtime != RUNTIME_INF)
8879 return -EINVAL; 8879 return -EINVAL;
8880 8880
8881 /* 8881 /*
8882 * Ensure we don't starve existing RT tasks. 8882 * Ensure we don't starve existing RT tasks.
8883 */ 8883 */
8884 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 8884 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8885 return -EBUSY; 8885 return -EBUSY;
8886 8886
8887 total = to_ratio(period, runtime); 8887 total = to_ratio(period, runtime);
8888 8888
8889 /* 8889 /*
8890 * Nobody can have more than the global setting allows. 8890 * Nobody can have more than the global setting allows.
8891 */ 8891 */
8892 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 8892 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8893 return -EINVAL; 8893 return -EINVAL;
8894 8894
8895 /* 8895 /*
8896 * The sum of our children's runtime should not exceed our own. 8896 * The sum of our children's runtime should not exceed our own.
8897 */ 8897 */
8898 list_for_each_entry_rcu(child, &tg->children, siblings) { 8898 list_for_each_entry_rcu(child, &tg->children, siblings) {
8899 period = ktime_to_ns(child->rt_bandwidth.rt_period); 8899 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8900 runtime = child->rt_bandwidth.rt_runtime; 8900 runtime = child->rt_bandwidth.rt_runtime;
8901 8901
8902 if (child == d->tg) { 8902 if (child == d->tg) {
8903 period = d->rt_period; 8903 period = d->rt_period;
8904 runtime = d->rt_runtime; 8904 runtime = d->rt_runtime;
8905 } 8905 }
8906 8906
8907 sum += to_ratio(period, runtime); 8907 sum += to_ratio(period, runtime);
8908 } 8908 }
8909 8909
8910 if (sum > total) 8910 if (sum > total)
8911 return -EINVAL; 8911 return -EINVAL;
8912 8912
8913 return 0; 8913 return 0;
8914 } 8914 }
8915 8915
8916 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8916 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8917 { 8917 {
8918 struct rt_schedulable_data data = { 8918 struct rt_schedulable_data data = {
8919 .tg = tg, 8919 .tg = tg,
8920 .rt_period = period, 8920 .rt_period = period,
8921 .rt_runtime = runtime, 8921 .rt_runtime = runtime,
8922 }; 8922 };
8923 8923
8924 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8924 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8925 } 8925 }
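
walk_tg_tree() applies tg_schedulable() to every group, so a new setting is only accepted if it stays within the global ratio and, at every level, the children's ratios sum to no more than their parent's. A hedged numeric illustration on the same 2^20 scale (the hierarchy and values below are made up):

/*
 * Hedged illustration (hypothetical hierarchy, 1 s periods throughout):
 *   global:  950 ms  -> ~996147   (ceiling for every group)
 *   group G: 300 ms  -> ~314572
 *   child A: 100 ms  -> ~104857
 *   child B: 150 ms  -> ~157286
 *
 *   A + B = ~262143 <= G's ~314572               -> accepted
 *   raising B to 250 ms (~262144) gives ~367001  -> exceeds G, -EINVAL
 */
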
8926 8926
8927 static int tg_set_bandwidth(struct task_group *tg, 8927 static int tg_set_bandwidth(struct task_group *tg,
8928 u64 rt_period, u64 rt_runtime) 8928 u64 rt_period, u64 rt_runtime)
8929 { 8929 {
8930 int i, err = 0; 8930 int i, err = 0;
8931 8931
8932 mutex_lock(&rt_constraints_mutex); 8932 mutex_lock(&rt_constraints_mutex);
8933 read_lock(&tasklist_lock); 8933 read_lock(&tasklist_lock);
8934 err = __rt_schedulable(tg, rt_period, rt_runtime); 8934 err = __rt_schedulable(tg, rt_period, rt_runtime);
8935 if (err) 8935 if (err)
8936 goto unlock; 8936 goto unlock;
8937 8937
8938 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8938 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8939 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8939 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8940 tg->rt_bandwidth.rt_runtime = rt_runtime; 8940 tg->rt_bandwidth.rt_runtime = rt_runtime;
8941 8941
8942 for_each_possible_cpu(i) { 8942 for_each_possible_cpu(i) {
8943 struct rt_rq *rt_rq = tg->rt_rq[i]; 8943 struct rt_rq *rt_rq = tg->rt_rq[i];
8944 8944
8945 raw_spin_lock(&rt_rq->rt_runtime_lock); 8945 raw_spin_lock(&rt_rq->rt_runtime_lock);
8946 rt_rq->rt_runtime = rt_runtime; 8946 rt_rq->rt_runtime = rt_runtime;
8947 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8947 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8948 } 8948 }
8949 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8949 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8950 unlock: 8950 unlock:
8951 read_unlock(&tasklist_lock); 8951 read_unlock(&tasklist_lock);
8952 mutex_unlock(&rt_constraints_mutex); 8952 mutex_unlock(&rt_constraints_mutex);
8953 8953
8954 return err; 8954 return err;
8955 } 8955 }
8956 8956
8957 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8957 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8958 { 8958 {
8959 u64 rt_runtime, rt_period; 8959 u64 rt_runtime, rt_period;
8960 8960
8961 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8961 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8962 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 8962 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8963 if (rt_runtime_us < 0) 8963 if (rt_runtime_us < 0)
8964 rt_runtime = RUNTIME_INF; 8964 rt_runtime = RUNTIME_INF;
8965 8965
8966 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8966 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8967 } 8967 }
8968 8968
8969 long sched_group_rt_runtime(struct task_group *tg) 8969 long sched_group_rt_runtime(struct task_group *tg)
8970 { 8970 {
8971 u64 rt_runtime_us; 8971 u64 rt_runtime_us;
8972 8972
8973 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 8973 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8974 return -1; 8974 return -1;
8975 8975
8976 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 8976 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8977 do_div(rt_runtime_us, NSEC_PER_USEC); 8977 do_div(rt_runtime_us, NSEC_PER_USEC);
8978 return rt_runtime_us; 8978 return rt_runtime_us;
8979 } 8979 }
8980 8980
8981 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 8981 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8982 { 8982 {
8983 u64 rt_runtime, rt_period; 8983 u64 rt_runtime, rt_period;
8984 8984
8985 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8985 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8986 rt_runtime = tg->rt_bandwidth.rt_runtime; 8986 rt_runtime = tg->rt_bandwidth.rt_runtime;
8987 8987
8988 if (rt_period == 0) 8988 if (rt_period == 0)
8989 return -EINVAL; 8989 return -EINVAL;
8990 8990
8991 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8991 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8992 } 8992 }
8993 8993
8994 long sched_group_rt_period(struct task_group *tg) 8994 long sched_group_rt_period(struct task_group *tg)
8995 { 8995 {
8996 u64 rt_period_us; 8996 u64 rt_period_us;
8997 8997
8998 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 8998 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8999 do_div(rt_period_us, NSEC_PER_USEC); 8999 do_div(rt_period_us, NSEC_PER_USEC);
9000 return rt_period_us; 9000 return rt_period_us;
9001 } 9001 }
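
The four helpers above translate between the microsecond values exposed to userspace and the nanosecond/ktime quantities kept in rt_bandwidth; a negative runtime is stored as RUNTIME_INF (no throttling) and reads back as -1. A hedged usage sketch, assuming tg points to a valid, non-root task group and ignoring most error handling:

/* Hedged sketch; 'tg' is assumed valid and not the root group. */
int err;

err = sched_group_set_rt_period(tg, 1000000);		/* period  = 1 s, in us */
if (!err)
	err = sched_group_set_rt_runtime(tg, 200000);	/* runtime = 200 ms, in us */

if (!err)
	pr_info("rt: %ld us every %ld us\n",		/* -1 runtime would mean RUNTIME_INF */
		sched_group_rt_runtime(tg), sched_group_rt_period(tg));
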
9002 9002
9003 static int sched_rt_global_constraints(void) 9003 static int sched_rt_global_constraints(void)
9004 { 9004 {
9005 u64 runtime, period; 9005 u64 runtime, period;
9006 int ret = 0; 9006 int ret = 0;
9007 9007
9008 if (sysctl_sched_rt_period <= 0) 9008 if (sysctl_sched_rt_period <= 0)
9009 return -EINVAL; 9009 return -EINVAL;
9010 9010
9011 runtime = global_rt_runtime(); 9011 runtime = global_rt_runtime();
9012 period = global_rt_period(); 9012 period = global_rt_period();
9013 9013
9014 /* 9014 /*
9015 * Sanity check on the sysctl variables. 9015 * Sanity check on the sysctl variables.
9016 */ 9016 */
9017 if (runtime > period && runtime != RUNTIME_INF) 9017 if (runtime > period && runtime != RUNTIME_INF)
9018 return -EINVAL; 9018 return -EINVAL;
9019 9019
9020 mutex_lock(&rt_constraints_mutex); 9020 mutex_lock(&rt_constraints_mutex);
9021 read_lock(&tasklist_lock); 9021 read_lock(&tasklist_lock);
9022 ret = __rt_schedulable(NULL, 0, 0); 9022 ret = __rt_schedulable(NULL, 0, 0);
9023 read_unlock(&tasklist_lock); 9023 read_unlock(&tasklist_lock);
9024 mutex_unlock(&rt_constraints_mutex); 9024 mutex_unlock(&rt_constraints_mutex);
9025 9025
9026 return ret; 9026 return ret;
9027 } 9027 }
9028 9028
9029 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 9029 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9030 { 9030 {
9031 /* Don't accept realtime tasks when there is no way for them to run */ 9031 /* Don't accept realtime tasks when there is no way for them to run */
9032 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 9032 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9033 return 0; 9033 return 0;
9034 9034
9035 return 1; 9035 return 1;
9036 } 9036 }
9037 9037
9038 #else /* !CONFIG_RT_GROUP_SCHED */ 9038 #else /* !CONFIG_RT_GROUP_SCHED */
9039 static int sched_rt_global_constraints(void) 9039 static int sched_rt_global_constraints(void)
9040 { 9040 {
9041 unsigned long flags; 9041 unsigned long flags;
9042 int i; 9042 int i;
9043 9043
9044 if (sysctl_sched_rt_period <= 0) 9044 if (sysctl_sched_rt_period <= 0)
9045 return -EINVAL; 9045 return -EINVAL;
9046 9046
9047 /* 9047 /*
9048 * There are always some RT tasks in the root group 9048 * There are always some RT tasks in the root group
9049 * -- migration, kstopmachine, etc. 9049 * -- migration, kstopmachine, etc.
9050 */ 9050 */
9051 if (sysctl_sched_rt_runtime == 0) 9051 if (sysctl_sched_rt_runtime == 0)
9052 return -EBUSY; 9052 return -EBUSY;
9053 9053
9054 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9054 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9055 for_each_possible_cpu(i) { 9055 for_each_possible_cpu(i) {
9056 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9056 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9057 9057
9058 raw_spin_lock(&rt_rq->rt_runtime_lock); 9058 raw_spin_lock(&rt_rq->rt_runtime_lock);
9059 rt_rq->rt_runtime = global_rt_runtime(); 9059 rt_rq->rt_runtime = global_rt_runtime();
9060 raw_spin_unlock(&rt_rq->rt_runtime_lock); 9060 raw_spin_unlock(&rt_rq->rt_runtime_lock);
9061 } 9061 }
9062 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 9062 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
9063 9063
9064 return 0; 9064 return 0;
9065 } 9065 }
9066 #endif /* CONFIG_RT_GROUP_SCHED */ 9066 #endif /* CONFIG_RT_GROUP_SCHED */
9067 9067
9068 int sched_rt_handler(struct ctl_table *table, int write, 9068 int sched_rt_handler(struct ctl_table *table, int write,
9069 void __user *buffer, size_t *lenp, 9069 void __user *buffer, size_t *lenp,
9070 loff_t *ppos) 9070 loff_t *ppos)
9071 { 9071 {
9072 int ret; 9072 int ret;
9073 int old_period, old_runtime; 9073 int old_period, old_runtime;
9074 static DEFINE_MUTEX(mutex); 9074 static DEFINE_MUTEX(mutex);
9075 9075
9076 mutex_lock(&mutex); 9076 mutex_lock(&mutex);
9077 old_period = sysctl_sched_rt_period; 9077 old_period = sysctl_sched_rt_period;
9078 old_runtime = sysctl_sched_rt_runtime; 9078 old_runtime = sysctl_sched_rt_runtime;
9079 9079
9080 ret = proc_dointvec(table, write, buffer, lenp, ppos); 9080 ret = proc_dointvec(table, write, buffer, lenp, ppos);
9081 9081
9082 if (!ret && write) { 9082 if (!ret && write) {
9083 ret = sched_rt_global_constraints(); 9083 ret = sched_rt_global_constraints();
9084 if (ret) { 9084 if (ret) {
9085 sysctl_sched_rt_period = old_period; 9085 sysctl_sched_rt_period = old_period;
9086 sysctl_sched_rt_runtime = old_runtime; 9086 sysctl_sched_rt_runtime = old_runtime;
9087 } else { 9087 } else {
9088 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 9088 def_rt_bandwidth.rt_runtime = global_rt_runtime();
9089 def_rt_bandwidth.rt_period = 9089 def_rt_bandwidth.rt_period =
9090 ns_to_ktime(global_rt_period()); 9090 ns_to_ktime(global_rt_period());
9091 } 9091 }
9092 } 9092 }
9093 mutex_unlock(&mutex); 9093 mutex_unlock(&mutex);
9094 9094
9095 return ret; 9095 return ret;
9096 } 9096 }
9097 9097
9098 #ifdef CONFIG_CGROUP_SCHED 9098 #ifdef CONFIG_CGROUP_SCHED
9099 9099
9100 /* return corresponding task_group object of a cgroup */ 9100 /* return corresponding task_group object of a cgroup */
9101 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 9101 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
9102 { 9102 {
9103 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 9103 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
9104 struct task_group, css); 9104 struct task_group, css);
9105 } 9105 }
9106 9106
9107 static struct cgroup_subsys_state * 9107 static struct cgroup_subsys_state *
9108 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 9108 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9109 { 9109 {
9110 struct task_group *tg, *parent; 9110 struct task_group *tg, *parent;
9111 9111
9112 if (!cgrp->parent) { 9112 if (!cgrp->parent) {
9113 /* This is early initialization for the top cgroup */ 9113 /* This is early initialization for the top cgroup */
9114 return &root_task_group.css; 9114 return &root_task_group.css;
9115 } 9115 }
9116 9116
9117 parent = cgroup_tg(cgrp->parent); 9117 parent = cgroup_tg(cgrp->parent);
9118 tg = sched_create_group(parent); 9118 tg = sched_create_group(parent);
9119 if (IS_ERR(tg)) 9119 if (IS_ERR(tg))
9120 return ERR_PTR(-ENOMEM); 9120 return ERR_PTR(-ENOMEM);
9121 9121
9122 return &tg->css; 9122 return &tg->css;
9123 } 9123 }
9124 9124
9125 static void 9125 static void
9126 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 9126 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9127 { 9127 {
9128 struct task_group *tg = cgroup_tg(cgrp); 9128 struct task_group *tg = cgroup_tg(cgrp);
9129 9129
9130 sched_destroy_group(tg); 9130 sched_destroy_group(tg);
9131 } 9131 }
9132 9132
9133 static int 9133 static int
9134 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 9134 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9135 { 9135 {
9136 #ifdef CONFIG_RT_GROUP_SCHED 9136 #ifdef CONFIG_RT_GROUP_SCHED
9137 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 9137 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
9138 return -EINVAL; 9138 return -EINVAL;
9139 #else 9139 #else
9140 /* We don't support RT-tasks being in separate groups */ 9140 /* We don't support RT-tasks being in separate groups */
9141 if (tsk->sched_class != &fair_sched_class) 9141 if (tsk->sched_class != &fair_sched_class)
9142 return -EINVAL; 9142 return -EINVAL;
9143 #endif 9143 #endif
9144 return 0; 9144 return 0;
9145 } 9145 }
9146 9146
9147 static int 9147 static int
9148 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9148 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9149 struct task_struct *tsk, bool threadgroup) 9149 struct task_struct *tsk, bool threadgroup)
9150 { 9150 {
9151 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 9151 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
9152 if (retval) 9152 if (retval)
9153 return retval; 9153 return retval;
9154 if (threadgroup) { 9154 if (threadgroup) {
9155 struct task_struct *c; 9155 struct task_struct *c;
9156 rcu_read_lock(); 9156 rcu_read_lock();
9157 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 9157 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9158 retval = cpu_cgroup_can_attach_task(cgrp, c); 9158 retval = cpu_cgroup_can_attach_task(cgrp, c);
9159 if (retval) { 9159 if (retval) {
9160 rcu_read_unlock(); 9160 rcu_read_unlock();
9161 return retval; 9161 return retval;
9162 } 9162 }
9163 } 9163 }
9164 rcu_read_unlock(); 9164 rcu_read_unlock();
9165 } 9165 }
9166 return 0; 9166 return 0;
9167 } 9167 }
9168 9168
9169 static void 9169 static void
9170 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9170 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9171 struct cgroup *old_cont, struct task_struct *tsk, 9171 struct cgroup *old_cont, struct task_struct *tsk,
9172 bool threadgroup) 9172 bool threadgroup)
9173 { 9173 {
9174 sched_move_task(tsk); 9174 sched_move_task(tsk);
9175 if (threadgroup) { 9175 if (threadgroup) {
9176 struct task_struct *c; 9176 struct task_struct *c;
9177 rcu_read_lock(); 9177 rcu_read_lock();
9178 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 9178 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9179 sched_move_task(c); 9179 sched_move_task(c);
9180 } 9180 }
9181 rcu_read_unlock(); 9181 rcu_read_unlock();
9182 } 9182 }
9183 } 9183 }
9184 9184
9185 static void 9185 static void
9186 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 9186 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9187 struct cgroup *old_cgrp, struct task_struct *task) 9187 struct cgroup *old_cgrp, struct task_struct *task)
9188 { 9188 {
9189 /* 9189 /*
9190 * cgroup_exit() is called in the copy_process() failure path. 9190 * cgroup_exit() is called in the copy_process() failure path.
9191 * Ignore this case since the task hasn't run yet; this avoids 9191 * Ignore this case since the task hasn't run yet; this avoids
9192 * trying to poke a half-freed task state from generic code. 9192 * trying to poke a half-freed task state from generic code.
9193 */ 9193 */
9194 if (!(task->flags & PF_EXITING)) 9194 if (!(task->flags & PF_EXITING))
9195 return; 9195 return;
9196 9196
9197 sched_move_task(task); 9197 sched_move_task(task);
9198 } 9198 }
9199 9199
9200 #ifdef CONFIG_FAIR_GROUP_SCHED 9200 #ifdef CONFIG_FAIR_GROUP_SCHED
9201 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 9201 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9202 u64 shareval) 9202 u64 shareval)
9203 { 9203 {
9204 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9204 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
9205 } 9205 }
9206 9206
9207 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 9207 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9208 { 9208 {
9209 struct task_group *tg = cgroup_tg(cgrp); 9209 struct task_group *tg = cgroup_tg(cgrp);
9210 9210
9211 return (u64) tg->shares; 9211 return (u64) tg->shares;
9212 } 9212 }
9213 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9213 #endif /* CONFIG_FAIR_GROUP_SCHED */
9214 9214
9215 #ifdef CONFIG_RT_GROUP_SCHED 9215 #ifdef CONFIG_RT_GROUP_SCHED
9216 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9216 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9217 s64 val) 9217 s64 val)
9218 { 9218 {
9219 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 9219 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9220 } 9220 }
9221 9221
9222 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 9222 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9223 { 9223 {
9224 return sched_group_rt_runtime(cgroup_tg(cgrp)); 9224 return sched_group_rt_runtime(cgroup_tg(cgrp));
9225 } 9225 }
9226 9226
9227 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 9227 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9228 u64 rt_period_us) 9228 u64 rt_period_us)
9229 { 9229 {
9230 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 9230 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9231 } 9231 }
9232 9232
9233 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 9233 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9234 { 9234 {
9235 return sched_group_rt_period(cgroup_tg(cgrp)); 9235 return sched_group_rt_period(cgroup_tg(cgrp));
9236 } 9236 }
9237 #endif /* CONFIG_RT_GROUP_SCHED */ 9237 #endif /* CONFIG_RT_GROUP_SCHED */
9238 9238
9239 static struct cftype cpu_files[] = { 9239 static struct cftype cpu_files[] = {
9240 #ifdef CONFIG_FAIR_GROUP_SCHED 9240 #ifdef CONFIG_FAIR_GROUP_SCHED
9241 { 9241 {
9242 .name = "shares", 9242 .name = "shares",
9243 .read_u64 = cpu_shares_read_u64, 9243 .read_u64 = cpu_shares_read_u64,
9244 .write_u64 = cpu_shares_write_u64, 9244 .write_u64 = cpu_shares_write_u64,
9245 }, 9245 },
9246 #endif 9246 #endif
9247 #ifdef CONFIG_RT_GROUP_SCHED 9247 #ifdef CONFIG_RT_GROUP_SCHED
9248 { 9248 {
9249 .name = "rt_runtime_us", 9249 .name = "rt_runtime_us",
9250 .read_s64 = cpu_rt_runtime_read, 9250 .read_s64 = cpu_rt_runtime_read,
9251 .write_s64 = cpu_rt_runtime_write, 9251 .write_s64 = cpu_rt_runtime_write,
9252 }, 9252 },
9253 { 9253 {
9254 .name = "rt_period_us", 9254 .name = "rt_period_us",
9255 .read_u64 = cpu_rt_period_read_uint, 9255 .read_u64 = cpu_rt_period_read_uint,
9256 .write_u64 = cpu_rt_period_write_uint, 9256 .write_u64 = cpu_rt_period_write_uint,
9257 }, 9257 },
9258 #endif 9258 #endif
9259 }; 9259 };
9260 9260
9261 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 9261 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9262 { 9262 {
9263 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 9263 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
9264 } 9264 }
9265 9265
9266 struct cgroup_subsys cpu_cgroup_subsys = { 9266 struct cgroup_subsys cpu_cgroup_subsys = {
9267 .name = "cpu", 9267 .name = "cpu",
9268 .create = cpu_cgroup_create, 9268 .create = cpu_cgroup_create,
9269 .destroy = cpu_cgroup_destroy, 9269 .destroy = cpu_cgroup_destroy,
9270 .can_attach = cpu_cgroup_can_attach, 9270 .can_attach = cpu_cgroup_can_attach,
9271 .attach = cpu_cgroup_attach, 9271 .attach = cpu_cgroup_attach,
9272 .exit = cpu_cgroup_exit, 9272 .exit = cpu_cgroup_exit,
9273 .populate = cpu_cgroup_populate, 9273 .populate = cpu_cgroup_populate,
9274 .subsys_id = cpu_cgroup_subsys_id, 9274 .subsys_id = cpu_cgroup_subsys_id,
9275 .early_init = 1, 9275 .early_init = 1,
9276 }; 9276 };
9277 9277
9278 #endif /* CONFIG_CGROUP_SCHED */ 9278 #endif /* CONFIG_CGROUP_SCHED */
9279 9279
9280 #ifdef CONFIG_CGROUP_CPUACCT
9281
9282 /*
9283  * CPU accounting code for task groups.
9284  *
9285  * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
9286  * (balbir@in.ibm.com).
9287  */
9288
9289 /* track cpu usage of a group of tasks and its child groups */
9290 struct cpuacct {
9291         struct cgroup_subsys_state css;
9292         /* cpuusage holds pointer to a u64-type object on every cpu */
9293         u64 __percpu *cpuusage;
9294         struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9295         struct cpuacct *parent;
9296 };
9297
9298 struct cgroup_subsys cpuacct_subsys;
9299
9300 /* return cpu accounting group corresponding to this container */
9301 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9302 {
9303         return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9304                             struct cpuacct, css);
9305 }
9306
9307 /* return cpu accounting group to which this task belongs */
9308 static inline struct cpuacct *task_ca(struct task_struct *tsk)
9309 {
9310         return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9311                             struct cpuacct, css);
9312 }
9313
9314 /* create a new cpu accounting group */
9315 static struct cgroup_subsys_state *cpuacct_create(
9316         struct cgroup_subsys *ss, struct cgroup *cgrp)
9317 {
9318         struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9319         int i;
9320
9321         if (!ca)
9322                 goto out;
9323
9324         ca->cpuusage = alloc_percpu(u64);
9325         if (!ca->cpuusage)
9326                 goto out_free_ca;
9327
9328         for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9329                 if (percpu_counter_init(&ca->cpustat[i], 0))
9330                         goto out_free_counters;
9331
9332         if (cgrp->parent)
9333                 ca->parent = cgroup_ca(cgrp->parent);
9334
9335         return &ca->css;
9336
9337 out_free_counters:
9338         while (--i >= 0)
9339                 percpu_counter_destroy(&ca->cpustat[i]);
9340         free_percpu(ca->cpuusage);
9341 out_free_ca:
9342         kfree(ca);
9343 out:
9344         return ERR_PTR(-ENOMEM);
9345 }
9346
9347 /* destroy an existing cpu accounting group */
9348 static void
9349 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9350 {
9351         struct cpuacct *ca = cgroup_ca(cgrp);
9352         int i;
9353
9354         for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9355                 percpu_counter_destroy(&ca->cpustat[i]);
9356         free_percpu(ca->cpuusage);
9357         kfree(ca);
9358 }
9359
9360 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9361 {
9362         u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9363         u64 data;
9364
9365 #ifndef CONFIG_64BIT
9366         /*
9367          * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9368          */
9369         raw_spin_lock_irq(&cpu_rq(cpu)->lock);
9370         data = *cpuusage;
9371         raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
9372 #else
9373         data = *cpuusage;
9374 #endif
9375
9376         return data;
9377 }
9378
9379 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9380 {
9381         u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9382
9383 #ifndef CONFIG_64BIT
9384         /*
9385          * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9386          */
9387         raw_spin_lock_irq(&cpu_rq(cpu)->lock);
9388         *cpuusage = val;
9389         raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
9390 #else
9391         *cpuusage = val;
9392 #endif
9393 }
9394
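The #ifndef CONFIG_64BIT branches in cpuacct_cpuusage_read() and cpuacct_cpuusage_write() exist because an aligned 64-bit load or store is not a single atomic access on 32-bit machines, so an unlocked reader could observe one half of an in-progress update. Below is a minimal userspace analogue of the same read/write discipline, not kernel code: a pthread mutex stands in for rq->lock and every name in it is made up for the illustration.

/* Userspace analogue only: the mutex plays the role of rq->lock so the
 * 64-bit counter is never seen half-updated, even on 32-bit targets.
 * Build with: cc -pthread demo.c */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t cpuusage;                          /* analogue of *cpuusage */
static pthread_mutex_t usage_lock = PTHREAD_MUTEX_INITIALIZER;

static uint64_t usage_read(void)
{
	uint64_t data;

	pthread_mutex_lock(&usage_lock);           /* cf. raw_spin_lock_irq() */
	data = cpuusage;
	pthread_mutex_unlock(&usage_lock);
	return data;
}

static void usage_write(uint64_t val)
{
	pthread_mutex_lock(&usage_lock);
	cpuusage = val;
	pthread_mutex_unlock(&usage_lock);
}

int main(void)
{
	usage_write(5000000000ULL);                /* needs both 32-bit halves */
	printf("%llu\n", (unsigned long long)usage_read());
	return 0;
}

On 64-bit builds the kernel skips the lock because the plain access is already atomic there; the sketch above locks unconditionally for brevity.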
9395 /* return total cpu usage (in nanoseconds) of a group */
9396 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9397 {
9398         struct cpuacct *ca = cgroup_ca(cgrp);
9399         u64 totalcpuusage = 0;
9400         int i;
9401
9402         for_each_present_cpu(i)
9403                 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9404
9405         return totalcpuusage;
9406 }
9407
9408 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9409                           u64 reset)
9410 {
9411         struct cpuacct *ca = cgroup_ca(cgrp);
9412         int err = 0;
9413         int i;
9414
9415         if (reset) {
9416                 err = -EINVAL;
9417                 goto out;
9418         }
9419
9420         for_each_present_cpu(i)
9421                 cpuacct_cpuusage_write(ca, i, 0);
9422
9423 out:
9424         return err;
9425 }
9426
9427 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9428                                    struct seq_file *m)
9429 {
9430         struct cpuacct *ca = cgroup_ca(cgroup);
9431         u64 percpu;
9432         int i;
9433
9434         for_each_present_cpu(i) {
9435                 percpu = cpuacct_cpuusage_read(ca, i);
9436                 seq_printf(m, "%llu ", (unsigned long long) percpu);
9437         }
9438         seq_printf(m, "\n");
9439         return 0;
9440 }
9441
9442 static const char *cpuacct_stat_desc[] = {
9443         [CPUACCT_STAT_USER] = "user",
9444         [CPUACCT_STAT_SYSTEM] = "system",
9445 };
9446
9447 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9448                               struct cgroup_map_cb *cb)
9449 {
9450         struct cpuacct *ca = cgroup_ca(cgrp);
9451         int i;
9452
9453         for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
9454                 s64 val = percpu_counter_read(&ca->cpustat[i]);
9455                 val = cputime64_to_clock_t(val);
9456                 cb->fill(cb, cpuacct_stat_desc[i], val);
9457         }
9458         return 0;
9459 }
9460
9461 static struct cftype files[] = {
9462         {
9463                 .name = "usage",
9464                 .read_u64 = cpuusage_read,
9465                 .write_u64 = cpuusage_write,
9466         },
9467         {
9468                 .name = "usage_percpu",
9469                 .read_seq_string = cpuacct_percpu_seq_read,
9470         },
9471         {
9472                 .name = "stat",
9473                 .read_map = cpuacct_stats_show,
9474         },
9475 };
9476
9477 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9478 {
9479         return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9480 }
9481
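cpuacct_populate() registers the three files above, so every cpuacct group directory exposes cpuacct.usage (total run time in nanoseconds), cpuacct.usage_percpu (one value per present CPU) and cpuacct.stat (user/system time in USER_HZ ticks). A hedged sketch of reading them from userspace follows; the /cgroup mount point is an assumption for the illustration, not something this code dictates.

/* Hedged userspace sketch, assuming:  mount -t cgroup -o cpuacct none /cgroup
 * The paths below are illustrative only. */
#include <stdio.h>

static void dump(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	while (fgets(line, sizeof(line), f))
		printf("%s: %s", path, line);
	fclose(f);
}

int main(void)
{
	dump("/cgroup/cpuacct.usage");          /* summed over CPUs by cpuusage_read() */
	dump("/cgroup/cpuacct.usage_percpu");   /* per-CPU values on one line */
	dump("/cgroup/cpuacct.stat");           /* "user N" and "system N" lines */
	return 0;
}

As cpuusage_write() above shows, writing 0 to cpuacct.usage resets the per-CPU counters; any other value is rejected with -EINVAL.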
9482 /*
9483  * charge this task's execution time to its accounting group.
9484  *
9485  * called with rq->lock held.
9486  */
9487 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9488 {
9489         struct cpuacct *ca;
9490         int cpu;
9491
9492         if (unlikely(!cpuacct_subsys.active))
9493                 return;
9494
9495         cpu = task_cpu(tsk);
9496
9497         rcu_read_lock();
9498
9499         ca = task_ca(tsk);
9500
9501         for (; ca; ca = ca->parent) {
9502                 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9503                 *cpuusage += cputime;
9504         }
9505
9506         rcu_read_unlock();
9507 }
9508
9509 /*
9510  * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9511  * in cputime_t units. As a result, cpuacct_update_stats calls
9512  * percpu_counter_add with values large enough to always overflow the
9513  * per cpu batch limit causing bad SMP scalability.
9514  *
9515  * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9516  * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9517  * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9518  */
9519 #ifdef CONFIG_SMP
9520 #define CPUACCT_BATCH \
9521         min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9522 #else
9523 #define CPUACCT_BATCH 0
9524 #endif
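To make the batch scaling above concrete, here is an illustrative calculation only; the percpu_counter_batch value and the fine-grained cputime_one_jiffy figure are assumptions picked for the example, not taken from any particular architecture.

/* Illustration of the CPUACCT_BATCH arithmetic with assumed inputs. */
#include <limits.h>
#include <stdio.h>

#define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
	long long percpu_counter_batch = 128;   /* assumed batch on a large SMP box */
	long long cputime_one_jiffy;
	long long batch;

	/* Tick-based accounting: one jiffy is one cputime unit, so the
	 * effective batch stays small. */
	cputime_one_jiffy = 1;
	batch = min_t(long long, percpu_counter_batch * cputime_one_jiffy, INT_MAX);
	printf("tick-based accounting:   batch = %lld\n", batch);   /* 128 */

	/* Fine-grained accounting: one jiffy may be tens of millions of
	 * cputime units (4e7 assumed here), so the product overshoots
	 * INT_MAX and the cap applies. */
	cputime_one_jiffy = 40000000;
	batch = min_t(long long, percpu_counter_batch * cputime_one_jiffy, INT_MAX);
	printf("fine-grained accounting: batch = %lld\n", batch);   /* INT_MAX */
	return 0;
}

Either way the batch only bounds when a CPU folds its local delta into the shared counter; a larger batch means fewer cross-CPU updates in cpuacct_update_stats() at the cost of more slack in percpu_counter_read().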
9525
9526 /*
9527  * Charge the system/user time to the task's accounting group.
9528  */
9529 static void cpuacct_update_stats(struct task_struct *tsk,
9530                                  enum cpuacct_stat_index idx, cputime_t val)
9531 {
9532         struct cpuacct *ca;
9533         int batch = CPUACCT_BATCH;
9534
9535         if (unlikely(!cpuacct_subsys.active))
9536                 return;
9537
9538         rcu_read_lock();
9539         ca = task_ca(tsk);
9540
9541         do {
9542                 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9543                 ca = ca->parent;
9544         } while (ca);
9545         rcu_read_unlock();
9546 }
9547
9548 struct cgroup_subsys cpuacct_subsys = {
9549         .name = "cpuacct",
9550         .create = cpuacct_create,
9551         .destroy = cpuacct_destroy,
9552         .populate = cpuacct_populate,
9553         .subsys_id = cpuacct_subsys_id,
9554 };
9555 #endif /* CONFIG_CGROUP_CPUACCT */
9556
9557