Commit 17d9f311eca13a42bf950198a358be1420d19c5f

Authored by Daniel Hellstrom
Committed by David S. Miller
1 parent 90d3ac15e5

SCHED_TTWU_QUEUE is not longer needed since sparc32 now implements IPI

Signed-off-by: Daniel Hellstrom <daniel@gaisler.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 2 changed files with 1 additions and 6 deletions Inline Diff

1 config ARCH 1 config ARCH
2 string 2 string
3 option env="ARCH" 3 option env="ARCH"
4 4
5 config KERNELVERSION 5 config KERNELVERSION
6 string 6 string
7 option env="KERNELVERSION" 7 option env="KERNELVERSION"
8 8
9 config DEFCONFIG_LIST 9 config DEFCONFIG_LIST
10 string 10 string
11 depends on !UML 11 depends on !UML
12 option defconfig_list 12 option defconfig_list
13 default "/lib/modules/$UNAME_RELEASE/.config" 13 default "/lib/modules/$UNAME_RELEASE/.config"
14 default "/etc/kernel-config" 14 default "/etc/kernel-config"
15 default "/boot/config-$UNAME_RELEASE" 15 default "/boot/config-$UNAME_RELEASE"
16 default "$ARCH_DEFCONFIG" 16 default "$ARCH_DEFCONFIG"
17 default "arch/$ARCH/defconfig" 17 default "arch/$ARCH/defconfig"
18 18
19 config CONSTRUCTORS 19 config CONSTRUCTORS
20 bool 20 bool
21 depends on !UML 21 depends on !UML
22 default y 22 default y
23 23
24 config HAVE_IRQ_WORK 24 config HAVE_IRQ_WORK
25 bool 25 bool
26 26
27 config IRQ_WORK 27 config IRQ_WORK
28 bool 28 bool
29 depends on HAVE_IRQ_WORK 29 depends on HAVE_IRQ_WORK
30 30
31 menu "General setup" 31 menu "General setup"
32 32
33 config EXPERIMENTAL 33 config EXPERIMENTAL
34 bool "Prompt for development and/or incomplete code/drivers" 34 bool "Prompt for development and/or incomplete code/drivers"
35 ---help--- 35 ---help---
36 Some of the various things that Linux supports (such as network 36 Some of the various things that Linux supports (such as network
37 drivers, file systems, network protocols, etc.) can be in a state 37 drivers, file systems, network protocols, etc.) can be in a state
38 of development where the functionality, stability, or the level of 38 of development where the functionality, stability, or the level of
39 testing is not yet high enough for general use. This is usually 39 testing is not yet high enough for general use. This is usually
40 known as the "alpha-test" phase among developers. If a feature is 40 known as the "alpha-test" phase among developers. If a feature is
41 currently in alpha-test, then the developers usually discourage 41 currently in alpha-test, then the developers usually discourage
42 uninformed widespread use of this feature by the general public to 42 uninformed widespread use of this feature by the general public to
43 avoid "Why doesn't this work?" type mail messages. However, active 43 avoid "Why doesn't this work?" type mail messages. However, active
44 testing and use of these systems is welcomed. Just be aware that it 44 testing and use of these systems is welcomed. Just be aware that it
45 may not meet the normal level of reliability or it may fail to work 45 may not meet the normal level of reliability or it may fail to work
46 in some special cases. Detailed bug reports from people familiar 46 in some special cases. Detailed bug reports from people familiar
47 with the kernel internals are usually welcomed by the developers 47 with the kernel internals are usually welcomed by the developers
48 (before submitting bug reports, please read the documents 48 (before submitting bug reports, please read the documents
49 <file:README>, <file:MAINTAINERS>, <file:REPORTING-BUGS>, 49 <file:README>, <file:MAINTAINERS>, <file:REPORTING-BUGS>,
50 <file:Documentation/BUG-HUNTING>, and 50 <file:Documentation/BUG-HUNTING>, and
51 <file:Documentation/oops-tracing.txt> in the kernel source). 51 <file:Documentation/oops-tracing.txt> in the kernel source).
52 52
53 This option will also make obsoleted drivers available. These are 53 This option will also make obsoleted drivers available. These are
54 drivers that have been replaced by something else, and/or are 54 drivers that have been replaced by something else, and/or are
55 scheduled to be removed in a future kernel release. 55 scheduled to be removed in a future kernel release.
56 56
57 Unless you intend to help test and develop a feature or driver that 57 Unless you intend to help test and develop a feature or driver that
58 falls into this category, or you have a situation that requires 58 falls into this category, or you have a situation that requires
59 using these features, you should probably say N here, which will 59 using these features, you should probably say N here, which will
60 cause the configurator to present you with fewer choices. If 60 cause the configurator to present you with fewer choices. If
61 you say Y here, you will be offered the choice of using features or 61 you say Y here, you will be offered the choice of using features or
62 drivers that are currently considered to be in the alpha-test phase. 62 drivers that are currently considered to be in the alpha-test phase.
63 63
64 config BROKEN 64 config BROKEN
65 bool 65 bool
66 66
67 config BROKEN_ON_SMP 67 config BROKEN_ON_SMP
68 bool 68 bool
69 depends on BROKEN || !SMP 69 depends on BROKEN || !SMP
70 default y 70 default y
71 71
72 config INIT_ENV_ARG_LIMIT 72 config INIT_ENV_ARG_LIMIT
73 int 73 int
74 default 32 if !UML 74 default 32 if !UML
75 default 128 if UML 75 default 128 if UML
76 help 76 help
77 Maximum of each of the number of arguments and environment 77 Maximum of each of the number of arguments and environment
78 variables passed to init from the kernel command line. 78 variables passed to init from the kernel command line.
79 79
80 80
81 config CROSS_COMPILE 81 config CROSS_COMPILE
82 string "Cross-compiler tool prefix" 82 string "Cross-compiler tool prefix"
83 help 83 help
84 Same as running 'make CROSS_COMPILE=prefix-' but stored for 84 Same as running 'make CROSS_COMPILE=prefix-' but stored for
85 default make runs in this kernel build directory. You don't 85 default make runs in this kernel build directory. You don't
86 need to set this unless you want the configured kernel build 86 need to set this unless you want the configured kernel build
87 directory to select the cross-compiler automatically. 87 directory to select the cross-compiler automatically.
88 88
89 config LOCALVERSION 89 config LOCALVERSION
90 string "Local version - append to kernel release" 90 string "Local version - append to kernel release"
91 help 91 help
92 Append an extra string to the end of your kernel version. 92 Append an extra string to the end of your kernel version.
93 This will show up when you type uname, for example. 93 This will show up when you type uname, for example.
94 The string you set here will be appended after the contents of 94 The string you set here will be appended after the contents of
95 any files with a filename matching localversion* in your 95 any files with a filename matching localversion* in your
96 object and source tree, in that order. Your total string can 96 object and source tree, in that order. Your total string can
97 be a maximum of 64 characters. 97 be a maximum of 64 characters.
98 98
99 config LOCALVERSION_AUTO 99 config LOCALVERSION_AUTO
100 bool "Automatically append version information to the version string" 100 bool "Automatically append version information to the version string"
101 default y 101 default y
102 help 102 help
103 This will try to automatically determine if the current tree is a 103 This will try to automatically determine if the current tree is a
104 release tree by looking for git tags that belong to the current 104 release tree by looking for git tags that belong to the current
105 top of tree revision. 105 top of tree revision.
106 106
107 A string of the format -gxxxxxxxx will be added to the localversion 107 A string of the format -gxxxxxxxx will be added to the localversion
108 if a git-based tree is found. The string generated by this will be 108 if a git-based tree is found. The string generated by this will be
109 appended after any matching localversion* files, and after the value 109 appended after any matching localversion* files, and after the value
110 set in CONFIG_LOCALVERSION. 110 set in CONFIG_LOCALVERSION.
111 111
112 (The actual string used here is the first eight characters produced 112 (The actual string used here is the first eight characters produced
113 by running the command: 113 by running the command:
114 114
115 $ git rev-parse --verify HEAD 115 $ git rev-parse --verify HEAD
116 116
117 which is done within the script "scripts/setlocalversion".) 117 which is done within the script "scripts/setlocalversion".)
118 118
119 config HAVE_KERNEL_GZIP 119 config HAVE_KERNEL_GZIP
120 bool 120 bool
121 121
122 config HAVE_KERNEL_BZIP2 122 config HAVE_KERNEL_BZIP2
123 bool 123 bool
124 124
125 config HAVE_KERNEL_LZMA 125 config HAVE_KERNEL_LZMA
126 bool 126 bool
127 127
128 config HAVE_KERNEL_XZ 128 config HAVE_KERNEL_XZ
129 bool 129 bool
130 130
131 config HAVE_KERNEL_LZO 131 config HAVE_KERNEL_LZO
132 bool 132 bool
133 133
134 choice 134 choice
135 prompt "Kernel compression mode" 135 prompt "Kernel compression mode"
136 default KERNEL_GZIP 136 default KERNEL_GZIP
137 depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO 137 depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO
138 help 138 help
139 The linux kernel is a kind of self-extracting executable. 139 The linux kernel is a kind of self-extracting executable.
140 Several compression algorithms are available, which differ 140 Several compression algorithms are available, which differ
141 in efficiency, compression and decompression speed. 141 in efficiency, compression and decompression speed.
142 Compression speed is only relevant when building a kernel. 142 Compression speed is only relevant when building a kernel.
143 Decompression speed is relevant at each boot. 143 Decompression speed is relevant at each boot.
144 144
145 If you have any problems with bzip2 or lzma compressed 145 If you have any problems with bzip2 or lzma compressed
146 kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older 146 kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older
147 version of this functionality (bzip2 only), for 2.4, was 147 version of this functionality (bzip2 only), for 2.4, was
148 supplied by Christian Ludwig) 148 supplied by Christian Ludwig)
149 149
150 High compression options are mostly useful for users, who 150 High compression options are mostly useful for users, who
151 are low on disk space (embedded systems), but for whom ram 151 are low on disk space (embedded systems), but for whom ram
152 size matters less. 152 size matters less.
153 153
154 If in doubt, select 'gzip' 154 If in doubt, select 'gzip'
155 155
156 config KERNEL_GZIP 156 config KERNEL_GZIP
157 bool "Gzip" 157 bool "Gzip"
158 depends on HAVE_KERNEL_GZIP 158 depends on HAVE_KERNEL_GZIP
159 help 159 help
160 The old and tried gzip compression. It provides a good balance 160 The old and tried gzip compression. It provides a good balance
161 between compression ratio and decompression speed. 161 between compression ratio and decompression speed.
162 162
163 config KERNEL_BZIP2 163 config KERNEL_BZIP2
164 bool "Bzip2" 164 bool "Bzip2"
165 depends on HAVE_KERNEL_BZIP2 165 depends on HAVE_KERNEL_BZIP2
166 help 166 help
167 Its compression ratio and speed is intermediate. 167 Its compression ratio and speed is intermediate.
168 Decompression speed is slowest among the three. The kernel 168 Decompression speed is slowest among the three. The kernel
169 size is about 10% smaller with bzip2, in comparison to gzip. 169 size is about 10% smaller with bzip2, in comparison to gzip.
170 Bzip2 uses a large amount of memory. For modern kernels you 170 Bzip2 uses a large amount of memory. For modern kernels you
171 will need at least 8MB RAM or more for booting. 171 will need at least 8MB RAM or more for booting.
172 172
173 config KERNEL_LZMA 173 config KERNEL_LZMA
174 bool "LZMA" 174 bool "LZMA"
175 depends on HAVE_KERNEL_LZMA 175 depends on HAVE_KERNEL_LZMA
176 help 176 help
177 The most recent compression algorithm. 177 The most recent compression algorithm.
178 Its ratio is best, decompression speed is between the other 178 Its ratio is best, decompression speed is between the other
179 two. Compression is slowest. The kernel size is about 33% 179 two. Compression is slowest. The kernel size is about 33%
180 smaller with LZMA in comparison to gzip. 180 smaller with LZMA in comparison to gzip.
181 181
182 config KERNEL_XZ 182 config KERNEL_XZ
183 bool "XZ" 183 bool "XZ"
184 depends on HAVE_KERNEL_XZ 184 depends on HAVE_KERNEL_XZ
185 help 185 help
186 XZ uses the LZMA2 algorithm and instruction set specific 186 XZ uses the LZMA2 algorithm and instruction set specific
187 BCJ filters which can improve compression ratio of executable 187 BCJ filters which can improve compression ratio of executable
188 code. The size of the kernel is about 30% smaller with XZ in 188 code. The size of the kernel is about 30% smaller with XZ in
189 comparison to gzip. On architectures for which there is a BCJ 189 comparison to gzip. On architectures for which there is a BCJ
190 filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ 190 filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ
191 will create a few percent smaller kernel than plain LZMA. 191 will create a few percent smaller kernel than plain LZMA.
192 192
193 The speed is about the same as with LZMA: The decompression 193 The speed is about the same as with LZMA: The decompression
194 speed of XZ is better than that of bzip2 but worse than gzip 194 speed of XZ is better than that of bzip2 but worse than gzip
195 and LZO. Compression is slow. 195 and LZO. Compression is slow.
196 196
197 config KERNEL_LZO 197 config KERNEL_LZO
198 bool "LZO" 198 bool "LZO"
199 depends on HAVE_KERNEL_LZO 199 depends on HAVE_KERNEL_LZO
200 help 200 help
201 Its compression ratio is the poorest among the 4. The kernel 201 Its compression ratio is the poorest among the 4. The kernel
202 size is about 10% bigger than gzip; however its speed 202 size is about 10% bigger than gzip; however its speed
203 (both compression and decompression) is the fastest. 203 (both compression and decompression) is the fastest.
204 204
205 endchoice 205 endchoice
206 206
207 config SWAP 207 config SWAP
208 bool "Support for paging of anonymous memory (swap)" 208 bool "Support for paging of anonymous memory (swap)"
209 depends on MMU && BLOCK 209 depends on MMU && BLOCK
210 default y 210 default y
211 help 211 help
212 This option allows you to choose whether you want to have support 212 This option allows you to choose whether you want to have support
213 for so called swap devices or swap files in your kernel that are 213 for so called swap devices or swap files in your kernel that are
214 used to provide more virtual memory than the actual RAM present 214 used to provide more virtual memory than the actual RAM present
215 in your computer. If unsure say Y. 215 in your computer. If unsure say Y.
216 216
217 config SYSVIPC 217 config SYSVIPC
218 bool "System V IPC" 218 bool "System V IPC"
219 ---help--- 219 ---help---
220 Inter Process Communication is a suite of library functions and 220 Inter Process Communication is a suite of library functions and
221 system calls which let processes (running programs) synchronize and 221 system calls which let processes (running programs) synchronize and
222 exchange information. It is generally considered to be a good thing, 222 exchange information. It is generally considered to be a good thing,
223 and some programs won't run unless you say Y here. In particular, if 223 and some programs won't run unless you say Y here. In particular, if
224 you want to run the DOS emulator dosemu under Linux (read the 224 you want to run the DOS emulator dosemu under Linux (read the
225 DOSEMU-HOWTO, available from <http://www.tldp.org/docs.html#howto>), 225 DOSEMU-HOWTO, available from <http://www.tldp.org/docs.html#howto>),
226 you'll need to say Y here. 226 you'll need to say Y here.
227 227
228 You can find documentation about IPC with "info ipc" and also in 228 You can find documentation about IPC with "info ipc" and also in
229 section 6.4 of the Linux Programmer's Guide, available from 229 section 6.4 of the Linux Programmer's Guide, available from
230 <http://www.tldp.org/guides.html>. 230 <http://www.tldp.org/guides.html>.
231 231
232 config SYSVIPC_SYSCTL 232 config SYSVIPC_SYSCTL
233 bool 233 bool
234 depends on SYSVIPC 234 depends on SYSVIPC
235 depends on SYSCTL 235 depends on SYSCTL
236 default y 236 default y
237 237
238 config POSIX_MQUEUE 238 config POSIX_MQUEUE
239 bool "POSIX Message Queues" 239 bool "POSIX Message Queues"
240 depends on NET && EXPERIMENTAL 240 depends on NET && EXPERIMENTAL
241 ---help--- 241 ---help---
242 POSIX variant of message queues is a part of IPC. In POSIX message 242 POSIX variant of message queues is a part of IPC. In POSIX message
243 queues every message has a priority which decides about succession 243 queues every message has a priority which decides about succession
244 of receiving it by a process. If you want to compile and run 244 of receiving it by a process. If you want to compile and run
245 programs written e.g. for Solaris with use of its POSIX message 245 programs written e.g. for Solaris with use of its POSIX message
246 queues (functions mq_*) say Y here. 246 queues (functions mq_*) say Y here.
247 247
248 POSIX message queues are visible as a filesystem called 'mqueue' 248 POSIX message queues are visible as a filesystem called 'mqueue'
249 and can be mounted somewhere if you want to do filesystem 249 and can be mounted somewhere if you want to do filesystem
250 operations on message queues. 250 operations on message queues.
251 251
252 If unsure, say Y. 252 If unsure, say Y.
253 253
254 config POSIX_MQUEUE_SYSCTL 254 config POSIX_MQUEUE_SYSCTL
255 bool 255 bool
256 depends on POSIX_MQUEUE 256 depends on POSIX_MQUEUE
257 depends on SYSCTL 257 depends on SYSCTL
258 default y 258 default y
259 259
260 config BSD_PROCESS_ACCT 260 config BSD_PROCESS_ACCT
261 bool "BSD Process Accounting" 261 bool "BSD Process Accounting"
262 help 262 help
263 If you say Y here, a user level program will be able to instruct the 263 If you say Y here, a user level program will be able to instruct the
264 kernel (via a special system call) to write process accounting 264 kernel (via a special system call) to write process accounting
265 information to a file: whenever a process exits, information about 265 information to a file: whenever a process exits, information about
266 that process will be appended to the file by the kernel. The 266 that process will be appended to the file by the kernel. The
267 information includes things such as creation time, owning user, 267 information includes things such as creation time, owning user,
268 command name, memory usage, controlling terminal etc. (the complete 268 command name, memory usage, controlling terminal etc. (the complete
269 list is in the struct acct in <file:include/linux/acct.h>). It is 269 list is in the struct acct in <file:include/linux/acct.h>). It is
270 up to the user level program to do useful things with this 270 up to the user level program to do useful things with this
271 information. This is generally a good idea, so say Y. 271 information. This is generally a good idea, so say Y.
272 272
273 config BSD_PROCESS_ACCT_V3 273 config BSD_PROCESS_ACCT_V3
274 bool "BSD Process Accounting version 3 file format" 274 bool "BSD Process Accounting version 3 file format"
275 depends on BSD_PROCESS_ACCT 275 depends on BSD_PROCESS_ACCT
276 default n 276 default n
277 help 277 help
278 If you say Y here, the process accounting information is written 278 If you say Y here, the process accounting information is written
279 in a new file format that also logs the process IDs of each 279 in a new file format that also logs the process IDs of each
280 process and it's parent. Note that this file format is incompatible 280 process and it's parent. Note that this file format is incompatible
281 with previous v0/v1/v2 file formats, so you will need updated tools 281 with previous v0/v1/v2 file formats, so you will need updated tools
282 for processing it. A preliminary version of these tools is available 282 for processing it. A preliminary version of these tools is available
283 at <http://www.gnu.org/software/acct/>. 283 at <http://www.gnu.org/software/acct/>.
284 284
285 config FHANDLE 285 config FHANDLE
286 bool "open by fhandle syscalls" 286 bool "open by fhandle syscalls"
287 select EXPORTFS 287 select EXPORTFS
288 help 288 help
289 If you say Y here, a user level program will be able to map 289 If you say Y here, a user level program will be able to map
290 file names to handle and then later use the handle for 290 file names to handle and then later use the handle for
291 different file system operations. This is useful in implementing 291 different file system operations. This is useful in implementing
292 userspace file servers, which now track files using handles instead 292 userspace file servers, which now track files using handles instead
293 of names. The handle would remain the same even if file names 293 of names. The handle would remain the same even if file names
294 get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) 294 get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
295 syscalls. 295 syscalls.
296 296
297 config TASKSTATS 297 config TASKSTATS
298 bool "Export task/process statistics through netlink (EXPERIMENTAL)" 298 bool "Export task/process statistics through netlink (EXPERIMENTAL)"
299 depends on NET 299 depends on NET
300 default n 300 default n
301 help 301 help
302 Export selected statistics for tasks/processes through the 302 Export selected statistics for tasks/processes through the
303 generic netlink interface. Unlike BSD process accounting, the 303 generic netlink interface. Unlike BSD process accounting, the
304 statistics are available during the lifetime of tasks/processes as 304 statistics are available during the lifetime of tasks/processes as
305 responses to commands. Like BSD accounting, they are sent to user 305 responses to commands. Like BSD accounting, they are sent to user
306 space on task exit. 306 space on task exit.
307 307
308 Say N if unsure. 308 Say N if unsure.
309 309
310 config TASK_DELAY_ACCT 310 config TASK_DELAY_ACCT
311 bool "Enable per-task delay accounting (EXPERIMENTAL)" 311 bool "Enable per-task delay accounting (EXPERIMENTAL)"
312 depends on TASKSTATS 312 depends on TASKSTATS
313 help 313 help
314 Collect information on time spent by a task waiting for system 314 Collect information on time spent by a task waiting for system
315 resources like cpu, synchronous block I/O completion and swapping 315 resources like cpu, synchronous block I/O completion and swapping
316 in pages. Such statistics can help in setting a task's priorities 316 in pages. Such statistics can help in setting a task's priorities
317 relative to other tasks for cpu, io, rss limits etc. 317 relative to other tasks for cpu, io, rss limits etc.
318 318
319 Say N if unsure. 319 Say N if unsure.
320 320
321 config TASK_XACCT 321 config TASK_XACCT
322 bool "Enable extended accounting over taskstats (EXPERIMENTAL)" 322 bool "Enable extended accounting over taskstats (EXPERIMENTAL)"
323 depends on TASKSTATS 323 depends on TASKSTATS
324 help 324 help
325 Collect extended task accounting data and send the data 325 Collect extended task accounting data and send the data
326 to userland for processing over the taskstats interface. 326 to userland for processing over the taskstats interface.
327 327
328 Say N if unsure. 328 Say N if unsure.
329 329
330 config TASK_IO_ACCOUNTING 330 config TASK_IO_ACCOUNTING
331 bool "Enable per-task storage I/O accounting (EXPERIMENTAL)" 331 bool "Enable per-task storage I/O accounting (EXPERIMENTAL)"
332 depends on TASK_XACCT 332 depends on TASK_XACCT
333 help 333 help
334 Collect information on the number of bytes of storage I/O which this 334 Collect information on the number of bytes of storage I/O which this
335 task has caused. 335 task has caused.
336 336
337 Say N if unsure. 337 Say N if unsure.
338 338
339 config AUDIT 339 config AUDIT
340 bool "Auditing support" 340 bool "Auditing support"
341 depends on NET 341 depends on NET
342 help 342 help
343 Enable auditing infrastructure that can be used with another 343 Enable auditing infrastructure that can be used with another
344 kernel subsystem, such as SELinux (which requires this for 344 kernel subsystem, such as SELinux (which requires this for
345 logging of avc messages output). Does not do system-call 345 logging of avc messages output). Does not do system-call
346 auditing without CONFIG_AUDITSYSCALL. 346 auditing without CONFIG_AUDITSYSCALL.
347 347
348 config AUDITSYSCALL 348 config AUDITSYSCALL
349 bool "Enable system-call auditing support" 349 bool "Enable system-call auditing support"
350 depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH) 350 depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH)
351 default y if SECURITY_SELINUX 351 default y if SECURITY_SELINUX
352 help 352 help
353 Enable low-overhead system-call auditing infrastructure that 353 Enable low-overhead system-call auditing infrastructure that
354 can be used independently or with another kernel subsystem, 354 can be used independently or with another kernel subsystem,
355 such as SELinux. 355 such as SELinux.
356 356
357 config AUDIT_WATCH 357 config AUDIT_WATCH
358 def_bool y 358 def_bool y
359 depends on AUDITSYSCALL 359 depends on AUDITSYSCALL
360 select FSNOTIFY 360 select FSNOTIFY
361 361
362 config AUDIT_TREE 362 config AUDIT_TREE
363 def_bool y 363 def_bool y
364 depends on AUDITSYSCALL 364 depends on AUDITSYSCALL
365 select FSNOTIFY 365 select FSNOTIFY
366 366
367 source "kernel/irq/Kconfig" 367 source "kernel/irq/Kconfig"
368 368
369 menu "RCU Subsystem" 369 menu "RCU Subsystem"
370 370
371 choice 371 choice
372 prompt "RCU Implementation" 372 prompt "RCU Implementation"
373 default TREE_RCU 373 default TREE_RCU
374 374
375 config TREE_RCU 375 config TREE_RCU
376 bool "Tree-based hierarchical RCU" 376 bool "Tree-based hierarchical RCU"
377 depends on !PREEMPT && SMP 377 depends on !PREEMPT && SMP
378 help 378 help
379 This option selects the RCU implementation that is 379 This option selects the RCU implementation that is
380 designed for very large SMP system with hundreds or 380 designed for very large SMP system with hundreds or
381 thousands of CPUs. It also scales down nicely to 381 thousands of CPUs. It also scales down nicely to
382 smaller systems. 382 smaller systems.
383 383
384 config TREE_PREEMPT_RCU 384 config TREE_PREEMPT_RCU
385 bool "Preemptible tree-based hierarchical RCU" 385 bool "Preemptible tree-based hierarchical RCU"
386 depends on PREEMPT 386 depends on PREEMPT
387 help 387 help
388 This option selects the RCU implementation that is 388 This option selects the RCU implementation that is
389 designed for very large SMP systems with hundreds or 389 designed for very large SMP systems with hundreds or
390 thousands of CPUs, but for which real-time response 390 thousands of CPUs, but for which real-time response
391 is also required. It also scales down nicely to 391 is also required. It also scales down nicely to
392 smaller systems. 392 smaller systems.
393 393
394 config TINY_RCU 394 config TINY_RCU
395 bool "UP-only small-memory-footprint RCU" 395 bool "UP-only small-memory-footprint RCU"
396 depends on !SMP 396 depends on !SMP
397 help 397 help
398 This option selects the RCU implementation that is 398 This option selects the RCU implementation that is
399 designed for UP systems from which real-time response 399 designed for UP systems from which real-time response
400 is not required. This option greatly reduces the 400 is not required. This option greatly reduces the
401 memory footprint of RCU. 401 memory footprint of RCU.
402 402
403 config TINY_PREEMPT_RCU 403 config TINY_PREEMPT_RCU
404 bool "Preemptible UP-only small-memory-footprint RCU" 404 bool "Preemptible UP-only small-memory-footprint RCU"
405 depends on !SMP && PREEMPT 405 depends on !SMP && PREEMPT
406 help 406 help
407 This option selects the RCU implementation that is designed 407 This option selects the RCU implementation that is designed
408 for real-time UP systems. This option greatly reduces the 408 for real-time UP systems. This option greatly reduces the
409 memory footprint of RCU. 409 memory footprint of RCU.
410 410
411 endchoice 411 endchoice
412 412
413 config PREEMPT_RCU 413 config PREEMPT_RCU
414 def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU ) 414 def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU )
415 help 415 help
416 This option enables preemptible-RCU code that is common between 416 This option enables preemptible-RCU code that is common between
417 the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. 417 the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
418 418
419 config RCU_TRACE 419 config RCU_TRACE
420 bool "Enable tracing for RCU" 420 bool "Enable tracing for RCU"
421 help 421 help
422 This option provides tracing in RCU which presents stats 422 This option provides tracing in RCU which presents stats
423 in debugfs for debugging RCU implementation. 423 in debugfs for debugging RCU implementation.
424 424
425 Say Y here if you want to enable RCU tracing 425 Say Y here if you want to enable RCU tracing
426 Say N if you are unsure. 426 Say N if you are unsure.
427 427
428 config RCU_FANOUT 428 config RCU_FANOUT
429 int "Tree-based hierarchical RCU fanout value" 429 int "Tree-based hierarchical RCU fanout value"
430 range 2 64 if 64BIT 430 range 2 64 if 64BIT
431 range 2 32 if !64BIT 431 range 2 32 if !64BIT
432 depends on TREE_RCU || TREE_PREEMPT_RCU 432 depends on TREE_RCU || TREE_PREEMPT_RCU
433 default 64 if 64BIT 433 default 64 if 64BIT
434 default 32 if !64BIT 434 default 32 if !64BIT
435 help 435 help
436 This option controls the fanout of hierarchical implementations 436 This option controls the fanout of hierarchical implementations
437 of RCU, allowing RCU to work efficiently on machines with 437 of RCU, allowing RCU to work efficiently on machines with
438 large numbers of CPUs. This value must be at least the fourth 438 large numbers of CPUs. This value must be at least the fourth
439 root of NR_CPUS, which allows NR_CPUS to be insanely large. 439 root of NR_CPUS, which allows NR_CPUS to be insanely large.
440 The default value of RCU_FANOUT should be used for production 440 The default value of RCU_FANOUT should be used for production
441 systems, but if you are stress-testing the RCU implementation 441 systems, but if you are stress-testing the RCU implementation
442 itself, small RCU_FANOUT values allow you to test large-system 442 itself, small RCU_FANOUT values allow you to test large-system
443 code paths on small(er) systems. 443 code paths on small(er) systems.
444 444
445 Select a specific number if testing RCU itself. 445 Select a specific number if testing RCU itself.
446 Take the default if unsure. 446 Take the default if unsure.
447 447
448 config RCU_FANOUT_EXACT 448 config RCU_FANOUT_EXACT
449 bool "Disable tree-based hierarchical RCU auto-balancing" 449 bool "Disable tree-based hierarchical RCU auto-balancing"
450 depends on TREE_RCU || TREE_PREEMPT_RCU 450 depends on TREE_RCU || TREE_PREEMPT_RCU
451 default n 451 default n
452 help 452 help
453 This option forces use of the exact RCU_FANOUT value specified, 453 This option forces use of the exact RCU_FANOUT value specified,
454 regardless of imbalances in the hierarchy. This is useful for 454 regardless of imbalances in the hierarchy. This is useful for
455 testing RCU itself, and might one day be useful on systems with 455 testing RCU itself, and might one day be useful on systems with
456 strong NUMA behavior. 456 strong NUMA behavior.
457 457
458 Without RCU_FANOUT_EXACT, the code will balance the hierarchy. 458 Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
459 459
460 Say N if unsure. 460 Say N if unsure.
461 461
462 config RCU_FAST_NO_HZ 462 config RCU_FAST_NO_HZ
463 bool "Accelerate last non-dyntick-idle CPU's grace periods" 463 bool "Accelerate last non-dyntick-idle CPU's grace periods"
464 depends on TREE_RCU && NO_HZ && SMP 464 depends on TREE_RCU && NO_HZ && SMP
465 default n 465 default n
466 help 466 help
467 This option causes RCU to attempt to accelerate grace periods 467 This option causes RCU to attempt to accelerate grace periods
468 in order to allow the final CPU to enter dynticks-idle state 468 in order to allow the final CPU to enter dynticks-idle state
469 more quickly. On the other hand, this option increases the 469 more quickly. On the other hand, this option increases the
470 overhead of the dynticks-idle checking, particularly on systems 470 overhead of the dynticks-idle checking, particularly on systems
471 with large numbers of CPUs. 471 with large numbers of CPUs.
472 472
473 Say Y if energy efficiency is critically important, particularly 473 Say Y if energy efficiency is critically important, particularly
474 if you have relatively few CPUs. 474 if you have relatively few CPUs.
475 475
476 Say N if you are unsure. 476 Say N if you are unsure.
477 477
478 config TREE_RCU_TRACE 478 config TREE_RCU_TRACE
479 def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU ) 479 def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU )
480 select DEBUG_FS 480 select DEBUG_FS
481 help 481 help
482 This option provides tracing for the TREE_RCU and 482 This option provides tracing for the TREE_RCU and
483 TREE_PREEMPT_RCU implementations, permitting Makefile to 483 TREE_PREEMPT_RCU implementations, permitting Makefile to
484 trivially select kernel/rcutree_trace.c. 484 trivially select kernel/rcutree_trace.c.
485 485
486 config RCU_BOOST 486 config RCU_BOOST
487 bool "Enable RCU priority boosting" 487 bool "Enable RCU priority boosting"
488 depends on RT_MUTEXES && TINY_PREEMPT_RCU 488 depends on RT_MUTEXES && TINY_PREEMPT_RCU
489 default n 489 default n
490 help 490 help
491 This option boosts the priority of preempted RCU readers that 491 This option boosts the priority of preempted RCU readers that
492 block the current preemptible RCU grace period for too long. 492 block the current preemptible RCU grace period for too long.
493 This option also prevents heavy loads from blocking RCU 493 This option also prevents heavy loads from blocking RCU
494 callback invocation for all flavors of RCU. 494 callback invocation for all flavors of RCU.
495 495
496 Say Y here if you are working with real-time apps or heavy loads 496 Say Y here if you are working with real-time apps or heavy loads
497 Say N here if you are unsure. 497 Say N here if you are unsure.
498 498
499 config RCU_BOOST_PRIO 499 config RCU_BOOST_PRIO
500 int "Real-time priority to boost RCU readers to" 500 int "Real-time priority to boost RCU readers to"
501 range 1 99 501 range 1 99
502 depends on RCU_BOOST 502 depends on RCU_BOOST
503 default 1 503 default 1
504 help 504 help
505 This option specifies the real-time priority to which preempted 505 This option specifies the real-time priority to which preempted
506 RCU readers are to be boosted. If you are working with CPU-bound 506 RCU readers are to be boosted. If you are working with CPU-bound
507 real-time applications, you should specify a priority higher then 507 real-time applications, you should specify a priority higher then
508 the highest-priority CPU-bound application. 508 the highest-priority CPU-bound application.
509 509
510 Specify the real-time priority, or take the default if unsure. 510 Specify the real-time priority, or take the default if unsure.
511 511
512 config RCU_BOOST_DELAY 512 config RCU_BOOST_DELAY
513 int "Milliseconds to delay boosting after RCU grace-period start" 513 int "Milliseconds to delay boosting after RCU grace-period start"
514 range 0 3000 514 range 0 3000
515 depends on RCU_BOOST 515 depends on RCU_BOOST
516 default 500 516 default 500
517 help 517 help
518 This option specifies the time to wait after the beginning of 518 This option specifies the time to wait after the beginning of
519 a given grace period before priority-boosting preempted RCU 519 a given grace period before priority-boosting preempted RCU
520 readers blocking that grace period. Note that any RCU reader 520 readers blocking that grace period. Note that any RCU reader
521 blocking an expedited RCU grace period is boosted immediately. 521 blocking an expedited RCU grace period is boosted immediately.
522 522
523 Accept the default if unsure. 523 Accept the default if unsure.
524 524
525 endmenu # "RCU Subsystem" 525 endmenu # "RCU Subsystem"
526 526
527 config IKCONFIG 527 config IKCONFIG
528 tristate "Kernel .config support" 528 tristate "Kernel .config support"
529 ---help--- 529 ---help---
530 This option enables the complete Linux kernel ".config" file 530 This option enables the complete Linux kernel ".config" file
531 contents to be saved in the kernel. It provides documentation 531 contents to be saved in the kernel. It provides documentation
532 of which kernel options are used in a running kernel or in an 532 of which kernel options are used in a running kernel or in an
533 on-disk kernel. This information can be extracted from the kernel 533 on-disk kernel. This information can be extracted from the kernel
534 image file with the script scripts/extract-ikconfig and used as 534 image file with the script scripts/extract-ikconfig and used as
535 input to rebuild the current kernel or to build another kernel. 535 input to rebuild the current kernel or to build another kernel.
536 It can also be extracted from a running kernel by reading 536 It can also be extracted from a running kernel by reading
537 /proc/config.gz if enabled (below). 537 /proc/config.gz if enabled (below).
538 538
539 config IKCONFIG_PROC 539 config IKCONFIG_PROC
540 bool "Enable access to .config through /proc/config.gz" 540 bool "Enable access to .config through /proc/config.gz"
541 depends on IKCONFIG && PROC_FS 541 depends on IKCONFIG && PROC_FS
542 ---help--- 542 ---help---
543 This option enables access to the kernel configuration file 543 This option enables access to the kernel configuration file
544 through /proc/config.gz. 544 through /proc/config.gz.
545 545
546 config LOG_BUF_SHIFT 546 config LOG_BUF_SHIFT
547 int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" 547 int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
548 range 12 21 548 range 12 21
549 default 17 549 default 17
550 help 550 help
551 Select kernel log buffer size as a power of 2. 551 Select kernel log buffer size as a power of 2.
552 Examples: 552 Examples:
553 17 => 128 KB 553 17 => 128 KB
554 16 => 64 KB 554 16 => 64 KB
555 15 => 32 KB 555 15 => 32 KB
556 14 => 16 KB 556 14 => 16 KB
557 13 => 8 KB 557 13 => 8 KB
558 12 => 4 KB 558 12 => 4 KB
559 559
560 # 560 #
561 # Architectures with an unreliable sched_clock() should select this: 561 # Architectures with an unreliable sched_clock() should select this:
562 # 562 #
563 config HAVE_UNSTABLE_SCHED_CLOCK 563 config HAVE_UNSTABLE_SCHED_CLOCK
564 bool 564 bool
565 565
566 menuconfig CGROUPS 566 menuconfig CGROUPS
567 boolean "Control Group support" 567 boolean "Control Group support"
568 depends on EVENTFD 568 depends on EVENTFD
569 help 569 help
570 This option adds support for grouping sets of processes together, for 570 This option adds support for grouping sets of processes together, for
571 use with process control subsystems such as Cpusets, CFS, memory 571 use with process control subsystems such as Cpusets, CFS, memory
572 controls or device isolation. 572 controls or device isolation.
573 See 573 See
574 - Documentation/scheduler/sched-design-CFS.txt (CFS) 574 - Documentation/scheduler/sched-design-CFS.txt (CFS)
575 - Documentation/cgroups/ (features for grouping, isolation 575 - Documentation/cgroups/ (features for grouping, isolation
576 and resource control) 576 and resource control)
577 577
578 Say N if unsure. 578 Say N if unsure.
579 579
580 if CGROUPS 580 if CGROUPS
581 581
582 config CGROUP_DEBUG 582 config CGROUP_DEBUG
583 bool "Example debug cgroup subsystem" 583 bool "Example debug cgroup subsystem"
584 default n 584 default n
585 help 585 help
586 This option enables a simple cgroup subsystem that 586 This option enables a simple cgroup subsystem that
587 exports useful debugging information about the cgroups 587 exports useful debugging information about the cgroups
588 framework. 588 framework.
589 589
590 Say N if unsure. 590 Say N if unsure.
591 591
592 config CGROUP_NS 592 config CGROUP_NS
593 bool "Namespace cgroup subsystem" 593 bool "Namespace cgroup subsystem"
594 help 594 help
595 Provides a simple namespace cgroup subsystem to 595 Provides a simple namespace cgroup subsystem to
596 provide hierarchical naming of sets of namespaces, 596 provide hierarchical naming of sets of namespaces,
597 for instance virtual servers and checkpoint/restart 597 for instance virtual servers and checkpoint/restart
598 jobs. 598 jobs.
599 599
600 config CGROUP_FREEZER 600 config CGROUP_FREEZER
601 bool "Freezer cgroup subsystem" 601 bool "Freezer cgroup subsystem"
602 help 602 help
603 Provides a way to freeze and unfreeze all tasks in a 603 Provides a way to freeze and unfreeze all tasks in a
604 cgroup. 604 cgroup.
605 605
606 config CGROUP_DEVICE 606 config CGROUP_DEVICE
607 bool "Device controller for cgroups" 607 bool "Device controller for cgroups"
608 help 608 help
609 Provides a cgroup implementing whitelists for devices which 609 Provides a cgroup implementing whitelists for devices which
610 a process in the cgroup can mknod or open. 610 a process in the cgroup can mknod or open.
611 611
612 config CPUSETS 612 config CPUSETS
613 bool "Cpuset support" 613 bool "Cpuset support"
614 help 614 help
615 This option will let you create and manage CPUSETs which 615 This option will let you create and manage CPUSETs which
616 allow dynamically partitioning a system into sets of CPUs and 616 allow dynamically partitioning a system into sets of CPUs and
617 Memory Nodes and assigning tasks to run only within those sets. 617 Memory Nodes and assigning tasks to run only within those sets.
618 This is primarily useful on large SMP or NUMA systems. 618 This is primarily useful on large SMP or NUMA systems.
619 619
620 Say N if unsure. 620 Say N if unsure.
621 621
622 config PROC_PID_CPUSET 622 config PROC_PID_CPUSET
623 bool "Include legacy /proc/<pid>/cpuset file" 623 bool "Include legacy /proc/<pid>/cpuset file"
624 depends on CPUSETS 624 depends on CPUSETS
625 default y 625 default y
626 626
627 config CGROUP_CPUACCT 627 config CGROUP_CPUACCT
628 bool "Simple CPU accounting cgroup subsystem" 628 bool "Simple CPU accounting cgroup subsystem"
629 help 629 help
630 Provides a simple Resource Controller for monitoring the 630 Provides a simple Resource Controller for monitoring the
631 total CPU consumed by the tasks in a cgroup. 631 total CPU consumed by the tasks in a cgroup.
632 632
633 config RESOURCE_COUNTERS 633 config RESOURCE_COUNTERS
634 bool "Resource counters" 634 bool "Resource counters"
635 help 635 help
636 This option enables controller independent resource accounting 636 This option enables controller independent resource accounting
637 infrastructure that works with cgroups. 637 infrastructure that works with cgroups.
638 638
639 config CGROUP_MEM_RES_CTLR 639 config CGROUP_MEM_RES_CTLR
640 bool "Memory Resource Controller for Control Groups" 640 bool "Memory Resource Controller for Control Groups"
641 depends on RESOURCE_COUNTERS 641 depends on RESOURCE_COUNTERS
642 select MM_OWNER 642 select MM_OWNER
643 help 643 help
644 Provides a memory resource controller that manages both anonymous 644 Provides a memory resource controller that manages both anonymous
645 memory and page cache. (See Documentation/cgroups/memory.txt) 645 memory and page cache. (See Documentation/cgroups/memory.txt)
646 646
647 Note that setting this option increases fixed memory overhead 647 Note that setting this option increases fixed memory overhead
648 associated with each page of memory in the system. By this, 648 associated with each page of memory in the system. By this,
649 20(40)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory 649 20(40)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory
650 usage tracking struct at boot. Total amount of this is printed out 650 usage tracking struct at boot. Total amount of this is printed out
651 at boot. 651 at boot.
652 652
653 Only enable when you're ok with these trade offs and really 653 Only enable when you're ok with these trade offs and really
654 sure you need the memory resource controller. Even when you enable 654 sure you need the memory resource controller. Even when you enable
655 this, you can set "cgroup_disable=memory" at your boot option to 655 this, you can set "cgroup_disable=memory" at your boot option to
656 disable memory resource controller and you can avoid overheads. 656 disable memory resource controller and you can avoid overheads.
657 (and lose benefits of memory resource controller) 657 (and lose benefits of memory resource controller)
658 658
659 This config option also selects MM_OWNER config option, which 659 This config option also selects MM_OWNER config option, which
660 could in turn add some fork/exit overhead. 660 could in turn add some fork/exit overhead.
661 661
662 config CGROUP_MEM_RES_CTLR_SWAP 662 config CGROUP_MEM_RES_CTLR_SWAP
663 bool "Memory Resource Controller Swap Extension" 663 bool "Memory Resource Controller Swap Extension"
664 depends on CGROUP_MEM_RES_CTLR && SWAP 664 depends on CGROUP_MEM_RES_CTLR && SWAP
665 help 665 help
666 Add swap management feature to memory resource controller. When you 666 Add swap management feature to memory resource controller. When you
667 enable this, you can limit mem+swap usage per cgroup. In other words, 667 enable this, you can limit mem+swap usage per cgroup. In other words,
668 when you disable this, memory resource controller has no cares to 668 when you disable this, memory resource controller has no cares to
669 usage of swap...a process can exhaust all of the swap. This extension 669 usage of swap...a process can exhaust all of the swap. This extension
670 is useful when you want to avoid exhaustion swap but this itself 670 is useful when you want to avoid exhaustion swap but this itself
671 adds more overheads and consumes memory for remembering information. 671 adds more overheads and consumes memory for remembering information.
672 Especially if you use 32bit system or small memory system, please 672 Especially if you use 32bit system or small memory system, please
673 be careful about enabling this. When memory resource controller 673 be careful about enabling this. When memory resource controller
674 is disabled by boot option, this will be automatically disabled and 674 is disabled by boot option, this will be automatically disabled and
675 there will be no overhead from this. Even when you set this config=y, 675 there will be no overhead from this. Even when you set this config=y,
676 if boot option "noswapaccount" is set, swap will not be accounted. 676 if boot option "noswapaccount" is set, swap will not be accounted.
677 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page 677 Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
678 size is 4096bytes, 512k per 1Gbytes of swap. 678 size is 4096bytes, 512k per 1Gbytes of swap.
679 config CGROUP_MEM_RES_CTLR_SWAP_ENABLED 679 config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
680 bool "Memory Resource Controller Swap Extension enabled by default" 680 bool "Memory Resource Controller Swap Extension enabled by default"
681 depends on CGROUP_MEM_RES_CTLR_SWAP 681 depends on CGROUP_MEM_RES_CTLR_SWAP
682 default y 682 default y
683 help 683 help
684 Memory Resource Controller Swap Extension comes with its price in 684 Memory Resource Controller Swap Extension comes with its price in
685 a bigger memory consumption. General purpose distribution kernels 685 a bigger memory consumption. General purpose distribution kernels
686 which want to enable the feature but keep it disabled by default 686 which want to enable the feature but keep it disabled by default
687 and let the user enable it by swapaccount boot command line 687 and let the user enable it by swapaccount boot command line
688 parameter should have this option unselected. 688 parameter should have this option unselected.
689 For those who want to have the feature enabled by default should 689 For those who want to have the feature enabled by default should
690 select this option (if, for some reason, they need to disable it 690 select this option (if, for some reason, they need to disable it
691 then noswapaccount does the trick). 691 then noswapaccount does the trick).
692 692
693 config CGROUP_PERF 693 config CGROUP_PERF
694 bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" 694 bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
695 depends on PERF_EVENTS && CGROUPS 695 depends on PERF_EVENTS && CGROUPS
696 help 696 help
697 This option extends the per-cpu mode to restrict monitoring to 697 This option extends the per-cpu mode to restrict monitoring to
698 threads which belong to the cgroup specified and run on the 698 threads which belong to the cgroup specified and run on the
699 designated cpu. 699 designated cpu.
700 700
701 Say N if unsure. 701 Say N if unsure.
702 702
703 menuconfig CGROUP_SCHED 703 menuconfig CGROUP_SCHED
704 bool "Group CPU scheduler" 704 bool "Group CPU scheduler"
705 depends on EXPERIMENTAL 705 depends on EXPERIMENTAL
706 default n 706 default n
707 help 707 help
708 This feature lets CPU scheduler recognize task groups and control CPU 708 This feature lets CPU scheduler recognize task groups and control CPU
709 bandwidth allocation to such task groups. It uses cgroups to group 709 bandwidth allocation to such task groups. It uses cgroups to group
710 tasks. 710 tasks.
711 711
712 if CGROUP_SCHED 712 if CGROUP_SCHED
713 config FAIR_GROUP_SCHED 713 config FAIR_GROUP_SCHED
714 bool "Group scheduling for SCHED_OTHER" 714 bool "Group scheduling for SCHED_OTHER"
715 depends on CGROUP_SCHED 715 depends on CGROUP_SCHED
716 default CGROUP_SCHED 716 default CGROUP_SCHED
717 717
718 config RT_GROUP_SCHED 718 config RT_GROUP_SCHED
719 bool "Group scheduling for SCHED_RR/FIFO" 719 bool "Group scheduling for SCHED_RR/FIFO"
720 depends on EXPERIMENTAL 720 depends on EXPERIMENTAL
721 depends on CGROUP_SCHED 721 depends on CGROUP_SCHED
722 default n 722 default n
723 help 723 help
724 This feature lets you explicitly allocate real CPU bandwidth 724 This feature lets you explicitly allocate real CPU bandwidth
725 to task groups. If enabled, it will also make it impossible to 725 to task groups. If enabled, it will also make it impossible to
726 schedule realtime tasks for non-root users until you allocate 726 schedule realtime tasks for non-root users until you allocate
727 realtime bandwidth for them. 727 realtime bandwidth for them.
728 See Documentation/scheduler/sched-rt-group.txt for more information. 728 See Documentation/scheduler/sched-rt-group.txt for more information.
729 729
730 endif #CGROUP_SCHED 730 endif #CGROUP_SCHED
731 731
732 config BLK_CGROUP 732 config BLK_CGROUP
733 tristate "Block IO controller" 733 tristate "Block IO controller"
734 depends on BLOCK 734 depends on BLOCK
735 default n 735 default n
736 ---help--- 736 ---help---
737 Generic block IO controller cgroup interface. This is the common 737 Generic block IO controller cgroup interface. This is the common
738 cgroup interface which should be used by various IO controlling 738 cgroup interface which should be used by various IO controlling
739 policies. 739 policies.
740 740
741 Currently, CFQ IO scheduler uses it to recognize task groups and 741 Currently, CFQ IO scheduler uses it to recognize task groups and
742 control disk bandwidth allocation (proportional time slice allocation) 742 control disk bandwidth allocation (proportional time slice allocation)
743 to such task groups. It is also used by bio throttling logic in 743 to such task groups. It is also used by bio throttling logic in
744 block layer to implement upper limit in IO rates on a device. 744 block layer to implement upper limit in IO rates on a device.
745 745
746 This option only enables generic Block IO controller infrastructure. 746 This option only enables generic Block IO controller infrastructure.
747 One needs to also enable actual IO controlling logic/policy. For 747 One needs to also enable actual IO controlling logic/policy. For
748 enabling proportional weight division of disk bandwidth in CFQ, set 748 enabling proportional weight division of disk bandwidth in CFQ, set
749 CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set 749 CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
750 CONFIG_BLK_DEV_THROTTLING=y. 750 CONFIG_BLK_DEV_THROTTLING=y.
751 751
752 See Documentation/cgroups/blkio-controller.txt for more information. 752 See Documentation/cgroups/blkio-controller.txt for more information.
753 753
754 config DEBUG_BLK_CGROUP 754 config DEBUG_BLK_CGROUP
755 bool "Enable Block IO controller debugging" 755 bool "Enable Block IO controller debugging"
756 depends on BLK_CGROUP 756 depends on BLK_CGROUP
757 default n 757 default n
758 ---help--- 758 ---help---
759 Enable some debugging help. Currently it exports additional stat 759 Enable some debugging help. Currently it exports additional stat
760 files in a cgroup which can be useful for debugging. 760 files in a cgroup which can be useful for debugging.
761 761
762 endif # CGROUPS 762 endif # CGROUPS
763 763
764 menuconfig NAMESPACES 764 menuconfig NAMESPACES
765 bool "Namespaces support" if EXPERT 765 bool "Namespaces support" if EXPERT
766 default !EXPERT 766 default !EXPERT
767 help 767 help
768 Provides the way to make tasks work with different objects using 768 Provides the way to make tasks work with different objects using
769 the same id. For example same IPC id may refer to different objects 769 the same id. For example same IPC id may refer to different objects
770 or same user id or pid may refer to different tasks when used in 770 or same user id or pid may refer to different tasks when used in
771 different namespaces. 771 different namespaces.
772 772
773 if NAMESPACES 773 if NAMESPACES
774 774
775 config UTS_NS 775 config UTS_NS
776 bool "UTS namespace" 776 bool "UTS namespace"
777 default y 777 default y
778 help 778 help
779 In this namespace tasks see different info provided with the 779 In this namespace tasks see different info provided with the
780 uname() system call 780 uname() system call
781 781
782 config IPC_NS 782 config IPC_NS
783 bool "IPC namespace" 783 bool "IPC namespace"
784 depends on (SYSVIPC || POSIX_MQUEUE) 784 depends on (SYSVIPC || POSIX_MQUEUE)
785 default y 785 default y
786 help 786 help
787 In this namespace tasks work with IPC ids which correspond to 787 In this namespace tasks work with IPC ids which correspond to
788 different IPC objects in different namespaces. 788 different IPC objects in different namespaces.
789 789
790 config USER_NS 790 config USER_NS
791 bool "User namespace (EXPERIMENTAL)" 791 bool "User namespace (EXPERIMENTAL)"
792 depends on EXPERIMENTAL 792 depends on EXPERIMENTAL
793 default y 793 default y
794 help 794 help
795 This allows containers, i.e. vservers, to use user namespaces 795 This allows containers, i.e. vservers, to use user namespaces
796 to provide different user info for different servers. 796 to provide different user info for different servers.
797 If unsure, say N. 797 If unsure, say N.
798 798
799 config PID_NS 799 config PID_NS
800 bool "PID Namespaces" 800 bool "PID Namespaces"
801 default y 801 default y
802 help 802 help
803 Support process id namespaces. This allows having multiple 803 Support process id namespaces. This allows having multiple
804 processes with the same pid as long as they are in different 804 processes with the same pid as long as they are in different
805 pid namespaces. This is a building block of containers. 805 pid namespaces. This is a building block of containers.
806 806
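A rough illustration of the "same pid in different namespaces" point above (a sketch, not part of this patch; needs CAP_SYS_ADMIN): the first process cloned into a new PID namespace sees itself as pid 1.

/* Hypothetical sketch: clone() with CLONE_NEWPID. */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static char child_stack[64 * 1024];

static int child(void *arg)
{
	/* getpid() reports the pid inside the new namespace */
	printf("child sees pid %d\n", (int)getpid());	/* prints 1 */
	return 0;
}

int main(void)
{
	pid_t pid = clone(child, child_stack + sizeof(child_stack),
			  CLONE_NEWPID | SIGCHLD, NULL);
	if (pid == -1) {
		perror("clone");
		return 1;
	}
	printf("parent sees child as pid %d\n", (int)pid);
	waitpid(pid, NULL, 0);
	return 0;
}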
807 config NET_NS 807 config NET_NS
808 bool "Network namespace" 808 bool "Network namespace"
809 depends on NET 809 depends on NET
810 default y 810 default y
811 help 811 help
812 Allow user space to create what appear to be multiple instances 812 Allow user space to create what appear to be multiple instances
813 of the network stack. 813 of the network stack.
814 814
815 endif # NAMESPACES 815 endif # NAMESPACES
816 816
817 config SCHED_AUTOGROUP 817 config SCHED_AUTOGROUP
818 bool "Automatic process group scheduling" 818 bool "Automatic process group scheduling"
819 select EVENTFD 819 select EVENTFD
820 select CGROUPS 820 select CGROUPS
821 select CGROUP_SCHED 821 select CGROUP_SCHED
822 select FAIR_GROUP_SCHED 822 select FAIR_GROUP_SCHED
823 help 823 help
824 This option optimizes the scheduler for common desktop workloads by 824 This option optimizes the scheduler for common desktop workloads by
825 automatically creating and populating task groups. This separation 825 automatically creating and populating task groups. This separation
826 of workloads isolates aggressive CPU burners (like build jobs) from 826 of workloads isolates aggressive CPU burners (like build jobs) from
827 desktop applications. Task group autogeneration is currently based 827 desktop applications. Task group autogeneration is currently based
828 upon task session. 828 upon task session.
829 829
830 config SCHED_TTWU_QUEUE
831 bool
832 depends on !SPARC32
833 default y
834
835 config MM_OWNER 830 config MM_OWNER
836 bool 831 bool
837 832
838 config SYSFS_DEPRECATED 833 config SYSFS_DEPRECATED
839 bool "Enable deprecated sysfs features to support old userspace tools" 834 bool "Enable deprecated sysfs features to support old userspace tools"
840 depends on SYSFS 835 depends on SYSFS
841 default n 836 default n
842 help 837 help
843 This option adds code that switches the layout of the "block" class 838 This option adds code that switches the layout of the "block" class
844 devices so that they do not show up in /sys/class/block/, but only in 839 devices so that they do not show up in /sys/class/block/, but only in
845 /sys/block/. 840 /sys/block/.
846 841
847 This switch is only active when the sysfs.deprecated=1 boot option is 842 This switch is only active when the sysfs.deprecated=1 boot option is
848 passed or the SYSFS_DEPRECATED_V2 option is set. 843 passed or the SYSFS_DEPRECATED_V2 option is set.
849 844
850 This option allows new kernels to run on old distributions and tools, 845 This option allows new kernels to run on old distributions and tools,
851 which might get confused by /sys/class/block/. Since 2007/2008 all 846 which might get confused by /sys/class/block/. Since 2007/2008 all
852 major distributions and tools handle this just fine. 847 major distributions and tools handle this just fine.
853 848
854 Recent distributions and userspace tools after 2009/2010 depend on 849 Recent distributions and userspace tools after 2009/2010 depend on
855 the existence of /sys/class/block/, and will not work with this 850 the existence of /sys/class/block/, and will not work with this
856 option enabled. 851 option enabled.
857 852
858 Only if you are using a new kernel on an old distribution, you might 853 Only if you are using a new kernel on an old distribution, you might
859 need to say Y here. 854 need to say Y here.
860 855
861 config SYSFS_DEPRECATED_V2 856 config SYSFS_DEPRECATED_V2
862 bool "Enable deprecated sysfs features by default" 857 bool "Enable deprecated sysfs features by default"
863 default n 858 default n
864 depends on SYSFS 859 depends on SYSFS
865 depends on SYSFS_DEPRECATED 860 depends on SYSFS_DEPRECATED
866 help 861 help
867 Enable deprecated sysfs by default. 862 Enable deprecated sysfs by default.
868 863
869 See the CONFIG_SYSFS_DEPRECATED option for more details about this 864 See the CONFIG_SYSFS_DEPRECATED option for more details about this
870 option. 865 option.
871 866
872 Only if you are using a new kernel on an old distribution, you might 867 Only if you are using a new kernel on an old distribution, you might
873 need to say Y here. Even then, odds are you would not need it 868 need to say Y here. Even then, odds are you would not need it
874 enabled; you can always pass the boot option if absolutely necessary. 869 enabled; you can always pass the boot option if absolutely necessary.
875 870
876 config RELAY 871 config RELAY
877 bool "Kernel->user space relay support (formerly relayfs)" 872 bool "Kernel->user space relay support (formerly relayfs)"
878 help 873 help
879 This option enables support for the relay interface in 874 This option enables support for the relay interface in
880 certain file systems (such as debugfs). 875 certain file systems (such as debugfs).
881 It is designed to provide an efficient mechanism for tools and 876 It is designed to provide an efficient mechanism for tools and
882 facilities to relay large amounts of data from kernel space to 877 facilities to relay large amounts of data from kernel space to
883 user space. 878 user space.
884 879
885 If unsure, say N. 880 If unsure, say N.
886 881
887 config BLK_DEV_INITRD 882 config BLK_DEV_INITRD
888 bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support" 883 bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support"
889 depends on BROKEN || !FRV 884 depends on BROKEN || !FRV
890 help 885 help
891 The initial RAM filesystem is a ramfs which is loaded by the 886 The initial RAM filesystem is a ramfs which is loaded by the
892 boot loader (loadlin or lilo) and that is mounted as root 887 boot loader (loadlin or lilo) and that is mounted as root
893 before the normal boot procedure. It is typically used to 888 before the normal boot procedure. It is typically used to
894 load modules needed to mount the "real" root file system, 889 load modules needed to mount the "real" root file system,
895 etc. See <file:Documentation/initrd.txt> for details. 890 etc. See <file:Documentation/initrd.txt> for details.
896 891
897 If RAM disk support (BLK_DEV_RAM) is also included, this 892 If RAM disk support (BLK_DEV_RAM) is also included, this
898 also enables initial RAM disk (initrd) support and adds 893 also enables initial RAM disk (initrd) support and adds
899 15 Kbytes (more on some other architectures) to the kernel size. 894 15 Kbytes (more on some other architectures) to the kernel size.
900 895
901 If unsure say Y. 896 If unsure say Y.
902 897
903 if BLK_DEV_INITRD 898 if BLK_DEV_INITRD
904 899
905 source "usr/Kconfig" 900 source "usr/Kconfig"
906 901
907 endif 902 endif
908 903
909 config CC_OPTIMIZE_FOR_SIZE 904 config CC_OPTIMIZE_FOR_SIZE
910 bool "Optimize for size" 905 bool "Optimize for size"
911 default y 906 default y
912 help 907 help
913 Enabling this option will pass "-Os" instead of "-O2" to gcc 908 Enabling this option will pass "-Os" instead of "-O2" to gcc
914 resulting in a smaller kernel. 909 resulting in a smaller kernel.
915 910
916 If unsure, say Y. 911 If unsure, say Y.
917 912
918 config SYSCTL 913 config SYSCTL
919 bool 914 bool
920 915
921 config ANON_INODES 916 config ANON_INODES
922 bool 917 bool
923 918
924 menuconfig EXPERT 919 menuconfig EXPERT
925 bool "Configure standard kernel features (expert users)" 920 bool "Configure standard kernel features (expert users)"
926 help 921 help
927 This option allows certain base kernel options and settings 922 This option allows certain base kernel options and settings
928 to be disabled or tweaked. This is for specialized 923 to be disabled or tweaked. This is for specialized
929 environments which can tolerate a "non-standard" kernel. 924 environments which can tolerate a "non-standard" kernel.
930 Only use this if you really know what you are doing. 925 Only use this if you really know what you are doing.
931 926
932 config EMBEDDED 927 config EMBEDDED
933 bool "Embedded system" 928 bool "Embedded system"
934 select EXPERT 929 select EXPERT
935 help 930 help
936 This option should be enabled if compiling the kernel for 931 This option should be enabled if compiling the kernel for
937 an embedded system so certain expert options are available 932 an embedded system so certain expert options are available
938 for configuration. 933 for configuration.
939 934
940 config UID16 935 config UID16
941 bool "Enable 16-bit UID system calls" if EXPERT 936 bool "Enable 16-bit UID system calls" if EXPERT
942 depends on ARM || BLACKFIN || CRIS || FRV || H8300 || X86_32 || M68K || (S390 && !64BIT) || SUPERH || SPARC32 || (SPARC64 && COMPAT) || UML || (X86_64 && IA32_EMULATION) 937 depends on ARM || BLACKFIN || CRIS || FRV || H8300 || X86_32 || M68K || (S390 && !64BIT) || SUPERH || SPARC32 || (SPARC64 && COMPAT) || UML || (X86_64 && IA32_EMULATION)
943 default y 938 default y
944 help 939 help
945 This enables the legacy 16-bit UID syscall wrappers. 940 This enables the legacy 16-bit UID syscall wrappers.
946 941
947 config SYSCTL_SYSCALL 942 config SYSCTL_SYSCALL
948 bool "Sysctl syscall support" if EXPERT 943 bool "Sysctl syscall support" if EXPERT
949 depends on PROC_SYSCTL 944 depends on PROC_SYSCTL
950 default y 945 default y
951 select SYSCTL 946 select SYSCTL
952 ---help--- 947 ---help---
953 sys_sysctl uses binary paths that have been found challenging 948 sys_sysctl uses binary paths that have been found challenging
954 to properly maintain and use. The interface in /proc/sys 949 to properly maintain and use. The interface in /proc/sys
955 using paths with ASCII names is now the primary path to this 950 using paths with ASCII names is now the primary path to this
956 information. 951 information.
957 952
958 Almost nothing uses the binary sysctl interface, so if you are 953 Almost nothing uses the binary sysctl interface, so if you are
959 trying to save some space it is probably safe to disable this, 954 trying to save some space it is probably safe to disable this,
960 making your kernel marginally smaller. 955 making your kernel marginally smaller.
961 956
962 If unsure say Y here. 957 If unsure say Y here.
963 958
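For reference, a small sketch of the /proc/sys path the help text points to (not part of this change; kernel.ostype is just a convenient read-only entry):

/* Reading a sysctl the recommended way: through /proc/sys, not sys_sysctl(). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[128];
	ssize_t n;
	int fd = open("/proc/sys/kernel/ostype", O_RDONLY);

	if (fd == -1) {
		perror("open");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("kernel.ostype = %s", buf);	/* e.g. "Linux\n" */
	}
	close(fd);
	return 0;
}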
964 config KALLSYMS 959 config KALLSYMS
965 bool "Load all symbols for debugging/ksymoops" if EXPERT 960 bool "Load all symbols for debugging/ksymoops" if EXPERT
966 default y 961 default y
967 help 962 help
968 Say Y here to let the kernel print out symbolic crash information and 963 Say Y here to let the kernel print out symbolic crash information and
969 symbolic stack backtraces. This increases the size of the kernel 964 symbolic stack backtraces. This increases the size of the kernel
970 somewhat, as all symbols have to be loaded into the kernel image. 965 somewhat, as all symbols have to be loaded into the kernel image.
971 966
972 config KALLSYMS_ALL 967 config KALLSYMS_ALL
973 bool "Include all symbols in kallsyms" 968 bool "Include all symbols in kallsyms"
974 depends on DEBUG_KERNEL && KALLSYMS 969 depends on DEBUG_KERNEL && KALLSYMS
975 help 970 help
976 Normally kallsyms only contains the symbols of functions, for nicer 971 Normally kallsyms only contains the symbols of functions, for nicer
977 OOPS messages. Some debuggers can use kallsyms for other 972 OOPS messages. Some debuggers can use kallsyms for other
978 symbols too: say Y here to include all symbols, if you need them 973 symbols too: say Y here to include all symbols, if you need them
979 and you don't care about adding 300k to the size of your kernel. 974 and you don't care about adding 300k to the size of your kernel.
980 975
981 Say N. 976 Say N.
982 977
983 config KALLSYMS_EXTRA_PASS 978 config KALLSYMS_EXTRA_PASS
984 bool "Do an extra kallsyms pass" 979 bool "Do an extra kallsyms pass"
985 depends on KALLSYMS 980 depends on KALLSYMS
986 help 981 help
987 If kallsyms is not working correctly, the build will fail with 982 If kallsyms is not working correctly, the build will fail with
988 inconsistent kallsyms data. If that occurs, log a bug report and 983 inconsistent kallsyms data. If that occurs, log a bug report and
989 turn on KALLSYMS_EXTRA_PASS which should result in a stable build. 984 turn on KALLSYMS_EXTRA_PASS which should result in a stable build.
990 Always say N here unless you find a bug in kallsyms, which must be 985 Always say N here unless you find a bug in kallsyms, which must be
991 reported. KALLSYMS_EXTRA_PASS is only a temporary workaround while 986 reported. KALLSYMS_EXTRA_PASS is only a temporary workaround while
992 you wait for kallsyms to be fixed. 987 you wait for kallsyms to be fixed.
993 988
994 989
995 config HOTPLUG 990 config HOTPLUG
996 bool "Support for hot-pluggable devices" if EXPERT 991 bool "Support for hot-pluggable devices" if EXPERT
997 default y 992 default y
998 help 993 help
999 This option is provided for the case where no hotplug or uevent 994 This option is provided for the case where no hotplug or uevent
1000 capabilities are wanted by the kernel. You should only consider 995 capabilities are wanted by the kernel. You should only consider
1001 disabling this option for embedded systems that do not use modules, a 996 disabling this option for embedded systems that do not use modules, a
1002 dynamic /dev tree, or dynamic device discovery. Just say Y. 997 dynamic /dev tree, or dynamic device discovery. Just say Y.
1003 998
1004 config PRINTK 999 config PRINTK
1005 default y 1000 default y
1006 bool "Enable support for printk" if EXPERT 1001 bool "Enable support for printk" if EXPERT
1007 help 1002 help
1008 This option enables normal printk support. Removing it 1003 This option enables normal printk support. Removing it
1009 eliminates most of the message strings from the kernel image 1004 eliminates most of the message strings from the kernel image
1010 and makes the kernel more or less silent. As this makes it 1005 and makes the kernel more or less silent. As this makes it
1011 very difficult to diagnose system problems, saying N here is 1006 very difficult to diagnose system problems, saying N here is
1012 strongly discouraged. 1007 strongly discouraged.
1013 1008
1014 config BUG 1009 config BUG
1015 bool "BUG() support" if EXPERT 1010 bool "BUG() support" if EXPERT
1016 default y 1011 default y
1017 help 1012 help
1018 Disabling this option eliminates support for BUG and WARN, reducing 1013 Disabling this option eliminates support for BUG and WARN, reducing
1019 the size of your kernel image and potentially quietly ignoring 1014 the size of your kernel image and potentially quietly ignoring
1020 numerous fatal conditions. You should only consider disabling this 1015 numerous fatal conditions. You should only consider disabling this
1021 option for embedded systems with no facilities for reporting errors. 1016 option for embedded systems with no facilities for reporting errors.
1022 Just say Y. 1017 Just say Y.
1023 1018
1024 config ELF_CORE 1019 config ELF_CORE
1025 default y 1020 default y
1026 bool "Enable ELF core dumps" if EXPERT 1021 bool "Enable ELF core dumps" if EXPERT
1027 help 1022 help
1028 Enable support for generating core dumps. Disabling saves about 4k. 1023 Enable support for generating core dumps. Disabling saves about 4k.
1029 1024
1030 config PCSPKR_PLATFORM 1025 config PCSPKR_PLATFORM
1031 bool "Enable PC-Speaker support" if EXPERT 1026 bool "Enable PC-Speaker support" if EXPERT
1032 depends on ALPHA || X86 || MIPS || PPC_PREP || PPC_CHRP || PPC_PSERIES 1027 depends on ALPHA || X86 || MIPS || PPC_PREP || PPC_CHRP || PPC_PSERIES
1033 default y 1028 default y
1034 help 1029 help
1035 This option allows disabling the internal PC-Speaker 1030 This option allows disabling the internal PC-Speaker
1036 support, saving some memory. 1031 support, saving some memory.
1037 1032
1038 config BASE_FULL 1033 config BASE_FULL
1039 default y 1034 default y
1040 bool "Enable full-sized data structures for core" if EXPERT 1035 bool "Enable full-sized data structures for core" if EXPERT
1041 help 1036 help
1042 Disabling this option reduces the size of miscellaneous core 1037 Disabling this option reduces the size of miscellaneous core
1043 kernel data structures. This saves memory on small machines, 1038 kernel data structures. This saves memory on small machines,
1044 but may reduce performance. 1039 but may reduce performance.
1045 1040
1046 config FUTEX 1041 config FUTEX
1047 bool "Enable futex support" if EXPERT 1042 bool "Enable futex support" if EXPERT
1048 default y 1043 default y
1049 select RT_MUTEXES 1044 select RT_MUTEXES
1050 help 1045 help
1051 Disabling this option will cause the kernel to be built without 1046 Disabling this option will cause the kernel to be built without
1052 support for "fast userspace mutexes". The resulting kernel may not 1047 support for "fast userspace mutexes". The resulting kernel may not
1053 run glibc-based applications correctly. 1048 run glibc-based applications correctly.
1054 1049
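A minimal sketch of the futex(2) syscall this option provides (not part of this change; glibc has no wrapper, so the raw syscall is used, and the wait is made non-blocking by passing a stale expected value):

/* FUTEX_WAIT returns EAGAIN here because the futex word already differs
 * from the expected value, so the call does not sleep. */
#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t futex_word = 1;

int main(void)
{
	long r;

	/* Expecting 0, but the word is 1 -> kernel returns EAGAIN. */
	r = syscall(SYS_futex, &futex_word, FUTEX_WAIT, 0, NULL, NULL, 0);
	if (r == -1)
		printf("FUTEX_WAIT: %s\n", strerror(errno));

	/* Wake up to one waiter (there are none, so this returns 0). */
	r = syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
	printf("FUTEX_WAKE woke %ld waiter(s)\n", r);
	return 0;
}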
1055 config EPOLL 1050 config EPOLL
1056 bool "Enable eventpoll support" if EXPERT 1051 bool "Enable eventpoll support" if EXPERT
1057 default y 1052 default y
1058 select ANON_INODES 1053 select ANON_INODES
1059 help 1054 help
1060 Disabling this option will cause the kernel to be built without 1055 Disabling this option will cause the kernel to be built without
1061 support for the epoll family of system calls. 1056 support for the epoll family of system calls.
1062 1057
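For reference, a small sketch of the epoll interface (not part of this change):

/* Wait for stdin to become readable using epoll. */
#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = STDIN_FILENO };
	struct epoll_event out;
	int epfd = epoll_create1(0);

	if (epfd == -1 ||
	    epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) == -1) {
		perror("epoll setup");
		return 1;
	}
	/* Block until stdin has data (or an error/EOF condition). */
	if (epoll_wait(epfd, &out, 1, -1) == 1)
		printf("fd %d is readable\n", out.data.fd);
	close(epfd);
	return 0;
}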
1063 config SIGNALFD 1058 config SIGNALFD
1064 bool "Enable signalfd() system call" if EXPERT 1059 bool "Enable signalfd() system call" if EXPERT
1065 select ANON_INODES 1060 select ANON_INODES
1066 default y 1061 default y
1067 help 1062 help
1068 Enable the signalfd() system call, which allows signals to be received 1063 Enable the signalfd() system call, which allows signals to be received
1069 on a file descriptor. 1064 on a file descriptor.
1070 1065
1071 If unsure, say Y. 1066 If unsure, say Y.
1072 1067
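A short sketch of the signalfd() usage pattern described above (not part of this change; the signal must be blocked before it can be read from the descriptor):

/* Receive SIGINT as data on a file descriptor instead of via a handler. */
#include <signal.h>
#include <stdio.h>
#include <sys/signalfd.h>
#include <unistd.h>

int main(void)
{
	sigset_t mask;
	struct signalfd_siginfo si;
	int sfd;

	sigemptyset(&mask);
	sigaddset(&mask, SIGINT);
	sigprocmask(SIG_BLOCK, &mask, NULL);	/* block normal delivery first */

	sfd = signalfd(-1, &mask, 0);
	if (sfd == -1) {
		perror("signalfd");
		return 1;
	}
	printf("press Ctrl-C...\n");
	if (read(sfd, &si, sizeof(si)) == sizeof(si))
		printf("got signal %u\n", si.ssi_signo);
	close(sfd);
	return 0;
}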
1073 config TIMERFD 1068 config TIMERFD
1074 bool "Enable timerfd() system call" if EXPERT 1069 bool "Enable timerfd() system call" if EXPERT
1075 select ANON_INODES 1070 select ANON_INODES
1076 default y 1071 default y
1077 help 1072 help
1078 Enable the timerfd() system call, which allows timer 1073 Enable the timerfd() system call, which allows timer
1079 events to be received on a file descriptor. 1074 events to be received on a file descriptor.
1080 1075
1081 If unsure, say Y. 1076 If unsure, say Y.
1082 1077
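A short sketch of the timerfd interface (not part of this change): expirations arrive as an 8-byte count read from the descriptor.

/* One-shot 1-second timer delivered through a file descriptor. */
#include <stdint.h>
#include <stdio.h>
#include <sys/timerfd.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct itimerspec its = {
		.it_value    = { .tv_sec = 1, .tv_nsec = 0 },	/* first expiry */
		.it_interval = { .tv_sec = 0, .tv_nsec = 0 },	/* one-shot */
	};
	uint64_t expirations;
	int tfd = timerfd_create(CLOCK_MONOTONIC, 0);

	if (tfd == -1 || timerfd_settime(tfd, 0, &its, NULL) == -1) {
		perror("timerfd");
		return 1;
	}
	if (read(tfd, &expirations, sizeof(expirations)) == sizeof(expirations))
		printf("timer fired %llu time(s)\n",
		       (unsigned long long)expirations);
	close(tfd);
	return 0;
}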
1083 config EVENTFD 1078 config EVENTFD
1084 bool "Enable eventfd() system call" if EXPERT 1079 bool "Enable eventfd() system call" if EXPERT
1085 select ANON_INODES 1080 select ANON_INODES
1086 default y 1081 default y
1087 help 1082 help
1088 Enable the eventfd() system call, which allows both kernel 1083 Enable the eventfd() system call, which allows both kernel
1089 notifications (e.g. KAIO) and userspace notifications to be received. 1084 notifications (e.g. KAIO) and userspace notifications to be received.
1090 1085
1091 If unsure, say Y. 1086 If unsure, say Y.
1092 1087
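For reference, a minimal sketch of the eventfd counter semantics (not part of this change):

/* eventfd keeps an 8-byte kernel-side counter: writes add to it,
 * a read returns the accumulated value and resets it. */
#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	uint64_t value;
	int efd = eventfd(0, 0);		/* counter starts at 0 */

	if (efd == -1) {
		perror("eventfd");
		return 1;
	}
	value = 3;
	write(efd, &value, sizeof(value));	/* add 3 to the counter */
	value = 2;
	write(efd, &value, sizeof(value));	/* add 2 more */

	read(efd, &value, sizeof(value));	/* reads 5 and resets to 0 */
	printf("eventfd counter read back: %llu\n", (unsigned long long)value);
	close(efd);
	return 0;
}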
1093 config SHMEM 1088 config SHMEM
1094 bool "Use full shmem filesystem" if EXPERT 1089 bool "Use full shmem filesystem" if EXPERT
1095 default y 1090 default y
1096 depends on MMU 1091 depends on MMU
1097 help 1092 help
1098 The shmem is an internal filesystem used to manage shared memory. 1093 The shmem is an internal filesystem used to manage shared memory.
1099 It is backed by swap and manages resource limits. It is also exported 1094 It is backed by swap and manages resource limits. It is also exported
1100 to userspace as tmpfs if TMPFS is enabled. Disabling this 1095 to userspace as tmpfs if TMPFS is enabled. Disabling this
1101 option replaces shmem and tmpfs with the much simpler ramfs code, 1096 option replaces shmem and tmpfs with the much simpler ramfs code,
1102 which may be appropriate on small systems without swap. 1097 which may be appropriate on small systems without swap.
1103 1098
1104 config AIO 1099 config AIO
1105 bool "Enable AIO support" if EXPERT 1100 bool "Enable AIO support" if EXPERT
1106 default y 1101 default y
1107 help 1102 help
1108 This option enables POSIX asynchronous I/O which may be used 1103 This option enables POSIX asynchronous I/O which may be used
1109 by some high performance threaded applications. Disabling 1104 by some high performance threaded applications. Disabling
1110 this option saves about 7k. 1105 this option saves about 7k.
1111 1106
1112 config HAVE_PERF_EVENTS 1107 config HAVE_PERF_EVENTS
1113 bool 1108 bool
1114 help 1109 help
1115 See tools/perf/design.txt for details. 1110 See tools/perf/design.txt for details.
1116 1111
1117 config PERF_USE_VMALLOC 1112 config PERF_USE_VMALLOC
1118 bool 1113 bool
1119 help 1114 help
1120 See tools/perf/design.txt for details 1115 See tools/perf/design.txt for details
1121 1116
1122 menu "Kernel Performance Events And Counters" 1117 menu "Kernel Performance Events And Counters"
1123 1118
1124 config PERF_EVENTS 1119 config PERF_EVENTS
1125 bool "Kernel performance events and counters" 1120 bool "Kernel performance events and counters"
1126 default y if (PROFILING || PERF_COUNTERS) 1121 default y if (PROFILING || PERF_COUNTERS)
1127 depends on HAVE_PERF_EVENTS 1122 depends on HAVE_PERF_EVENTS
1128 select ANON_INODES 1123 select ANON_INODES
1129 select IRQ_WORK 1124 select IRQ_WORK
1130 help 1125 help
1131 Enable kernel support for various performance events provided 1126 Enable kernel support for various performance events provided
1132 by software and hardware. 1127 by software and hardware.
1133 1128
1134 Software events are supported either built-in or via the 1129 Software events are supported either built-in or via the
1135 use of generic tracepoints. 1130 use of generic tracepoints.
1136 1131
1137 Most modern CPUs support performance events via performance 1132 Most modern CPUs support performance events via performance
1138 counter registers. These registers count the number of certain 1133 counter registers. These registers count the number of certain
1139 types of hw events, such as instructions executed, cache misses 1134 types of hw events, such as instructions executed, cache misses
1140 suffered, or branches mis-predicted - without slowing down the 1135 suffered, or branches mis-predicted - without slowing down the
1141 kernel or applications. These registers can also trigger interrupts 1136 kernel or applications. These registers can also trigger interrupts
1142 when a threshold number of events have passed - and can thus be 1137 when a threshold number of events have passed - and can thus be
1143 used to profile the code that runs on that CPU. 1138 used to profile the code that runs on that CPU.
1144 1139
1145 The Linux Performance Event subsystem provides an abstraction of 1140 The Linux Performance Event subsystem provides an abstraction of
1146 these software and hardware event capabilities, available via a 1141 these software and hardware event capabilities, available via a
1147 system call and used by the "perf" utility in tools/perf/. It 1142 system call and used by the "perf" utility in tools/perf/. It
1148 provides per task and per CPU counters, and it provides event 1143 provides per task and per CPU counters, and it provides event
1149 capabilities on top of those. 1144 capabilities on top of those.
1150 1145
1151 Say Y if unsure. 1146 Say Y if unsure.
1152 1147
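A rough sketch of the system call interface mentioned above (not part of this change; perf_event_open() has no glibc wrapper, so the raw syscall is used, and the hardware counter may be unavailable in VMs):

/* Count instructions retired by the current task over a short busy loop. */
#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	volatile long sink = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd == -1) {
		perror("perf_event_open");
		return 1;
	}
	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	for (long i = 0; i < 1000000; i++)
		sink += i;
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions: %lld\n", count);
	close(fd);
	return 0;
}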
1153 config PERF_COUNTERS 1148 config PERF_COUNTERS
1154 bool "Kernel performance counters (old config option)" 1149 bool "Kernel performance counters (old config option)"
1155 depends on HAVE_PERF_EVENTS 1150 depends on HAVE_PERF_EVENTS
1156 help 1151 help
1157 This config has been obsoleted by the PERF_EVENTS 1152 This config has been obsoleted by the PERF_EVENTS
1158 config option - please see that one for details. 1153 config option - please see that one for details.
1159 1154
1160 It has no effect on the kernel whether you enable 1155 It has no effect on the kernel whether you enable
1161 it or not, it is a compatibility placeholder. 1156 it or not, it is a compatibility placeholder.
1162 1157
1163 Say N if unsure. 1158 Say N if unsure.
1164 1159
1165 config DEBUG_PERF_USE_VMALLOC 1160 config DEBUG_PERF_USE_VMALLOC
1166 default n 1161 default n
1167 bool "Debug: use vmalloc to back perf mmap() buffers" 1162 bool "Debug: use vmalloc to back perf mmap() buffers"
1168 depends on PERF_EVENTS && DEBUG_KERNEL 1163 depends on PERF_EVENTS && DEBUG_KERNEL
1169 select PERF_USE_VMALLOC 1164 select PERF_USE_VMALLOC
1170 help 1165 help
1171 Use vmalloc memory to back perf mmap() buffers. 1166 Use vmalloc memory to back perf mmap() buffers.
1172 1167
1173 Mostly useful for debugging the vmalloc code on platforms 1168 Mostly useful for debugging the vmalloc code on platforms
1174 that don't require it. 1169 that don't require it.
1175 1170
1176 Say N if unsure. 1171 Say N if unsure.
1177 1172
1178 endmenu 1173 endmenu
1179 1174
1180 config VM_EVENT_COUNTERS 1175 config VM_EVENT_COUNTERS
1181 default y 1176 default y
1182 bool "Enable VM event counters for /proc/vmstat" if EXPERT 1177 bool "Enable VM event counters for /proc/vmstat" if EXPERT
1183 help 1178 help
1184 VM event counters are needed for event counts to be shown. 1179 VM event counters are needed for event counts to be shown.
1185 This option allows the disabling of the VM event counters 1180 This option allows the disabling of the VM event counters
1186 on EXPERT systems. /proc/vmstat will only show page counts 1181 on EXPERT systems. /proc/vmstat will only show page counts
1187 if VM event counters are disabled. 1182 if VM event counters are disabled.
1188 1183
1189 config PCI_QUIRKS 1184 config PCI_QUIRKS
1190 default y 1185 default y
1191 bool "Enable PCI quirk workarounds" if EXPERT 1186 bool "Enable PCI quirk workarounds" if EXPERT
1192 depends on PCI 1187 depends on PCI
1193 help 1188 help
1194 This enables workarounds for various PCI chipset 1189 This enables workarounds for various PCI chipset
1195 bugs/quirks. Disable this only if your target machine is 1190 bugs/quirks. Disable this only if your target machine is
1196 unaffected by PCI quirks. 1191 unaffected by PCI quirks.
1197 1192
1198 config SLUB_DEBUG 1193 config SLUB_DEBUG
1199 default y 1194 default y
1200 bool "Enable SLUB debugging support" if EXPERT 1195 bool "Enable SLUB debugging support" if EXPERT
1201 depends on SLUB && SYSFS 1196 depends on SLUB && SYSFS
1202 help 1197 help
1203 SLUB has extensive debug support features. Disabling these can 1198 SLUB has extensive debug support features. Disabling these can
1204 result in significant savings in code size. This also disables 1199 result in significant savings in code size. This also disables
1205 SLUB sysfs support. /sys/slab will not exist and there will be 1200 SLUB sysfs support. /sys/slab will not exist and there will be
1206 no support for cache validation etc. 1201 no support for cache validation etc.
1207 1202
1208 config COMPAT_BRK 1203 config COMPAT_BRK
1209 bool "Disable heap randomization" 1204 bool "Disable heap randomization"
1210 default y 1205 default y
1211 help 1206 help
1212 Randomizing heap placement makes heap exploits harder, but it 1207 Randomizing heap placement makes heap exploits harder, but it
1213 also breaks ancient binaries (including anything libc5 based). 1208 also breaks ancient binaries (including anything libc5 based).
1214 This option changes the bootup default to heap randomization 1209 This option changes the bootup default to heap randomization
1215 disabled, and can be overridden at runtime by setting 1210 disabled, and can be overridden at runtime by setting
1216 /proc/sys/kernel/randomize_va_space to 2. 1211 /proc/sys/kernel/randomize_va_space to 2.
1217 1212
1218 On non-ancient distros (post-2000 ones) N is usually a safe choice. 1213 On non-ancient distros (post-2000 ones) N is usually a safe choice.
1219 1214
1220 choice 1215 choice
1221 prompt "Choose SLAB allocator" 1216 prompt "Choose SLAB allocator"
1222 default SLUB 1217 default SLUB
1223 help 1218 help
1224 This option allows selecting a slab allocator. 1219 This option allows selecting a slab allocator.
1225 1220
1226 config SLAB 1221 config SLAB
1227 bool "SLAB" 1222 bool "SLAB"
1228 help 1223 help
1229 The regular slab allocator that is established and known to work 1224 The regular slab allocator that is established and known to work
1230 well in all environments. It organizes cache hot objects in 1225 well in all environments. It organizes cache hot objects in
1231 per cpu and per node queues. 1226 per cpu and per node queues.
1232 1227
1233 config SLUB 1228 config SLUB
1234 bool "SLUB (Unqueued Allocator)" 1229 bool "SLUB (Unqueued Allocator)"
1235 help 1230 help
1236 SLUB is a slab allocator that minimizes cache line usage 1231 SLUB is a slab allocator that minimizes cache line usage
1237 instead of managing queues of cached objects (SLAB approach). 1232 instead of managing queues of cached objects (SLAB approach).
1238 Per cpu caching is realized using slabs of objects instead 1233 Per cpu caching is realized using slabs of objects instead
1239 of queues of objects. SLUB can use memory efficiently 1234 of queues of objects. SLUB can use memory efficiently
1240 and has enhanced diagnostics. SLUB is the default choice for 1235 and has enhanced diagnostics. SLUB is the default choice for
1241 a slab allocator. 1236 a slab allocator.
1242 1237
1243 config SLOB 1238 config SLOB
1244 depends on EXPERT 1239 depends on EXPERT
1245 bool "SLOB (Simple Allocator)" 1240 bool "SLOB (Simple Allocator)"
1246 help 1241 help
1247 SLOB replaces the stock allocator with a drastically simpler 1242 SLOB replaces the stock allocator with a drastically simpler
1248 allocator. SLOB is generally more space efficient but 1243 allocator. SLOB is generally more space efficient but
1249 does not perform as well on large systems. 1244 does not perform as well on large systems.
1250 1245
1251 endchoice 1246 endchoice
1252 1247
1253 config MMAP_ALLOW_UNINITIALIZED 1248 config MMAP_ALLOW_UNINITIALIZED
1254 bool "Allow mmapped anonymous memory to be uninitialized" 1249 bool "Allow mmapped anonymous memory to be uninitialized"
1255 depends on EXPERT && !MMU 1250 depends on EXPERT && !MMU
1256 default n 1251 default n
1257 help 1252 help
1258 Normally, and according to the Linux spec, anonymous memory obtained 1253 Normally, and according to the Linux spec, anonymous memory obtained
1259 from mmap() has its contents cleared before it is passed to 1254 from mmap() has its contents cleared before it is passed to
1260 userspace. Enabling this config option allows you to request that 1255 userspace. Enabling this config option allows you to request that
1261 mmap() skip that if it is given the MAP_UNINITIALIZED flag, thus 1256 mmap() skip that if it is given the MAP_UNINITIALIZED flag, thus
1262 providing a huge performance boost. If this option is not enabled, 1257 providing a huge performance boost. If this option is not enabled,
1263 then the flag will be ignored. 1258 then the flag will be ignored.
1264 1259
1265 This is taken advantage of by uClibc's malloc(), and also by 1260 This is taken advantage of by uClibc's malloc(), and also by
1266 ELF-FDPIC binfmt's brk and stack allocator. 1261 ELF-FDPIC binfmt's brk and stack allocator.
1267 1262
1268 Because of the obvious security issues, this option should only be 1263 Because of the obvious security issues, this option should only be
1269 enabled on embedded devices where you control what is run in 1264 enabled on embedded devices where you control what is run in
1270 userspace. Since that isn't generally a problem on no-MMU systems, 1265 userspace. Since that isn't generally a problem on no-MMU systems,
1271 it is normally safe to say Y here. 1266 it is normally safe to say Y here.
1272 1267
1273 See Documentation/nommu-mmap.txt for more information. 1268 See Documentation/nommu-mmap.txt for more information.
1274 1269
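For reference, a sketch of how userspace asks for uninitialized anonymous memory (not part of this change; the flag is only honoured on no-MMU kernels built with this option, and the fallback #define value is taken from the kernel's uapi headers):

/* On an ordinary MMU kernel the flag is ignored and pages arrive zeroed. */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0x4000000	/* value from the kernel headers */
#endif

int main(void)
{
	size_t len = 64 * 1024;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Prints 0 on MMU systems; may be stale data on no-MMU with the
	 * option enabled. */
	printf("first byte: %d\n", p[0]);
	munmap(p, len);
	return 0;
}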
1275 config PROFILING 1270 config PROFILING
1276 bool "Profiling support" 1271 bool "Profiling support"
1277 help 1272 help
1278 Say Y here to enable the extended profiling support mechanisms used 1273 Say Y here to enable the extended profiling support mechanisms used
1279 by profilers such as OProfile. 1274 by profilers such as OProfile.
1280 1275
1281 # 1276 #
1282 # Place an empty function call at each tracepoint site. Can be 1277 # Place an empty function call at each tracepoint site. Can be
1283 # dynamically changed for a probe function. 1278 # dynamically changed for a probe function.
1284 # 1279 #
1285 config TRACEPOINTS 1280 config TRACEPOINTS
1286 bool 1281 bool
1287 1282
1288 source "arch/Kconfig" 1283 source "arch/Kconfig"
1289 1284
1290 endmenu # General setup 1285 endmenu # General setup
1291 1286
1292 config HAVE_GENERIC_DMA_COHERENT 1287 config HAVE_GENERIC_DMA_COHERENT
1293 bool 1288 bool
1294 default n 1289 default n
1295 1290
1296 config SLABINFO 1291 config SLABINFO
1297 bool 1292 bool
1298 depends on PROC_FS 1293 depends on PROC_FS
1299 depends on SLAB || SLUB_DEBUG 1294 depends on SLAB || SLUB_DEBUG
1300 default y 1295 default y
1301 1296
1302 config RT_MUTEXES 1297 config RT_MUTEXES
1303 boolean 1298 boolean
1304 1299
1305 config BASE_SMALL 1300 config BASE_SMALL
1306 int 1301 int
1307 default 0 if BASE_FULL 1302 default 0 if BASE_FULL
1308 default 1 if !BASE_FULL 1303 default 1 if !BASE_FULL
1309 1304
1310 menuconfig MODULES 1305 menuconfig MODULES
1311 bool "Enable loadable module support" 1306 bool "Enable loadable module support"
1312 help 1307 help
1313 Kernel modules are small pieces of compiled code which can 1308 Kernel modules are small pieces of compiled code which can
1314 be inserted in the running kernel, rather than being 1309 be inserted in the running kernel, rather than being
1315 permanently built into the kernel. You use the "modprobe" 1310 permanently built into the kernel. You use the "modprobe"
1316 tool to add (and sometimes remove) them. If you say Y here, 1311 tool to add (and sometimes remove) them. If you say Y here,
1317 many parts of the kernel can be built as modules (by 1312 many parts of the kernel can be built as modules (by
1318 answering M instead of Y where indicated): this is most 1313 answering M instead of Y where indicated): this is most
1319 useful for infrequently used options which are not required 1314 useful for infrequently used options which are not required
1320 for booting. For more information, see the man pages for 1315 for booting. For more information, see the man pages for
1321 modprobe, lsmod, modinfo, insmod and rmmod. 1316 modprobe, lsmod, modinfo, insmod and rmmod.
1322 1317
1323 If you say Y here, you will need to run "make 1318 If you say Y here, you will need to run "make
1324 modules_install" to put the modules under /lib/modules/ 1319 modules_install" to put the modules under /lib/modules/
1325 where modprobe can find them (you may need to be root to do 1320 where modprobe can find them (you may need to be root to do
1326 this). 1321 this).
1327 1322
1328 If unsure, say Y. 1323 If unsure, say Y.
1329 1324
1330 if MODULES 1325 if MODULES
1331 1326
1332 config MODULE_FORCE_LOAD 1327 config MODULE_FORCE_LOAD
1333 bool "Forced module loading" 1328 bool "Forced module loading"
1334 default n 1329 default n
1335 help 1330 help
1336 Allow loading of modules without version information (ie. modprobe 1331 Allow loading of modules without version information (ie. modprobe
1337 --force). Forced module loading sets the 'F' (forced) taint flag and 1332 --force). Forced module loading sets the 'F' (forced) taint flag and
1338 is usually a really bad idea. 1333 is usually a really bad idea.
1339 1334
1340 config MODULE_UNLOAD 1335 config MODULE_UNLOAD
1341 bool "Module unloading" 1336 bool "Module unloading"
1342 help 1337 help
1343 Without this option you will not be able to unload any 1338 Without this option you will not be able to unload any
1344 modules (note that some modules may not be unloadable 1339 modules (note that some modules may not be unloadable
1345 anyway), which makes your kernel smaller, faster 1340 anyway), which makes your kernel smaller, faster
1346 and simpler. If unsure, say Y. 1341 and simpler. If unsure, say Y.
1347 1342
1348 config MODULE_FORCE_UNLOAD 1343 config MODULE_FORCE_UNLOAD
1349 bool "Forced module unloading" 1344 bool "Forced module unloading"
1350 depends on MODULE_UNLOAD && EXPERIMENTAL 1345 depends on MODULE_UNLOAD && EXPERIMENTAL
1351 help 1346 help
1352 This option allows you to force a module to unload, even if the 1347 This option allows you to force a module to unload, even if the
1353 kernel believes it is unsafe: the kernel will remove the module 1348 kernel believes it is unsafe: the kernel will remove the module
1354 without waiting for anyone to stop using it (using the -f option to 1349 without waiting for anyone to stop using it (using the -f option to
1355 rmmod). This is mainly for kernel developers and desperate users. 1350 rmmod). This is mainly for kernel developers and desperate users.
1356 If unsure, say N. 1351 If unsure, say N.
1357 1352
1358 config MODVERSIONS 1353 config MODVERSIONS
1359 bool "Module versioning support" 1354 bool "Module versioning support"
1360 help 1355 help
1361 Usually, you have to use modules compiled with your kernel. 1356 Usually, you have to use modules compiled with your kernel.
1362 Saying Y here makes it sometimes possible to use modules 1357 Saying Y here makes it sometimes possible to use modules
1363 compiled for different kernels, by adding enough information 1358 compiled for different kernels, by adding enough information
1364 to the modules to (hopefully) spot any changes which would 1359 to the modules to (hopefully) spot any changes which would
1365 make them incompatible with the kernel you are running. If 1360 make them incompatible with the kernel you are running. If
1366 unsure, say N. 1361 unsure, say N.
1367 1362
1368 config MODULE_SRCVERSION_ALL 1363 config MODULE_SRCVERSION_ALL
1369 bool "Source checksum for all modules" 1364 bool "Source checksum for all modules"
1370 help 1365 help
1371 Modules which contain a MODULE_VERSION get an extra "srcversion" 1366 Modules which contain a MODULE_VERSION get an extra "srcversion"
1372 field inserted into their modinfo section, which contains a 1367 field inserted into their modinfo section, which contains a
1373 sum of the source files which made it. This helps maintainers 1368 sum of the source files which made it. This helps maintainers
1374 see exactly which source was used to build a module (since 1369 see exactly which source was used to build a module (since
1375 others sometimes change the module source without updating 1370 others sometimes change the module source without updating
1376 the version). With this option, such a "srcversion" field 1371 the version). With this option, such a "srcversion" field
1377 will be created for all modules. If unsure, say N. 1372 will be created for all modules. If unsure, say N.
1378 1373
1379 endif # MODULES 1374 endif # MODULES
1380 1375
1381 config INIT_ALL_POSSIBLE 1376 config INIT_ALL_POSSIBLE
1382 bool 1377 bool
1383 help 1378 help
1384 Back when each arch used to define their own cpu_online_map and 1379 Back when each arch used to define their own cpu_online_map and
1385 cpu_possible_map, some of them chose to initialize cpu_possible_map 1380 cpu_possible_map, some of them chose to initialize cpu_possible_map
1386 with all 1s, and others with all 0s. When they were centralised, 1381 with all 1s, and others with all 0s. When they were centralised,
1387 it was better to provide this option than to break all the archs 1382 it was better to provide this option than to break all the archs
1388 and have several arch maintainers pursuing me down dark alleys. 1383 and have several arch maintainers pursuing me down dark alleys.
1389 1384
1390 config STOP_MACHINE 1385 config STOP_MACHINE
1391 bool 1386 bool
1392 default y 1387 default y
1393 depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU 1388 depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU
1394 help 1389 help
1395 Need stop_machine() primitive. 1390 Need stop_machine() primitive.
1396 1391
1397 source "block/Kconfig" 1392 source "block/Kconfig"
1398 1393
1399 config PREEMPT_NOTIFIERS 1394 config PREEMPT_NOTIFIERS
1400 bool 1395 bool
1401 1396
1402 config PADATA 1397 config PADATA
1403 depends on SMP 1398 depends on SMP
1404 bool 1399 bool
1405 1400
1406 source "kernel/Kconfig.locks" 1401 source "kernel/Kconfig.locks"
1407 1402
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz 26 * Thomas Gleixner, Mike Kravetz
27 */ 27 */
28 28
29 #include <linux/mm.h> 29 #include <linux/mm.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/nmi.h> 31 #include <linux/nmi.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/uaccess.h> 33 #include <linux/uaccess.h>
34 #include <linux/highmem.h> 34 #include <linux/highmem.h>
35 #include <asm/mmu_context.h> 35 #include <asm/mmu_context.h>
36 #include <linux/interrupt.h> 36 #include <linux/interrupt.h>
37 #include <linux/capability.h> 37 #include <linux/capability.h>
38 #include <linux/completion.h> 38 #include <linux/completion.h>
39 #include <linux/kernel_stat.h> 39 #include <linux/kernel_stat.h>
40 #include <linux/debug_locks.h> 40 #include <linux/debug_locks.h>
41 #include <linux/perf_event.h> 41 #include <linux/perf_event.h>
42 #include <linux/security.h> 42 #include <linux/security.h>
43 #include <linux/notifier.h> 43 #include <linux/notifier.h>
44 #include <linux/profile.h> 44 #include <linux/profile.h>
45 #include <linux/freezer.h> 45 #include <linux/freezer.h>
46 #include <linux/vmalloc.h> 46 #include <linux/vmalloc.h>
47 #include <linux/blkdev.h> 47 #include <linux/blkdev.h>
48 #include <linux/delay.h> 48 #include <linux/delay.h>
49 #include <linux/pid_namespace.h> 49 #include <linux/pid_namespace.h>
50 #include <linux/smp.h> 50 #include <linux/smp.h>
51 #include <linux/threads.h> 51 #include <linux/threads.h>
52 #include <linux/timer.h> 52 #include <linux/timer.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/cpu.h> 54 #include <linux/cpu.h>
55 #include <linux/cpuset.h> 55 #include <linux/cpuset.h>
56 #include <linux/percpu.h> 56 #include <linux/percpu.h>
57 #include <linux/proc_fs.h> 57 #include <linux/proc_fs.h>
58 #include <linux/seq_file.h> 58 #include <linux/seq_file.h>
59 #include <linux/stop_machine.h> 59 #include <linux/stop_machine.h>
60 #include <linux/sysctl.h> 60 #include <linux/sysctl.h>
61 #include <linux/syscalls.h> 61 #include <linux/syscalls.h>
62 #include <linux/times.h> 62 #include <linux/times.h>
63 #include <linux/tsacct_kern.h> 63 #include <linux/tsacct_kern.h>
64 #include <linux/kprobes.h> 64 #include <linux/kprobes.h>
65 #include <linux/delayacct.h> 65 #include <linux/delayacct.h>
66 #include <linux/unistd.h> 66 #include <linux/unistd.h>
67 #include <linux/pagemap.h> 67 #include <linux/pagemap.h>
68 #include <linux/hrtimer.h> 68 #include <linux/hrtimer.h>
69 #include <linux/tick.h> 69 #include <linux/tick.h>
70 #include <linux/debugfs.h> 70 #include <linux/debugfs.h>
71 #include <linux/ctype.h> 71 #include <linux/ctype.h>
72 #include <linux/ftrace.h> 72 #include <linux/ftrace.h>
73 #include <linux/slab.h> 73 #include <linux/slab.h>
74 74
75 #include <asm/tlb.h> 75 #include <asm/tlb.h>
76 #include <asm/irq_regs.h> 76 #include <asm/irq_regs.h>
77 #include <asm/mutex.h> 77 #include <asm/mutex.h>
78 78
79 #include "sched_cpupri.h" 79 #include "sched_cpupri.h"
80 #include "workqueue_sched.h" 80 #include "workqueue_sched.h"
81 #include "sched_autogroup.h" 81 #include "sched_autogroup.h"
82 82
83 #define CREATE_TRACE_POINTS 83 #define CREATE_TRACE_POINTS
84 #include <trace/events/sched.h> 84 #include <trace/events/sched.h>
85 85
86 /* 86 /*
87 * Convert user-nice values [ -20 ... 0 ... 19 ] 87 * Convert user-nice values [ -20 ... 0 ... 19 ]
88 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 88 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
89 * and back. 89 * and back.
90 */ 90 */
91 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 91 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
92 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 92 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
93 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 93 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
94 94
95 /* 95 /*
96 * 'User priority' is the nice value converted to something we 96 * 'User priority' is the nice value converted to something we
97 * can work with better when scaling various scheduler parameters, 97 * can work with better when scaling various scheduler parameters,
98 * it's a [ 0 ... 39 ] range. 98 * it's a [ 0 ... 39 ] range.
99 */ 99 */
100 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 100 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
101 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 101 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
102 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 102 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
103 103
104 /* 104 /*
105 * Helpers for converting nanosecond timing to jiffy resolution 105 * Helpers for converting nanosecond timing to jiffy resolution
106 */ 106 */
107 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 107 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
108 108
109 #define NICE_0_LOAD SCHED_LOAD_SCALE 109 #define NICE_0_LOAD SCHED_LOAD_SCALE
110 #define NICE_0_SHIFT SCHED_LOAD_SHIFT 110 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
111 111
112 /* 112 /*
113 * These are the 'tuning knobs' of the scheduler: 113 * These are the 'tuning knobs' of the scheduler:
114 * 114 *
115 * default timeslice is 100 msecs (used only for SCHED_RR tasks). 115 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
116 * Timeslices get refilled after they expire. 116 * Timeslices get refilled after they expire.
117 */ 117 */
118 #define DEF_TIMESLICE (100 * HZ / 1000) 118 #define DEF_TIMESLICE (100 * HZ / 1000)
119 119
120 /* 120 /*
121 * single value that denotes runtime == period, ie unlimited time. 121 * single value that denotes runtime == period, ie unlimited time.
122 */ 122 */
123 #define RUNTIME_INF ((u64)~0ULL) 123 #define RUNTIME_INF ((u64)~0ULL)
124 124
125 static inline int rt_policy(int policy) 125 static inline int rt_policy(int policy)
126 { 126 {
127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
128 return 1; 128 return 1;
129 return 0; 129 return 0;
130 } 130 }
131 131
132 static inline int task_has_rt_policy(struct task_struct *p) 132 static inline int task_has_rt_policy(struct task_struct *p)
133 { 133 {
134 return rt_policy(p->policy); 134 return rt_policy(p->policy);
135 } 135 }
136 136
137 /* 137 /*
138 * This is the priority-queue data structure of the RT scheduling class: 138 * This is the priority-queue data structure of the RT scheduling class:
139 */ 139 */
140 struct rt_prio_array { 140 struct rt_prio_array {
141 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 141 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
142 struct list_head queue[MAX_RT_PRIO]; 142 struct list_head queue[MAX_RT_PRIO];
143 }; 143 };
144 144
145 struct rt_bandwidth { 145 struct rt_bandwidth {
146 /* nests inside the rq lock: */ 146 /* nests inside the rq lock: */
147 raw_spinlock_t rt_runtime_lock; 147 raw_spinlock_t rt_runtime_lock;
148 ktime_t rt_period; 148 ktime_t rt_period;
149 u64 rt_runtime; 149 u64 rt_runtime;
150 struct hrtimer rt_period_timer; 150 struct hrtimer rt_period_timer;
151 }; 151 };
152 152
153 static struct rt_bandwidth def_rt_bandwidth; 153 static struct rt_bandwidth def_rt_bandwidth;
154 154
155 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 155 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
156 156
157 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 157 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
158 { 158 {
159 struct rt_bandwidth *rt_b = 159 struct rt_bandwidth *rt_b =
160 container_of(timer, struct rt_bandwidth, rt_period_timer); 160 container_of(timer, struct rt_bandwidth, rt_period_timer);
161 ktime_t now; 161 ktime_t now;
162 int overrun; 162 int overrun;
163 int idle = 0; 163 int idle = 0;
164 164
165 for (;;) { 165 for (;;) {
166 now = hrtimer_cb_get_time(timer); 166 now = hrtimer_cb_get_time(timer);
167 overrun = hrtimer_forward(timer, now, rt_b->rt_period); 167 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
168 168
169 if (!overrun) 169 if (!overrun)
170 break; 170 break;
171 171
172 idle = do_sched_rt_period_timer(rt_b, overrun); 172 idle = do_sched_rt_period_timer(rt_b, overrun);
173 } 173 }
174 174
175 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 175 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
176 } 176 }
177 177
178 static 178 static
179 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) 179 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
180 { 180 {
181 rt_b->rt_period = ns_to_ktime(period); 181 rt_b->rt_period = ns_to_ktime(period);
182 rt_b->rt_runtime = runtime; 182 rt_b->rt_runtime = runtime;
183 183
184 raw_spin_lock_init(&rt_b->rt_runtime_lock); 184 raw_spin_lock_init(&rt_b->rt_runtime_lock);
185 185
186 hrtimer_init(&rt_b->rt_period_timer, 186 hrtimer_init(&rt_b->rt_period_timer,
187 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 187 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
188 rt_b->rt_period_timer.function = sched_rt_period_timer; 188 rt_b->rt_period_timer.function = sched_rt_period_timer;
189 } 189 }
190 190
191 static inline int rt_bandwidth_enabled(void) 191 static inline int rt_bandwidth_enabled(void)
192 { 192 {
193 return sysctl_sched_rt_runtime >= 0; 193 return sysctl_sched_rt_runtime >= 0;
194 } 194 }
195 195
196 static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 196 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
197 { 197 {
198 ktime_t now; 198 ktime_t now;
199 199
200 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 200 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
201 return; 201 return;
202 202
203 if (hrtimer_active(&rt_b->rt_period_timer)) 203 if (hrtimer_active(&rt_b->rt_period_timer))
204 return; 204 return;
205 205
206 raw_spin_lock(&rt_b->rt_runtime_lock); 206 raw_spin_lock(&rt_b->rt_runtime_lock);
207 for (;;) { 207 for (;;) {
208 unsigned long delta; 208 unsigned long delta;
209 ktime_t soft, hard; 209 ktime_t soft, hard;
210 210
211 if (hrtimer_active(&rt_b->rt_period_timer)) 211 if (hrtimer_active(&rt_b->rt_period_timer))
212 break; 212 break;
213 213
214 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 214 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
215 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 215 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
216 216
217 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); 217 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
218 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 218 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
219 delta = ktime_to_ns(ktime_sub(hard, soft)); 219 delta = ktime_to_ns(ktime_sub(hard, soft));
220 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 220 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
221 HRTIMER_MODE_ABS_PINNED, 0); 221 HRTIMER_MODE_ABS_PINNED, 0);
222 } 222 }
223 raw_spin_unlock(&rt_b->rt_runtime_lock); 223 raw_spin_unlock(&rt_b->rt_runtime_lock);
224 } 224 }
225 225
226 #ifdef CONFIG_RT_GROUP_SCHED 226 #ifdef CONFIG_RT_GROUP_SCHED
227 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 227 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
228 { 228 {
229 hrtimer_cancel(&rt_b->rt_period_timer); 229 hrtimer_cancel(&rt_b->rt_period_timer);
230 } 230 }
231 #endif 231 #endif
232 232
233 /* 233 /*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 * sched_domains_mutex serializes calls to arch_init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237 static DEFINE_MUTEX(sched_domains_mutex); 237 static DEFINE_MUTEX(sched_domains_mutex);
238 238
239 #ifdef CONFIG_CGROUP_SCHED 239 #ifdef CONFIG_CGROUP_SCHED
240 240
241 #include <linux/cgroup.h> 241 #include <linux/cgroup.h>
242 242
243 struct cfs_rq; 243 struct cfs_rq;
244 244
245 static LIST_HEAD(task_groups); 245 static LIST_HEAD(task_groups);
246 246
247 /* task group related information */ 247 /* task group related information */
248 struct task_group { 248 struct task_group {
249 struct cgroup_subsys_state css; 249 struct cgroup_subsys_state css;
250 250
251 #ifdef CONFIG_FAIR_GROUP_SCHED 251 #ifdef CONFIG_FAIR_GROUP_SCHED
252 /* schedulable entities of this group on each cpu */ 252 /* schedulable entities of this group on each cpu */
253 struct sched_entity **se; 253 struct sched_entity **se;
254 /* runqueue "owned" by this group on each cpu */ 254 /* runqueue "owned" by this group on each cpu */
255 struct cfs_rq **cfs_rq; 255 struct cfs_rq **cfs_rq;
256 unsigned long shares; 256 unsigned long shares;
257 257
258 atomic_t load_weight; 258 atomic_t load_weight;
259 #endif 259 #endif
260 260
261 #ifdef CONFIG_RT_GROUP_SCHED 261 #ifdef CONFIG_RT_GROUP_SCHED
262 struct sched_rt_entity **rt_se; 262 struct sched_rt_entity **rt_se;
263 struct rt_rq **rt_rq; 263 struct rt_rq **rt_rq;
264 264
265 struct rt_bandwidth rt_bandwidth; 265 struct rt_bandwidth rt_bandwidth;
266 #endif 266 #endif
267 267
268 struct rcu_head rcu; 268 struct rcu_head rcu;
269 struct list_head list; 269 struct list_head list;
270 270
271 struct task_group *parent; 271 struct task_group *parent;
272 struct list_head siblings; 272 struct list_head siblings;
273 struct list_head children; 273 struct list_head children;
274 274
275 #ifdef CONFIG_SCHED_AUTOGROUP 275 #ifdef CONFIG_SCHED_AUTOGROUP
276 struct autogroup *autogroup; 276 struct autogroup *autogroup;
277 #endif 277 #endif
278 }; 278 };
279 279
280 /* task_group_lock serializes the addition/removal of task groups */ 280 /* task_group_lock serializes the addition/removal of task groups */
281 static DEFINE_SPINLOCK(task_group_lock); 281 static DEFINE_SPINLOCK(task_group_lock);
282 282
283 #ifdef CONFIG_FAIR_GROUP_SCHED 283 #ifdef CONFIG_FAIR_GROUP_SCHED
284 284
285 # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD 285 # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
286 286
287 /* 287 /*
288 * A weight of 0 or 1 can cause arithmetic problems. 288 * A weight of 0 or 1 can cause arithmetic problems.
289 * The weight of a cfs_rq is the sum of the weights of the entities 289 * The weight of a cfs_rq is the sum of the weights of the entities
290 * queued on it, so the weight of an entity should not be 290 * queued on it, so the weight of an entity should not be
291 * too large, and neither should the shares value of a task group. 291 * too large, and neither should the shares value of a task group.
292 * (The default weight is 1024 - so there's no practical 292 * (The default weight is 1024 - so there's no practical
293 * limitation from this.) 293 * limitation from this.)
294 */ 294 */
295 #define MIN_SHARES 2 295 #define MIN_SHARES 2
296 #define MAX_SHARES (1UL << 18) 296 #define MAX_SHARES (1UL << 18)
297 297
298 static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 298 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
299 #endif 299 #endif
300 300
301 /* Default task group. 301 /* Default task group.
302 * Every task in the system belongs to this group at bootup. 302 * Every task in the system belongs to this group at bootup.
303 */ 303 */
304 struct task_group root_task_group; 304 struct task_group root_task_group;
305 305
306 #endif /* CONFIG_CGROUP_SCHED */ 306 #endif /* CONFIG_CGROUP_SCHED */
307 307
308 /* CFS-related fields in a runqueue */ 308 /* CFS-related fields in a runqueue */
309 struct cfs_rq { 309 struct cfs_rq {
310 struct load_weight load; 310 struct load_weight load;
311 unsigned long nr_running; 311 unsigned long nr_running;
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315 #ifndef CONFIG_64BIT 315 #ifndef CONFIG_64BIT
316 u64 min_vruntime_copy; 316 u64 min_vruntime_copy;
317 #endif 317 #endif
318 318
319 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
320 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
321 321
322 struct list_head tasks; 322 struct list_head tasks;
323 struct list_head *balance_iterator; 323 struct list_head *balance_iterator;
324 324
325 /* 325 /*
326 * 'curr' points to the currently running entity on this cfs_rq. 326 * 'curr' points to the currently running entity on this cfs_rq.
327 * It is set to NULL otherwise (i.e. when none are currently running). 327 * It is set to NULL otherwise (i.e. when none are currently running).
328 */ 328 */
329 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
330 330
331 unsigned int nr_spread_over; 331 unsigned int nr_spread_over;
332 332
333 #ifdef CONFIG_FAIR_GROUP_SCHED 333 #ifdef CONFIG_FAIR_GROUP_SCHED
334 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 334 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
335 335
336 /* 336 /*
337 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 337 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
338 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 338 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
339 * (like users, containers etc.) 339 * (like users, containers etc.)
340 * 340 *
341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
342 * list is used during load balance. 342 * list is used during load balance.
343 */ 343 */
344 int on_list; 344 int on_list;
345 struct list_head leaf_cfs_rq_list; 345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 346 struct task_group *tg; /* group that "owns" this runqueue */
347 347
348 #ifdef CONFIG_SMP 348 #ifdef CONFIG_SMP
349 /* 349 /*
350 * the part of load.weight contributed by tasks 350 * the part of load.weight contributed by tasks
351 */ 351 */
352 unsigned long task_weight; 352 unsigned long task_weight;
353 353
354 /* 354 /*
355 * h_load = weight * f(tg) 355 * h_load = weight * f(tg)
356 * 356 *
357 * Where f(tg) is the recursive weight fraction assigned to 357 * Where f(tg) is the recursive weight fraction assigned to
358 * this group. 358 * this group.
359 */ 359 */
360 unsigned long h_load; 360 unsigned long h_load;
361 361
362 /* 362 /*
363 * Maintaining per-cpu shares distribution for group scheduling 363 * Maintaining per-cpu shares distribution for group scheduling
364 * 364 *
365 * load_stamp is the last time we updated the load average 365 * load_stamp is the last time we updated the load average
366 * load_last is the last time we updated the load average and saw load 366 * load_last is the last time we updated the load average and saw load
367 * load_unacc_exec_time is currently unaccounted execution time 367 * load_unacc_exec_time is currently unaccounted execution time
368 */ 368 */
369 u64 load_avg; 369 u64 load_avg;
370 u64 load_period; 370 u64 load_period;
371 u64 load_stamp, load_last, load_unacc_exec_time; 371 u64 load_stamp, load_last, load_unacc_exec_time;
372 372
373 unsigned long load_contribution; 373 unsigned long load_contribution;
374 #endif 374 #endif
375 #endif 375 #endif
376 }; 376 };
377 377
378 /* Real-Time classes' related field in a runqueue: */ 378 /* Real-Time classes' related field in a runqueue: */
379 struct rt_rq { 379 struct rt_rq {
380 struct rt_prio_array active; 380 struct rt_prio_array active;
381 unsigned long rt_nr_running; 381 unsigned long rt_nr_running;
382 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 382 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
383 struct { 383 struct {
384 int curr; /* highest queued rt task prio */ 384 int curr; /* highest queued rt task prio */
385 #ifdef CONFIG_SMP 385 #ifdef CONFIG_SMP
386 int next; /* next highest */ 386 int next; /* next highest */
387 #endif 387 #endif
388 } highest_prio; 388 } highest_prio;
389 #endif 389 #endif
390 #ifdef CONFIG_SMP 390 #ifdef CONFIG_SMP
391 unsigned long rt_nr_migratory; 391 unsigned long rt_nr_migratory;
392 unsigned long rt_nr_total; 392 unsigned long rt_nr_total;
393 int overloaded; 393 int overloaded;
394 struct plist_head pushable_tasks; 394 struct plist_head pushable_tasks;
395 #endif 395 #endif
396 int rt_throttled; 396 int rt_throttled;
397 u64 rt_time; 397 u64 rt_time;
398 u64 rt_runtime; 398 u64 rt_runtime;
399 /* Nests inside the rq lock: */ 399 /* Nests inside the rq lock: */
400 raw_spinlock_t rt_runtime_lock; 400 raw_spinlock_t rt_runtime_lock;
401 401
402 #ifdef CONFIG_RT_GROUP_SCHED 402 #ifdef CONFIG_RT_GROUP_SCHED
403 unsigned long rt_nr_boosted; 403 unsigned long rt_nr_boosted;
404 404
405 struct rq *rq; 405 struct rq *rq;
406 struct list_head leaf_rt_rq_list; 406 struct list_head leaf_rt_rq_list;
407 struct task_group *tg; 407 struct task_group *tg;
408 #endif 408 #endif
409 }; 409 };
410 410
411 #ifdef CONFIG_SMP 411 #ifdef CONFIG_SMP
412 412
413 /* 413 /*
414 * We add the notion of a root-domain which will be used to define per-domain 414 * We add the notion of a root-domain which will be used to define per-domain
415 * variables. Each exclusive cpuset essentially defines an island domain by 415 * variables. Each exclusive cpuset essentially defines an island domain by
416 * fully partitioning the member cpus from any other cpuset. Whenever a new 416 * fully partitioning the member cpus from any other cpuset. Whenever a new
417 * exclusive cpuset is created, we also create and attach a new root-domain 417 * exclusive cpuset is created, we also create and attach a new root-domain
418 * object. 418 * object.
419 * 419 *
420 */ 420 */
421 struct root_domain { 421 struct root_domain {
422 atomic_t refcount; 422 atomic_t refcount;
423 cpumask_var_t span; 423 cpumask_var_t span;
424 cpumask_var_t online; 424 cpumask_var_t online;
425 425
426 /* 426 /*
427 * The "RT overload" flag: it gets set if a CPU has more than 427 * The "RT overload" flag: it gets set if a CPU has more than
428 * one runnable RT task. 428 * one runnable RT task.
429 */ 429 */
430 cpumask_var_t rto_mask; 430 cpumask_var_t rto_mask;
431 atomic_t rto_count; 431 atomic_t rto_count;
432 struct cpupri cpupri; 432 struct cpupri cpupri;
433 }; 433 };
434 434
435 /* 435 /*
436 * By default the system creates a single root-domain with all cpus as 436 * By default the system creates a single root-domain with all cpus as
437 * members (mimicking the global state we have today). 437 * members (mimicking the global state we have today).
438 */ 438 */
439 static struct root_domain def_root_domain; 439 static struct root_domain def_root_domain;
440 440
441 #endif /* CONFIG_SMP */ 441 #endif /* CONFIG_SMP */
442 442
443 /* 443 /*
444 * This is the main, per-CPU runqueue data structure. 444 * This is the main, per-CPU runqueue data structure.
445 * 445 *
446 * Locking rule: in places that lock multiple runqueues at once 446 * Locking rule: in places that lock multiple runqueues at once
447 * (such as the load balancing or the thread migration code), lock 447 * (such as the load balancing or the thread migration code), lock
448 * acquire operations must be ordered by ascending runqueue address. 448 * acquire operations must be ordered by ascending runqueue address.
449 */ 449 */
450 struct rq { 450 struct rq {
451 /* runqueue lock: */ 451 /* runqueue lock: */
452 raw_spinlock_t lock; 452 raw_spinlock_t lock;
453 453
454 /* 454 /*
455 * nr_running and cpu_load should be in the same cacheline because 455 * nr_running and cpu_load should be in the same cacheline because
456 * remote CPUs use both these fields when doing load calculation. 456 * remote CPUs use both these fields when doing load calculation.
457 */ 457 */
458 unsigned long nr_running; 458 unsigned long nr_running;
459 #define CPU_LOAD_IDX_MAX 5 459 #define CPU_LOAD_IDX_MAX 5
460 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 460 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
461 unsigned long last_load_update_tick; 461 unsigned long last_load_update_tick;
462 #ifdef CONFIG_NO_HZ 462 #ifdef CONFIG_NO_HZ
463 u64 nohz_stamp; 463 u64 nohz_stamp;
464 unsigned char nohz_balance_kick; 464 unsigned char nohz_balance_kick;
465 #endif 465 #endif
466 unsigned int skip_clock_update; 466 unsigned int skip_clock_update;
467 467
468 /* capture load from *all* tasks on this cpu: */ 468 /* capture load from *all* tasks on this cpu: */
469 struct load_weight load; 469 struct load_weight load;
470 unsigned long nr_load_updates; 470 unsigned long nr_load_updates;
471 u64 nr_switches; 471 u64 nr_switches;
472 472
473 struct cfs_rq cfs; 473 struct cfs_rq cfs;
474 struct rt_rq rt; 474 struct rt_rq rt;
475 475
476 #ifdef CONFIG_FAIR_GROUP_SCHED 476 #ifdef CONFIG_FAIR_GROUP_SCHED
477 /* list of leaf cfs_rq on this cpu: */ 477 /* list of leaf cfs_rq on this cpu: */
478 struct list_head leaf_cfs_rq_list; 478 struct list_head leaf_cfs_rq_list;
479 #endif 479 #endif
480 #ifdef CONFIG_RT_GROUP_SCHED 480 #ifdef CONFIG_RT_GROUP_SCHED
481 struct list_head leaf_rt_rq_list; 481 struct list_head leaf_rt_rq_list;
482 #endif 482 #endif
483 483
484 /* 484 /*
485 * This is part of a global counter where only the total sum 485 * This is part of a global counter where only the total sum
486 * over all CPUs matters. A task can increase this counter on 486 * over all CPUs matters. A task can increase this counter on
487 * one CPU and if it got migrated afterwards it may decrease 487 * one CPU and if it got migrated afterwards it may decrease
488 * it on another CPU. Always updated under the runqueue lock: 488 * it on another CPU. Always updated under the runqueue lock:
489 */ 489 */
490 unsigned long nr_uninterruptible; 490 unsigned long nr_uninterruptible;
491 491
492 struct task_struct *curr, *idle, *stop; 492 struct task_struct *curr, *idle, *stop;
493 unsigned long next_balance; 493 unsigned long next_balance;
494 struct mm_struct *prev_mm; 494 struct mm_struct *prev_mm;
495 495
496 u64 clock; 496 u64 clock;
497 u64 clock_task; 497 u64 clock_task;
498 498
499 atomic_t nr_iowait; 499 atomic_t nr_iowait;
500 500
501 #ifdef CONFIG_SMP 501 #ifdef CONFIG_SMP
502 struct root_domain *rd; 502 struct root_domain *rd;
503 struct sched_domain *sd; 503 struct sched_domain *sd;
504 504
505 unsigned long cpu_power; 505 unsigned long cpu_power;
506 506
507 unsigned char idle_at_tick; 507 unsigned char idle_at_tick;
508 /* For active balancing */ 508 /* For active balancing */
509 int post_schedule; 509 int post_schedule;
510 int active_balance; 510 int active_balance;
511 int push_cpu; 511 int push_cpu;
512 struct cpu_stop_work active_balance_work; 512 struct cpu_stop_work active_balance_work;
513 /* cpu of this runqueue: */ 513 /* cpu of this runqueue: */
514 int cpu; 514 int cpu;
515 int online; 515 int online;
516 516
517 unsigned long avg_load_per_task; 517 unsigned long avg_load_per_task;
518 518
519 u64 rt_avg; 519 u64 rt_avg;
520 u64 age_stamp; 520 u64 age_stamp;
521 u64 idle_stamp; 521 u64 idle_stamp;
522 u64 avg_idle; 522 u64 avg_idle;
523 #endif 523 #endif
524 524
525 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 525 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
526 u64 prev_irq_time; 526 u64 prev_irq_time;
527 #endif 527 #endif
528 528
529 /* calc_load related fields */ 529 /* calc_load related fields */
530 unsigned long calc_load_update; 530 unsigned long calc_load_update;
531 long calc_load_active; 531 long calc_load_active;
532 532
533 #ifdef CONFIG_SCHED_HRTICK 533 #ifdef CONFIG_SCHED_HRTICK
534 #ifdef CONFIG_SMP 534 #ifdef CONFIG_SMP
535 int hrtick_csd_pending; 535 int hrtick_csd_pending;
536 struct call_single_data hrtick_csd; 536 struct call_single_data hrtick_csd;
537 #endif 537 #endif
538 struct hrtimer hrtick_timer; 538 struct hrtimer hrtick_timer;
539 #endif 539 #endif
540 540
541 #ifdef CONFIG_SCHEDSTATS 541 #ifdef CONFIG_SCHEDSTATS
542 /* latency stats */ 542 /* latency stats */
543 struct sched_info rq_sched_info; 543 struct sched_info rq_sched_info;
544 unsigned long long rq_cpu_time; 544 unsigned long long rq_cpu_time;
545 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 545 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
546 546
547 /* sys_sched_yield() stats */ 547 /* sys_sched_yield() stats */
548 unsigned int yld_count; 548 unsigned int yld_count;
549 549
550 /* schedule() stats */ 550 /* schedule() stats */
551 unsigned int sched_switch; 551 unsigned int sched_switch;
552 unsigned int sched_count; 552 unsigned int sched_count;
553 unsigned int sched_goidle; 553 unsigned int sched_goidle;
554 554
555 /* try_to_wake_up() stats */ 555 /* try_to_wake_up() stats */
556 unsigned int ttwu_count; 556 unsigned int ttwu_count;
557 unsigned int ttwu_local; 557 unsigned int ttwu_local;
558 #endif 558 #endif
559 559
560 #ifdef CONFIG_SMP 560 #ifdef CONFIG_SMP
561 struct task_struct *wake_list; 561 struct task_struct *wake_list;
562 #endif 562 #endif
563 }; 563 };
564 564
565 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 565 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
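The locking rule stated in the struct rq comment above (take multiple runqueue locks in ascending address order) is what prevents ABBA deadlocks when two CPUs need the same pair of runqueues. Below is a minimal userspace sketch of that rule using pthreads; the demo_* names and the two-element runqueue array are illustrative assumptions, not the kernel's double_rq_lock() itself.

#include <pthread.h>
#include <stdio.h>

/* Minimal stand-in for a runqueue: just a lock and a counter. */
struct demo_rq {
    pthread_mutex_t lock;
    long nr_running;
};

/*
 * Lock two runqueues in ascending address order: a fixed global order means
 * two threads locking the same pair can never hold one lock each and wait
 * forever for the other.
 */
static void demo_double_rq_lock(struct demo_rq *a, struct demo_rq *b)
{
    if (a == b) {
        pthread_mutex_lock(&a->lock);
    } else if (a < b) {
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
    } else {
        pthread_mutex_lock(&b->lock);
        pthread_mutex_lock(&a->lock);
    }
}

static void demo_double_rq_unlock(struct demo_rq *a, struct demo_rq *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
    struct demo_rq rq[2] = {
        { PTHREAD_MUTEX_INITIALIZER, 0 },
        { PTHREAD_MUTEX_INITIALIZER, 1 },
    };

    /* Move one "task" from rq[1] to rq[0] while holding both locks. */
    demo_double_rq_lock(&rq[1], &rq[0]);
    rq[1].nr_running--;
    rq[0].nr_running++;
    demo_double_rq_unlock(&rq[1], &rq[0]);

    printf("rq0=%ld rq1=%ld\n", rq[0].nr_running, rq[1].nr_running);
    return 0;
}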
566 566
567 567
568 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); 568 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569 569
570 static inline int cpu_of(struct rq *rq) 570 static inline int cpu_of(struct rq *rq)
571 { 571 {
572 #ifdef CONFIG_SMP 572 #ifdef CONFIG_SMP
573 return rq->cpu; 573 return rq->cpu;
574 #else 574 #else
575 return 0; 575 return 0;
576 #endif 576 #endif
577 } 577 }
578 578
579 #define rcu_dereference_check_sched_domain(p) \ 579 #define rcu_dereference_check_sched_domain(p) \
580 rcu_dereference_check((p), \ 580 rcu_dereference_check((p), \
581 rcu_read_lock_sched_held() || \ 581 rcu_read_lock_sched_held() || \
582 lockdep_is_held(&sched_domains_mutex)) 582 lockdep_is_held(&sched_domains_mutex))
583 583
584 /* 584 /*
585 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 585 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
586 * See detach_destroy_domains: synchronize_sched for details. 586 * See detach_destroy_domains: synchronize_sched for details.
587 * 587 *
588 * The domain tree of any CPU may only be accessed from within 588 * The domain tree of any CPU may only be accessed from within
589 * preempt-disabled sections. 589 * preempt-disabled sections.
590 */ 590 */
591 #define for_each_domain(cpu, __sd) \ 591 #define for_each_domain(cpu, __sd) \
592 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 592 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
593 593
594 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 594 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
595 #define this_rq() (&__get_cpu_var(runqueues)) 595 #define this_rq() (&__get_cpu_var(runqueues))
596 #define task_rq(p) cpu_rq(task_cpu(p)) 596 #define task_rq(p) cpu_rq(task_cpu(p))
597 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 597 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
598 #define raw_rq() (&__raw_get_cpu_var(runqueues)) 598 #define raw_rq() (&__raw_get_cpu_var(runqueues))
599 599
600 #ifdef CONFIG_CGROUP_SCHED 600 #ifdef CONFIG_CGROUP_SCHED
601 601
602 /* 602 /*
603 * Return the group to which this task belongs. 603 * Return the group to which this task belongs.
604 * 604 *
605 * We use task_subsys_state_check() and extend the RCU verification 605 * We use task_subsys_state_check() and extend the RCU verification
606 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() 606 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
607 * holds that lock for each task it moves into the cgroup. Therefore 607 * holds that lock for each task it moves into the cgroup. Therefore
608 * by holding that lock, we pin the task to the current cgroup. 608 * by holding that lock, we pin the task to the current cgroup.
609 */ 609 */
610 static inline struct task_group *task_group(struct task_struct *p) 610 static inline struct task_group *task_group(struct task_struct *p)
611 { 611 {
612 struct task_group *tg; 612 struct task_group *tg;
613 struct cgroup_subsys_state *css; 613 struct cgroup_subsys_state *css;
614 614
615 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 615 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
616 lockdep_is_held(&p->pi_lock)); 616 lockdep_is_held(&p->pi_lock));
617 tg = container_of(css, struct task_group, css); 617 tg = container_of(css, struct task_group, css);
618 618
619 return autogroup_task_group(p, tg); 619 return autogroup_task_group(p, tg);
620 } 620 }
621 621
622 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 622 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
623 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 623 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
624 { 624 {
625 #ifdef CONFIG_FAIR_GROUP_SCHED 625 #ifdef CONFIG_FAIR_GROUP_SCHED
626 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 626 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
627 p->se.parent = task_group(p)->se[cpu]; 627 p->se.parent = task_group(p)->se[cpu];
628 #endif 628 #endif
629 629
630 #ifdef CONFIG_RT_GROUP_SCHED 630 #ifdef CONFIG_RT_GROUP_SCHED
631 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 631 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
632 p->rt.parent = task_group(p)->rt_se[cpu]; 632 p->rt.parent = task_group(p)->rt_se[cpu];
633 #endif 633 #endif
634 } 634 }
635 635
636 #else /* CONFIG_CGROUP_SCHED */ 636 #else /* CONFIG_CGROUP_SCHED */
637 637
638 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 638 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
639 static inline struct task_group *task_group(struct task_struct *p) 639 static inline struct task_group *task_group(struct task_struct *p)
640 { 640 {
641 return NULL; 641 return NULL;
642 } 642 }
643 643
644 #endif /* CONFIG_CGROUP_SCHED */ 644 #endif /* CONFIG_CGROUP_SCHED */
645 645
646 static void update_rq_clock_task(struct rq *rq, s64 delta); 646 static void update_rq_clock_task(struct rq *rq, s64 delta);
647 647
648 static void update_rq_clock(struct rq *rq) 648 static void update_rq_clock(struct rq *rq)
649 { 649 {
650 s64 delta; 650 s64 delta;
651 651
652 if (rq->skip_clock_update) 652 if (rq->skip_clock_update)
653 return; 653 return;
654 654
655 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 655 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
656 rq->clock += delta; 656 rq->clock += delta;
657 update_rq_clock_task(rq, delta); 657 update_rq_clock_task(rq, delta);
658 } 658 }
659 659
660 /* 660 /*
661 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 661 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
662 */ 662 */
663 #ifdef CONFIG_SCHED_DEBUG 663 #ifdef CONFIG_SCHED_DEBUG
664 # define const_debug __read_mostly 664 # define const_debug __read_mostly
665 #else 665 #else
666 # define const_debug static const 666 # define const_debug static const
667 #endif 667 #endif
668 668
669 /** 669 /**
670 * runqueue_is_locked - Returns true if the current cpu runqueue is locked 670 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
671 * @cpu: the processor in question. 671 * @cpu: the processor in question.
672 * 672 *
673 * This interface allows printk to be called with the runqueue lock 673 * This interface allows printk to be called with the runqueue lock
674 * held and know whether or not it is OK to wake up the klogd. 674 * held and know whether or not it is OK to wake up the klogd.
675 */ 675 */
676 int runqueue_is_locked(int cpu) 676 int runqueue_is_locked(int cpu)
677 { 677 {
678 return raw_spin_is_locked(&cpu_rq(cpu)->lock); 678 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
679 } 679 }
680 680
681 /* 681 /*
682 * Debugging: various feature bits 682 * Debugging: various feature bits
683 */ 683 */
684 684
685 #define SCHED_FEAT(name, enabled) \ 685 #define SCHED_FEAT(name, enabled) \
686 __SCHED_FEAT_##name , 686 __SCHED_FEAT_##name ,
687 687
688 enum { 688 enum {
689 #include "sched_features.h" 689 #include "sched_features.h"
690 }; 690 };
691 691
692 #undef SCHED_FEAT 692 #undef SCHED_FEAT
693 693
694 #define SCHED_FEAT(name, enabled) \ 694 #define SCHED_FEAT(name, enabled) \
695 (1UL << __SCHED_FEAT_##name) * enabled | 695 (1UL << __SCHED_FEAT_##name) * enabled |
696 696
697 const_debug unsigned int sysctl_sched_features = 697 const_debug unsigned int sysctl_sched_features =
698 #include "sched_features.h" 698 #include "sched_features.h"
699 0; 699 0;
700 700
701 #undef SCHED_FEAT 701 #undef SCHED_FEAT
702 702
703 #ifdef CONFIG_SCHED_DEBUG 703 #ifdef CONFIG_SCHED_DEBUG
704 #define SCHED_FEAT(name, enabled) \ 704 #define SCHED_FEAT(name, enabled) \
705 #name , 705 #name ,
706 706
707 static __read_mostly char *sched_feat_names[] = { 707 static __read_mostly char *sched_feat_names[] = {
708 #include "sched_features.h" 708 #include "sched_features.h"
709 NULL 709 NULL
710 }; 710 };
711 711
712 #undef SCHED_FEAT 712 #undef SCHED_FEAT
713 713
714 static int sched_feat_show(struct seq_file *m, void *v) 714 static int sched_feat_show(struct seq_file *m, void *v)
715 { 715 {
716 int i; 716 int i;
717 717
718 for (i = 0; sched_feat_names[i]; i++) { 718 for (i = 0; sched_feat_names[i]; i++) {
719 if (!(sysctl_sched_features & (1UL << i))) 719 if (!(sysctl_sched_features & (1UL << i)))
720 seq_puts(m, "NO_"); 720 seq_puts(m, "NO_");
721 seq_printf(m, "%s ", sched_feat_names[i]); 721 seq_printf(m, "%s ", sched_feat_names[i]);
722 } 722 }
723 seq_puts(m, "\n"); 723 seq_puts(m, "\n");
724 724
725 return 0; 725 return 0;
726 } 726 }
727 727
728 static ssize_t 728 static ssize_t
729 sched_feat_write(struct file *filp, const char __user *ubuf, 729 sched_feat_write(struct file *filp, const char __user *ubuf,
730 size_t cnt, loff_t *ppos) 730 size_t cnt, loff_t *ppos)
731 { 731 {
732 char buf[64]; 732 char buf[64];
733 char *cmp; 733 char *cmp;
734 int neg = 0; 734 int neg = 0;
735 int i; 735 int i;
736 736
737 if (cnt > 63) 737 if (cnt > 63)
738 cnt = 63; 738 cnt = 63;
739 739
740 if (copy_from_user(&buf, ubuf, cnt)) 740 if (copy_from_user(&buf, ubuf, cnt))
741 return -EFAULT; 741 return -EFAULT;
742 742
743 buf[cnt] = 0; 743 buf[cnt] = 0;
744 cmp = strstrip(buf); 744 cmp = strstrip(buf);
745 745
746 if (strncmp(cmp, "NO_", 3) == 0) { 746 if (strncmp(cmp, "NO_", 3) == 0) {
747 neg = 1; 747 neg = 1;
748 cmp += 3; 748 cmp += 3;
749 } 749 }
750 750
751 for (i = 0; sched_feat_names[i]; i++) { 751 for (i = 0; sched_feat_names[i]; i++) {
752 if (strcmp(cmp, sched_feat_names[i]) == 0) { 752 if (strcmp(cmp, sched_feat_names[i]) == 0) {
753 if (neg) 753 if (neg)
754 sysctl_sched_features &= ~(1UL << i); 754 sysctl_sched_features &= ~(1UL << i);
755 else 755 else
756 sysctl_sched_features |= (1UL << i); 756 sysctl_sched_features |= (1UL << i);
757 break; 757 break;
758 } 758 }
759 } 759 }
760 760
761 if (!sched_feat_names[i]) 761 if (!sched_feat_names[i])
762 return -EINVAL; 762 return -EINVAL;
763 763
764 *ppos += cnt; 764 *ppos += cnt;
765 765
766 return cnt; 766 return cnt;
767 } 767 }
768 768
769 static int sched_feat_open(struct inode *inode, struct file *filp) 769 static int sched_feat_open(struct inode *inode, struct file *filp)
770 { 770 {
771 return single_open(filp, sched_feat_show, NULL); 771 return single_open(filp, sched_feat_show, NULL);
772 } 772 }
773 773
774 static const struct file_operations sched_feat_fops = { 774 static const struct file_operations sched_feat_fops = {
775 .open = sched_feat_open, 775 .open = sched_feat_open,
776 .write = sched_feat_write, 776 .write = sched_feat_write,
777 .read = seq_read, 777 .read = seq_read,
778 .llseek = seq_lseek, 778 .llseek = seq_lseek,
779 .release = single_release, 779 .release = single_release,
780 }; 780 };
781 781
782 static __init int sched_init_debug(void) 782 static __init int sched_init_debug(void)
783 { 783 {
784 debugfs_create_file("sched_features", 0644, NULL, NULL, 784 debugfs_create_file("sched_features", 0644, NULL, NULL,
785 &sched_feat_fops); 785 &sched_feat_fops);
786 786
787 return 0; 787 return 0;
788 } 788 }
789 late_initcall(sched_init_debug); 789 late_initcall(sched_init_debug);
790 790
791 #endif 791 #endif
792 792
793 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 793 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
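The SCHED_FEAT construct above is an X-macro: sched_features.h is included several times with different SCHED_FEAT() definitions, once to build an enum of bit positions, once to build the default feature bitmask, under CONFIG_SCHED_DEBUG once more to build a name table, and finally sched_feat(x) tests a single bit. The following self-contained sketch shows the same trick; because a single file cannot re-include a header of its own, it uses a list macro instead, and the feature names and all demo_* identifiers are illustrative.

#include <stdio.h>

/* Stand-in for re-including sched_features.h: one list, expanded three ways. */
#define DEMO_FEATURES(F) \
    F(GENTLE_FAIR_SLEEPERS, 1) \
    F(START_DEBIT, 1) \
    F(HRTICK, 0)

/* Expansion 1: an enum of bit positions. */
#define F_ENUM(name, enabled) __DEMO_FEAT_##name,
enum { DEMO_FEATURES(F_ENUM) __DEMO_FEAT_NR };
#undef F_ENUM

/* Expansion 2: the default bitmask (enabled features only). */
#define F_MASK(name, enabled) (1UL << __DEMO_FEAT_##name) * (enabled) |
static unsigned long demo_features = DEMO_FEATURES(F_MASK) 0;
#undef F_MASK

/* Expansion 3: a printable name table, as used by the debugfs show/write code. */
#define F_NAME(name, enabled) #name,
static const char *demo_feat_names[] = { DEMO_FEATURES(F_NAME) NULL };
#undef F_NAME

/* Same shape as sched_feat(x): test a single feature bit. */
#define demo_feat(x) (demo_features & (1UL << __DEMO_FEAT_##x))

int main(void)
{
    for (int i = 0; demo_feat_names[i]; i++)
        printf("%s%s ", (demo_features & (1UL << i)) ? "" : "NO_",
               demo_feat_names[i]);
    printf("\nHRTICK enabled? %s\n", demo_feat(HRTICK) ? "yes" : "no");
    return 0;
}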
794 794
795 /* 795 /*
796 * Number of tasks to iterate in a single balance run. 796 * Number of tasks to iterate in a single balance run.
797 * Limited because this is done with IRQs disabled. 797 * Limited because this is done with IRQs disabled.
798 */ 798 */
799 const_debug unsigned int sysctl_sched_nr_migrate = 32; 799 const_debug unsigned int sysctl_sched_nr_migrate = 32;
800 800
801 /* 801 /*
802 * period over which we average the RT time consumption, measured 802 * period over which we average the RT time consumption, measured
803 * in ms. 803 * in ms.
804 * 804 *
805 * default: 1s 805 * default: 1s
806 */ 806 */
807 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 807 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
808 808
809 /* 809 /*
810 * period over which we measure -rt task cpu usage in us. 810 * period over which we measure -rt task cpu usage in us.
811 * default: 1s 811 * default: 1s
812 */ 812 */
813 unsigned int sysctl_sched_rt_period = 1000000; 813 unsigned int sysctl_sched_rt_period = 1000000;
814 814
815 static __read_mostly int scheduler_running; 815 static __read_mostly int scheduler_running;
816 816
817 /* 817 /*
818 * part of the period that we allow rt tasks to run in us. 818 * part of the period that we allow rt tasks to run in us.
819 * default: 0.95s 819 * default: 0.95s
820 */ 820 */
821 int sysctl_sched_rt_runtime = 950000; 821 int sysctl_sched_rt_runtime = 950000;
822 822
823 static inline u64 global_rt_period(void) 823 static inline u64 global_rt_period(void)
824 { 824 {
825 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 825 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
826 } 826 }
827 827
828 static inline u64 global_rt_runtime(void) 828 static inline u64 global_rt_runtime(void)
829 { 829 {
830 if (sysctl_sched_rt_runtime < 0) 830 if (sysctl_sched_rt_runtime < 0)
831 return RUNTIME_INF; 831 return RUNTIME_INF;
832 832
833 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 833 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
834 } 834 }
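As a quick worked example of the two helpers above with the default sysctl values: the period is 1,000,000 us = 1 s and the runtime is 950,000 us = 0.95 s, so realtime tasks may by default consume at most 95% of every one-second period, while a negative runtime maps to RUNTIME_INF (no throttling). The standalone sketch below redoes that conversion; NSEC_PER_USEC and RUNTIME_INF are redefined locally for illustration only.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL
#define RUNTIME_INF ((uint64_t)~0ULL)

static int sysctl_sched_rt_period = 1000000;  /* us, default: 1s    */
static int sysctl_sched_rt_runtime = 950000;  /* us, default: 0.95s */

static uint64_t demo_rt_period(void)
{
    return (uint64_t)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static uint64_t demo_rt_runtime(void)
{
    if (sysctl_sched_rt_runtime < 0)
        return RUNTIME_INF;
    return (uint64_t)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

int main(void)
{
    printf("period   = %llu ns\n", (unsigned long long)demo_rt_period());
    printf("runtime  = %llu ns\n", (unsigned long long)demo_rt_runtime());
    printf("rt share = %.0f%% of each period\n",
           100.0 * demo_rt_runtime() / demo_rt_period());
    return 0;
}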
835 835
836 #ifndef prepare_arch_switch 836 #ifndef prepare_arch_switch
837 # define prepare_arch_switch(next) do { } while (0) 837 # define prepare_arch_switch(next) do { } while (0)
838 #endif 838 #endif
839 #ifndef finish_arch_switch 839 #ifndef finish_arch_switch
840 # define finish_arch_switch(prev) do { } while (0) 840 # define finish_arch_switch(prev) do { } while (0)
841 #endif 841 #endif
842 842
843 static inline int task_current(struct rq *rq, struct task_struct *p) 843 static inline int task_current(struct rq *rq, struct task_struct *p)
844 { 844 {
845 return rq->curr == p; 845 return rq->curr == p;
846 } 846 }
847 847
848 static inline int task_running(struct rq *rq, struct task_struct *p) 848 static inline int task_running(struct rq *rq, struct task_struct *p)
849 { 849 {
850 #ifdef CONFIG_SMP 850 #ifdef CONFIG_SMP
851 return p->on_cpu; 851 return p->on_cpu;
852 #else 852 #else
853 return task_current(rq, p); 853 return task_current(rq, p);
854 #endif 854 #endif
855 } 855 }
856 856
857 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 857 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
858 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 858 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
859 { 859 {
860 #ifdef CONFIG_SMP 860 #ifdef CONFIG_SMP
861 /* 861 /*
862 * We can optimise this out completely for !SMP, because the 862 * We can optimise this out completely for !SMP, because the
863 * SMP rebalancing from interrupt is the only thing that cares 863 * SMP rebalancing from interrupt is the only thing that cares
864 * here. 864 * here.
865 */ 865 */
866 next->on_cpu = 1; 866 next->on_cpu = 1;
867 #endif 867 #endif
868 } 868 }
869 869
870 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 870 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
871 { 871 {
872 #ifdef CONFIG_SMP 872 #ifdef CONFIG_SMP
873 /* 873 /*
874 * After ->on_cpu is cleared, the task can be moved to a different CPU. 874 * After ->on_cpu is cleared, the task can be moved to a different CPU.
875 * We must ensure this doesn't happen until the switch is completely 875 * We must ensure this doesn't happen until the switch is completely
876 * finished. 876 * finished.
877 */ 877 */
878 smp_wmb(); 878 smp_wmb();
879 prev->on_cpu = 0; 879 prev->on_cpu = 0;
880 #endif 880 #endif
881 #ifdef CONFIG_DEBUG_SPINLOCK 881 #ifdef CONFIG_DEBUG_SPINLOCK
882 /* this is a valid case when another task releases the spinlock */ 882 /* this is a valid case when another task releases the spinlock */
883 rq->lock.owner = current; 883 rq->lock.owner = current;
884 #endif 884 #endif
885 /* 885 /*
886 * If we are tracking spinlock dependencies then we have to 886 * If we are tracking spinlock dependencies then we have to
887 * fix up the runqueue lock - which gets 'carried over' from 887 * fix up the runqueue lock - which gets 'carried over' from
888 * prev into current: 888 * prev into current:
889 */ 889 */
890 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 890 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
891 891
892 raw_spin_unlock_irq(&rq->lock); 892 raw_spin_unlock_irq(&rq->lock);
893 } 893 }
894 894
895 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 895 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
896 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 896 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
897 { 897 {
898 #ifdef CONFIG_SMP 898 #ifdef CONFIG_SMP
899 /* 899 /*
900 * We can optimise this out completely for !SMP, because the 900 * We can optimise this out completely for !SMP, because the
901 * SMP rebalancing from interrupt is the only thing that cares 901 * SMP rebalancing from interrupt is the only thing that cares
902 * here. 902 * here.
903 */ 903 */
904 next->on_cpu = 1; 904 next->on_cpu = 1;
905 #endif 905 #endif
906 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 906 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
907 raw_spin_unlock_irq(&rq->lock); 907 raw_spin_unlock_irq(&rq->lock);
908 #else 908 #else
909 raw_spin_unlock(&rq->lock); 909 raw_spin_unlock(&rq->lock);
910 #endif 910 #endif
911 } 911 }
912 912
913 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 913 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
914 { 914 {
915 #ifdef CONFIG_SMP 915 #ifdef CONFIG_SMP
916 /* 916 /*
917 * After ->on_cpu is cleared, the task can be moved to a different CPU. 917 * After ->on_cpu is cleared, the task can be moved to a different CPU.
918 * We must ensure this doesn't happen until the switch is completely 918 * We must ensure this doesn't happen until the switch is completely
919 * finished. 919 * finished.
920 */ 920 */
921 smp_wmb(); 921 smp_wmb();
922 prev->on_cpu = 0; 922 prev->on_cpu = 0;
923 #endif 923 #endif
924 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 924 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
925 local_irq_enable(); 925 local_irq_enable();
926 #endif 926 #endif
927 } 927 }
928 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 928 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
929 929
930 /* 930 /*
931 * __task_rq_lock - lock the rq @p resides on. 931 * __task_rq_lock - lock the rq @p resides on.
932 */ 932 */
933 static inline struct rq *__task_rq_lock(struct task_struct *p) 933 static inline struct rq *__task_rq_lock(struct task_struct *p)
934 __acquires(rq->lock) 934 __acquires(rq->lock)
935 { 935 {
936 struct rq *rq; 936 struct rq *rq;
937 937
938 lockdep_assert_held(&p->pi_lock); 938 lockdep_assert_held(&p->pi_lock);
939 939
940 for (;;) { 940 for (;;) {
941 rq = task_rq(p); 941 rq = task_rq(p);
942 raw_spin_lock(&rq->lock); 942 raw_spin_lock(&rq->lock);
943 if (likely(rq == task_rq(p))) 943 if (likely(rq == task_rq(p)))
944 return rq; 944 return rq;
945 raw_spin_unlock(&rq->lock); 945 raw_spin_unlock(&rq->lock);
946 } 946 }
947 } 947 }
948 948
949 /* 949 /*
950 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 950 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
951 */ 951 */
952 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 952 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
953 __acquires(p->pi_lock) 953 __acquires(p->pi_lock)
954 __acquires(rq->lock) 954 __acquires(rq->lock)
955 { 955 {
956 struct rq *rq; 956 struct rq *rq;
957 957
958 for (;;) { 958 for (;;) {
959 raw_spin_lock_irqsave(&p->pi_lock, *flags); 959 raw_spin_lock_irqsave(&p->pi_lock, *flags);
960 rq = task_rq(p); 960 rq = task_rq(p);
961 raw_spin_lock(&rq->lock); 961 raw_spin_lock(&rq->lock);
962 if (likely(rq == task_rq(p))) 962 if (likely(rq == task_rq(p)))
963 return rq; 963 return rq;
964 raw_spin_unlock(&rq->lock); 964 raw_spin_unlock(&rq->lock);
965 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 965 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
966 } 966 }
967 } 967 }
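Both lock helpers above use the same retry idiom: look up the runqueue the task appears to be on, take its lock, then re-check task_rq(p) under that lock, because the task may have been migrated between the lookup and the acquisition. A minimal userspace sketch of that check-lock-recheck loop follows, using pthreads; the demo_* names and the fixed two-CPU array are illustrative assumptions.

#include <pthread.h>
#include <stdio.h>

#define DEMO_NR_CPUS 2

static struct demo_rq { pthread_mutex_t lock; } demo_runqueues[DEMO_NR_CPUS] = {
    { PTHREAD_MUTEX_INITIALIZER }, { PTHREAD_MUTEX_INITIALIZER },
};

struct demo_task { int cpu; /* which runqueue the task is currently on */ };

static struct demo_rq *demo_task_rq(struct demo_task *p)
{
    return &demo_runqueues[p->cpu];
}

/*
 * Lock the runqueue the task is on.  Since p->cpu can change while we hold
 * no lock, re-check it after acquiring and retry if the task moved in the
 * meantime -- the same loop as __task_rq_lock() above.
 */
static struct demo_rq *demo_task_rq_lock(struct demo_task *p)
{
    for (;;) {
        struct demo_rq *rq = demo_task_rq(p);

        pthread_mutex_lock(&rq->lock);
        if (rq == demo_task_rq(p))
            return rq;                    /* still on the same runqueue */
        pthread_mutex_unlock(&rq->lock);  /* migrated: try again */
    }
}

int main(void)
{
    struct demo_task t = { .cpu = 1 };
    struct demo_rq *rq = demo_task_rq_lock(&t);

    printf("locked the runqueue of cpu %d\n", t.cpu);
    pthread_mutex_unlock(&rq->lock);
    return 0;
}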
968 968
969 static void __task_rq_unlock(struct rq *rq) 969 static void __task_rq_unlock(struct rq *rq)
970 __releases(rq->lock) 970 __releases(rq->lock)
971 { 971 {
972 raw_spin_unlock(&rq->lock); 972 raw_spin_unlock(&rq->lock);
973 } 973 }
974 974
975 static inline void 975 static inline void
976 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 976 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
977 __releases(rq->lock) 977 __releases(rq->lock)
978 __releases(p->pi_lock) 978 __releases(p->pi_lock)
979 { 979 {
980 raw_spin_unlock(&rq->lock); 980 raw_spin_unlock(&rq->lock);
981 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 981 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
982 } 982 }
983 983
984 /* 984 /*
985 * this_rq_lock - lock this runqueue and disable interrupts. 985 * this_rq_lock - lock this runqueue and disable interrupts.
986 */ 986 */
987 static struct rq *this_rq_lock(void) 987 static struct rq *this_rq_lock(void)
988 __acquires(rq->lock) 988 __acquires(rq->lock)
989 { 989 {
990 struct rq *rq; 990 struct rq *rq;
991 991
992 local_irq_disable(); 992 local_irq_disable();
993 rq = this_rq(); 993 rq = this_rq();
994 raw_spin_lock(&rq->lock); 994 raw_spin_lock(&rq->lock);
995 995
996 return rq; 996 return rq;
997 } 997 }
998 998
999 #ifdef CONFIG_SCHED_HRTICK 999 #ifdef CONFIG_SCHED_HRTICK
1000 /* 1000 /*
1001 * Use HR-timers to deliver accurate preemption points. 1001 * Use HR-timers to deliver accurate preemption points.
1002 * 1002 *
1003 * It's all a bit involved since we cannot program an hrtimer while holding the 1003 * It's all a bit involved since we cannot program an hrtimer while holding the
1004 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a 1004 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
1005 * reschedule event. 1005 * reschedule event.
1006 * 1006 *
1007 * When we get rescheduled we reprogram the hrtick_timer outside of the 1007 * When we get rescheduled we reprogram the hrtick_timer outside of the
1008 * rq->lock. 1008 * rq->lock.
1009 */ 1009 */
1010 1010
1011 /* 1011 /*
1012 * Use hrtick when: 1012 * Use hrtick when:
1013 * - enabled by features 1013 * - enabled by features
1014 * - hrtimer is actually high res 1014 * - hrtimer is actually high res
1015 */ 1015 */
1016 static inline int hrtick_enabled(struct rq *rq) 1016 static inline int hrtick_enabled(struct rq *rq)
1017 { 1017 {
1018 if (!sched_feat(HRTICK)) 1018 if (!sched_feat(HRTICK))
1019 return 0; 1019 return 0;
1020 if (!cpu_active(cpu_of(rq))) 1020 if (!cpu_active(cpu_of(rq)))
1021 return 0; 1021 return 0;
1022 return hrtimer_is_hres_active(&rq->hrtick_timer); 1022 return hrtimer_is_hres_active(&rq->hrtick_timer);
1023 } 1023 }
1024 1024
1025 static void hrtick_clear(struct rq *rq) 1025 static void hrtick_clear(struct rq *rq)
1026 { 1026 {
1027 if (hrtimer_active(&rq->hrtick_timer)) 1027 if (hrtimer_active(&rq->hrtick_timer))
1028 hrtimer_cancel(&rq->hrtick_timer); 1028 hrtimer_cancel(&rq->hrtick_timer);
1029 } 1029 }
1030 1030
1031 /* 1031 /*
1032 * High-resolution timer tick. 1032 * High-resolution timer tick.
1033 * Runs from hardirq context with interrupts disabled. 1033 * Runs from hardirq context with interrupts disabled.
1034 */ 1034 */
1035 static enum hrtimer_restart hrtick(struct hrtimer *timer) 1035 static enum hrtimer_restart hrtick(struct hrtimer *timer)
1036 { 1036 {
1037 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 1037 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1038 1038
1039 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1039 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1040 1040
1041 raw_spin_lock(&rq->lock); 1041 raw_spin_lock(&rq->lock);
1042 update_rq_clock(rq); 1042 update_rq_clock(rq);
1043 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1043 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1044 raw_spin_unlock(&rq->lock); 1044 raw_spin_unlock(&rq->lock);
1045 1045
1046 return HRTIMER_NORESTART; 1046 return HRTIMER_NORESTART;
1047 } 1047 }
1048 1048
1049 #ifdef CONFIG_SMP 1049 #ifdef CONFIG_SMP
1050 /* 1050 /*
1051 * called from hardirq (IPI) context 1051 * called from hardirq (IPI) context
1052 */ 1052 */
1053 static void __hrtick_start(void *arg) 1053 static void __hrtick_start(void *arg)
1054 { 1054 {
1055 struct rq *rq = arg; 1055 struct rq *rq = arg;
1056 1056
1057 raw_spin_lock(&rq->lock); 1057 raw_spin_lock(&rq->lock);
1058 hrtimer_restart(&rq->hrtick_timer); 1058 hrtimer_restart(&rq->hrtick_timer);
1059 rq->hrtick_csd_pending = 0; 1059 rq->hrtick_csd_pending = 0;
1060 raw_spin_unlock(&rq->lock); 1060 raw_spin_unlock(&rq->lock);
1061 } 1061 }
1062 1062
1063 /* 1063 /*
1064 * Called to set the hrtick timer state. 1064 * Called to set the hrtick timer state.
1065 * 1065 *
1066 * called with rq->lock held and irqs disabled 1066 * called with rq->lock held and irqs disabled
1067 */ 1067 */
1068 static void hrtick_start(struct rq *rq, u64 delay) 1068 static void hrtick_start(struct rq *rq, u64 delay)
1069 { 1069 {
1070 struct hrtimer *timer = &rq->hrtick_timer; 1070 struct hrtimer *timer = &rq->hrtick_timer;
1071 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 1071 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1072 1072
1073 hrtimer_set_expires(timer, time); 1073 hrtimer_set_expires(timer, time);
1074 1074
1075 if (rq == this_rq()) { 1075 if (rq == this_rq()) {
1076 hrtimer_restart(timer); 1076 hrtimer_restart(timer);
1077 } else if (!rq->hrtick_csd_pending) { 1077 } else if (!rq->hrtick_csd_pending) {
1078 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 1078 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1079 rq->hrtick_csd_pending = 1; 1079 rq->hrtick_csd_pending = 1;
1080 } 1080 }
1081 } 1081 }
1082 1082
1083 static int 1083 static int
1084 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 1084 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1085 { 1085 {
1086 int cpu = (int)(long)hcpu; 1086 int cpu = (int)(long)hcpu;
1087 1087
1088 switch (action) { 1088 switch (action) {
1089 case CPU_UP_CANCELED: 1089 case CPU_UP_CANCELED:
1090 case CPU_UP_CANCELED_FROZEN: 1090 case CPU_UP_CANCELED_FROZEN:
1091 case CPU_DOWN_PREPARE: 1091 case CPU_DOWN_PREPARE:
1092 case CPU_DOWN_PREPARE_FROZEN: 1092 case CPU_DOWN_PREPARE_FROZEN:
1093 case CPU_DEAD: 1093 case CPU_DEAD:
1094 case CPU_DEAD_FROZEN: 1094 case CPU_DEAD_FROZEN:
1095 hrtick_clear(cpu_rq(cpu)); 1095 hrtick_clear(cpu_rq(cpu));
1096 return NOTIFY_OK; 1096 return NOTIFY_OK;
1097 } 1097 }
1098 1098
1099 return NOTIFY_DONE; 1099 return NOTIFY_DONE;
1100 } 1100 }
1101 1101
1102 static __init void init_hrtick(void) 1102 static __init void init_hrtick(void)
1103 { 1103 {
1104 hotcpu_notifier(hotplug_hrtick, 0); 1104 hotcpu_notifier(hotplug_hrtick, 0);
1105 } 1105 }
1106 #else 1106 #else
1107 /* 1107 /*
1108 * Called to set the hrtick timer state. 1108 * Called to set the hrtick timer state.
1109 * 1109 *
1110 * called with rq->lock held and irqs disabled 1110 * called with rq->lock held and irqs disabled
1111 */ 1111 */
1112 static void hrtick_start(struct rq *rq, u64 delay) 1112 static void hrtick_start(struct rq *rq, u64 delay)
1113 { 1113 {
1114 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 1114 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1115 HRTIMER_MODE_REL_PINNED, 0); 1115 HRTIMER_MODE_REL_PINNED, 0);
1116 } 1116 }
1117 1117
1118 static inline void init_hrtick(void) 1118 static inline void init_hrtick(void)
1119 { 1119 {
1120 } 1120 }
1121 #endif /* CONFIG_SMP */ 1121 #endif /* CONFIG_SMP */
1122 1122
1123 static void init_rq_hrtick(struct rq *rq) 1123 static void init_rq_hrtick(struct rq *rq)
1124 { 1124 {
1125 #ifdef CONFIG_SMP 1125 #ifdef CONFIG_SMP
1126 rq->hrtick_csd_pending = 0; 1126 rq->hrtick_csd_pending = 0;
1127 1127
1128 rq->hrtick_csd.flags = 0; 1128 rq->hrtick_csd.flags = 0;
1129 rq->hrtick_csd.func = __hrtick_start; 1129 rq->hrtick_csd.func = __hrtick_start;
1130 rq->hrtick_csd.info = rq; 1130 rq->hrtick_csd.info = rq;
1131 #endif 1131 #endif
1132 1132
1133 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1133 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1134 rq->hrtick_timer.function = hrtick; 1134 rq->hrtick_timer.function = hrtick;
1135 } 1135 }
1136 #else /* CONFIG_SCHED_HRTICK */ 1136 #else /* CONFIG_SCHED_HRTICK */
1137 static inline void hrtick_clear(struct rq *rq) 1137 static inline void hrtick_clear(struct rq *rq)
1138 { 1138 {
1139 } 1139 }
1140 1140
1141 static inline void init_rq_hrtick(struct rq *rq) 1141 static inline void init_rq_hrtick(struct rq *rq)
1142 { 1142 {
1143 } 1143 }
1144 1144
1145 static inline void init_hrtick(void) 1145 static inline void init_hrtick(void)
1146 { 1146 {
1147 } 1147 }
1148 #endif /* CONFIG_SCHED_HRTICK */ 1148 #endif /* CONFIG_SCHED_HRTICK */
1149 1149
1150 /* 1150 /*
1151 * resched_task - mark a task 'to be rescheduled now'. 1151 * resched_task - mark a task 'to be rescheduled now'.
1152 * 1152 *
1153 * On UP this means the setting of the need_resched flag, on SMP it 1153 * On UP this means the setting of the need_resched flag, on SMP it
1154 * might also involve a cross-CPU call to trigger the scheduler on 1154 * might also involve a cross-CPU call to trigger the scheduler on
1155 * the target CPU. 1155 * the target CPU.
1156 */ 1156 */
1157 #ifdef CONFIG_SMP 1157 #ifdef CONFIG_SMP
1158 1158
1159 #ifndef tsk_is_polling 1159 #ifndef tsk_is_polling
1160 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1160 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1161 #endif 1161 #endif
1162 1162
1163 static void resched_task(struct task_struct *p) 1163 static void resched_task(struct task_struct *p)
1164 { 1164 {
1165 int cpu; 1165 int cpu;
1166 1166
1167 assert_raw_spin_locked(&task_rq(p)->lock); 1167 assert_raw_spin_locked(&task_rq(p)->lock);
1168 1168
1169 if (test_tsk_need_resched(p)) 1169 if (test_tsk_need_resched(p))
1170 return; 1170 return;
1171 1171
1172 set_tsk_need_resched(p); 1172 set_tsk_need_resched(p);
1173 1173
1174 cpu = task_cpu(p); 1174 cpu = task_cpu(p);
1175 if (cpu == smp_processor_id()) 1175 if (cpu == smp_processor_id())
1176 return; 1176 return;
1177 1177
1178 /* NEED_RESCHED must be visible before we test polling */ 1178 /* NEED_RESCHED must be visible before we test polling */
1179 smp_mb(); 1179 smp_mb();
1180 if (!tsk_is_polling(p)) 1180 if (!tsk_is_polling(p))
1181 smp_send_reschedule(cpu); 1181 smp_send_reschedule(cpu);
1182 } 1182 }
1183 1183
1184 static void resched_cpu(int cpu) 1184 static void resched_cpu(int cpu)
1185 { 1185 {
1186 struct rq *rq = cpu_rq(cpu); 1186 struct rq *rq = cpu_rq(cpu);
1187 unsigned long flags; 1187 unsigned long flags;
1188 1188
1189 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 1189 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1190 return; 1190 return;
1191 resched_task(cpu_curr(cpu)); 1191 resched_task(cpu_curr(cpu));
1192 raw_spin_unlock_irqrestore(&rq->lock, flags); 1192 raw_spin_unlock_irqrestore(&rq->lock, flags);
1193 } 1193 }
1194 1194
1195 #ifdef CONFIG_NO_HZ 1195 #ifdef CONFIG_NO_HZ
1196 /* 1196 /*
1197 * In the semi idle case, use the nearest busy cpu for migrating timers 1197 * In the semi idle case, use the nearest busy cpu for migrating timers
1198 * from an idle cpu. This is good for power-savings. 1198 * from an idle cpu. This is good for power-savings.
1199 * 1199 *
1200 * We don't do a similar optimization for a completely idle system, as 1200 * We don't do a similar optimization for a completely idle system, as
1201 * selecting an idle cpu will add more delay to the timers than intended 1201 * selecting an idle cpu will add more delay to the timers than intended
1202 * (as that cpu's timer base may not be up to date wrt jiffies etc). 1202 * (as that cpu's timer base may not be up to date wrt jiffies etc).
1203 */ 1203 */
1204 int get_nohz_timer_target(void) 1204 int get_nohz_timer_target(void)
1205 { 1205 {
1206 int cpu = smp_processor_id(); 1206 int cpu = smp_processor_id();
1207 int i; 1207 int i;
1208 struct sched_domain *sd; 1208 struct sched_domain *sd;
1209 1209
1210 for_each_domain(cpu, sd) { 1210 for_each_domain(cpu, sd) {
1211 for_each_cpu(i, sched_domain_span(sd)) 1211 for_each_cpu(i, sched_domain_span(sd))
1212 if (!idle_cpu(i)) 1212 if (!idle_cpu(i))
1213 return i; 1213 return i;
1214 } 1214 }
1215 return cpu; 1215 return cpu;
1216 } 1216 }
1217 /* 1217 /*
1218 * When add_timer_on() enqueues a timer into the timer wheel of an 1218 * When add_timer_on() enqueues a timer into the timer wheel of an
1219 * idle CPU then this timer might expire before the next timer event 1219 * idle CPU then this timer might expire before the next timer event
1220 * which is scheduled to wake up that CPU. In case of a completely 1220 * which is scheduled to wake up that CPU. In case of a completely
1221 * idle system the next event might even be infinite time into the 1221 * idle system the next event might even be infinite time into the
1222 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1222 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1223 * leaves the inner idle loop so the newly added timer is taken into 1223 * leaves the inner idle loop so the newly added timer is taken into
1224 * account when the CPU goes back to idle and evaluates the timer 1224 * account when the CPU goes back to idle and evaluates the timer
1225 * wheel for the next timer event. 1225 * wheel for the next timer event.
1226 */ 1226 */
1227 void wake_up_idle_cpu(int cpu) 1227 void wake_up_idle_cpu(int cpu)
1228 { 1228 {
1229 struct rq *rq = cpu_rq(cpu); 1229 struct rq *rq = cpu_rq(cpu);
1230 1230
1231 if (cpu == smp_processor_id()) 1231 if (cpu == smp_processor_id())
1232 return; 1232 return;
1233 1233
1234 /* 1234 /*
1235 * This is safe, as this function is called with the timer 1235 * This is safe, as this function is called with the timer
1236 * wheel base lock of (cpu) held. When the CPU is on the way 1236 * wheel base lock of (cpu) held. When the CPU is on the way
1237 * to idle and has not yet set rq->curr to idle then it will 1237 * to idle and has not yet set rq->curr to idle then it will
1238 * be serialized on the timer wheel base lock and take the new 1238 * be serialized on the timer wheel base lock and take the new
1239 * timer into account automatically. 1239 * timer into account automatically.
1240 */ 1240 */
1241 if (rq->curr != rq->idle) 1241 if (rq->curr != rq->idle)
1242 return; 1242 return;
1243 1243
1244 /* 1244 /*
1245 * We can set TIF_RESCHED on the idle task of the other CPU 1245 * We can set TIF_RESCHED on the idle task of the other CPU
1246 * lockless. The worst case is that the other CPU runs the 1246 * lockless. The worst case is that the other CPU runs the
1247 * idle task through an additional NOOP schedule() 1247 * idle task through an additional NOOP schedule()
1248 */ 1248 */
1249 set_tsk_need_resched(rq->idle); 1249 set_tsk_need_resched(rq->idle);
1250 1250
1251 /* NEED_RESCHED must be visible before we test polling */ 1251 /* NEED_RESCHED must be visible before we test polling */
1252 smp_mb(); 1252 smp_mb();
1253 if (!tsk_is_polling(rq->idle)) 1253 if (!tsk_is_polling(rq->idle))
1254 smp_send_reschedule(cpu); 1254 smp_send_reschedule(cpu);
1255 } 1255 }
1256 1256
1257 #endif /* CONFIG_NO_HZ */ 1257 #endif /* CONFIG_NO_HZ */
1258 1258
1259 static u64 sched_avg_period(void) 1259 static u64 sched_avg_period(void)
1260 { 1260 {
1261 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 1261 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1262 } 1262 }
1263 1263
1264 static void sched_avg_update(struct rq *rq) 1264 static void sched_avg_update(struct rq *rq)
1265 { 1265 {
1266 s64 period = sched_avg_period(); 1266 s64 period = sched_avg_period();
1267 1267
1268 while ((s64)(rq->clock - rq->age_stamp) > period) { 1268 while ((s64)(rq->clock - rq->age_stamp) > period) {
1269 /* 1269 /*
1270 * Inline assembly required to prevent the compiler 1270 * Inline assembly required to prevent the compiler
1271 * optimising this loop into a divmod call. 1271 * optimising this loop into a divmod call.
1272 * See __iter_div_u64_rem() for another example of this. 1272 * See __iter_div_u64_rem() for another example of this.
1273 */ 1273 */
1274 asm("" : "+rm" (rq->age_stamp)); 1274 asm("" : "+rm" (rq->age_stamp));
1275 rq->age_stamp += period; 1275 rq->age_stamp += period;
1276 rq->rt_avg /= 2; 1276 rq->rt_avg /= 2;
1277 } 1277 }
1278 } 1278 }
1279 1279
1280 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1280 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1281 { 1281 {
1282 rq->rt_avg += rt_delta; 1282 rq->rt_avg += rt_delta;
1283 sched_avg_update(rq); 1283 sched_avg_update(rq);
1284 } 1284 }
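The pair of functions above accumulates RT runtime into rq->rt_avg and then ages it: sched_avg_period() is half of sysctl_sched_time_avg (500 ms by default), and sched_avg_update() halves rt_avg once for every full period the clock has moved past, so the average decays geometrically instead of being recomputed with a division. A small standalone sketch of that behaviour follows; the demo_* names are illustrative and the kernel's inline-asm compiler barrier is deliberately dropped.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

static unsigned int demo_sched_time_avg = 1000;   /* ms, mirrors the 1s default */

struct demo_rq {
    uint64_t clock;      /* current time, in ns */
    uint64_t age_stamp;  /* start of the current averaging window, in ns */
    uint64_t rt_avg;     /* decaying sum of RT runtime, in ns */
};

static int64_t demo_avg_period(void)
{
    return (int64_t)demo_sched_time_avg * NSEC_PER_MSEC / 2;   /* 500 ms */
}

/* Age rt_avg: halve it once for every full period the clock has moved past. */
static void demo_avg_update(struct demo_rq *rq)
{
    int64_t period = demo_avg_period();

    while ((int64_t)(rq->clock - rq->age_stamp) > period) {
        rq->age_stamp += period;
        rq->rt_avg /= 2;
    }
}

/* Account rt_delta ns of RT execution, then age the average. */
static void demo_rt_avg_update(struct demo_rq *rq, uint64_t rt_delta)
{
    rq->rt_avg += rt_delta;
    demo_avg_update(rq);
}

int main(void)
{
    struct demo_rq rq = { .clock = 0, .age_stamp = 0, .rt_avg = 0 };

    demo_rt_avg_update(&rq, 400 * NSEC_PER_MSEC);   /* 400 ms of RT time */

    /* 1.2 s later two full 500 ms periods have elapsed: 400 -> 200 -> 100 ms. */
    rq.clock += 1200 * NSEC_PER_MSEC;
    demo_avg_update(&rq);
    printf("rt_avg: %llu ms\n", (unsigned long long)(rq.rt_avg / NSEC_PER_MSEC));
    return 0;
}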
1285 1285
1286 #else /* !CONFIG_SMP */ 1286 #else /* !CONFIG_SMP */
1287 static void resched_task(struct task_struct *p) 1287 static void resched_task(struct task_struct *p)
1288 { 1288 {
1289 assert_raw_spin_locked(&task_rq(p)->lock); 1289 assert_raw_spin_locked(&task_rq(p)->lock);
1290 set_tsk_need_resched(p); 1290 set_tsk_need_resched(p);
1291 } 1291 }
1292 1292
1293 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1293 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1294 { 1294 {
1295 } 1295 }
1296 1296
1297 static void sched_avg_update(struct rq *rq) 1297 static void sched_avg_update(struct rq *rq)
1298 { 1298 {
1299 } 1299 }
1300 #endif /* CONFIG_SMP */ 1300 #endif /* CONFIG_SMP */
1301 1301
1302 #if BITS_PER_LONG == 32 1302 #if BITS_PER_LONG == 32
1303 # define WMULT_CONST (~0UL) 1303 # define WMULT_CONST (~0UL)
1304 #else 1304 #else
1305 # define WMULT_CONST (1UL << 32) 1305 # define WMULT_CONST (1UL << 32)
1306 #endif 1306 #endif
1307 1307
1308 #define WMULT_SHIFT 32 1308 #define WMULT_SHIFT 32
1309 1309
1310 /* 1310 /*
1311 * Shift right and round: 1311 * Shift right and round:
1312 */ 1312 */
1313 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1313 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1314 1314
1315 /* 1315 /*
1316 * delta *= weight / lw 1316 * delta *= weight / lw
1317 */ 1317 */
1318 static unsigned long 1318 static unsigned long
1319 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1319 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1320 struct load_weight *lw) 1320 struct load_weight *lw)
1321 { 1321 {
1322 u64 tmp; 1322 u64 tmp;
1323 1323
1324 if (!lw->inv_weight) { 1324 if (!lw->inv_weight) {
1325 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1325 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1326 lw->inv_weight = 1; 1326 lw->inv_weight = 1;
1327 else 1327 else
1328 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1328 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1329 / (lw->weight+1); 1329 / (lw->weight+1);
1330 } 1330 }
1331 1331
1332 tmp = (u64)delta_exec * weight; 1332 tmp = (u64)delta_exec * weight;
1333 /* 1333 /*
1334 * Check whether we'd overflow the 64-bit multiplication: 1334 * Check whether we'd overflow the 64-bit multiplication:
1335 */ 1335 */
1336 if (unlikely(tmp > WMULT_CONST)) 1336 if (unlikely(tmp > WMULT_CONST))
1337 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1337 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1338 WMULT_SHIFT/2); 1338 WMULT_SHIFT/2);
1339 else 1339 else
1340 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1340 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1341 1341
1342 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1342 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1343 } 1343 }
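calc_delta_mine() above computes delta * weight / lw->weight without a 64-bit division on the hot path: it caches inv_weight, roughly 2^32 / lw->weight, and turns the division into a multiply followed by a rounded 32-bit shift (the SRR() macro). The standalone sketch below checks that fixed-point trick against plain integer division; it uses illustrative demo_* names, recomputes the inverse on every call, and omits the overflow split that the kernel applies for very large products, so the two results agree only to within a small rounding error.

#include <stdio.h>
#include <stdint.h>

#define DEMO_WMULT_CONST (1ULL << 32)
#define DEMO_WMULT_SHIFT 32

/* Shift right and round, as in the SRR() macro above. */
#define DEMO_SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

/*
 * delta * weight / lw_weight, approximated as
 * (delta * weight * inv_weight) >> 32, with inv_weight ~= 2^32 / lw_weight.
 */
static uint64_t demo_calc_delta(uint64_t delta, uint64_t weight, uint64_t lw_weight)
{
    uint32_t inv_weight = 1 + (DEMO_WMULT_CONST - lw_weight / 2) / (lw_weight + 1);

    return DEMO_SRR(delta * weight * inv_weight, DEMO_WMULT_SHIFT);
}

int main(void)
{
    /* 4 ms of runtime for a nice-0 task (weight 1024) on a cfs_rq of weight 3072. */
    uint64_t delta = 4000000, weight = 1024, lw = 3072;

    printf("fixed-point: %llu ns\n",
           (unsigned long long)demo_calc_delta(delta, weight, lw));
    printf("exact      : %llu ns\n",
           (unsigned long long)(delta * weight / lw));
    return 0;
}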
1344 1344
1345 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1345 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1346 { 1346 {
1347 lw->weight += inc; 1347 lw->weight += inc;
1348 lw->inv_weight = 0; 1348 lw->inv_weight = 0;
1349 } 1349 }
1350 1350
1351 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1351 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1352 { 1352 {
1353 lw->weight -= dec; 1353 lw->weight -= dec;
1354 lw->inv_weight = 0; 1354 lw->inv_weight = 0;
1355 } 1355 }
1356 1356
1357 static inline void update_load_set(struct load_weight *lw, unsigned long w) 1357 static inline void update_load_set(struct load_weight *lw, unsigned long w)
1358 { 1358 {
1359 lw->weight = w; 1359 lw->weight = w;
1360 lw->inv_weight = 0; 1360 lw->inv_weight = 0;
1361 } 1361 }
1362 1362
1363 /* 1363 /*
1364 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1364 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1365 * of tasks with abnormal "nice" values across CPUs, the contribution that 1365 * of tasks with abnormal "nice" values across CPUs, the contribution that
1366 * each task makes to its run queue's load is weighted according to its 1366 * each task makes to its run queue's load is weighted according to its
1367 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1367 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1368 * scaled version of the new time slice allocation that they receive on time 1368 * scaled version of the new time slice allocation that they receive on time
1369 * slice expiry etc. 1369 * slice expiry etc.
1370 */ 1370 */
1371 1371
1372 #define WEIGHT_IDLEPRIO 3 1372 #define WEIGHT_IDLEPRIO 3
1373 #define WMULT_IDLEPRIO 1431655765 1373 #define WMULT_IDLEPRIO 1431655765
1374 1374
1375 /* 1375 /*
1376 * Nice levels are multiplicative, with a gentle 10% change for every 1376 * Nice levels are multiplicative, with a gentle 10% change for every
1377 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1377 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1378 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1378 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1379 * that remained on nice 0. 1379 * that remained on nice 0.
1380 * 1380 *
1381 * The "10% effect" is relative and cumulative: from _any_ nice level, 1381 * The "10% effect" is relative and cumulative: from _any_ nice level,
1382 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1382 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1383 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1383 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1384 * If a task goes up by ~10% and another task goes down by ~10% then 1384 * If a task goes up by ~10% and another task goes down by ~10% then
1385 * the relative distance between them is ~25%.) 1385 * the relative distance between them is ~25%.)
1386 */ 1386 */
1387 static const int prio_to_weight[40] = { 1387 static const int prio_to_weight[40] = {
1388 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1388 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1389 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1389 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1390 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1390 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1391 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1391 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1392 /* 0 */ 1024, 820, 655, 526, 423, 1392 /* 0 */ 1024, 820, 655, 526, 423,
1393 /* 5 */ 335, 272, 215, 172, 137, 1393 /* 5 */ 335, 272, 215, 172, 137,
1394 /* 10 */ 110, 87, 70, 56, 45, 1394 /* 10 */ 110, 87, 70, 56, 45,
1395 /* 15 */ 36, 29, 23, 18, 15, 1395 /* 15 */ 36, 29, 23, 18, 15,
1396 }; 1396 };
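Adjacent entries in this table differ by a factor of roughly 1.25, which is where the "10% effect" described above comes from. A small standalone program (illustrative, not kernel code) shows the resulting CPU shares for two CPU-bound tasks one nice level apart:

#include <stdio.h>

int main(void)
{
        /*
         * One task at nice 0 (weight 1024) and one at nice 1 (weight 820,
         * i.e. 1024 divided by ~1.25) competing for a single CPU: CFS hands
         * out CPU time in proportion to weight, so the split is ~55% / ~45%.
         */
        double w0 = 1024, w1 = 820;

        printf("nice 0: %.1f%%  nice 1: %.1f%%\n",
               100 * w0 / (w0 + w1), 100 * w1 / (w0 + w1));
        return 0;
}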
1397 1397
1398 /* 1398 /*
1399 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1399 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1400 * 1400 *
1401 * In cases where the weight does not change often, we can use the 1401 * In cases where the weight does not change often, we can use the
1402 * precalculated inverse to speed up arithmetic by turning divisions 1402 * precalculated inverse to speed up arithmetic by turning divisions
1403 * into multiplications: 1403 * into multiplications:
1404 */ 1404 */
1405 static const u32 prio_to_wmult[40] = { 1405 static const u32 prio_to_wmult[40] = {
1406 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1406 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1407 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1407 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1408 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1408 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1409 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1409 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1410 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1410 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1411 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1411 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1412 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1412 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1413 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1413 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1414 }; 1414 };
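A quick way to confirm the 2^32/x relationship is to recompute a couple of entries; this throwaway snippet (illustrative only) checks the nice 0 and nice -20 columns against prio_to_weight[]:

#include <stdio.h>

int main(void)
{
        /* 2^32 / 1024 = 4194304 (nice 0), 2^32 / 88761 = 48388 (nice -20) */
        printf("%llu %llu\n", (1ULL << 32) / 1024, (1ULL << 32) / 88761);
        return 0;
}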
1415 1415
1416 /* Time spent by the tasks of the cpu accounting group executing in ... */ 1416 /* Time spent by the tasks of the cpu accounting group executing in ... */
1417 enum cpuacct_stat_index { 1417 enum cpuacct_stat_index {
1418 CPUACCT_STAT_USER, /* ... user mode */ 1418 CPUACCT_STAT_USER, /* ... user mode */
1419 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1419 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1420 1420
1421 CPUACCT_STAT_NSTATS, 1421 CPUACCT_STAT_NSTATS,
1422 }; 1422 };
1423 1423
1424 #ifdef CONFIG_CGROUP_CPUACCT 1424 #ifdef CONFIG_CGROUP_CPUACCT
1425 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1425 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1426 static void cpuacct_update_stats(struct task_struct *tsk, 1426 static void cpuacct_update_stats(struct task_struct *tsk,
1427 enum cpuacct_stat_index idx, cputime_t val); 1427 enum cpuacct_stat_index idx, cputime_t val);
1428 #else 1428 #else
1429 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1429 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1430 static inline void cpuacct_update_stats(struct task_struct *tsk, 1430 static inline void cpuacct_update_stats(struct task_struct *tsk,
1431 enum cpuacct_stat_index idx, cputime_t val) {} 1431 enum cpuacct_stat_index idx, cputime_t val) {}
1432 #endif 1432 #endif
1433 1433
1434 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1434 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1435 { 1435 {
1436 update_load_add(&rq->load, load); 1436 update_load_add(&rq->load, load);
1437 } 1437 }
1438 1438
1439 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1439 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1440 { 1440 {
1441 update_load_sub(&rq->load, load); 1441 update_load_sub(&rq->load, load);
1442 } 1442 }
1443 1443
1444 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1444 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1445 typedef int (*tg_visitor)(struct task_group *, void *); 1445 typedef int (*tg_visitor)(struct task_group *, void *);
1446 1446
1447 /* 1447 /*
1448 * Iterate the full tree, calling @down when first entering a node and @up when 1448 * Iterate the full tree, calling @down when first entering a node and @up when
1449 * leaving it for the final time. 1449 * leaving it for the final time.
1450 */ 1450 */
1451 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1451 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1452 { 1452 {
1453 struct task_group *parent, *child; 1453 struct task_group *parent, *child;
1454 int ret; 1454 int ret;
1455 1455
1456 rcu_read_lock(); 1456 rcu_read_lock();
1457 parent = &root_task_group; 1457 parent = &root_task_group;
1458 down: 1458 down:
1459 ret = (*down)(parent, data); 1459 ret = (*down)(parent, data);
1460 if (ret) 1460 if (ret)
1461 goto out_unlock; 1461 goto out_unlock;
1462 list_for_each_entry_rcu(child, &parent->children, siblings) { 1462 list_for_each_entry_rcu(child, &parent->children, siblings) {
1463 parent = child; 1463 parent = child;
1464 goto down; 1464 goto down;
1465 1465
1466 up: 1466 up:
1467 continue; 1467 continue;
1468 } 1468 }
1469 ret = (*up)(parent, data); 1469 ret = (*up)(parent, data);
1470 if (ret) 1470 if (ret)
1471 goto out_unlock; 1471 goto out_unlock;
1472 1472
1473 child = parent; 1473 child = parent;
1474 parent = parent->parent; 1474 parent = parent->parent;
1475 if (parent) 1475 if (parent)
1476 goto up; 1476 goto up;
1477 out_unlock: 1477 out_unlock:
1478 rcu_read_unlock(); 1478 rcu_read_unlock();
1479 1479
1480 return ret; 1480 return ret;
1481 } 1481 }
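The goto-based loop above is an iterative depth-first walk: @down runs in pre-order, @up in post-order, and a non-zero return from either visitor aborts the walk. A recursive sketch over a hypothetical first-child/next-sibling node type (an assumption for illustration, not the kernel's struct task_group layout) may make the control flow clearer:

/* Hypothetical stand-in for struct task_group. */
struct node {
        struct node *child;     /* first child */
        struct node *sibling;   /* next sibling */
};

typedef int (*visitor)(struct node *, void *);

/* Recursive equivalent of walk_tg_tree(): @down on entry, @up on exit. */
int walk(struct node *n, visitor down, visitor up, void *data)
{
        int ret = down(n, data);

        if (ret)
                return ret;
        for (struct node *c = n->child; c; c = c->sibling) {
                ret = walk(c, down, up, data);
                if (ret)
                        return ret;
        }
        return up(n, data);
}

The kernel version is written without recursion, which keeps the walk off the small kernel stack however deep the group hierarchy gets, and it holds rcu_read_lock() across the whole traversal.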
1482 1482
1483 static int tg_nop(struct task_group *tg, void *data) 1483 static int tg_nop(struct task_group *tg, void *data)
1484 { 1484 {
1485 return 0; 1485 return 0;
1486 } 1486 }
1487 #endif 1487 #endif
1488 1488
1489 #ifdef CONFIG_SMP 1489 #ifdef CONFIG_SMP
1490 /* Used instead of source_load when we know the type == 0 */ 1490 /* Used instead of source_load when we know the type == 0 */
1491 static unsigned long weighted_cpuload(const int cpu) 1491 static unsigned long weighted_cpuload(const int cpu)
1492 { 1492 {
1493 return cpu_rq(cpu)->load.weight; 1493 return cpu_rq(cpu)->load.weight;
1494 } 1494 }
1495 1495
1496 /* 1496 /*
1497 * Return a low guess at the load of a migration-source cpu weighted 1497 * Return a low guess at the load of a migration-source cpu weighted
1498 * according to the scheduling class and "nice" value. 1498 * according to the scheduling class and "nice" value.
1499 * 1499 *
1500 * We want to under-estimate the load of migration sources, to 1500 * We want to under-estimate the load of migration sources, to
1501 * balance conservatively. 1501 * balance conservatively.
1502 */ 1502 */
1503 static unsigned long source_load(int cpu, int type) 1503 static unsigned long source_load(int cpu, int type)
1504 { 1504 {
1505 struct rq *rq = cpu_rq(cpu); 1505 struct rq *rq = cpu_rq(cpu);
1506 unsigned long total = weighted_cpuload(cpu); 1506 unsigned long total = weighted_cpuload(cpu);
1507 1507
1508 if (type == 0 || !sched_feat(LB_BIAS)) 1508 if (type == 0 || !sched_feat(LB_BIAS))
1509 return total; 1509 return total;
1510 1510
1511 return min(rq->cpu_load[type-1], total); 1511 return min(rq->cpu_load[type-1], total);
1512 } 1512 }
1513 1513
1514 /* 1514 /*
1515 * Return a high guess at the load of a migration-target cpu weighted 1515 * Return a high guess at the load of a migration-target cpu weighted
1516 * according to the scheduling class and "nice" value. 1516 * according to the scheduling class and "nice" value.
1517 */ 1517 */
1518 static unsigned long target_load(int cpu, int type) 1518 static unsigned long target_load(int cpu, int type)
1519 { 1519 {
1520 struct rq *rq = cpu_rq(cpu); 1520 struct rq *rq = cpu_rq(cpu);
1521 unsigned long total = weighted_cpuload(cpu); 1521 unsigned long total = weighted_cpuload(cpu);
1522 1522
1523 if (type == 0 || !sched_feat(LB_BIAS)) 1523 if (type == 0 || !sched_feat(LB_BIAS))
1524 return total; 1524 return total;
1525 1525
1526 return max(rq->cpu_load[type-1], total); 1526 return max(rq->cpu_load[type-1], total);
1527 } 1527 }
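Worked example: with LB_BIAS enabled and type > 0, suppose a CPU's instantaneous weighted load is 2048 while its decayed cpu_load[type-1] history is 1536. source_load() then reports 1536 (the min) and target_load() reports 2048 (the max); under-stating would-be sources and over-stating would-be targets makes the balancer err on the side of leaving tasks where they are.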
1528 1528
1529 static unsigned long power_of(int cpu) 1529 static unsigned long power_of(int cpu)
1530 { 1530 {
1531 return cpu_rq(cpu)->cpu_power; 1531 return cpu_rq(cpu)->cpu_power;
1532 } 1532 }
1533 1533
1534 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1534 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1535 1535
1536 static unsigned long cpu_avg_load_per_task(int cpu) 1536 static unsigned long cpu_avg_load_per_task(int cpu)
1537 { 1537 {
1538 struct rq *rq = cpu_rq(cpu); 1538 struct rq *rq = cpu_rq(cpu);
1539 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1539 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1540 1540
1541 if (nr_running) 1541 if (nr_running)
1542 rq->avg_load_per_task = rq->load.weight / nr_running; 1542 rq->avg_load_per_task = rq->load.weight / nr_running;
1543 else 1543 else
1544 rq->avg_load_per_task = 0; 1544 rq->avg_load_per_task = 0;
1545 1545
1546 return rq->avg_load_per_task; 1546 return rq->avg_load_per_task;
1547 } 1547 }
1548 1548
1549 #ifdef CONFIG_FAIR_GROUP_SCHED 1549 #ifdef CONFIG_FAIR_GROUP_SCHED
1550 1550
1551 /* 1551 /*
1552 * Compute the cpu's hierarchical load factor for each task group. 1552 * Compute the cpu's hierarchical load factor for each task group.
1553 * This needs to be done in a top-down fashion because the load of a child 1553 * This needs to be done in a top-down fashion because the load of a child
1554 * group is a fraction of its parent's load. 1554 * group is a fraction of its parent's load.
1555 */ 1555 */
1556 static int tg_load_down(struct task_group *tg, void *data) 1556 static int tg_load_down(struct task_group *tg, void *data)
1557 { 1557 {
1558 unsigned long load; 1558 unsigned long load;
1559 long cpu = (long)data; 1559 long cpu = (long)data;
1560 1560
1561 if (!tg->parent) { 1561 if (!tg->parent) {
1562 load = cpu_rq(cpu)->load.weight; 1562 load = cpu_rq(cpu)->load.weight;
1563 } else { 1563 } else {
1564 load = tg->parent->cfs_rq[cpu]->h_load; 1564 load = tg->parent->cfs_rq[cpu]->h_load;
1565 load *= tg->se[cpu]->load.weight; 1565 load *= tg->se[cpu]->load.weight;
1566 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1566 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1567 } 1567 }
1568 1568
1569 tg->cfs_rq[cpu]->h_load = load; 1569 tg->cfs_rq[cpu]->h_load = load;
1570 1570
1571 return 0; 1571 return 0;
1572 } 1572 }
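Worked example with made-up weights: if the root cfs_rq on some CPU carries a load.weight of 3072, a first-level group whose sched_entity weight there is 1024 ends up with h_load = 3072 * 1024 / (3072 + 1) ≈ 1023; a child group holding 512 of that group's 1024 queue weight then gets ≈ 1023 * 512 / (1024 + 1) ≈ 511. The "+ 1" in the divisor only guards against dividing by an empty parent queue.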
1573 1573
1574 static void update_h_load(long cpu) 1574 static void update_h_load(long cpu)
1575 { 1575 {
1576 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1576 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1577 } 1577 }
1578 1578
1579 #endif 1579 #endif
1580 1580
1581 #ifdef CONFIG_PREEMPT 1581 #ifdef CONFIG_PREEMPT
1582 1582
1583 static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1583 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1584 1584
1585 /* 1585 /*
1586 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1586 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1587 * way at the expense of forcing extra atomic operations in all 1587 * way at the expense of forcing extra atomic operations in all
1588 * invocations. This assures that the double_lock is acquired using the 1588 * invocations. This assures that the double_lock is acquired using the
1589 * same underlying policy as the spinlock_t on this architecture, which 1589 * same underlying policy as the spinlock_t on this architecture, which
1590 * reduces latency compared to the unfair variant below. However, it 1590 * reduces latency compared to the unfair variant below. However, it
1591 * also adds more overhead and therefore may reduce throughput. 1591 * also adds more overhead and therefore may reduce throughput.
1592 */ 1592 */
1593 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1593 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1594 __releases(this_rq->lock) 1594 __releases(this_rq->lock)
1595 __acquires(busiest->lock) 1595 __acquires(busiest->lock)
1596 __acquires(this_rq->lock) 1596 __acquires(this_rq->lock)
1597 { 1597 {
1598 raw_spin_unlock(&this_rq->lock); 1598 raw_spin_unlock(&this_rq->lock);
1599 double_rq_lock(this_rq, busiest); 1599 double_rq_lock(this_rq, busiest);
1600 1600
1601 return 1; 1601 return 1;
1602 } 1602 }
1603 1603
1604 #else 1604 #else
1605 /* 1605 /*
1606 * Unfair double_lock_balance: Optimizes throughput at the expense of 1606 * Unfair double_lock_balance: Optimizes throughput at the expense of
1607 * latency by eliminating extra atomic operations when the locks are 1607 * latency by eliminating extra atomic operations when the locks are
1608 * already in proper order on entry. This favors lower cpu-ids and will 1608 * already in proper order on entry. This favors lower cpu-ids and will
1609 * grant the double lock to lower cpus over higher ids under contention, 1609 * grant the double lock to lower cpus over higher ids under contention,
1610 * regardless of entry order into the function. 1610 * regardless of entry order into the function.
1611 */ 1611 */
1612 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1612 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1613 __releases(this_rq->lock) 1613 __releases(this_rq->lock)
1614 __acquires(busiest->lock) 1614 __acquires(busiest->lock)
1615 __acquires(this_rq->lock) 1615 __acquires(this_rq->lock)
1616 { 1616 {
1617 int ret = 0; 1617 int ret = 0;
1618 1618
1619 if (unlikely(!raw_spin_trylock(&busiest->lock))) { 1619 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1620 if (busiest < this_rq) { 1620 if (busiest < this_rq) {
1621 raw_spin_unlock(&this_rq->lock); 1621 raw_spin_unlock(&this_rq->lock);
1622 raw_spin_lock(&busiest->lock); 1622 raw_spin_lock(&busiest->lock);
1623 raw_spin_lock_nested(&this_rq->lock, 1623 raw_spin_lock_nested(&this_rq->lock,
1624 SINGLE_DEPTH_NESTING); 1624 SINGLE_DEPTH_NESTING);
1625 ret = 1; 1625 ret = 1;
1626 } else 1626 } else
1627 raw_spin_lock_nested(&busiest->lock, 1627 raw_spin_lock_nested(&busiest->lock,
1628 SINGLE_DEPTH_NESTING); 1628 SINGLE_DEPTH_NESTING);
1629 } 1629 }
1630 return ret; 1630 return ret;
1631 } 1631 }
1632 1632
1633 #endif /* CONFIG_PREEMPT */ 1633 #endif /* CONFIG_PREEMPT */
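Both variants ultimately rely on acquiring the lower-addressed lock first whenever both locks must be taken from scratch, which is the standard way to rule out an AB/BA deadlock between two CPUs balancing toward each other. A user-space pthread sketch of the same address-ordering rule (illustrative only; the pointer comparison mirrors the busiest < this_rq test above):

#include <pthread.h>
#include <stdint.h>

/*
 * Lock two mutexes in a fixed global order (lowest address first) so that
 * two threads locking the same pair with swapped arguments cannot deadlock.
 */
void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
        } else if ((uintptr_t)a < (uintptr_t)b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}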
1634 1634
1635 /* 1635 /*
1636 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1636 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1637 */ 1637 */
1638 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1638 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1639 { 1639 {
1640 if (unlikely(!irqs_disabled())) { 1640 if (unlikely(!irqs_disabled())) {
1641 /* printk() doesn't work well under rq->lock */ 1641 /* printk() doesn't work well under rq->lock */
1642 raw_spin_unlock(&this_rq->lock); 1642 raw_spin_unlock(&this_rq->lock);
1643 BUG_ON(1); 1643 BUG_ON(1);
1644 } 1644 }
1645 1645
1646 return _double_lock_balance(this_rq, busiest); 1646 return _double_lock_balance(this_rq, busiest);
1647 } 1647 }
1648 1648
1649 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1649 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1650 __releases(busiest->lock) 1650 __releases(busiest->lock)
1651 { 1651 {
1652 raw_spin_unlock(&busiest->lock); 1652 raw_spin_unlock(&busiest->lock);
1653 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1653 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1654 } 1654 }
1655 1655
1656 /* 1656 /*
1657 * double_rq_lock - safely lock two runqueues 1657 * double_rq_lock - safely lock two runqueues
1658 * 1658 *
1659 * Note this does not disable interrupts like task_rq_lock, 1659 * Note this does not disable interrupts like task_rq_lock,
1660 * you need to do so manually before calling. 1660 * you need to do so manually before calling.
1661 */ 1661 */
1662 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1662 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1663 __acquires(rq1->lock) 1663 __acquires(rq1->lock)
1664 __acquires(rq2->lock) 1664 __acquires(rq2->lock)
1665 { 1665 {
1666 BUG_ON(!irqs_disabled()); 1666 BUG_ON(!irqs_disabled());
1667 if (rq1 == rq2) { 1667 if (rq1 == rq2) {
1668 raw_spin_lock(&rq1->lock); 1668 raw_spin_lock(&rq1->lock);
1669 __acquire(rq2->lock); /* Fake it out ;) */ 1669 __acquire(rq2->lock); /* Fake it out ;) */
1670 } else { 1670 } else {
1671 if (rq1 < rq2) { 1671 if (rq1 < rq2) {
1672 raw_spin_lock(&rq1->lock); 1672 raw_spin_lock(&rq1->lock);
1673 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 1673 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1674 } else { 1674 } else {
1675 raw_spin_lock(&rq2->lock); 1675 raw_spin_lock(&rq2->lock);
1676 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1676 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1677 } 1677 }
1678 } 1678 }
1679 } 1679 }
1680 1680
1681 /* 1681 /*
1682 * double_rq_unlock - safely unlock two runqueues 1682 * double_rq_unlock - safely unlock two runqueues
1683 * 1683 *
1684 * Note this does not restore interrupts like task_rq_unlock, 1684 * Note this does not restore interrupts like task_rq_unlock,
1685 * you need to do so manually after calling. 1685 * you need to do so manually after calling.
1686 */ 1686 */
1687 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1687 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1688 __releases(rq1->lock) 1688 __releases(rq1->lock)
1689 __releases(rq2->lock) 1689 __releases(rq2->lock)
1690 { 1690 {
1691 raw_spin_unlock(&rq1->lock); 1691 raw_spin_unlock(&rq1->lock);
1692 if (rq1 != rq2) 1692 if (rq1 != rq2)
1693 raw_spin_unlock(&rq2->lock); 1693 raw_spin_unlock(&rq2->lock);
1694 else 1694 else
1695 __release(rq2->lock); 1695 __release(rq2->lock);
1696 } 1696 }
1697 1697
1698 #else /* CONFIG_SMP */ 1698 #else /* CONFIG_SMP */
1699 1699
1700 /* 1700 /*
1701 * double_rq_lock - safely lock two runqueues 1701 * double_rq_lock - safely lock two runqueues
1702 * 1702 *
1703 * Note this does not disable interrupts like task_rq_lock, 1703 * Note this does not disable interrupts like task_rq_lock,
1704 * you need to do so manually before calling. 1704 * you need to do so manually before calling.
1705 */ 1705 */
1706 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1706 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1707 __acquires(rq1->lock) 1707 __acquires(rq1->lock)
1708 __acquires(rq2->lock) 1708 __acquires(rq2->lock)
1709 { 1709 {
1710 BUG_ON(!irqs_disabled()); 1710 BUG_ON(!irqs_disabled());
1711 BUG_ON(rq1 != rq2); 1711 BUG_ON(rq1 != rq2);
1712 raw_spin_lock(&rq1->lock); 1712 raw_spin_lock(&rq1->lock);
1713 __acquire(rq2->lock); /* Fake it out ;) */ 1713 __acquire(rq2->lock); /* Fake it out ;) */
1714 } 1714 }
1715 1715
1716 /* 1716 /*
1717 * double_rq_unlock - safely unlock two runqueues 1717 * double_rq_unlock - safely unlock two runqueues
1718 * 1718 *
1719 * Note this does not restore interrupts like task_rq_unlock, 1719 * Note this does not restore interrupts like task_rq_unlock,
1720 * you need to do so manually after calling. 1720 * you need to do so manually after calling.
1721 */ 1721 */
1722 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1722 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1723 __releases(rq1->lock) 1723 __releases(rq1->lock)
1724 __releases(rq2->lock) 1724 __releases(rq2->lock)
1725 { 1725 {
1726 BUG_ON(rq1 != rq2); 1726 BUG_ON(rq1 != rq2);
1727 raw_spin_unlock(&rq1->lock); 1727 raw_spin_unlock(&rq1->lock);
1728 __release(rq2->lock); 1728 __release(rq2->lock);
1729 } 1729 }
1730 1730
1731 #endif 1731 #endif
1732 1732
1733 static void calc_load_account_idle(struct rq *this_rq); 1733 static void calc_load_account_idle(struct rq *this_rq);
1734 static void update_sysctl(void); 1734 static void update_sysctl(void);
1735 static int get_update_sysctl_factor(void); 1735 static int get_update_sysctl_factor(void);
1736 static void update_cpu_load(struct rq *this_rq); 1736 static void update_cpu_load(struct rq *this_rq);
1737 1737
1738 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1738 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1739 { 1739 {
1740 set_task_rq(p, cpu); 1740 set_task_rq(p, cpu);
1741 #ifdef CONFIG_SMP 1741 #ifdef CONFIG_SMP
1742 /* 1742 /*
1743 * After ->cpu is set to a new value, task_rq_lock(p, ...) can be 1743 * After ->cpu is set to a new value, task_rq_lock(p, ...) can be
1744 * successfully executed on another CPU. We must ensure that updates of 1744 * successfully executed on another CPU. We must ensure that updates of
1745 * per-task data have been completed by this moment. 1745 * per-task data have been completed by this moment.
1746 */ 1746 */
1747 smp_wmb(); 1747 smp_wmb();
1748 task_thread_info(p)->cpu = cpu; 1748 task_thread_info(p)->cpu = cpu;
1749 #endif 1749 #endif
1750 } 1750 }
1751 1751
1752 static const struct sched_class rt_sched_class; 1752 static const struct sched_class rt_sched_class;
1753 1753
1754 #define sched_class_highest (&stop_sched_class) 1754 #define sched_class_highest (&stop_sched_class)
1755 #define for_each_class(class) \ 1755 #define for_each_class(class) \
1756 for (class = sched_class_highest; class; class = class->next) 1756 for (class = sched_class_highest; class; class = class->next)
1757 1757
1758 #include "sched_stats.h" 1758 #include "sched_stats.h"
1759 1759
1760 static void inc_nr_running(struct rq *rq) 1760 static void inc_nr_running(struct rq *rq)
1761 { 1761 {
1762 rq->nr_running++; 1762 rq->nr_running++;
1763 } 1763 }
1764 1764
1765 static void dec_nr_running(struct rq *rq) 1765 static void dec_nr_running(struct rq *rq)
1766 { 1766 {
1767 rq->nr_running--; 1767 rq->nr_running--;
1768 } 1768 }
1769 1769
1770 static void set_load_weight(struct task_struct *p) 1770 static void set_load_weight(struct task_struct *p)
1771 { 1771 {
1772 /* 1772 /*
1773 * SCHED_IDLE tasks get minimal weight: 1773 * SCHED_IDLE tasks get minimal weight:
1774 */ 1774 */
1775 if (p->policy == SCHED_IDLE) { 1775 if (p->policy == SCHED_IDLE) {
1776 p->se.load.weight = WEIGHT_IDLEPRIO; 1776 p->se.load.weight = WEIGHT_IDLEPRIO;
1777 p->se.load.inv_weight = WMULT_IDLEPRIO; 1777 p->se.load.inv_weight = WMULT_IDLEPRIO;
1778 return; 1778 return;
1779 } 1779 }
1780 1780
1781 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1781 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1782 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1782 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1783 } 1783 }
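With the usual MAX_RT_PRIO of 100 and static_prio = 120 + nice, a nice-0 task indexes entry 20 of the tables above (weight 1024, inv_weight 4194304), nice 19 lands on the last entry (weight 15) and nice -20 on the first (weight 88761); SCHED_IDLE tasks bypass the tables entirely and get the even smaller WEIGHT_IDLEPRIO of 3.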
1784 1784
1785 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1785 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1786 { 1786 {
1787 update_rq_clock(rq); 1787 update_rq_clock(rq);
1788 sched_info_queued(p); 1788 sched_info_queued(p);
1789 p->sched_class->enqueue_task(rq, p, flags); 1789 p->sched_class->enqueue_task(rq, p, flags);
1790 } 1790 }
1791 1791
1792 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1792 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1793 { 1793 {
1794 update_rq_clock(rq); 1794 update_rq_clock(rq);
1795 sched_info_dequeued(p); 1795 sched_info_dequeued(p);
1796 p->sched_class->dequeue_task(rq, p, flags); 1796 p->sched_class->dequeue_task(rq, p, flags);
1797 } 1797 }
1798 1798
1799 /* 1799 /*
1800 * activate_task - move a task to the runqueue. 1800 * activate_task - move a task to the runqueue.
1801 */ 1801 */
1802 static void activate_task(struct rq *rq, struct task_struct *p, int flags) 1802 static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1803 { 1803 {
1804 if (task_contributes_to_load(p)) 1804 if (task_contributes_to_load(p))
1805 rq->nr_uninterruptible--; 1805 rq->nr_uninterruptible--;
1806 1806
1807 enqueue_task(rq, p, flags); 1807 enqueue_task(rq, p, flags);
1808 inc_nr_running(rq); 1808 inc_nr_running(rq);
1809 } 1809 }
1810 1810
1811 /* 1811 /*
1812 * deactivate_task - remove a task from the runqueue. 1812 * deactivate_task - remove a task from the runqueue.
1813 */ 1813 */
1814 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 1814 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1815 { 1815 {
1816 if (task_contributes_to_load(p)) 1816 if (task_contributes_to_load(p))
1817 rq->nr_uninterruptible++; 1817 rq->nr_uninterruptible++;
1818 1818
1819 dequeue_task(rq, p, flags); 1819 dequeue_task(rq, p, flags);
1820 dec_nr_running(rq); 1820 dec_nr_running(rq);
1821 } 1821 }
1822 1822
1823 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 1823 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1824 1824
1825 /* 1825 /*
1826 * There are no locks covering percpu hardirq/softirq time. 1826 * There are no locks covering percpu hardirq/softirq time.
1827 * They are only modified in account_system_vtime, on the corresponding CPU 1827 * They are only modified in account_system_vtime, on the corresponding CPU
1828 * with interrupts disabled. So, writes are safe. 1828 * with interrupts disabled. So, writes are safe.
1829 * They are read and saved off onto struct rq in update_rq_clock(). 1829 * They are read and saved off onto struct rq in update_rq_clock().
1830 * This may result in another CPU reading this CPU's irq time and can 1830 * This may result in another CPU reading this CPU's irq time and can
1831 * race with irq/account_system_vtime on this CPU. We would either get the old 1831 * race with irq/account_system_vtime on this CPU. We would either get the old
1832 * or the new value, with a side effect of accounting a slice of irq time to the 1832 * or the new value, with a side effect of accounting a slice of irq time to the
1833 * wrong task when an irq is in progress while we read rq->clock. That is a worthy 1833 * wrong task when an irq is in progress while we read rq->clock. That is a worthy
1834 * compromise in place of having locks on each irq in account_system_time. 1834 * compromise in place of having locks on each irq in account_system_time.
1835 */ 1835 */
1836 static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1836 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1837 static DEFINE_PER_CPU(u64, cpu_softirq_time); 1837 static DEFINE_PER_CPU(u64, cpu_softirq_time);
1838 1838
1839 static DEFINE_PER_CPU(u64, irq_start_time); 1839 static DEFINE_PER_CPU(u64, irq_start_time);
1840 static int sched_clock_irqtime; 1840 static int sched_clock_irqtime;
1841 1841
1842 void enable_sched_clock_irqtime(void) 1842 void enable_sched_clock_irqtime(void)
1843 { 1843 {
1844 sched_clock_irqtime = 1; 1844 sched_clock_irqtime = 1;
1845 } 1845 }
1846 1846
1847 void disable_sched_clock_irqtime(void) 1847 void disable_sched_clock_irqtime(void)
1848 { 1848 {
1849 sched_clock_irqtime = 0; 1849 sched_clock_irqtime = 0;
1850 } 1850 }
1851 1851
1852 #ifndef CONFIG_64BIT 1852 #ifndef CONFIG_64BIT
1853 static DEFINE_PER_CPU(seqcount_t, irq_time_seq); 1853 static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1854 1854
1855 static inline void irq_time_write_begin(void) 1855 static inline void irq_time_write_begin(void)
1856 { 1856 {
1857 __this_cpu_inc(irq_time_seq.sequence); 1857 __this_cpu_inc(irq_time_seq.sequence);
1858 smp_wmb(); 1858 smp_wmb();
1859 } 1859 }
1860 1860
1861 static inline void irq_time_write_end(void) 1861 static inline void irq_time_write_end(void)
1862 { 1862 {
1863 smp_wmb(); 1863 smp_wmb();
1864 __this_cpu_inc(irq_time_seq.sequence); 1864 __this_cpu_inc(irq_time_seq.sequence);
1865 } 1865 }
1866 1866
1867 static inline u64 irq_time_read(int cpu) 1867 static inline u64 irq_time_read(int cpu)
1868 { 1868 {
1869 u64 irq_time; 1869 u64 irq_time;
1870 unsigned seq; 1870 unsigned seq;
1871 1871
1872 do { 1872 do {
1873 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); 1873 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1874 irq_time = per_cpu(cpu_softirq_time, cpu) + 1874 irq_time = per_cpu(cpu_softirq_time, cpu) +
1875 per_cpu(cpu_hardirq_time, cpu); 1875 per_cpu(cpu_hardirq_time, cpu);
1876 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); 1876 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1877 1877
1878 return irq_time; 1878 return irq_time;
1879 } 1879 }
1880 #else /* CONFIG_64BIT */ 1880 #else /* CONFIG_64BIT */
1881 static inline void irq_time_write_begin(void) 1881 static inline void irq_time_write_begin(void)
1882 { 1882 {
1883 } 1883 }
1884 1884
1885 static inline void irq_time_write_end(void) 1885 static inline void irq_time_write_end(void)
1886 { 1886 {
1887 } 1887 }
1888 1888
1889 static inline u64 irq_time_read(int cpu) 1889 static inline u64 irq_time_read(int cpu)
1890 { 1890 {
1891 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1891 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1892 } 1892 }
1893 #endif /* CONFIG_64BIT */ 1893 #endif /* CONFIG_64BIT */
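The 32-bit case needs the sequence counter because the two u64 counters cannot be read atomically there. Concretely, if cpu_hardirq_time rolls over from 0x00000000ffffffff to 0x0000000100000000 while a reader samples it, combining the new high word with the old low word would yield 0x00000001ffffffff, about four seconds of phantom irq time; the odd/even seqcount forces such a reader to retry. On 64-bit a single load is atomic, so the counters are summed directly.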
1894 1894
1895 /* 1895 /*
1896 * Called before incrementing preempt_count on {soft,}irq_enter 1896 * Called before incrementing preempt_count on {soft,}irq_enter
1897 * and before decrementing preempt_count on {soft,}irq_exit. 1897 * and before decrementing preempt_count on {soft,}irq_exit.
1898 */ 1898 */
1899 void account_system_vtime(struct task_struct *curr) 1899 void account_system_vtime(struct task_struct *curr)
1900 { 1900 {
1901 unsigned long flags; 1901 unsigned long flags;
1902 s64 delta; 1902 s64 delta;
1903 int cpu; 1903 int cpu;
1904 1904
1905 if (!sched_clock_irqtime) 1905 if (!sched_clock_irqtime)
1906 return; 1906 return;
1907 1907
1908 local_irq_save(flags); 1908 local_irq_save(flags);
1909 1909
1910 cpu = smp_processor_id(); 1910 cpu = smp_processor_id();
1911 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 1911 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1912 __this_cpu_add(irq_start_time, delta); 1912 __this_cpu_add(irq_start_time, delta);
1913 1913
1914 irq_time_write_begin(); 1914 irq_time_write_begin();
1915 /* 1915 /*
1916 * We do not account for softirq time from ksoftirqd here. 1916 * We do not account for softirq time from ksoftirqd here.
1917 * We want to continue accounting softirq time to ksoftirqd thread 1917 * We want to continue accounting softirq time to ksoftirqd thread
1918 * in that case, so as not to confuse the scheduler with a special task 1918 * in that case, so as not to confuse the scheduler with a special task
1919 * that does not consume any time but still wants to run. 1919 * that does not consume any time but still wants to run.
1920 */ 1920 */
1921 if (hardirq_count()) 1921 if (hardirq_count())
1922 __this_cpu_add(cpu_hardirq_time, delta); 1922 __this_cpu_add(cpu_hardirq_time, delta);
1923 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) 1923 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1924 __this_cpu_add(cpu_softirq_time, delta); 1924 __this_cpu_add(cpu_softirq_time, delta);
1925 1925
1926 irq_time_write_end(); 1926 irq_time_write_end();
1927 local_irq_restore(flags); 1927 local_irq_restore(flags);
1928 } 1928 }
1929 EXPORT_SYMBOL_GPL(account_system_vtime); 1929 EXPORT_SYMBOL_GPL(account_system_vtime);
1930 1930
1931 static void update_rq_clock_task(struct rq *rq, s64 delta) 1931 static void update_rq_clock_task(struct rq *rq, s64 delta)
1932 { 1932 {
1933 s64 irq_delta; 1933 s64 irq_delta;
1934 1934
1935 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1935 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1936 1936
1937 /* 1937 /*
1938 * Since irq_time is only updated on {soft,}irq_exit, we might run into 1938 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1939 * this case when a previous update_rq_clock() happened inside a 1939 * this case when a previous update_rq_clock() happened inside a
1940 * {soft,}irq region. 1940 * {soft,}irq region.
1941 * 1941 *
1942 * When this happens, we stop ->clock_task and only update the 1942 * When this happens, we stop ->clock_task and only update the
1943 * prev_irq_time stamp to account for the part that fit, so that a next 1943 * prev_irq_time stamp to account for the part that fit, so that a next
1944 * update will consume the rest. This ensures ->clock_task is 1944 * update will consume the rest. This ensures ->clock_task is
1945 * monotonic. 1945 * monotonic.
1946 * 1946 *
1947 * It does, however, cause some slight misattribution of {soft,}irq 1947 * It does, however, cause some slight misattribution of {soft,}irq
1948 * time, a more accurate solution would be to update the irq_time using 1948 * time, a more accurate solution would be to update the irq_time using
1949 * the current rq->clock timestamp, except that would require using 1949 * the current rq->clock timestamp, except that would require using
1950 * atomic ops. 1950 * atomic ops.
1951 */ 1951 */
1952 if (irq_delta > delta) 1952 if (irq_delta > delta)
1953 irq_delta = delta; 1953 irq_delta = delta;
1954 1954
1955 rq->prev_irq_time += irq_delta; 1955 rq->prev_irq_time += irq_delta;
1956 delta -= irq_delta; 1956 delta -= irq_delta;
1957 rq->clock_task += delta; 1957 rq->clock_task += delta;
1958 1958
1959 if (irq_delta && sched_feat(NONIRQ_POWER)) 1959 if (irq_delta && sched_feat(NONIRQ_POWER))
1960 sched_rt_avg_update(rq, irq_delta); 1960 sched_rt_avg_update(rq, irq_delta);
1961 } 1961 }
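Concrete numbers make the clamp easier to follow; this throwaway snippet (illustrative only) replays one update in which the wall-clock delta is 5 ms but 7 ms of irq time has accumulated since the last update:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int64_t delta = 5000000, irq_delta = 7000000;   /* nanoseconds */
        int64_t prev_irq_time = 0, clock_task = 0;

        if (irq_delta > delta)
                irq_delta = delta;
        prev_irq_time += irq_delta;     /* 5000000: all of this round is irq time */
        delta -= irq_delta;             /* 0 */
        clock_task += delta;            /* unchanged, but never moves backwards */

        printf("prev_irq_time=%lld clock_task=%lld\n",
               (long long)prev_irq_time, (long long)clock_task);
        return 0;
}

The 2 ms that did not fit stays in the per-cpu irq counters and is charged by the next update, which is exactly the monotonicity argument made in the comment above.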
1962 1962
1963 static int irqtime_account_hi_update(void) 1963 static int irqtime_account_hi_update(void)
1964 { 1964 {
1965 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 1965 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1966 unsigned long flags; 1966 unsigned long flags;
1967 u64 latest_ns; 1967 u64 latest_ns;
1968 int ret = 0; 1968 int ret = 0;
1969 1969
1970 local_irq_save(flags); 1970 local_irq_save(flags);
1971 latest_ns = this_cpu_read(cpu_hardirq_time); 1971 latest_ns = this_cpu_read(cpu_hardirq_time);
1972 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 1972 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1973 ret = 1; 1973 ret = 1;
1974 local_irq_restore(flags); 1974 local_irq_restore(flags);
1975 return ret; 1975 return ret;
1976 } 1976 }
1977 1977
1978 static int irqtime_account_si_update(void) 1978 static int irqtime_account_si_update(void)
1979 { 1979 {
1980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 1980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1981 unsigned long flags; 1981 unsigned long flags;
1982 u64 latest_ns; 1982 u64 latest_ns;
1983 int ret = 0; 1983 int ret = 0;
1984 1984
1985 local_irq_save(flags); 1985 local_irq_save(flags);
1986 latest_ns = this_cpu_read(cpu_softirq_time); 1986 latest_ns = this_cpu_read(cpu_softirq_time);
1987 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 1987 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
1988 ret = 1; 1988 ret = 1;
1989 local_irq_restore(flags); 1989 local_irq_restore(flags);
1990 return ret; 1990 return ret;
1991 } 1991 }
1992 1992
1993 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 1993 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
1994 1994
1995 #define sched_clock_irqtime (0) 1995 #define sched_clock_irqtime (0)
1996 1996
1997 static void update_rq_clock_task(struct rq *rq, s64 delta) 1997 static void update_rq_clock_task(struct rq *rq, s64 delta)
1998 { 1998 {
1999 rq->clock_task += delta; 1999 rq->clock_task += delta;
2000 } 2000 }
2001 2001
2002 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 2002 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2003 2003
2004 #include "sched_idletask.c" 2004 #include "sched_idletask.c"
2005 #include "sched_fair.c" 2005 #include "sched_fair.c"
2006 #include "sched_rt.c" 2006 #include "sched_rt.c"
2007 #include "sched_autogroup.c" 2007 #include "sched_autogroup.c"
2008 #include "sched_stoptask.c" 2008 #include "sched_stoptask.c"
2009 #ifdef CONFIG_SCHED_DEBUG 2009 #ifdef CONFIG_SCHED_DEBUG
2010 # include "sched_debug.c" 2010 # include "sched_debug.c"
2011 #endif 2011 #endif
2012 2012
2013 void sched_set_stop_task(int cpu, struct task_struct *stop) 2013 void sched_set_stop_task(int cpu, struct task_struct *stop)
2014 { 2014 {
2015 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 2015 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2016 struct task_struct *old_stop = cpu_rq(cpu)->stop; 2016 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2017 2017
2018 if (stop) { 2018 if (stop) {
2019 /* 2019 /*
2020 * Make it appear like a SCHED_FIFO task, it's something 2020 * Make it appear like a SCHED_FIFO task, it's something
2021 * userspace knows about and won't get confused about. 2021 * userspace knows about and won't get confused about.
2022 * 2022 *
2023 * Also, it will make PI more or less work without too 2023 * Also, it will make PI more or less work without too
2024 * much confusion -- but then, stop work should not 2024 * much confusion -- but then, stop work should not
2025 * rely on PI working anyway. 2025 * rely on PI working anyway.
2026 */ 2026 */
2027 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 2027 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2028 2028
2029 stop->sched_class = &stop_sched_class; 2029 stop->sched_class = &stop_sched_class;
2030 } 2030 }
2031 2031
2032 cpu_rq(cpu)->stop = stop; 2032 cpu_rq(cpu)->stop = stop;
2033 2033
2034 if (old_stop) { 2034 if (old_stop) {
2035 /* 2035 /*
2036 * Reset it back to a normal scheduling class so that 2036 * Reset it back to a normal scheduling class so that
2037 * it can die in pieces. 2037 * it can die in pieces.
2038 */ 2038 */
2039 old_stop->sched_class = &rt_sched_class; 2039 old_stop->sched_class = &rt_sched_class;
2040 } 2040 }
2041 } 2041 }
2042 2042
2043 /* 2043 /*
2044 * __normal_prio - return the priority that is based on the static prio 2044 * __normal_prio - return the priority that is based on the static prio
2045 */ 2045 */
2046 static inline int __normal_prio(struct task_struct *p) 2046 static inline int __normal_prio(struct task_struct *p)
2047 { 2047 {
2048 return p->static_prio; 2048 return p->static_prio;
2049 } 2049 }
2050 2050
2051 /* 2051 /*
2052 * Calculate the expected normal priority: i.e. priority 2052 * Calculate the expected normal priority: i.e. priority
2053 * without taking RT-inheritance into account. Might be 2053 * without taking RT-inheritance into account. Might be
2054 * boosted by interactivity modifiers. Changes upon fork, 2054 * boosted by interactivity modifiers. Changes upon fork,
2055 * setprio syscalls, and whenever the interactivity 2055 * setprio syscalls, and whenever the interactivity
2056 * estimator recalculates. 2056 * estimator recalculates.
2057 */ 2057 */
2058 static inline int normal_prio(struct task_struct *p) 2058 static inline int normal_prio(struct task_struct *p)
2059 { 2059 {
2060 int prio; 2060 int prio;
2061 2061
2062 if (task_has_rt_policy(p)) 2062 if (task_has_rt_policy(p))
2063 prio = MAX_RT_PRIO-1 - p->rt_priority; 2063 prio = MAX_RT_PRIO-1 - p->rt_priority;
2064 else 2064 else
2065 prio = __normal_prio(p); 2065 prio = __normal_prio(p);
2066 return prio; 2066 return prio;
2067 } 2067 }
2068 2068
2069 /* 2069 /*
2070 * Calculate the current priority, i.e. the priority 2070 * Calculate the current priority, i.e. the priority
2071 * taken into account by the scheduler. This value might 2071 * taken into account by the scheduler. This value might
2072 * be boosted by RT tasks, or might be boosted by 2072 * be boosted by RT tasks, or might be boosted by
2073 * interactivity modifiers. Will be RT if the task got 2073 * interactivity modifiers. Will be RT if the task got
2074 * RT-boosted. If not then it returns p->normal_prio. 2074 * RT-boosted. If not then it returns p->normal_prio.
2075 */ 2075 */
2076 static int effective_prio(struct task_struct *p) 2076 static int effective_prio(struct task_struct *p)
2077 { 2077 {
2078 p->normal_prio = normal_prio(p); 2078 p->normal_prio = normal_prio(p);
2079 /* 2079 /*
2080 * If we are RT tasks or we were boosted to RT priority, 2080 * If we are RT tasks or we were boosted to RT priority,
2081 * keep the priority unchanged. Otherwise, update priority 2081 * keep the priority unchanged. Otherwise, update priority
2082 * to the normal priority: 2082 * to the normal priority:
2083 */ 2083 */
2084 if (!rt_prio(p->prio)) 2084 if (!rt_prio(p->prio))
2085 return p->normal_prio; 2085 return p->normal_prio;
2086 return p->prio; 2086 return p->prio;
2087 } 2087 }
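For example, with the usual MAX_RT_PRIO of 100, a SCHED_FIFO task with rt_priority 50 gets normal_prio = 99 - 50 = 49 (lower numbers are more important internally), while a SCHED_NORMAL task at nice 0 simply keeps its static_prio of 120; effective_prio() only deviates from normal_prio while the task is PI-boosted into the RT range.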
2088 2088
2089 /** 2089 /**
2090 * task_curr - is this task currently executing on a CPU? 2090 * task_curr - is this task currently executing on a CPU?
2091 * @p: the task in question. 2091 * @p: the task in question.
2092 */ 2092 */
2093 inline int task_curr(const struct task_struct *p) 2093 inline int task_curr(const struct task_struct *p)
2094 { 2094 {
2095 return cpu_curr(task_cpu(p)) == p; 2095 return cpu_curr(task_cpu(p)) == p;
2096 } 2096 }
2097 2097
2098 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2098 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2099 const struct sched_class *prev_class, 2099 const struct sched_class *prev_class,
2100 int oldprio) 2100 int oldprio)
2101 { 2101 {
2102 if (prev_class != p->sched_class) { 2102 if (prev_class != p->sched_class) {
2103 if (prev_class->switched_from) 2103 if (prev_class->switched_from)
2104 prev_class->switched_from(rq, p); 2104 prev_class->switched_from(rq, p);
2105 p->sched_class->switched_to(rq, p); 2105 p->sched_class->switched_to(rq, p);
2106 } else if (oldprio != p->prio) 2106 } else if (oldprio != p->prio)
2107 p->sched_class->prio_changed(rq, p, oldprio); 2107 p->sched_class->prio_changed(rq, p, oldprio);
2108 } 2108 }
2109 2109
2110 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2110 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2111 { 2111 {
2112 const struct sched_class *class; 2112 const struct sched_class *class;
2113 2113
2114 if (p->sched_class == rq->curr->sched_class) { 2114 if (p->sched_class == rq->curr->sched_class) {
2115 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 2115 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2116 } else { 2116 } else {
2117 for_each_class(class) { 2117 for_each_class(class) {
2118 if (class == rq->curr->sched_class) 2118 if (class == rq->curr->sched_class)
2119 break; 2119 break;
2120 if (class == p->sched_class) { 2120 if (class == p->sched_class) {
2121 resched_task(rq->curr); 2121 resched_task(rq->curr);
2122 break; 2122 break;
2123 } 2123 }
2124 } 2124 }
2125 } 2125 }
2126 2126
2127 /* 2127 /*
2128 * A queue event has occurred, and we're going to schedule. In 2128 * A queue event has occurred, and we're going to schedule. In
2129 * this case, we can save a useless back to back clock update. 2129 * this case, we can save a useless back to back clock update.
2130 */ 2130 */
2131 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 2131 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2132 rq->skip_clock_update = 1; 2132 rq->skip_clock_update = 1;
2133 } 2133 }
2134 2134
2135 #ifdef CONFIG_SMP 2135 #ifdef CONFIG_SMP
2136 /* 2136 /*
2137 * Is this task likely cache-hot: 2137 * Is this task likely cache-hot:
2138 */ 2138 */
2139 static int 2139 static int
2140 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 2140 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2141 { 2141 {
2142 s64 delta; 2142 s64 delta;
2143 2143
2144 if (p->sched_class != &fair_sched_class) 2144 if (p->sched_class != &fair_sched_class)
2145 return 0; 2145 return 0;
2146 2146
2147 if (unlikely(p->policy == SCHED_IDLE)) 2147 if (unlikely(p->policy == SCHED_IDLE))
2148 return 0; 2148 return 0;
2149 2149
2150 /* 2150 /*
2151 * Buddy candidates are cache hot: 2151 * Buddy candidates are cache hot:
2152 */ 2152 */
2153 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 2153 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2154 (&p->se == cfs_rq_of(&p->se)->next || 2154 (&p->se == cfs_rq_of(&p->se)->next ||
2155 &p->se == cfs_rq_of(&p->se)->last)) 2155 &p->se == cfs_rq_of(&p->se)->last))
2156 return 1; 2156 return 1;
2157 2157
2158 if (sysctl_sched_migration_cost == -1) 2158 if (sysctl_sched_migration_cost == -1)
2159 return 1; 2159 return 1;
2160 if (sysctl_sched_migration_cost == 0) 2160 if (sysctl_sched_migration_cost == 0)
2161 return 0; 2161 return 0;
2162 2162
2163 delta = now - p->se.exec_start; 2163 delta = now - p->se.exec_start;
2164 2164
2165 return delta < (s64)sysctl_sched_migration_cost; 2165 return delta < (s64)sysctl_sched_migration_cost;
2166 } 2166 }
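For example, assuming the default sysctl_sched_migration_cost of 500000 ns (0.5 ms; the tunable itself is not in this hunk), a fair-class task whose se.exec_start was 200000 ns ago is still considered cache-hot and the balancer will prefer not to migrate it, while one that last started running 2 ms ago is fair game; setting the tunable to -1 marks every task hot and 0 marks none.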
2167 2167
2168 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2168 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2169 { 2169 {
2170 #ifdef CONFIG_SCHED_DEBUG 2170 #ifdef CONFIG_SCHED_DEBUG
2171 /* 2171 /*
2172 * We should never call set_task_cpu() on a blocked task, 2172 * We should never call set_task_cpu() on a blocked task,
2173 * ttwu() will sort out the placement. 2173 * ttwu() will sort out the placement.
2174 */ 2174 */
2175 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2175 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2176 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2176 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2177 2177
2178 #ifdef CONFIG_LOCKDEP 2178 #ifdef CONFIG_LOCKDEP
2179 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 2179 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2180 lockdep_is_held(&task_rq(p)->lock))); 2180 lockdep_is_held(&task_rq(p)->lock)));
2181 #endif 2181 #endif
2182 #endif 2182 #endif
2183 2183
2184 trace_sched_migrate_task(p, new_cpu); 2184 trace_sched_migrate_task(p, new_cpu);
2185 2185
2186 if (task_cpu(p) != new_cpu) { 2186 if (task_cpu(p) != new_cpu) {
2187 p->se.nr_migrations++; 2187 p->se.nr_migrations++;
2188 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2188 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2189 } 2189 }
2190 2190
2191 __set_task_cpu(p, new_cpu); 2191 __set_task_cpu(p, new_cpu);
2192 } 2192 }
2193 2193
2194 struct migration_arg { 2194 struct migration_arg {
2195 struct task_struct *task; 2195 struct task_struct *task;
2196 int dest_cpu; 2196 int dest_cpu;
2197 }; 2197 };
2198 2198
2199 static int migration_cpu_stop(void *data); 2199 static int migration_cpu_stop(void *data);
2200 2200
2201 /* 2201 /*
2202 * The task's runqueue lock must be held. 2202 * The task's runqueue lock must be held.
2203 * Returns true if you have to wait for migration thread. 2203 * Returns true if you have to wait for migration thread.
2204 */ 2204 */
2205 static bool need_migrate_task(struct task_struct *p) 2205 static bool need_migrate_task(struct task_struct *p)
2206 { 2206 {
2207 /* 2207 /*
2208 * If the task is not on a runqueue (and not running), then 2208 * If the task is not on a runqueue (and not running), then
2209 * the next wake-up will properly place the task. 2209 * the next wake-up will properly place the task.
2210 */ 2210 */
2211 bool running = p->on_rq || p->on_cpu; 2211 bool running = p->on_rq || p->on_cpu;
2212 smp_rmb(); /* finish_lock_switch() */ 2212 smp_rmb(); /* finish_lock_switch() */
2213 return running; 2213 return running;
2214 } 2214 }
2215 2215
2216 /* 2216 /*
2217 * wait_task_inactive - wait for a thread to unschedule. 2217 * wait_task_inactive - wait for a thread to unschedule.
2218 * 2218 *
2219 * If @match_state is nonzero, it's the @p->state value just checked and 2219 * If @match_state is nonzero, it's the @p->state value just checked and
2220 * not expected to change. If it changes, i.e. @p might have woken up, 2220 * not expected to change. If it changes, i.e. @p might have woken up,
2221 * then return zero. When we succeed in waiting for @p to be off its CPU, 2221 * then return zero. When we succeed in waiting for @p to be off its CPU,
2222 * we return a positive number (its total switch count). If a second call 2222 * we return a positive number (its total switch count). If a second call
2223 * a short while later returns the same number, the caller can be sure that 2223 * a short while later returns the same number, the caller can be sure that
2224 * @p has remained unscheduled the whole time. 2224 * @p has remained unscheduled the whole time.
2225 * 2225 *
2226 * The caller must ensure that the task *will* unschedule sometime soon, 2226 * The caller must ensure that the task *will* unschedule sometime soon,
2227 * else this function might spin for a *long* time. This function can't 2227 * else this function might spin for a *long* time. This function can't
2228 * be called with interrupts off, or it may introduce deadlock with 2228 * be called with interrupts off, or it may introduce deadlock with
2229 * smp_call_function() if an IPI is sent by the same process we are 2229 * smp_call_function() if an IPI is sent by the same process we are
2230 * waiting to become inactive. 2230 * waiting to become inactive.
2231 */ 2231 */
2232 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 2232 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2233 { 2233 {
2234 unsigned long flags; 2234 unsigned long flags;
2235 int running, on_rq; 2235 int running, on_rq;
2236 unsigned long ncsw; 2236 unsigned long ncsw;
2237 struct rq *rq; 2237 struct rq *rq;
2238 2238
2239 for (;;) { 2239 for (;;) {
2240 /* 2240 /*
2241 * We do the initial early heuristics without holding 2241 * We do the initial early heuristics without holding
2242 * any task-queue locks at all. We'll only try to get 2242 * any task-queue locks at all. We'll only try to get
2243 * the runqueue lock when things look like they will 2243 * the runqueue lock when things look like they will
2244 * work out! 2244 * work out!
2245 */ 2245 */
2246 rq = task_rq(p); 2246 rq = task_rq(p);
2247 2247
2248 /* 2248 /*
2249 * If the task is actively running on another CPU 2249 * If the task is actively running on another CPU
2250 * still, just relax and busy-wait without holding 2250 * still, just relax and busy-wait without holding
2251 * any locks. 2251 * any locks.
2252 * 2252 *
2253 * NOTE! Since we don't hold any locks, it's not 2253 * NOTE! Since we don't hold any locks, it's not
2254 * even guaranteed that "rq" is still the right runqueue! 2254 * even guaranteed that "rq" is still the right runqueue!
2255 * But we don't care, since "task_running()" will 2255 * But we don't care, since "task_running()" will
2256 * return false if the runqueue has changed and p 2256 * return false if the runqueue has changed and p
2257 * is actually now running somewhere else! 2257 * is actually now running somewhere else!
2258 */ 2258 */
2259 while (task_running(rq, p)) { 2259 while (task_running(rq, p)) {
2260 if (match_state && unlikely(p->state != match_state)) 2260 if (match_state && unlikely(p->state != match_state))
2261 return 0; 2261 return 0;
2262 cpu_relax(); 2262 cpu_relax();
2263 } 2263 }
2264 2264
2265 /* 2265 /*
2266 * Ok, time to look more closely! We need the rq 2266 * Ok, time to look more closely! We need the rq
2267 * lock now, to be *sure*. If we're wrong, we'll 2267 * lock now, to be *sure*. If we're wrong, we'll
2268 * just go back and repeat. 2268 * just go back and repeat.
2269 */ 2269 */
2270 rq = task_rq_lock(p, &flags); 2270 rq = task_rq_lock(p, &flags);
2271 trace_sched_wait_task(p); 2271 trace_sched_wait_task(p);
2272 running = task_running(rq, p); 2272 running = task_running(rq, p);
2273 on_rq = p->on_rq; 2273 on_rq = p->on_rq;
2274 ncsw = 0; 2274 ncsw = 0;
2275 if (!match_state || p->state == match_state) 2275 if (!match_state || p->state == match_state)
2276 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2276 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2277 task_rq_unlock(rq, p, &flags); 2277 task_rq_unlock(rq, p, &flags);
2278 2278
2279 /* 2279 /*
2280 * If it changed from the expected state, bail out now. 2280 * If it changed from the expected state, bail out now.
2281 */ 2281 */
2282 if (unlikely(!ncsw)) 2282 if (unlikely(!ncsw))
2283 break; 2283 break;
2284 2284
2285 /* 2285 /*
2286 * Was it really running after all now that we 2286 * Was it really running after all now that we
2287 * checked with the proper locks actually held? 2287 * checked with the proper locks actually held?
2288 * 2288 *
2289 * Oops. Go back and try again. 2289 * Oops. Go back and try again.
2290 */ 2290 */
2291 if (unlikely(running)) { 2291 if (unlikely(running)) {
2292 cpu_relax(); 2292 cpu_relax();
2293 continue; 2293 continue;
2294 } 2294 }
2295 2295
2296 /* 2296 /*
2297 * It's not enough that it's not actively running, 2297 * It's not enough that it's not actively running,
2298 * it must be off the runqueue _entirely_, and not 2298 * it must be off the runqueue _entirely_, and not
2299 * preempted! 2299 * preempted!
2300 * 2300 *
2301 * So if it was still runnable (but just not actively 2301 * So if it was still runnable (but just not actively
2302 * running right now), it's preempted, and we should 2302 * running right now), it's preempted, and we should
2303 * yield - it could be a while. 2303 * yield - it could be a while.
2304 */ 2304 */
2305 if (unlikely(on_rq)) { 2305 if (unlikely(on_rq)) {
2306 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 2306 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2307 2307
2308 set_current_state(TASK_UNINTERRUPTIBLE); 2308 set_current_state(TASK_UNINTERRUPTIBLE);
2309 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 2309 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2310 continue; 2310 continue;
2311 } 2311 }
2312 2312
2313 /* 2313 /*
2314 * Ahh, all good. It wasn't running, and it wasn't 2314 * Ahh, all good. It wasn't running, and it wasn't
2315 * runnable, which means that it will never become 2315 * runnable, which means that it will never become
2316 * running in the future either. We're all done! 2316 * running in the future either. We're all done!
2317 */ 2317 */
2318 break; 2318 break;
2319 } 2319 }
2320 2320
2321 return ncsw; 2321 return ncsw;
2322 } 2322 }
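The return convention above ("sets MSB") deserves a concrete look: OR-ing in LONG_MIN guarantees a non-zero return even when nvcsw is still 0, so 0 can be reserved for "the task's state no longer matched". A tiny illustrative program:

#include <stdio.h>
#include <limits.h>

int main(void)
{
        /* Even a zero switch count becomes non-zero once the MSB is set. */
        unsigned long ncsw0 = 0UL | LONG_MIN;
        unsigned long ncsw5 = 5UL | LONG_MIN;

        printf("%#lx %#lx equal=%d\n", ncsw0, ncsw5, ncsw0 == ncsw5);
        return 0;
}

A caller that gets the same non-zero value from two calls a short while apart knows the task stayed off the runqueue in between, since the MSB cancels out in the comparison.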
2323 2323
2324 /*** 2324 /***
2325 * kick_process - kick a running thread to enter/exit the kernel 2325 * kick_process - kick a running thread to enter/exit the kernel
2326 * @p: the to-be-kicked thread 2326 * @p: the to-be-kicked thread
2327 * 2327 *
2328 * Cause a process which is running on another CPU to enter 2328 * Cause a process which is running on another CPU to enter
2329 * kernel-mode, without any delay. (to get signals handled.) 2329 * kernel-mode, without any delay. (to get signals handled.)
2330 * 2330 *
2331 * NOTE: this function doesn't have to take the runqueue lock, 2331 * NOTE: this function doesn't have to take the runqueue lock,
2332 * because all it wants to ensure is that the remote task enters 2332 * because all it wants to ensure is that the remote task enters
2333 * the kernel. If the IPI races and the task has been migrated 2333 * the kernel. If the IPI races and the task has been migrated
2334 * to another CPU then no harm is done and the purpose has been 2334 * to another CPU then no harm is done and the purpose has been
2335 * achieved as well. 2335 * achieved as well.
2336 */ 2336 */
2337 void kick_process(struct task_struct *p) 2337 void kick_process(struct task_struct *p)
2338 { 2338 {
2339 int cpu; 2339 int cpu;
2340 2340
2341 preempt_disable(); 2341 preempt_disable();
2342 cpu = task_cpu(p); 2342 cpu = task_cpu(p);
2343 if ((cpu != smp_processor_id()) && task_curr(p)) 2343 if ((cpu != smp_processor_id()) && task_curr(p))
2344 smp_send_reschedule(cpu); 2344 smp_send_reschedule(cpu);
2345 preempt_enable(); 2345 preempt_enable();
2346 } 2346 }
2347 EXPORT_SYMBOL_GPL(kick_process); 2347 EXPORT_SYMBOL_GPL(kick_process);
2348 #endif /* CONFIG_SMP */ 2348 #endif /* CONFIG_SMP */
2349 2349
2350 #ifdef CONFIG_SMP 2350 #ifdef CONFIG_SMP
2351 /* 2351 /*
2352 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 2352 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2353 */ 2353 */
2354 static int select_fallback_rq(int cpu, struct task_struct *p) 2354 static int select_fallback_rq(int cpu, struct task_struct *p)
2355 { 2355 {
2356 int dest_cpu; 2356 int dest_cpu;
2357 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 2357 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2358 2358
2359 /* Look for allowed, online CPU in same node. */ 2359 /* Look for allowed, online CPU in same node. */
2360 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2360 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2361 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2361 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2362 return dest_cpu; 2362 return dest_cpu;
2363 2363
2364 /* Any allowed, online CPU? */ 2364 /* Any allowed, online CPU? */
2365 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2365 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2366 if (dest_cpu < nr_cpu_ids) 2366 if (dest_cpu < nr_cpu_ids)
2367 return dest_cpu; 2367 return dest_cpu;
2368 2368
2369 /* No more Mr. Nice Guy. */ 2369 /* No more Mr. Nice Guy. */
2370 dest_cpu = cpuset_cpus_allowed_fallback(p); 2370 dest_cpu = cpuset_cpus_allowed_fallback(p);
2371 /* 2371 /*
2372 * Don't tell them about moving exiting tasks or 2372 * Don't tell them about moving exiting tasks or
2373 * kernel threads (both mm NULL), since they never 2373 * kernel threads (both mm NULL), since they never
2374 * leave the kernel. 2374 * leave the kernel.
2375 */ 2375 */
2376 if (p->mm && printk_ratelimit()) { 2376 if (p->mm && printk_ratelimit()) {
2377 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", 2377 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2378 task_pid_nr(p), p->comm, cpu); 2378 task_pid_nr(p), p->comm, cpu);
2379 } 2379 }
2380 2380
2381 return dest_cpu; 2381 return dest_cpu;
2382 } 2382 }
2383 2383
2384 /* 2384 /*
2385 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 2385 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2386 */ 2386 */
2387 static inline 2387 static inline
2388 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2388 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2389 { 2389 {
2390 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2390 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2391 2391
2392 /* 2392 /*
2393 * In order not to call set_task_cpu() on a blocking task we need 2393 * In order not to call set_task_cpu() on a blocking task we need
2394 * to rely on ttwu() to place the task on a valid ->cpus_allowed 2394 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2395 * cpu. 2395 * cpu.
2396 * 2396 *
2397 * Since this is common to all placement strategies, this lives here. 2397 * Since this is common to all placement strategies, this lives here.
2398 * 2398 *
2399 * [ this allows ->select_task_rq() to simply return task_cpu(p) and 2399 * [ this allows ->select_task_rq() to simply return task_cpu(p) and
2400 * not worry about this generic constraint ] 2400 * not worry about this generic constraint ]
2401 */ 2401 */
2402 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2402 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2403 !cpu_online(cpu))) 2403 !cpu_online(cpu)))
2404 cpu = select_fallback_rq(task_cpu(p), p); 2404 cpu = select_fallback_rq(task_cpu(p), p);
2405 2405
2406 return cpu; 2406 return cpu;
2407 } 2407 }
2408 2408
2409 static void update_avg(u64 *avg, u64 sample) 2409 static void update_avg(u64 *avg, u64 sample)
2410 { 2410 {
2411 s64 diff = sample - *avg; 2411 s64 diff = sample - *avg;
2412 *avg += diff >> 3; 2412 *avg += diff >> 3;
2413 } 2413 }
2414 #endif 2414 #endif
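
update_avg() is a fixed-weight exponential moving average: each sample pulls the average 1/8 of the way toward the new value (the diff >> 3). ttwu_do_wakeup() feeds it the measured idle period of the waking CPU, clamped to 2*sysctl_sched_migration_cost so that one very long idle stretch cannot inflate rq->avg_idle. A minimal userspace sketch of the same filter, with made-up sample values and an arithmetic right shift assumed for negative differences (as the kernel relies on):

    #include <stdint.h>
    #include <stdio.h>

    /* Same filter as update_avg(): avg += (sample - avg) / 8. */
    static void update_avg(uint64_t *avg, uint64_t sample)
    {
            int64_t diff = (int64_t)(sample - *avg);

            *avg += diff >> 3;      /* arithmetic shift assumed for negative diff */
    }

    int main(void)
    {
            /* Hypothetical idle periods, in nanoseconds. */
            uint64_t samples[] = { 500000, 500000, 100000, 100000, 100000 };
            uint64_t avg = 0;

            for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                    update_avg(&avg, samples[i]);
                    printf("sample=%8llu  avg=%8llu\n",
                           (unsigned long long)samples[i], (unsigned long long)avg);
            }
            return 0;
    }

With weight 1/8 the last handful of samples dominate, so rq->avg_idle tracks recent idle behaviour while smoothing out single outliers.
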
2415 2415
2416 static void 2416 static void
2417 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 2417 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2418 { 2418 {
2419 #ifdef CONFIG_SCHEDSTATS 2419 #ifdef CONFIG_SCHEDSTATS
2420 struct rq *rq = this_rq(); 2420 struct rq *rq = this_rq();
2421 2421
2422 #ifdef CONFIG_SMP 2422 #ifdef CONFIG_SMP
2423 int this_cpu = smp_processor_id(); 2423 int this_cpu = smp_processor_id();
2424 2424
2425 if (cpu == this_cpu) { 2425 if (cpu == this_cpu) {
2426 schedstat_inc(rq, ttwu_local); 2426 schedstat_inc(rq, ttwu_local);
2427 schedstat_inc(p, se.statistics.nr_wakeups_local); 2427 schedstat_inc(p, se.statistics.nr_wakeups_local);
2428 } else { 2428 } else {
2429 struct sched_domain *sd; 2429 struct sched_domain *sd;
2430 2430
2431 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2431 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2432 for_each_domain(this_cpu, sd) { 2432 for_each_domain(this_cpu, sd) {
2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2434 schedstat_inc(sd, ttwu_wake_remote); 2434 schedstat_inc(sd, ttwu_wake_remote);
2435 break; 2435 break;
2436 } 2436 }
2437 } 2437 }
2438 } 2438 }
2439 #endif /* CONFIG_SMP */ 2439 #endif /* CONFIG_SMP */
2440 2440
2441 schedstat_inc(rq, ttwu_count); 2441 schedstat_inc(rq, ttwu_count);
2442 schedstat_inc(p, se.statistics.nr_wakeups); 2442 schedstat_inc(p, se.statistics.nr_wakeups);
2443 2443
2444 if (wake_flags & WF_SYNC) 2444 if (wake_flags & WF_SYNC)
2445 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2445 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2446 2446
2447 if (cpu != task_cpu(p)) 2447 if (cpu != task_cpu(p))
2448 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2448 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2449 2449
2450 #endif /* CONFIG_SCHEDSTATS */ 2450 #endif /* CONFIG_SCHEDSTATS */
2451 } 2451 }
2452 2452
2453 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 2453 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2454 { 2454 {
2455 activate_task(rq, p, en_flags); 2455 activate_task(rq, p, en_flags);
2456 p->on_rq = 1; 2456 p->on_rq = 1;
2457 2457
2458 /* if a worker is waking up, notify workqueue */ 2458 /* if a worker is waking up, notify workqueue */
2459 if (p->flags & PF_WQ_WORKER) 2459 if (p->flags & PF_WQ_WORKER)
2460 wq_worker_waking_up(p, cpu_of(rq)); 2460 wq_worker_waking_up(p, cpu_of(rq));
2461 } 2461 }
2462 2462
2463 /* 2463 /*
2464 * Mark the task runnable and perform wakeup-preemption. 2464 * Mark the task runnable and perform wakeup-preemption.
2465 */ 2465 */
2466 static void 2466 static void
2467 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 2467 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2468 { 2468 {
2469 trace_sched_wakeup(p, true); 2469 trace_sched_wakeup(p, true);
2470 check_preempt_curr(rq, p, wake_flags); 2470 check_preempt_curr(rq, p, wake_flags);
2471 2471
2472 p->state = TASK_RUNNING; 2472 p->state = TASK_RUNNING;
2473 #ifdef CONFIG_SMP 2473 #ifdef CONFIG_SMP
2474 if (p->sched_class->task_woken) 2474 if (p->sched_class->task_woken)
2475 p->sched_class->task_woken(rq, p); 2475 p->sched_class->task_woken(rq, p);
2476 2476
2477 if (unlikely(rq->idle_stamp)) { 2477 if (unlikely(rq->idle_stamp)) {
2478 u64 delta = rq->clock - rq->idle_stamp; 2478 u64 delta = rq->clock - rq->idle_stamp;
2479 u64 max = 2*sysctl_sched_migration_cost; 2479 u64 max = 2*sysctl_sched_migration_cost;
2480 2480
2481 if (delta > max) 2481 if (delta > max)
2482 rq->avg_idle = max; 2482 rq->avg_idle = max;
2483 else 2483 else
2484 update_avg(&rq->avg_idle, delta); 2484 update_avg(&rq->avg_idle, delta);
2485 rq->idle_stamp = 0; 2485 rq->idle_stamp = 0;
2486 } 2486 }
2487 #endif 2487 #endif
2488 } 2488 }
2489 2489
2490 static void 2490 static void
2491 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 2491 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2492 { 2492 {
2493 #ifdef CONFIG_SMP 2493 #ifdef CONFIG_SMP
2494 if (p->sched_contributes_to_load) 2494 if (p->sched_contributes_to_load)
2495 rq->nr_uninterruptible--; 2495 rq->nr_uninterruptible--;
2496 #endif 2496 #endif
2497 2497
2498 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 2498 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2499 ttwu_do_wakeup(rq, p, wake_flags); 2499 ttwu_do_wakeup(rq, p, wake_flags);
2500 } 2500 }
2501 2501
2502 /* 2502 /*
2503 * Called in case the task @p isn't fully descheduled from its runqueue, 2503 * Called in case the task @p isn't fully descheduled from its runqueue,
2504 * in this case we must do a remote wakeup. It's a 'light' wakeup though, 2504 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2505 * since all we need to do is flip p->state to TASK_RUNNING; 2505 * since all we need to do is flip p->state to TASK_RUNNING;
2506 * the task is still ->on_rq. 2506 * the task is still ->on_rq.
2507 */ 2507 */
2508 static int ttwu_remote(struct task_struct *p, int wake_flags) 2508 static int ttwu_remote(struct task_struct *p, int wake_flags)
2509 { 2509 {
2510 struct rq *rq; 2510 struct rq *rq;
2511 int ret = 0; 2511 int ret = 0;
2512 2512
2513 rq = __task_rq_lock(p); 2513 rq = __task_rq_lock(p);
2514 if (p->on_rq) { 2514 if (p->on_rq) {
2515 ttwu_do_wakeup(rq, p, wake_flags); 2515 ttwu_do_wakeup(rq, p, wake_flags);
2516 ret = 1; 2516 ret = 1;
2517 } 2517 }
2518 __task_rq_unlock(rq); 2518 __task_rq_unlock(rq);
2519 2519
2520 return ret; 2520 return ret;
2521 } 2521 }
2522 2522
2523 #ifdef CONFIG_SMP 2523 #ifdef CONFIG_SMP
2524 static void sched_ttwu_pending(void) 2524 static void sched_ttwu_pending(void)
2525 { 2525 {
2526 struct rq *rq = this_rq(); 2526 struct rq *rq = this_rq();
2527 struct task_struct *list = xchg(&rq->wake_list, NULL); 2527 struct task_struct *list = xchg(&rq->wake_list, NULL);
2528 2528
2529 if (!list) 2529 if (!list)
2530 return; 2530 return;
2531 2531
2532 raw_spin_lock(&rq->lock); 2532 raw_spin_lock(&rq->lock);
2533 2533
2534 while (list) { 2534 while (list) {
2535 struct task_struct *p = list; 2535 struct task_struct *p = list;
2536 list = list->wake_entry; 2536 list = list->wake_entry;
2537 ttwu_do_activate(rq, p, 0); 2537 ttwu_do_activate(rq, p, 0);
2538 } 2538 }
2539 2539
2540 raw_spin_unlock(&rq->lock); 2540 raw_spin_unlock(&rq->lock);
2541 } 2541 }
2542 2542
2543 void scheduler_ipi(void) 2543 void scheduler_ipi(void)
2544 { 2544 {
2545 sched_ttwu_pending(); 2545 sched_ttwu_pending();
2546 } 2546 }
2547 2547
2548 static void ttwu_queue_remote(struct task_struct *p, int cpu) 2548 static void ttwu_queue_remote(struct task_struct *p, int cpu)
2549 { 2549 {
2550 struct rq *rq = cpu_rq(cpu); 2550 struct rq *rq = cpu_rq(cpu);
2551 struct task_struct *next = rq->wake_list; 2551 struct task_struct *next = rq->wake_list;
2552 2552
2553 for (;;) { 2553 for (;;) {
2554 struct task_struct *old = next; 2554 struct task_struct *old = next;
2555 2555
2556 p->wake_entry = next; 2556 p->wake_entry = next;
2557 next = cmpxchg(&rq->wake_list, old, p); 2557 next = cmpxchg(&rq->wake_list, old, p);
2558 if (next == old) 2558 if (next == old)
2559 break; 2559 break;
2560 } 2560 }
2561 2561
2562 if (!next) 2562 if (!next)
2563 smp_send_reschedule(cpu); 2563 smp_send_reschedule(cpu);
2564 } 2564 }
2565 #endif 2565 #endif
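
The wake_list machinery above is a lock-free multi-producer/single-consumer list: ttwu_queue_remote() pushes the task onto rq->wake_list with cmpxchg() and raises the reschedule IPI only when the list goes from empty to non-empty, while scheduler_ipi() drains the whole chain with a single xchg() in sched_ttwu_pending(). A self-contained sketch of the same push/drain pattern using the GCC/Clang __atomic builtins; the node type and function names are illustrative, not the kernel's:

    #include <stdbool.h>
    #include <stddef.h>

    struct node {
            struct node *next;
            int payload;
    };

    static struct node *wake_list;  /* head of the pending list, NULL when empty */

    /* Push one node; returns true if the list was empty (caller "sends the IPI"). */
    static bool push(struct node *n)
    {
            struct node *old = __atomic_load_n(&wake_list, __ATOMIC_RELAXED);

            do {
                    n->next = old;  /* old is refreshed on a failed cmpxchg */
            } while (!__atomic_compare_exchange_n(&wake_list, &old, n, false,
                                                  __ATOMIC_RELEASE, __ATOMIC_RELAXED));
            return old == NULL;
    }

    /* Drain everything queued so far; this is the "IPI handler" side. */
    static void drain(void (*activate)(struct node *))
    {
            struct node *n = __atomic_exchange_n(&wake_list, NULL, __ATOMIC_ACQUIRE);

            while (n) {
                    struct node *next = n->next;

                    activate(n);
                    n = next;
            }
    }

Because only the empty-to-non-empty transition sends smp_send_reschedule(), a burst of remote wakeups aimed at one CPU costs a single interrupt, and sched_ttwu_pending() then activates every queued task under one rq->lock acquisition.
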
2566 2566
2567 static void ttwu_queue(struct task_struct *p, int cpu) 2567 static void ttwu_queue(struct task_struct *p, int cpu)
2568 { 2568 {
2569 struct rq *rq = cpu_rq(cpu); 2569 struct rq *rq = cpu_rq(cpu);
2570 2570
2571 #if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) 2571 #if defined(CONFIG_SMP)
2572 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 2572 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2573 ttwu_queue_remote(p, cpu); 2573 ttwu_queue_remote(p, cpu);
2574 return; 2574 return;
2575 } 2575 }
2576 #endif 2576 #endif
2577 2577
2578 raw_spin_lock(&rq->lock); 2578 raw_spin_lock(&rq->lock);
2579 ttwu_do_activate(rq, p, 0); 2579 ttwu_do_activate(rq, p, 0);
2580 raw_spin_unlock(&rq->lock); 2580 raw_spin_unlock(&rq->lock);
2581 } 2581 }
2582 2582
2583 /** 2583 /**
2584 * try_to_wake_up - wake up a thread 2584 * try_to_wake_up - wake up a thread
2585 * @p: the thread to be awakened 2585 * @p: the thread to be awakened
2586 * @state: the mask of task states that can be woken 2586 * @state: the mask of task states that can be woken
2587 * @wake_flags: wake modifier flags (WF_*) 2587 * @wake_flags: wake modifier flags (WF_*)
2588 * 2588 *
2589 * Put it on the run-queue if it's not already there. The "current" 2589 * Put it on the run-queue if it's not already there. The "current"
2590 * thread is always on the run-queue (except when the actual 2590 * thread is always on the run-queue (except when the actual
2591 * re-schedule is in progress), and as such you're allowed to do 2591 * re-schedule is in progress), and as such you're allowed to do
2592 * the simpler "current->state = TASK_RUNNING" to mark yourself 2592 * the simpler "current->state = TASK_RUNNING" to mark yourself
2593 * runnable without the overhead of this. 2593 * runnable without the overhead of this.
2594 * 2594 *
2595 * Returns %true if @p was woken up, %false if it was already running 2595 * Returns %true if @p was woken up, %false if it was already running
2596 * or @state didn't match @p's state. 2596 * or @state didn't match @p's state.
2597 */ 2597 */
2598 static int 2598 static int
2599 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 2599 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2600 { 2600 {
2601 unsigned long flags; 2601 unsigned long flags;
2602 int cpu, success = 0; 2602 int cpu, success = 0;
2603 2603
2604 smp_wmb(); 2604 smp_wmb();
2605 raw_spin_lock_irqsave(&p->pi_lock, flags); 2605 raw_spin_lock_irqsave(&p->pi_lock, flags);
2606 if (!(p->state & state)) 2606 if (!(p->state & state))
2607 goto out; 2607 goto out;
2608 2608
2609 success = 1; /* we're going to change ->state */ 2609 success = 1; /* we're going to change ->state */
2610 cpu = task_cpu(p); 2610 cpu = task_cpu(p);
2611 2611
2612 if (p->on_rq && ttwu_remote(p, wake_flags)) 2612 if (p->on_rq && ttwu_remote(p, wake_flags))
2613 goto stat; 2613 goto stat;
2614 2614
2615 #ifdef CONFIG_SMP 2615 #ifdef CONFIG_SMP
2616 /* 2616 /*
2617 * If the owning (remote) cpu is still in the middle of schedule() with 2617 * If the owning (remote) cpu is still in the middle of schedule() with
2618 * this task as prev, wait until it's done referencing the task. 2618 * this task as prev, wait until it's done referencing the task.
2619 */ 2619 */
2620 while (p->on_cpu) { 2620 while (p->on_cpu) {
2621 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2621 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2622 /* 2622 /*
2623 * If called from interrupt context we could have landed in the 2623 * If called from interrupt context we could have landed in the
2624 * middle of schedule(); in this case we should take care not 2624 * middle of schedule(); in this case we should take care not
2625 * to spin on ->on_cpu if p is current, since that would 2625 * to spin on ->on_cpu if p is current, since that would
2626 * deadlock. 2626 * deadlock.
2627 */ 2627 */
2628 if (p == current) { 2628 if (p == current) {
2629 ttwu_queue(p, cpu); 2629 ttwu_queue(p, cpu);
2630 goto stat; 2630 goto stat;
2631 } 2631 }
2632 #endif 2632 #endif
2633 cpu_relax(); 2633 cpu_relax();
2634 } 2634 }
2635 /* 2635 /*
2636 * Pairs with the smp_wmb() in finish_lock_switch(). 2636 * Pairs with the smp_wmb() in finish_lock_switch().
2637 */ 2637 */
2638 smp_rmb(); 2638 smp_rmb();
2639 2639
2640 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2640 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2641 p->state = TASK_WAKING; 2641 p->state = TASK_WAKING;
2642 2642
2643 if (p->sched_class->task_waking) 2643 if (p->sched_class->task_waking)
2644 p->sched_class->task_waking(p); 2644 p->sched_class->task_waking(p);
2645 2645
2646 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2646 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2647 if (task_cpu(p) != cpu) 2647 if (task_cpu(p) != cpu)
2648 set_task_cpu(p, cpu); 2648 set_task_cpu(p, cpu);
2649 #endif /* CONFIG_SMP */ 2649 #endif /* CONFIG_SMP */
2650 2650
2651 ttwu_queue(p, cpu); 2651 ttwu_queue(p, cpu);
2652 stat: 2652 stat:
2653 ttwu_stat(p, cpu, wake_flags); 2653 ttwu_stat(p, cpu, wake_flags);
2654 out: 2654 out:
2655 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2655 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2656 2656
2657 return success; 2657 return success;
2658 } 2658 }
2659 2659
2660 /** 2660 /**
2661 * try_to_wake_up_local - try to wake up a local task with rq lock held 2661 * try_to_wake_up_local - try to wake up a local task with rq lock held
2662 * @p: the thread to be awakened 2662 * @p: the thread to be awakened
2663 * 2663 *
2664 * Put @p on the run-queue if it's not already there. The caller must 2664 * Put @p on the run-queue if it's not already there. The caller must
2665 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2665 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2666 * the current task. 2666 * the current task.
2667 */ 2667 */
2668 static void try_to_wake_up_local(struct task_struct *p) 2668 static void try_to_wake_up_local(struct task_struct *p)
2669 { 2669 {
2670 struct rq *rq = task_rq(p); 2670 struct rq *rq = task_rq(p);
2671 2671
2672 BUG_ON(rq != this_rq()); 2672 BUG_ON(rq != this_rq());
2673 BUG_ON(p == current); 2673 BUG_ON(p == current);
2674 lockdep_assert_held(&rq->lock); 2674 lockdep_assert_held(&rq->lock);
2675 2675
2676 if (!raw_spin_trylock(&p->pi_lock)) { 2676 if (!raw_spin_trylock(&p->pi_lock)) {
2677 raw_spin_unlock(&rq->lock); 2677 raw_spin_unlock(&rq->lock);
2678 raw_spin_lock(&p->pi_lock); 2678 raw_spin_lock(&p->pi_lock);
2679 raw_spin_lock(&rq->lock); 2679 raw_spin_lock(&rq->lock);
2680 } 2680 }
2681 2681
2682 if (!(p->state & TASK_NORMAL)) 2682 if (!(p->state & TASK_NORMAL))
2683 goto out; 2683 goto out;
2684 2684
2685 if (!p->on_rq) 2685 if (!p->on_rq)
2686 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2686 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2687 2687
2688 ttwu_do_wakeup(rq, p, 0); 2688 ttwu_do_wakeup(rq, p, 0);
2689 ttwu_stat(p, smp_processor_id(), 0); 2689 ttwu_stat(p, smp_processor_id(), 0);
2690 out: 2690 out:
2691 raw_spin_unlock(&p->pi_lock); 2691 raw_spin_unlock(&p->pi_lock);
2692 } 2692 }
2693 2693
2694 /** 2694 /**
2695 * wake_up_process - Wake up a specific process 2695 * wake_up_process - Wake up a specific process
2696 * @p: The process to be woken up. 2696 * @p: The process to be woken up.
2697 * 2697 *
2698 * Attempt to wake up the nominated process and move it to the set of runnable 2698 * Attempt to wake up the nominated process and move it to the set of runnable
2699 * processes. Returns 1 if the process was woken up, 0 if it was already 2699 * processes. Returns 1 if the process was woken up, 0 if it was already
2700 * running. 2700 * running.
2701 * 2701 *
2702 * It may be assumed that this function implies a write memory barrier before 2702 * It may be assumed that this function implies a write memory barrier before
2703 * changing the task state if and only if any tasks are woken up. 2703 * changing the task state if and only if any tasks are woken up.
2704 */ 2704 */
2705 int wake_up_process(struct task_struct *p) 2705 int wake_up_process(struct task_struct *p)
2706 { 2706 {
2707 return try_to_wake_up(p, TASK_ALL, 0); 2707 return try_to_wake_up(p, TASK_ALL, 0);
2708 } 2708 }
2709 EXPORT_SYMBOL(wake_up_process); 2709 EXPORT_SYMBOL(wake_up_process);
2710 2710
2711 int wake_up_state(struct task_struct *p, unsigned int state) 2711 int wake_up_state(struct task_struct *p, unsigned int state)
2712 { 2712 {
2713 return try_to_wake_up(p, state, 0); 2713 return try_to_wake_up(p, state, 0);
2714 } 2714 }
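
As a usage reference, the canonical sleeper/waker pairing built on wake_up_process() looks like the sketch below: the sleeper publishes its state with set_current_state() before testing the condition, and the waker sets the condition before waking. The context struct, its fields and do_work() are hypothetical; only the scheduler calls are real, and a real driver would protect work_pending properly rather than rely on a plain bool.

    #include <linux/kthread.h>
    #include <linux/sched.h>

    struct my_ctx {                         /* hypothetical example state */
            struct task_struct *worker;
            bool work_pending;
    };

    static void do_work(struct my_ctx *ctx)
    {
            /* ... consume the work (hypothetical) ... */
    }

    /* Sleeper: a kthread that waits for work_pending to become true. */
    static int worker_fn(void *data)
    {
            struct my_ctx *ctx = data;

            while (!kthread_should_stop()) {
                    set_current_state(TASK_INTERRUPTIBLE);
                    if (!ctx->work_pending)
                            schedule();             /* sleep until woken */
                    __set_current_state(TASK_RUNNING);

                    if (ctx->work_pending) {
                            ctx->work_pending = false;
                            do_work(ctx);
                    }
            }
            return 0;
    }

    /* Waker: set the condition, then wake the thread. */
    static void kick_worker(struct my_ctx *ctx)
    {
            ctx->work_pending = true;
            wake_up_process(ctx->worker);
    }

Because set_current_state() contains a memory barrier and try_to_wake_up() re-checks p->state under p->pi_lock, either the sleeper sees work_pending before it sleeps or the waker sees a sleeping state and wakes it; the wakeup cannot be lost.
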
2715 2715
2716 /* 2716 /*
2717 * Perform scheduler related setup for a newly forked process p. 2717 * Perform scheduler related setup for a newly forked process p.
2718 * p is forked by current. 2718 * p is forked by current.
2719 * 2719 *
2720 * __sched_fork() is basic setup used by init_idle() too: 2720 * __sched_fork() is basic setup used by init_idle() too:
2721 */ 2721 */
2722 static void __sched_fork(struct task_struct *p) 2722 static void __sched_fork(struct task_struct *p)
2723 { 2723 {
2724 p->on_rq = 0; 2724 p->on_rq = 0;
2725 2725
2726 p->se.on_rq = 0; 2726 p->se.on_rq = 0;
2727 p->se.exec_start = 0; 2727 p->se.exec_start = 0;
2728 p->se.sum_exec_runtime = 0; 2728 p->se.sum_exec_runtime = 0;
2729 p->se.prev_sum_exec_runtime = 0; 2729 p->se.prev_sum_exec_runtime = 0;
2730 p->se.nr_migrations = 0; 2730 p->se.nr_migrations = 0;
2731 p->se.vruntime = 0; 2731 p->se.vruntime = 0;
2732 INIT_LIST_HEAD(&p->se.group_node); 2732 INIT_LIST_HEAD(&p->se.group_node);
2733 2733
2734 #ifdef CONFIG_SCHEDSTATS 2734 #ifdef CONFIG_SCHEDSTATS
2735 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2735 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2736 #endif 2736 #endif
2737 2737
2738 INIT_LIST_HEAD(&p->rt.run_list); 2738 INIT_LIST_HEAD(&p->rt.run_list);
2739 2739
2740 #ifdef CONFIG_PREEMPT_NOTIFIERS 2740 #ifdef CONFIG_PREEMPT_NOTIFIERS
2741 INIT_HLIST_HEAD(&p->preempt_notifiers); 2741 INIT_HLIST_HEAD(&p->preempt_notifiers);
2742 #endif 2742 #endif
2743 } 2743 }
2744 2744
2745 /* 2745 /*
2746 * fork()/clone()-time setup: 2746 * fork()/clone()-time setup:
2747 */ 2747 */
2748 void sched_fork(struct task_struct *p, int clone_flags) 2748 void sched_fork(struct task_struct *p, int clone_flags)
2749 { 2749 {
2750 unsigned long flags; 2750 unsigned long flags;
2751 int cpu = get_cpu(); 2751 int cpu = get_cpu();
2752 2752
2753 __sched_fork(p); 2753 __sched_fork(p);
2754 /* 2754 /*
2755 * We mark the process as running here. This guarantees that 2755 * We mark the process as running here. This guarantees that
2756 * nobody will actually run it, and a signal or other external 2756 * nobody will actually run it, and a signal or other external
2757 * event cannot wake it up and insert it on the runqueue either. 2757 * event cannot wake it up and insert it on the runqueue either.
2758 */ 2758 */
2759 p->state = TASK_RUNNING; 2759 p->state = TASK_RUNNING;
2760 2760
2761 /* 2761 /*
2762 * Revert to default priority/policy on fork if requested. 2762 * Revert to default priority/policy on fork if requested.
2763 */ 2763 */
2764 if (unlikely(p->sched_reset_on_fork)) { 2764 if (unlikely(p->sched_reset_on_fork)) {
2765 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2765 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2766 p->policy = SCHED_NORMAL; 2766 p->policy = SCHED_NORMAL;
2767 p->normal_prio = p->static_prio; 2767 p->normal_prio = p->static_prio;
2768 } 2768 }
2769 2769
2770 if (PRIO_TO_NICE(p->static_prio) < 0) { 2770 if (PRIO_TO_NICE(p->static_prio) < 0) {
2771 p->static_prio = NICE_TO_PRIO(0); 2771 p->static_prio = NICE_TO_PRIO(0);
2772 p->normal_prio = p->static_prio; 2772 p->normal_prio = p->static_prio;
2773 set_load_weight(p); 2773 set_load_weight(p);
2774 } 2774 }
2775 2775
2776 /* 2776 /*
2777 * We don't need the reset flag anymore after the fork. It has 2777 * We don't need the reset flag anymore after the fork. It has
2778 * fulfilled its duty: 2778 * fulfilled its duty:
2779 */ 2779 */
2780 p->sched_reset_on_fork = 0; 2780 p->sched_reset_on_fork = 0;
2781 } 2781 }
2782 2782
2783 /* 2783 /*
2784 * Make sure we do not leak PI boosting priority to the child. 2784 * Make sure we do not leak PI boosting priority to the child.
2785 */ 2785 */
2786 p->prio = current->normal_prio; 2786 p->prio = current->normal_prio;
2787 2787
2788 if (!rt_prio(p->prio)) 2788 if (!rt_prio(p->prio))
2789 p->sched_class = &fair_sched_class; 2789 p->sched_class = &fair_sched_class;
2790 2790
2791 if (p->sched_class->task_fork) 2791 if (p->sched_class->task_fork)
2792 p->sched_class->task_fork(p); 2792 p->sched_class->task_fork(p);
2793 2793
2794 /* 2794 /*
2795 * The child is not yet in the pid-hash so no cgroup attach races, 2795 * The child is not yet in the pid-hash so no cgroup attach races,
2796 * and the cgroup is pinned to this child because cgroup_fork() 2796 * and the cgroup is pinned to this child because cgroup_fork()
2797 * is run before sched_fork(). 2797 * is run before sched_fork().
2798 * 2798 *
2799 * Silence PROVE_RCU. 2799 * Silence PROVE_RCU.
2800 */ 2800 */
2801 raw_spin_lock_irqsave(&p->pi_lock, flags); 2801 raw_spin_lock_irqsave(&p->pi_lock, flags);
2802 set_task_cpu(p, cpu); 2802 set_task_cpu(p, cpu);
2803 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2803 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2804 2804
2805 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2805 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2806 if (likely(sched_info_on())) 2806 if (likely(sched_info_on()))
2807 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2807 memset(&p->sched_info, 0, sizeof(p->sched_info));
2808 #endif 2808 #endif
2809 #if defined(CONFIG_SMP) 2809 #if defined(CONFIG_SMP)
2810 p->on_cpu = 0; 2810 p->on_cpu = 0;
2811 #endif 2811 #endif
2812 #ifdef CONFIG_PREEMPT 2812 #ifdef CONFIG_PREEMPT
2813 /* Want to start with kernel preemption disabled. */ 2813 /* Want to start with kernel preemption disabled. */
2814 task_thread_info(p)->preempt_count = 1; 2814 task_thread_info(p)->preempt_count = 1;
2815 #endif 2815 #endif
2816 #ifdef CONFIG_SMP 2816 #ifdef CONFIG_SMP
2817 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2817 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2818 #endif 2818 #endif
2819 2819
2820 put_cpu(); 2820 put_cpu();
2821 } 2821 }
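
The sched_reset_on_fork branch above leans on the standard nice-to-priority mapping; the macros are reproduced here only to make the PRIO_TO_NICE(p->static_prio) < 0 test concrete (values as defined in this kernel's scheduler headers):

    #define MAX_RT_PRIO             100
    #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
    #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)

    /*
     * Nice -20..19 maps to static_prio 100..139, with nice 0 at 120.
     * PRIO_TO_NICE(p->static_prio) < 0 therefore means "the parent ran at a
     * raised (negative-nice) priority", and the reset clamps the child back
     * to NICE_TO_PRIO(0) == 120, the default time-share priority.
     */
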
2822 2822
2823 /* 2823 /*
2824 * wake_up_new_task - wake up a newly created task for the first time. 2824 * wake_up_new_task - wake up a newly created task for the first time.
2825 * 2825 *
2826 * This function will do some initial scheduler statistics housekeeping 2826 * This function will do some initial scheduler statistics housekeeping
2827 * that must be done for every newly created context, then puts the task 2827 * that must be done for every newly created context, then puts the task
2828 * on the runqueue and wakes it. 2828 * on the runqueue and wakes it.
2829 */ 2829 */
2830 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2830 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2831 { 2831 {
2832 unsigned long flags; 2832 unsigned long flags;
2833 struct rq *rq; 2833 struct rq *rq;
2834 2834
2835 raw_spin_lock_irqsave(&p->pi_lock, flags); 2835 raw_spin_lock_irqsave(&p->pi_lock, flags);
2836 #ifdef CONFIG_SMP 2836 #ifdef CONFIG_SMP
2837 /* 2837 /*
2838 * Fork balancing, do it here and not earlier because: 2838 * Fork balancing, do it here and not earlier because:
2839 * - cpus_allowed can change in the fork path 2839 * - cpus_allowed can change in the fork path
2840 * - any previously selected cpu might disappear through hotplug 2840 * - any previously selected cpu might disappear through hotplug
2841 */ 2841 */
2842 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 2842 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2843 #endif 2843 #endif
2844 2844
2845 rq = __task_rq_lock(p); 2845 rq = __task_rq_lock(p);
2846 activate_task(rq, p, 0); 2846 activate_task(rq, p, 0);
2847 p->on_rq = 1; 2847 p->on_rq = 1;
2848 trace_sched_wakeup_new(p, true); 2848 trace_sched_wakeup_new(p, true);
2849 check_preempt_curr(rq, p, WF_FORK); 2849 check_preempt_curr(rq, p, WF_FORK);
2850 #ifdef CONFIG_SMP 2850 #ifdef CONFIG_SMP
2851 if (p->sched_class->task_woken) 2851 if (p->sched_class->task_woken)
2852 p->sched_class->task_woken(rq, p); 2852 p->sched_class->task_woken(rq, p);
2853 #endif 2853 #endif
2854 task_rq_unlock(rq, p, &flags); 2854 task_rq_unlock(rq, p, &flags);
2855 } 2855 }
2856 2856
2857 #ifdef CONFIG_PREEMPT_NOTIFIERS 2857 #ifdef CONFIG_PREEMPT_NOTIFIERS
2858 2858
2859 /** 2859 /**
2860 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2860 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2861 * @notifier: notifier struct to register 2861 * @notifier: notifier struct to register
2862 */ 2862 */
2863 void preempt_notifier_register(struct preempt_notifier *notifier) 2863 void preempt_notifier_register(struct preempt_notifier *notifier)
2864 { 2864 {
2865 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2865 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2866 } 2866 }
2867 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2867 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2868 2868
2869 /** 2869 /**
2870 * preempt_notifier_unregister - no longer interested in preemption notifications 2870 * preempt_notifier_unregister - no longer interested in preemption notifications
2871 * @notifier: notifier struct to unregister 2871 * @notifier: notifier struct to unregister
2872 * 2872 *
2873 * This is safe to call from within a preemption notifier. 2873 * This is safe to call from within a preemption notifier.
2874 */ 2874 */
2875 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2875 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2876 { 2876 {
2877 hlist_del(&notifier->link); 2877 hlist_del(&notifier->link);
2878 } 2878 }
2879 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2879 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2880 2880
2881 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2881 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2882 { 2882 {
2883 struct preempt_notifier *notifier; 2883 struct preempt_notifier *notifier;
2884 struct hlist_node *node; 2884 struct hlist_node *node;
2885 2885
2886 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2886 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2887 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2887 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2888 } 2888 }
2889 2889
2890 static void 2890 static void
2891 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2891 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2892 struct task_struct *next) 2892 struct task_struct *next)
2893 { 2893 {
2894 struct preempt_notifier *notifier; 2894 struct preempt_notifier *notifier;
2895 struct hlist_node *node; 2895 struct hlist_node *node;
2896 2896
2897 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2897 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2898 notifier->ops->sched_out(notifier, next); 2898 notifier->ops->sched_out(notifier, next);
2899 } 2899 }
2900 2900
2901 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2901 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2902 2902
2903 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2903 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2904 { 2904 {
2905 } 2905 }
2906 2906
2907 static void 2907 static void
2908 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2908 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2909 struct task_struct *next) 2909 struct task_struct *next)
2910 { 2910 {
2911 } 2911 }
2912 2912
2913 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2913 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2914 2914
2915 /** 2915 /**
2916 * prepare_task_switch - prepare to switch tasks 2916 * prepare_task_switch - prepare to switch tasks
2917 * @rq: the runqueue preparing to switch 2917 * @rq: the runqueue preparing to switch
2918 * @prev: the current task that is being switched out 2918 * @prev: the current task that is being switched out
2919 * @next: the task we are going to switch to. 2919 * @next: the task we are going to switch to.
2920 * 2920 *
2921 * This is called with the rq lock held and interrupts off. It must 2921 * This is called with the rq lock held and interrupts off. It must
2922 * be paired with a subsequent finish_task_switch after the context 2922 * be paired with a subsequent finish_task_switch after the context
2923 * switch. 2923 * switch.
2924 * 2924 *
2925 * prepare_task_switch sets up locking and calls architecture specific 2925 * prepare_task_switch sets up locking and calls architecture specific
2926 * hooks. 2926 * hooks.
2927 */ 2927 */
2928 static inline void 2928 static inline void
2929 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2929 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2930 struct task_struct *next) 2930 struct task_struct *next)
2931 { 2931 {
2932 sched_info_switch(prev, next); 2932 sched_info_switch(prev, next);
2933 perf_event_task_sched_out(prev, next); 2933 perf_event_task_sched_out(prev, next);
2934 fire_sched_out_preempt_notifiers(prev, next); 2934 fire_sched_out_preempt_notifiers(prev, next);
2935 prepare_lock_switch(rq, next); 2935 prepare_lock_switch(rq, next);
2936 prepare_arch_switch(next); 2936 prepare_arch_switch(next);
2937 trace_sched_switch(prev, next); 2937 trace_sched_switch(prev, next);
2938 } 2938 }
2939 2939
2940 /** 2940 /**
2941 * finish_task_switch - clean up after a task-switch 2941 * finish_task_switch - clean up after a task-switch
2942 * @rq: runqueue associated with task-switch 2942 * @rq: runqueue associated with task-switch
2943 * @prev: the thread we just switched away from. 2943 * @prev: the thread we just switched away from.
2944 * 2944 *
2945 * finish_task_switch must be called after the context switch, paired 2945 * finish_task_switch must be called after the context switch, paired
2946 * with a prepare_task_switch call before the context switch. 2946 * with a prepare_task_switch call before the context switch.
2947 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2947 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2948 * and do any other architecture-specific cleanup actions. 2948 * and do any other architecture-specific cleanup actions.
2949 * 2949 *
2950 * Note that we may have delayed dropping an mm in context_switch(). If 2950 * Note that we may have delayed dropping an mm in context_switch(). If
2951 * so, we finish that here outside of the runqueue lock. (Doing it 2951 * so, we finish that here outside of the runqueue lock. (Doing it
2952 * with the lock held can cause deadlocks; see schedule() for 2952 * with the lock held can cause deadlocks; see schedule() for
2953 * details.) 2953 * details.)
2954 */ 2954 */
2955 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2955 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2956 __releases(rq->lock) 2956 __releases(rq->lock)
2957 { 2957 {
2958 struct mm_struct *mm = rq->prev_mm; 2958 struct mm_struct *mm = rq->prev_mm;
2959 long prev_state; 2959 long prev_state;
2960 2960
2961 rq->prev_mm = NULL; 2961 rq->prev_mm = NULL;
2962 2962
2963 /* 2963 /*
2964 * A task struct has one reference for its use as "current". 2964 * A task struct has one reference for its use as "current".
2965 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2965 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2966 * schedule one last time. The schedule call will never return, and 2966 * schedule one last time. The schedule call will never return, and
2967 * the scheduled task must drop that reference. 2967 * the scheduled task must drop that reference.
2968 * The test for TASK_DEAD must occur while the runqueue locks are 2968 * The test for TASK_DEAD must occur while the runqueue locks are
2969 * still held, otherwise prev could be scheduled on another cpu, die 2969 * still held, otherwise prev could be scheduled on another cpu, die
2970 * there before we look at prev->state, and then the reference would 2970 * there before we look at prev->state, and then the reference would
2971 * be dropped twice. 2971 * be dropped twice.
2972 * Manfred Spraul <manfred@colorfullife.com> 2972 * Manfred Spraul <manfred@colorfullife.com>
2973 */ 2973 */
2974 prev_state = prev->state; 2974 prev_state = prev->state;
2975 finish_arch_switch(prev); 2975 finish_arch_switch(prev);
2976 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2976 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2977 local_irq_disable(); 2977 local_irq_disable();
2978 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2978 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2979 perf_event_task_sched_in(current); 2979 perf_event_task_sched_in(current);
2980 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2980 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2981 local_irq_enable(); 2981 local_irq_enable();
2982 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2982 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2983 finish_lock_switch(rq, prev); 2983 finish_lock_switch(rq, prev);
2984 2984
2985 fire_sched_in_preempt_notifiers(current); 2985 fire_sched_in_preempt_notifiers(current);
2986 if (mm) 2986 if (mm)
2987 mmdrop(mm); 2987 mmdrop(mm);
2988 if (unlikely(prev_state == TASK_DEAD)) { 2988 if (unlikely(prev_state == TASK_DEAD)) {
2989 /* 2989 /*
2990 * Remove function-return probe instances associated with this 2990 * Remove function-return probe instances associated with this
2991 * task and put them back on the free list. 2991 * task and put them back on the free list.
2992 */ 2992 */
2993 kprobe_flush_task(prev); 2993 kprobe_flush_task(prev);
2994 put_task_struct(prev); 2994 put_task_struct(prev);
2995 } 2995 }
2996 } 2996 }
2997 2997
2998 #ifdef CONFIG_SMP 2998 #ifdef CONFIG_SMP
2999 2999
3000 /* assumes rq->lock is held */ 3000 /* assumes rq->lock is held */
3001 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 3001 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
3002 { 3002 {
3003 if (prev->sched_class->pre_schedule) 3003 if (prev->sched_class->pre_schedule)
3004 prev->sched_class->pre_schedule(rq, prev); 3004 prev->sched_class->pre_schedule(rq, prev);
3005 } 3005 }
3006 3006
3007 /* rq->lock is NOT held, but preemption is disabled */ 3007 /* rq->lock is NOT held, but preemption is disabled */
3008 static inline void post_schedule(struct rq *rq) 3008 static inline void post_schedule(struct rq *rq)
3009 { 3009 {
3010 if (rq->post_schedule) { 3010 if (rq->post_schedule) {
3011 unsigned long flags; 3011 unsigned long flags;
3012 3012
3013 raw_spin_lock_irqsave(&rq->lock, flags); 3013 raw_spin_lock_irqsave(&rq->lock, flags);
3014 if (rq->curr->sched_class->post_schedule) 3014 if (rq->curr->sched_class->post_schedule)
3015 rq->curr->sched_class->post_schedule(rq); 3015 rq->curr->sched_class->post_schedule(rq);
3016 raw_spin_unlock_irqrestore(&rq->lock, flags); 3016 raw_spin_unlock_irqrestore(&rq->lock, flags);
3017 3017
3018 rq->post_schedule = 0; 3018 rq->post_schedule = 0;
3019 } 3019 }
3020 } 3020 }
3021 3021
3022 #else 3022 #else
3023 3023
3024 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 3024 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
3025 { 3025 {
3026 } 3026 }
3027 3027
3028 static inline void post_schedule(struct rq *rq) 3028 static inline void post_schedule(struct rq *rq)
3029 { 3029 {
3030 } 3030 }
3031 3031
3032 #endif 3032 #endif
3033 3033
3034 /** 3034 /**
3035 * schedule_tail - first thing a freshly forked thread must call. 3035 * schedule_tail - first thing a freshly forked thread must call.
3036 * @prev: the thread we just switched away from. 3036 * @prev: the thread we just switched away from.
3037 */ 3037 */
3038 asmlinkage void schedule_tail(struct task_struct *prev) 3038 asmlinkage void schedule_tail(struct task_struct *prev)
3039 __releases(rq->lock) 3039 __releases(rq->lock)
3040 { 3040 {
3041 struct rq *rq = this_rq(); 3041 struct rq *rq = this_rq();
3042 3042
3043 finish_task_switch(rq, prev); 3043 finish_task_switch(rq, prev);
3044 3044
3045 /* 3045 /*
3046 * FIXME: do we need to worry about rq being invalidated by the 3046 * FIXME: do we need to worry about rq being invalidated by the
3047 * task_switch? 3047 * task_switch?
3048 */ 3048 */
3049 post_schedule(rq); 3049 post_schedule(rq);
3050 3050
3051 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 3051 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
3052 /* In this case, finish_task_switch does not reenable preemption */ 3052 /* In this case, finish_task_switch does not reenable preemption */
3053 preempt_enable(); 3053 preempt_enable();
3054 #endif 3054 #endif
3055 if (current->set_child_tid) 3055 if (current->set_child_tid)
3056 put_user(task_pid_vnr(current), current->set_child_tid); 3056 put_user(task_pid_vnr(current), current->set_child_tid);
3057 } 3057 }
3058 3058
3059 /* 3059 /*
3060 * context_switch - switch to the new MM and the new 3060 * context_switch - switch to the new MM and the new
3061 * thread's register state. 3061 * thread's register state.
3062 */ 3062 */
3063 static inline void 3063 static inline void
3064 context_switch(struct rq *rq, struct task_struct *prev, 3064 context_switch(struct rq *rq, struct task_struct *prev,
3065 struct task_struct *next) 3065 struct task_struct *next)
3066 { 3066 {
3067 struct mm_struct *mm, *oldmm; 3067 struct mm_struct *mm, *oldmm;
3068 3068
3069 prepare_task_switch(rq, prev, next); 3069 prepare_task_switch(rq, prev, next);
3070 3070
3071 mm = next->mm; 3071 mm = next->mm;
3072 oldmm = prev->active_mm; 3072 oldmm = prev->active_mm;
3073 /* 3073 /*
3074 * For paravirt, this is coupled with an exit in switch_to to 3074 * For paravirt, this is coupled with an exit in switch_to to
3075 * combine the page table reload and the switch backend into 3075 * combine the page table reload and the switch backend into
3076 * one hypercall. 3076 * one hypercall.
3077 */ 3077 */
3078 arch_start_context_switch(prev); 3078 arch_start_context_switch(prev);
3079 3079
3080 if (!mm) { 3080 if (!mm) {
3081 next->active_mm = oldmm; 3081 next->active_mm = oldmm;
3082 atomic_inc(&oldmm->mm_count); 3082 atomic_inc(&oldmm->mm_count);
3083 enter_lazy_tlb(oldmm, next); 3083 enter_lazy_tlb(oldmm, next);
3084 } else 3084 } else
3085 switch_mm(oldmm, mm, next); 3085 switch_mm(oldmm, mm, next);
3086 3086
3087 if (!prev->mm) { 3087 if (!prev->mm) {
3088 prev->active_mm = NULL; 3088 prev->active_mm = NULL;
3089 rq->prev_mm = oldmm; 3089 rq->prev_mm = oldmm;
3090 } 3090 }
3091 /* 3091 /*
3092 * Since the runqueue lock will be released by the next 3092 * Since the runqueue lock will be released by the next
3093 * task (which is an invalid locking op but in the case 3093 * task (which is an invalid locking op but in the case
3094 * of the scheduler it's an obvious special case), we 3094 * of the scheduler it's an obvious special case), we
3095 * do an early lockdep release here: 3095 * do an early lockdep release here:
3096 */ 3096 */
3097 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 3097 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
3098 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 3098 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3099 #endif 3099 #endif
3100 3100
3101 /* Here we just switch the register state and the stack. */ 3101 /* Here we just switch the register state and the stack. */
3102 switch_to(prev, next, prev); 3102 switch_to(prev, next, prev);
3103 3103
3104 barrier(); 3104 barrier();
3105 /* 3105 /*
3106 * this_rq must be evaluated again because prev may have moved 3106 * this_rq must be evaluated again because prev may have moved
3107 * CPUs since it called schedule(), thus the 'rq' on its stack 3107 * CPUs since it called schedule(), thus the 'rq' on its stack
3108 * frame will be invalid. 3108 * frame will be invalid.
3109 */ 3109 */
3110 finish_task_switch(this_rq(), prev); 3110 finish_task_switch(this_rq(), prev);
3111 } 3111 }
3112 3112
3113 /* 3113 /*
3114 * nr_running, nr_uninterruptible and nr_context_switches: 3114 * nr_running, nr_uninterruptible and nr_context_switches:
3115 * 3115 *
3116 * externally visible scheduler statistics: current number of runnable 3116 * externally visible scheduler statistics: current number of runnable
3117 * threads, current number of uninterruptible-sleeping threads, total 3117 * threads, current number of uninterruptible-sleeping threads, total
3118 * number of context switches performed since bootup. 3118 * number of context switches performed since bootup.
3119 */ 3119 */
3120 unsigned long nr_running(void) 3120 unsigned long nr_running(void)
3121 { 3121 {
3122 unsigned long i, sum = 0; 3122 unsigned long i, sum = 0;
3123 3123
3124 for_each_online_cpu(i) 3124 for_each_online_cpu(i)
3125 sum += cpu_rq(i)->nr_running; 3125 sum += cpu_rq(i)->nr_running;
3126 3126
3127 return sum; 3127 return sum;
3128 } 3128 }
3129 3129
3130 unsigned long nr_uninterruptible(void) 3130 unsigned long nr_uninterruptible(void)
3131 { 3131 {
3132 unsigned long i, sum = 0; 3132 unsigned long i, sum = 0;
3133 3133
3134 for_each_possible_cpu(i) 3134 for_each_possible_cpu(i)
3135 sum += cpu_rq(i)->nr_uninterruptible; 3135 sum += cpu_rq(i)->nr_uninterruptible;
3136 3136
3137 /* 3137 /*
3138 * Since we read the counters lockless, it might be slightly 3138 * Since we read the counters lockless, it might be slightly
3139 * inaccurate. Do not allow it to go below zero though: 3139 * inaccurate. Do not allow it to go below zero though:
3140 */ 3140 */
3141 if (unlikely((long)sum < 0)) 3141 if (unlikely((long)sum < 0))
3142 sum = 0; 3142 sum = 0;
3143 3143
3144 return sum; 3144 return sum;
3145 } 3145 }
3146 3146
3147 unsigned long long nr_context_switches(void) 3147 unsigned long long nr_context_switches(void)
3148 { 3148 {
3149 int i; 3149 int i;
3150 unsigned long long sum = 0; 3150 unsigned long long sum = 0;
3151 3151
3152 for_each_possible_cpu(i) 3152 for_each_possible_cpu(i)
3153 sum += cpu_rq(i)->nr_switches; 3153 sum += cpu_rq(i)->nr_switches;
3154 3154
3155 return sum; 3155 return sum;
3156 } 3156 }
3157 3157
3158 unsigned long nr_iowait(void) 3158 unsigned long nr_iowait(void)
3159 { 3159 {
3160 unsigned long i, sum = 0; 3160 unsigned long i, sum = 0;
3161 3161
3162 for_each_possible_cpu(i) 3162 for_each_possible_cpu(i)
3163 sum += atomic_read(&cpu_rq(i)->nr_iowait); 3163 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3164 3164
3165 return sum; 3165 return sum;
3166 } 3166 }
3167 3167
3168 unsigned long nr_iowait_cpu(int cpu) 3168 unsigned long nr_iowait_cpu(int cpu)
3169 { 3169 {
3170 struct rq *this = cpu_rq(cpu); 3170 struct rq *this = cpu_rq(cpu);
3171 return atomic_read(&this->nr_iowait); 3171 return atomic_read(&this->nr_iowait);
3172 } 3172 }
3173 3173
3174 unsigned long this_cpu_load(void) 3174 unsigned long this_cpu_load(void)
3175 { 3175 {
3176 struct rq *this = this_rq(); 3176 struct rq *this = this_rq();
3177 return this->cpu_load[0]; 3177 return this->cpu_load[0];
3178 } 3178 }
3179 3179
3180 3180
3181 /* Variables and functions for calc_load */ 3181 /* Variables and functions for calc_load */
3182 static atomic_long_t calc_load_tasks; 3182 static atomic_long_t calc_load_tasks;
3183 static unsigned long calc_load_update; 3183 static unsigned long calc_load_update;
3184 unsigned long avenrun[3]; 3184 unsigned long avenrun[3];
3185 EXPORT_SYMBOL(avenrun); 3185 EXPORT_SYMBOL(avenrun);
3186 3186
3187 static long calc_load_fold_active(struct rq *this_rq) 3187 static long calc_load_fold_active(struct rq *this_rq)
3188 { 3188 {
3189 long nr_active, delta = 0; 3189 long nr_active, delta = 0;
3190 3190
3191 nr_active = this_rq->nr_running; 3191 nr_active = this_rq->nr_running;
3192 nr_active += (long) this_rq->nr_uninterruptible; 3192 nr_active += (long) this_rq->nr_uninterruptible;
3193 3193
3194 if (nr_active != this_rq->calc_load_active) { 3194 if (nr_active != this_rq->calc_load_active) {
3195 delta = nr_active - this_rq->calc_load_active; 3195 delta = nr_active - this_rq->calc_load_active;
3196 this_rq->calc_load_active = nr_active; 3196 this_rq->calc_load_active = nr_active;
3197 } 3197 }
3198 3198
3199 return delta; 3199 return delta;
3200 } 3200 }
3201 3201
3202 static unsigned long 3202 static unsigned long
3203 calc_load(unsigned long load, unsigned long exp, unsigned long active) 3203 calc_load(unsigned long load, unsigned long exp, unsigned long active)
3204 { 3204 {
3205 load *= exp; 3205 load *= exp;
3206 load += active * (FIXED_1 - exp); 3206 load += active * (FIXED_1 - exp);
3207 load += 1UL << (FSHIFT - 1); 3207 load += 1UL << (FSHIFT - 1);
3208 return load >> FSHIFT; 3208 return load >> FSHIFT;
3209 } 3209 }
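
calc_load() is the fixed-point exponential decay behind the 1/5/15-minute load averages: the old value is weighted by exp, the current active count (pre-scaled by FIXED_1 in the caller, see calc_global_nohz() below) by FIXED_1 - exp, and the 1UL << (FSHIFT - 1) term rounds to nearest. A standalone sketch with the long-standing constants FSHIFT = 11 and EXP_1 = 1884 shows the 1-minute average creeping toward a steady load of 3 runnable tasks:

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1 << FSHIFT)           /* 2048 */
    #define EXP_1   1884                    /* 1/exp(5sec/1min) in fixed point */

    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            load += 1UL << (FSHIFT - 1);    /* round to nearest */
            return load >> FSHIFT;
    }

    int main(void)
    {
            unsigned long avenrun0 = 0;
            unsigned long active = 3 * FIXED_1;     /* 3 runnable tasks, pre-scaled */

            /* Four consecutive LOAD_FREQ (~5 second) periods. */
            for (int i = 1; i <= 4; i++) {
                    avenrun0 = calc_load(avenrun0, EXP_1, active);
                    printf("period %d: avenrun[0] = %4lu  -> %lu.%02lu\n", i,
                           avenrun0, avenrun0 >> FSHIFT,
                           ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
            }
            /* Prints roughly 0.24, 0.46, 0.66, 0.85 on the way toward 3.00. */
            return 0;
    }
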
3210 3210
3211 #ifdef CONFIG_NO_HZ 3211 #ifdef CONFIG_NO_HZ
3212 /* 3212 /*
3213 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3213 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
3214 * 3214 *
3215 * When making the ILB scale, we should try to pull this in as well. 3215 * When making the ILB scale, we should try to pull this in as well.
3216 */ 3216 */
3217 static atomic_long_t calc_load_tasks_idle; 3217 static atomic_long_t calc_load_tasks_idle;
3218 3218
3219 static void calc_load_account_idle(struct rq *this_rq) 3219 static void calc_load_account_idle(struct rq *this_rq)
3220 { 3220 {
3221 long delta; 3221 long delta;
3222 3222
3223 delta = calc_load_fold_active(this_rq); 3223 delta = calc_load_fold_active(this_rq);
3224 if (delta) 3224 if (delta)
3225 atomic_long_add(delta, &calc_load_tasks_idle); 3225 atomic_long_add(delta, &calc_load_tasks_idle);
3226 } 3226 }
3227 3227
3228 static long calc_load_fold_idle(void) 3228 static long calc_load_fold_idle(void)
3229 { 3229 {
3230 long delta = 0; 3230 long delta = 0;
3231 3231
3232 /* 3232 /*
3233 * It's got a race, we don't care... 3233 * It's got a race, we don't care...
3234 */ 3234 */
3235 if (atomic_long_read(&calc_load_tasks_idle)) 3235 if (atomic_long_read(&calc_load_tasks_idle))
3236 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 3236 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3237 3237
3238 return delta; 3238 return delta;
3239 } 3239 }
3240 3240
3241 /** 3241 /**
3242 * fixed_power_int - compute: x^n, in O(log n) time 3242 * fixed_power_int - compute: x^n, in O(log n) time
3243 * 3243 *
3244 * @x: base of the power 3244 * @x: base of the power
3245 * @frac_bits: fractional bits of @x 3245 * @frac_bits: fractional bits of @x
3246 * @n: power to raise @x to. 3246 * @n: power to raise @x to.
3247 * 3247 *
3248 * By exploiting the relation between the definition of the natural power 3248 * By exploiting the relation between the definition of the natural power
3249 * function: x^n := x*x*...*x (x multiplied by itself n times), and 3249 * function: x^n := x*x*...*x (x multiplied by itself n times), and
3250 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 3250 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3251 * (where: n_i \elem {0, 1}, the binary vector representing n), 3251 * (where: n_i \elem {0, 1}, the binary vector representing n),
3252 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 3252 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3253 * of course trivially computable in O(log_2 n), the length of our binary 3253 * of course trivially computable in O(log_2 n), the length of our binary
3254 * vector. 3254 * vector.
3255 */ 3255 */
3256 static unsigned long 3256 static unsigned long
3257 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) 3257 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3258 { 3258 {
3259 unsigned long result = 1UL << frac_bits; 3259 unsigned long result = 1UL << frac_bits;
3260 3260
3261 if (n) for (;;) { 3261 if (n) for (;;) {
3262 if (n & 1) { 3262 if (n & 1) {
3263 result *= x; 3263 result *= x;
3264 result += 1UL << (frac_bits - 1); 3264 result += 1UL << (frac_bits - 1);
3265 result >>= frac_bits; 3265 result >>= frac_bits;
3266 } 3266 }
3267 n >>= 1; 3267 n >>= 1;
3268 if (!n) 3268 if (!n)
3269 break; 3269 break;
3270 x *= x; 3270 x *= x;
3271 x += 1UL << (frac_bits - 1); 3271 x += 1UL << (frac_bits - 1);
3272 x >>= frac_bits; 3272 x >>= frac_bits;
3273 } 3273 }
3274 3274
3275 return result; 3275 return result;
3276 } 3276 }
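
fixed_power_int() is ordinary binary exponentiation (square-and-multiply) carried out in frac_bits fixed point, with a round-and-renormalise step after every multiply. With the fixed-point bookkeeping stripped away, the control flow is the plain integer version below; calc_load_n() calls it with exp = EXP_1/EXP_5/EXP_15 and n = the number of missed LOAD_FREQ periods:

    /* Plain integer x^n by square-and-multiply, O(log2 n) multiplications. */
    static unsigned long ipow(unsigned long x, unsigned int n)
    {
            unsigned long result = 1;

            while (n) {
                    if (n & 1)              /* this set bit contributes x^(2^i) */
                            result *= x;
                    n >>= 1;
                    x *= x;                 /* x, x^2, x^4, x^8, ... */
            }
            return result;
    }

In the fixed-point variant every multiply is followed by += 1UL << (frac_bits - 1) and >>= frac_bits, so intermediate values stay in fixed-point scale and each step is rounded rather than truncated.
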
3277 3277
3278 /* 3278 /*
3279 * a1 = a0 * e + a * (1 - e) 3279 * a1 = a0 * e + a * (1 - e)
3280 * 3280 *
3281 * a2 = a1 * e + a * (1 - e) 3281 * a2 = a1 * e + a * (1 - e)
3282 * = (a0 * e + a * (1 - e)) * e + a * (1 - e) 3282 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3283 * = a0 * e^2 + a * (1 - e) * (1 + e) 3283 * = a0 * e^2 + a * (1 - e) * (1 + e)
3284 * 3284 *
3285 * a3 = a2 * e + a * (1 - e) 3285 * a3 = a2 * e + a * (1 - e)
3286 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) 3286 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3287 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) 3287 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3288 * 3288 *
3289 * ... 3289 * ...
3290 * 3290 *
3291 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] 3291 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3292 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) 3292 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3293 * = a0 * e^n + a * (1 - e^n) 3293 * = a0 * e^n + a * (1 - e^n)
3294 * 3294 *
3295 * [1] application of the geometric series: 3295 * [1] application of the geometric series:
3296 * 3296 *
3297 * n 1 - x^(n+1) 3297 * n 1 - x^(n+1)
3298 * S_n := \Sum x^i = ------------- 3298 * S_n := \Sum x^i = -------------
3299 * i=0 1 - x 3299 * i=0 1 - x
3300 */ 3300 */
3301 static unsigned long 3301 static unsigned long
3302 calc_load_n(unsigned long load, unsigned long exp, 3302 calc_load_n(unsigned long load, unsigned long exp,
3303 unsigned long active, unsigned int n) 3303 unsigned long active, unsigned int n)
3304 { 3304 {
3305 3305
3306 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 3306 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3307 } 3307 }
3308 3308
3309 /* 3309 /*
3310 * NO_HZ can leave us missing all per-cpu ticks calling 3310 * NO_HZ can leave us missing all per-cpu ticks calling
3311 * calc_load_account_active(), but since an idle CPU folds its delta into 3311 * calc_load_account_active(), but since an idle CPU folds its delta into
3312 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold 3312 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3313 * in the pending idle delta if our idle period crossed a load cycle boundary. 3313 * in the pending idle delta if our idle period crossed a load cycle boundary.
3314 * 3314 *
3315 * Once we've updated the global active value, we need to apply the exponential 3315 * Once we've updated the global active value, we need to apply the exponential
3316 * weights adjusted to the number of cycles missed. 3316 * weights adjusted to the number of cycles missed.
3317 */ 3317 */
3318 static void calc_global_nohz(unsigned long ticks) 3318 static void calc_global_nohz(unsigned long ticks)
3319 { 3319 {
3320 long delta, active, n; 3320 long delta, active, n;
3321 3321
3322 if (time_before(jiffies, calc_load_update)) 3322 if (time_before(jiffies, calc_load_update))
3323 return; 3323 return;
3324 3324
3325 /* 3325 /*
3326 * If we crossed a calc_load_update boundary, make sure to fold 3326 * If we crossed a calc_load_update boundary, make sure to fold
3327 * any pending idle changes, the respective CPUs might have 3327 * any pending idle changes, the respective CPUs might have
3328 * missed the tick driven calc_load_account_active() update 3328 * missed the tick driven calc_load_account_active() update
3329 * due to NO_HZ. 3329 * due to NO_HZ.
3330 */ 3330 */
3331 delta = calc_load_fold_idle(); 3331 delta = calc_load_fold_idle();
3332 if (delta) 3332 if (delta)
3333 atomic_long_add(delta, &calc_load_tasks); 3333 atomic_long_add(delta, &calc_load_tasks);
3334 3334
3335 /* 3335 /*
3336 * If we were idle for multiple load cycles, apply them. 3336 * If we were idle for multiple load cycles, apply them.
3337 */ 3337 */
3338 if (ticks >= LOAD_FREQ) { 3338 if (ticks >= LOAD_FREQ) {
3339 n = ticks / LOAD_FREQ; 3339 n = ticks / LOAD_FREQ;
3340 3340
3341 active = atomic_long_read(&calc_load_tasks); 3341 active = atomic_long_read(&calc_load_tasks);
3342 active = active > 0 ? active * FIXED_1 : 0; 3342 active = active > 0 ? active * FIXED_1 : 0;
3343 3343
3344 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 3344 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3345 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 3345 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3346 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 3346 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3347 3347
3348 calc_load_update += n * LOAD_FREQ; 3348 calc_load_update += n * LOAD_FREQ;
3349 } 3349 }
3350 3350
3351 /* 3351 /*
3352 * It's possible that the remainder of the above division also crosses 3352 * It's possible that the remainder of the above division also crosses
3353 * a LOAD_FREQ period; the regular check in calc_global_load(), 3353 * a LOAD_FREQ period; the regular check in calc_global_load(),
3354 * which comes after this, will take care of that. 3354 * which comes after this, will take care of that.
3355 * 3355 *
3356 * Consider us being 11 ticks before a cycle completion, and us 3356 * Consider us being 11 ticks before a cycle completion, and us
3357 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will 3357 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3358 * age us 4 cycles, and the test in calc_global_load() will 3358 * age us 4 cycles, and the test in calc_global_load() will
3359 * pick up the final one. 3359 * pick up the final one.
3360 */ 3360 */
3361 } 3361 }
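A quick standalone check of the numbers used in the comment above (not kernel code; it assumes HZ = 100 and the kernel's LOAD_FREQ of 5*HZ + 1 = 501 ticks):

#include <stdio.h>

int main(void)
{
	unsigned long hz = 100;
	unsigned long load_freq = 5 * hz + 1;	/* mirrors LOAD_FREQ */
	unsigned long ticks = 4 * load_freq + 22;

	/* aged here by the loop in calc_global_nohz() */
	printf("cycles aged here: %lu\n", ticks / load_freq);	/* 4 */
	/* the 22-tick remainder still crosses the boundary that was only
	 * 11 ticks away, so calc_global_load() picks up one more cycle */
	printf("remainder: %lu ticks\n", ticks % load_freq);	/* 22 */
	return 0;
}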
3362 #else 3362 #else
3363 static void calc_load_account_idle(struct rq *this_rq) 3363 static void calc_load_account_idle(struct rq *this_rq)
3364 { 3364 {
3365 } 3365 }
3366 3366
3367 static inline long calc_load_fold_idle(void) 3367 static inline long calc_load_fold_idle(void)
3368 { 3368 {
3369 return 0; 3369 return 0;
3370 } 3370 }
3371 3371
3372 static void calc_global_nohz(unsigned long ticks) 3372 static void calc_global_nohz(unsigned long ticks)
3373 { 3373 {
3374 } 3374 }
3375 #endif 3375 #endif
3376 3376
3377 /** 3377 /**
3378 * get_avenrun - get the load average array 3378 * get_avenrun - get the load average array
3379 * @loads: pointer to dest load array 3379 * @loads: pointer to dest load array
3380 * @offset: offset to add 3380 * @offset: offset to add
3381 * @shift: shift count to shift the result left 3381 * @shift: shift count to shift the result left
3382 * 3382 *
3383 * These values are estimates at best, so no need for locking. 3383 * These values are estimates at best, so no need for locking.
3384 */ 3384 */
3385 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 3385 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3386 { 3386 {
3387 loads[0] = (avenrun[0] + offset) << shift; 3387 loads[0] = (avenrun[0] + offset) << shift;
3388 loads[1] = (avenrun[1] + offset) << shift; 3388 loads[1] = (avenrun[1] + offset) << shift;
3389 loads[2] = (avenrun[2] + offset) << shift; 3389 loads[2] = (avenrun[2] + offset) << shift;
3390 } 3390 }
3391 3391
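For context, a user-space sketch of how a reader such as /proc/loadavg turns these fixed-point values into the familiar "X.YY" numbers. The sample avenrun values, the FIXED_1/200 rounding offset, and the LOAD_INT/LOAD_FRAC decode macros are written out locally here rather than pulled from kernel headers.

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)

/* integer and two-digit fractional part of an FSHIFT fixed-point load */
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	/* pretend get_avenrun() filled this in: 0.05, 1.50, 2.25 */
	unsigned long avnrun[3] = { 102, 3072, 4608 };
	int i;

	for (i = 0; i < 3; i++) {
		/* the FIXED_1/200 offset rounds to the nearest 1/100 */
		unsigned long v = avnrun[i] + FIXED_1 / 200;

		printf("%lu.%02lu ", LOAD_INT(v), LOAD_FRAC(v));
	}
	printf("\n");
	return 0;
}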
3392 /* 3392 /*
3393 * calc_global_load - update the avenrun load estimates 10 ticks after the 3393 * calc_global_load - update the avenrun load estimates 10 ticks after the
3394 * CPUs have updated calc_load_tasks. 3394 * CPUs have updated calc_load_tasks.
3395 */ 3395 */
3396 void calc_global_load(unsigned long ticks) 3396 void calc_global_load(unsigned long ticks)
3397 { 3397 {
3398 long active; 3398 long active;
3399 3399
3400 calc_global_nohz(ticks); 3400 calc_global_nohz(ticks);
3401 3401
3402 if (time_before(jiffies, calc_load_update + 10)) 3402 if (time_before(jiffies, calc_load_update + 10))
3403 return; 3403 return;
3404 3404
3405 active = atomic_long_read(&calc_load_tasks); 3405 active = atomic_long_read(&calc_load_tasks);
3406 active = active > 0 ? active * FIXED_1 : 0; 3406 active = active > 0 ? active * FIXED_1 : 0;
3407 3407
3408 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 3408 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3409 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 3409 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3410 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 3410 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3411 3411
3412 calc_load_update += LOAD_FREQ; 3412 calc_load_update += LOAD_FREQ;
3413 } 3413 }
3414 3414
3415 /* 3415 /*
3416 * Called from update_cpu_load() to periodically update this CPU's 3416 * Called from update_cpu_load() to periodically update this CPU's
3417 * active count. 3417 * active count.
3418 */ 3418 */
3419 static void calc_load_account_active(struct rq *this_rq) 3419 static void calc_load_account_active(struct rq *this_rq)
3420 { 3420 {
3421 long delta; 3421 long delta;
3422 3422
3423 if (time_before(jiffies, this_rq->calc_load_update)) 3423 if (time_before(jiffies, this_rq->calc_load_update))
3424 return; 3424 return;
3425 3425
3426 delta = calc_load_fold_active(this_rq); 3426 delta = calc_load_fold_active(this_rq);
3427 delta += calc_load_fold_idle(); 3427 delta += calc_load_fold_idle();
3428 if (delta) 3428 if (delta)
3429 atomic_long_add(delta, &calc_load_tasks); 3429 atomic_long_add(delta, &calc_load_tasks);
3430 3430
3431 this_rq->calc_load_update += LOAD_FREQ; 3431 this_rq->calc_load_update += LOAD_FREQ;
3432 } 3432 }
3433 3433
3434 /* 3434 /*
3435 * The exact cpuload at various idx values, calculated at every tick would be 3435 * The exact cpuload at various idx values, calculated at every tick would be
3436 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 3436 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3437 * 3437 *
3438 * If a cpu misses updates for n-1 ticks (because it was idle) and the update gets called 3438 * If a cpu misses updates for n-1 ticks (because it was idle) and the update gets called
3439 * on the nth tick, when the cpu may be busy, then we have: 3439 * on the nth tick, when the cpu may be busy, then we have:
3440 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 3440 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3441 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load 3441 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3442 * 3442 *
3443 * decay_load_missed() below does efficient calculation of 3443 * decay_load_missed() below does efficient calculation of
3444 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 3444 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3445 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load 3445 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3446 * 3446 *
3447 * The calculation is approximated on a 128 point scale. 3447 * The calculation is approximated on a 128 point scale.
3448 * degrade_zero_ticks is the number of ticks after which load at any 3448 * degrade_zero_ticks is the number of ticks after which load at any
3449 * particular idx is approximated to be zero. 3449 * particular idx is approximated to be zero.
3450 * degrade_factor is a precomputed table, a row for each load idx. 3450 * degrade_factor is a precomputed table, a row for each load idx.
3451 * Each column corresponds to the degradation factor for a power-of-two number 3451 * Each column corresponds to the degradation factor for a power-of-two number
3452 * of ticks, based on the 128 point scale. 3452 * of ticks, based on the 128 point scale.
3453 * Example: 3453 * Example:
3454 * row 2, col 3 (=12) says that the degradation at load idx 2 after 3454 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3455 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). 3455 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3456 * 3456 *
3457 * With these power-of-2 load factors, we can degrade the load n times 3457 * With these power-of-2 load factors, we can degrade the load n times
3458 * by looking at the 1 bits in n and doing that many mult/shifts instead of 3458 * by looking at the 1 bits in n and doing that many mult/shifts instead of
3459 * the n mult/shifts needed by the exact degradation. 3459 * the n mult/shifts needed by the exact degradation.
3460 */ 3460 */
3461 #define DEGRADE_SHIFT 7 3461 #define DEGRADE_SHIFT 7
3462 static const unsigned char 3462 static const unsigned char
3463 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 3463 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3464 static const unsigned char 3464 static const unsigned char
3465 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 3465 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3466 {0, 0, 0, 0, 0, 0, 0, 0}, 3466 {0, 0, 0, 0, 0, 0, 0, 0},
3467 {64, 32, 8, 0, 0, 0, 0, 0}, 3467 {64, 32, 8, 0, 0, 0, 0, 0},
3468 {96, 72, 40, 12, 1, 0, 0}, 3468 {96, 72, 40, 12, 1, 0, 0},
3469 {112, 98, 75, 43, 15, 1, 0}, 3469 {112, 98, 75, 43, 15, 1, 0},
3470 {120, 112, 98, 76, 45, 16, 2} }; 3470 {120, 112, 98, 76, 45, 16, 2} };
3471 3471
3472 /* 3472 /*
3473 * Update cpu_load for any missed ticks due to tickless idle. The backlog 3473 * Update cpu_load for any missed ticks due to tickless idle. The backlog
3474 * builds up only while the CPU is idle, so we just decay the old load without 3474 * builds up only while the CPU is idle, so we just decay the old load without
3475 * adding any new load. 3475 * adding any new load.
3476 */ 3476 */
3477 static unsigned long 3477 static unsigned long
3478 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 3478 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3479 { 3479 {
3480 int j = 0; 3480 int j = 0;
3481 3481
3482 if (!missed_updates) 3482 if (!missed_updates)
3483 return load; 3483 return load;
3484 3484
3485 if (missed_updates >= degrade_zero_ticks[idx]) 3485 if (missed_updates >= degrade_zero_ticks[idx])
3486 return 0; 3486 return 0;
3487 3487
3488 if (idx == 1) 3488 if (idx == 1)
3489 return load >> missed_updates; 3489 return load >> missed_updates;
3490 3490
3491 while (missed_updates) { 3491 while (missed_updates) {
3492 if (missed_updates % 2) 3492 if (missed_updates % 2)
3493 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 3493 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3494 3494
3495 missed_updates >>= 1; 3495 missed_updates >>= 1;
3496 j++; 3496 j++;
3497 } 3497 }
3498 return load; 3498 return load;
3499 } 3499 }
3500 3500
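As a sanity check on the table and helper above, a small standalone program (not kernel code, link with -lm) compares one table entry against the exact factor from the formula; it mirrors the "row 2, col 3" case called out in the comment.

#include <stdio.h>
#include <math.h>

int main(void)
{
	int idx = 2, col = 3;
	int ticks = 1 << col;				/* 8 missed ticks */
	double exact = pow((double)((1 << idx) - 1) / (1 << idx), ticks);
	double table = 12.0 / 128.0;	/* degrade_factor[2][3] on the 128 point scale */

	printf("exact (3/4)^%d = %.5f, table = %.5f\n", ticks, exact, table);
	return 0;
}

The exact factor is about 0.10011 and the table gives 0.09375, i.e. the 7-bit approximation is within one point on the 128 point scale.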
3501 /* 3501 /*
3502 * Update rq->cpu_load[] statistics. This function is usually called every 3502 * Update rq->cpu_load[] statistics. This function is usually called every
3503 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 3503 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3504 * every tick. We fix it up based on jiffies. 3504 * every tick. We fix it up based on jiffies.
3505 */ 3505 */
3506 static void update_cpu_load(struct rq *this_rq) 3506 static void update_cpu_load(struct rq *this_rq)
3507 { 3507 {
3508 unsigned long this_load = this_rq->load.weight; 3508 unsigned long this_load = this_rq->load.weight;
3509 unsigned long curr_jiffies = jiffies; 3509 unsigned long curr_jiffies = jiffies;
3510 unsigned long pending_updates; 3510 unsigned long pending_updates;
3511 int i, scale; 3511 int i, scale;
3512 3512
3513 this_rq->nr_load_updates++; 3513 this_rq->nr_load_updates++;
3514 3514
3515 /* Avoid repeated calls on same jiffy, when moving in and out of idle */ 3515 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3516 if (curr_jiffies == this_rq->last_load_update_tick) 3516 if (curr_jiffies == this_rq->last_load_update_tick)
3517 return; 3517 return;
3518 3518
3519 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 3519 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3520 this_rq->last_load_update_tick = curr_jiffies; 3520 this_rq->last_load_update_tick = curr_jiffies;
3521 3521
3522 /* Update our load: */ 3522 /* Update our load: */
3523 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 3523 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3524 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3524 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3525 unsigned long old_load, new_load; 3525 unsigned long old_load, new_load;
3526 3526
3527 /* scale is effectively 1 << i now, and >> i divides by scale */ 3527 /* scale is effectively 1 << i now, and >> i divides by scale */
3528 3528
3529 old_load = this_rq->cpu_load[i]; 3529 old_load = this_rq->cpu_load[i];
3530 old_load = decay_load_missed(old_load, pending_updates - 1, i); 3530 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3531 new_load = this_load; 3531 new_load = this_load;
3532 /* 3532 /*
3533 * Round up the averaging division if load is increasing. This 3533 * Round up the averaging division if load is increasing. This
3534 * prevents us from getting stuck on 9 if the load is 10, for 3534 * prevents us from getting stuck on 9 if the load is 10, for
3535 * example. 3535 * example.
3536 */ 3536 */
3537 if (new_load > old_load) 3537 if (new_load > old_load)
3538 new_load += scale - 1; 3538 new_load += scale - 1;
3539 3539
3540 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 3540 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3541 } 3541 }
3542 3542
3543 sched_avg_update(this_rq); 3543 sched_avg_update(this_rq);
3544 } 3544 }
3545 3545
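A minimal standalone demo (not kernel code) of the rounding applied in the loop above: without bumping the incoming load by scale - 1 when it is increasing, an old value of 9 would never reach a steady load of 10, exactly the situation the in-line comment describes.

#include <stdio.h>

int main(void)
{
	unsigned long old_load = 9, new_load = 10;
	int i = 2, scale = 1 << i;	/* cpu_load index 2, scale = 4 */
	unsigned long plain  = (old_load * (scale - 1) + new_load) >> i;
	unsigned long bumped = (old_load * (scale - 1) + new_load + scale - 1) >> i;

	printf("without round-up: %lu, with round-up: %lu\n", plain, bumped);	/* 9 vs 10 */
	return 0;
}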
3546 static void update_cpu_load_active(struct rq *this_rq) 3546 static void update_cpu_load_active(struct rq *this_rq)
3547 { 3547 {
3548 update_cpu_load(this_rq); 3548 update_cpu_load(this_rq);
3549 3549
3550 calc_load_account_active(this_rq); 3550 calc_load_account_active(this_rq);
3551 } 3551 }
3552 3552
3553 #ifdef CONFIG_SMP 3553 #ifdef CONFIG_SMP
3554 3554
3555 /* 3555 /*
3556 * sched_exec - execve() is a valuable balancing opportunity, because at 3556 * sched_exec - execve() is a valuable balancing opportunity, because at
3557 * this point the task has the smallest effective memory and cache footprint. 3557 * this point the task has the smallest effective memory and cache footprint.
3558 */ 3558 */
3559 void sched_exec(void) 3559 void sched_exec(void)
3560 { 3560 {
3561 struct task_struct *p = current; 3561 struct task_struct *p = current;
3562 unsigned long flags; 3562 unsigned long flags;
3563 int dest_cpu; 3563 int dest_cpu;
3564 3564
3565 raw_spin_lock_irqsave(&p->pi_lock, flags); 3565 raw_spin_lock_irqsave(&p->pi_lock, flags);
3566 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 3566 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3567 if (dest_cpu == smp_processor_id()) 3567 if (dest_cpu == smp_processor_id())
3568 goto unlock; 3568 goto unlock;
3569 3569
3570 if (likely(cpu_active(dest_cpu))) { 3570 if (likely(cpu_active(dest_cpu))) {
3571 struct migration_arg arg = { p, dest_cpu }; 3571 struct migration_arg arg = { p, dest_cpu };
3572 3572
3573 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3573 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3574 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 3574 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3575 return; 3575 return;
3576 } 3576 }
3577 unlock: 3577 unlock:
3578 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3578 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3579 } 3579 }
3580 3580
3581 #endif 3581 #endif
3582 3582
3583 DEFINE_PER_CPU(struct kernel_stat, kstat); 3583 DEFINE_PER_CPU(struct kernel_stat, kstat);
3584 3584
3585 EXPORT_PER_CPU_SYMBOL(kstat); 3585 EXPORT_PER_CPU_SYMBOL(kstat);
3586 3586
3587 /* 3587 /*
3588 * Return any ns on the sched_clock that have not yet been accounted in 3588 * Return any ns on the sched_clock that have not yet been accounted in
3589 * @p in case that task is currently running. 3589 * @p in case that task is currently running.
3590 * 3590 *
3591 * Called with task_rq_lock() held on @rq. 3591 * Called with task_rq_lock() held on @rq.
3592 */ 3592 */
3593 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 3593 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3594 { 3594 {
3595 u64 ns = 0; 3595 u64 ns = 0;
3596 3596
3597 if (task_current(rq, p)) { 3597 if (task_current(rq, p)) {
3598 update_rq_clock(rq); 3598 update_rq_clock(rq);
3599 ns = rq->clock_task - p->se.exec_start; 3599 ns = rq->clock_task - p->se.exec_start;
3600 if ((s64)ns < 0) 3600 if ((s64)ns < 0)
3601 ns = 0; 3601 ns = 0;
3602 } 3602 }
3603 3603
3604 return ns; 3604 return ns;
3605 } 3605 }
3606 3606
3607 unsigned long long task_delta_exec(struct task_struct *p) 3607 unsigned long long task_delta_exec(struct task_struct *p)
3608 { 3608 {
3609 unsigned long flags; 3609 unsigned long flags;
3610 struct rq *rq; 3610 struct rq *rq;
3611 u64 ns = 0; 3611 u64 ns = 0;
3612 3612
3613 rq = task_rq_lock(p, &flags); 3613 rq = task_rq_lock(p, &flags);
3614 ns = do_task_delta_exec(p, rq); 3614 ns = do_task_delta_exec(p, rq);
3615 task_rq_unlock(rq, p, &flags); 3615 task_rq_unlock(rq, p, &flags);
3616 3616
3617 return ns; 3617 return ns;
3618 } 3618 }
3619 3619
3620 /* 3620 /*
3621 * Return accounted runtime for the task. 3621 * Return accounted runtime for the task.
3622 * In case the task is currently running, return the runtime plus current's 3622 * In case the task is currently running, return the runtime plus current's
3623 * pending runtime that have not been accounted yet. 3623 * pending runtime that have not been accounted yet.
3624 */ 3624 */
3625 unsigned long long task_sched_runtime(struct task_struct *p) 3625 unsigned long long task_sched_runtime(struct task_struct *p)
3626 { 3626 {
3627 unsigned long flags; 3627 unsigned long flags;
3628 struct rq *rq; 3628 struct rq *rq;
3629 u64 ns = 0; 3629 u64 ns = 0;
3630 3630
3631 rq = task_rq_lock(p, &flags); 3631 rq = task_rq_lock(p, &flags);
3632 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3632 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3633 task_rq_unlock(rq, p, &flags); 3633 task_rq_unlock(rq, p, &flags);
3634 3634
3635 return ns; 3635 return ns;
3636 } 3636 }
3637 3637
3638 /* 3638 /*
3639 * Return sum_exec_runtime for the thread group. 3639 * Return sum_exec_runtime for the thread group.
3640 * In case the task is currently running, return the sum plus current's 3640 * In case the task is currently running, return the sum plus current's
3641 * pending runtime that have not been accounted yet. 3641 * pending runtime that have not been accounted yet.
3642 * 3642 *
3643 * Note that the thread group might have other running tasks as well, 3643 * Note that the thread group might have other running tasks as well,
3644 * so the return value does not include other pending runtime that other 3644 * so the return value does not include other pending runtime that other
3645 * running tasks might have. 3645 * running tasks might have.
3646 */ 3646 */
3647 unsigned long long thread_group_sched_runtime(struct task_struct *p) 3647 unsigned long long thread_group_sched_runtime(struct task_struct *p)
3648 { 3648 {
3649 struct task_cputime totals; 3649 struct task_cputime totals;
3650 unsigned long flags; 3650 unsigned long flags;
3651 struct rq *rq; 3651 struct rq *rq;
3652 u64 ns; 3652 u64 ns;
3653 3653
3654 rq = task_rq_lock(p, &flags); 3654 rq = task_rq_lock(p, &flags);
3655 thread_group_cputime(p, &totals); 3655 thread_group_cputime(p, &totals);
3656 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3656 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3657 task_rq_unlock(rq, p, &flags); 3657 task_rq_unlock(rq, p, &flags);
3658 3658
3659 return ns; 3659 return ns;
3660 } 3660 }
3661 3661
3662 /* 3662 /*
3663 * Account user cpu time to a process. 3663 * Account user cpu time to a process.
3664 * @p: the process that the cpu time gets accounted to 3664 * @p: the process that the cpu time gets accounted to
3665 * @cputime: the cpu time spent in user space since the last update 3665 * @cputime: the cpu time spent in user space since the last update
3666 * @cputime_scaled: cputime scaled by cpu frequency 3666 * @cputime_scaled: cputime scaled by cpu frequency
3667 */ 3667 */
3668 void account_user_time(struct task_struct *p, cputime_t cputime, 3668 void account_user_time(struct task_struct *p, cputime_t cputime,
3669 cputime_t cputime_scaled) 3669 cputime_t cputime_scaled)
3670 { 3670 {
3671 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3671 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3672 cputime64_t tmp; 3672 cputime64_t tmp;
3673 3673
3674 /* Add user time to process. */ 3674 /* Add user time to process. */
3675 p->utime = cputime_add(p->utime, cputime); 3675 p->utime = cputime_add(p->utime, cputime);
3676 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3676 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3677 account_group_user_time(p, cputime); 3677 account_group_user_time(p, cputime);
3678 3678
3679 /* Add user time to cpustat. */ 3679 /* Add user time to cpustat. */
3680 tmp = cputime_to_cputime64(cputime); 3680 tmp = cputime_to_cputime64(cputime);
3681 if (TASK_NICE(p) > 0) 3681 if (TASK_NICE(p) > 0)
3682 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3682 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3683 else 3683 else
3684 cpustat->user = cputime64_add(cpustat->user, tmp); 3684 cpustat->user = cputime64_add(cpustat->user, tmp);
3685 3685
3686 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 3686 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3687 /* Account for user time used */ 3687 /* Account for user time used */
3688 acct_update_integrals(p); 3688 acct_update_integrals(p);
3689 } 3689 }
3690 3690
3691 /* 3691 /*
3692 * Account guest cpu time to a process. 3692 * Account guest cpu time to a process.
3693 * @p: the process that the cpu time gets accounted to 3693 * @p: the process that the cpu time gets accounted to
3694 * @cputime: the cpu time spent in virtual machine since the last update 3694 * @cputime: the cpu time spent in virtual machine since the last update
3695 * @cputime_scaled: cputime scaled by cpu frequency 3695 * @cputime_scaled: cputime scaled by cpu frequency
3696 */ 3696 */
3697 static void account_guest_time(struct task_struct *p, cputime_t cputime, 3697 static void account_guest_time(struct task_struct *p, cputime_t cputime,
3698 cputime_t cputime_scaled) 3698 cputime_t cputime_scaled)
3699 { 3699 {
3700 cputime64_t tmp; 3700 cputime64_t tmp;
3701 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3701 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3702 3702
3703 tmp = cputime_to_cputime64(cputime); 3703 tmp = cputime_to_cputime64(cputime);
3704 3704
3705 /* Add guest time to process. */ 3705 /* Add guest time to process. */
3706 p->utime = cputime_add(p->utime, cputime); 3706 p->utime = cputime_add(p->utime, cputime);
3707 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3707 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3708 account_group_user_time(p, cputime); 3708 account_group_user_time(p, cputime);
3709 p->gtime = cputime_add(p->gtime, cputime); 3709 p->gtime = cputime_add(p->gtime, cputime);
3710 3710
3711 /* Add guest time to cpustat. */ 3711 /* Add guest time to cpustat. */
3712 if (TASK_NICE(p) > 0) { 3712 if (TASK_NICE(p) > 0) {
3713 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3713 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3714 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 3714 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3715 } else { 3715 } else {
3716 cpustat->user = cputime64_add(cpustat->user, tmp); 3716 cpustat->user = cputime64_add(cpustat->user, tmp);
3717 cpustat->guest = cputime64_add(cpustat->guest, tmp); 3717 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3718 } 3718 }
3719 } 3719 }
3720 3720
3721 /* 3721 /*
3722 * Account system cpu time to a process and desired cpustat field 3722 * Account system cpu time to a process and desired cpustat field
3723 * @p: the process that the cpu time gets accounted to 3723 * @p: the process that the cpu time gets accounted to
3724 * @cputime: the cpu time spent in kernel space since the last update 3724 * @cputime: the cpu time spent in kernel space since the last update
3725 * @cputime_scaled: cputime scaled by cpu frequency 3725 * @cputime_scaled: cputime scaled by cpu frequency
3726 * @target_cputime64: pointer to cpustat field that has to be updated 3726 * @target_cputime64: pointer to cpustat field that has to be updated
3727 */ 3727 */
3728 static inline 3728 static inline
3729 void __account_system_time(struct task_struct *p, cputime_t cputime, 3729 void __account_system_time(struct task_struct *p, cputime_t cputime,
3730 cputime_t cputime_scaled, cputime64_t *target_cputime64) 3730 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3731 { 3731 {
3732 cputime64_t tmp = cputime_to_cputime64(cputime); 3732 cputime64_t tmp = cputime_to_cputime64(cputime);
3733 3733
3734 /* Add system time to process. */ 3734 /* Add system time to process. */
3735 p->stime = cputime_add(p->stime, cputime); 3735 p->stime = cputime_add(p->stime, cputime);
3736 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 3736 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3737 account_group_system_time(p, cputime); 3737 account_group_system_time(p, cputime);
3738 3738
3739 /* Add system time to cpustat. */ 3739 /* Add system time to cpustat. */
3740 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 3740 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3741 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3741 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3742 3742
3743 /* Account for system time used */ 3743 /* Account for system time used */
3744 acct_update_integrals(p); 3744 acct_update_integrals(p);
3745 } 3745 }
3746 3746
3747 /* 3747 /*
3748 * Account system cpu time to a process. 3748 * Account system cpu time to a process.
3749 * @p: the process that the cpu time gets accounted to 3749 * @p: the process that the cpu time gets accounted to
3750 * @hardirq_offset: the offset to subtract from hardirq_count() 3750 * @hardirq_offset: the offset to subtract from hardirq_count()
3751 * @cputime: the cpu time spent in kernel space since the last update 3751 * @cputime: the cpu time spent in kernel space since the last update
3752 * @cputime_scaled: cputime scaled by cpu frequency 3752 * @cputime_scaled: cputime scaled by cpu frequency
3753 */ 3753 */
3754 void account_system_time(struct task_struct *p, int hardirq_offset, 3754 void account_system_time(struct task_struct *p, int hardirq_offset,
3755 cputime_t cputime, cputime_t cputime_scaled) 3755 cputime_t cputime, cputime_t cputime_scaled)
3756 { 3756 {
3757 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3757 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3758 cputime64_t *target_cputime64; 3758 cputime64_t *target_cputime64;
3759 3759
3760 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3760 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3761 account_guest_time(p, cputime, cputime_scaled); 3761 account_guest_time(p, cputime, cputime_scaled);
3762 return; 3762 return;
3763 } 3763 }
3764 3764
3765 if (hardirq_count() - hardirq_offset) 3765 if (hardirq_count() - hardirq_offset)
3766 target_cputime64 = &cpustat->irq; 3766 target_cputime64 = &cpustat->irq;
3767 else if (in_serving_softirq()) 3767 else if (in_serving_softirq())
3768 target_cputime64 = &cpustat->softirq; 3768 target_cputime64 = &cpustat->softirq;
3769 else 3769 else
3770 target_cputime64 = &cpustat->system; 3770 target_cputime64 = &cpustat->system;
3771 3771
3772 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 3772 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3773 } 3773 }
3774 3774
3775 /* 3775 /*
3776 * Account for involuntary wait time. 3776 * Account for involuntary wait time.
3777 * @cputime: the cpu time spent in involuntary wait 3777 * @cputime: the cpu time spent in involuntary wait
3778 */ 3778 */
3779 void account_steal_time(cputime_t cputime) 3779 void account_steal_time(cputime_t cputime)
3780 { 3780 {
3781 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3781 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3782 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3782 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3783 3783
3784 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 3784 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3785 } 3785 }
3786 3786
3787 /* 3787 /*
3788 * Account for idle time. 3788 * Account for idle time.
3789 * @cputime: the cpu time spent in idle wait 3789 * @cputime: the cpu time spent in idle wait
3790 */ 3790 */
3791 void account_idle_time(cputime_t cputime) 3791 void account_idle_time(cputime_t cputime)
3792 { 3792 {
3793 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3793 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3794 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3794 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3795 struct rq *rq = this_rq(); 3795 struct rq *rq = this_rq();
3796 3796
3797 if (atomic_read(&rq->nr_iowait) > 0) 3797 if (atomic_read(&rq->nr_iowait) > 0)
3798 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 3798 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3799 else 3799 else
3800 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3800 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3801 } 3801 }
3802 3802
3803 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 3803 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
3804 3804
3805 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 3805 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
3806 /* 3806 /*
3807 * Account a tick to a process and cpustat 3807 * Account a tick to a process and cpustat
3808 * @p: the process that the cpu time gets accounted to 3808 * @p: the process that the cpu time gets accounted to
3809 * @user_tick: is the tick from userspace 3809 * @user_tick: is the tick from userspace
3810 * @rq: the pointer to rq 3810 * @rq: the pointer to rq
3811 * 3811 *
3812 * Tick demultiplexing follows the order 3812 * Tick demultiplexing follows the order
3813 * - pending hardirq update 3813 * - pending hardirq update
3814 * - pending softirq update 3814 * - pending softirq update
3815 * - user_time 3815 * - user_time
3816 * - idle_time 3816 * - idle_time
3817 * - system time 3817 * - system time
3818 * - check for guest_time 3818 * - check for guest_time
3819 * - else account as system_time 3819 * - else account as system_time
3820 * 3820 *
3821 * The check for hardirq is done for both system and user time, as there is 3821 * The check for hardirq is done for both system and user time, as there is
3822 * no timer going off while we are in a hardirq and hence we may never get an 3822 * no timer going off while we are in a hardirq and hence we may never get an
3823 * opportunity to update it solely in system time. 3823 * opportunity to update it solely in system time.
3824 * p->stime and friends are only updated on system time and not on irq or 3824 * p->stime and friends are only updated on system time and not on irq or
3825 * softirq time, as those no longer count in task exec_runtime. 3825 * softirq time, as those no longer count in task exec_runtime.
3826 */ 3826 */
3827 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 3827 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3828 struct rq *rq) 3828 struct rq *rq)
3829 { 3829 {
3830 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3830 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3831 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3831 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3832 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3832 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3833 3833
3834 if (irqtime_account_hi_update()) { 3834 if (irqtime_account_hi_update()) {
3835 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3835 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3836 } else if (irqtime_account_si_update()) { 3836 } else if (irqtime_account_si_update()) {
3837 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3837 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3838 } else if (this_cpu_ksoftirqd() == p) { 3838 } else if (this_cpu_ksoftirqd() == p) {
3839 /* 3839 /*
3840 * ksoftirqd time does not get accounted in cpu_softirq_time. 3840 * ksoftirqd time does not get accounted in cpu_softirq_time.
3841 * So, we have to handle it separately here. 3841 * So, we have to handle it separately here.
3842 * Also, p->stime needs to be updated for ksoftirqd. 3842 * Also, p->stime needs to be updated for ksoftirqd.
3843 */ 3843 */
3844 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 3844 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3845 &cpustat->softirq); 3845 &cpustat->softirq);
3846 } else if (user_tick) { 3846 } else if (user_tick) {
3847 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3847 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3848 } else if (p == rq->idle) { 3848 } else if (p == rq->idle) {
3849 account_idle_time(cputime_one_jiffy); 3849 account_idle_time(cputime_one_jiffy);
3850 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 3850 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3851 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 3851 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3852 } else { 3852 } else {
3853 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 3853 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3854 &cpustat->system); 3854 &cpustat->system);
3855 } 3855 }
3856 } 3856 }
3857 3857
3858 static void irqtime_account_idle_ticks(int ticks) 3858 static void irqtime_account_idle_ticks(int ticks)
3859 { 3859 {
3860 int i; 3860 int i;
3861 struct rq *rq = this_rq(); 3861 struct rq *rq = this_rq();
3862 3862
3863 for (i = 0; i < ticks; i++) 3863 for (i = 0; i < ticks; i++)
3864 irqtime_account_process_tick(current, 0, rq); 3864 irqtime_account_process_tick(current, 0, rq);
3865 } 3865 }
3866 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 3866 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
3867 static void irqtime_account_idle_ticks(int ticks) {} 3867 static void irqtime_account_idle_ticks(int ticks) {}
3868 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 3868 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3869 struct rq *rq) {} 3869 struct rq *rq) {}
3870 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 3870 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3871 3871
3872 /* 3872 /*
3873 * Account a single tick of cpu time. 3873 * Account a single tick of cpu time.
3874 * @p: the process that the cpu time gets accounted to 3874 * @p: the process that the cpu time gets accounted to
3875 * @user_tick: indicates if the tick is a user or a system tick 3875 * @user_tick: indicates if the tick is a user or a system tick
3876 */ 3876 */
3877 void account_process_tick(struct task_struct *p, int user_tick) 3877 void account_process_tick(struct task_struct *p, int user_tick)
3878 { 3878 {
3879 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3879 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3880 struct rq *rq = this_rq(); 3880 struct rq *rq = this_rq();
3881 3881
3882 if (sched_clock_irqtime) { 3882 if (sched_clock_irqtime) {
3883 irqtime_account_process_tick(p, user_tick, rq); 3883 irqtime_account_process_tick(p, user_tick, rq);
3884 return; 3884 return;
3885 } 3885 }
3886 3886
3887 if (user_tick) 3887 if (user_tick)
3888 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3888 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3889 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3889 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3890 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 3890 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3891 one_jiffy_scaled); 3891 one_jiffy_scaled);
3892 else 3892 else
3893 account_idle_time(cputime_one_jiffy); 3893 account_idle_time(cputime_one_jiffy);
3894 } 3894 }
3895 3895
3896 /* 3896 /*
3897 * Account multiple ticks of steal time. 3897 * Account multiple ticks of steal time.
3898 * @p: the process from which the cpu time has been stolen 3898 * @p: the process from which the cpu time has been stolen
3899 * @ticks: number of stolen ticks 3899 * @ticks: number of stolen ticks
3900 */ 3900 */
3901 void account_steal_ticks(unsigned long ticks) 3901 void account_steal_ticks(unsigned long ticks)
3902 { 3902 {
3903 account_steal_time(jiffies_to_cputime(ticks)); 3903 account_steal_time(jiffies_to_cputime(ticks));
3904 } 3904 }
3905 3905
3906 /* 3906 /*
3907 * Account multiple ticks of idle time. 3907 * Account multiple ticks of idle time.
3908 * @ticks: number of ticks the cpu was idle 3908 * @ticks: number of ticks the cpu was idle
3909 */ 3909 */
3910 void account_idle_ticks(unsigned long ticks) 3910 void account_idle_ticks(unsigned long ticks)
3911 { 3911 {
3912 3912
3913 if (sched_clock_irqtime) { 3913 if (sched_clock_irqtime) {
3914 irqtime_account_idle_ticks(ticks); 3914 irqtime_account_idle_ticks(ticks);
3915 return; 3915 return;
3916 } 3916 }
3917 3917
3918 account_idle_time(jiffies_to_cputime(ticks)); 3918 account_idle_time(jiffies_to_cputime(ticks));
3919 } 3919 }
3920 3920
3921 #endif 3921 #endif
3922 3922
3923 /* 3923 /*
3924 * Use precise platform statistics if available: 3924 * Use precise platform statistics if available:
3925 */ 3925 */
3926 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 3926 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
3927 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3927 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3928 { 3928 {
3929 *ut = p->utime; 3929 *ut = p->utime;
3930 *st = p->stime; 3930 *st = p->stime;
3931 } 3931 }
3932 3932
3933 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3933 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3934 { 3934 {
3935 struct task_cputime cputime; 3935 struct task_cputime cputime;
3936 3936
3937 thread_group_cputime(p, &cputime); 3937 thread_group_cputime(p, &cputime);
3938 3938
3939 *ut = cputime.utime; 3939 *ut = cputime.utime;
3940 *st = cputime.stime; 3940 *st = cputime.stime;
3941 } 3941 }
3942 #else 3942 #else
3943 3943
3944 #ifndef nsecs_to_cputime 3944 #ifndef nsecs_to_cputime
3945 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3945 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3946 #endif 3946 #endif
3947 3947
3948 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3948 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3949 { 3949 {
3950 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 3950 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3951 3951
3952 /* 3952 /*
3953 * Use CFS's precise accounting: 3953 * Use CFS's precise accounting:
3954 */ 3954 */
3955 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3955 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3956 3956
3957 if (total) { 3957 if (total) {
3958 u64 temp = rtime; 3958 u64 temp = rtime;
3959 3959
3960 temp *= utime; 3960 temp *= utime;
3961 do_div(temp, total); 3961 do_div(temp, total);
3962 utime = (cputime_t)temp; 3962 utime = (cputime_t)temp;
3963 } else 3963 } else
3964 utime = rtime; 3964 utime = rtime;
3965 3965
3966 /* 3966 /*
3967 * Compare with previous values, to keep monotonicity: 3967 * Compare with previous values, to keep monotonicity:
3968 */ 3968 */
3969 p->prev_utime = max(p->prev_utime, utime); 3969 p->prev_utime = max(p->prev_utime, utime);
3970 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 3970 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
3971 3971
3972 *ut = p->prev_utime; 3972 *ut = p->prev_utime;
3973 *st = p->prev_stime; 3973 *st = p->prev_stime;
3974 } 3974 }
3975 3975
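A standalone arithmetic sketch (not kernel code) of the split performed above: the precise CFS runtime rtime is divided between user and system time in the same ratio as the tick-sampled utime/stime, before the max() clamps enforce monotonicity. All of the numbers here are invented for illustration.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t utime = 30, stime = 10;	/* tick-sampled, same units as rtime */
	uint64_t total = utime + stime;
	uint64_t rtime = 48;			/* precise sum_exec_runtime, converted */
	uint64_t scaled_utime = rtime * utime / total;	/* 48 * 30 / 40 = 36 */
	uint64_t scaled_stime = rtime - scaled_utime;	/* 12 */

	printf("utime %llu, stime %llu\n",
	       (unsigned long long)scaled_utime,
	       (unsigned long long)scaled_stime);
	return 0;
}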
3976 /* 3976 /*
3977 * Must be called with siglock held. 3977 * Must be called with siglock held.
3978 */ 3978 */
3979 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3979 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3980 { 3980 {
3981 struct signal_struct *sig = p->signal; 3981 struct signal_struct *sig = p->signal;
3982 struct task_cputime cputime; 3982 struct task_cputime cputime;
3983 cputime_t rtime, utime, total; 3983 cputime_t rtime, utime, total;
3984 3984
3985 thread_group_cputime(p, &cputime); 3985 thread_group_cputime(p, &cputime);
3986 3986
3987 total = cputime_add(cputime.utime, cputime.stime); 3987 total = cputime_add(cputime.utime, cputime.stime);
3988 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3988 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3989 3989
3990 if (total) { 3990 if (total) {
3991 u64 temp = rtime; 3991 u64 temp = rtime;
3992 3992
3993 temp *= cputime.utime; 3993 temp *= cputime.utime;
3994 do_div(temp, total); 3994 do_div(temp, total);
3995 utime = (cputime_t)temp; 3995 utime = (cputime_t)temp;
3996 } else 3996 } else
3997 utime = rtime; 3997 utime = rtime;
3998 3998
3999 sig->prev_utime = max(sig->prev_utime, utime); 3999 sig->prev_utime = max(sig->prev_utime, utime);
4000 sig->prev_stime = max(sig->prev_stime, 4000 sig->prev_stime = max(sig->prev_stime,
4001 cputime_sub(rtime, sig->prev_utime)); 4001 cputime_sub(rtime, sig->prev_utime));
4002 4002
4003 *ut = sig->prev_utime; 4003 *ut = sig->prev_utime;
4004 *st = sig->prev_stime; 4004 *st = sig->prev_stime;
4005 } 4005 }
4006 #endif 4006 #endif
4007 4007
4008 /* 4008 /*
4009 * This function gets called by the timer code, with HZ frequency. 4009 * This function gets called by the timer code, with HZ frequency.
4010 * We call it with interrupts disabled. 4010 * We call it with interrupts disabled.
4011 * 4011 *
4012 * It also gets called by the fork code, when changing the parent's 4012 * It also gets called by the fork code, when changing the parent's
4013 * timeslices. 4013 * timeslices.
4014 */ 4014 */
4015 void scheduler_tick(void) 4015 void scheduler_tick(void)
4016 { 4016 {
4017 int cpu = smp_processor_id(); 4017 int cpu = smp_processor_id();
4018 struct rq *rq = cpu_rq(cpu); 4018 struct rq *rq = cpu_rq(cpu);
4019 struct task_struct *curr = rq->curr; 4019 struct task_struct *curr = rq->curr;
4020 4020
4021 sched_clock_tick(); 4021 sched_clock_tick();
4022 4022
4023 raw_spin_lock(&rq->lock); 4023 raw_spin_lock(&rq->lock);
4024 update_rq_clock(rq); 4024 update_rq_clock(rq);
4025 update_cpu_load_active(rq); 4025 update_cpu_load_active(rq);
4026 curr->sched_class->task_tick(rq, curr, 0); 4026 curr->sched_class->task_tick(rq, curr, 0);
4027 raw_spin_unlock(&rq->lock); 4027 raw_spin_unlock(&rq->lock);
4028 4028
4029 perf_event_task_tick(); 4029 perf_event_task_tick();
4030 4030
4031 #ifdef CONFIG_SMP 4031 #ifdef CONFIG_SMP
4032 rq->idle_at_tick = idle_cpu(cpu); 4032 rq->idle_at_tick = idle_cpu(cpu);
4033 trigger_load_balance(rq, cpu); 4033 trigger_load_balance(rq, cpu);
4034 #endif 4034 #endif
4035 } 4035 }
4036 4036
4037 notrace unsigned long get_parent_ip(unsigned long addr) 4037 notrace unsigned long get_parent_ip(unsigned long addr)
4038 { 4038 {
4039 if (in_lock_functions(addr)) { 4039 if (in_lock_functions(addr)) {
4040 addr = CALLER_ADDR2; 4040 addr = CALLER_ADDR2;
4041 if (in_lock_functions(addr)) 4041 if (in_lock_functions(addr))
4042 addr = CALLER_ADDR3; 4042 addr = CALLER_ADDR3;
4043 } 4043 }
4044 return addr; 4044 return addr;
4045 } 4045 }
4046 4046
4047 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4047 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4048 defined(CONFIG_PREEMPT_TRACER)) 4048 defined(CONFIG_PREEMPT_TRACER))
4049 4049
4050 void __kprobes add_preempt_count(int val) 4050 void __kprobes add_preempt_count(int val)
4051 { 4051 {
4052 #ifdef CONFIG_DEBUG_PREEMPT 4052 #ifdef CONFIG_DEBUG_PREEMPT
4053 /* 4053 /*
4054 * Underflow? 4054 * Underflow?
4055 */ 4055 */
4056 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4056 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4057 return; 4057 return;
4058 #endif 4058 #endif
4059 preempt_count() += val; 4059 preempt_count() += val;
4060 #ifdef CONFIG_DEBUG_PREEMPT 4060 #ifdef CONFIG_DEBUG_PREEMPT
4061 /* 4061 /*
4062 * Spinlock count overflowing soon? 4062 * Spinlock count overflowing soon?
4063 */ 4063 */
4064 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4064 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4065 PREEMPT_MASK - 10); 4065 PREEMPT_MASK - 10);
4066 #endif 4066 #endif
4067 if (preempt_count() == val) 4067 if (preempt_count() == val)
4068 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 4068 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4069 } 4069 }
4070 EXPORT_SYMBOL(add_preempt_count); 4070 EXPORT_SYMBOL(add_preempt_count);
4071 4071
4072 void __kprobes sub_preempt_count(int val) 4072 void __kprobes sub_preempt_count(int val)
4073 { 4073 {
4074 #ifdef CONFIG_DEBUG_PREEMPT 4074 #ifdef CONFIG_DEBUG_PREEMPT
4075 /* 4075 /*
4076 * Underflow? 4076 * Underflow?
4077 */ 4077 */
4078 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4078 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4079 return; 4079 return;
4080 /* 4080 /*
4081 * Is the spinlock portion underflowing? 4081 * Is the spinlock portion underflowing?
4082 */ 4082 */
4083 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4083 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4084 !(preempt_count() & PREEMPT_MASK))) 4084 !(preempt_count() & PREEMPT_MASK)))
4085 return; 4085 return;
4086 #endif 4086 #endif
4087 4087
4088 if (preempt_count() == val) 4088 if (preempt_count() == val)
4089 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 4089 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4090 preempt_count() -= val; 4090 preempt_count() -= val;
4091 } 4091 }
4092 EXPORT_SYMBOL(sub_preempt_count); 4092 EXPORT_SYMBOL(sub_preempt_count);
4093 4093
4094 #endif 4094 #endif
4095 4095
4096 /* 4096 /*
4097 * Print scheduling while atomic bug: 4097 * Print scheduling while atomic bug:
4098 */ 4098 */
4099 static noinline void __schedule_bug(struct task_struct *prev) 4099 static noinline void __schedule_bug(struct task_struct *prev)
4100 { 4100 {
4101 struct pt_regs *regs = get_irq_regs(); 4101 struct pt_regs *regs = get_irq_regs();
4102 4102
4103 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 4103 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4104 prev->comm, prev->pid, preempt_count()); 4104 prev->comm, prev->pid, preempt_count());
4105 4105
4106 debug_show_held_locks(prev); 4106 debug_show_held_locks(prev);
4107 print_modules(); 4107 print_modules();
4108 if (irqs_disabled()) 4108 if (irqs_disabled())
4109 print_irqtrace_events(prev); 4109 print_irqtrace_events(prev);
4110 4110
4111 if (regs) 4111 if (regs)
4112 show_regs(regs); 4112 show_regs(regs);
4113 else 4113 else
4114 dump_stack(); 4114 dump_stack();
4115 } 4115 }
4116 4116
4117 /* 4117 /*
4118 * Various schedule()-time debugging checks and statistics: 4118 * Various schedule()-time debugging checks and statistics:
4119 */ 4119 */
4120 static inline void schedule_debug(struct task_struct *prev) 4120 static inline void schedule_debug(struct task_struct *prev)
4121 { 4121 {
4122 /* 4122 /*
4123 * Test if we are atomic. Since do_exit() needs to call into 4123 * Test if we are atomic. Since do_exit() needs to call into
4124 * schedule() atomically, we ignore that path for now. 4124 * schedule() atomically, we ignore that path for now.
4125 * Otherwise, whine if we are scheduling when we should not be. 4125 * Otherwise, whine if we are scheduling when we should not be.
4126 */ 4126 */
4127 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 4127 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4128 __schedule_bug(prev); 4128 __schedule_bug(prev);
4129 4129
4130 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4130 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4131 4131
4132 schedstat_inc(this_rq(), sched_count); 4132 schedstat_inc(this_rq(), sched_count);
4133 #ifdef CONFIG_SCHEDSTATS 4133 #ifdef CONFIG_SCHEDSTATS
4134 if (unlikely(prev->lock_depth >= 0)) { 4134 if (unlikely(prev->lock_depth >= 0)) {
4135 schedstat_inc(this_rq(), rq_sched_info.bkl_count); 4135 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4136 schedstat_inc(prev, sched_info.bkl_count); 4136 schedstat_inc(prev, sched_info.bkl_count);
4137 } 4137 }
4138 #endif 4138 #endif
4139 } 4139 }
4140 4140
4141 static void put_prev_task(struct rq *rq, struct task_struct *prev) 4141 static void put_prev_task(struct rq *rq, struct task_struct *prev)
4142 { 4142 {
4143 if (prev->on_rq) 4143 if (prev->on_rq)
4144 update_rq_clock(rq); 4144 update_rq_clock(rq);
4145 prev->sched_class->put_prev_task(rq, prev); 4145 prev->sched_class->put_prev_task(rq, prev);
4146 } 4146 }
4147 4147
4148 /* 4148 /*
4149 * Pick up the highest-prio task: 4149 * Pick up the highest-prio task:
4150 */ 4150 */
4151 static inline struct task_struct * 4151 static inline struct task_struct *
4152 pick_next_task(struct rq *rq) 4152 pick_next_task(struct rq *rq)
4153 { 4153 {
4154 const struct sched_class *class; 4154 const struct sched_class *class;
4155 struct task_struct *p; 4155 struct task_struct *p;
4156 4156
4157 /* 4157 /*
4158 * Optimization: we know that if all tasks are in 4158 * Optimization: we know that if all tasks are in
4159 * the fair class we can call that function directly: 4159 * the fair class we can call that function directly:
4160 */ 4160 */
4161 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4161 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4162 p = fair_sched_class.pick_next_task(rq); 4162 p = fair_sched_class.pick_next_task(rq);
4163 if (likely(p)) 4163 if (likely(p))
4164 return p; 4164 return p;
4165 } 4165 }
4166 4166
4167 for_each_class(class) { 4167 for_each_class(class) {
4168 p = class->pick_next_task(rq); 4168 p = class->pick_next_task(rq);
4169 if (p) 4169 if (p)
4170 return p; 4170 return p;
4171 } 4171 }
4172 4172
4173 BUG(); /* the idle class will always have a runnable task */ 4173 BUG(); /* the idle class will always have a runnable task */
4174 } 4174 }
4175 4175
4176 /* 4176 /*
4177 * schedule() is the main scheduler function. 4177 * schedule() is the main scheduler function.
4178 */ 4178 */
4179 asmlinkage void __sched schedule(void) 4179 asmlinkage void __sched schedule(void)
4180 { 4180 {
4181 struct task_struct *prev, *next; 4181 struct task_struct *prev, *next;
4182 unsigned long *switch_count; 4182 unsigned long *switch_count;
4183 struct rq *rq; 4183 struct rq *rq;
4184 int cpu; 4184 int cpu;
4185 4185
4186 need_resched: 4186 need_resched:
4187 preempt_disable(); 4187 preempt_disable();
4188 cpu = smp_processor_id(); 4188 cpu = smp_processor_id();
4189 rq = cpu_rq(cpu); 4189 rq = cpu_rq(cpu);
4190 rcu_note_context_switch(cpu); 4190 rcu_note_context_switch(cpu);
4191 prev = rq->curr; 4191 prev = rq->curr;
4192 4192
4193 schedule_debug(prev); 4193 schedule_debug(prev);
4194 4194
4195 if (sched_feat(HRTICK)) 4195 if (sched_feat(HRTICK))
4196 hrtick_clear(rq); 4196 hrtick_clear(rq);
4197 4197
4198 raw_spin_lock_irq(&rq->lock); 4198 raw_spin_lock_irq(&rq->lock);
4199 4199
4200 switch_count = &prev->nivcsw; 4200 switch_count = &prev->nivcsw;
4201 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4201 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4202 if (unlikely(signal_pending_state(prev->state, prev))) { 4202 if (unlikely(signal_pending_state(prev->state, prev))) {
4203 prev->state = TASK_RUNNING; 4203 prev->state = TASK_RUNNING;
4204 } else { 4204 } else {
4205 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4205 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4206 prev->on_rq = 0; 4206 prev->on_rq = 0;
4207 4207
4208 /* 4208 /*
4209 * If a worker went to sleep, notify and ask workqueue 4209 * If a worker went to sleep, notify and ask workqueue
4210 * whether it wants to wake up a task to maintain 4210 * whether it wants to wake up a task to maintain
4211 * concurrency. 4211 * concurrency.
4212 */ 4212 */
4213 if (prev->flags & PF_WQ_WORKER) { 4213 if (prev->flags & PF_WQ_WORKER) {
4214 struct task_struct *to_wakeup; 4214 struct task_struct *to_wakeup;
4215 4215
4216 to_wakeup = wq_worker_sleeping(prev, cpu); 4216 to_wakeup = wq_worker_sleeping(prev, cpu);
4217 if (to_wakeup) 4217 if (to_wakeup)
4218 try_to_wake_up_local(to_wakeup); 4218 try_to_wake_up_local(to_wakeup);
4219 } 4219 }
4220 4220
4221 /* 4221 /*
4222 * If we are going to sleep and we have plugged IO 4222 * If we are going to sleep and we have plugged IO
4223 * queued, make sure to submit it to avoid deadlocks. 4223 * queued, make sure to submit it to avoid deadlocks.
4224 */ 4224 */
4225 if (blk_needs_flush_plug(prev)) { 4225 if (blk_needs_flush_plug(prev)) {
4226 raw_spin_unlock(&rq->lock); 4226 raw_spin_unlock(&rq->lock);
4227 blk_schedule_flush_plug(prev); 4227 blk_schedule_flush_plug(prev);
4228 raw_spin_lock(&rq->lock); 4228 raw_spin_lock(&rq->lock);
4229 } 4229 }
4230 } 4230 }
4231 switch_count = &prev->nvcsw; 4231 switch_count = &prev->nvcsw;
4232 } 4232 }
4233 4233
4234 pre_schedule(rq, prev); 4234 pre_schedule(rq, prev);
4235 4235
4236 if (unlikely(!rq->nr_running)) 4236 if (unlikely(!rq->nr_running))
4237 idle_balance(cpu, rq); 4237 idle_balance(cpu, rq);
4238 4238
4239 put_prev_task(rq, prev); 4239 put_prev_task(rq, prev);
4240 next = pick_next_task(rq); 4240 next = pick_next_task(rq);
4241 clear_tsk_need_resched(prev); 4241 clear_tsk_need_resched(prev);
4242 rq->skip_clock_update = 0; 4242 rq->skip_clock_update = 0;
4243 4243
4244 if (likely(prev != next)) { 4244 if (likely(prev != next)) {
4245 rq->nr_switches++; 4245 rq->nr_switches++;
4246 rq->curr = next; 4246 rq->curr = next;
4247 ++*switch_count; 4247 ++*switch_count;
4248 4248
4249 context_switch(rq, prev, next); /* unlocks the rq */ 4249 context_switch(rq, prev, next); /* unlocks the rq */
4250 /* 4250 /*
4251 * The context switch has flipped the stack from under us 4251 * The context switch has flipped the stack from under us
4252 * and restored the local variables which were saved when 4252 * and restored the local variables which were saved when
4253 * this task called schedule() in the past. prev == current 4253 * this task called schedule() in the past. prev == current
4254 * is still correct, but it can be moved to another cpu/rq. 4254 * is still correct, but it can be moved to another cpu/rq.
4255 */ 4255 */
4256 cpu = smp_processor_id(); 4256 cpu = smp_processor_id();
4257 rq = cpu_rq(cpu); 4257 rq = cpu_rq(cpu);
4258 } else 4258 } else
4259 raw_spin_unlock_irq(&rq->lock); 4259 raw_spin_unlock_irq(&rq->lock);
4260 4260
4261 post_schedule(rq); 4261 post_schedule(rq);
4262 4262
4263 preempt_enable_no_resched(); 4263 preempt_enable_no_resched();
4264 if (need_resched()) 4264 if (need_resched())
4265 goto need_resched; 4265 goto need_resched;
4266 } 4266 }
4267 EXPORT_SYMBOL(schedule); 4267 EXPORT_SYMBOL(schedule);
4268 4268
4269 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4269 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4270 4270
4271 static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 4271 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4272 { 4272 {
4273 bool ret = false; 4273 bool ret = false;
4274 4274
4275 rcu_read_lock(); 4275 rcu_read_lock();
4276 if (lock->owner != owner) 4276 if (lock->owner != owner)
4277 goto fail; 4277 goto fail;
4278 4278
4279 /* 4279 /*
4280 * Ensure we emit the owner->on_cpu dereference _after_ checking that 4280 * Ensure we emit the owner->on_cpu dereference _after_ checking that
4281 * lock->owner still matches owner. If that fails, owner might 4281 * lock->owner still matches owner. If that fails, owner might
4282 * point to free()d memory; if it still matches, the rcu_read_lock() 4282 * point to free()d memory; if it still matches, the rcu_read_lock()
4283 * ensures the memory stays valid. 4283 * ensures the memory stays valid.
4284 */ 4284 */
4285 barrier(); 4285 barrier();
4286 4286
4287 ret = owner->on_cpu; 4287 ret = owner->on_cpu;
4288 fail: 4288 fail:
4289 rcu_read_unlock(); 4289 rcu_read_unlock();
4290 4290
4291 return ret; 4291 return ret;
4292 } 4292 }
4293 4293
4294 /* 4294 /*
4295 * Look out! "owner" is an entirely speculative pointer 4295 * Look out! "owner" is an entirely speculative pointer
4296 * access and not reliable. 4296 * access and not reliable.
4297 */ 4297 */
4298 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 4298 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4299 { 4299 {
4300 if (!sched_feat(OWNER_SPIN)) 4300 if (!sched_feat(OWNER_SPIN))
4301 return 0; 4301 return 0;
4302 4302
4303 while (owner_running(lock, owner)) { 4303 while (owner_running(lock, owner)) {
4304 if (need_resched()) 4304 if (need_resched())
4305 return 0; 4305 return 0;
4306 4306
4307 arch_mutex_cpu_relax(); 4307 arch_mutex_cpu_relax();
4308 } 4308 }
4309 4309
4310 /* 4310 /*
4311 * If the owner changed to another task there is likely 4311 * If the owner changed to another task there is likely
4312 * heavy contention, stop spinning. 4312 * heavy contention, stop spinning.
4313 */ 4313 */
4314 if (lock->owner) 4314 if (lock->owner)
4315 return 0; 4315 return 0;
4316 4316
4317 return 1; 4317 return 1;
4318 } 4318 }
4319 #endif 4319 #endif
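For orientation, here is a heavily simplified sketch of how an optimistic-spinning caller could use mutex_spin_on_owner(): keep spinning while the current owner stays on a CPU, and give up (and sleep) once the owner is scheduled out. demo_try_acquire() is a hypothetical stand-in for the real acquire attempt; the actual mutex slowpath in kernel/mutex.c is considerably more careful than this fragment.

    /*
     * Illustrative fragment only: 'lock' is a contended struct mutex and
     * demo_try_acquire() is a made-up stand-in for the real acquire attempt.
     */
    for (;;) {
    	struct task_struct *owner = ACCESS_ONCE(lock->owner);

    	if (owner && !mutex_spin_on_owner(lock, owner))
    		break;			/* owner went to sleep: stop spinning */

    	if (demo_try_acquire(lock))
    		return 0;		/* lock became free while we spun */

    	arch_mutex_cpu_relax();
    }
    /* ... fall back to the blocking slowpath ... */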
4320 4320
4321 #ifdef CONFIG_PREEMPT 4321 #ifdef CONFIG_PREEMPT
4322 /* 4322 /*
4323 * this is the entry point to schedule() from in-kernel preemption 4323 * this is the entry point to schedule() from in-kernel preemption
4324 * off of preempt_enable. Kernel preemptions off of the 4324 * off of preempt_enable. Kernel preemptions off of the
4325 * return-from-interrupt path occur there and call schedule() directly. 4325 * return-from-interrupt path occur there and call schedule() directly.
4326 */ 4326 */
4327 asmlinkage void __sched notrace preempt_schedule(void) 4327 asmlinkage void __sched notrace preempt_schedule(void)
4328 { 4328 {
4329 struct thread_info *ti = current_thread_info(); 4329 struct thread_info *ti = current_thread_info();
4330 4330
4331 /* 4331 /*
4332 * If there is a non-zero preempt_count or interrupts are disabled, 4332 * If there is a non-zero preempt_count or interrupts are disabled,
4333 * we do not want to preempt the current task. Just return. 4333 * we do not want to preempt the current task. Just return.
4334 */ 4334 */
4335 if (likely(ti->preempt_count || irqs_disabled())) 4335 if (likely(ti->preempt_count || irqs_disabled()))
4336 return; 4336 return;
4337 4337
4338 do { 4338 do {
4339 add_preempt_count_notrace(PREEMPT_ACTIVE); 4339 add_preempt_count_notrace(PREEMPT_ACTIVE);
4340 schedule(); 4340 schedule();
4341 sub_preempt_count_notrace(PREEMPT_ACTIVE); 4341 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4342 4342
4343 /* 4343 /*
4344 * Check again in case we missed a preemption opportunity 4344 * Check again in case we missed a preemption opportunity
4345 * between schedule and now. 4345 * between schedule and now.
4346 */ 4346 */
4347 barrier(); 4347 barrier();
4348 } while (need_resched()); 4348 } while (need_resched());
4349 } 4349 }
4350 EXPORT_SYMBOL(preempt_schedule); 4350 EXPORT_SYMBOL(preempt_schedule);
4351 4351
4352 /* 4352 /*
4353 * this is the entry point to schedule() from kernel preemption 4353 * this is the entry point to schedule() from kernel preemption
4354 * off of irq context. 4354 * off of irq context.
4355 * Note that this is called and returns with irqs disabled. This will 4355 * Note that this is called and returns with irqs disabled. This will
4356 * protect us against recursive calling from irq. 4356 * protect us against recursive calling from irq.
4357 */ 4357 */
4358 asmlinkage void __sched preempt_schedule_irq(void) 4358 asmlinkage void __sched preempt_schedule_irq(void)
4359 { 4359 {
4360 struct thread_info *ti = current_thread_info(); 4360 struct thread_info *ti = current_thread_info();
4361 4361
4362 /* Catch callers which need to be fixed */ 4362 /* Catch callers which need to be fixed */
4363 BUG_ON(ti->preempt_count || !irqs_disabled()); 4363 BUG_ON(ti->preempt_count || !irqs_disabled());
4364 4364
4365 do { 4365 do {
4366 add_preempt_count(PREEMPT_ACTIVE); 4366 add_preempt_count(PREEMPT_ACTIVE);
4367 local_irq_enable(); 4367 local_irq_enable();
4368 schedule(); 4368 schedule();
4369 local_irq_disable(); 4369 local_irq_disable();
4370 sub_preempt_count(PREEMPT_ACTIVE); 4370 sub_preempt_count(PREEMPT_ACTIVE);
4371 4371
4372 /* 4372 /*
4373 * Check again in case we missed a preemption opportunity 4373 * Check again in case we missed a preemption opportunity
4374 * between schedule and now. 4374 * between schedule and now.
4375 */ 4375 */
4376 barrier(); 4376 barrier();
4377 } while (need_resched()); 4377 } while (need_resched());
4378 } 4378 }
4379 4379
4380 #endif /* CONFIG_PREEMPT */ 4380 #endif /* CONFIG_PREEMPT */
4381 4381
4382 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 4382 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
4383 void *key) 4383 void *key)
4384 { 4384 {
4385 return try_to_wake_up(curr->private, mode, wake_flags); 4385 return try_to_wake_up(curr->private, mode, wake_flags);
4386 } 4386 }
4387 EXPORT_SYMBOL(default_wake_function); 4387 EXPORT_SYMBOL(default_wake_function);
4388 4388
4389 /* 4389 /*
4390 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 4390 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4391 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 4391 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
4392 * number) then we wake all the non-exclusive tasks and one exclusive task. 4392 * number) then we wake all the non-exclusive tasks and one exclusive task.
4393 * 4393 *
4394 * There are circumstances in which we can try to wake a task which has already 4394 * There are circumstances in which we can try to wake a task which has already
4395 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 4395 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4396 * zero in this (rare) case, and we handle it by continuing to scan the queue. 4396 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4397 */ 4397 */
4398 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 4398 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4399 int nr_exclusive, int wake_flags, void *key) 4399 int nr_exclusive, int wake_flags, void *key)
4400 { 4400 {
4401 wait_queue_t *curr, *next; 4401 wait_queue_t *curr, *next;
4402 4402
4403 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 4403 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4404 unsigned flags = curr->flags; 4404 unsigned flags = curr->flags;
4405 4405
4406 if (curr->func(curr, mode, wake_flags, key) && 4406 if (curr->func(curr, mode, wake_flags, key) &&
4407 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 4407 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4408 break; 4408 break;
4409 } 4409 }
4410 } 4410 }
4411 4411
4412 /** 4412 /**
4413 * __wake_up - wake up threads blocked on a waitqueue. 4413 * __wake_up - wake up threads blocked on a waitqueue.
4414 * @q: the waitqueue 4414 * @q: the waitqueue
4415 * @mode: which threads 4415 * @mode: which threads
4416 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4416 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4417 * @key: is directly passed to the wakeup function 4417 * @key: is directly passed to the wakeup function
4418 * 4418 *
4419 * It may be assumed that this function implies a write memory barrier before 4419 * It may be assumed that this function implies a write memory barrier before
4420 * changing the task state if and only if any tasks are woken up. 4420 * changing the task state if and only if any tasks are woken up.
4421 */ 4421 */
4422 void __wake_up(wait_queue_head_t *q, unsigned int mode, 4422 void __wake_up(wait_queue_head_t *q, unsigned int mode,
4423 int nr_exclusive, void *key) 4423 int nr_exclusive, void *key)
4424 { 4424 {
4425 unsigned long flags; 4425 unsigned long flags;
4426 4426
4427 spin_lock_irqsave(&q->lock, flags); 4427 spin_lock_irqsave(&q->lock, flags);
4428 __wake_up_common(q, mode, nr_exclusive, 0, key); 4428 __wake_up_common(q, mode, nr_exclusive, 0, key);
4429 spin_unlock_irqrestore(&q->lock, flags); 4429 spin_unlock_irqrestore(&q->lock, flags);
4430 } 4430 }
4431 EXPORT_SYMBOL(__wake_up); 4431 EXPORT_SYMBOL(__wake_up);
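Callers rarely invoke __wake_up() directly; the wake_up*() helpers in <linux/wait.h> wrap it with TASK_NORMAL and nr_exclusive == 1. A minimal sleep/wake pairing might look like the sketch below, with data_wq and data_ready invented for the example.

    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(data_wq);	/* hypothetical wait queue */
    static int data_ready;

    /* consumer side: sleep until data_ready is set (or a signal arrives) */
    static int wait_for_data(void)
    {
    	return wait_event_interruptible(data_wq, data_ready);
    }

    /* producer side: publish the condition, then wake the waiters */
    static void publish_data(void)
    {
    	data_ready = 1;
    	wake_up(&data_wq);	/* wraps __wake_up(&data_wq, TASK_NORMAL, 1, NULL) */
    }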
4432 4432
4433 /* 4433 /*
4434 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 4434 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4435 */ 4435 */
4436 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 4436 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4437 { 4437 {
4438 __wake_up_common(q, mode, 1, 0, NULL); 4438 __wake_up_common(q, mode, 1, 0, NULL);
4439 } 4439 }
4440 EXPORT_SYMBOL_GPL(__wake_up_locked); 4440 EXPORT_SYMBOL_GPL(__wake_up_locked);
4441 4441
4442 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 4442 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4443 { 4443 {
4444 __wake_up_common(q, mode, 1, 0, key); 4444 __wake_up_common(q, mode, 1, 0, key);
4445 } 4445 }
4446 EXPORT_SYMBOL_GPL(__wake_up_locked_key); 4446 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4447 4447
4448 /** 4448 /**
4449 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4449 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4450 * @q: the waitqueue 4450 * @q: the waitqueue
4451 * @mode: which threads 4451 * @mode: which threads
4452 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4452 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4453 * @key: opaque value to be passed to wakeup targets 4453 * @key: opaque value to be passed to wakeup targets
4454 * 4454 *
4455 * The sync wakeup differs in that the waker knows that it will schedule 4455 * The sync wakeup differs in that the waker knows that it will schedule
4456 * away soon, so while the target thread will be woken up, it will not 4456 * away soon, so while the target thread will be woken up, it will not
4457 * be migrated to another CPU - ie. the two threads are 'synchronized' 4457 * be migrated to another CPU - ie. the two threads are 'synchronized'
4458 * with each other. This can prevent needless bouncing between CPUs. 4458 * with each other. This can prevent needless bouncing between CPUs.
4459 * 4459 *
4460 * On UP it can prevent extra preemption. 4460 * On UP it can prevent extra preemption.
4461 * 4461 *
4462 * It may be assumed that this function implies a write memory barrier before 4462 * It may be assumed that this function implies a write memory barrier before
4463 * changing the task state if and only if any tasks are woken up. 4463 * changing the task state if and only if any tasks are woken up.
4464 */ 4464 */
4465 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 4465 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4466 int nr_exclusive, void *key) 4466 int nr_exclusive, void *key)
4467 { 4467 {
4468 unsigned long flags; 4468 unsigned long flags;
4469 int wake_flags = WF_SYNC; 4469 int wake_flags = WF_SYNC;
4470 4470
4471 if (unlikely(!q)) 4471 if (unlikely(!q))
4472 return; 4472 return;
4473 4473
4474 if (unlikely(!nr_exclusive)) 4474 if (unlikely(!nr_exclusive))
4475 wake_flags = 0; 4475 wake_flags = 0;
4476 4476
4477 spin_lock_irqsave(&q->lock, flags); 4477 spin_lock_irqsave(&q->lock, flags);
4478 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 4478 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
4479 spin_unlock_irqrestore(&q->lock, flags); 4479 spin_unlock_irqrestore(&q->lock, flags);
4480 } 4480 }
4481 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 4481 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4482 4482
4483 /* 4483 /*
4484 * __wake_up_sync - see __wake_up_sync_key() 4484 * __wake_up_sync - see __wake_up_sync_key()
4485 */ 4485 */
4486 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 4486 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4487 { 4487 {
4488 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 4488 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4489 } 4489 }
4490 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4490 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4491 4491
4492 /** 4492 /**
4493 * complete: - signals a single thread waiting on this completion 4493 * complete: - signals a single thread waiting on this completion
4494 * @x: holds the state of this particular completion 4494 * @x: holds the state of this particular completion
4495 * 4495 *
4496 * This will wake up a single thread waiting on this completion. Threads will be 4496 * This will wake up a single thread waiting on this completion. Threads will be
4497 * awakened in the same order in which they were queued. 4497 * awakened in the same order in which they were queued.
4498 * 4498 *
4499 * See also complete_all(), wait_for_completion() and related routines. 4499 * See also complete_all(), wait_for_completion() and related routines.
4500 * 4500 *
4501 * It may be assumed that this function implies a write memory barrier before 4501 * It may be assumed that this function implies a write memory barrier before
4502 * changing the task state if and only if any tasks are woken up. 4502 * changing the task state if and only if any tasks are woken up.
4503 */ 4503 */
4504 void complete(struct completion *x) 4504 void complete(struct completion *x)
4505 { 4505 {
4506 unsigned long flags; 4506 unsigned long flags;
4507 4507
4508 spin_lock_irqsave(&x->wait.lock, flags); 4508 spin_lock_irqsave(&x->wait.lock, flags);
4509 x->done++; 4509 x->done++;
4510 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 4510 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4511 spin_unlock_irqrestore(&x->wait.lock, flags); 4511 spin_unlock_irqrestore(&x->wait.lock, flags);
4512 } 4512 }
4513 EXPORT_SYMBOL(complete); 4513 EXPORT_SYMBOL(complete);
4514 4514
4515 /** 4515 /**
4516 * complete_all: - signals all threads waiting on this completion 4516 * complete_all: - signals all threads waiting on this completion
4517 * @x: holds the state of this particular completion 4517 * @x: holds the state of this particular completion
4518 * 4518 *
4519 * This will wake up all threads waiting on this particular completion event. 4519 * This will wake up all threads waiting on this particular completion event.
4520 * 4520 *
4521 * It may be assumed that this function implies a write memory barrier before 4521 * It may be assumed that this function implies a write memory barrier before
4522 * changing the task state if and only if any tasks are woken up. 4522 * changing the task state if and only if any tasks are woken up.
4523 */ 4523 */
4524 void complete_all(struct completion *x) 4524 void complete_all(struct completion *x)
4525 { 4525 {
4526 unsigned long flags; 4526 unsigned long flags;
4527 4527
4528 spin_lock_irqsave(&x->wait.lock, flags); 4528 spin_lock_irqsave(&x->wait.lock, flags);
4529 x->done += UINT_MAX/2; 4529 x->done += UINT_MAX/2;
4530 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 4530 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4531 spin_unlock_irqrestore(&x->wait.lock, flags); 4531 spin_unlock_irqrestore(&x->wait.lock, flags);
4532 } 4532 }
4533 EXPORT_SYMBOL(complete_all); 4533 EXPORT_SYMBOL(complete_all);
4534 4534
4535 static inline long __sched 4535 static inline long __sched
4536 do_wait_for_common(struct completion *x, long timeout, int state) 4536 do_wait_for_common(struct completion *x, long timeout, int state)
4537 { 4537 {
4538 if (!x->done) { 4538 if (!x->done) {
4539 DECLARE_WAITQUEUE(wait, current); 4539 DECLARE_WAITQUEUE(wait, current);
4540 4540
4541 __add_wait_queue_tail_exclusive(&x->wait, &wait); 4541 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4542 do { 4542 do {
4543 if (signal_pending_state(state, current)) { 4543 if (signal_pending_state(state, current)) {
4544 timeout = -ERESTARTSYS; 4544 timeout = -ERESTARTSYS;
4545 break; 4545 break;
4546 } 4546 }
4547 __set_current_state(state); 4547 __set_current_state(state);
4548 spin_unlock_irq(&x->wait.lock); 4548 spin_unlock_irq(&x->wait.lock);
4549 timeout = schedule_timeout(timeout); 4549 timeout = schedule_timeout(timeout);
4550 spin_lock_irq(&x->wait.lock); 4550 spin_lock_irq(&x->wait.lock);
4551 } while (!x->done && timeout); 4551 } while (!x->done && timeout);
4552 __remove_wait_queue(&x->wait, &wait); 4552 __remove_wait_queue(&x->wait, &wait);
4553 if (!x->done) 4553 if (!x->done)
4554 return timeout; 4554 return timeout;
4555 } 4555 }
4556 x->done--; 4556 x->done--;
4557 return timeout ?: 1; 4557 return timeout ?: 1;
4558 } 4558 }
4559 4559
4560 static long __sched 4560 static long __sched
4561 wait_for_common(struct completion *x, long timeout, int state) 4561 wait_for_common(struct completion *x, long timeout, int state)
4562 { 4562 {
4563 might_sleep(); 4563 might_sleep();
4564 4564
4565 spin_lock_irq(&x->wait.lock); 4565 spin_lock_irq(&x->wait.lock);
4566 timeout = do_wait_for_common(x, timeout, state); 4566 timeout = do_wait_for_common(x, timeout, state);
4567 spin_unlock_irq(&x->wait.lock); 4567 spin_unlock_irq(&x->wait.lock);
4568 return timeout; 4568 return timeout;
4569 } 4569 }
4570 4570
4571 /** 4571 /**
4572 * wait_for_completion: - waits for completion of a task 4572 * wait_for_completion: - waits for completion of a task
4573 * @x: holds the state of this particular completion 4573 * @x: holds the state of this particular completion
4574 * 4574 *
4575 * This waits to be signaled for completion of a specific task. It is NOT 4575 * This waits to be signaled for completion of a specific task. It is NOT
4576 * interruptible and there is no timeout. 4576 * interruptible and there is no timeout.
4577 * 4577 *
4578 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout 4578 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4579 * and interrupt capability. Also see complete(). 4579 * and interrupt capability. Also see complete().
4580 */ 4580 */
4581 void __sched wait_for_completion(struct completion *x) 4581 void __sched wait_for_completion(struct completion *x)
4582 { 4582 {
4583 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4583 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4584 } 4584 }
4585 EXPORT_SYMBOL(wait_for_completion); 4585 EXPORT_SYMBOL(wait_for_completion);
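A typical pairing of complete() with wait_for_completion(), sketched with invented names (setup_done, worker_fn); error handling for kthread_run() is omitted for brevity.

    #include <linux/completion.h>
    #include <linux/kthread.h>

    static struct completion setup_done;	/* hypothetical completion */

    static int worker_fn(void *unused)
    {
    	/* ... one-time initialisation ... */
    	complete(&setup_done);		/* releases exactly one waiter */
    	return 0;
    }

    static void start_and_wait(void)
    {
    	init_completion(&setup_done);
    	kthread_run(worker_fn, NULL, "demo-worker");	/* error handling omitted */
    	wait_for_completion(&setup_done);	/* uninterruptible, no timeout */
    }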
4586 4586
4587 /** 4587 /**
4588 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 4588 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4589 * @x: holds the state of this particular completion 4589 * @x: holds the state of this particular completion
4590 * @timeout: timeout value in jiffies 4590 * @timeout: timeout value in jiffies
4591 * 4591 *
4592 * This waits for either a completion of a specific task to be signaled or for a 4592 * This waits for either a completion of a specific task to be signaled or for a
4593 * specified timeout to expire. The timeout is in jiffies. It is not 4593 * specified timeout to expire. The timeout is in jiffies. It is not
4594 * interruptible. 4594 * interruptible.
4595 */ 4595 */
4596 unsigned long __sched 4596 unsigned long __sched
4597 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4597 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4598 { 4598 {
4599 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 4599 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4600 } 4600 }
4601 EXPORT_SYMBOL(wait_for_completion_timeout); 4601 EXPORT_SYMBOL(wait_for_completion_timeout);
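The return value encodes both outcomes: 0 means the timeout expired, and any non-zero value is the number of jiffies left (at least 1, per the "timeout ?: 1" in do_wait_for_common() above). A small sketch, reusing the hypothetical setup_done completion from the previous example:

    unsigned long left;

    left = wait_for_completion_timeout(&setup_done, msecs_to_jiffies(500));
    if (!left)
    	pr_warn("demo: timed out waiting for setup\n");
    else
    	pr_info("demo: completed with %lu jiffies to spare\n", left);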
4602 4602
4603 /** 4603 /**
4604 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 4604 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4605 * @x: holds the state of this particular completion 4605 * @x: holds the state of this particular completion
4606 * 4606 *
4607 * This waits for completion of a specific task to be signaled. It is 4607 * This waits for completion of a specific task to be signaled. It is
4608 * interruptible. 4608 * interruptible.
4609 */ 4609 */
4610 int __sched wait_for_completion_interruptible(struct completion *x) 4610 int __sched wait_for_completion_interruptible(struct completion *x)
4611 { 4611 {
4612 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4612 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4613 if (t == -ERESTARTSYS) 4613 if (t == -ERESTARTSYS)
4614 return t; 4614 return t;
4615 return 0; 4615 return 0;
4616 } 4616 }
4617 EXPORT_SYMBOL(wait_for_completion_interruptible); 4617 EXPORT_SYMBOL(wait_for_completion_interruptible);
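The interruptible variant returns -ERESTARTSYS when a signal cuts the wait short and 0 on success, so callers in syscall context usually just propagate the error, as in this fragment (again using the hypothetical setup_done):

    int err = wait_for_completion_interruptible(&setup_done);
    if (err)	/* -ERESTARTSYS: a signal arrived before completion */
    	return err;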
4618 4618
4619 /** 4619 /**
4620 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 4620 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4621 * @x: holds the state of this particular completion 4621 * @x: holds the state of this particular completion
4622 * @timeout: timeout value in jiffies 4622 * @timeout: timeout value in jiffies
4623 * 4623 *
4624 * This waits for either a completion of a specific task to be signaled or for a 4624 * This waits for either a completion of a specific task to be signaled or for a
4625 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4625 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4626 */ 4626 */
4627 long __sched 4627 long __sched
4628 wait_for_completion_interruptible_timeout(struct completion *x, 4628 wait_for_completion_interruptible_timeout(struct completion *x,
4629 unsigned long timeout) 4629 unsigned long timeout)
4630 { 4630 {
4631 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 4631 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4632 } 4632 }
4633 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4633 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4634 4634
4635 /** 4635 /**
4636 * wait_for_completion_killable: - waits for completion of a task (killable) 4636 * wait_for_completion_killable: - waits for completion of a task (killable)
4637 * @x: holds the state of this particular completion 4637 * @x: holds the state of this particular completion
4638 * 4638 *
4639 * This waits to be signaled for completion of a specific task. It can be 4639 * This waits to be signaled for completion of a specific task. It can be
4640 * interrupted by a kill signal. 4640 * interrupted by a kill signal.
4641 */ 4641 */
4642 int __sched wait_for_completion_killable(struct completion *x) 4642 int __sched wait_for_completion_killable(struct completion *x)
4643 { 4643 {
4644 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4644 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4645 if (t == -ERESTARTSYS) 4645 if (t == -ERESTARTSYS)
4646 return t; 4646 return t;
4647 return 0; 4647 return 0;
4648 } 4648 }
4649 EXPORT_SYMBOL(wait_for_completion_killable); 4649 EXPORT_SYMBOL(wait_for_completion_killable);
4650 4650
4651 /** 4651 /**
4652 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 4652 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4653 * @x: holds the state of this particular completion 4653 * @x: holds the state of this particular completion
4654 * @timeout: timeout value in jiffies 4654 * @timeout: timeout value in jiffies
4655 * 4655 *
4656 * This waits for either a completion of a specific task to be 4656 * This waits for either a completion of a specific task to be
4657 * signaled or for a specified timeout to expire. It can be 4657 * signaled or for a specified timeout to expire. It can be
4658 * interrupted by a kill signal. The timeout is in jiffies. 4658 * interrupted by a kill signal. The timeout is in jiffies.
4659 */ 4659 */
4660 long __sched 4660 long __sched
4661 wait_for_completion_killable_timeout(struct completion *x, 4661 wait_for_completion_killable_timeout(struct completion *x,
4662 unsigned long timeout) 4662 unsigned long timeout)
4663 { 4663 {
4664 return wait_for_common(x, timeout, TASK_KILLABLE); 4664 return wait_for_common(x, timeout, TASK_KILLABLE);
4665 } 4665 }
4666 EXPORT_SYMBOL(wait_for_completion_killable_timeout); 4666 EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4667 4667
4668 /** 4668 /**
4669 * try_wait_for_completion - try to decrement a completion without blocking 4669 * try_wait_for_completion - try to decrement a completion without blocking
4670 * @x: completion structure 4670 * @x: completion structure
4671 * 4671 *
4672 * Returns: 0 if a decrement cannot be done without blocking 4672 * Returns: 0 if a decrement cannot be done without blocking
4673 * 1 if a decrement succeeded. 4673 * 1 if a decrement succeeded.
4674 * 4674 *
4675 * If a completion is being used as a counting completion, 4675 * If a completion is being used as a counting completion,
4676 * attempt to decrement the counter without blocking. This 4676 * attempt to decrement the counter without blocking. This
4677 * enables us to avoid waiting if the resource the completion 4677 * enables us to avoid waiting if the resource the completion
4678 * is protecting is not available. 4678 * is protecting is not available.
4679 */ 4679 */
4680 bool try_wait_for_completion(struct completion *x) 4680 bool try_wait_for_completion(struct completion *x)
4681 { 4681 {
4682 unsigned long flags; 4682 unsigned long flags;
4683 int ret = 1; 4683 int ret = 1;
4684 4684
4685 spin_lock_irqsave(&x->wait.lock, flags); 4685 spin_lock_irqsave(&x->wait.lock, flags);
4686 if (!x->done) 4686 if (!x->done)
4687 ret = 0; 4687 ret = 0;
4688 else 4688 else
4689 x->done--; 4689 x->done--;
4690 spin_unlock_irqrestore(&x->wait.lock, flags); 4690 spin_unlock_irqrestore(&x->wait.lock, flags);
4691 return ret; 4691 return ret;
4692 } 4692 }
4693 EXPORT_SYMBOL(try_wait_for_completion); 4693 EXPORT_SYMBOL(try_wait_for_completion);
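A sketch of the counting-completion pattern the comment above describes, with a hypothetical completion 'slots' (primed elsewhere with one complete() per free slot) and a made-up demo_use_slot() helper:

    if (try_wait_for_completion(&slots)) {
    	demo_use_slot();		/* got a slot without sleeping */
    } else {
    	/* nothing free right now; this caller chooses to block instead */
    	wait_for_completion(&slots);
    	demo_use_slot();
    }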
4694 4694
4695 /** 4695 /**
4696 * completion_done - Test to see if a completion has any waiters 4696 * completion_done - Test to see if a completion has any waiters
4697 * @x: completion structure 4697 * @x: completion structure
4698 * 4698 *
4699 * Returns: 0 if there are waiters (wait_for_completion() in progress) 4699 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4700 * 1 if there are no waiters. 4700 * 1 if there are no waiters.
4701 * 4701 *
4702 */ 4702 */
4703 bool completion_done(struct completion *x) 4703 bool completion_done(struct completion *x)
4704 { 4704 {
4705 unsigned long flags; 4705 unsigned long flags;
4706 int ret = 1; 4706 int ret = 1;
4707 4707
4708 spin_lock_irqsave(&x->wait.lock, flags); 4708 spin_lock_irqsave(&x->wait.lock, flags);
4709 if (!x->done) 4709 if (!x->done)
4710 ret = 0; 4710 ret = 0;
4711 spin_unlock_irqrestore(&x->wait.lock, flags); 4711 spin_unlock_irqrestore(&x->wait.lock, flags);
4712 return ret; 4712 return ret;
4713 } 4713 }
4714 EXPORT_SYMBOL(completion_done); 4714 EXPORT_SYMBOL(completion_done);
4715 4715
4716 static long __sched 4716 static long __sched
4717 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4717 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4718 { 4718 {
4719 unsigned long flags; 4719 unsigned long flags;
4720 wait_queue_t wait; 4720 wait_queue_t wait;
4721 4721
4722 init_waitqueue_entry(&wait, current); 4722 init_waitqueue_entry(&wait, current);
4723 4723
4724 __set_current_state(state); 4724 __set_current_state(state);
4725 4725
4726 spin_lock_irqsave(&q->lock, flags); 4726 spin_lock_irqsave(&q->lock, flags);
4727 __add_wait_queue(q, &wait); 4727 __add_wait_queue(q, &wait);
4728 spin_unlock(&q->lock); 4728 spin_unlock(&q->lock);
4729 timeout = schedule_timeout(timeout); 4729 timeout = schedule_timeout(timeout);
4730 spin_lock_irq(&q->lock); 4730 spin_lock_irq(&q->lock);
4731 __remove_wait_queue(q, &wait); 4731 __remove_wait_queue(q, &wait);
4732 spin_unlock_irqrestore(&q->lock, flags); 4732 spin_unlock_irqrestore(&q->lock, flags);
4733 4733
4734 return timeout; 4734 return timeout;
4735 } 4735 }
4736 4736
4737 void __sched interruptible_sleep_on(wait_queue_head_t *q) 4737 void __sched interruptible_sleep_on(wait_queue_head_t *q)
4738 { 4738 {
4739 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4739 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4740 } 4740 }
4741 EXPORT_SYMBOL(interruptible_sleep_on); 4741 EXPORT_SYMBOL(interruptible_sleep_on);
4742 4742
4743 long __sched 4743 long __sched
4744 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 4744 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4745 { 4745 {
4746 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 4746 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4747 } 4747 }
4748 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 4748 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4749 4749
4750 void __sched sleep_on(wait_queue_head_t *q) 4750 void __sched sleep_on(wait_queue_head_t *q)
4751 { 4751 {
4752 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4752 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4753 } 4753 }
4754 EXPORT_SYMBOL(sleep_on); 4754 EXPORT_SYMBOL(sleep_on);
4755 4755
4756 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 4756 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4757 { 4757 {
4758 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 4758 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4759 } 4759 }
4760 EXPORT_SYMBOL(sleep_on_timeout); 4760 EXPORT_SYMBOL(sleep_on_timeout);
4761 4761
4762 #ifdef CONFIG_RT_MUTEXES 4762 #ifdef CONFIG_RT_MUTEXES
4763 4763
4764 /* 4764 /*
4765 * rt_mutex_setprio - set the current priority of a task 4765 * rt_mutex_setprio - set the current priority of a task
4766 * @p: task 4766 * @p: task
4767 * @prio: prio value (kernel-internal form) 4767 * @prio: prio value (kernel-internal form)
4768 * 4768 *
4769 * This function changes the 'effective' priority of a task. It does 4769 * This function changes the 'effective' priority of a task. It does
4770 * not touch ->normal_prio like __setscheduler(). 4770 * not touch ->normal_prio like __setscheduler().
4771 * 4771 *
4772 * Used by the rt_mutex code to implement priority inheritance logic. 4772 * Used by the rt_mutex code to implement priority inheritance logic.
4773 */ 4773 */
4774 void rt_mutex_setprio(struct task_struct *p, int prio) 4774 void rt_mutex_setprio(struct task_struct *p, int prio)
4775 { 4775 {
4776 int oldprio, on_rq, running; 4776 int oldprio, on_rq, running;
4777 struct rq *rq; 4777 struct rq *rq;
4778 const struct sched_class *prev_class; 4778 const struct sched_class *prev_class;
4779 4779
4780 BUG_ON(prio < 0 || prio > MAX_PRIO); 4780 BUG_ON(prio < 0 || prio > MAX_PRIO);
4781 4781
4782 rq = __task_rq_lock(p); 4782 rq = __task_rq_lock(p);
4783 4783
4784 trace_sched_pi_setprio(p, prio); 4784 trace_sched_pi_setprio(p, prio);
4785 oldprio = p->prio; 4785 oldprio = p->prio;
4786 prev_class = p->sched_class; 4786 prev_class = p->sched_class;
4787 on_rq = p->on_rq; 4787 on_rq = p->on_rq;
4788 running = task_current(rq, p); 4788 running = task_current(rq, p);
4789 if (on_rq) 4789 if (on_rq)
4790 dequeue_task(rq, p, 0); 4790 dequeue_task(rq, p, 0);
4791 if (running) 4791 if (running)
4792 p->sched_class->put_prev_task(rq, p); 4792 p->sched_class->put_prev_task(rq, p);
4793 4793
4794 if (rt_prio(prio)) 4794 if (rt_prio(prio))
4795 p->sched_class = &rt_sched_class; 4795 p->sched_class = &rt_sched_class;
4796 else 4796 else
4797 p->sched_class = &fair_sched_class; 4797 p->sched_class = &fair_sched_class;
4798 4798
4799 p->prio = prio; 4799 p->prio = prio;
4800 4800
4801 if (running) 4801 if (running)
4802 p->sched_class->set_curr_task(rq); 4802 p->sched_class->set_curr_task(rq);
4803 if (on_rq) 4803 if (on_rq)
4804 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4804 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4805 4805
4806 check_class_changed(rq, p, prev_class, oldprio); 4806 check_class_changed(rq, p, prev_class, oldprio);
4807 __task_rq_unlock(rq); 4807 __task_rq_unlock(rq);
4808 } 4808 }
4809 4809
4810 #endif 4810 #endif
4811 4811
4812 void set_user_nice(struct task_struct *p, long nice) 4812 void set_user_nice(struct task_struct *p, long nice)
4813 { 4813 {
4814 int old_prio, delta, on_rq; 4814 int old_prio, delta, on_rq;
4815 unsigned long flags; 4815 unsigned long flags;
4816 struct rq *rq; 4816 struct rq *rq;
4817 4817
4818 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 4818 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4819 return; 4819 return;
4820 /* 4820 /*
4821 * We have to be careful: if called from sys_setpriority(), 4821 * We have to be careful: if called from sys_setpriority(),
4822 * the task might be in the middle of scheduling on another CPU. 4822 * the task might be in the middle of scheduling on another CPU.
4823 */ 4823 */
4824 rq = task_rq_lock(p, &flags); 4824 rq = task_rq_lock(p, &flags);
4825 /* 4825 /*
4826 * The RT priorities are set via sched_setscheduler(), but we still 4826 * The RT priorities are set via sched_setscheduler(), but we still
4827 * allow the 'normal' nice value to be set - but as expected 4827 * allow the 'normal' nice value to be set - but as expected
4828 * it won't have any effect on scheduling until the task is 4828 * it won't have any effect on scheduling until the task is
4829 * SCHED_FIFO/SCHED_RR: 4829 * SCHED_FIFO/SCHED_RR:
4830 */ 4830 */
4831 if (task_has_rt_policy(p)) { 4831 if (task_has_rt_policy(p)) {
4832 p->static_prio = NICE_TO_PRIO(nice); 4832 p->static_prio = NICE_TO_PRIO(nice);
4833 goto out_unlock; 4833 goto out_unlock;
4834 } 4834 }
4835 on_rq = p->on_rq; 4835 on_rq = p->on_rq;
4836 if (on_rq) 4836 if (on_rq)
4837 dequeue_task(rq, p, 0); 4837 dequeue_task(rq, p, 0);
4838 4838
4839 p->static_prio = NICE_TO_PRIO(nice); 4839 p->static_prio = NICE_TO_PRIO(nice);
4840 set_load_weight(p); 4840 set_load_weight(p);
4841 old_prio = p->prio; 4841 old_prio = p->prio;
4842 p->prio = effective_prio(p); 4842 p->prio = effective_prio(p);
4843 delta = p->prio - old_prio; 4843 delta = p->prio - old_prio;
4844 4844
4845 if (on_rq) { 4845 if (on_rq) {
4846 enqueue_task(rq, p, 0); 4846 enqueue_task(rq, p, 0);
4847 /* 4847 /*
4848 * If the task increased its priority or is running and 4848 * If the task increased its priority or is running and
4849 * lowered its priority, then reschedule its CPU: 4849 * lowered its priority, then reschedule its CPU:
4850 */ 4850 */
4851 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4851 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4852 resched_task(rq->curr); 4852 resched_task(rq->curr);
4853 } 4853 }
4854 out_unlock: 4854 out_unlock:
4855 task_rq_unlock(rq, p, &flags); 4855 task_rq_unlock(rq, p, &flags);
4856 } 4856 }
4857 EXPORT_SYMBOL(set_user_nice); 4857 EXPORT_SYMBOL(set_user_nice);
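In-kernel callers typically use set_user_nice() on threads they own, for example to push background work below the default weight; the thread below is an invented example (nice values outside [-20, 19] are rejected by the check above).

    #include <linux/sched.h>

    static int housekeeping_thread(void *unused)
    {
    	set_user_nice(current, 10);	/* run well below the default nice 0 */
    	/* ... periodic low-priority work ... */
    	return 0;
    }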
4858 4858
4859 /* 4859 /*
4860 * can_nice - check if a task can reduce its nice value 4860 * can_nice - check if a task can reduce its nice value
4861 * @p: task 4861 * @p: task
4862 * @nice: nice value 4862 * @nice: nice value
4863 */ 4863 */
4864 int can_nice(const struct task_struct *p, const int nice) 4864 int can_nice(const struct task_struct *p, const int nice)
4865 { 4865 {
4866 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4866 /* convert nice value [19,-20] to rlimit style value [1,40] */
4867 int nice_rlim = 20 - nice; 4867 int nice_rlim = 20 - nice;
4868 4868
4869 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 4869 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4870 capable(CAP_SYS_NICE)); 4870 capable(CAP_SYS_NICE));
4871 } 4871 }
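A quick worked example of the mapping above: nice 19 gives nice_rlim 1, nice 0 gives 20, and nice -20 gives 40, so a request is allowed only when that value does not exceed RLIMIT_NICE or the caller holds CAP_SYS_NICE.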
4872 4872
4873 #ifdef __ARCH_WANT_SYS_NICE 4873 #ifdef __ARCH_WANT_SYS_NICE
4874 4874
4875 /* 4875 /*
4876 * sys_nice - change the priority of the current process. 4876 * sys_nice - change the priority of the current process.
4877 * @increment: priority increment 4877 * @increment: priority increment
4878 * 4878 *
4879 * sys_setpriority is a more generic, but much slower function that 4879 * sys_setpriority is a more generic, but much slower function that
4880 * does similar things. 4880 * does similar things.
4881 */ 4881 */
4882 SYSCALL_DEFINE1(nice, int, increment) 4882 SYSCALL_DEFINE1(nice, int, increment)
4883 { 4883 {
4884 long nice, retval; 4884 long nice, retval;
4885 4885
4886 /* 4886 /*
4887 * Setpriority might change our priority at the same moment. 4887 * Setpriority might change our priority at the same moment.
4888 * We don't have to worry. Conceptually one call occurs first 4888 * We don't have to worry. Conceptually one call occurs first
4889 * and we have a single winner. 4889 * and we have a single winner.
4890 */ 4890 */
4891 if (increment < -40) 4891 if (increment < -40)
4892 increment = -40; 4892 increment = -40;
4893 if (increment > 40) 4893 if (increment > 40)
4894 increment = 40; 4894 increment = 40;
4895 4895
4896 nice = TASK_NICE(current) + increment; 4896 nice = TASK_NICE(current) + increment;
4897 if (nice < -20) 4897 if (nice < -20)
4898 nice = -20; 4898 nice = -20;
4899 if (nice > 19) 4899 if (nice > 19)
4900 nice = 19; 4900 nice = 19;
4901 4901
4902 if (increment < 0 && !can_nice(current, nice)) 4902 if (increment < 0 && !can_nice(current, nice))
4903 return -EPERM; 4903 return -EPERM;
4904 4904
4905 retval = security_task_setnice(current, nice); 4905 retval = security_task_setnice(current, nice);
4906 if (retval) 4906 if (retval)
4907 return retval; 4907 return retval;
4908 4908
4909 set_user_nice(current, nice); 4909 set_user_nice(current, nice);
4910 return 0; 4910 return 0;
4911 } 4911 }
4912 4912
4913 #endif 4913 #endif
4914 4914
4915 /** 4915 /**
4916 * task_prio - return the priority value of a given task. 4916 * task_prio - return the priority value of a given task.
4917 * @p: the task in question. 4917 * @p: the task in question.
4918 * 4918 *
4919 * This is the priority value as seen by users in /proc. 4919 * This is the priority value as seen by users in /proc.
4920 * RT tasks are offset by -200. Normal tasks are centered 4920 * RT tasks are offset by -200. Normal tasks are centered
4921 * around 0, value goes from -16 to +15. 4921 * around 0, value goes from -16 to +15.
4922 */ 4922 */
4923 int task_prio(const struct task_struct *p) 4923 int task_prio(const struct task_struct *p)
4924 { 4924 {
4925 return p->prio - MAX_RT_PRIO; 4925 return p->prio - MAX_RT_PRIO;
4926 } 4926 }
4927 4927
4928 /** 4928 /**
4929 * task_nice - return the nice value of a given task. 4929 * task_nice - return the nice value of a given task.
4930 * @p: the task in question. 4930 * @p: the task in question.
4931 */ 4931 */
4932 int task_nice(const struct task_struct *p) 4932 int task_nice(const struct task_struct *p)
4933 { 4933 {
4934 return TASK_NICE(p); 4934 return TASK_NICE(p);
4935 } 4935 }
4936 EXPORT_SYMBOL(task_nice); 4936 EXPORT_SYMBOL(task_nice);
4937 4937
4938 /** 4938 /**
4939 * idle_cpu - is a given cpu idle currently? 4939 * idle_cpu - is a given cpu idle currently?
4940 * @cpu: the processor in question. 4940 * @cpu: the processor in question.
4941 */ 4941 */
4942 int idle_cpu(int cpu) 4942 int idle_cpu(int cpu)
4943 { 4943 {
4944 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4944 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4945 } 4945 }
4946 4946
4947 /** 4947 /**
4948 * idle_task - return the idle task for a given cpu. 4948 * idle_task - return the idle task for a given cpu.
4949 * @cpu: the processor in question. 4949 * @cpu: the processor in question.
4950 */ 4950 */
4951 struct task_struct *idle_task(int cpu) 4951 struct task_struct *idle_task(int cpu)
4952 { 4952 {
4953 return cpu_rq(cpu)->idle; 4953 return cpu_rq(cpu)->idle;
4954 } 4954 }
4955 4955
4956 /** 4956 /**
4957 * find_process_by_pid - find a process with a matching PID value. 4957 * find_process_by_pid - find a process with a matching PID value.
4958 * @pid: the pid in question. 4958 * @pid: the pid in question.
4959 */ 4959 */
4960 static struct task_struct *find_process_by_pid(pid_t pid) 4960 static struct task_struct *find_process_by_pid(pid_t pid)
4961 { 4961 {
4962 return pid ? find_task_by_vpid(pid) : current; 4962 return pid ? find_task_by_vpid(pid) : current;
4963 } 4963 }
4964 4964
4965 /* Actually do priority change: must hold rq lock. */ 4965 /* Actually do priority change: must hold rq lock. */
4966 static void 4966 static void
4967 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4967 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4968 { 4968 {
4969 p->policy = policy; 4969 p->policy = policy;
4970 p->rt_priority = prio; 4970 p->rt_priority = prio;
4971 p->normal_prio = normal_prio(p); 4971 p->normal_prio = normal_prio(p);
4972 /* we are holding p->pi_lock already */ 4972 /* we are holding p->pi_lock already */
4973 p->prio = rt_mutex_getprio(p); 4973 p->prio = rt_mutex_getprio(p);
4974 if (rt_prio(p->prio)) 4974 if (rt_prio(p->prio))
4975 p->sched_class = &rt_sched_class; 4975 p->sched_class = &rt_sched_class;
4976 else 4976 else
4977 p->sched_class = &fair_sched_class; 4977 p->sched_class = &fair_sched_class;
4978 set_load_weight(p); 4978 set_load_weight(p);
4979 } 4979 }
4980 4980
4981 /* 4981 /*
4982 * check the target process has a UID that matches the current process's 4982 * check the target process has a UID that matches the current process's
4983 */ 4983 */
4984 static bool check_same_owner(struct task_struct *p) 4984 static bool check_same_owner(struct task_struct *p)
4985 { 4985 {
4986 const struct cred *cred = current_cred(), *pcred; 4986 const struct cred *cred = current_cred(), *pcred;
4987 bool match; 4987 bool match;
4988 4988
4989 rcu_read_lock(); 4989 rcu_read_lock();
4990 pcred = __task_cred(p); 4990 pcred = __task_cred(p);
4991 if (cred->user->user_ns == pcred->user->user_ns) 4991 if (cred->user->user_ns == pcred->user->user_ns)
4992 match = (cred->euid == pcred->euid || 4992 match = (cred->euid == pcred->euid ||
4993 cred->euid == pcred->uid); 4993 cred->euid == pcred->uid);
4994 else 4994 else
4995 match = false; 4995 match = false;
4996 rcu_read_unlock(); 4996 rcu_read_unlock();
4997 return match; 4997 return match;
4998 } 4998 }
4999 4999
5000 static int __sched_setscheduler(struct task_struct *p, int policy, 5000 static int __sched_setscheduler(struct task_struct *p, int policy,
5001 const struct sched_param *param, bool user) 5001 const struct sched_param *param, bool user)
5002 { 5002 {
5003 int retval, oldprio, oldpolicy = -1, on_rq, running; 5003 int retval, oldprio, oldpolicy = -1, on_rq, running;
5004 unsigned long flags; 5004 unsigned long flags;
5005 const struct sched_class *prev_class; 5005 const struct sched_class *prev_class;
5006 struct rq *rq; 5006 struct rq *rq;
5007 int reset_on_fork; 5007 int reset_on_fork;
5008 5008
5009 /* may grab non-irq protected spin_locks */ 5009 /* may grab non-irq protected spin_locks */
5010 BUG_ON(in_interrupt()); 5010 BUG_ON(in_interrupt());
5011 recheck: 5011 recheck:
5012 /* double check policy once rq lock held */ 5012 /* double check policy once rq lock held */
5013 if (policy < 0) { 5013 if (policy < 0) {
5014 reset_on_fork = p->sched_reset_on_fork; 5014 reset_on_fork = p->sched_reset_on_fork;
5015 policy = oldpolicy = p->policy; 5015 policy = oldpolicy = p->policy;
5016 } else { 5016 } else {
5017 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 5017 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
5018 policy &= ~SCHED_RESET_ON_FORK; 5018 policy &= ~SCHED_RESET_ON_FORK;
5019 5019
5020 if (policy != SCHED_FIFO && policy != SCHED_RR && 5020 if (policy != SCHED_FIFO && policy != SCHED_RR &&
5021 policy != SCHED_NORMAL && policy != SCHED_BATCH && 5021 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5022 policy != SCHED_IDLE) 5022 policy != SCHED_IDLE)
5023 return -EINVAL; 5023 return -EINVAL;
5024 } 5024 }
5025 5025
5026 /* 5026 /*
5027 * Valid priorities for SCHED_FIFO and SCHED_RR are 5027 * Valid priorities for SCHED_FIFO and SCHED_RR are
5028 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 5028 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5029 * SCHED_BATCH and SCHED_IDLE is 0. 5029 * SCHED_BATCH and SCHED_IDLE is 0.
5030 */ 5030 */
5031 if (param->sched_priority < 0 || 5031 if (param->sched_priority < 0 ||
5032 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 5032 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5033 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 5033 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5034 return -EINVAL; 5034 return -EINVAL;
5035 if (rt_policy(policy) != (param->sched_priority != 0)) 5035 if (rt_policy(policy) != (param->sched_priority != 0))
5036 return -EINVAL; 5036 return -EINVAL;
5037 5037
5038 /* 5038 /*
5039 * Allow unprivileged RT tasks to decrease priority: 5039 * Allow unprivileged RT tasks to decrease priority:
5040 */ 5040 */
5041 if (user && !capable(CAP_SYS_NICE)) { 5041 if (user && !capable(CAP_SYS_NICE)) {
5042 if (rt_policy(policy)) { 5042 if (rt_policy(policy)) {
5043 unsigned long rlim_rtprio = 5043 unsigned long rlim_rtprio =
5044 task_rlimit(p, RLIMIT_RTPRIO); 5044 task_rlimit(p, RLIMIT_RTPRIO);
5045 5045
5046 /* can't set/change the rt policy */ 5046 /* can't set/change the rt policy */
5047 if (policy != p->policy && !rlim_rtprio) 5047 if (policy != p->policy && !rlim_rtprio)
5048 return -EPERM; 5048 return -EPERM;
5049 5049
5050 /* can't increase priority */ 5050 /* can't increase priority */
5051 if (param->sched_priority > p->rt_priority && 5051 if (param->sched_priority > p->rt_priority &&
5052 param->sched_priority > rlim_rtprio) 5052 param->sched_priority > rlim_rtprio)
5053 return -EPERM; 5053 return -EPERM;
5054 } 5054 }
5055 5055
5056 /* 5056 /*
5057 * Treat SCHED_IDLE as nice 20. Only allow a switch to 5057 * Treat SCHED_IDLE as nice 20. Only allow a switch to
5058 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 5058 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
5059 */ 5059 */
5060 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 5060 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
5061 if (!can_nice(p, TASK_NICE(p))) 5061 if (!can_nice(p, TASK_NICE(p)))
5062 return -EPERM; 5062 return -EPERM;
5063 } 5063 }
5064 5064
5065 /* can't change other user's priorities */ 5065 /* can't change other user's priorities */
5066 if (!check_same_owner(p)) 5066 if (!check_same_owner(p))
5067 return -EPERM; 5067 return -EPERM;
5068 5068
5069 /* Normal users shall not reset the sched_reset_on_fork flag */ 5069 /* Normal users shall not reset the sched_reset_on_fork flag */
5070 if (p->sched_reset_on_fork && !reset_on_fork) 5070 if (p->sched_reset_on_fork && !reset_on_fork)
5071 return -EPERM; 5071 return -EPERM;
5072 } 5072 }
5073 5073
5074 if (user) { 5074 if (user) {
5075 retval = security_task_setscheduler(p); 5075 retval = security_task_setscheduler(p);
5076 if (retval) 5076 if (retval)
5077 return retval; 5077 return retval;
5078 } 5078 }
5079 5079
5080 /* 5080 /*
5081 * make sure no PI-waiters arrive (or leave) while we are 5081 * make sure no PI-waiters arrive (or leave) while we are
5082 * changing the priority of the task: 5082 * changing the priority of the task:
5083 * 5083 *
5084 * To be able to change p->policy safely, the appropriate 5084 * To be able to change p->policy safely, the appropriate
5085 * runqueue lock must be held. 5085 * runqueue lock must be held.
5086 */ 5086 */
5087 rq = task_rq_lock(p, &flags); 5087 rq = task_rq_lock(p, &flags);
5088 5088
5089 /* 5089 /*
5090 * Changing the policy of the stop threads is a very bad idea 5090 * Changing the policy of the stop threads is a very bad idea
5091 */ 5091 */
5092 if (p == rq->stop) { 5092 if (p == rq->stop) {
5093 task_rq_unlock(rq, p, &flags); 5093 task_rq_unlock(rq, p, &flags);
5094 return -EINVAL; 5094 return -EINVAL;
5095 } 5095 }
5096 5096
5097 /* 5097 /*
5098 * If not changing anything there's no need to proceed further: 5098 * If not changing anything there's no need to proceed further:
5099 */ 5099 */
5100 if (unlikely(policy == p->policy && (!rt_policy(policy) || 5100 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5101 param->sched_priority == p->rt_priority))) { 5101 param->sched_priority == p->rt_priority))) {
5102 5102
5103 __task_rq_unlock(rq); 5103 __task_rq_unlock(rq);
5104 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5104 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5105 return 0; 5105 return 0;
5106 } 5106 }
5107 5107
5108 #ifdef CONFIG_RT_GROUP_SCHED 5108 #ifdef CONFIG_RT_GROUP_SCHED
5109 if (user) { 5109 if (user) {
5110 /* 5110 /*
5111 * Do not allow realtime tasks into groups that have no runtime 5111 * Do not allow realtime tasks into groups that have no runtime
5112 * assigned. 5112 * assigned.
5113 */ 5113 */
5114 if (rt_bandwidth_enabled() && rt_policy(policy) && 5114 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5115 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5115 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5116 !task_group_is_autogroup(task_group(p))) { 5116 !task_group_is_autogroup(task_group(p))) {
5117 task_rq_unlock(rq, p, &flags); 5117 task_rq_unlock(rq, p, &flags);
5118 return -EPERM; 5118 return -EPERM;
5119 } 5119 }
5120 } 5120 }
5121 #endif 5121 #endif
5122 5122
5123 /* recheck policy now with rq lock held */ 5123 /* recheck policy now with rq lock held */
5124 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5124 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5125 policy = oldpolicy = -1; 5125 policy = oldpolicy = -1;
5126 task_rq_unlock(rq, p, &flags); 5126 task_rq_unlock(rq, p, &flags);
5127 goto recheck; 5127 goto recheck;
5128 } 5128 }
5129 on_rq = p->on_rq; 5129 on_rq = p->on_rq;
5130 running = task_current(rq, p); 5130 running = task_current(rq, p);
5131 if (on_rq) 5131 if (on_rq)
5132 deactivate_task(rq, p, 0); 5132 deactivate_task(rq, p, 0);
5133 if (running) 5133 if (running)
5134 p->sched_class->put_prev_task(rq, p); 5134 p->sched_class->put_prev_task(rq, p);
5135 5135
5136 p->sched_reset_on_fork = reset_on_fork; 5136 p->sched_reset_on_fork = reset_on_fork;
5137 5137
5138 oldprio = p->prio; 5138 oldprio = p->prio;
5139 prev_class = p->sched_class; 5139 prev_class = p->sched_class;
5140 __setscheduler(rq, p, policy, param->sched_priority); 5140 __setscheduler(rq, p, policy, param->sched_priority);
5141 5141
5142 if (running) 5142 if (running)
5143 p->sched_class->set_curr_task(rq); 5143 p->sched_class->set_curr_task(rq);
5144 if (on_rq) 5144 if (on_rq)
5145 activate_task(rq, p, 0); 5145 activate_task(rq, p, 0);
5146 5146
5147 check_class_changed(rq, p, prev_class, oldprio); 5147 check_class_changed(rq, p, prev_class, oldprio);
5148 task_rq_unlock(rq, p, &flags); 5148 task_rq_unlock(rq, p, &flags);
5149 5149
5150 rt_mutex_adjust_pi(p); 5150 rt_mutex_adjust_pi(p);
5151 5151
5152 return 0; 5152 return 0;
5153 } 5153 }
5154 5154
5155 /** 5155 /**
5156 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 5156 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5157 * @p: the task in question. 5157 * @p: the task in question.
5158 * @policy: new policy. 5158 * @policy: new policy.
5159 * @param: structure containing the new RT priority. 5159 * @param: structure containing the new RT priority.
5160 * 5160 *
5161 * NOTE that the task may be already dead. 5161 * NOTE that the task may be already dead.
5162 */ 5162 */
5163 int sched_setscheduler(struct task_struct *p, int policy, 5163 int sched_setscheduler(struct task_struct *p, int policy,
5164 const struct sched_param *param) 5164 const struct sched_param *param)
5165 { 5165 {
5166 return __sched_setscheduler(p, policy, param, true); 5166 return __sched_setscheduler(p, policy, param, true);
5167 } 5167 }
5168 EXPORT_SYMBOL_GPL(sched_setscheduler); 5168 EXPORT_SYMBOL_GPL(sched_setscheduler);
5169 5169
5170 /** 5170 /**
5171 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 5171 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5172 * @p: the task in question. 5172 * @p: the task in question.
5173 * @policy: new policy. 5173 * @policy: new policy.
5174 * @param: structure containing the new RT priority. 5174 * @param: structure containing the new RT priority.
5175 * 5175 *
5176 * Just like sched_setscheduler, only don't bother checking if the 5176 * Just like sched_setscheduler, only don't bother checking if the
5177 * current context has permission. For example, this is needed in 5177 * current context has permission. For example, this is needed in
5178 * stop_machine(): we create temporary high priority worker threads, 5178 * stop_machine(): we create temporary high priority worker threads,
5179 * but our caller might not have that capability. 5179 * but our caller might not have that capability.
5180 */ 5180 */
5181 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 5181 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5182 const struct sched_param *param) 5182 const struct sched_param *param)
5183 { 5183 {
5184 return __sched_setscheduler(p, policy, param, false); 5184 return __sched_setscheduler(p, policy, param, false);
5185 } 5185 }
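A sketch of the kernel-internal use the comment describes: promoting a kernel-created worker thread to SCHED_FIFO without a permission check. make_worker_fifo() is an invented helper and the priority choice is illustrative.

    #include <linux/sched.h>

    static void make_worker_fifo(struct task_struct *worker)
    {
    	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

    	/* no permission check: meant for threads the kernel itself created */
    	if (sched_setscheduler_nocheck(worker, SCHED_FIFO, &param))
    		pr_warn("demo: failed to switch worker to SCHED_FIFO\n");
    }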
5186 5186
5187 static int 5187 static int
5188 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5188 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5189 { 5189 {
5190 struct sched_param lparam; 5190 struct sched_param lparam;
5191 struct task_struct *p; 5191 struct task_struct *p;
5192 int retval; 5192 int retval;
5193 5193
5194 if (!param || pid < 0) 5194 if (!param || pid < 0)
5195 return -EINVAL; 5195 return -EINVAL;
5196 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 5196 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5197 return -EFAULT; 5197 return -EFAULT;
5198 5198
5199 rcu_read_lock(); 5199 rcu_read_lock();
5200 retval = -ESRCH; 5200 retval = -ESRCH;
5201 p = find_process_by_pid(pid); 5201 p = find_process_by_pid(pid);
5202 if (p != NULL) 5202 if (p != NULL)
5203 retval = sched_setscheduler(p, policy, &lparam); 5203 retval = sched_setscheduler(p, policy, &lparam);
5204 rcu_read_unlock(); 5204 rcu_read_unlock();
5205 5205
5206 return retval; 5206 return retval;
5207 } 5207 }
5208 5208
5209 /** 5209 /**
5210 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 5210 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5211 * @pid: the pid in question. 5211 * @pid: the pid in question.
5212 * @policy: new policy. 5212 * @policy: new policy.
5213 * @param: structure containing the new RT priority. 5213 * @param: structure containing the new RT priority.
5214 */ 5214 */
5215 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 5215 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5216 struct sched_param __user *, param) 5216 struct sched_param __user *, param)
5217 { 5217 {
5218 /* negative values for policy are not valid */ 5218 /* negative values for policy are not valid */
5219 if (policy < 0) 5219 if (policy < 0)
5220 return -EINVAL; 5220 return -EINVAL;
5221 5221
5222 return do_sched_setscheduler(pid, policy, param); 5222 return do_sched_setscheduler(pid, policy, param);
5223 } 5223 }
5224 5224
5225 /** 5225 /**
5226 * sys_sched_setparam - set/change the RT priority of a thread 5226 * sys_sched_setparam - set/change the RT priority of a thread
5227 * @pid: the pid in question. 5227 * @pid: the pid in question.
5228 * @param: structure containing the new RT priority. 5228 * @param: structure containing the new RT priority.
5229 */ 5229 */
5230 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 5230 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5231 { 5231 {
5232 return do_sched_setscheduler(pid, -1, param); 5232 return do_sched_setscheduler(pid, -1, param);
5233 } 5233 }
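From userspace these syscalls are normally reached through the C library wrappers declared in <sched.h>; a minimal (hypothetical) example that moves the calling thread to SCHED_FIFO:

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
    	struct sched_param sp = { .sched_priority = 10 };

    	/* pid 0 means the calling thread; needs CAP_SYS_NICE or a sufficient RLIMIT_RTPRIO */
    	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
    		perror("sched_setscheduler");
    		return 1;
    	}
    	printf("now running SCHED_FIFO at priority %d\n", sp.sched_priority);
    	return 0;
    }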
5234 5234
5235 /** 5235 /**
5236 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 5236 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5237 * @pid: the pid in question. 5237 * @pid: the pid in question.
5238 */ 5238 */
5239 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 5239 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5240 { 5240 {
5241 struct task_struct *p; 5241 struct task_struct *p;
5242 int retval; 5242 int retval;
5243 5243
5244 if (pid < 0) 5244 if (pid < 0)
5245 return -EINVAL; 5245 return -EINVAL;
5246 5246
5247 retval = -ESRCH; 5247 retval = -ESRCH;
5248 rcu_read_lock(); 5248 rcu_read_lock();
5249 p = find_process_by_pid(pid); 5249 p = find_process_by_pid(pid);
5250 if (p) { 5250 if (p) {
5251 retval = security_task_getscheduler(p); 5251 retval = security_task_getscheduler(p);
5252 if (!retval) 5252 if (!retval)
5253 retval = p->policy 5253 retval = p->policy
5254 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 5254 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5255 } 5255 }
5256 rcu_read_unlock(); 5256 rcu_read_unlock();
5257 return retval; 5257 return retval;
5258 } 5258 }
5259 5259
5260 /** 5260 /**
5261 * sys_sched_getparam - get the RT priority of a thread 5261 * sys_sched_getparam - get the RT priority of a thread
5262 * @pid: the pid in question. 5262 * @pid: the pid in question.
5263 * @param: structure containing the RT priority. 5263 * @param: structure containing the RT priority.
5264 */ 5264 */
5265 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 5265 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5266 { 5266 {
5267 struct sched_param lp; 5267 struct sched_param lp;
5268 struct task_struct *p; 5268 struct task_struct *p;
5269 int retval; 5269 int retval;
5270 5270
5271 if (!param || pid < 0) 5271 if (!param || pid < 0)
5272 return -EINVAL; 5272 return -EINVAL;
5273 5273
5274 rcu_read_lock(); 5274 rcu_read_lock();
5275 p = find_process_by_pid(pid); 5275 p = find_process_by_pid(pid);
5276 retval = -ESRCH; 5276 retval = -ESRCH;
5277 if (!p) 5277 if (!p)
5278 goto out_unlock; 5278 goto out_unlock;
5279 5279
5280 retval = security_task_getscheduler(p); 5280 retval = security_task_getscheduler(p);
5281 if (retval) 5281 if (retval)
5282 goto out_unlock; 5282 goto out_unlock;
5283 5283
5284 lp.sched_priority = p->rt_priority; 5284 lp.sched_priority = p->rt_priority;
5285 rcu_read_unlock(); 5285 rcu_read_unlock();
5286 5286
5287 /* 5287 /*
5288 * This one might sleep, we cannot do it with a spinlock held ... 5288 * This one might sleep, we cannot do it with a spinlock held ...
5289 */ 5289 */
5290 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 5290 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5291 5291
5292 return retval; 5292 return retval;
5293 5293
5294 out_unlock: 5294 out_unlock:
5295 rcu_read_unlock(); 5295 rcu_read_unlock();
5296 return retval; 5296 return retval;
5297 } 5297 }
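The read side can be exercised the same way. A small sketch (assuming the usual glibc wrappers) that prints the policy and RT priority of a pid given on the command line; note that, as the code above shows, the value returned by sched_getscheduler() may have SCHED_RESET_ON_FORK ORed into it:

#include <sys/types.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 0;	/* 0 == caller */
	struct sched_param sp;
	int policy;

	policy = sched_getscheduler(pid);
	if (policy == -1 || sched_getparam(pid, &sp) == -1) {
		perror("sched_get*");
		return 1;
	}
	printf("pid %d: policy %d, rt_priority %d\n",
	       (int)pid, policy, sp.sched_priority);
	return 0;
}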
5298 5298
5299 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 5299 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5300 { 5300 {
5301 cpumask_var_t cpus_allowed, new_mask; 5301 cpumask_var_t cpus_allowed, new_mask;
5302 struct task_struct *p; 5302 struct task_struct *p;
5303 int retval; 5303 int retval;
5304 5304
5305 get_online_cpus(); 5305 get_online_cpus();
5306 rcu_read_lock(); 5306 rcu_read_lock();
5307 5307
5308 p = find_process_by_pid(pid); 5308 p = find_process_by_pid(pid);
5309 if (!p) { 5309 if (!p) {
5310 rcu_read_unlock(); 5310 rcu_read_unlock();
5311 put_online_cpus(); 5311 put_online_cpus();
5312 return -ESRCH; 5312 return -ESRCH;
5313 } 5313 }
5314 5314
5315 /* Prevent p going away */ 5315 /* Prevent p going away */
5316 get_task_struct(p); 5316 get_task_struct(p);
5317 rcu_read_unlock(); 5317 rcu_read_unlock();
5318 5318
5319 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 5319 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5320 retval = -ENOMEM; 5320 retval = -ENOMEM;
5321 goto out_put_task; 5321 goto out_put_task;
5322 } 5322 }
5323 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 5323 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5324 retval = -ENOMEM; 5324 retval = -ENOMEM;
5325 goto out_free_cpus_allowed; 5325 goto out_free_cpus_allowed;
5326 } 5326 }
5327 retval = -EPERM; 5327 retval = -EPERM;
5328 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) 5328 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5329 goto out_unlock; 5329 goto out_unlock;
5330 5330
5331 retval = security_task_setscheduler(p); 5331 retval = security_task_setscheduler(p);
5332 if (retval) 5332 if (retval)
5333 goto out_unlock; 5333 goto out_unlock;
5334 5334
5335 cpuset_cpus_allowed(p, cpus_allowed); 5335 cpuset_cpus_allowed(p, cpus_allowed);
5336 cpumask_and(new_mask, in_mask, cpus_allowed); 5336 cpumask_and(new_mask, in_mask, cpus_allowed);
5337 again: 5337 again:
5338 retval = set_cpus_allowed_ptr(p, new_mask); 5338 retval = set_cpus_allowed_ptr(p, new_mask);
5339 5339
5340 if (!retval) { 5340 if (!retval) {
5341 cpuset_cpus_allowed(p, cpus_allowed); 5341 cpuset_cpus_allowed(p, cpus_allowed);
5342 if (!cpumask_subset(new_mask, cpus_allowed)) { 5342 if (!cpumask_subset(new_mask, cpus_allowed)) {
5343 /* 5343 /*
5344 * We must have raced with a concurrent cpuset 5344 * We must have raced with a concurrent cpuset
5345 * update. Just reset the cpus_allowed to the 5345 * update. Just reset the cpus_allowed to the
5346 * cpuset's cpus_allowed 5346 * cpuset's cpus_allowed
5347 */ 5347 */
5348 cpumask_copy(new_mask, cpus_allowed); 5348 cpumask_copy(new_mask, cpus_allowed);
5349 goto again; 5349 goto again;
5350 } 5350 }
5351 } 5351 }
5352 out_unlock: 5352 out_unlock:
5353 free_cpumask_var(new_mask); 5353 free_cpumask_var(new_mask);
5354 out_free_cpus_allowed: 5354 out_free_cpus_allowed:
5355 free_cpumask_var(cpus_allowed); 5355 free_cpumask_var(cpus_allowed);
5356 out_put_task: 5356 out_put_task:
5357 put_task_struct(p); 5357 put_task_struct(p);
5358 put_online_cpus(); 5358 put_online_cpus();
5359 return retval; 5359 return retval;
5360 } 5360 }
5361 5361
5362 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5362 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5363 struct cpumask *new_mask) 5363 struct cpumask *new_mask)
5364 { 5364 {
5365 if (len < cpumask_size()) 5365 if (len < cpumask_size())
5366 cpumask_clear(new_mask); 5366 cpumask_clear(new_mask);
5367 else if (len > cpumask_size()) 5367 else if (len > cpumask_size())
5368 len = cpumask_size(); 5368 len = cpumask_size();
5369 5369
5370 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5370 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5371 } 5371 }
5372 5372
5373 /** 5373 /**
5374 * sys_sched_setaffinity - set the cpu affinity of a process 5374 * sys_sched_setaffinity - set the cpu affinity of a process
5375 * @pid: pid of the process 5375 * @pid: pid of the process
5376 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5376 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5377 * @user_mask_ptr: user-space pointer to the new cpu mask 5377 * @user_mask_ptr: user-space pointer to the new cpu mask
5378 */ 5378 */
5379 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 5379 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5380 unsigned long __user *, user_mask_ptr) 5380 unsigned long __user *, user_mask_ptr)
5381 { 5381 {
5382 cpumask_var_t new_mask; 5382 cpumask_var_t new_mask;
5383 int retval; 5383 int retval;
5384 5384
5385 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 5385 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5386 return -ENOMEM; 5386 return -ENOMEM;
5387 5387
5388 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 5388 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5389 if (retval == 0) 5389 if (retval == 0)
5390 retval = sched_setaffinity(pid, new_mask); 5390 retval = sched_setaffinity(pid, new_mask);
5391 free_cpumask_var(new_mask); 5391 free_cpumask_var(new_mask);
5392 return retval; 5392 return retval;
5393 } 5393 }
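A user-space sketch of the matching libc call (not part of the commit), pinning the caller to CPU 0; sizeof(cpu_set_t) is the len that get_user_cpu_mask() above truncates or zero-pads against the kernel's cpumask size:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);			/* allow CPU 0 only */

	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return 1;
	}
	puts("pinned to CPU 0");
	return 0;
}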
5394 5394
5395 long sched_getaffinity(pid_t pid, struct cpumask *mask) 5395 long sched_getaffinity(pid_t pid, struct cpumask *mask)
5396 { 5396 {
5397 struct task_struct *p; 5397 struct task_struct *p;
5398 unsigned long flags; 5398 unsigned long flags;
5399 int retval; 5399 int retval;
5400 5400
5401 get_online_cpus(); 5401 get_online_cpus();
5402 rcu_read_lock(); 5402 rcu_read_lock();
5403 5403
5404 retval = -ESRCH; 5404 retval = -ESRCH;
5405 p = find_process_by_pid(pid); 5405 p = find_process_by_pid(pid);
5406 if (!p) 5406 if (!p)
5407 goto out_unlock; 5407 goto out_unlock;
5408 5408
5409 retval = security_task_getscheduler(p); 5409 retval = security_task_getscheduler(p);
5410 if (retval) 5410 if (retval)
5411 goto out_unlock; 5411 goto out_unlock;
5412 5412
5413 raw_spin_lock_irqsave(&p->pi_lock, flags); 5413 raw_spin_lock_irqsave(&p->pi_lock, flags);
5414 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5414 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5415 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5415 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5416 5416
5417 out_unlock: 5417 out_unlock:
5418 rcu_read_unlock(); 5418 rcu_read_unlock();
5419 put_online_cpus(); 5419 put_online_cpus();
5420 5420
5421 return retval; 5421 return retval;
5422 } 5422 }
5423 5423
5424 /** 5424 /**
5425 * sys_sched_getaffinity - get the cpu affinity of a process 5425 * sys_sched_getaffinity - get the cpu affinity of a process
5426 * @pid: pid of the process 5426 * @pid: pid of the process
5427 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5427 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5428 * @user_mask_ptr: user-space pointer to hold the current cpu mask 5428 * @user_mask_ptr: user-space pointer to hold the current cpu mask
5429 */ 5429 */
5430 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 5430 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5431 unsigned long __user *, user_mask_ptr) 5431 unsigned long __user *, user_mask_ptr)
5432 { 5432 {
5433 int ret; 5433 int ret;
5434 cpumask_var_t mask; 5434 cpumask_var_t mask;
5435 5435
5436 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 5436 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5437 return -EINVAL; 5437 return -EINVAL;
5438 if (len & (sizeof(unsigned long)-1)) 5438 if (len & (sizeof(unsigned long)-1))
5439 return -EINVAL; 5439 return -EINVAL;
5440 5440
5441 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 5441 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5442 return -ENOMEM; 5442 return -ENOMEM;
5443 5443
5444 ret = sched_getaffinity(pid, mask); 5444 ret = sched_getaffinity(pid, mask);
5445 if (ret == 0) { 5445 if (ret == 0) {
5446 size_t retlen = min_t(size_t, len, cpumask_size()); 5446 size_t retlen = min_t(size_t, len, cpumask_size());
5447 5447
5448 if (copy_to_user(user_mask_ptr, mask, retlen)) 5448 if (copy_to_user(user_mask_ptr, mask, retlen))
5449 ret = -EFAULT; 5449 ret = -EFAULT;
5450 else 5450 else
5451 ret = retlen; 5451 ret = retlen;
5452 } 5452 }
5453 free_cpumask_var(mask); 5453 free_cpumask_var(mask);
5454 5454
5455 return ret; 5455 return ret;
5456 } 5456 }
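Reading the mask back; the raw syscall returns the number of bytes copied (retlen above), while the glibc wrapper used in this sketch returns 0 on success:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;
	int cpu;

	if (sched_getaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_getaffinity");
		return 1;
	}
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &set))
			printf("CPU %d allowed\n", cpu);
	return 0;
}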
5457 5457
5458 /** 5458 /**
5459 * sys_sched_yield - yield the current processor to other threads. 5459 * sys_sched_yield - yield the current processor to other threads.
5460 * 5460 *
5461 * This function yields the current CPU to other tasks. If there are no 5461 * This function yields the current CPU to other tasks. If there are no
5462 * other threads running on this CPU then this function will return. 5462 * other threads running on this CPU then this function will return.
5463 */ 5463 */
5464 SYSCALL_DEFINE0(sched_yield) 5464 SYSCALL_DEFINE0(sched_yield)
5465 { 5465 {
5466 struct rq *rq = this_rq_lock(); 5466 struct rq *rq = this_rq_lock();
5467 5467
5468 schedstat_inc(rq, yld_count); 5468 schedstat_inc(rq, yld_count);
5469 current->sched_class->yield_task(rq); 5469 current->sched_class->yield_task(rq);
5470 5470
5471 /* 5471 /*
5472 * Since we are going to call schedule() anyway, there's 5472 * Since we are going to call schedule() anyway, there's
5473 * no need to preempt or enable interrupts: 5473 * no need to preempt or enable interrupts:
5474 */ 5474 */
5475 __release(rq->lock); 5475 __release(rq->lock);
5476 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 5476 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5477 do_raw_spin_unlock(&rq->lock); 5477 do_raw_spin_unlock(&rq->lock);
5478 preempt_enable_no_resched(); 5478 preempt_enable_no_resched();
5479 5479
5480 schedule(); 5480 schedule();
5481 5481
5482 return 0; 5482 return 0;
5483 } 5483 }
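A trivial user-space counterpart, sketched here with a hypothetical flag that a producer thread would normally set: the waiter spins but gives the CPU away on every pass through sys_sched_yield():

#include <sched.h>
#include <stdatomic.h>

static atomic_int data_ready;	/* hypothetical; normally set by another thread */

static void wait_for_data(void)
{
	while (!atomic_load(&data_ready))
		sched_yield();		/* lands in sys_sched_yield() above */
}

int main(void)
{
	atomic_store(&data_ready, 1);	/* stand-in for the real producer */
	wait_for_data();
	return 0;
}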
5484 5484
5485 static inline int should_resched(void) 5485 static inline int should_resched(void)
5486 { 5486 {
5487 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 5487 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5488 } 5488 }
5489 5489
5490 static void __cond_resched(void) 5490 static void __cond_resched(void)
5491 { 5491 {
5492 add_preempt_count(PREEMPT_ACTIVE); 5492 add_preempt_count(PREEMPT_ACTIVE);
5493 schedule(); 5493 schedule();
5494 sub_preempt_count(PREEMPT_ACTIVE); 5494 sub_preempt_count(PREEMPT_ACTIVE);
5495 } 5495 }
5496 5496
5497 int __sched _cond_resched(void) 5497 int __sched _cond_resched(void)
5498 { 5498 {
5499 if (should_resched()) { 5499 if (should_resched()) {
5500 __cond_resched(); 5500 __cond_resched();
5501 return 1; 5501 return 1;
5502 } 5502 }
5503 return 0; 5503 return 0;
5504 } 5504 }
5505 EXPORT_SYMBOL(_cond_resched); 5505 EXPORT_SYMBOL(_cond_resched);
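In kernel code _cond_resched() is almost always reached through the cond_resched() wrapper. A hedged sketch of a hypothetical long-running loop that stays preemption-friendly on voluntary-preemption kernels (scrub_page() and nr_pages are made up for the example):

/* Sketch only: scrub_page() is hypothetical. */
static void scrub_all_pages(unsigned long nr_pages)
{
	unsigned long i;

	for (i = 0; i < nr_pages; i++) {
		scrub_page(i);		/* hypothetical unit of work */
		cond_resched();		/* reschedules only if should_resched() */
	}
}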
5506 5506
5507 /* 5507 /*
5508 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 5508 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
5509 * call schedule, and on return reacquire the lock. 5509 * call schedule, and on return reacquire the lock.
5510 * 5510 *
5511 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 5511 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
5512 * operations here to prevent schedule() from being called twice (once via 5512 * operations here to prevent schedule() from being called twice (once via
5513 * spin_unlock(), once by hand). 5513 * spin_unlock(), once by hand).
5514 */ 5514 */
5515 int __cond_resched_lock(spinlock_t *lock) 5515 int __cond_resched_lock(spinlock_t *lock)
5516 { 5516 {
5517 int resched = should_resched(); 5517 int resched = should_resched();
5518 int ret = 0; 5518 int ret = 0;
5519 5519
5520 lockdep_assert_held(lock); 5520 lockdep_assert_held(lock);
5521 5521
5522 if (spin_needbreak(lock) || resched) { 5522 if (spin_needbreak(lock) || resched) {
5523 spin_unlock(lock); 5523 spin_unlock(lock);
5524 if (resched) 5524 if (resched)
5525 __cond_resched(); 5525 __cond_resched();
5526 else 5526 else
5527 cpu_relax(); 5527 cpu_relax();
5528 ret = 1; 5528 ret = 1;
5529 spin_lock(lock); 5529 spin_lock(lock);
5530 } 5530 }
5531 return ret; 5531 return ret;
5532 } 5532 }
5533 EXPORT_SYMBOL(__cond_resched_lock); 5533 EXPORT_SYMBOL(__cond_resched_lock);
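cond_resched_lock() is the usual entry point for __cond_resched_lock(). A hedged sketch of a long walk that holds a spinlock but is willing to drop it when the lock is contended or a reschedule is due (the table, its lock and flush_slot() are hypothetical):

/* Sketch only: struct table and flush_slot() are hypothetical. */
static void flush_table(struct table *t)
{
	unsigned int i;

	spin_lock(&t->lock);
	for (i = 0; i < t->nr_slots; i++) {
		flush_slot(t, i);
		/*
		 * May drop t->lock, schedule (or just back off under
		 * contention) and retake it; the index-based loop is
		 * safe across the unlock.
		 */
		cond_resched_lock(&t->lock);
	}
	spin_unlock(&t->lock);
}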
5534 5534
5535 int __sched __cond_resched_softirq(void) 5535 int __sched __cond_resched_softirq(void)
5536 { 5536 {
5537 BUG_ON(!in_softirq()); 5537 BUG_ON(!in_softirq());
5538 5538
5539 if (should_resched()) { 5539 if (should_resched()) {
5540 local_bh_enable(); 5540 local_bh_enable();
5541 __cond_resched(); 5541 __cond_resched();
5542 local_bh_disable(); 5542 local_bh_disable();
5543 return 1; 5543 return 1;
5544 } 5544 }
5545 return 0; 5545 return 0;
5546 } 5546 }
5547 EXPORT_SYMBOL(__cond_resched_softirq); 5547 EXPORT_SYMBOL(__cond_resched_softirq);
5548 5548
5549 /** 5549 /**
5550 * yield - yield the current processor to other threads. 5550 * yield - yield the current processor to other threads.
5551 * 5551 *
5552 * This is a shortcut for kernel-space yielding - it marks the 5552 * This is a shortcut for kernel-space yielding - it marks the
5553 * thread runnable and calls sys_sched_yield(). 5553 * thread runnable and calls sys_sched_yield().
5554 */ 5554 */
5555 void __sched yield(void) 5555 void __sched yield(void)
5556 { 5556 {
5557 set_current_state(TASK_RUNNING); 5557 set_current_state(TASK_RUNNING);
5558 sys_sched_yield(); 5558 sys_sched_yield();
5559 } 5559 }
5560 EXPORT_SYMBOL(yield); 5560 EXPORT_SYMBOL(yield);
5561 5561
5562 /** 5562 /**
5563 * yield_to - yield the current processor to another thread in 5563 * yield_to - yield the current processor to another thread in
5564 * your thread group, or accelerate that thread toward the 5564 * your thread group, or accelerate that thread toward the
5565 * processor it's on. 5565 * processor it's on.
5566 * @p: target task 5566 * @p: target task
5567 * @preempt: whether task preemption is allowed or not 5567 * @preempt: whether task preemption is allowed or not
5568 * 5568 *
5569 * It's the caller's job to ensure that the target task struct 5569 * It's the caller's job to ensure that the target task struct
5570 * can't go away on us before we can do any checks. 5570 * can't go away on us before we can do any checks.
5571 * 5571 *
5572 * Returns true if we indeed boosted the target task. 5572 * Returns true if we indeed boosted the target task.
5573 */ 5573 */
5574 bool __sched yield_to(struct task_struct *p, bool preempt) 5574 bool __sched yield_to(struct task_struct *p, bool preempt)
5575 { 5575 {
5576 struct task_struct *curr = current; 5576 struct task_struct *curr = current;
5577 struct rq *rq, *p_rq; 5577 struct rq *rq, *p_rq;
5578 unsigned long flags; 5578 unsigned long flags;
5579 bool yielded = 0; 5579 bool yielded = 0;
5580 5580
5581 local_irq_save(flags); 5581 local_irq_save(flags);
5582 rq = this_rq(); 5582 rq = this_rq();
5583 5583
5584 again: 5584 again:
5585 p_rq = task_rq(p); 5585 p_rq = task_rq(p);
5586 double_rq_lock(rq, p_rq); 5586 double_rq_lock(rq, p_rq);
5587 while (task_rq(p) != p_rq) { 5587 while (task_rq(p) != p_rq) {
5588 double_rq_unlock(rq, p_rq); 5588 double_rq_unlock(rq, p_rq);
5589 goto again; 5589 goto again;
5590 } 5590 }
5591 5591
5592 if (!curr->sched_class->yield_to_task) 5592 if (!curr->sched_class->yield_to_task)
5593 goto out; 5593 goto out;
5594 5594
5595 if (curr->sched_class != p->sched_class) 5595 if (curr->sched_class != p->sched_class)
5596 goto out; 5596 goto out;
5597 5597
5598 if (task_running(p_rq, p) || p->state) 5598 if (task_running(p_rq, p) || p->state)
5599 goto out; 5599 goto out;
5600 5600
5601 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 5601 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5602 if (yielded) { 5602 if (yielded) {
5603 schedstat_inc(rq, yld_count); 5603 schedstat_inc(rq, yld_count);
5604 /* 5604 /*
5605 * Make p's CPU reschedule; pick_next_entity takes care of 5605 * Make p's CPU reschedule; pick_next_entity takes care of
5606 * fairness. 5606 * fairness.
5607 */ 5607 */
5608 if (preempt && rq != p_rq) 5608 if (preempt && rq != p_rq)
5609 resched_task(p_rq->curr); 5609 resched_task(p_rq->curr);
5610 } 5610 }
5611 5611
5612 out: 5612 out:
5613 double_rq_unlock(rq, p_rq); 5613 double_rq_unlock(rq, p_rq);
5614 local_irq_restore(flags); 5614 local_irq_restore(flags);
5615 5615
5616 if (yielded) 5616 if (yielded)
5617 schedule(); 5617 schedule();
5618 5618
5619 return yielded; 5619 return yielded;
5620 } 5620 }
5621 EXPORT_SYMBOL_GPL(yield_to); 5621 EXPORT_SYMBOL_GPL(yield_to);
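A heavily hedged sketch of a directed yield: a caller that already holds a reference to a sibling task believed to hold a contended resource boosts that task instead of spinning. Picking the target and keeping it from going away are the caller's problem, exactly as the comment above says:

/* Sketch only: how @holder is chosen is entirely hypothetical. */
static void boost_lock_holder(struct task_struct *holder)
{
	/* Returns true only if the scheduler really boosted @holder. */
	if (!yield_to(holder, true))
		cpu_relax();	/* fall back to a plain busy-wait hint */
}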
5622 5622
5623 /* 5623 /*
5624 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5624 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5625 * that process accounting knows that this is a task in IO wait state. 5625 * that process accounting knows that this is a task in IO wait state.
5626 */ 5626 */
5627 void __sched io_schedule(void) 5627 void __sched io_schedule(void)
5628 { 5628 {
5629 struct rq *rq = raw_rq(); 5629 struct rq *rq = raw_rq();
5630 5630
5631 delayacct_blkio_start(); 5631 delayacct_blkio_start();
5632 atomic_inc(&rq->nr_iowait); 5632 atomic_inc(&rq->nr_iowait);
5633 blk_flush_plug(current); 5633 blk_flush_plug(current);
5634 current->in_iowait = 1; 5634 current->in_iowait = 1;
5635 schedule(); 5635 schedule();
5636 current->in_iowait = 0; 5636 current->in_iowait = 0;
5637 atomic_dec(&rq->nr_iowait); 5637 atomic_dec(&rq->nr_iowait);
5638 delayacct_blkio_end(); 5638 delayacct_blkio_end();
5639 } 5639 }
5640 EXPORT_SYMBOL(io_schedule); 5640 EXPORT_SYMBOL(io_schedule);
5641 5641
5642 long __sched io_schedule_timeout(long timeout) 5642 long __sched io_schedule_timeout(long timeout)
5643 { 5643 {
5644 struct rq *rq = raw_rq(); 5644 struct rq *rq = raw_rq();
5645 long ret; 5645 long ret;
5646 5646
5647 delayacct_blkio_start(); 5647 delayacct_blkio_start();
5648 atomic_inc(&rq->nr_iowait); 5648 atomic_inc(&rq->nr_iowait);
5649 blk_flush_plug(current); 5649 blk_flush_plug(current);
5650 current->in_iowait = 1; 5650 current->in_iowait = 1;
5651 ret = schedule_timeout(timeout); 5651 ret = schedule_timeout(timeout);
5652 current->in_iowait = 0; 5652 current->in_iowait = 0;
5653 atomic_dec(&rq->nr_iowait); 5653 atomic_dec(&rq->nr_iowait);
5654 delayacct_blkio_end(); 5654 delayacct_blkio_end();
5655 return ret; 5655 return ret;
5656 } 5656 }
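Block-layer style callers use these instead of plain schedule()/schedule_timeout() so the sleep is charged to iowait. A hedged sketch with a hypothetical request structure and ->done flag:

/* Sketch only: struct my_request and its ->done flag are hypothetical. */
static void wait_for_my_request(struct my_request *req)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (req->done)
			break;
		io_schedule();	/* sleeps with rq->nr_iowait raised */
	}
	__set_current_state(TASK_RUNNING);
}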
5657 5657
5658 /** 5658 /**
5659 * sys_sched_get_priority_max - return maximum RT priority. 5659 * sys_sched_get_priority_max - return maximum RT priority.
5660 * @policy: scheduling class. 5660 * @policy: scheduling class.
5661 * 5661 *
5662 * this syscall returns the maximum rt_priority that can be used 5662 * this syscall returns the maximum rt_priority that can be used
5663 * by a given scheduling class. 5663 * by a given scheduling class.
5664 */ 5664 */
5665 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5665 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5666 { 5666 {
5667 int ret = -EINVAL; 5667 int ret = -EINVAL;
5668 5668
5669 switch (policy) { 5669 switch (policy) {
5670 case SCHED_FIFO: 5670 case SCHED_FIFO:
5671 case SCHED_RR: 5671 case SCHED_RR:
5672 ret = MAX_USER_RT_PRIO-1; 5672 ret = MAX_USER_RT_PRIO-1;
5673 break; 5673 break;
5674 case SCHED_NORMAL: 5674 case SCHED_NORMAL:
5675 case SCHED_BATCH: 5675 case SCHED_BATCH:
5676 case SCHED_IDLE: 5676 case SCHED_IDLE:
5677 ret = 0; 5677 ret = 0;
5678 break; 5678 break;
5679 } 5679 }
5680 return ret; 5680 return ret;
5681 } 5681 }
5682 5682
5683 /** 5683 /**
5684 * sys_sched_get_priority_min - return minimum RT priority. 5684 * sys_sched_get_priority_min - return minimum RT priority.
5685 * @policy: scheduling class. 5685 * @policy: scheduling class.
5686 * 5686 *
5687 * this syscall returns the minimum rt_priority that can be used 5687 * this syscall returns the minimum rt_priority that can be used
5688 * by a given scheduling class. 5688 * by a given scheduling class.
5689 */ 5689 */
5690 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5690 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5691 { 5691 {
5692 int ret = -EINVAL; 5692 int ret = -EINVAL;
5693 5693
5694 switch (policy) { 5694 switch (policy) {
5695 case SCHED_FIFO: 5695 case SCHED_FIFO:
5696 case SCHED_RR: 5696 case SCHED_RR:
5697 ret = 1; 5697 ret = 1;
5698 break; 5698 break;
5699 case SCHED_NORMAL: 5699 case SCHED_NORMAL:
5700 case SCHED_BATCH: 5700 case SCHED_BATCH:
5701 case SCHED_IDLE: 5701 case SCHED_IDLE:
5702 ret = 0; 5702 ret = 0;
5703 } 5703 }
5704 return ret; 5704 return ret;
5705 } 5705 }
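User space typically calls this pair to size the valid priority range before sched_setscheduler(); a minimal sketch (not part of the commit):

#include <sched.h>
#include <stdio.h>

int main(void)
{
	int lo = sched_get_priority_min(SCHED_FIFO);
	int hi = sched_get_priority_max(SCHED_FIFO);

	if (lo == -1 || hi == -1) {
		perror("sched_get_priority_*");
		return 1;
	}
	/* On Linux this prints 1..99, i.e. MAX_USER_RT_PRIO - 1. */
	printf("SCHED_FIFO priorities: %d..%d\n", lo, hi);
	return 0;
}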
5706 5706
5707 /** 5707 /**
5708 * sys_sched_rr_get_interval - return the default timeslice of a process. 5708 * sys_sched_rr_get_interval - return the default timeslice of a process.
5709 * @pid: pid of the process. 5709 * @pid: pid of the process.
5710 * @interval: userspace pointer to the timeslice value. 5710 * @interval: userspace pointer to the timeslice value.
5711 * 5711 *
5712 * this syscall writes the default timeslice value of a given process 5712 * this syscall writes the default timeslice value of a given process
5713 * into the user-space timespec buffer. A value of '0' means infinity. 5713 * into the user-space timespec buffer. A value of '0' means infinity.
5714 */ 5714 */
5715 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5715 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5716 struct timespec __user *, interval) 5716 struct timespec __user *, interval)
5717 { 5717 {
5718 struct task_struct *p; 5718 struct task_struct *p;
5719 unsigned int time_slice; 5719 unsigned int time_slice;
5720 unsigned long flags; 5720 unsigned long flags;
5721 struct rq *rq; 5721 struct rq *rq;
5722 int retval; 5722 int retval;
5723 struct timespec t; 5723 struct timespec t;
5724 5724
5725 if (pid < 0) 5725 if (pid < 0)
5726 return -EINVAL; 5726 return -EINVAL;
5727 5727
5728 retval = -ESRCH; 5728 retval = -ESRCH;
5729 rcu_read_lock(); 5729 rcu_read_lock();
5730 p = find_process_by_pid(pid); 5730 p = find_process_by_pid(pid);
5731 if (!p) 5731 if (!p)
5732 goto out_unlock; 5732 goto out_unlock;
5733 5733
5734 retval = security_task_getscheduler(p); 5734 retval = security_task_getscheduler(p);
5735 if (retval) 5735 if (retval)
5736 goto out_unlock; 5736 goto out_unlock;
5737 5737
5738 rq = task_rq_lock(p, &flags); 5738 rq = task_rq_lock(p, &flags);
5739 time_slice = p->sched_class->get_rr_interval(rq, p); 5739 time_slice = p->sched_class->get_rr_interval(rq, p);
5740 task_rq_unlock(rq, p, &flags); 5740 task_rq_unlock(rq, p, &flags);
5741 5741
5742 rcu_read_unlock(); 5742 rcu_read_unlock();
5743 jiffies_to_timespec(time_slice, &t); 5743 jiffies_to_timespec(time_slice, &t);
5744 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5744 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5745 return retval; 5745 return retval;
5746 5746
5747 out_unlock: 5747 out_unlock:
5748 rcu_read_unlock(); 5748 rcu_read_unlock();
5749 return retval; 5749 return retval;
5750 } 5750 }
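A user-space sketch querying the caller's own round-robin timeslice (pid 0 again means the calling process):

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == -1) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}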
5751 5751
5752 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 5752 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5753 5753
5754 void sched_show_task(struct task_struct *p) 5754 void sched_show_task(struct task_struct *p)
5755 { 5755 {
5756 unsigned long free = 0; 5756 unsigned long free = 0;
5757 unsigned state; 5757 unsigned state;
5758 5758
5759 state = p->state ? __ffs(p->state) + 1 : 0; 5759 state = p->state ? __ffs(p->state) + 1 : 0;
5760 printk(KERN_INFO "%-15.15s %c", p->comm, 5760 printk(KERN_INFO "%-15.15s %c", p->comm,
5761 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5761 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5762 #if BITS_PER_LONG == 32 5762 #if BITS_PER_LONG == 32
5763 if (state == TASK_RUNNING) 5763 if (state == TASK_RUNNING)
5764 printk(KERN_CONT " running "); 5764 printk(KERN_CONT " running ");
5765 else 5765 else
5766 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 5766 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5767 #else 5767 #else
5768 if (state == TASK_RUNNING) 5768 if (state == TASK_RUNNING)
5769 printk(KERN_CONT " running task "); 5769 printk(KERN_CONT " running task ");
5770 else 5770 else
5771 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5771 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5772 #endif 5772 #endif
5773 #ifdef CONFIG_DEBUG_STACK_USAGE 5773 #ifdef CONFIG_DEBUG_STACK_USAGE
5774 free = stack_not_used(p); 5774 free = stack_not_used(p);
5775 #endif 5775 #endif
5776 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5776 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5777 task_pid_nr(p), task_pid_nr(p->real_parent), 5777 task_pid_nr(p), task_pid_nr(p->real_parent),
5778 (unsigned long)task_thread_info(p)->flags); 5778 (unsigned long)task_thread_info(p)->flags);
5779 5779
5780 show_stack(p, NULL); 5780 show_stack(p, NULL);
5781 } 5781 }
5782 5782
5783 void show_state_filter(unsigned long state_filter) 5783 void show_state_filter(unsigned long state_filter)
5784 { 5784 {
5785 struct task_struct *g, *p; 5785 struct task_struct *g, *p;
5786 5786
5787 #if BITS_PER_LONG == 32 5787 #if BITS_PER_LONG == 32
5788 printk(KERN_INFO 5788 printk(KERN_INFO
5789 " task PC stack pid father\n"); 5789 " task PC stack pid father\n");
5790 #else 5790 #else
5791 printk(KERN_INFO 5791 printk(KERN_INFO
5792 " task PC stack pid father\n"); 5792 " task PC stack pid father\n");
5793 #endif 5793 #endif
5794 read_lock(&tasklist_lock); 5794 read_lock(&tasklist_lock);
5795 do_each_thread(g, p) { 5795 do_each_thread(g, p) {
5796 /* 5796 /*
5797 * reset the NMI-timeout, listing all files on a slow 5797 * reset the NMI-timeout, listing all files on a slow
5798 * console might take a lot of time: 5798 * console might take a lot of time:
5799 */ 5799 */
5800 touch_nmi_watchdog(); 5800 touch_nmi_watchdog();
5801 if (!state_filter || (p->state & state_filter)) 5801 if (!state_filter || (p->state & state_filter))
5802 sched_show_task(p); 5802 sched_show_task(p);
5803 } while_each_thread(g, p); 5803 } while_each_thread(g, p);
5804 5804
5805 touch_all_softlockup_watchdogs(); 5805 touch_all_softlockup_watchdogs();
5806 5806
5807 #ifdef CONFIG_SCHED_DEBUG 5807 #ifdef CONFIG_SCHED_DEBUG
5808 sysrq_sched_debug_show(); 5808 sysrq_sched_debug_show();
5809 #endif 5809 #endif
5810 read_unlock(&tasklist_lock); 5810 read_unlock(&tasklist_lock);
5811 /* 5811 /*
5812 * Only show locks if all tasks are dumped: 5812 * Only show locks if all tasks are dumped:
5813 */ 5813 */
5814 if (!state_filter) 5814 if (!state_filter)
5815 debug_show_all_locks(); 5815 debug_show_all_locks();
5816 } 5816 }
5817 5817
5818 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 5818 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5819 { 5819 {
5820 idle->sched_class = &idle_sched_class; 5820 idle->sched_class = &idle_sched_class;
5821 } 5821 }
5822 5822
5823 /** 5823 /**
5824 * init_idle - set up an idle thread for a given CPU 5824 * init_idle - set up an idle thread for a given CPU
5825 * @idle: task in question 5825 * @idle: task in question
5826 * @cpu: cpu the idle task belongs to 5826 * @cpu: cpu the idle task belongs to
5827 * 5827 *
5828 * NOTE: this function does not set the idle thread's NEED_RESCHED 5828 * NOTE: this function does not set the idle thread's NEED_RESCHED
5829 * flag, to make booting more robust. 5829 * flag, to make booting more robust.
5830 */ 5830 */
5831 void __cpuinit init_idle(struct task_struct *idle, int cpu) 5831 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5832 { 5832 {
5833 struct rq *rq = cpu_rq(cpu); 5833 struct rq *rq = cpu_rq(cpu);
5834 unsigned long flags; 5834 unsigned long flags;
5835 5835
5836 raw_spin_lock_irqsave(&rq->lock, flags); 5836 raw_spin_lock_irqsave(&rq->lock, flags);
5837 5837
5838 __sched_fork(idle); 5838 __sched_fork(idle);
5839 idle->state = TASK_RUNNING; 5839 idle->state = TASK_RUNNING;
5840 idle->se.exec_start = sched_clock(); 5840 idle->se.exec_start = sched_clock();
5841 5841
5842 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5842 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5843 /* 5843 /*
5844 * We're having a chicken and egg problem, even though we are 5844 * We're having a chicken and egg problem, even though we are
5845 * holding rq->lock, the cpu isn't yet set to this cpu so the 5845 * holding rq->lock, the cpu isn't yet set to this cpu so the
5846 * lockdep check in task_group() will fail. 5846 * lockdep check in task_group() will fail.
5847 * 5847 *
5848 * Similar case to sched_fork(). / Alternatively we could 5848 * Similar case to sched_fork(). / Alternatively we could
5849 * use task_rq_lock() here and obtain the other rq->lock. 5849 * use task_rq_lock() here and obtain the other rq->lock.
5850 * 5850 *
5851 * Silence PROVE_RCU 5851 * Silence PROVE_RCU
5852 */ 5852 */
5853 rcu_read_lock(); 5853 rcu_read_lock();
5854 __set_task_cpu(idle, cpu); 5854 __set_task_cpu(idle, cpu);
5855 rcu_read_unlock(); 5855 rcu_read_unlock();
5856 5856
5857 rq->curr = rq->idle = idle; 5857 rq->curr = rq->idle = idle;
5858 #if defined(CONFIG_SMP) 5858 #if defined(CONFIG_SMP)
5859 idle->on_cpu = 1; 5859 idle->on_cpu = 1;
5860 #endif 5860 #endif
5861 raw_spin_unlock_irqrestore(&rq->lock, flags); 5861 raw_spin_unlock_irqrestore(&rq->lock, flags);
5862 5862
5863 /* Set the preempt count _outside_ the spinlocks! */ 5863 /* Set the preempt count _outside_ the spinlocks! */
5864 #if defined(CONFIG_PREEMPT) 5864 #if defined(CONFIG_PREEMPT)
5865 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 5865 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5866 #else 5866 #else
5867 task_thread_info(idle)->preempt_count = 0; 5867 task_thread_info(idle)->preempt_count = 0;
5868 #endif 5868 #endif
5869 /* 5869 /*
5870 * The idle tasks have their own, simple scheduling class: 5870 * The idle tasks have their own, simple scheduling class:
5871 */ 5871 */
5872 idle->sched_class = &idle_sched_class; 5872 idle->sched_class = &idle_sched_class;
5873 ftrace_graph_init_idle_task(idle, cpu); 5873 ftrace_graph_init_idle_task(idle, cpu);
5874 } 5874 }
5875 5875
5876 /* 5876 /*
5877 * In a system that switches off the HZ timer nohz_cpu_mask 5877 * In a system that switches off the HZ timer nohz_cpu_mask
5878 * indicates which cpus entered this state. This is used 5878 * indicates which cpus entered this state. This is used
5879 * in the rcu update to wait only for active cpus. For systems 5879 * in the rcu update to wait only for active cpus. For systems
5880 * which do not switch off the HZ timer nohz_cpu_mask should 5880 * which do not switch off the HZ timer nohz_cpu_mask should
5881 * always be CPU_BITS_NONE. 5881 * always be CPU_BITS_NONE.
5882 */ 5882 */
5883 cpumask_var_t nohz_cpu_mask; 5883 cpumask_var_t nohz_cpu_mask;
5884 5884
5885 /* 5885 /*
5886 * Increase the granularity value when there are more CPUs, 5886 * Increase the granularity value when there are more CPUs,
5887 * because with more CPUs the 'effective latency' as visible 5887 * because with more CPUs the 'effective latency' as visible
5888 * to users decreases. But the relationship is not linear, 5888 * to users decreases. But the relationship is not linear,
5889 * so pick a second-best guess by going with the log2 of the 5889 * so pick a second-best guess by going with the log2 of the
5890 * number of CPUs. 5890 * number of CPUs.
5891 * 5891 *
5892 * This idea comes from the SD scheduler of Con Kolivas: 5892 * This idea comes from the SD scheduler of Con Kolivas:
5893 */ 5893 */
5894 static int get_update_sysctl_factor(void) 5894 static int get_update_sysctl_factor(void)
5895 { 5895 {
5896 unsigned int cpus = min_t(int, num_online_cpus(), 8); 5896 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5897 unsigned int factor; 5897 unsigned int factor;
5898 5898
5899 switch (sysctl_sched_tunable_scaling) { 5899 switch (sysctl_sched_tunable_scaling) {
5900 case SCHED_TUNABLESCALING_NONE: 5900 case SCHED_TUNABLESCALING_NONE:
5901 factor = 1; 5901 factor = 1;
5902 break; 5902 break;
5903 case SCHED_TUNABLESCALING_LINEAR: 5903 case SCHED_TUNABLESCALING_LINEAR:
5904 factor = cpus; 5904 factor = cpus;
5905 break; 5905 break;
5906 case SCHED_TUNABLESCALING_LOG: 5906 case SCHED_TUNABLESCALING_LOG:
5907 default: 5907 default:
5908 factor = 1 + ilog2(cpus); 5908 factor = 1 + ilog2(cpus);
5909 break; 5909 break;
5910 } 5910 }
5911 5911
5912 return factor; 5912 return factor;
5913 } 5913 }
5914 5914
5915 static void update_sysctl(void) 5915 static void update_sysctl(void)
5916 { 5916 {
5917 unsigned int factor = get_update_sysctl_factor(); 5917 unsigned int factor = get_update_sysctl_factor();
5918 5918
5919 #define SET_SYSCTL(name) \ 5919 #define SET_SYSCTL(name) \
5920 (sysctl_##name = (factor) * normalized_sysctl_##name) 5920 (sysctl_##name = (factor) * normalized_sysctl_##name)
5921 SET_SYSCTL(sched_min_granularity); 5921 SET_SYSCTL(sched_min_granularity);
5922 SET_SYSCTL(sched_latency); 5922 SET_SYSCTL(sched_latency);
5923 SET_SYSCTL(sched_wakeup_granularity); 5923 SET_SYSCTL(sched_wakeup_granularity);
5924 #undef SET_SYSCTL 5924 #undef SET_SYSCTL
5925 } 5925 }
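As a worked example with assumed numbers (not taken from this commit): with 4 online CPUs and the default logarithmic scaling, factor = 1 + ilog2(4) = 3, so a normalized latency of 6 ms becomes an effective 18 ms. A small user-space rendition of the same arithmetic:

#include <stdio.h>

/* User-space stand-in for the kernel's ilog2() on a non-zero value. */
static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int online = 4;			/* assumed CPU count */
	unsigned int cpus = online < 8 ? online : 8;	/* min_t(int, ..., 8) */
	unsigned int factor = 1 + ilog2_u(cpus);	/* SCHED_TUNABLESCALING_LOG */
	unsigned int normalized_latency_ns = 6000000;	/* assumed 6 ms */

	printf("factor = %u, effective sched_latency = %u ns\n",
	       factor, factor * normalized_latency_ns);
	return 0;
}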
5926 5926
5927 static inline void sched_init_granularity(void) 5927 static inline void sched_init_granularity(void)
5928 { 5928 {
5929 update_sysctl(); 5929 update_sysctl();
5930 } 5930 }
5931 5931
5932 #ifdef CONFIG_SMP 5932 #ifdef CONFIG_SMP
5933 /* 5933 /*
5934 * This is how migration works: 5934 * This is how migration works:
5935 * 5935 *
5936 * 1) we invoke migration_cpu_stop() on the target CPU using 5936 * 1) we invoke migration_cpu_stop() on the target CPU using
5937 * stop_one_cpu(). 5937 * stop_one_cpu().
5938 * 2) stopper starts to run (implicitly forcing the migrated thread 5938 * 2) stopper starts to run (implicitly forcing the migrated thread
5939 * off the CPU) 5939 * off the CPU)
5940 * 3) it checks whether the migrated task is still in the wrong runqueue. 5940 * 3) it checks whether the migrated task is still in the wrong runqueue.
5941 * 4) if it's in the wrong runqueue then the migration thread removes 5941 * 4) if it's in the wrong runqueue then the migration thread removes
5942 * it and puts it into the right queue. 5942 * it and puts it into the right queue.
5943 * 5) stopper completes and stop_one_cpu() returns and the migration 5943 * 5) stopper completes and stop_one_cpu() returns and the migration
5944 * is done. 5944 * is done.
5945 */ 5945 */
5946 5946
5947 /* 5947 /*
5948 * Change a given task's CPU affinity. Migrate the thread to a 5948 * Change a given task's CPU affinity. Migrate the thread to a
5949 * proper CPU and schedule it away if the CPU it's executing on 5949 * proper CPU and schedule it away if the CPU it's executing on
5950 * is removed from the allowed bitmask. 5950 * is removed from the allowed bitmask.
5951 * 5951 *
5952 * NOTE: the caller must have a valid reference to the task, the 5952 * NOTE: the caller must have a valid reference to the task, the
5953 * task must not exit() & deallocate itself prematurely. The 5953 * task must not exit() & deallocate itself prematurely. The
5954 * call is not atomic; no spinlocks may be held. 5954 * call is not atomic; no spinlocks may be held.
5955 */ 5955 */
5956 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5956 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5957 { 5957 {
5958 unsigned long flags; 5958 unsigned long flags;
5959 struct rq *rq; 5959 struct rq *rq;
5960 unsigned int dest_cpu; 5960 unsigned int dest_cpu;
5961 int ret = 0; 5961 int ret = 0;
5962 5962
5963 rq = task_rq_lock(p, &flags); 5963 rq = task_rq_lock(p, &flags);
5964 5964
5965 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5965 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5966 ret = -EINVAL; 5966 ret = -EINVAL;
5967 goto out; 5967 goto out;
5968 } 5968 }
5969 5969
5970 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5970 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5971 !cpumask_equal(&p->cpus_allowed, new_mask))) { 5971 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5972 ret = -EINVAL; 5972 ret = -EINVAL;
5973 goto out; 5973 goto out;
5974 } 5974 }
5975 5975
5976 if (p->sched_class->set_cpus_allowed) 5976 if (p->sched_class->set_cpus_allowed)
5977 p->sched_class->set_cpus_allowed(p, new_mask); 5977 p->sched_class->set_cpus_allowed(p, new_mask);
5978 else { 5978 else {
5979 cpumask_copy(&p->cpus_allowed, new_mask); 5979 cpumask_copy(&p->cpus_allowed, new_mask);
5980 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5980 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5981 } 5981 }
5982 5982
5983 /* Can the task run on the task's current CPU? If so, we're done */ 5983 /* Can the task run on the task's current CPU? If so, we're done */
5984 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5984 if (cpumask_test_cpu(task_cpu(p), new_mask))
5985 goto out; 5985 goto out;
5986 5986
5987 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5987 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5988 if (need_migrate_task(p)) { 5988 if (need_migrate_task(p)) {
5989 struct migration_arg arg = { p, dest_cpu }; 5989 struct migration_arg arg = { p, dest_cpu };
5990 /* Need help from migration thread: drop lock and wait. */ 5990 /* Need help from migration thread: drop lock and wait. */
5991 task_rq_unlock(rq, p, &flags); 5991 task_rq_unlock(rq, p, &flags);
5992 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5992 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5993 tlb_migrate_finish(p->mm); 5993 tlb_migrate_finish(p->mm);
5994 return 0; 5994 return 0;
5995 } 5995 }
5996 out: 5996 out:
5997 task_rq_unlock(rq, p, &flags); 5997 task_rq_unlock(rq, p, &flags);
5998 5998
5999 return ret; 5999 return ret;
6000 } 6000 }
6001 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 6001 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
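A hedged kernel-side sketch: creating a worker kthread and restricting it to one CPU before waking it (my_thread_fn and the choice of CPU are hypothetical; kthread_bind() would be the more common way to do this for a not-yet-running thread):

/* Sketch only: my_thread_fn() is hypothetical. */
static struct task_struct *start_pinned_worker(int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(my_thread_fn, NULL, "pinned/%d", cpu);
	if (IS_ERR(tsk))
		return tsk;

	/* Shrink the affinity mask; migrates the task if it has to. */
	set_cpus_allowed_ptr(tsk, cpumask_of(cpu));
	wake_up_process(tsk);
	return tsk;
}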
6002 6002
6003 /* 6003 /*
6004 * Move (not current) task off this cpu, onto dest cpu. We're doing 6004 * Move (not current) task off this cpu, onto dest cpu. We're doing
6005 * this because either it can't run here any more (set_cpus_allowed() 6005 * this because either it can't run here any more (set_cpus_allowed()
6006 * away from this CPU, or CPU going down), or because we're 6006 * away from this CPU, or CPU going down), or because we're
6007 * attempting to rebalance this task on exec (sched_exec). 6007 * attempting to rebalance this task on exec (sched_exec).
6008 * 6008 *
6009 * So we race with normal scheduler movements, but that's OK, as long 6009 * So we race with normal scheduler movements, but that's OK, as long
6010 * as the task is no longer on this CPU. 6010 * as the task is no longer on this CPU.
6011 * 6011 *
6012 * Returns non-zero if task was successfully migrated. 6012 * Returns non-zero if task was successfully migrated.
6013 */ 6013 */
6014 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 6014 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6015 { 6015 {
6016 struct rq *rq_dest, *rq_src; 6016 struct rq *rq_dest, *rq_src;
6017 int ret = 0; 6017 int ret = 0;
6018 6018
6019 if (unlikely(!cpu_active(dest_cpu))) 6019 if (unlikely(!cpu_active(dest_cpu)))
6020 return ret; 6020 return ret;
6021 6021
6022 rq_src = cpu_rq(src_cpu); 6022 rq_src = cpu_rq(src_cpu);
6023 rq_dest = cpu_rq(dest_cpu); 6023 rq_dest = cpu_rq(dest_cpu);
6024 6024
6025 raw_spin_lock(&p->pi_lock); 6025 raw_spin_lock(&p->pi_lock);
6026 double_rq_lock(rq_src, rq_dest); 6026 double_rq_lock(rq_src, rq_dest);
6027 /* Already moved. */ 6027 /* Already moved. */
6028 if (task_cpu(p) != src_cpu) 6028 if (task_cpu(p) != src_cpu)
6029 goto done; 6029 goto done;
6030 /* Affinity changed (again). */ 6030 /* Affinity changed (again). */
6031 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 6031 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6032 goto fail; 6032 goto fail;
6033 6033
6034 /* 6034 /*
6035 * If we're not on a rq, the next wake-up will ensure we're 6035 * If we're not on a rq, the next wake-up will ensure we're
6036 * placed properly. 6036 * placed properly.
6037 */ 6037 */
6038 if (p->on_rq) { 6038 if (p->on_rq) {
6039 deactivate_task(rq_src, p, 0); 6039 deactivate_task(rq_src, p, 0);
6040 set_task_cpu(p, dest_cpu); 6040 set_task_cpu(p, dest_cpu);
6041 activate_task(rq_dest, p, 0); 6041 activate_task(rq_dest, p, 0);
6042 check_preempt_curr(rq_dest, p, 0); 6042 check_preempt_curr(rq_dest, p, 0);
6043 } 6043 }
6044 done: 6044 done:
6045 ret = 1; 6045 ret = 1;
6046 fail: 6046 fail:
6047 double_rq_unlock(rq_src, rq_dest); 6047 double_rq_unlock(rq_src, rq_dest);
6048 raw_spin_unlock(&p->pi_lock); 6048 raw_spin_unlock(&p->pi_lock);
6049 return ret; 6049 return ret;
6050 } 6050 }
6051 6051
6052 /* 6052 /*
6053 * migration_cpu_stop - this will be executed by a highprio stopper thread 6053 * migration_cpu_stop - this will be executed by a highprio stopper thread
6054 * and performs thread migration by bumping thread off CPU then 6054 * and performs thread migration by bumping thread off CPU then
6055 * 'pushing' onto another runqueue. 6055 * 'pushing' onto another runqueue.
6056 */ 6056 */
6057 static int migration_cpu_stop(void *data) 6057 static int migration_cpu_stop(void *data)
6058 { 6058 {
6059 struct migration_arg *arg = data; 6059 struct migration_arg *arg = data;
6060 6060
6061 /* 6061 /*
6062 * The original target cpu might have gone down and we might 6062 * The original target cpu might have gone down and we might
6063 * be on another cpu but it doesn't matter. 6063 * be on another cpu but it doesn't matter.
6064 */ 6064 */
6065 local_irq_disable(); 6065 local_irq_disable();
6066 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 6066 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
6067 local_irq_enable(); 6067 local_irq_enable();
6068 return 0; 6068 return 0;
6069 } 6069 }
6070 6070
6071 #ifdef CONFIG_HOTPLUG_CPU 6071 #ifdef CONFIG_HOTPLUG_CPU
6072 6072
6073 /* 6073 /*
6074 * Ensures that the idle task is using init_mm right before its cpu goes 6074 * Ensures that the idle task is using init_mm right before its cpu goes
6075 * offline. 6075 * offline.
6076 */ 6076 */
6077 void idle_task_exit(void) 6077 void idle_task_exit(void)
6078 { 6078 {
6079 struct mm_struct *mm = current->active_mm; 6079 struct mm_struct *mm = current->active_mm;
6080 6080
6081 BUG_ON(cpu_online(smp_processor_id())); 6081 BUG_ON(cpu_online(smp_processor_id()));
6082 6082
6083 if (mm != &init_mm) 6083 if (mm != &init_mm)
6084 switch_mm(mm, &init_mm, current); 6084 switch_mm(mm, &init_mm, current);
6085 mmdrop(mm); 6085 mmdrop(mm);
6086 } 6086 }
6087 6087
6088 /* 6088 /*
6089 * While a dead CPU has no uninterruptible tasks queued at this point, 6089 * While a dead CPU has no uninterruptible tasks queued at this point,
6090 * it might still have a nonzero ->nr_uninterruptible counter, because 6090 * it might still have a nonzero ->nr_uninterruptible counter, because
6091 * for performance reasons the counter is not strictly tracking tasks to 6091 * for performance reasons the counter is not strictly tracking tasks to
6092 * their home CPUs. So we just add the counter to another CPU's counter, 6092 * their home CPUs. So we just add the counter to another CPU's counter,
6093 * to keep the global sum constant after CPU-down: 6093 * to keep the global sum constant after CPU-down:
6094 */ 6094 */
6095 static void migrate_nr_uninterruptible(struct rq *rq_src) 6095 static void migrate_nr_uninterruptible(struct rq *rq_src)
6096 { 6096 {
6097 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 6097 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
6098 6098
6099 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 6099 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6100 rq_src->nr_uninterruptible = 0; 6100 rq_src->nr_uninterruptible = 0;
6101 } 6101 }
6102 6102
6103 /* 6103 /*
6104 * remove the tasks which were accounted by rq from calc_load_tasks. 6104 * remove the tasks which were accounted by rq from calc_load_tasks.
6105 */ 6105 */
6106 static void calc_global_load_remove(struct rq *rq) 6106 static void calc_global_load_remove(struct rq *rq)
6107 { 6107 {
6108 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 6108 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6109 rq->calc_load_active = 0; 6109 rq->calc_load_active = 0;
6110 } 6110 }
6111 6111
6112 /* 6112 /*
6113 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6113 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6114 * try_to_wake_up()->select_task_rq(). 6114 * try_to_wake_up()->select_task_rq().
6115 * 6115 *
6116 * Called with rq->lock held even though we're in stop_machine() and 6116 * Called with rq->lock held even though we're in stop_machine() and
6117 * there's no concurrency possible, we hold the required locks anyway 6117 * there's no concurrency possible, we hold the required locks anyway
6118 * because of lock validation efforts. 6118 * because of lock validation efforts.
6119 */ 6119 */
6120 static void migrate_tasks(unsigned int dead_cpu) 6120 static void migrate_tasks(unsigned int dead_cpu)
6121 { 6121 {
6122 struct rq *rq = cpu_rq(dead_cpu); 6122 struct rq *rq = cpu_rq(dead_cpu);
6123 struct task_struct *next, *stop = rq->stop; 6123 struct task_struct *next, *stop = rq->stop;
6124 int dest_cpu; 6124 int dest_cpu;
6125 6125
6126 /* 6126 /*
6127 * Fudge the rq selection such that the below task selection loop 6127 * Fudge the rq selection such that the below task selection loop
6128 * doesn't get stuck on the currently eligible stop task. 6128 * doesn't get stuck on the currently eligible stop task.
6129 * 6129 *
6130 * We're currently inside stop_machine() and the rq is either stuck 6130 * We're currently inside stop_machine() and the rq is either stuck
6131 * in the stop_machine_cpu_stop() loop, or we're executing this code, 6131 * in the stop_machine_cpu_stop() loop, or we're executing this code,
6132 * either way we should never end up calling schedule() until we're 6132 * either way we should never end up calling schedule() until we're
6133 * done here. 6133 * done here.
6134 */ 6134 */
6135 rq->stop = NULL; 6135 rq->stop = NULL;
6136 6136
6137 for ( ; ; ) { 6137 for ( ; ; ) {
6138 /* 6138 /*
6139 * There's this thread running, bail when that's the only 6139 * There's this thread running, bail when that's the only
6140 * remaining thread. 6140 * remaining thread.
6141 */ 6141 */
6142 if (rq->nr_running == 1) 6142 if (rq->nr_running == 1)
6143 break; 6143 break;
6144 6144
6145 next = pick_next_task(rq); 6145 next = pick_next_task(rq);
6146 BUG_ON(!next); 6146 BUG_ON(!next);
6147 next->sched_class->put_prev_task(rq, next); 6147 next->sched_class->put_prev_task(rq, next);
6148 6148
6149 /* Find suitable destination for @next, with force if needed. */ 6149 /* Find suitable destination for @next, with force if needed. */
6150 dest_cpu = select_fallback_rq(dead_cpu, next); 6150 dest_cpu = select_fallback_rq(dead_cpu, next);
6151 raw_spin_unlock(&rq->lock); 6151 raw_spin_unlock(&rq->lock);
6152 6152
6153 __migrate_task(next, dead_cpu, dest_cpu); 6153 __migrate_task(next, dead_cpu, dest_cpu);
6154 6154
6155 raw_spin_lock(&rq->lock); 6155 raw_spin_lock(&rq->lock);
6156 } 6156 }
6157 6157
6158 rq->stop = stop; 6158 rq->stop = stop;
6159 } 6159 }
6160 6160
6161 #endif /* CONFIG_HOTPLUG_CPU */ 6161 #endif /* CONFIG_HOTPLUG_CPU */
6162 6162
6163 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 6163 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6164 6164
6165 static struct ctl_table sd_ctl_dir[] = { 6165 static struct ctl_table sd_ctl_dir[] = {
6166 { 6166 {
6167 .procname = "sched_domain", 6167 .procname = "sched_domain",
6168 .mode = 0555, 6168 .mode = 0555,
6169 }, 6169 },
6170 {} 6170 {}
6171 }; 6171 };
6172 6172
6173 static struct ctl_table sd_ctl_root[] = { 6173 static struct ctl_table sd_ctl_root[] = {
6174 { 6174 {
6175 .procname = "kernel", 6175 .procname = "kernel",
6176 .mode = 0555, 6176 .mode = 0555,
6177 .child = sd_ctl_dir, 6177 .child = sd_ctl_dir,
6178 }, 6178 },
6179 {} 6179 {}
6180 }; 6180 };
6181 6181
6182 static struct ctl_table *sd_alloc_ctl_entry(int n) 6182 static struct ctl_table *sd_alloc_ctl_entry(int n)
6183 { 6183 {
6184 struct ctl_table *entry = 6184 struct ctl_table *entry =
6185 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 6185 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6186 6186
6187 return entry; 6187 return entry;
6188 } 6188 }
6189 6189
6190 static void sd_free_ctl_entry(struct ctl_table **tablep) 6190 static void sd_free_ctl_entry(struct ctl_table **tablep)
6191 { 6191 {
6192 struct ctl_table *entry; 6192 struct ctl_table *entry;
6193 6193
6194 /* 6194 /*
6195 * In the intermediate directories, both the child directory and 6195 * In the intermediate directories, both the child directory and
6196 * procname are dynamically allocated and could fail but the mode 6196 * procname are dynamically allocated and could fail but the mode
6197 * will always be set. In the lowest directory the names are 6197 * will always be set. In the lowest directory the names are
6198 * static strings and all have proc handlers. 6198 * static strings and all have proc handlers.
6199 */ 6199 */
6200 for (entry = *tablep; entry->mode; entry++) { 6200 for (entry = *tablep; entry->mode; entry++) {
6201 if (entry->child) 6201 if (entry->child)
6202 sd_free_ctl_entry(&entry->child); 6202 sd_free_ctl_entry(&entry->child);
6203 if (entry->proc_handler == NULL) 6203 if (entry->proc_handler == NULL)
6204 kfree(entry->procname); 6204 kfree(entry->procname);
6205 } 6205 }
6206 6206
6207 kfree(*tablep); 6207 kfree(*tablep);
6208 *tablep = NULL; 6208 *tablep = NULL;
6209 } 6209 }
6210 6210
6211 static void 6211 static void
6212 set_table_entry(struct ctl_table *entry, 6212 set_table_entry(struct ctl_table *entry,
6213 const char *procname, void *data, int maxlen, 6213 const char *procname, void *data, int maxlen,
6214 mode_t mode, proc_handler *proc_handler) 6214 mode_t mode, proc_handler *proc_handler)
6215 { 6215 {
6216 entry->procname = procname; 6216 entry->procname = procname;
6217 entry->data = data; 6217 entry->data = data;
6218 entry->maxlen = maxlen; 6218 entry->maxlen = maxlen;
6219 entry->mode = mode; 6219 entry->mode = mode;
6220 entry->proc_handler = proc_handler; 6220 entry->proc_handler = proc_handler;
6221 } 6221 }
6222 6222
6223 static struct ctl_table * 6223 static struct ctl_table *
6224 sd_alloc_ctl_domain_table(struct sched_domain *sd) 6224 sd_alloc_ctl_domain_table(struct sched_domain *sd)
6225 { 6225 {
6226 struct ctl_table *table = sd_alloc_ctl_entry(13); 6226 struct ctl_table *table = sd_alloc_ctl_entry(13);
6227 6227
6228 if (table == NULL) 6228 if (table == NULL)
6229 return NULL; 6229 return NULL;
6230 6230
6231 set_table_entry(&table[0], "min_interval", &sd->min_interval, 6231 set_table_entry(&table[0], "min_interval", &sd->min_interval,
6232 sizeof(long), 0644, proc_doulongvec_minmax); 6232 sizeof(long), 0644, proc_doulongvec_minmax);
6233 set_table_entry(&table[1], "max_interval", &sd->max_interval, 6233 set_table_entry(&table[1], "max_interval", &sd->max_interval,
6234 sizeof(long), 0644, proc_doulongvec_minmax); 6234 sizeof(long), 0644, proc_doulongvec_minmax);
6235 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 6235 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6236 sizeof(int), 0644, proc_dointvec_minmax); 6236 sizeof(int), 0644, proc_dointvec_minmax);
6237 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 6237 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6238 sizeof(int), 0644, proc_dointvec_minmax); 6238 sizeof(int), 0644, proc_dointvec_minmax);
6239 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 6239 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6240 sizeof(int), 0644, proc_dointvec_minmax); 6240 sizeof(int), 0644, proc_dointvec_minmax);
6241 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 6241 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6242 sizeof(int), 0644, proc_dointvec_minmax); 6242 sizeof(int), 0644, proc_dointvec_minmax);
6243 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 6243 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6244 sizeof(int), 0644, proc_dointvec_minmax); 6244 sizeof(int), 0644, proc_dointvec_minmax);
6245 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 6245 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6246 sizeof(int), 0644, proc_dointvec_minmax); 6246 sizeof(int), 0644, proc_dointvec_minmax);
6247 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 6247 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6248 sizeof(int), 0644, proc_dointvec_minmax); 6248 sizeof(int), 0644, proc_dointvec_minmax);
6249 set_table_entry(&table[9], "cache_nice_tries", 6249 set_table_entry(&table[9], "cache_nice_tries",
6250 &sd->cache_nice_tries, 6250 &sd->cache_nice_tries,
6251 sizeof(int), 0644, proc_dointvec_minmax); 6251 sizeof(int), 0644, proc_dointvec_minmax);
6252 set_table_entry(&table[10], "flags", &sd->flags, 6252 set_table_entry(&table[10], "flags", &sd->flags,
6253 sizeof(int), 0644, proc_dointvec_minmax); 6253 sizeof(int), 0644, proc_dointvec_minmax);
6254 set_table_entry(&table[11], "name", sd->name, 6254 set_table_entry(&table[11], "name", sd->name,
6255 CORENAME_MAX_SIZE, 0444, proc_dostring); 6255 CORENAME_MAX_SIZE, 0444, proc_dostring);
6256 /* &table[12] is terminator */ 6256 /* &table[12] is terminator */
6257 6257
6258 return table; 6258 return table;
6259 } 6259 }
6260 6260
6261 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 6261 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6262 { 6262 {
6263 struct ctl_table *entry, *table; 6263 struct ctl_table *entry, *table;
6264 struct sched_domain *sd; 6264 struct sched_domain *sd;
6265 int domain_num = 0, i; 6265 int domain_num = 0, i;
6266 char buf[32]; 6266 char buf[32];
6267 6267
6268 for_each_domain(cpu, sd) 6268 for_each_domain(cpu, sd)
6269 domain_num++; 6269 domain_num++;
6270 entry = table = sd_alloc_ctl_entry(domain_num + 1); 6270 entry = table = sd_alloc_ctl_entry(domain_num + 1);
6271 if (table == NULL) 6271 if (table == NULL)
6272 return NULL; 6272 return NULL;
6273 6273
6274 i = 0; 6274 i = 0;
6275 for_each_domain(cpu, sd) { 6275 for_each_domain(cpu, sd) {
6276 snprintf(buf, 32, "domain%d", i); 6276 snprintf(buf, 32, "domain%d", i);
6277 entry->procname = kstrdup(buf, GFP_KERNEL); 6277 entry->procname = kstrdup(buf, GFP_KERNEL);
6278 entry->mode = 0555; 6278 entry->mode = 0555;
6279 entry->child = sd_alloc_ctl_domain_table(sd); 6279 entry->child = sd_alloc_ctl_domain_table(sd);
6280 entry++; 6280 entry++;
6281 i++; 6281 i++;
6282 } 6282 }
6283 return table; 6283 return table;
6284 } 6284 }
6285 6285
6286 static struct ctl_table_header *sd_sysctl_header; 6286 static struct ctl_table_header *sd_sysctl_header;
6287 static void register_sched_domain_sysctl(void) 6287 static void register_sched_domain_sysctl(void)
6288 { 6288 {
6289 int i, cpu_num = num_possible_cpus(); 6289 int i, cpu_num = num_possible_cpus();
6290 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 6290 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6291 char buf[32]; 6291 char buf[32];
6292 6292
6293 WARN_ON(sd_ctl_dir[0].child); 6293 WARN_ON(sd_ctl_dir[0].child);
6294 sd_ctl_dir[0].child = entry; 6294 sd_ctl_dir[0].child = entry;
6295 6295
6296 if (entry == NULL) 6296 if (entry == NULL)
6297 return; 6297 return;
6298 6298
6299 for_each_possible_cpu(i) { 6299 for_each_possible_cpu(i) {
6300 snprintf(buf, 32, "cpu%d", i); 6300 snprintf(buf, 32, "cpu%d", i);
6301 entry->procname = kstrdup(buf, GFP_KERNEL); 6301 entry->procname = kstrdup(buf, GFP_KERNEL);
6302 entry->mode = 0555; 6302 entry->mode = 0555;
6303 entry->child = sd_alloc_ctl_cpu_table(i); 6303 entry->child = sd_alloc_ctl_cpu_table(i);
6304 entry++; 6304 entry++;
6305 } 6305 }
6306 6306
6307 WARN_ON(sd_sysctl_header); 6307 WARN_ON(sd_sysctl_header);
6308 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 6308 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6309 } 6309 }
6310 6310
6311 	/* may be called multiple times per registration */		6311 	/* may be called multiple times per registration */
6312 static void unregister_sched_domain_sysctl(void) 6312 static void unregister_sched_domain_sysctl(void)
6313 { 6313 {
6314 if (sd_sysctl_header) 6314 if (sd_sysctl_header)
6315 unregister_sysctl_table(sd_sysctl_header); 6315 unregister_sysctl_table(sd_sysctl_header);
6316 sd_sysctl_header = NULL; 6316 sd_sysctl_header = NULL;
6317 if (sd_ctl_dir[0].child) 6317 if (sd_ctl_dir[0].child)
6318 sd_free_ctl_entry(&sd_ctl_dir[0].child); 6318 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6319 } 6319 }
6320 #else 6320 #else
6321 static void register_sched_domain_sysctl(void) 6321 static void register_sched_domain_sysctl(void)
6322 { 6322 {
6323 } 6323 }
6324 static void unregister_sched_domain_sysctl(void) 6324 static void unregister_sched_domain_sysctl(void)
6325 { 6325 {
6326 } 6326 }
6327 #endif 6327 #endif
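
The tables built above follow the usual sysctl convention: sd_alloc_ctl_entry(13) reserves 12 tunable slots plus one zeroed terminator (the "&table[12] is terminator" comment), and the proc code stops at the first entry whose procname is NULL. Below is a minimal userspace sketch of that NULL-terminated table pattern; the struct and helper names are invented stand-ins for ctl_table and sd_alloc_ctl_entry(), not kernel code.

    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for struct ctl_table: only the fields the sketch needs. */
    struct entry {
            const char *procname;   /* NULL marks the end of the table */
            unsigned int mode;
    };

    /* Like sd_alloc_ctl_entry(): calloc() leaves the terminator zeroed. */
    static struct entry *alloc_table(int n)
    {
            return calloc(n, sizeof(struct entry));
    }

    int main(void)
    {
            struct entry *table = alloc_table(3 + 1);   /* 3 tunables + terminator */

            table[0] = (struct entry){ "min_interval", 0644 };
            table[1] = (struct entry){ "max_interval", 0644 };
            table[2] = (struct entry){ "name",         0444 };
            /* table[3] stays all-zero: the walker below stops there. */

            for (struct entry *e = table; e->procname; e++)
                    printf("%s (mode %o)\n", e->procname, e->mode);

            free(table);
            return 0;
    }
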
6328 6328
6329 static void set_rq_online(struct rq *rq) 6329 static void set_rq_online(struct rq *rq)
6330 { 6330 {
6331 if (!rq->online) { 6331 if (!rq->online) {
6332 const struct sched_class *class; 6332 const struct sched_class *class;
6333 6333
6334 cpumask_set_cpu(rq->cpu, rq->rd->online); 6334 cpumask_set_cpu(rq->cpu, rq->rd->online);
6335 rq->online = 1; 6335 rq->online = 1;
6336 6336
6337 for_each_class(class) { 6337 for_each_class(class) {
6338 if (class->rq_online) 6338 if (class->rq_online)
6339 class->rq_online(rq); 6339 class->rq_online(rq);
6340 } 6340 }
6341 } 6341 }
6342 } 6342 }
6343 6343
6344 static void set_rq_offline(struct rq *rq) 6344 static void set_rq_offline(struct rq *rq)
6345 { 6345 {
6346 if (rq->online) { 6346 if (rq->online) {
6347 const struct sched_class *class; 6347 const struct sched_class *class;
6348 6348
6349 for_each_class(class) { 6349 for_each_class(class) {
6350 if (class->rq_offline) 6350 if (class->rq_offline)
6351 class->rq_offline(rq); 6351 class->rq_offline(rq);
6352 } 6352 }
6353 6353
6354 cpumask_clear_cpu(rq->cpu, rq->rd->online); 6354 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6355 rq->online = 0; 6355 rq->online = 0;
6356 } 6356 }
6357 } 6357 }
6358 6358
6359 /* 6359 /*
6360 * migration_call - callback that gets triggered when a CPU is added. 6360 * migration_call - callback that gets triggered when a CPU is added.
6361 * Here we can start up the necessary migration thread for the new CPU. 6361 * Here we can start up the necessary migration thread for the new CPU.
6362 */ 6362 */
6363 static int __cpuinit 6363 static int __cpuinit
6364 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 6364 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6365 { 6365 {
6366 int cpu = (long)hcpu; 6366 int cpu = (long)hcpu;
6367 unsigned long flags; 6367 unsigned long flags;
6368 struct rq *rq = cpu_rq(cpu); 6368 struct rq *rq = cpu_rq(cpu);
6369 6369
6370 switch (action & ~CPU_TASKS_FROZEN) { 6370 switch (action & ~CPU_TASKS_FROZEN) {
6371 6371
6372 case CPU_UP_PREPARE: 6372 case CPU_UP_PREPARE:
6373 rq->calc_load_update = calc_load_update; 6373 rq->calc_load_update = calc_load_update;
6374 break; 6374 break;
6375 6375
6376 case CPU_ONLINE: 6376 case CPU_ONLINE:
6377 /* Update our root-domain */ 6377 /* Update our root-domain */
6378 raw_spin_lock_irqsave(&rq->lock, flags); 6378 raw_spin_lock_irqsave(&rq->lock, flags);
6379 if (rq->rd) { 6379 if (rq->rd) {
6380 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6380 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6381 6381
6382 set_rq_online(rq); 6382 set_rq_online(rq);
6383 } 6383 }
6384 raw_spin_unlock_irqrestore(&rq->lock, flags); 6384 raw_spin_unlock_irqrestore(&rq->lock, flags);
6385 break; 6385 break;
6386 6386
6387 #ifdef CONFIG_HOTPLUG_CPU 6387 #ifdef CONFIG_HOTPLUG_CPU
6388 case CPU_DYING: 6388 case CPU_DYING:
6389 sched_ttwu_pending(); 6389 sched_ttwu_pending();
6390 /* Update our root-domain */ 6390 /* Update our root-domain */
6391 raw_spin_lock_irqsave(&rq->lock, flags); 6391 raw_spin_lock_irqsave(&rq->lock, flags);
6392 if (rq->rd) { 6392 if (rq->rd) {
6393 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6393 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6394 set_rq_offline(rq); 6394 set_rq_offline(rq);
6395 } 6395 }
6396 migrate_tasks(cpu); 6396 migrate_tasks(cpu);
6397 BUG_ON(rq->nr_running != 1); /* the migration thread */ 6397 BUG_ON(rq->nr_running != 1); /* the migration thread */
6398 raw_spin_unlock_irqrestore(&rq->lock, flags); 6398 raw_spin_unlock_irqrestore(&rq->lock, flags);
6399 6399
6400 migrate_nr_uninterruptible(rq); 6400 migrate_nr_uninterruptible(rq);
6401 calc_global_load_remove(rq); 6401 calc_global_load_remove(rq);
6402 break; 6402 break;
6403 #endif 6403 #endif
6404 } 6404 }
6405 6405
6406 update_max_interval(); 6406 update_max_interval();
6407 6407
6408 return NOTIFY_OK; 6408 return NOTIFY_OK;
6409 } 6409 }
6410 6410
6411 /* 6411 /*
6412 * Register at high priority so that task migration (migrate_all_tasks) 6412 * Register at high priority so that task migration (migrate_all_tasks)
6413 * happens before everything else. This has to be lower priority than 6413 * happens before everything else. This has to be lower priority than
6414 * the notifier in the perf_event subsystem, though. 6414 * the notifier in the perf_event subsystem, though.
6415 */ 6415 */
6416 static struct notifier_block __cpuinitdata migration_notifier = { 6416 static struct notifier_block __cpuinitdata migration_notifier = {
6417 .notifier_call = migration_call, 6417 .notifier_call = migration_call,
6418 .priority = CPU_PRI_MIGRATION, 6418 .priority = CPU_PRI_MIGRATION,
6419 }; 6419 };
6420 6420
6421 static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 6421 static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6422 unsigned long action, void *hcpu) 6422 unsigned long action, void *hcpu)
6423 { 6423 {
6424 switch (action & ~CPU_TASKS_FROZEN) { 6424 switch (action & ~CPU_TASKS_FROZEN) {
6425 case CPU_ONLINE: 6425 case CPU_ONLINE:
6426 case CPU_DOWN_FAILED: 6426 case CPU_DOWN_FAILED:
6427 set_cpu_active((long)hcpu, true); 6427 set_cpu_active((long)hcpu, true);
6428 return NOTIFY_OK; 6428 return NOTIFY_OK;
6429 default: 6429 default:
6430 return NOTIFY_DONE; 6430 return NOTIFY_DONE;
6431 } 6431 }
6432 } 6432 }
6433 6433
6434 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 6434 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6435 unsigned long action, void *hcpu) 6435 unsigned long action, void *hcpu)
6436 { 6436 {
6437 switch (action & ~CPU_TASKS_FROZEN) { 6437 switch (action & ~CPU_TASKS_FROZEN) {
6438 case CPU_DOWN_PREPARE: 6438 case CPU_DOWN_PREPARE:
6439 set_cpu_active((long)hcpu, false); 6439 set_cpu_active((long)hcpu, false);
6440 return NOTIFY_OK; 6440 return NOTIFY_OK;
6441 default: 6441 default:
6442 return NOTIFY_DONE; 6442 return NOTIFY_DONE;
6443 } 6443 }
6444 } 6444 }
6445 6445
6446 static int __init migration_init(void) 6446 static int __init migration_init(void)
6447 { 6447 {
6448 void *cpu = (void *)(long)smp_processor_id(); 6448 void *cpu = (void *)(long)smp_processor_id();
6449 int err; 6449 int err;
6450 6450
6451 /* Initialize migration for the boot CPU */ 6451 /* Initialize migration for the boot CPU */
6452 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6452 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6453 BUG_ON(err == NOTIFY_BAD); 6453 BUG_ON(err == NOTIFY_BAD);
6454 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6454 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6455 register_cpu_notifier(&migration_notifier); 6455 register_cpu_notifier(&migration_notifier);
6456 6456
6457 /* Register cpu active notifiers */ 6457 /* Register cpu active notifiers */
6458 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 6458 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6459 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 6459 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6460 6460
6461 return 0; 6461 return 0;
6462 } 6462 }
6463 early_initcall(migration_init); 6463 early_initcall(migration_init);
6464 #endif 6464 #endif
6465 6465
6466 #ifdef CONFIG_SMP 6466 #ifdef CONFIG_SMP
6467 6467
6468 #ifdef CONFIG_SCHED_DEBUG 6468 #ifdef CONFIG_SCHED_DEBUG
6469 6469
6470 static __read_mostly int sched_domain_debug_enabled; 6470 static __read_mostly int sched_domain_debug_enabled;
6471 6471
6472 static int __init sched_domain_debug_setup(char *str) 6472 static int __init sched_domain_debug_setup(char *str)
6473 { 6473 {
6474 sched_domain_debug_enabled = 1; 6474 sched_domain_debug_enabled = 1;
6475 6475
6476 return 0; 6476 return 0;
6477 } 6477 }
6478 early_param("sched_debug", sched_domain_debug_setup); 6478 early_param("sched_debug", sched_domain_debug_setup);
6479 6479
6480 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6480 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6481 struct cpumask *groupmask) 6481 struct cpumask *groupmask)
6482 { 6482 {
6483 struct sched_group *group = sd->groups; 6483 struct sched_group *group = sd->groups;
6484 char str[256]; 6484 char str[256];
6485 6485
6486 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 6486 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6487 cpumask_clear(groupmask); 6487 cpumask_clear(groupmask);
6488 6488
6489 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6489 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6490 6490
6491 if (!(sd->flags & SD_LOAD_BALANCE)) { 6491 if (!(sd->flags & SD_LOAD_BALANCE)) {
6492 printk("does not load-balance\n"); 6492 printk("does not load-balance\n");
6493 if (sd->parent) 6493 if (sd->parent)
6494 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 6494 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6495 " has parent"); 6495 " has parent");
6496 return -1; 6496 return -1;
6497 } 6497 }
6498 6498
6499 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6499 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6500 6500
6501 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 6501 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6502 printk(KERN_ERR "ERROR: domain->span does not contain " 6502 printk(KERN_ERR "ERROR: domain->span does not contain "
6503 "CPU%d\n", cpu); 6503 "CPU%d\n", cpu);
6504 } 6504 }
6505 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 6505 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6506 printk(KERN_ERR "ERROR: domain->groups does not contain" 6506 printk(KERN_ERR "ERROR: domain->groups does not contain"
6507 " CPU%d\n", cpu); 6507 " CPU%d\n", cpu);
6508 } 6508 }
6509 6509
6510 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 6510 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6511 do { 6511 do {
6512 if (!group) { 6512 if (!group) {
6513 printk("\n"); 6513 printk("\n");
6514 printk(KERN_ERR "ERROR: group is NULL\n"); 6514 printk(KERN_ERR "ERROR: group is NULL\n");
6515 break; 6515 break;
6516 } 6516 }
6517 6517
6518 if (!group->cpu_power) { 6518 if (!group->cpu_power) {
6519 printk(KERN_CONT "\n"); 6519 printk(KERN_CONT "\n");
6520 printk(KERN_ERR "ERROR: domain->cpu_power not " 6520 printk(KERN_ERR "ERROR: domain->cpu_power not "
6521 "set\n"); 6521 "set\n");
6522 break; 6522 break;
6523 } 6523 }
6524 6524
6525 if (!cpumask_weight(sched_group_cpus(group))) { 6525 if (!cpumask_weight(sched_group_cpus(group))) {
6526 printk(KERN_CONT "\n"); 6526 printk(KERN_CONT "\n");
6527 printk(KERN_ERR "ERROR: empty group\n"); 6527 printk(KERN_ERR "ERROR: empty group\n");
6528 break; 6528 break;
6529 } 6529 }
6530 6530
6531 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 6531 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6532 printk(KERN_CONT "\n"); 6532 printk(KERN_CONT "\n");
6533 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6533 printk(KERN_ERR "ERROR: repeated CPUs\n");
6534 break; 6534 break;
6535 } 6535 }
6536 6536
6537 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 6537 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6538 6538
6539 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6539 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6540 6540
6541 printk(KERN_CONT " %s", str); 6541 printk(KERN_CONT " %s", str);
6542 if (group->cpu_power != SCHED_LOAD_SCALE) { 6542 if (group->cpu_power != SCHED_LOAD_SCALE) {
6543 printk(KERN_CONT " (cpu_power = %d)", 6543 printk(KERN_CONT " (cpu_power = %d)",
6544 group->cpu_power); 6544 group->cpu_power);
6545 } 6545 }
6546 6546
6547 group = group->next; 6547 group = group->next;
6548 } while (group != sd->groups); 6548 } while (group != sd->groups);
6549 printk(KERN_CONT "\n"); 6549 printk(KERN_CONT "\n");
6550 6550
6551 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 6551 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6552 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6552 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6553 6553
6554 if (sd->parent && 6554 if (sd->parent &&
6555 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 6555 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6556 printk(KERN_ERR "ERROR: parent span is not a superset " 6556 printk(KERN_ERR "ERROR: parent span is not a superset "
6557 "of domain->span\n"); 6557 "of domain->span\n");
6558 return 0; 6558 return 0;
6559 } 6559 }
6560 6560
6561 static void sched_domain_debug(struct sched_domain *sd, int cpu) 6561 static void sched_domain_debug(struct sched_domain *sd, int cpu)
6562 { 6562 {
6563 cpumask_var_t groupmask; 6563 cpumask_var_t groupmask;
6564 int level = 0; 6564 int level = 0;
6565 6565
6566 if (!sched_domain_debug_enabled) 6566 if (!sched_domain_debug_enabled)
6567 return; 6567 return;
6568 6568
6569 if (!sd) { 6569 if (!sd) {
6570 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6570 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6571 return; 6571 return;
6572 } 6572 }
6573 6573
6574 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6574 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6575 6575
6576 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { 6576 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6577 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6577 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6578 return; 6578 return;
6579 } 6579 }
6580 6580
6581 for (;;) { 6581 for (;;) {
6582 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6582 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6583 break; 6583 break;
6584 level++; 6584 level++;
6585 sd = sd->parent; 6585 sd = sd->parent;
6586 if (!sd) 6586 if (!sd)
6587 break; 6587 break;
6588 } 6588 }
6589 free_cpumask_var(groupmask); 6589 free_cpumask_var(groupmask);
6590 } 6590 }
6591 #else /* !CONFIG_SCHED_DEBUG */ 6591 #else /* !CONFIG_SCHED_DEBUG */
6592 # define sched_domain_debug(sd, cpu) do { } while (0) 6592 # define sched_domain_debug(sd, cpu) do { } while (0)
6593 #endif /* CONFIG_SCHED_DEBUG */ 6593 #endif /* CONFIG_SCHED_DEBUG */
6594 6594
6595 static int sd_degenerate(struct sched_domain *sd) 6595 static int sd_degenerate(struct sched_domain *sd)
6596 { 6596 {
6597 if (cpumask_weight(sched_domain_span(sd)) == 1) 6597 if (cpumask_weight(sched_domain_span(sd)) == 1)
6598 return 1; 6598 return 1;
6599 6599
6600 /* Following flags need at least 2 groups */ 6600 /* Following flags need at least 2 groups */
6601 if (sd->flags & (SD_LOAD_BALANCE | 6601 if (sd->flags & (SD_LOAD_BALANCE |
6602 SD_BALANCE_NEWIDLE | 6602 SD_BALANCE_NEWIDLE |
6603 SD_BALANCE_FORK | 6603 SD_BALANCE_FORK |
6604 SD_BALANCE_EXEC | 6604 SD_BALANCE_EXEC |
6605 SD_SHARE_CPUPOWER | 6605 SD_SHARE_CPUPOWER |
6606 SD_SHARE_PKG_RESOURCES)) { 6606 SD_SHARE_PKG_RESOURCES)) {
6607 if (sd->groups != sd->groups->next) 6607 if (sd->groups != sd->groups->next)
6608 return 0; 6608 return 0;
6609 } 6609 }
6610 6610
6611 /* Following flags don't use groups */ 6611 /* Following flags don't use groups */
6612 if (sd->flags & (SD_WAKE_AFFINE)) 6612 if (sd->flags & (SD_WAKE_AFFINE))
6613 return 0; 6613 return 0;
6614 6614
6615 return 1; 6615 return 1;
6616 } 6616 }
6617 6617
6618 static int 6618 static int
6619 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 6619 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6620 { 6620 {
6621 unsigned long cflags = sd->flags, pflags = parent->flags; 6621 unsigned long cflags = sd->flags, pflags = parent->flags;
6622 6622
6623 if (sd_degenerate(parent)) 6623 if (sd_degenerate(parent))
6624 return 1; 6624 return 1;
6625 6625
6626 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 6626 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6627 return 0; 6627 return 0;
6628 6628
6629 /* Flags needing groups don't count if only 1 group in parent */ 6629 /* Flags needing groups don't count if only 1 group in parent */
6630 if (parent->groups == parent->groups->next) { 6630 if (parent->groups == parent->groups->next) {
6631 pflags &= ~(SD_LOAD_BALANCE | 6631 pflags &= ~(SD_LOAD_BALANCE |
6632 SD_BALANCE_NEWIDLE | 6632 SD_BALANCE_NEWIDLE |
6633 SD_BALANCE_FORK | 6633 SD_BALANCE_FORK |
6634 SD_BALANCE_EXEC | 6634 SD_BALANCE_EXEC |
6635 SD_SHARE_CPUPOWER | 6635 SD_SHARE_CPUPOWER |
6636 SD_SHARE_PKG_RESOURCES); 6636 SD_SHARE_PKG_RESOURCES);
6637 if (nr_node_ids == 1) 6637 if (nr_node_ids == 1)
6638 pflags &= ~SD_SERIALIZE; 6638 pflags &= ~SD_SERIALIZE;
6639 } 6639 }
6640 if (~cflags & pflags) 6640 if (~cflags & pflags)
6641 return 0; 6641 return 0;
6642 6642
6643 return 1; 6643 return 1;
6644 } 6644 }
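
The key test above, if (~cflags & pflags), asks whether the parent still carries any flag the child lacks once the single-group flags have been masked off; only when it does not is the parent considered redundant. A small standalone illustration of that subset check follows; the flag values are made up for the example, not the kernel's.

    #include <stdio.h>

    #define SD_LOAD_BALANCE   0x01  /* illustrative values, not the kernel's */
    #define SD_BALANCE_EXEC   0x02
    #define SD_WAKE_AFFINE    0x04

    int main(void)
    {
            unsigned long cflags = SD_LOAD_BALANCE | SD_WAKE_AFFINE;  /* child */
            unsigned long pflags = SD_LOAD_BALANCE;                   /* parent */

            /* Non-zero means the parent has a flag the child lacks, so keep it. */
            printf("parent adds flags: %s\n", (~cflags & pflags) ? "yes" : "no");

            pflags |= SD_BALANCE_EXEC;      /* now the parent does add behaviour */
            printf("parent adds flags: %s\n", (~cflags & pflags) ? "yes" : "no");
            return 0;
    }
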
6645 6645
6646 static void free_rootdomain(struct root_domain *rd) 6646 static void free_rootdomain(struct root_domain *rd)
6647 { 6647 {
6648 synchronize_sched(); 6648 synchronize_sched();
6649 6649
6650 cpupri_cleanup(&rd->cpupri); 6650 cpupri_cleanup(&rd->cpupri);
6651 6651
6652 free_cpumask_var(rd->rto_mask); 6652 free_cpumask_var(rd->rto_mask);
6653 free_cpumask_var(rd->online); 6653 free_cpumask_var(rd->online);
6654 free_cpumask_var(rd->span); 6654 free_cpumask_var(rd->span);
6655 kfree(rd); 6655 kfree(rd);
6656 } 6656 }
6657 6657
6658 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6658 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6659 { 6659 {
6660 struct root_domain *old_rd = NULL; 6660 struct root_domain *old_rd = NULL;
6661 unsigned long flags; 6661 unsigned long flags;
6662 6662
6663 raw_spin_lock_irqsave(&rq->lock, flags); 6663 raw_spin_lock_irqsave(&rq->lock, flags);
6664 6664
6665 if (rq->rd) { 6665 if (rq->rd) {
6666 old_rd = rq->rd; 6666 old_rd = rq->rd;
6667 6667
6668 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 6668 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6669 set_rq_offline(rq); 6669 set_rq_offline(rq);
6670 6670
6671 cpumask_clear_cpu(rq->cpu, old_rd->span); 6671 cpumask_clear_cpu(rq->cpu, old_rd->span);
6672 6672
6673 /* 6673 /*
6674 		 * If we don't want to free the old_rd yet then		6674 		 * If we don't want to free the old_rd yet then
6675 * set old_rd to NULL to skip the freeing later 6675 * set old_rd to NULL to skip the freeing later
6676 * in this function: 6676 * in this function:
6677 */ 6677 */
6678 if (!atomic_dec_and_test(&old_rd->refcount)) 6678 if (!atomic_dec_and_test(&old_rd->refcount))
6679 old_rd = NULL; 6679 old_rd = NULL;
6680 } 6680 }
6681 6681
6682 atomic_inc(&rd->refcount); 6682 atomic_inc(&rd->refcount);
6683 rq->rd = rd; 6683 rq->rd = rd;
6684 6684
6685 cpumask_set_cpu(rq->cpu, rd->span); 6685 cpumask_set_cpu(rq->cpu, rd->span);
6686 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 6686 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6687 set_rq_online(rq); 6687 set_rq_online(rq);
6688 6688
6689 raw_spin_unlock_irqrestore(&rq->lock, flags); 6689 raw_spin_unlock_irqrestore(&rq->lock, flags);
6690 6690
6691 if (old_rd) 6691 if (old_rd)
6692 free_rootdomain(old_rd); 6692 free_rootdomain(old_rd);
6693 } 6693 }
6694 6694
6695 static int init_rootdomain(struct root_domain *rd) 6695 static int init_rootdomain(struct root_domain *rd)
6696 { 6696 {
6697 memset(rd, 0, sizeof(*rd)); 6697 memset(rd, 0, sizeof(*rd));
6698 6698
6699 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 6699 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6700 goto out; 6700 goto out;
6701 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 6701 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6702 goto free_span; 6702 goto free_span;
6703 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 6703 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6704 goto free_online; 6704 goto free_online;
6705 6705
6706 if (cpupri_init(&rd->cpupri) != 0) 6706 if (cpupri_init(&rd->cpupri) != 0)
6707 goto free_rto_mask; 6707 goto free_rto_mask;
6708 return 0; 6708 return 0;
6709 6709
6710 free_rto_mask: 6710 free_rto_mask:
6711 free_cpumask_var(rd->rto_mask); 6711 free_cpumask_var(rd->rto_mask);
6712 free_online: 6712 free_online:
6713 free_cpumask_var(rd->online); 6713 free_cpumask_var(rd->online);
6714 free_span: 6714 free_span:
6715 free_cpumask_var(rd->span); 6715 free_cpumask_var(rd->span);
6716 out: 6716 out:
6717 return -ENOMEM; 6717 return -ENOMEM;
6718 } 6718 }
6719 6719
6720 static void init_defrootdomain(void) 6720 static void init_defrootdomain(void)
6721 { 6721 {
6722 init_rootdomain(&def_root_domain); 6722 init_rootdomain(&def_root_domain);
6723 6723
6724 atomic_set(&def_root_domain.refcount, 1); 6724 atomic_set(&def_root_domain.refcount, 1);
6725 } 6725 }
6726 6726
6727 static struct root_domain *alloc_rootdomain(void) 6727 static struct root_domain *alloc_rootdomain(void)
6728 { 6728 {
6729 struct root_domain *rd; 6729 struct root_domain *rd;
6730 6730
6731 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 6731 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6732 if (!rd) 6732 if (!rd)
6733 return NULL; 6733 return NULL;
6734 6734
6735 if (init_rootdomain(rd) != 0) { 6735 if (init_rootdomain(rd) != 0) {
6736 kfree(rd); 6736 kfree(rd);
6737 return NULL; 6737 return NULL;
6738 } 6738 }
6739 6739
6740 return rd; 6740 return rd;
6741 } 6741 }
6742 6742
6743 /* 6743 /*
6744 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6744 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6745 * hold the hotplug lock. 6745 * hold the hotplug lock.
6746 */ 6746 */
6747 static void 6747 static void
6748 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 6748 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6749 { 6749 {
6750 struct rq *rq = cpu_rq(cpu); 6750 struct rq *rq = cpu_rq(cpu);
6751 struct sched_domain *tmp; 6751 struct sched_domain *tmp;
6752 6752
6753 for (tmp = sd; tmp; tmp = tmp->parent) 6753 for (tmp = sd; tmp; tmp = tmp->parent)
6754 tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); 6754 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6755 6755
6756 /* Remove the sched domains which do not contribute to scheduling. */ 6756 /* Remove the sched domains which do not contribute to scheduling. */
6757 for (tmp = sd; tmp; ) { 6757 for (tmp = sd; tmp; ) {
6758 struct sched_domain *parent = tmp->parent; 6758 struct sched_domain *parent = tmp->parent;
6759 if (!parent) 6759 if (!parent)
6760 break; 6760 break;
6761 6761
6762 if (sd_parent_degenerate(tmp, parent)) { 6762 if (sd_parent_degenerate(tmp, parent)) {
6763 tmp->parent = parent->parent; 6763 tmp->parent = parent->parent;
6764 if (parent->parent) 6764 if (parent->parent)
6765 parent->parent->child = tmp; 6765 parent->parent->child = tmp;
6766 } else 6766 } else
6767 tmp = tmp->parent; 6767 tmp = tmp->parent;
6768 } 6768 }
6769 6769
6770 if (sd && sd_degenerate(sd)) { 6770 if (sd && sd_degenerate(sd)) {
6771 sd = sd->parent; 6771 sd = sd->parent;
6772 if (sd) 6772 if (sd)
6773 sd->child = NULL; 6773 sd->child = NULL;
6774 } 6774 }
6775 6775
6776 sched_domain_debug(sd, cpu); 6776 sched_domain_debug(sd, cpu);
6777 6777
6778 rq_attach_root(rq, rd); 6778 rq_attach_root(rq, rd);
6779 rcu_assign_pointer(rq->sd, sd); 6779 rcu_assign_pointer(rq->sd, sd);
6780 } 6780 }
6781 6781
6782 /* cpus with isolated domains */ 6782 /* cpus with isolated domains */
6783 static cpumask_var_t cpu_isolated_map; 6783 static cpumask_var_t cpu_isolated_map;
6784 6784
6785 /* Setup the mask of cpus configured for isolated domains */ 6785 /* Setup the mask of cpus configured for isolated domains */
6786 static int __init isolated_cpu_setup(char *str) 6786 static int __init isolated_cpu_setup(char *str)
6787 { 6787 {
6788 alloc_bootmem_cpumask_var(&cpu_isolated_map); 6788 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6789 cpulist_parse(str, cpu_isolated_map); 6789 cpulist_parse(str, cpu_isolated_map);
6790 return 1; 6790 return 1;
6791 } 6791 }
6792 6792
6793 __setup("isolcpus=", isolated_cpu_setup); 6793 __setup("isolcpus=", isolated_cpu_setup);
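
As a usage note, cpulist_parse() accepts the ordinary cpu-list syntax, so booting with something like isolcpus=1,3-5 should keep CPUs 1 and 3 through 5 out of the domains built below; tasks then land on those CPUs only through explicit CPU affinity.
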
6794 6794
6795 /* 6795 /*
6796 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6796 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6797 	 * to a function which identifies what group (along with sched group) a CPU	6797 	 * to a function which identifies what group (along with sched group) a CPU
6798 	 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids	6798 	 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6799 * (due to the fact that we keep track of groups covered with a struct cpumask). 6799 * (due to the fact that we keep track of groups covered with a struct cpumask).
6800 * 6800 *
6801 * init_sched_build_groups will build a circular linked list of the groups 6801 * init_sched_build_groups will build a circular linked list of the groups
6802 * covered by the given span, and will set each group's ->cpumask correctly, 6802 * covered by the given span, and will set each group's ->cpumask correctly,
6803 * and ->cpu_power to 0. 6803 * and ->cpu_power to 0.
6804 */ 6804 */
6805 static void 6805 static void
6806 init_sched_build_groups(const struct cpumask *span, 6806 init_sched_build_groups(const struct cpumask *span,
6807 const struct cpumask *cpu_map, 6807 const struct cpumask *cpu_map,
6808 int (*group_fn)(int cpu, const struct cpumask *cpu_map, 6808 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6809 struct sched_group **sg, 6809 struct sched_group **sg,
6810 struct cpumask *tmpmask), 6810 struct cpumask *tmpmask),
6811 struct cpumask *covered, struct cpumask *tmpmask) 6811 struct cpumask *covered, struct cpumask *tmpmask)
6812 { 6812 {
6813 struct sched_group *first = NULL, *last = NULL; 6813 struct sched_group *first = NULL, *last = NULL;
6814 int i; 6814 int i;
6815 6815
6816 cpumask_clear(covered); 6816 cpumask_clear(covered);
6817 6817
6818 for_each_cpu(i, span) { 6818 for_each_cpu(i, span) {
6819 struct sched_group *sg; 6819 struct sched_group *sg;
6820 int group = group_fn(i, cpu_map, &sg, tmpmask); 6820 int group = group_fn(i, cpu_map, &sg, tmpmask);
6821 int j; 6821 int j;
6822 6822
6823 if (cpumask_test_cpu(i, covered)) 6823 if (cpumask_test_cpu(i, covered))
6824 continue; 6824 continue;
6825 6825
6826 cpumask_clear(sched_group_cpus(sg)); 6826 cpumask_clear(sched_group_cpus(sg));
6827 sg->cpu_power = 0; 6827 sg->cpu_power = 0;
6828 6828
6829 for_each_cpu(j, span) { 6829 for_each_cpu(j, span) {
6830 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6830 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6831 continue; 6831 continue;
6832 6832
6833 cpumask_set_cpu(j, covered); 6833 cpumask_set_cpu(j, covered);
6834 cpumask_set_cpu(j, sched_group_cpus(sg)); 6834 cpumask_set_cpu(j, sched_group_cpus(sg));
6835 } 6835 }
6836 if (!first) 6836 if (!first)
6837 first = sg; 6837 first = sg;
6838 if (last) 6838 if (last)
6839 last->next = sg; 6839 last->next = sg;
6840 last = sg; 6840 last = sg;
6841 } 6841 }
6842 last->next = first; 6842 last->next = first;
6843 } 6843 }
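
Note that the groups created above are threaded onto a singly linked ring, closed by the final last->next = first, which is why code elsewhere in this file walks them with do { ... } while (group != sd->groups) rather than testing for NULL. A self-contained sketch of building and walking such a ring is below; the struct is an invented stand-in for sched_group.

    #include <stdio.h>
    #include <stdlib.h>

    struct group {                  /* stand-in for struct sched_group */
            int id;
            struct group *next;
    };

    int main(void)
    {
            struct group *first = NULL, *last = NULL;

            for (int i = 0; i < 3; i++) {           /* build three groups */
                    struct group *sg = calloc(1, sizeof(*sg));
                    sg->id = i;
                    if (!first)
                            first = sg;
                    if (last)
                            last->next = sg;
                    last = sg;
            }
            last->next = first;                     /* close the ring */

            struct group *g = first;                /* walk it exactly once */
            do {
                    printf("group %d\n", g->id);
                    g = g->next;
            } while (g != first);
            return 0;
    }
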
6844 6844
6845 #define SD_NODES_PER_DOMAIN 16 6845 #define SD_NODES_PER_DOMAIN 16
6846 6846
6847 #ifdef CONFIG_NUMA 6847 #ifdef CONFIG_NUMA
6848 6848
6849 /** 6849 /**
6850 * find_next_best_node - find the next node to include in a sched_domain 6850 * find_next_best_node - find the next node to include in a sched_domain
6851 * @node: node whose sched_domain we're building 6851 * @node: node whose sched_domain we're building
6852 * @used_nodes: nodes already in the sched_domain 6852 * @used_nodes: nodes already in the sched_domain
6853 * 6853 *
6854 * Find the next node to include in a given scheduling domain. Simply 6854 * Find the next node to include in a given scheduling domain. Simply
6855 * finds the closest node not already in the @used_nodes map. 6855 * finds the closest node not already in the @used_nodes map.
6856 * 6856 *
6857 * Should use nodemask_t. 6857 * Should use nodemask_t.
6858 */ 6858 */
6859 static int find_next_best_node(int node, nodemask_t *used_nodes) 6859 static int find_next_best_node(int node, nodemask_t *used_nodes)
6860 { 6860 {
6861 int i, n, val, min_val, best_node = 0; 6861 int i, n, val, min_val, best_node = 0;
6862 6862
6863 min_val = INT_MAX; 6863 min_val = INT_MAX;
6864 6864
6865 for (i = 0; i < nr_node_ids; i++) { 6865 for (i = 0; i < nr_node_ids; i++) {
6866 /* Start at @node */ 6866 /* Start at @node */
6867 n = (node + i) % nr_node_ids; 6867 n = (node + i) % nr_node_ids;
6868 6868
6869 if (!nr_cpus_node(n)) 6869 if (!nr_cpus_node(n))
6870 continue; 6870 continue;
6871 6871
6872 /* Skip already used nodes */ 6872 /* Skip already used nodes */
6873 if (node_isset(n, *used_nodes)) 6873 if (node_isset(n, *used_nodes))
6874 continue; 6874 continue;
6875 6875
6876 /* Simple min distance search */ 6876 /* Simple min distance search */
6877 val = node_distance(node, n); 6877 val = node_distance(node, n);
6878 6878
6879 if (val < min_val) { 6879 if (val < min_val) {
6880 min_val = val; 6880 min_val = val;
6881 best_node = n; 6881 best_node = n;
6882 } 6882 }
6883 } 6883 }
6884 6884
6885 node_set(best_node, *used_nodes); 6885 node_set(best_node, *used_nodes);
6886 return best_node; 6886 return best_node;
6887 } 6887 }
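
find_next_best_node() is a greedy nearest-neighbour step: each call scans the nodes starting at @node, skips the ones already used, and returns the one with the smallest node_distance(). A standalone sketch of the same selection over an invented three-node distance table follows; it omits the CPU-less-node check for brevity.

    #include <stdio.h>
    #include <limits.h>
    #include <stdbool.h>

    #define NR_NODES 3

    /* Invented symmetric distances; node_distance(i, j) in the kernel. */
    static const int dist[NR_NODES][NR_NODES] = {
            { 10, 20, 40 },
            { 20, 10, 30 },
            { 40, 30, 10 },
    };

    static int find_next_best_node(int node, bool used[NR_NODES])
    {
            int best = 0, min_val = INT_MAX;

            for (int i = 0; i < NR_NODES; i++) {
                    int n = (node + i) % NR_NODES;  /* start scanning at @node */

                    if (used[n])                    /* skip nodes already chosen */
                            continue;
                    if (dist[node][n] < min_val) {
                            min_val = dist[node][n];
                            best = n;
                    }
            }
            used[best] = true;
            return best;
    }

    int main(void)
    {
            bool used[NR_NODES] = { [0] = true };   /* domain starts with node 0 */

            printf("next: %d\n", find_next_best_node(0, used));  /* -> 1 (dist 20) */
            printf("next: %d\n", find_next_best_node(0, used));  /* -> 2 (dist 40) */
            return 0;
    }
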
6888 6888
6889 /** 6889 /**
6890 * sched_domain_node_span - get a cpumask for a node's sched_domain 6890 * sched_domain_node_span - get a cpumask for a node's sched_domain
6891 * @node: node whose cpumask we're constructing 6891 * @node: node whose cpumask we're constructing
6892 * @span: resulting cpumask 6892 * @span: resulting cpumask
6893 * 6893 *
6894 * Given a node, construct a good cpumask for its sched_domain to span. It 6894 * Given a node, construct a good cpumask for its sched_domain to span. It
6895 * should be one that prevents unnecessary balancing, but also spreads tasks 6895 * should be one that prevents unnecessary balancing, but also spreads tasks
6896 * out optimally. 6896 * out optimally.
6897 */ 6897 */
6898 static void sched_domain_node_span(int node, struct cpumask *span) 6898 static void sched_domain_node_span(int node, struct cpumask *span)
6899 { 6899 {
6900 nodemask_t used_nodes; 6900 nodemask_t used_nodes;
6901 int i; 6901 int i;
6902 6902
6903 cpumask_clear(span); 6903 cpumask_clear(span);
6904 nodes_clear(used_nodes); 6904 nodes_clear(used_nodes);
6905 6905
6906 cpumask_or(span, span, cpumask_of_node(node)); 6906 cpumask_or(span, span, cpumask_of_node(node));
6907 node_set(node, used_nodes); 6907 node_set(node, used_nodes);
6908 6908
6909 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6909 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6910 int next_node = find_next_best_node(node, &used_nodes); 6910 int next_node = find_next_best_node(node, &used_nodes);
6911 6911
6912 cpumask_or(span, span, cpumask_of_node(next_node)); 6912 cpumask_or(span, span, cpumask_of_node(next_node));
6913 } 6913 }
6914 } 6914 }
6915 #endif /* CONFIG_NUMA */ 6915 #endif /* CONFIG_NUMA */
6916 6916
6917 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6917 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6918 6918
6919 /* 6919 /*
6920 * The cpus mask in sched_group and sched_domain hangs off the end. 6920 * The cpus mask in sched_group and sched_domain hangs off the end.
6921 * 6921 *
6922 	 * ( See the comments in include/linux/sched.h:struct sched_group		6922 	 * ( See the comments in include/linux/sched.h:struct sched_group
6923 * and struct sched_domain. ) 6923 * and struct sched_domain. )
6924 */ 6924 */
6925 struct static_sched_group { 6925 struct static_sched_group {
6926 struct sched_group sg; 6926 struct sched_group sg;
6927 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); 6927 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6928 }; 6928 };
6929 6929
6930 struct static_sched_domain { 6930 struct static_sched_domain {
6931 struct sched_domain sd; 6931 struct sched_domain sd;
6932 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6932 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
6933 }; 6933 };
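
The "hangs off the end" layout the comment above describes is the reason the dynamic allocations later in this file use kmalloc_node(sizeof(struct sched_group) + cpumask_size(), ...): the mask occupies trailing storage rather than a separate allocation. A tiny userspace sketch of that pattern with a flexible array member, using an invented struct, not the kernel's:

    #include <stdio.h>
    #include <stdlib.h>

    /* Invented stand-in: the CPU mask lives in memory that trails the struct. */
    struct group {
            long cpu_power;
            unsigned long cpus[];   /* flexible array member */
    };

    int main(void)
    {
            size_t mask_bytes = sizeof(unsigned long);  /* room for 64 CPUs here */
            struct group *sg = calloc(1, sizeof(*sg) + mask_bytes);

            sg->cpus[0] |= 1UL << 3;        /* "set CPU 3" in the trailing mask */
            printf("mask word: %#lx\n", sg->cpus[0]);
            free(sg);
            return 0;
    }
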
6934 6934
6935 struct s_data { 6935 struct s_data {
6936 #ifdef CONFIG_NUMA 6936 #ifdef CONFIG_NUMA
6937 int sd_allnodes; 6937 int sd_allnodes;
6938 cpumask_var_t domainspan; 6938 cpumask_var_t domainspan;
6939 cpumask_var_t covered; 6939 cpumask_var_t covered;
6940 cpumask_var_t notcovered; 6940 cpumask_var_t notcovered;
6941 #endif 6941 #endif
6942 cpumask_var_t nodemask; 6942 cpumask_var_t nodemask;
6943 cpumask_var_t this_sibling_map; 6943 cpumask_var_t this_sibling_map;
6944 cpumask_var_t this_core_map; 6944 cpumask_var_t this_core_map;
6945 cpumask_var_t this_book_map; 6945 cpumask_var_t this_book_map;
6946 cpumask_var_t send_covered; 6946 cpumask_var_t send_covered;
6947 cpumask_var_t tmpmask; 6947 cpumask_var_t tmpmask;
6948 struct sched_group **sched_group_nodes; 6948 struct sched_group **sched_group_nodes;
6949 struct root_domain *rd; 6949 struct root_domain *rd;
6950 }; 6950 };
6951 6951
6952 enum s_alloc { 6952 enum s_alloc {
6953 sa_sched_groups = 0, 6953 sa_sched_groups = 0,
6954 sa_rootdomain, 6954 sa_rootdomain,
6955 sa_tmpmask, 6955 sa_tmpmask,
6956 sa_send_covered, 6956 sa_send_covered,
6957 sa_this_book_map, 6957 sa_this_book_map,
6958 sa_this_core_map, 6958 sa_this_core_map,
6959 sa_this_sibling_map, 6959 sa_this_sibling_map,
6960 sa_nodemask, 6960 sa_nodemask,
6961 sa_sched_group_nodes, 6961 sa_sched_group_nodes,
6962 #ifdef CONFIG_NUMA 6962 #ifdef CONFIG_NUMA
6963 sa_notcovered, 6963 sa_notcovered,
6964 sa_covered, 6964 sa_covered,
6965 sa_domainspan, 6965 sa_domainspan,
6966 #endif 6966 #endif
6967 sa_none, 6967 sa_none,
6968 }; 6968 };
6969 6969
6970 /* 6970 /*
6971 * SMT sched-domains: 6971 * SMT sched-domains:
6972 */ 6972 */
6973 #ifdef CONFIG_SCHED_SMT 6973 #ifdef CONFIG_SCHED_SMT
6974 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 6974 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6975 static DEFINE_PER_CPU(struct static_sched_group, sched_groups); 6975 static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6976 6976
6977 static int 6977 static int
6978 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6978 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6979 struct sched_group **sg, struct cpumask *unused) 6979 struct sched_group **sg, struct cpumask *unused)
6980 { 6980 {
6981 if (sg) 6981 if (sg)
6982 *sg = &per_cpu(sched_groups, cpu).sg; 6982 *sg = &per_cpu(sched_groups, cpu).sg;
6983 return cpu; 6983 return cpu;
6984 } 6984 }
6985 #endif /* CONFIG_SCHED_SMT */ 6985 #endif /* CONFIG_SCHED_SMT */
6986 6986
6987 /* 6987 /*
6988 * multi-core sched-domains: 6988 * multi-core sched-domains:
6989 */ 6989 */
6990 #ifdef CONFIG_SCHED_MC 6990 #ifdef CONFIG_SCHED_MC
6991 static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6991 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6992 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6992 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6993 6993
6994 static int 6994 static int
6995 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6995 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6996 struct sched_group **sg, struct cpumask *mask) 6996 struct sched_group **sg, struct cpumask *mask)
6997 { 6997 {
6998 int group; 6998 int group;
6999 #ifdef CONFIG_SCHED_SMT 6999 #ifdef CONFIG_SCHED_SMT
7000 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 7000 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7001 group = cpumask_first(mask); 7001 group = cpumask_first(mask);
7002 #else 7002 #else
7003 group = cpu; 7003 group = cpu;
7004 #endif 7004 #endif
7005 if (sg) 7005 if (sg)
7006 *sg = &per_cpu(sched_group_core, group).sg; 7006 *sg = &per_cpu(sched_group_core, group).sg;
7007 return group; 7007 return group;
7008 } 7008 }
7009 #endif /* CONFIG_SCHED_MC */ 7009 #endif /* CONFIG_SCHED_MC */
7010 7010
7011 /* 7011 /*
7012 * book sched-domains: 7012 * book sched-domains:
7013 */ 7013 */
7014 #ifdef CONFIG_SCHED_BOOK 7014 #ifdef CONFIG_SCHED_BOOK
7015 static DEFINE_PER_CPU(struct static_sched_domain, book_domains); 7015 static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
7016 static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); 7016 static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
7017 7017
7018 static int 7018 static int
7019 cpu_to_book_group(int cpu, const struct cpumask *cpu_map, 7019 cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
7020 struct sched_group **sg, struct cpumask *mask) 7020 struct sched_group **sg, struct cpumask *mask)
7021 { 7021 {
7022 int group = cpu; 7022 int group = cpu;
7023 #ifdef CONFIG_SCHED_MC 7023 #ifdef CONFIG_SCHED_MC
7024 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7024 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7025 group = cpumask_first(mask); 7025 group = cpumask_first(mask);
7026 #elif defined(CONFIG_SCHED_SMT) 7026 #elif defined(CONFIG_SCHED_SMT)
7027 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 7027 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7028 group = cpumask_first(mask); 7028 group = cpumask_first(mask);
7029 #endif 7029 #endif
7030 if (sg) 7030 if (sg)
7031 *sg = &per_cpu(sched_group_book, group).sg; 7031 *sg = &per_cpu(sched_group_book, group).sg;
7032 return group; 7032 return group;
7033 } 7033 }
7034 #endif /* CONFIG_SCHED_BOOK */ 7034 #endif /* CONFIG_SCHED_BOOK */
7035 7035
7036 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 7036 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
7037 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 7037 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
7038 7038
7039 static int 7039 static int
7040 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 7040 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7041 struct sched_group **sg, struct cpumask *mask) 7041 struct sched_group **sg, struct cpumask *mask)
7042 { 7042 {
7043 int group; 7043 int group;
7044 #ifdef CONFIG_SCHED_BOOK 7044 #ifdef CONFIG_SCHED_BOOK
7045 cpumask_and(mask, cpu_book_mask(cpu), cpu_map); 7045 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
7046 group = cpumask_first(mask); 7046 group = cpumask_first(mask);
7047 #elif defined(CONFIG_SCHED_MC) 7047 #elif defined(CONFIG_SCHED_MC)
7048 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7048 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7049 group = cpumask_first(mask); 7049 group = cpumask_first(mask);
7050 #elif defined(CONFIG_SCHED_SMT) 7050 #elif defined(CONFIG_SCHED_SMT)
7051 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 7051 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7052 group = cpumask_first(mask); 7052 group = cpumask_first(mask);
7053 #else 7053 #else
7054 group = cpu; 7054 group = cpu;
7055 #endif 7055 #endif
7056 if (sg) 7056 if (sg)
7057 *sg = &per_cpu(sched_group_phys, group).sg; 7057 *sg = &per_cpu(sched_group_phys, group).sg;
7058 return group; 7058 return group;
7059 } 7059 }
7060 7060
7061 #ifdef CONFIG_NUMA 7061 #ifdef CONFIG_NUMA
7062 /* 7062 /*
7063 * The init_sched_build_groups can't handle what we want to do with node 7063 * The init_sched_build_groups can't handle what we want to do with node
7064 * groups, so roll our own. Now each node has its own list of groups which 7064 * groups, so roll our own. Now each node has its own list of groups which
7065 * gets dynamically allocated. 7065 * gets dynamically allocated.
7066 */ 7066 */
7067 static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 7067 static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
7068 static struct sched_group ***sched_group_nodes_bycpu; 7068 static struct sched_group ***sched_group_nodes_bycpu;
7069 7069
7070 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 7070 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
7071 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 7071 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7072 7072
7073 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 7073 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7074 struct sched_group **sg, 7074 struct sched_group **sg,
7075 struct cpumask *nodemask) 7075 struct cpumask *nodemask)
7076 { 7076 {
7077 int group; 7077 int group;
7078 7078
7079 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 7079 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7080 group = cpumask_first(nodemask); 7080 group = cpumask_first(nodemask);
7081 7081
7082 if (sg) 7082 if (sg)
7083 *sg = &per_cpu(sched_group_allnodes, group).sg; 7083 *sg = &per_cpu(sched_group_allnodes, group).sg;
7084 return group; 7084 return group;
7085 } 7085 }
7086 7086
7087 static void init_numa_sched_groups_power(struct sched_group *group_head) 7087 static void init_numa_sched_groups_power(struct sched_group *group_head)
7088 { 7088 {
7089 struct sched_group *sg = group_head; 7089 struct sched_group *sg = group_head;
7090 int j; 7090 int j;
7091 7091
7092 if (!sg) 7092 if (!sg)
7093 return; 7093 return;
7094 do { 7094 do {
7095 for_each_cpu(j, sched_group_cpus(sg)) { 7095 for_each_cpu(j, sched_group_cpus(sg)) {
7096 struct sched_domain *sd; 7096 struct sched_domain *sd;
7097 7097
7098 sd = &per_cpu(phys_domains, j).sd; 7098 sd = &per_cpu(phys_domains, j).sd;
7099 if (j != group_first_cpu(sd->groups)) { 7099 if (j != group_first_cpu(sd->groups)) {
7100 /* 7100 /*
7101 * Only add "power" once for each 7101 * Only add "power" once for each
7102 * physical package. 7102 * physical package.
7103 */ 7103 */
7104 continue; 7104 continue;
7105 } 7105 }
7106 7106
7107 sg->cpu_power += sd->groups->cpu_power; 7107 sg->cpu_power += sd->groups->cpu_power;
7108 } 7108 }
7109 sg = sg->next; 7109 sg = sg->next;
7110 } while (sg != group_head); 7110 } while (sg != group_head);
7111 } 7111 }
7112 7112
7113 static int build_numa_sched_groups(struct s_data *d, 7113 static int build_numa_sched_groups(struct s_data *d,
7114 const struct cpumask *cpu_map, int num) 7114 const struct cpumask *cpu_map, int num)
7115 { 7115 {
7116 struct sched_domain *sd; 7116 struct sched_domain *sd;
7117 struct sched_group *sg, *prev; 7117 struct sched_group *sg, *prev;
7118 int n, j; 7118 int n, j;
7119 7119
7120 cpumask_clear(d->covered); 7120 cpumask_clear(d->covered);
7121 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 7121 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
7122 if (cpumask_empty(d->nodemask)) { 7122 if (cpumask_empty(d->nodemask)) {
7123 d->sched_group_nodes[num] = NULL; 7123 d->sched_group_nodes[num] = NULL;
7124 goto out; 7124 goto out;
7125 } 7125 }
7126 7126
7127 sched_domain_node_span(num, d->domainspan); 7127 sched_domain_node_span(num, d->domainspan);
7128 cpumask_and(d->domainspan, d->domainspan, cpu_map); 7128 cpumask_and(d->domainspan, d->domainspan, cpu_map);
7129 7129
7130 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7130 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7131 GFP_KERNEL, num); 7131 GFP_KERNEL, num);
7132 if (!sg) { 7132 if (!sg) {
7133 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 7133 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
7134 num); 7134 num);
7135 return -ENOMEM; 7135 return -ENOMEM;
7136 } 7136 }
7137 d->sched_group_nodes[num] = sg; 7137 d->sched_group_nodes[num] = sg;
7138 7138
7139 for_each_cpu(j, d->nodemask) { 7139 for_each_cpu(j, d->nodemask) {
7140 sd = &per_cpu(node_domains, j).sd; 7140 sd = &per_cpu(node_domains, j).sd;
7141 sd->groups = sg; 7141 sd->groups = sg;
7142 } 7142 }
7143 7143
7144 sg->cpu_power = 0; 7144 sg->cpu_power = 0;
7145 cpumask_copy(sched_group_cpus(sg), d->nodemask); 7145 cpumask_copy(sched_group_cpus(sg), d->nodemask);
7146 sg->next = sg; 7146 sg->next = sg;
7147 cpumask_or(d->covered, d->covered, d->nodemask); 7147 cpumask_or(d->covered, d->covered, d->nodemask);
7148 7148
7149 prev = sg; 7149 prev = sg;
7150 for (j = 0; j < nr_node_ids; j++) { 7150 for (j = 0; j < nr_node_ids; j++) {
7151 n = (num + j) % nr_node_ids; 7151 n = (num + j) % nr_node_ids;
7152 cpumask_complement(d->notcovered, d->covered); 7152 cpumask_complement(d->notcovered, d->covered);
7153 cpumask_and(d->tmpmask, d->notcovered, cpu_map); 7153 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7154 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); 7154 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7155 if (cpumask_empty(d->tmpmask)) 7155 if (cpumask_empty(d->tmpmask))
7156 break; 7156 break;
7157 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); 7157 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7158 if (cpumask_empty(d->tmpmask)) 7158 if (cpumask_empty(d->tmpmask))
7159 continue; 7159 continue;
7160 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7160 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7161 GFP_KERNEL, num); 7161 GFP_KERNEL, num);
7162 if (!sg) { 7162 if (!sg) {
7163 printk(KERN_WARNING 7163 printk(KERN_WARNING
7164 "Can not alloc domain group for node %d\n", j); 7164 "Can not alloc domain group for node %d\n", j);
7165 return -ENOMEM; 7165 return -ENOMEM;
7166 } 7166 }
7167 sg->cpu_power = 0; 7167 sg->cpu_power = 0;
7168 cpumask_copy(sched_group_cpus(sg), d->tmpmask); 7168 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7169 sg->next = prev->next; 7169 sg->next = prev->next;
7170 cpumask_or(d->covered, d->covered, d->tmpmask); 7170 cpumask_or(d->covered, d->covered, d->tmpmask);
7171 prev->next = sg; 7171 prev->next = sg;
7172 prev = sg; 7172 prev = sg;
7173 } 7173 }
7174 out: 7174 out:
7175 return 0; 7175 return 0;
7176 } 7176 }
7177 #endif /* CONFIG_NUMA */ 7177 #endif /* CONFIG_NUMA */
7178 7178
7179 #ifdef CONFIG_NUMA 7179 #ifdef CONFIG_NUMA
7180 /* Free memory allocated for various sched_group structures */ 7180 /* Free memory allocated for various sched_group structures */
7181 static void free_sched_groups(const struct cpumask *cpu_map, 7181 static void free_sched_groups(const struct cpumask *cpu_map,
7182 struct cpumask *nodemask) 7182 struct cpumask *nodemask)
7183 { 7183 {
7184 int cpu, i; 7184 int cpu, i;
7185 7185
7186 for_each_cpu(cpu, cpu_map) { 7186 for_each_cpu(cpu, cpu_map) {
7187 struct sched_group **sched_group_nodes 7187 struct sched_group **sched_group_nodes
7188 = sched_group_nodes_bycpu[cpu]; 7188 = sched_group_nodes_bycpu[cpu];
7189 7189
7190 if (!sched_group_nodes) 7190 if (!sched_group_nodes)
7191 continue; 7191 continue;
7192 7192
7193 for (i = 0; i < nr_node_ids; i++) { 7193 for (i = 0; i < nr_node_ids; i++) {
7194 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7194 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7195 7195
7196 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7196 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7197 if (cpumask_empty(nodemask)) 7197 if (cpumask_empty(nodemask))
7198 continue; 7198 continue;
7199 7199
7200 if (sg == NULL) 7200 if (sg == NULL)
7201 continue; 7201 continue;
7202 sg = sg->next; 7202 sg = sg->next;
7203 next_sg: 7203 next_sg:
7204 oldsg = sg; 7204 oldsg = sg;
7205 sg = sg->next; 7205 sg = sg->next;
7206 kfree(oldsg); 7206 kfree(oldsg);
7207 if (oldsg != sched_group_nodes[i]) 7207 if (oldsg != sched_group_nodes[i])
7208 goto next_sg; 7208 goto next_sg;
7209 } 7209 }
7210 kfree(sched_group_nodes); 7210 kfree(sched_group_nodes);
7211 sched_group_nodes_bycpu[cpu] = NULL; 7211 sched_group_nodes_bycpu[cpu] = NULL;
7212 } 7212 }
7213 } 7213 }
7214 #else /* !CONFIG_NUMA */ 7214 #else /* !CONFIG_NUMA */
7215 static void free_sched_groups(const struct cpumask *cpu_map, 7215 static void free_sched_groups(const struct cpumask *cpu_map,
7216 struct cpumask *nodemask) 7216 struct cpumask *nodemask)
7217 { 7217 {
7218 } 7218 }
7219 #endif /* CONFIG_NUMA */ 7219 #endif /* CONFIG_NUMA */
7220 7220
7221 /* 7221 /*
7222 * Initialize sched groups cpu_power. 7222 * Initialize sched groups cpu_power.
7223 * 7223 *
7224 * cpu_power indicates the capacity of sched group, which is used while 7224 * cpu_power indicates the capacity of sched group, which is used while
7225 * distributing the load between different sched groups in a sched domain. 7225 * distributing the load between different sched groups in a sched domain.
7226 	 * Typically cpu_power for all the groups in a sched domain will be the same unless	7226 	 * Typically cpu_power for all the groups in a sched domain will be the same unless
7227 * there are asymmetries in the topology. If there are asymmetries, group 7227 * there are asymmetries in the topology. If there are asymmetries, group
7228 	 * having more cpu_power will pick up more load compared to the group having	7228 	 * having more cpu_power will pick up more load compared to the group having
7229 * less cpu_power. 7229 * less cpu_power.
7230 */ 7230 */
7231 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7231 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7232 { 7232 {
7233 struct sched_domain *child; 7233 struct sched_domain *child;
7234 struct sched_group *group; 7234 struct sched_group *group;
7235 long power; 7235 long power;
7236 int weight; 7236 int weight;
7237 7237
7238 WARN_ON(!sd || !sd->groups); 7238 WARN_ON(!sd || !sd->groups);
7239 7239
7240 if (cpu != group_first_cpu(sd->groups)) 7240 if (cpu != group_first_cpu(sd->groups))
7241 return; 7241 return;
7242 7242
7243 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7243 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7244 7244
7245 child = sd->child; 7245 child = sd->child;
7246 7246
7247 sd->groups->cpu_power = 0; 7247 sd->groups->cpu_power = 0;
7248 7248
7249 if (!child) { 7249 if (!child) {
7250 power = SCHED_LOAD_SCALE; 7250 power = SCHED_LOAD_SCALE;
7251 weight = cpumask_weight(sched_domain_span(sd)); 7251 weight = cpumask_weight(sched_domain_span(sd));
7252 /* 7252 /*
7253 * SMT siblings share the power of a single core. 7253 * SMT siblings share the power of a single core.
7254 * Usually multiple threads get a better yield out of 7254 * Usually multiple threads get a better yield out of
7255 * that one core than a single thread would have, 7255 * that one core than a single thread would have,
7256 * reflect that in sd->smt_gain. 7256 * reflect that in sd->smt_gain.
7257 */ 7257 */
7258 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 7258 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7259 power *= sd->smt_gain; 7259 power *= sd->smt_gain;
7260 power /= weight; 7260 power /= weight;
7261 power >>= SCHED_LOAD_SHIFT; 7261 power >>= SCHED_LOAD_SHIFT;
7262 } 7262 }
7263 sd->groups->cpu_power += power; 7263 sd->groups->cpu_power += power;
7264 return; 7264 return;
7265 } 7265 }
7266 7266
7267 /* 7267 /*
7268 	 * Add cpu_power of each child group to this group's cpu_power.	7268 	 * Add cpu_power of each child group to this group's cpu_power.
7269 */ 7269 */
7270 group = child->groups; 7270 group = child->groups;
7271 do { 7271 do {
7272 sd->groups->cpu_power += group->cpu_power; 7272 sd->groups->cpu_power += group->cpu_power;
7273 group = group->next; 7273 group = group->next;
7274 } while (group != child->groups); 7274 } while (group != child->groups);
7275 } 7275 }
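
For the leaf SMT case above the arithmetic is power = SCHED_LOAD_SCALE * smt_gain / weight, shifted down by SCHED_LOAD_SHIFT. Assuming the usual constants of this era, SCHED_LOAD_SHIFT = 10 (so SCHED_LOAD_SCALE = 1024) and a default smt_gain of 1178, a core with two hardware threads leaves each sibling-level group with 589 units, so the core as a whole advertises about 1.15 CPUs of capacity rather than 2048. A sketch of the calculation, with those constants taken as assumptions:

    #include <stdio.h>

    /* Assumed constants from this kernel era (not taken from the diff). */
    #define SCHED_LOAD_SHIFT 10
    #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)   /* 1024 */
    #define SMT_GAIN 1178                               /* ~1.15 * 1024 */

    int main(void)
    {
            long power = SCHED_LOAD_SCALE;
            int weight = 2;                 /* two hardware threads per core */

            power *= SMT_GAIN;
            power /= weight;
            power >>= SCHED_LOAD_SHIFT;

            printf("per-sibling cpu_power: %ld\n", power);          /* 589 */
            printf("whole core (summed):   %ld\n", power * weight); /* 1178 */
            return 0;
    }
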
7276 7276
7277 /* 7277 /*
7278 * Initializers for schedule domains 7278 * Initializers for schedule domains
7279 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7279 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7280 */ 7280 */
7281 7281
7282 #ifdef CONFIG_SCHED_DEBUG 7282 #ifdef CONFIG_SCHED_DEBUG
7283 # define SD_INIT_NAME(sd, type) sd->name = #type 7283 # define SD_INIT_NAME(sd, type) sd->name = #type
7284 #else 7284 #else
7285 # define SD_INIT_NAME(sd, type) do { } while (0) 7285 # define SD_INIT_NAME(sd, type) do { } while (0)
7286 #endif 7286 #endif
7287 7287
7288 #define SD_INIT(sd, type) sd_init_##type(sd) 7288 #define SD_INIT(sd, type) sd_init_##type(sd)
7289 7289
7290 #define SD_INIT_FUNC(type) \ 7290 #define SD_INIT_FUNC(type) \
7291 static noinline void sd_init_##type(struct sched_domain *sd) \ 7291 static noinline void sd_init_##type(struct sched_domain *sd) \
7292 { \ 7292 { \
7293 memset(sd, 0, sizeof(*sd)); \ 7293 memset(sd, 0, sizeof(*sd)); \
7294 *sd = SD_##type##_INIT; \ 7294 *sd = SD_##type##_INIT; \
7295 sd->level = SD_LV_##type; \ 7295 sd->level = SD_LV_##type; \
7296 SD_INIT_NAME(sd, type); \ 7296 SD_INIT_NAME(sd, type); \
7297 } 7297 }
7298 7298
7299 SD_INIT_FUNC(CPU) 7299 SD_INIT_FUNC(CPU)
7300 #ifdef CONFIG_NUMA 7300 #ifdef CONFIG_NUMA
7301 SD_INIT_FUNC(ALLNODES) 7301 SD_INIT_FUNC(ALLNODES)
7302 SD_INIT_FUNC(NODE) 7302 SD_INIT_FUNC(NODE)
7303 #endif 7303 #endif
7304 #ifdef CONFIG_SCHED_SMT 7304 #ifdef CONFIG_SCHED_SMT
7305 SD_INIT_FUNC(SIBLING) 7305 SD_INIT_FUNC(SIBLING)
7306 #endif 7306 #endif
7307 #ifdef CONFIG_SCHED_MC 7307 #ifdef CONFIG_SCHED_MC
7308 SD_INIT_FUNC(MC) 7308 SD_INIT_FUNC(MC)
7309 #endif 7309 #endif
7310 #ifdef CONFIG_SCHED_BOOK 7310 #ifdef CONFIG_SCHED_BOOK
7311 SD_INIT_FUNC(BOOK) 7311 SD_INIT_FUNC(BOOK)
7312 #endif 7312 #endif
7313 7313
7314 static int default_relax_domain_level = -1; 7314 static int default_relax_domain_level = -1;
7315 7315
7316 static int __init setup_relax_domain_level(char *str) 7316 static int __init setup_relax_domain_level(char *str)
7317 { 7317 {
7318 unsigned long val; 7318 unsigned long val;
7319 7319
7320 val = simple_strtoul(str, NULL, 0); 7320 val = simple_strtoul(str, NULL, 0);
7321 if (val < SD_LV_MAX) 7321 if (val < SD_LV_MAX)
7322 default_relax_domain_level = val; 7322 default_relax_domain_level = val;
7323 7323
7324 return 1; 7324 return 1;
7325 } 7325 }
7326 __setup("relax_domain_level=", setup_relax_domain_level); 7326 __setup("relax_domain_level=", setup_relax_domain_level);
7327 7327
7328 static void set_domain_attribute(struct sched_domain *sd, 7328 static void set_domain_attribute(struct sched_domain *sd,
7329 struct sched_domain_attr *attr) 7329 struct sched_domain_attr *attr)
7330 { 7330 {
7331 int request; 7331 int request;
7332 7332
7333 if (!attr || attr->relax_domain_level < 0) { 7333 if (!attr || attr->relax_domain_level < 0) {
7334 if (default_relax_domain_level < 0) 7334 if (default_relax_domain_level < 0)
7335 return; 7335 return;
7336 else 7336 else
7337 request = default_relax_domain_level; 7337 request = default_relax_domain_level;
7338 } else 7338 } else
7339 request = attr->relax_domain_level; 7339 request = attr->relax_domain_level;
7340 if (request < sd->level) { 7340 if (request < sd->level) {
7341 /* turn off idle balance on this domain */ 7341 /* turn off idle balance on this domain */
7342 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 7342 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7343 } else { 7343 } else {
7344 /* turn on idle balance on this domain */ 7344 /* turn on idle balance on this domain */
7345 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 7345 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7346 } 7346 }
7347 } 7347 }
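/*
 * Example, assuming the boot command line "relax_domain_level=1": the
 * __setup() handler above sets default_relax_domain_level to 1, so
 * set_domain_attribute() enables SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE on
 * domains whose level is <= 1 and clears those flags on higher (wider)
 * levels, unless the passed-in attr supplies its own relax_domain_level.
 */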
7348 7348
7349 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7349 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7350 const struct cpumask *cpu_map) 7350 const struct cpumask *cpu_map)
7351 { 7351 {
7352 switch (what) { 7352 switch (what) {
7353 case sa_sched_groups: 7353 case sa_sched_groups:
7354 free_sched_groups(cpu_map, d->tmpmask); /* fall through */ 7354 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7355 d->sched_group_nodes = NULL; 7355 d->sched_group_nodes = NULL;
7356 case sa_rootdomain: 7356 case sa_rootdomain:
7357 free_rootdomain(d->rd); /* fall through */ 7357 free_rootdomain(d->rd); /* fall through */
7358 case sa_tmpmask: 7358 case sa_tmpmask:
7359 free_cpumask_var(d->tmpmask); /* fall through */ 7359 free_cpumask_var(d->tmpmask); /* fall through */
7360 case sa_send_covered: 7360 case sa_send_covered:
7361 free_cpumask_var(d->send_covered); /* fall through */ 7361 free_cpumask_var(d->send_covered); /* fall through */
7362 case sa_this_book_map: 7362 case sa_this_book_map:
7363 free_cpumask_var(d->this_book_map); /* fall through */ 7363 free_cpumask_var(d->this_book_map); /* fall through */
7364 case sa_this_core_map: 7364 case sa_this_core_map:
7365 free_cpumask_var(d->this_core_map); /* fall through */ 7365 free_cpumask_var(d->this_core_map); /* fall through */
7366 case sa_this_sibling_map: 7366 case sa_this_sibling_map:
7367 free_cpumask_var(d->this_sibling_map); /* fall through */ 7367 free_cpumask_var(d->this_sibling_map); /* fall through */
7368 case sa_nodemask: 7368 case sa_nodemask:
7369 free_cpumask_var(d->nodemask); /* fall through */ 7369 free_cpumask_var(d->nodemask); /* fall through */
7370 case sa_sched_group_nodes: 7370 case sa_sched_group_nodes:
7371 #ifdef CONFIG_NUMA 7371 #ifdef CONFIG_NUMA
7372 kfree(d->sched_group_nodes); /* fall through */ 7372 kfree(d->sched_group_nodes); /* fall through */
7373 case sa_notcovered: 7373 case sa_notcovered:
7374 free_cpumask_var(d->notcovered); /* fall through */ 7374 free_cpumask_var(d->notcovered); /* fall through */
7375 case sa_covered: 7375 case sa_covered:
7376 free_cpumask_var(d->covered); /* fall through */ 7376 free_cpumask_var(d->covered); /* fall through */
7377 case sa_domainspan: 7377 case sa_domainspan:
7378 free_cpumask_var(d->domainspan); /* fall through */ 7378 free_cpumask_var(d->domainspan); /* fall through */
7379 #endif 7379 #endif
7380 case sa_none: 7380 case sa_none:
7381 break; 7381 break;
7382 } 7382 }
7383 } 7383 }
7384 7384
7385 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7385 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7386 const struct cpumask *cpu_map) 7386 const struct cpumask *cpu_map)
7387 { 7387 {
7388 #ifdef CONFIG_NUMA 7388 #ifdef CONFIG_NUMA
7389 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7389 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
7390 return sa_none; 7390 return sa_none;
7391 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7391 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
7392 return sa_domainspan; 7392 return sa_domainspan;
7393 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7393 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
7394 return sa_covered; 7394 return sa_covered;
7395 /* Allocate the per-node list of sched groups */ 7395 /* Allocate the per-node list of sched groups */
7396 d->sched_group_nodes = kcalloc(nr_node_ids, 7396 d->sched_group_nodes = kcalloc(nr_node_ids,
7397 sizeof(struct sched_group *), GFP_KERNEL); 7397 sizeof(struct sched_group *), GFP_KERNEL);
7398 if (!d->sched_group_nodes) { 7398 if (!d->sched_group_nodes) {
7399 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7399 printk(KERN_WARNING "Can not alloc sched group node list\n");
7400 return sa_notcovered; 7400 return sa_notcovered;
7401 } 7401 }
7402 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; 7402 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7403 #endif 7403 #endif
7404 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) 7404 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7405 return sa_sched_group_nodes; 7405 return sa_sched_group_nodes;
7406 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) 7406 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7407 return sa_nodemask; 7407 return sa_nodemask;
7408 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7408 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7409 return sa_this_sibling_map; 7409 return sa_this_sibling_map;
7410 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) 7410 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7411 return sa_this_core_map; 7411 return sa_this_core_map;
7412 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7412 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7413 return sa_this_book_map; 7413 return sa_this_book_map;
7414 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7414 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7415 return sa_send_covered; 7415 return sa_send_covered;
7416 d->rd = alloc_rootdomain(); 7416 d->rd = alloc_rootdomain();
7417 if (!d->rd) { 7417 if (!d->rd) {
7418 printk(KERN_WARNING "Cannot alloc root domain\n"); 7418 printk(KERN_WARNING "Cannot alloc root domain\n");
7419 return sa_tmpmask; 7419 return sa_tmpmask;
7420 } 7420 }
7421 return sa_rootdomain; 7421 return sa_rootdomain;
7422 } 7422 }
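/*
 * The enum s_alloc value returned above names the first allocation that
 * failed; the fall-through switch in __free_domain_allocs() then releases
 * exactly what was allocated before that point. For example, if
 * alloc_cpumask_var(&d->tmpmask, ...) fails, sa_send_covered is returned
 * and __free_domain_allocs(d, sa_send_covered, cpu_map) frees
 * send_covered, this_book_map, this_core_map, this_sibling_map, nodemask
 * and (on NUMA) the node allocations, but not the never-allocated tmpmask.
 */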
7423 7423
7424 static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7424 static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
7425 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7425 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
7426 { 7426 {
7427 struct sched_domain *sd = NULL; 7427 struct sched_domain *sd = NULL;
7428 #ifdef CONFIG_NUMA 7428 #ifdef CONFIG_NUMA
7429 struct sched_domain *parent; 7429 struct sched_domain *parent;
7430 7430
7431 d->sd_allnodes = 0; 7431 d->sd_allnodes = 0;
7432 if (cpumask_weight(cpu_map) > 7432 if (cpumask_weight(cpu_map) >
7433 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { 7433 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7434 sd = &per_cpu(allnodes_domains, i).sd; 7434 sd = &per_cpu(allnodes_domains, i).sd;
7435 SD_INIT(sd, ALLNODES); 7435 SD_INIT(sd, ALLNODES);
7436 set_domain_attribute(sd, attr); 7436 set_domain_attribute(sd, attr);
7437 cpumask_copy(sched_domain_span(sd), cpu_map); 7437 cpumask_copy(sched_domain_span(sd), cpu_map);
7438 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); 7438 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7439 d->sd_allnodes = 1; 7439 d->sd_allnodes = 1;
7440 } 7440 }
7441 parent = sd; 7441 parent = sd;
7442 7442
7443 sd = &per_cpu(node_domains, i).sd; 7443 sd = &per_cpu(node_domains, i).sd;
7444 SD_INIT(sd, NODE); 7444 SD_INIT(sd, NODE);
7445 set_domain_attribute(sd, attr); 7445 set_domain_attribute(sd, attr);
7446 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 7446 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7447 sd->parent = parent; 7447 sd->parent = parent;
7448 if (parent) 7448 if (parent)
7449 parent->child = sd; 7449 parent->child = sd;
7450 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); 7450 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7451 #endif 7451 #endif
7452 return sd; 7452 return sd;
7453 } 7453 }
7454 7454
7455 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7455 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
7456 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7456 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7457 struct sched_domain *parent, int i) 7457 struct sched_domain *parent, int i)
7458 { 7458 {
7459 struct sched_domain *sd; 7459 struct sched_domain *sd;
7460 sd = &per_cpu(phys_domains, i).sd; 7460 sd = &per_cpu(phys_domains, i).sd;
7461 SD_INIT(sd, CPU); 7461 SD_INIT(sd, CPU);
7462 set_domain_attribute(sd, attr); 7462 set_domain_attribute(sd, attr);
7463 cpumask_copy(sched_domain_span(sd), d->nodemask); 7463 cpumask_copy(sched_domain_span(sd), d->nodemask);
7464 sd->parent = parent; 7464 sd->parent = parent;
7465 if (parent) 7465 if (parent)
7466 parent->child = sd; 7466 parent->child = sd;
7467 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); 7467 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7468 return sd; 7468 return sd;
7469 } 7469 }
7470 7470
7471 static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7471 static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7472 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7472 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7473 struct sched_domain *parent, int i) 7473 struct sched_domain *parent, int i)
7474 { 7474 {
7475 struct sched_domain *sd = parent; 7475 struct sched_domain *sd = parent;
7476 #ifdef CONFIG_SCHED_BOOK 7476 #ifdef CONFIG_SCHED_BOOK
7477 sd = &per_cpu(book_domains, i).sd; 7477 sd = &per_cpu(book_domains, i).sd;
7478 SD_INIT(sd, BOOK); 7478 SD_INIT(sd, BOOK);
7479 set_domain_attribute(sd, attr); 7479 set_domain_attribute(sd, attr);
7480 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); 7480 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7481 sd->parent = parent; 7481 sd->parent = parent;
7482 parent->child = sd; 7482 parent->child = sd;
7483 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); 7483 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7484 #endif 7484 #endif
7485 return sd; 7485 return sd;
7486 } 7486 }
7487 7487
7488 static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7488 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
7489 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7489 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7490 struct sched_domain *parent, int i) 7490 struct sched_domain *parent, int i)
7491 { 7491 {
7492 struct sched_domain *sd = parent; 7492 struct sched_domain *sd = parent;
7493 #ifdef CONFIG_SCHED_MC 7493 #ifdef CONFIG_SCHED_MC
7494 sd = &per_cpu(core_domains, i).sd; 7494 sd = &per_cpu(core_domains, i).sd;
7495 SD_INIT(sd, MC); 7495 SD_INIT(sd, MC);
7496 set_domain_attribute(sd, attr); 7496 set_domain_attribute(sd, attr);
7497 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); 7497 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7498 sd->parent = parent; 7498 sd->parent = parent;
7499 parent->child = sd; 7499 parent->child = sd;
7500 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); 7500 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7501 #endif 7501 #endif
7502 return sd; 7502 return sd;
7503 } 7503 }
7504 7504
7505 static struct sched_domain *__build_smt_sched_domain(struct s_data *d, 7505 static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7506 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7506 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7507 struct sched_domain *parent, int i) 7507 struct sched_domain *parent, int i)
7508 { 7508 {
7509 struct sched_domain *sd = parent; 7509 struct sched_domain *sd = parent;
7510 #ifdef CONFIG_SCHED_SMT 7510 #ifdef CONFIG_SCHED_SMT
7511 sd = &per_cpu(cpu_domains, i).sd; 7511 sd = &per_cpu(cpu_domains, i).sd;
7512 SD_INIT(sd, SIBLING); 7512 SD_INIT(sd, SIBLING);
7513 set_domain_attribute(sd, attr); 7513 set_domain_attribute(sd, attr);
7514 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); 7514 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7515 sd->parent = parent; 7515 sd->parent = parent;
7516 parent->child = sd; 7516 parent->child = sd;
7517 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); 7517 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7518 #endif 7518 #endif
7519 return sd; 7519 return sd;
7520 } 7520 }
7521 7521
7522 static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7522 static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7523 const struct cpumask *cpu_map, int cpu) 7523 const struct cpumask *cpu_map, int cpu)
7524 { 7524 {
7525 switch (l) { 7525 switch (l) {
7526 #ifdef CONFIG_SCHED_SMT 7526 #ifdef CONFIG_SCHED_SMT
7527 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7527 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
7528 cpumask_and(d->this_sibling_map, cpu_map, 7528 cpumask_and(d->this_sibling_map, cpu_map,
7529 topology_thread_cpumask(cpu)); 7529 topology_thread_cpumask(cpu));
7530 if (cpu == cpumask_first(d->this_sibling_map)) 7530 if (cpu == cpumask_first(d->this_sibling_map))
7531 init_sched_build_groups(d->this_sibling_map, cpu_map, 7531 init_sched_build_groups(d->this_sibling_map, cpu_map,
7532 &cpu_to_cpu_group, 7532 &cpu_to_cpu_group,
7533 d->send_covered, d->tmpmask); 7533 d->send_covered, d->tmpmask);
7534 break; 7534 break;
7535 #endif 7535 #endif
7536 #ifdef CONFIG_SCHED_MC 7536 #ifdef CONFIG_SCHED_MC
7537 case SD_LV_MC: /* set up multi-core groups */ 7537 case SD_LV_MC: /* set up multi-core groups */
7538 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); 7538 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7539 if (cpu == cpumask_first(d->this_core_map)) 7539 if (cpu == cpumask_first(d->this_core_map))
7540 init_sched_build_groups(d->this_core_map, cpu_map, 7540 init_sched_build_groups(d->this_core_map, cpu_map,
7541 &cpu_to_core_group, 7541 &cpu_to_core_group,
7542 d->send_covered, d->tmpmask); 7542 d->send_covered, d->tmpmask);
7543 break; 7543 break;
7544 #endif 7544 #endif
7545 #ifdef CONFIG_SCHED_BOOK 7545 #ifdef CONFIG_SCHED_BOOK
7546 case SD_LV_BOOK: /* set up book groups */ 7546 case SD_LV_BOOK: /* set up book groups */
7547 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); 7547 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7548 if (cpu == cpumask_first(d->this_book_map)) 7548 if (cpu == cpumask_first(d->this_book_map))
7549 init_sched_build_groups(d->this_book_map, cpu_map, 7549 init_sched_build_groups(d->this_book_map, cpu_map,
7550 &cpu_to_book_group, 7550 &cpu_to_book_group,
7551 d->send_covered, d->tmpmask); 7551 d->send_covered, d->tmpmask);
7552 break; 7552 break;
7553 #endif 7553 #endif
7554 case SD_LV_CPU: /* set up physical groups */ 7554 case SD_LV_CPU: /* set up physical groups */
7555 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7555 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7556 if (!cpumask_empty(d->nodemask)) 7556 if (!cpumask_empty(d->nodemask))
7557 init_sched_build_groups(d->nodemask, cpu_map, 7557 init_sched_build_groups(d->nodemask, cpu_map,
7558 &cpu_to_phys_group, 7558 &cpu_to_phys_group,
7559 d->send_covered, d->tmpmask); 7559 d->send_covered, d->tmpmask);
7560 break; 7560 break;
7561 #ifdef CONFIG_NUMA 7561 #ifdef CONFIG_NUMA
7562 case SD_LV_ALLNODES: 7562 case SD_LV_ALLNODES:
7563 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7563 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
7564 d->send_covered, d->tmpmask); 7564 d->send_covered, d->tmpmask);
7565 break; 7565 break;
7566 #endif 7566 #endif
7567 default: 7567 default:
7568 break; 7568 break;
7569 } 7569 }
7570 } 7570 }
7571 7571
7572 /* 7572 /*
7573 * Build sched domains for a given set of cpus and attach the sched domains 7573 * Build sched domains for a given set of cpus and attach the sched domains
7574 * to the individual cpus 7574 * to the individual cpus
7575 */ 7575 */
7576 static int __build_sched_domains(const struct cpumask *cpu_map, 7576 static int __build_sched_domains(const struct cpumask *cpu_map,
7577 struct sched_domain_attr *attr) 7577 struct sched_domain_attr *attr)
7578 { 7578 {
7579 enum s_alloc alloc_state = sa_none; 7579 enum s_alloc alloc_state = sa_none;
7580 struct s_data d; 7580 struct s_data d;
7581 struct sched_domain *sd; 7581 struct sched_domain *sd;
7582 int i; 7582 int i;
7583 #ifdef CONFIG_NUMA 7583 #ifdef CONFIG_NUMA
7584 d.sd_allnodes = 0; 7584 d.sd_allnodes = 0;
7585 #endif 7585 #endif
7586 7586
7587 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7587 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7588 if (alloc_state != sa_rootdomain) 7588 if (alloc_state != sa_rootdomain)
7589 goto error; 7589 goto error;
7590 alloc_state = sa_sched_groups; 7590 alloc_state = sa_sched_groups;
7591 7591
7592 /* 7592 /*
7593 * Set up domains for cpus specified by the cpu_map. 7593 * Set up domains for cpus specified by the cpu_map.
7594 */ 7594 */
7595 for_each_cpu(i, cpu_map) { 7595 for_each_cpu(i, cpu_map) {
7596 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7596 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7597 cpu_map); 7597 cpu_map);
7598 7598
7599 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7599 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7600 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7600 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7601 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); 7601 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7602 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7602 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7603 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7603 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7604 } 7604 }
7605 7605
7606 for_each_cpu(i, cpu_map) { 7606 for_each_cpu(i, cpu_map) {
7607 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7607 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7608 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); 7608 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7609 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7609 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7610 } 7610 }
7611 7611
7612 /* Set up physical groups */ 7612 /* Set up physical groups */
7613 for (i = 0; i < nr_node_ids; i++) 7613 for (i = 0; i < nr_node_ids; i++)
7614 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7614 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7615 7615
7616 #ifdef CONFIG_NUMA 7616 #ifdef CONFIG_NUMA
7617 /* Set up node groups */ 7617 /* Set up node groups */
7618 if (d.sd_allnodes) 7618 if (d.sd_allnodes)
7619 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); 7619 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7620 7620
7621 for (i = 0; i < nr_node_ids; i++) 7621 for (i = 0; i < nr_node_ids; i++)
7622 if (build_numa_sched_groups(&d, cpu_map, i)) 7622 if (build_numa_sched_groups(&d, cpu_map, i))
7623 goto error; 7623 goto error;
7624 #endif 7624 #endif
7625 7625
7626 /* Calculate CPU power for physical packages and nodes */ 7626 /* Calculate CPU power for physical packages and nodes */
7627 #ifdef CONFIG_SCHED_SMT 7627 #ifdef CONFIG_SCHED_SMT
7628 for_each_cpu(i, cpu_map) { 7628 for_each_cpu(i, cpu_map) {
7629 sd = &per_cpu(cpu_domains, i).sd; 7629 sd = &per_cpu(cpu_domains, i).sd;
7630 init_sched_groups_power(i, sd); 7630 init_sched_groups_power(i, sd);
7631 } 7631 }
7632 #endif 7632 #endif
7633 #ifdef CONFIG_SCHED_MC 7633 #ifdef CONFIG_SCHED_MC
7634 for_each_cpu(i, cpu_map) { 7634 for_each_cpu(i, cpu_map) {
7635 sd = &per_cpu(core_domains, i).sd; 7635 sd = &per_cpu(core_domains, i).sd;
7636 init_sched_groups_power(i, sd); 7636 init_sched_groups_power(i, sd);
7637 } 7637 }
7638 #endif 7638 #endif
7639 #ifdef CONFIG_SCHED_BOOK 7639 #ifdef CONFIG_SCHED_BOOK
7640 for_each_cpu(i, cpu_map) { 7640 for_each_cpu(i, cpu_map) {
7641 sd = &per_cpu(book_domains, i).sd; 7641 sd = &per_cpu(book_domains, i).sd;
7642 init_sched_groups_power(i, sd); 7642 init_sched_groups_power(i, sd);
7643 } 7643 }
7644 #endif 7644 #endif
7645 7645
7646 for_each_cpu(i, cpu_map) { 7646 for_each_cpu(i, cpu_map) {
7647 sd = &per_cpu(phys_domains, i).sd; 7647 sd = &per_cpu(phys_domains, i).sd;
7648 init_sched_groups_power(i, sd); 7648 init_sched_groups_power(i, sd);
7649 } 7649 }
7650 7650
7651 #ifdef CONFIG_NUMA 7651 #ifdef CONFIG_NUMA
7652 for (i = 0; i < nr_node_ids; i++) 7652 for (i = 0; i < nr_node_ids; i++)
7653 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7653 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7654 7654
7655 if (d.sd_allnodes) { 7655 if (d.sd_allnodes) {
7656 struct sched_group *sg; 7656 struct sched_group *sg;
7657 7657
7658 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7658 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7659 d.tmpmask); 7659 d.tmpmask);
7660 init_numa_sched_groups_power(sg); 7660 init_numa_sched_groups_power(sg);
7661 } 7661 }
7662 #endif 7662 #endif
7663 7663
7664 /* Attach the domains */ 7664 /* Attach the domains */
7665 for_each_cpu(i, cpu_map) { 7665 for_each_cpu(i, cpu_map) {
7666 #ifdef CONFIG_SCHED_SMT 7666 #ifdef CONFIG_SCHED_SMT
7667 sd = &per_cpu(cpu_domains, i).sd; 7667 sd = &per_cpu(cpu_domains, i).sd;
7668 #elif defined(CONFIG_SCHED_MC) 7668 #elif defined(CONFIG_SCHED_MC)
7669 sd = &per_cpu(core_domains, i).sd; 7669 sd = &per_cpu(core_domains, i).sd;
7670 #elif defined(CONFIG_SCHED_BOOK) 7670 #elif defined(CONFIG_SCHED_BOOK)
7671 sd = &per_cpu(book_domains, i).sd; 7671 sd = &per_cpu(book_domains, i).sd;
7672 #else 7672 #else
7673 sd = &per_cpu(phys_domains, i).sd; 7673 sd = &per_cpu(phys_domains, i).sd;
7674 #endif 7674 #endif
7675 cpu_attach_domain(sd, d.rd, i); 7675 cpu_attach_domain(sd, d.rd, i);
7676 } 7676 }
7677 7677
7678 d.sched_group_nodes = NULL; /* don't free this, we still need it */ 7678 d.sched_group_nodes = NULL; /* don't free this, we still need it */
7679 __free_domain_allocs(&d, sa_tmpmask, cpu_map); 7679 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7680 return 0; 7680 return 0;
7681 7681
7682 error: 7682 error:
7683 __free_domain_allocs(&d, alloc_state, cpu_map); 7683 __free_domain_allocs(&d, alloc_state, cpu_map);
7684 return -ENOMEM; 7684 return -ENOMEM;
7685 } 7685 }
7686 7686
7687 static int build_sched_domains(const struct cpumask *cpu_map) 7687 static int build_sched_domains(const struct cpumask *cpu_map)
7688 { 7688 {
7689 return __build_sched_domains(cpu_map, NULL); 7689 return __build_sched_domains(cpu_map, NULL);
7690 } 7690 }
7691 7691
7692 static cpumask_var_t *doms_cur; /* current sched domains */ 7692 static cpumask_var_t *doms_cur; /* current sched domains */
7693 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7693 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7694 static struct sched_domain_attr *dattr_cur; 7694 static struct sched_domain_attr *dattr_cur;
7695 /* attributes of custom domains in 'doms_cur' */ 7695 /* attributes of custom domains in 'doms_cur' */
7696 7696
7697 /* 7697 /*
7698 * Special case: If a kmalloc of a doms_cur partition (array of 7698 * Special case: If a kmalloc of a doms_cur partition (array of
7699 * cpumask) fails, then fall back to a single sched domain, 7699 * cpumask) fails, then fall back to a single sched domain,
7700 * as determined by the single cpumask fallback_doms. 7700 * as determined by the single cpumask fallback_doms.
7701 */ 7701 */
7702 static cpumask_var_t fallback_doms; 7702 static cpumask_var_t fallback_doms;
7703 7703
7704 /* 7704 /*
7705 * arch_update_cpu_topology lets virtualized architectures update the 7705 * arch_update_cpu_topology lets virtualized architectures update the
7706 * cpu core maps. It is supposed to return 1 if the topology changed 7706 * cpu core maps. It is supposed to return 1 if the topology changed
7707 * or 0 if it stayed the same. 7707 * or 0 if it stayed the same.
7708 */ 7708 */
7709 int __attribute__((weak)) arch_update_cpu_topology(void) 7709 int __attribute__((weak)) arch_update_cpu_topology(void)
7710 { 7710 {
7711 return 0; 7711 return 0;
7712 } 7712 }
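/*
 * A hypothetical architecture override (sketch only; the helper name is
 * assumed) would provide a non-weak definition reporting whether the
 * core maps changed:
 *
 *	int arch_update_cpu_topology(void)
 *	{
 *		return my_arch_refresh_core_maps() ? 1 : 0;
 *	}
 */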
7713 7713
7714 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 7714 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7715 { 7715 {
7716 int i; 7716 int i;
7717 cpumask_var_t *doms; 7717 cpumask_var_t *doms;
7718 7718
7719 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 7719 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7720 if (!doms) 7720 if (!doms)
7721 return NULL; 7721 return NULL;
7722 for (i = 0; i < ndoms; i++) { 7722 for (i = 0; i < ndoms; i++) {
7723 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 7723 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7724 free_sched_domains(doms, i); 7724 free_sched_domains(doms, i);
7725 return NULL; 7725 return NULL;
7726 } 7726 }
7727 } 7727 }
7728 return doms; 7728 return doms;
7729 } 7729 }
7730 7730
7731 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 7731 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7732 { 7732 {
7733 unsigned int i; 7733 unsigned int i;
7734 for (i = 0; i < ndoms; i++) 7734 for (i = 0; i < ndoms; i++)
7735 free_cpumask_var(doms[i]); 7735 free_cpumask_var(doms[i]);
7736 kfree(doms); 7736 kfree(doms);
7737 } 7737 }
7738 7738
7739 /* 7739 /*
7740 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7740 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7741 * For now this just excludes isolated cpus, but could be used to 7741 * For now this just excludes isolated cpus, but could be used to
7742 * exclude other special cases in the future. 7742 * exclude other special cases in the future.
7743 */ 7743 */
7744 static int arch_init_sched_domains(const struct cpumask *cpu_map) 7744 static int arch_init_sched_domains(const struct cpumask *cpu_map)
7745 { 7745 {
7746 int err; 7746 int err;
7747 7747
7748 arch_update_cpu_topology(); 7748 arch_update_cpu_topology();
7749 ndoms_cur = 1; 7749 ndoms_cur = 1;
7750 doms_cur = alloc_sched_domains(ndoms_cur); 7750 doms_cur = alloc_sched_domains(ndoms_cur);
7751 if (!doms_cur) 7751 if (!doms_cur)
7752 doms_cur = &fallback_doms; 7752 doms_cur = &fallback_doms;
7753 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7753 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7754 dattr_cur = NULL; 7754 dattr_cur = NULL;
7755 err = build_sched_domains(doms_cur[0]); 7755 err = build_sched_domains(doms_cur[0]);
7756 register_sched_domain_sysctl(); 7756 register_sched_domain_sysctl();
7757 7757
7758 return err; 7758 return err;
7759 } 7759 }
7760 7760
7761 static void arch_destroy_sched_domains(const struct cpumask *cpu_map, 7761 static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7762 struct cpumask *tmpmask) 7762 struct cpumask *tmpmask)
7763 { 7763 {
7764 free_sched_groups(cpu_map, tmpmask); 7764 free_sched_groups(cpu_map, tmpmask);
7765 } 7765 }
7766 7766
7767 /* 7767 /*
7768 * Detach sched domains from a group of cpus specified in cpu_map 7768 * Detach sched domains from a group of cpus specified in cpu_map
7769 * These cpus will now be attached to the NULL domain 7769 * These cpus will now be attached to the NULL domain
7770 */ 7770 */
7771 static void detach_destroy_domains(const struct cpumask *cpu_map) 7771 static void detach_destroy_domains(const struct cpumask *cpu_map)
7772 { 7772 {
7773 /* Static tmpmask is safe because the hotplug lock is held. */ 7773 /* Static tmpmask is safe because the hotplug lock is held. */
7774 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); 7774 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7775 int i; 7775 int i;
7776 7776
7777 for_each_cpu(i, cpu_map) 7777 for_each_cpu(i, cpu_map)
7778 cpu_attach_domain(NULL, &def_root_domain, i); 7778 cpu_attach_domain(NULL, &def_root_domain, i);
7779 synchronize_sched(); 7779 synchronize_sched();
7780 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); 7780 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7781 } 7781 }
7782 7782
7783 /* handle null as "default" */ 7783 /* handle null as "default" */
7784 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 7784 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7785 struct sched_domain_attr *new, int idx_new) 7785 struct sched_domain_attr *new, int idx_new)
7786 { 7786 {
7787 struct sched_domain_attr tmp; 7787 struct sched_domain_attr tmp;
7788 7788
7789 /* fast path */ 7789 /* fast path */
7790 if (!new && !cur) 7790 if (!new && !cur)
7791 return 1; 7791 return 1;
7792 7792
7793 tmp = SD_ATTR_INIT; 7793 tmp = SD_ATTR_INIT;
7794 return !memcmp(cur ? (cur + idx_cur) : &tmp, 7794 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7795 new ? (new + idx_new) : &tmp, 7795 new ? (new + idx_new) : &tmp,
7796 sizeof(struct sched_domain_attr)); 7796 sizeof(struct sched_domain_attr));
7797 } 7797 }
7798 7798
7799 /* 7799 /*
7800 * Partition sched domains as specified by the 'ndoms_new' 7800 * Partition sched domains as specified by the 'ndoms_new'
7801 * cpumasks in the array doms_new[]. This compares 7801 * cpumasks in the array doms_new[]. This compares
7802 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7802 * doms_new[] to the current sched domain partitioning, doms_cur[].
7803 * It destroys each deleted domain and builds each new domain. 7803 * It destroys each deleted domain and builds each new domain.
7804 * 7804 *
7805 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 7805 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
7806 * The masks don't intersect (don't overlap). We should set up one 7806 * The masks don't intersect (don't overlap). We should set up one
7807 * sched domain for each mask. CPUs not in any of the cpumasks will 7807 * sched domain for each mask. CPUs not in any of the cpumasks will
7808 * not be load balanced. If the same cpumask appears both in the 7808 * not be load balanced. If the same cpumask appears both in the
7809 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7809 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7810 * it as it is. 7810 * it as it is.
7811 * 7811 *
7812 * The passed in 'doms_new' should be allocated using 7812 * The passed in 'doms_new' should be allocated using
7813 * alloc_sched_domains. This routine takes ownership of it and will 7813 * alloc_sched_domains. This routine takes ownership of it and will
7814 * free_sched_domains it when done with it. If the caller failed the 7814 * free_sched_domains it when done with it. If the caller failed the
7815 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 7815 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7816 * and partition_sched_domains() will fall back to the single partition 7816 * and partition_sched_domains() will fall back to the single partition
7817 * 'fallback_doms'; this also forces the domains to be rebuilt. 7817 * 'fallback_doms'; this also forces the domains to be rebuilt.
7818 * 7818 *
7819 * If doms_new == NULL it will be replaced with cpu_online_mask. 7819 * If doms_new == NULL it will be replaced with cpu_online_mask.
7820 * ndoms_new == 0 is a special case for destroying existing domains, 7820 * ndoms_new == 0 is a special case for destroying existing domains,
7821 * and it will not create the default domain. 7821 * and it will not create the default domain.
7822 * 7822 *
7823 * Call with hotplug lock held 7823 * Call with hotplug lock held
7824 */ 7824 */
7825 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 7825 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7826 struct sched_domain_attr *dattr_new) 7826 struct sched_domain_attr *dattr_new)
7827 { 7827 {
7828 int i, j, n; 7828 int i, j, n;
7829 int new_topology; 7829 int new_topology;
7830 7830
7831 mutex_lock(&sched_domains_mutex); 7831 mutex_lock(&sched_domains_mutex);
7832 7832
7833 /* always unregister in case we don't destroy any domains */ 7833 /* always unregister in case we don't destroy any domains */
7834 unregister_sched_domain_sysctl(); 7834 unregister_sched_domain_sysctl();
7835 7835
7836 /* Let architecture update cpu core mappings. */ 7836 /* Let architecture update cpu core mappings. */
7837 new_topology = arch_update_cpu_topology(); 7837 new_topology = arch_update_cpu_topology();
7838 7838
7839 n = doms_new ? ndoms_new : 0; 7839 n = doms_new ? ndoms_new : 0;
7840 7840
7841 /* Destroy deleted domains */ 7841 /* Destroy deleted domains */
7842 for (i = 0; i < ndoms_cur; i++) { 7842 for (i = 0; i < ndoms_cur; i++) {
7843 for (j = 0; j < n && !new_topology; j++) { 7843 for (j = 0; j < n && !new_topology; j++) {
7844 if (cpumask_equal(doms_cur[i], doms_new[j]) 7844 if (cpumask_equal(doms_cur[i], doms_new[j])
7845 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7845 && dattrs_equal(dattr_cur, i, dattr_new, j))
7846 goto match1; 7846 goto match1;
7847 } 7847 }
7848 /* no match - a current sched domain not in new doms_new[] */ 7848 /* no match - a current sched domain not in new doms_new[] */
7849 detach_destroy_domains(doms_cur[i]); 7849 detach_destroy_domains(doms_cur[i]);
7850 match1: 7850 match1:
7851 ; 7851 ;
7852 } 7852 }
7853 7853
7854 if (doms_new == NULL) { 7854 if (doms_new == NULL) {
7855 ndoms_cur = 0; 7855 ndoms_cur = 0;
7856 doms_new = &fallback_doms; 7856 doms_new = &fallback_doms;
7857 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7857 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7858 WARN_ON_ONCE(dattr_new); 7858 WARN_ON_ONCE(dattr_new);
7859 } 7859 }
7860 7860
7861 /* Build new domains */ 7861 /* Build new domains */
7862 for (i = 0; i < ndoms_new; i++) { 7862 for (i = 0; i < ndoms_new; i++) {
7863 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7863 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7864 if (cpumask_equal(doms_new[i], doms_cur[j]) 7864 if (cpumask_equal(doms_new[i], doms_cur[j])
7865 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7865 && dattrs_equal(dattr_new, i, dattr_cur, j))
7866 goto match2; 7866 goto match2;
7867 } 7867 }
7868 /* no match - add a new doms_new */ 7868 /* no match - add a new doms_new */
7869 __build_sched_domains(doms_new[i], 7869 __build_sched_domains(doms_new[i],
7870 dattr_new ? dattr_new + i : NULL); 7870 dattr_new ? dattr_new + i : NULL);
7871 match2: 7871 match2:
7872 ; 7872 ;
7873 } 7873 }
7874 7874
7875 /* Remember the new sched domains */ 7875 /* Remember the new sched domains */
7876 if (doms_cur != &fallback_doms) 7876 if (doms_cur != &fallback_doms)
7877 free_sched_domains(doms_cur, ndoms_cur); 7877 free_sched_domains(doms_cur, ndoms_cur);
7878 kfree(dattr_cur); /* kfree(NULL) is safe */ 7878 kfree(dattr_cur); /* kfree(NULL) is safe */
7879 doms_cur = doms_new; 7879 doms_cur = doms_new;
7880 dattr_cur = dattr_new; 7880 dattr_cur = dattr_new;
7881 ndoms_cur = ndoms_new; 7881 ndoms_cur = ndoms_new;
7882 7882
7883 register_sched_domain_sysctl(); 7883 register_sched_domain_sysctl();
7884 7884
7885 mutex_unlock(&sched_domains_mutex); 7885 mutex_unlock(&sched_domains_mutex);
7886 } 7886 }
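/*
 * Minimal caller sketch (illustrative; 'my_span' is an assumed cpumask):
 * allocate one partition with alloc_sched_domains() and hand ownership
 * to partition_sched_domains() while holding the hotplug lock, as the
 * comment above requires:
 *
 *	cpumask_var_t *doms = alloc_sched_domains(1);
 *
 *	if (doms)
 *		cpumask_copy(doms[0], my_span);
 *	get_online_cpus();
 *	partition_sched_domains(1, doms, NULL);
 *	put_online_cpus();
 *
 * If the allocation failed (doms == NULL), this degenerates to the
 * documented fallback: a single partition built from 'fallback_doms'.
 */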
7887 7887
7888 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7888 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7889 static void arch_reinit_sched_domains(void) 7889 static void arch_reinit_sched_domains(void)
7890 { 7890 {
7891 get_online_cpus(); 7891 get_online_cpus();
7892 7892
7893 /* Destroy domains first to force the rebuild */ 7893 /* Destroy domains first to force the rebuild */
7894 partition_sched_domains(0, NULL, NULL); 7894 partition_sched_domains(0, NULL, NULL);
7895 7895
7896 rebuild_sched_domains(); 7896 rebuild_sched_domains();
7897 put_online_cpus(); 7897 put_online_cpus();
7898 } 7898 }
7899 7899
7900 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7900 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7901 { 7901 {
7902 unsigned int level = 0; 7902 unsigned int level = 0;
7903 7903
7904 if (sscanf(buf, "%u", &level) != 1) 7904 if (sscanf(buf, "%u", &level) != 1)
7905 return -EINVAL; 7905 return -EINVAL;
7906 7906
7907 /* 7907 /*
7908 * level is always positive, so don't check for 7908 * level is always positive, so don't check for
7909 * level < POWERSAVINGS_BALANCE_NONE which is 0 7909 * level < POWERSAVINGS_BALANCE_NONE which is 0
7910 * What happens on a 0 or 1 byte write? 7910 * What happens on a 0 or 1 byte write?
7911 * Do we need to check count as well? 7911 * Do we need to check count as well?
7912 */ 7912 */
7913 7913
7914 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 7914 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7915 return -EINVAL; 7915 return -EINVAL;
7916 7916
7917 if (smt) 7917 if (smt)
7918 sched_smt_power_savings = level; 7918 sched_smt_power_savings = level;
7919 else 7919 else
7920 sched_mc_power_savings = level; 7920 sched_mc_power_savings = level;
7921 7921
7922 arch_reinit_sched_domains(); 7922 arch_reinit_sched_domains();
7923 7923
7924 return count; 7924 return count;
7925 } 7925 }
7926 7926
7927 #ifdef CONFIG_SCHED_MC 7927 #ifdef CONFIG_SCHED_MC
7928 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7928 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7929 struct sysdev_class_attribute *attr, 7929 struct sysdev_class_attribute *attr,
7930 char *page) 7930 char *page)
7931 { 7931 {
7932 return sprintf(page, "%u\n", sched_mc_power_savings); 7932 return sprintf(page, "%u\n", sched_mc_power_savings);
7933 } 7933 }
7934 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7934 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7935 struct sysdev_class_attribute *attr, 7935 struct sysdev_class_attribute *attr,
7936 const char *buf, size_t count) 7936 const char *buf, size_t count)
7937 { 7937 {
7938 return sched_power_savings_store(buf, count, 0); 7938 return sched_power_savings_store(buf, count, 0);
7939 } 7939 }
7940 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 7940 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7941 sched_mc_power_savings_show, 7941 sched_mc_power_savings_show,
7942 sched_mc_power_savings_store); 7942 sched_mc_power_savings_store);
7943 #endif 7943 #endif
7944 7944
7945 #ifdef CONFIG_SCHED_SMT 7945 #ifdef CONFIG_SCHED_SMT
7946 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7946 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7947 struct sysdev_class_attribute *attr, 7947 struct sysdev_class_attribute *attr,
7948 char *page) 7948 char *page)
7949 { 7949 {
7950 return sprintf(page, "%u\n", sched_smt_power_savings); 7950 return sprintf(page, "%u\n", sched_smt_power_savings);
7951 } 7951 }
7952 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7952 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7953 struct sysdev_class_attribute *attr, 7953 struct sysdev_class_attribute *attr,
7954 const char *buf, size_t count) 7954 const char *buf, size_t count)
7955 { 7955 {
7956 return sched_power_savings_store(buf, count, 1); 7956 return sched_power_savings_store(buf, count, 1);
7957 } 7957 }
7958 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 7958 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7959 sched_smt_power_savings_show, 7959 sched_smt_power_savings_show,
7960 sched_smt_power_savings_store); 7960 sched_smt_power_savings_store);
7961 #endif 7961 #endif
7962 7962
7963 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 7963 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7964 { 7964 {
7965 int err = 0; 7965 int err = 0;
7966 7966
7967 #ifdef CONFIG_SCHED_SMT 7967 #ifdef CONFIG_SCHED_SMT
7968 if (smt_capable()) 7968 if (smt_capable())
7969 err = sysfs_create_file(&cls->kset.kobj, 7969 err = sysfs_create_file(&cls->kset.kobj,
7970 &attr_sched_smt_power_savings.attr); 7970 &attr_sched_smt_power_savings.attr);
7971 #endif 7971 #endif
7972 #ifdef CONFIG_SCHED_MC 7972 #ifdef CONFIG_SCHED_MC
7973 if (!err && mc_capable()) 7973 if (!err && mc_capable())
7974 err = sysfs_create_file(&cls->kset.kobj, 7974 err = sysfs_create_file(&cls->kset.kobj,
7975 &attr_sched_mc_power_savings.attr); 7975 &attr_sched_mc_power_savings.attr);
7976 #endif 7976 #endif
7977 return err; 7977 return err;
7978 } 7978 }
7979 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7979 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
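/*
 * Usage note (paths assumed): when these attributes are registered under
 * the cpu sysdev class they typically show up as
 * /sys/devices/system/cpu/sched_mc_power_savings and
 * /sys/devices/system/cpu/sched_smt_power_savings, so e.g.
 * "echo 1 > /sys/devices/system/cpu/sched_mc_power_savings" stores level 1
 * and triggers arch_reinit_sched_domains() to rebuild the domains.
 */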
7980 7980
7981 /* 7981 /*
7982 * Update cpusets according to cpu_active mask. If cpusets are 7982 * Update cpusets according to cpu_active mask. If cpusets are
7983 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7983 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7984 * around partition_sched_domains(). 7984 * around partition_sched_domains().
7985 */ 7985 */
7986 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7986 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7987 void *hcpu) 7987 void *hcpu)
7988 { 7988 {
7989 switch (action & ~CPU_TASKS_FROZEN) { 7989 switch (action & ~CPU_TASKS_FROZEN) {
7990 case CPU_ONLINE: 7990 case CPU_ONLINE:
7991 case CPU_DOWN_FAILED: 7991 case CPU_DOWN_FAILED:
7992 cpuset_update_active_cpus(); 7992 cpuset_update_active_cpus();
7993 return NOTIFY_OK; 7993 return NOTIFY_OK;
7994 default: 7994 default:
7995 return NOTIFY_DONE; 7995 return NOTIFY_DONE;
7996 } 7996 }
7997 } 7997 }
7998 7998
7999 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7999 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8000 void *hcpu) 8000 void *hcpu)
8001 { 8001 {
8002 switch (action & ~CPU_TASKS_FROZEN) { 8002 switch (action & ~CPU_TASKS_FROZEN) {
8003 case CPU_DOWN_PREPARE: 8003 case CPU_DOWN_PREPARE:
8004 cpuset_update_active_cpus(); 8004 cpuset_update_active_cpus();
8005 return NOTIFY_OK; 8005 return NOTIFY_OK;
8006 default: 8006 default:
8007 return NOTIFY_DONE; 8007 return NOTIFY_DONE;
8008 } 8008 }
8009 } 8009 }
8010 8010
8011 static int update_runtime(struct notifier_block *nfb, 8011 static int update_runtime(struct notifier_block *nfb,
8012 unsigned long action, void *hcpu) 8012 unsigned long action, void *hcpu)
8013 { 8013 {
8014 int cpu = (int)(long)hcpu; 8014 int cpu = (int)(long)hcpu;
8015 8015
8016 switch (action) { 8016 switch (action) {
8017 case CPU_DOWN_PREPARE: 8017 case CPU_DOWN_PREPARE:
8018 case CPU_DOWN_PREPARE_FROZEN: 8018 case CPU_DOWN_PREPARE_FROZEN:
8019 disable_runtime(cpu_rq(cpu)); 8019 disable_runtime(cpu_rq(cpu));
8020 return NOTIFY_OK; 8020 return NOTIFY_OK;
8021 8021
8022 case CPU_DOWN_FAILED: 8022 case CPU_DOWN_FAILED:
8023 case CPU_DOWN_FAILED_FROZEN: 8023 case CPU_DOWN_FAILED_FROZEN:
8024 case CPU_ONLINE: 8024 case CPU_ONLINE:
8025 case CPU_ONLINE_FROZEN: 8025 case CPU_ONLINE_FROZEN:
8026 enable_runtime(cpu_rq(cpu)); 8026 enable_runtime(cpu_rq(cpu));
8027 return NOTIFY_OK; 8027 return NOTIFY_OK;
8028 8028
8029 default: 8029 default:
8030 return NOTIFY_DONE; 8030 return NOTIFY_DONE;
8031 } 8031 }
8032 } 8032 }
8033 8033
8034 void __init sched_init_smp(void) 8034 void __init sched_init_smp(void)
8035 { 8035 {
8036 cpumask_var_t non_isolated_cpus; 8036 cpumask_var_t non_isolated_cpus;
8037 8037
8038 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 8038 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
8039 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 8039 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8040 8040
8041 #if defined(CONFIG_NUMA) 8041 #if defined(CONFIG_NUMA)
8042 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 8042 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
8043 GFP_KERNEL); 8043 GFP_KERNEL);
8044 BUG_ON(sched_group_nodes_bycpu == NULL); 8044 BUG_ON(sched_group_nodes_bycpu == NULL);
8045 #endif 8045 #endif
8046 get_online_cpus(); 8046 get_online_cpus();
8047 mutex_lock(&sched_domains_mutex); 8047 mutex_lock(&sched_domains_mutex);
8048 arch_init_sched_domains(cpu_active_mask); 8048 arch_init_sched_domains(cpu_active_mask);
8049 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 8049 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8050 if (cpumask_empty(non_isolated_cpus)) 8050 if (cpumask_empty(non_isolated_cpus))
8051 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 8051 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8052 mutex_unlock(&sched_domains_mutex); 8052 mutex_unlock(&sched_domains_mutex);
8053 put_online_cpus(); 8053 put_online_cpus();
8054 8054
8055 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 8055 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
8056 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 8056 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
8057 8057
8058 /* RT runtime code needs to handle some hotplug events */ 8058 /* RT runtime code needs to handle some hotplug events */
8059 hotcpu_notifier(update_runtime, 0); 8059 hotcpu_notifier(update_runtime, 0);
8060 8060
8061 init_hrtick(); 8061 init_hrtick();
8062 8062
8063 /* Move init over to a non-isolated CPU */ 8063 /* Move init over to a non-isolated CPU */
8064 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 8064 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8065 BUG(); 8065 BUG();
8066 sched_init_granularity(); 8066 sched_init_granularity();
8067 free_cpumask_var(non_isolated_cpus); 8067 free_cpumask_var(non_isolated_cpus);
8068 8068
8069 init_sched_rt_class(); 8069 init_sched_rt_class();
8070 } 8070 }
8071 #else 8071 #else
8072 void __init sched_init_smp(void) 8072 void __init sched_init_smp(void)
8073 { 8073 {
8074 sched_init_granularity(); 8074 sched_init_granularity();
8075 } 8075 }
8076 #endif /* CONFIG_SMP */ 8076 #endif /* CONFIG_SMP */
8077 8077
8078 const_debug unsigned int sysctl_timer_migration = 1; 8078 const_debug unsigned int sysctl_timer_migration = 1;
8079 8079
8080 int in_sched_functions(unsigned long addr) 8080 int in_sched_functions(unsigned long addr)
8081 { 8081 {
8082 return in_lock_functions(addr) || 8082 return in_lock_functions(addr) ||
8083 (addr >= (unsigned long)__sched_text_start 8083 (addr >= (unsigned long)__sched_text_start
8084 && addr < (unsigned long)__sched_text_end); 8084 && addr < (unsigned long)__sched_text_end);
8085 } 8085 }
8086 8086
8087 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 8087 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
8088 { 8088 {
8089 cfs_rq->tasks_timeline = RB_ROOT; 8089 cfs_rq->tasks_timeline = RB_ROOT;
8090 INIT_LIST_HEAD(&cfs_rq->tasks); 8090 INIT_LIST_HEAD(&cfs_rq->tasks);
8091 #ifdef CONFIG_FAIR_GROUP_SCHED 8091 #ifdef CONFIG_FAIR_GROUP_SCHED
8092 cfs_rq->rq = rq; 8092 cfs_rq->rq = rq;
8093 /* allow initial update_cfs_load() to truncate */ 8093 /* allow initial update_cfs_load() to truncate */
8094 #ifdef CONFIG_SMP 8094 #ifdef CONFIG_SMP
8095 cfs_rq->load_stamp = 1; 8095 cfs_rq->load_stamp = 1;
8096 #endif 8096 #endif
8097 #endif 8097 #endif
8098 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8098 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8099 } 8099 }
8100 8100
8101 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 8101 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8102 { 8102 {
8103 struct rt_prio_array *array; 8103 struct rt_prio_array *array;
8104 int i; 8104 int i;
8105 8105
8106 array = &rt_rq->active; 8106 array = &rt_rq->active;
8107 for (i = 0; i < MAX_RT_PRIO; i++) { 8107 for (i = 0; i < MAX_RT_PRIO; i++) {
8108 INIT_LIST_HEAD(array->queue + i); 8108 INIT_LIST_HEAD(array->queue + i);
8109 __clear_bit(i, array->bitmap); 8109 __clear_bit(i, array->bitmap);
8110 } 8110 }
8111 /* delimiter for bitsearch: */ 8111 /* delimiter for bitsearch: */
8112 __set_bit(MAX_RT_PRIO, array->bitmap); 8112 __set_bit(MAX_RT_PRIO, array->bitmap);
8113 8113
8114 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8114 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8115 rt_rq->highest_prio.curr = MAX_RT_PRIO; 8115 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8116 #ifdef CONFIG_SMP 8116 #ifdef CONFIG_SMP
8117 rt_rq->highest_prio.next = MAX_RT_PRIO; 8117 rt_rq->highest_prio.next = MAX_RT_PRIO;
8118 #endif 8118 #endif
8119 #endif 8119 #endif
8120 #ifdef CONFIG_SMP 8120 #ifdef CONFIG_SMP
8121 rt_rq->rt_nr_migratory = 0; 8121 rt_rq->rt_nr_migratory = 0;
8122 rt_rq->overloaded = 0; 8122 rt_rq->overloaded = 0;
8123 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 8123 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
8124 #endif 8124 #endif
8125 8125
8126 rt_rq->rt_time = 0; 8126 rt_rq->rt_time = 0;
8127 rt_rq->rt_throttled = 0; 8127 rt_rq->rt_throttled = 0;
8128 rt_rq->rt_runtime = 0; 8128 rt_rq->rt_runtime = 0;
8129 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 8129 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8130 8130
8131 #ifdef CONFIG_RT_GROUP_SCHED 8131 #ifdef CONFIG_RT_GROUP_SCHED
8132 rt_rq->rt_nr_boosted = 0; 8132 rt_rq->rt_nr_boosted = 0;
8133 rt_rq->rq = rq; 8133 rt_rq->rq = rq;
8134 #endif 8134 #endif
8135 } 8135 }
8136 8136
8137 #ifdef CONFIG_FAIR_GROUP_SCHED 8137 #ifdef CONFIG_FAIR_GROUP_SCHED
8138 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 8138 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8139 struct sched_entity *se, int cpu, 8139 struct sched_entity *se, int cpu,
8140 struct sched_entity *parent) 8140 struct sched_entity *parent)
8141 { 8141 {
8142 struct rq *rq = cpu_rq(cpu); 8142 struct rq *rq = cpu_rq(cpu);
8143 tg->cfs_rq[cpu] = cfs_rq; 8143 tg->cfs_rq[cpu] = cfs_rq;
8144 init_cfs_rq(cfs_rq, rq); 8144 init_cfs_rq(cfs_rq, rq);
8145 cfs_rq->tg = tg; 8145 cfs_rq->tg = tg;
8146 8146
8147 tg->se[cpu] = se; 8147 tg->se[cpu] = se;
8148 /* se could be NULL for root_task_group */ 8148 /* se could be NULL for root_task_group */
8149 if (!se) 8149 if (!se)
8150 return; 8150 return;
8151 8151
8152 if (!parent) 8152 if (!parent)
8153 se->cfs_rq = &rq->cfs; 8153 se->cfs_rq = &rq->cfs;
8154 else 8154 else
8155 se->cfs_rq = parent->my_q; 8155 se->cfs_rq = parent->my_q;
8156 8156
8157 se->my_q = cfs_rq; 8157 se->my_q = cfs_rq;
8158 update_load_set(&se->load, 0); 8158 update_load_set(&se->load, 0);
8159 se->parent = parent; 8159 se->parent = parent;
8160 } 8160 }
8161 #endif 8161 #endif
8162 8162
8163 #ifdef CONFIG_RT_GROUP_SCHED 8163 #ifdef CONFIG_RT_GROUP_SCHED
8164 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 8164 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8165 struct sched_rt_entity *rt_se, int cpu, 8165 struct sched_rt_entity *rt_se, int cpu,
8166 struct sched_rt_entity *parent) 8166 struct sched_rt_entity *parent)
8167 { 8167 {
8168 struct rq *rq = cpu_rq(cpu); 8168 struct rq *rq = cpu_rq(cpu);
8169 8169
8170 tg->rt_rq[cpu] = rt_rq; 8170 tg->rt_rq[cpu] = rt_rq;
8171 init_rt_rq(rt_rq, rq); 8171 init_rt_rq(rt_rq, rq);
8172 rt_rq->tg = tg; 8172 rt_rq->tg = tg;
8173 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 8173 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8174 8174
8175 tg->rt_se[cpu] = rt_se; 8175 tg->rt_se[cpu] = rt_se;
8176 if (!rt_se) 8176 if (!rt_se)
8177 return; 8177 return;
8178 8178
8179 if (!parent) 8179 if (!parent)
8180 rt_se->rt_rq = &rq->rt; 8180 rt_se->rt_rq = &rq->rt;
8181 else 8181 else
8182 rt_se->rt_rq = parent->my_q; 8182 rt_se->rt_rq = parent->my_q;
8183 8183
8184 rt_se->my_q = rt_rq; 8184 rt_se->my_q = rt_rq;
8185 rt_se->parent = parent; 8185 rt_se->parent = parent;
8186 INIT_LIST_HEAD(&rt_se->run_list); 8186 INIT_LIST_HEAD(&rt_se->run_list);
8187 } 8187 }
8188 #endif 8188 #endif
8189 8189
8190 void __init sched_init(void) 8190 void __init sched_init(void)
8191 { 8191 {
8192 int i, j; 8192 int i, j;
8193 unsigned long alloc_size = 0, ptr; 8193 unsigned long alloc_size = 0, ptr;
8194 8194
8195 #ifdef CONFIG_FAIR_GROUP_SCHED 8195 #ifdef CONFIG_FAIR_GROUP_SCHED
8196 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 8196 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8197 #endif 8197 #endif
8198 #ifdef CONFIG_RT_GROUP_SCHED 8198 #ifdef CONFIG_RT_GROUP_SCHED
8199 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 8199 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8200 #endif 8200 #endif
8201 #ifdef CONFIG_CPUMASK_OFFSTACK 8201 #ifdef CONFIG_CPUMASK_OFFSTACK
8202 alloc_size += num_possible_cpus() * cpumask_size(); 8202 alloc_size += num_possible_cpus() * cpumask_size();
8203 #endif 8203 #endif
8204 if (alloc_size) { 8204 if (alloc_size) {
8205 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 8205 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8206 8206
8207 #ifdef CONFIG_FAIR_GROUP_SCHED 8207 #ifdef CONFIG_FAIR_GROUP_SCHED
8208 root_task_group.se = (struct sched_entity **)ptr; 8208 root_task_group.se = (struct sched_entity **)ptr;
8209 ptr += nr_cpu_ids * sizeof(void **); 8209 ptr += nr_cpu_ids * sizeof(void **);
8210 8210
8211 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 8211 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8212 ptr += nr_cpu_ids * sizeof(void **); 8212 ptr += nr_cpu_ids * sizeof(void **);
8213 8213
8214 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8214 #endif /* CONFIG_FAIR_GROUP_SCHED */
8215 #ifdef CONFIG_RT_GROUP_SCHED 8215 #ifdef CONFIG_RT_GROUP_SCHED
8216 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 8216 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8217 ptr += nr_cpu_ids * sizeof(void **); 8217 ptr += nr_cpu_ids * sizeof(void **);
8218 8218
8219 root_task_group.rt_rq = (struct rt_rq **)ptr; 8219 root_task_group.rt_rq = (struct rt_rq **)ptr;
8220 ptr += nr_cpu_ids * sizeof(void **); 8220 ptr += nr_cpu_ids * sizeof(void **);
8221 8221
8222 #endif /* CONFIG_RT_GROUP_SCHED */ 8222 #endif /* CONFIG_RT_GROUP_SCHED */
8223 #ifdef CONFIG_CPUMASK_OFFSTACK 8223 #ifdef CONFIG_CPUMASK_OFFSTACK
8224 for_each_possible_cpu(i) { 8224 for_each_possible_cpu(i) {
8225 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 8225 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8226 ptr += cpumask_size(); 8226 ptr += cpumask_size();
8227 } 8227 }
8228 #endif /* CONFIG_CPUMASK_OFFSTACK */ 8228 #endif /* CONFIG_CPUMASK_OFFSTACK */
8229 } 8229 }
8230 8230
8231 #ifdef CONFIG_SMP 8231 #ifdef CONFIG_SMP
8232 init_defrootdomain(); 8232 init_defrootdomain();
8233 #endif 8233 #endif
8234 8234
8235 init_rt_bandwidth(&def_rt_bandwidth, 8235 init_rt_bandwidth(&def_rt_bandwidth,
8236 global_rt_period(), global_rt_runtime()); 8236 global_rt_period(), global_rt_runtime());
8237 8237
8238 #ifdef CONFIG_RT_GROUP_SCHED 8238 #ifdef CONFIG_RT_GROUP_SCHED
8239 init_rt_bandwidth(&root_task_group.rt_bandwidth, 8239 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8240 global_rt_period(), global_rt_runtime()); 8240 global_rt_period(), global_rt_runtime());
8241 #endif /* CONFIG_RT_GROUP_SCHED */ 8241 #endif /* CONFIG_RT_GROUP_SCHED */
8242 8242
8243 #ifdef CONFIG_CGROUP_SCHED 8243 #ifdef CONFIG_CGROUP_SCHED
8244 list_add(&root_task_group.list, &task_groups); 8244 list_add(&root_task_group.list, &task_groups);
8245 INIT_LIST_HEAD(&root_task_group.children); 8245 INIT_LIST_HEAD(&root_task_group.children);
8246 autogroup_init(&init_task); 8246 autogroup_init(&init_task);
8247 #endif /* CONFIG_CGROUP_SCHED */ 8247 #endif /* CONFIG_CGROUP_SCHED */
8248 8248
8249 for_each_possible_cpu(i) { 8249 for_each_possible_cpu(i) {
8250 struct rq *rq; 8250 struct rq *rq;
8251 8251
8252 rq = cpu_rq(i); 8252 rq = cpu_rq(i);
8253 raw_spin_lock_init(&rq->lock); 8253 raw_spin_lock_init(&rq->lock);
8254 rq->nr_running = 0; 8254 rq->nr_running = 0;
8255 rq->calc_load_active = 0; 8255 rq->calc_load_active = 0;
8256 rq->calc_load_update = jiffies + LOAD_FREQ; 8256 rq->calc_load_update = jiffies + LOAD_FREQ;
8257 init_cfs_rq(&rq->cfs, rq); 8257 init_cfs_rq(&rq->cfs, rq);
8258 init_rt_rq(&rq->rt, rq); 8258 init_rt_rq(&rq->rt, rq);
8259 #ifdef CONFIG_FAIR_GROUP_SCHED 8259 #ifdef CONFIG_FAIR_GROUP_SCHED
8260 root_task_group.shares = root_task_group_load; 8260 root_task_group.shares = root_task_group_load;
8261 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8261 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8262 /* 8262 /*
8263 * How much cpu bandwidth does root_task_group get? 8263 * How much cpu bandwidth does root_task_group get?
8264 * 8264 *
8265 * In case of task-groups formed through the cgroup filesystem, it 8265 * In case of task-groups formed through the cgroup filesystem, it
8266 * gets 100% of the cpu resources in the system. This overall 8266 * gets 100% of the cpu resources in the system. This overall
8267 * system cpu resource is divided among the tasks of 8267 * system cpu resource is divided among the tasks of
8268 * root_task_group and its child task-groups in a fair manner, 8268 * root_task_group and its child task-groups in a fair manner,
8269 * based on each entity's (task or task-group's) weight 8269 * based on each entity's (task or task-group's) weight
8270 * (se->load.weight). 8270 * (se->load.weight).
8271 * 8271 *
8272 * In other words, if root_task_group has 10 tasks of weight 8272 * In other words, if root_task_group has 10 tasks of weight
8273 * 1024 and two child groups A0 and A1 (of weight 1024 each), 8273 * 1024 and two child groups A0 and A1 (of weight 1024 each),
8274 * then A0's share of the cpu resource is: 8274 * then A0's share of the cpu resource is:
8275 * 8275 *
8276 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 8276 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8277 * 8277 *
8278 * We achieve this by letting root_task_group's tasks sit 8278 * We achieve this by letting root_task_group's tasks sit
8279 * directly in rq->cfs (i.e. root_task_group->se[] = NULL). 8279 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
8280 */ 8280 */
8281 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8281 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8282 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8282 #endif /* CONFIG_FAIR_GROUP_SCHED */
8283 8283
8284 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 8284 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8285 #ifdef CONFIG_RT_GROUP_SCHED 8285 #ifdef CONFIG_RT_GROUP_SCHED
8286 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8286 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8287 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 8287 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8288 #endif 8288 #endif
8289 8289
8290 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8290 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8291 rq->cpu_load[j] = 0; 8291 rq->cpu_load[j] = 0;
8292 8292
8293 rq->last_load_update_tick = jiffies; 8293 rq->last_load_update_tick = jiffies;
8294 8294
8295 #ifdef CONFIG_SMP 8295 #ifdef CONFIG_SMP
8296 rq->sd = NULL; 8296 rq->sd = NULL;
8297 rq->rd = NULL; 8297 rq->rd = NULL;
8298 rq->cpu_power = SCHED_LOAD_SCALE; 8298 rq->cpu_power = SCHED_LOAD_SCALE;
8299 rq->post_schedule = 0; 8299 rq->post_schedule = 0;
8300 rq->active_balance = 0; 8300 rq->active_balance = 0;
8301 rq->next_balance = jiffies; 8301 rq->next_balance = jiffies;
8302 rq->push_cpu = 0; 8302 rq->push_cpu = 0;
8303 rq->cpu = i; 8303 rq->cpu = i;
8304 rq->online = 0; 8304 rq->online = 0;
8305 rq->idle_stamp = 0; 8305 rq->idle_stamp = 0;
8306 rq->avg_idle = 2*sysctl_sched_migration_cost; 8306 rq->avg_idle = 2*sysctl_sched_migration_cost;
8307 rq_attach_root(rq, &def_root_domain); 8307 rq_attach_root(rq, &def_root_domain);
8308 #ifdef CONFIG_NO_HZ 8308 #ifdef CONFIG_NO_HZ
8309 rq->nohz_balance_kick = 0; 8309 rq->nohz_balance_kick = 0;
8310 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); 8310 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8311 #endif 8311 #endif
8312 #endif 8312 #endif
8313 init_rq_hrtick(rq); 8313 init_rq_hrtick(rq);
8314 atomic_set(&rq->nr_iowait, 0); 8314 atomic_set(&rq->nr_iowait, 0);
8315 } 8315 }
8316 8316
8317 set_load_weight(&init_task); 8317 set_load_weight(&init_task);
8318 8318
8319 #ifdef CONFIG_PREEMPT_NOTIFIERS 8319 #ifdef CONFIG_PREEMPT_NOTIFIERS
8320 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 8320 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8321 #endif 8321 #endif
8322 8322
8323 #ifdef CONFIG_SMP 8323 #ifdef CONFIG_SMP
8324 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 8324 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8325 #endif 8325 #endif
8326 8326
8327 #ifdef CONFIG_RT_MUTEXES 8327 #ifdef CONFIG_RT_MUTEXES
8328 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 8328 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
8329 #endif 8329 #endif
8330 8330
8331 /* 8331 /*
8332 * The boot idle thread does lazy MMU switching as well: 8332 * The boot idle thread does lazy MMU switching as well:
8333 */ 8333 */
8334 atomic_inc(&init_mm.mm_count); 8334 atomic_inc(&init_mm.mm_count);
8335 enter_lazy_tlb(&init_mm, current); 8335 enter_lazy_tlb(&init_mm, current);
8336 8336
8337 /* 8337 /*
8338 * Make us the idle thread. Technically, schedule() should not be 8338 * Make us the idle thread. Technically, schedule() should not be
8339 * called from this thread; however, somewhere below it might be, 8339 * called from this thread; however, somewhere below it might be,
8340 * but because we are the idle thread, we just pick up running again 8340 * but because we are the idle thread, we just pick up running again
8341 * when this runqueue becomes "idle". 8341 * when this runqueue becomes "idle".
8342 */ 8342 */
8343 init_idle(current, smp_processor_id()); 8343 init_idle(current, smp_processor_id());
8344 8344
8345 calc_load_update = jiffies + LOAD_FREQ; 8345 calc_load_update = jiffies + LOAD_FREQ;
8346 8346
8347 /* 8347 /*
8348 * During early bootup we pretend to be a normal task: 8348 * During early bootup we pretend to be a normal task:
8349 */ 8349 */
8350 current->sched_class = &fair_sched_class; 8350 current->sched_class = &fair_sched_class;
8351 8351
8352 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8352 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8353 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8353 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8354 #ifdef CONFIG_SMP 8354 #ifdef CONFIG_SMP
8355 #ifdef CONFIG_NO_HZ 8355 #ifdef CONFIG_NO_HZ
8356 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8356 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8357 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8357 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8358 atomic_set(&nohz.load_balancer, nr_cpu_ids); 8358 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8359 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); 8359 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8360 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); 8360 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8361 #endif 8361 #endif
8362 /* May be allocated at isolcpus cmdline parse time */ 8362 /* May be allocated at isolcpus cmdline parse time */
8363 if (cpu_isolated_map == NULL) 8363 if (cpu_isolated_map == NULL)
8364 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8364 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8365 #endif /* SMP */ 8365 #endif /* SMP */
8366 8366
8367 scheduler_running = 1; 8367 scheduler_running = 1;
8368 } 8368 }
8369 8369
8370 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8370 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
8371 static inline int preempt_count_equals(int preempt_offset) 8371 static inline int preempt_count_equals(int preempt_offset)
8372 { 8372 {
8373 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8373 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8374 8374
8375 return (nested == preempt_offset); 8375 return (nested == preempt_offset);
8376 } 8376 }
8377 8377
8378 void __might_sleep(const char *file, int line, int preempt_offset) 8378 void __might_sleep(const char *file, int line, int preempt_offset)
8379 { 8379 {
8380 #ifdef in_atomic 8380 #ifdef in_atomic
8381 static unsigned long prev_jiffy; /* ratelimiting */ 8381 static unsigned long prev_jiffy; /* ratelimiting */
8382 8382
8383 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8383 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8384 system_state != SYSTEM_RUNNING || oops_in_progress) 8384 system_state != SYSTEM_RUNNING || oops_in_progress)
8385 return; 8385 return;
8386 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8386 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8387 return; 8387 return;
8388 prev_jiffy = jiffies; 8388 prev_jiffy = jiffies;
8389 8389
8390 printk(KERN_ERR 8390 printk(KERN_ERR
8391 "BUG: sleeping function called from invalid context at %s:%d\n", 8391 "BUG: sleeping function called from invalid context at %s:%d\n",
8392 file, line); 8392 file, line);
8393 printk(KERN_ERR 8393 printk(KERN_ERR
8394 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 8394 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8395 in_atomic(), irqs_disabled(), 8395 in_atomic(), irqs_disabled(),
8396 current->pid, current->comm); 8396 current->pid, current->comm);
8397 8397
8398 debug_show_held_locks(current); 8398 debug_show_held_locks(current);
8399 if (irqs_disabled()) 8399 if (irqs_disabled())
8400 print_irqtrace_events(current); 8400 print_irqtrace_events(current);
8401 dump_stack(); 8401 dump_stack();
8402 #endif 8402 #endif
8403 } 8403 }
8404 EXPORT_SYMBOL(__might_sleep); 8404 EXPORT_SYMBOL(__might_sleep);
8405 #endif 8405 #endif
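
The check above is normally reached through the might_sleep() helper rather than being called directly; functions that may block place it at their entry so the warning fires as soon as they are entered from atomic context. A minimal usage sketch follows (the device structure and lock are hypothetical; only might_sleep() and the mutex API are assumed here, and might_sleep() roughly expands to __might_sleep(__FILE__, __LINE__, 0) when this config option is enabled):

/* Hedged sketch: a function that can block announces it up front. */
struct example_dev {			/* hypothetical driver state */
	struct mutex lock;
	int value;
};

static int example_update(struct example_dev *dev, int value)
{
	might_sleep();			/* triggers __might_sleep() if called in atomic context */
	mutex_lock(&dev->lock);		/* sleeping lock: only legal in process context */
	dev->value = value;
	mutex_unlock(&dev->lock);
	return 0;
}
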
8406 8406
8407 #ifdef CONFIG_MAGIC_SYSRQ 8407 #ifdef CONFIG_MAGIC_SYSRQ
8408 static void normalize_task(struct rq *rq, struct task_struct *p) 8408 static void normalize_task(struct rq *rq, struct task_struct *p)
8409 { 8409 {
8410 const struct sched_class *prev_class = p->sched_class; 8410 const struct sched_class *prev_class = p->sched_class;
8411 int old_prio = p->prio; 8411 int old_prio = p->prio;
8412 int on_rq; 8412 int on_rq;
8413 8413
8414 on_rq = p->on_rq; 8414 on_rq = p->on_rq;
8415 if (on_rq) 8415 if (on_rq)
8416 deactivate_task(rq, p, 0); 8416 deactivate_task(rq, p, 0);
8417 __setscheduler(rq, p, SCHED_NORMAL, 0); 8417 __setscheduler(rq, p, SCHED_NORMAL, 0);
8418 if (on_rq) { 8418 if (on_rq) {
8419 activate_task(rq, p, 0); 8419 activate_task(rq, p, 0);
8420 resched_task(rq->curr); 8420 resched_task(rq->curr);
8421 } 8421 }
8422 8422
8423 check_class_changed(rq, p, prev_class, old_prio); 8423 check_class_changed(rq, p, prev_class, old_prio);
8424 } 8424 }
8425 8425
8426 void normalize_rt_tasks(void) 8426 void normalize_rt_tasks(void)
8427 { 8427 {
8428 struct task_struct *g, *p; 8428 struct task_struct *g, *p;
8429 unsigned long flags; 8429 unsigned long flags;
8430 struct rq *rq; 8430 struct rq *rq;
8431 8431
8432 read_lock_irqsave(&tasklist_lock, flags); 8432 read_lock_irqsave(&tasklist_lock, flags);
8433 do_each_thread(g, p) { 8433 do_each_thread(g, p) {
8434 /* 8434 /*
8435 * Only normalize user tasks: 8435 * Only normalize user tasks:
8436 */ 8436 */
8437 if (!p->mm) 8437 if (!p->mm)
8438 continue; 8438 continue;
8439 8439
8440 p->se.exec_start = 0; 8440 p->se.exec_start = 0;
8441 #ifdef CONFIG_SCHEDSTATS 8441 #ifdef CONFIG_SCHEDSTATS
8442 p->se.statistics.wait_start = 0; 8442 p->se.statistics.wait_start = 0;
8443 p->se.statistics.sleep_start = 0; 8443 p->se.statistics.sleep_start = 0;
8444 p->se.statistics.block_start = 0; 8444 p->se.statistics.block_start = 0;
8445 #endif 8445 #endif
8446 8446
8447 if (!rt_task(p)) { 8447 if (!rt_task(p)) {
8448 /* 8448 /*
8449 * Renice negative nice level userspace 8449 * Renice negative nice level userspace
8450 * tasks back to 0: 8450 * tasks back to 0:
8451 */ 8451 */
8452 if (TASK_NICE(p) < 0 && p->mm) 8452 if (TASK_NICE(p) < 0 && p->mm)
8453 set_user_nice(p, 0); 8453 set_user_nice(p, 0);
8454 continue; 8454 continue;
8455 } 8455 }
8456 8456
8457 raw_spin_lock(&p->pi_lock); 8457 raw_spin_lock(&p->pi_lock);
8458 rq = __task_rq_lock(p); 8458 rq = __task_rq_lock(p);
8459 8459
8460 normalize_task(rq, p); 8460 normalize_task(rq, p);
8461 8461
8462 __task_rq_unlock(rq); 8462 __task_rq_unlock(rq);
8463 raw_spin_unlock(&p->pi_lock); 8463 raw_spin_unlock(&p->pi_lock);
8464 } while_each_thread(g, p); 8464 } while_each_thread(g, p);
8465 8465
8466 read_unlock_irqrestore(&tasklist_lock, flags); 8466 read_unlock_irqrestore(&tasklist_lock, flags);
8467 } 8467 }
8468 8468
8469 #endif /* CONFIG_MAGIC_SYSRQ */ 8469 #endif /* CONFIG_MAGIC_SYSRQ */
8470 8470
8471 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 8471 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8472 /* 8472 /*
8473 * These functions are only useful for the IA64 MCA handling, or kdb. 8473 * These functions are only useful for the IA64 MCA handling, or kdb.
8474 * 8474 *
8475 * They can only be called when the whole system has been 8475 * They can only be called when the whole system has been
8476 * stopped - every CPU needs to be quiescent, and no scheduling 8476 * stopped - every CPU needs to be quiescent, and no scheduling
8477 * activity can take place. Using them for anything else would 8477 * activity can take place. Using them for anything else would
8478 * be a serious bug, and as a result, they aren't even visible 8478 * be a serious bug, and as a result, they aren't even visible
8479 * under any other configuration. 8479 * under any other configuration.
8480 */ 8480 */
8481 8481
8482 /** 8482 /**
8483 * curr_task - return the current task for a given cpu. 8483 * curr_task - return the current task for a given cpu.
8484 * @cpu: the processor in question. 8484 * @cpu: the processor in question.
8485 * 8485 *
8486 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8486 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8487 */ 8487 */
8488 struct task_struct *curr_task(int cpu) 8488 struct task_struct *curr_task(int cpu)
8489 { 8489 {
8490 return cpu_curr(cpu); 8490 return cpu_curr(cpu);
8491 } 8491 }
8492 8492
8493 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 8493 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8494 8494
8495 #ifdef CONFIG_IA64 8495 #ifdef CONFIG_IA64
8496 /** 8496 /**
8497 * set_curr_task - set the current task for a given cpu. 8497 * set_curr_task - set the current task for a given cpu.
8498 * @cpu: the processor in question. 8498 * @cpu: the processor in question.
8499 * @p: the task pointer to set. 8499 * @p: the task pointer to set.
8500 * 8500 *
8501 * Description: This function must only be used when non-maskable interrupts 8501 * Description: This function must only be used when non-maskable interrupts
8502 * are serviced on a separate stack. It allows the architecture to switch the 8502 * are serviced on a separate stack. It allows the architecture to switch the
8503 * notion of the current task on a cpu in a non-blocking manner. This function 8503 * notion of the current task on a cpu in a non-blocking manner. This function
8504 * must be called with all CPUs synchronized and interrupts disabled, and 8504 * must be called with all CPUs synchronized and interrupts disabled, and
8505 * the caller must save the original value of the current task (see 8505 * the caller must save the original value of the current task (see
8506 * curr_task() above) and restore that value before reenabling interrupts and 8506 * curr_task() above) and restore that value before reenabling interrupts and
8507 * re-starting the system. 8507 * re-starting the system.
8508 * 8508 *
8509 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8509 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8510 */ 8510 */
8511 void set_curr_task(int cpu, struct task_struct *p) 8511 void set_curr_task(int cpu, struct task_struct *p)
8512 { 8512 {
8513 cpu_curr(cpu) = p; 8513 cpu_curr(cpu) = p;
8514 } 8514 }
8515 8515
8516 #endif 8516 #endif
8517 8517
8518 #ifdef CONFIG_FAIR_GROUP_SCHED 8518 #ifdef CONFIG_FAIR_GROUP_SCHED
8519 static void free_fair_sched_group(struct task_group *tg) 8519 static void free_fair_sched_group(struct task_group *tg)
8520 { 8520 {
8521 int i; 8521 int i;
8522 8522
8523 for_each_possible_cpu(i) { 8523 for_each_possible_cpu(i) {
8524 if (tg->cfs_rq) 8524 if (tg->cfs_rq)
8525 kfree(tg->cfs_rq[i]); 8525 kfree(tg->cfs_rq[i]);
8526 if (tg->se) 8526 if (tg->se)
8527 kfree(tg->se[i]); 8527 kfree(tg->se[i]);
8528 } 8528 }
8529 8529
8530 kfree(tg->cfs_rq); 8530 kfree(tg->cfs_rq);
8531 kfree(tg->se); 8531 kfree(tg->se);
8532 } 8532 }
8533 8533
8534 static 8534 static
8535 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8535 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8536 { 8536 {
8537 struct cfs_rq *cfs_rq; 8537 struct cfs_rq *cfs_rq;
8538 struct sched_entity *se; 8538 struct sched_entity *se;
8539 int i; 8539 int i;
8540 8540
8541 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8541 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8542 if (!tg->cfs_rq) 8542 if (!tg->cfs_rq)
8543 goto err; 8543 goto err;
8544 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 8544 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8545 if (!tg->se) 8545 if (!tg->se)
8546 goto err; 8546 goto err;
8547 8547
8548 tg->shares = NICE_0_LOAD; 8548 tg->shares = NICE_0_LOAD;
8549 8549
8550 for_each_possible_cpu(i) { 8550 for_each_possible_cpu(i) {
8551 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8551 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8552 GFP_KERNEL, cpu_to_node(i)); 8552 GFP_KERNEL, cpu_to_node(i));
8553 if (!cfs_rq) 8553 if (!cfs_rq)
8554 goto err; 8554 goto err;
8555 8555
8556 se = kzalloc_node(sizeof(struct sched_entity), 8556 se = kzalloc_node(sizeof(struct sched_entity),
8557 GFP_KERNEL, cpu_to_node(i)); 8557 GFP_KERNEL, cpu_to_node(i));
8558 if (!se) 8558 if (!se)
8559 goto err_free_rq; 8559 goto err_free_rq;
8560 8560
8561 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8561 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8562 } 8562 }
8563 8563
8564 return 1; 8564 return 1;
8565 8565
8566 err_free_rq: 8566 err_free_rq:
8567 kfree(cfs_rq); 8567 kfree(cfs_rq);
8568 err: 8568 err:
8569 return 0; 8569 return 0;
8570 } 8570 }
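
Note that sizeof(cfs_rq) and sizeof(se) above are the sizes of the local pointer variables, so the two kzalloc() calls allocate arrays of nr_cpu_ids pointers; the per-cpu objects themselves are then placed on the memory node of their CPU with kzalloc_node(). A hedged sketch of the same two-level idiom with a hypothetical payload struct:

/* Hedged sketch: per-cpu pointer table plus node-local elements. */
struct foo {				/* hypothetical payload */
	int val;
};

static struct foo **alloc_foo_table(void)
{
	struct foo **table;
	int cpu;

	/* outer array: one pointer per possible CPU, zeroed so cleanup is safe */
	table = kzalloc(sizeof(struct foo *) * nr_cpu_ids, GFP_KERNEL);
	if (!table)
		return NULL;

	for_each_possible_cpu(cpu) {
		/* element allocated on the node backing this CPU */
		table[cpu] = kzalloc_node(sizeof(struct foo),
					  GFP_KERNEL, cpu_to_node(cpu));
		if (!table[cpu])
			goto fail;
	}
	return table;

fail:
	for_each_possible_cpu(cpu)
		kfree(table[cpu]);	/* kfree(NULL) is a no-op */
	kfree(table);
	return NULL;
}
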
8571 8571
8572 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8572 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8573 { 8573 {
8574 struct rq *rq = cpu_rq(cpu); 8574 struct rq *rq = cpu_rq(cpu);
8575 unsigned long flags; 8575 unsigned long flags;
8576 8576
8577 /* 8577 /*
8578 * Only empty task groups can be destroyed, so we can speculatively 8578 * Only empty task groups can be destroyed, so we can speculatively
8579 * check on_list without danger of it being re-added. 8579 * check on_list without danger of it being re-added.
8580 */ 8580 */
8581 if (!tg->cfs_rq[cpu]->on_list) 8581 if (!tg->cfs_rq[cpu]->on_list)
8582 return; 8582 return;
8583 8583
8584 raw_spin_lock_irqsave(&rq->lock, flags); 8584 raw_spin_lock_irqsave(&rq->lock, flags);
8585 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8585 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8586 raw_spin_unlock_irqrestore(&rq->lock, flags); 8586 raw_spin_unlock_irqrestore(&rq->lock, flags);
8587 } 8587 }
8588 #else /* !CONFIG_FAIR_GROUP_SCHED */ 8588 #else /* !CONFIG_FAIR_GROUP_SCHED */
8589 static inline void free_fair_sched_group(struct task_group *tg) 8589 static inline void free_fair_sched_group(struct task_group *tg)
8590 { 8590 {
8591 } 8591 }
8592 8592
8593 static inline 8593 static inline
8594 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8594 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8595 { 8595 {
8596 return 1; 8596 return 1;
8597 } 8597 }
8598 8598
8599 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8599 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8600 { 8600 {
8601 } 8601 }
8602 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8602 #endif /* CONFIG_FAIR_GROUP_SCHED */
8603 8603
8604 #ifdef CONFIG_RT_GROUP_SCHED 8604 #ifdef CONFIG_RT_GROUP_SCHED
8605 static void free_rt_sched_group(struct task_group *tg) 8605 static void free_rt_sched_group(struct task_group *tg)
8606 { 8606 {
8607 int i; 8607 int i;
8608 8608
8609 destroy_rt_bandwidth(&tg->rt_bandwidth); 8609 destroy_rt_bandwidth(&tg->rt_bandwidth);
8610 8610
8611 for_each_possible_cpu(i) { 8611 for_each_possible_cpu(i) {
8612 if (tg->rt_rq) 8612 if (tg->rt_rq)
8613 kfree(tg->rt_rq[i]); 8613 kfree(tg->rt_rq[i]);
8614 if (tg->rt_se) 8614 if (tg->rt_se)
8615 kfree(tg->rt_se[i]); 8615 kfree(tg->rt_se[i]);
8616 } 8616 }
8617 8617
8618 kfree(tg->rt_rq); 8618 kfree(tg->rt_rq);
8619 kfree(tg->rt_se); 8619 kfree(tg->rt_se);
8620 } 8620 }
8621 8621
8622 static 8622 static
8623 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8623 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8624 { 8624 {
8625 struct rt_rq *rt_rq; 8625 struct rt_rq *rt_rq;
8626 struct sched_rt_entity *rt_se; 8626 struct sched_rt_entity *rt_se;
8627 struct rq *rq; 8627 struct rq *rq;
8628 int i; 8628 int i;
8629 8629
8630 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8630 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8631 if (!tg->rt_rq) 8631 if (!tg->rt_rq)
8632 goto err; 8632 goto err;
8633 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 8633 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8634 if (!tg->rt_se) 8634 if (!tg->rt_se)
8635 goto err; 8635 goto err;
8636 8636
8637 init_rt_bandwidth(&tg->rt_bandwidth, 8637 init_rt_bandwidth(&tg->rt_bandwidth,
8638 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8638 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8639 8639
8640 for_each_possible_cpu(i) { 8640 for_each_possible_cpu(i) {
8641 rq = cpu_rq(i); 8641 rq = cpu_rq(i);
8642 8642
8643 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8643 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8644 GFP_KERNEL, cpu_to_node(i)); 8644 GFP_KERNEL, cpu_to_node(i));
8645 if (!rt_rq) 8645 if (!rt_rq)
8646 goto err; 8646 goto err;
8647 8647
8648 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8648 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8649 GFP_KERNEL, cpu_to_node(i)); 8649 GFP_KERNEL, cpu_to_node(i));
8650 if (!rt_se) 8650 if (!rt_se)
8651 goto err_free_rq; 8651 goto err_free_rq;
8652 8652
8653 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 8653 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8654 } 8654 }
8655 8655
8656 return 1; 8656 return 1;
8657 8657
8658 err_free_rq: 8658 err_free_rq:
8659 kfree(rt_rq); 8659 kfree(rt_rq);
8660 err: 8660 err:
8661 return 0; 8661 return 0;
8662 } 8662 }
8663 #else /* !CONFIG_RT_GROUP_SCHED */ 8663 #else /* !CONFIG_RT_GROUP_SCHED */
8664 static inline void free_rt_sched_group(struct task_group *tg) 8664 static inline void free_rt_sched_group(struct task_group *tg)
8665 { 8665 {
8666 } 8666 }
8667 8667
8668 static inline 8668 static inline
8669 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8669 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8670 { 8670 {
8671 return 1; 8671 return 1;
8672 } 8672 }
8673 #endif /* CONFIG_RT_GROUP_SCHED */ 8673 #endif /* CONFIG_RT_GROUP_SCHED */
8674 8674
8675 #ifdef CONFIG_CGROUP_SCHED 8675 #ifdef CONFIG_CGROUP_SCHED
8676 static void free_sched_group(struct task_group *tg) 8676 static void free_sched_group(struct task_group *tg)
8677 { 8677 {
8678 free_fair_sched_group(tg); 8678 free_fair_sched_group(tg);
8679 free_rt_sched_group(tg); 8679 free_rt_sched_group(tg);
8680 autogroup_free(tg); 8680 autogroup_free(tg);
8681 kfree(tg); 8681 kfree(tg);
8682 } 8682 }
8683 8683
8684 /* allocate runqueue etc for a new task group */ 8684 /* allocate runqueue etc for a new task group */
8685 struct task_group *sched_create_group(struct task_group *parent) 8685 struct task_group *sched_create_group(struct task_group *parent)
8686 { 8686 {
8687 struct task_group *tg; 8687 struct task_group *tg;
8688 unsigned long flags; 8688 unsigned long flags;
8689 8689
8690 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8690 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8691 if (!tg) 8691 if (!tg)
8692 return ERR_PTR(-ENOMEM); 8692 return ERR_PTR(-ENOMEM);
8693 8693
8694 if (!alloc_fair_sched_group(tg, parent)) 8694 if (!alloc_fair_sched_group(tg, parent))
8695 goto err; 8695 goto err;
8696 8696
8697 if (!alloc_rt_sched_group(tg, parent)) 8697 if (!alloc_rt_sched_group(tg, parent))
8698 goto err; 8698 goto err;
8699 8699
8700 spin_lock_irqsave(&task_group_lock, flags); 8700 spin_lock_irqsave(&task_group_lock, flags);
8701 list_add_rcu(&tg->list, &task_groups); 8701 list_add_rcu(&tg->list, &task_groups);
8702 8702
8703 WARN_ON(!parent); /* root should already exist */ 8703 WARN_ON(!parent); /* root should already exist */
8704 8704
8705 tg->parent = parent; 8705 tg->parent = parent;
8706 INIT_LIST_HEAD(&tg->children); 8706 INIT_LIST_HEAD(&tg->children);
8707 list_add_rcu(&tg->siblings, &parent->children); 8707 list_add_rcu(&tg->siblings, &parent->children);
8708 spin_unlock_irqrestore(&task_group_lock, flags); 8708 spin_unlock_irqrestore(&task_group_lock, flags);
8709 8709
8710 return tg; 8710 return tg;
8711 8711
8712 err: 8712 err:
8713 free_sched_group(tg); 8713 free_sched_group(tg);
8714 return ERR_PTR(-ENOMEM); 8714 return ERR_PTR(-ENOMEM);
8715 } 8715 }
8716 8716
8717 /* rcu callback to free various structures associated with a task group */ 8717 /* rcu callback to free various structures associated with a task group */
8718 static void free_sched_group_rcu(struct rcu_head *rhp) 8718 static void free_sched_group_rcu(struct rcu_head *rhp)
8719 { 8719 {
8720 /* now it should be safe to free those cfs_rqs */ 8720 /* now it should be safe to free those cfs_rqs */
8721 free_sched_group(container_of(rhp, struct task_group, rcu)); 8721 free_sched_group(container_of(rhp, struct task_group, rcu));
8722 } 8722 }
8723 8723
8724 /* Destroy runqueue etc associated with a task group */ 8724 /* Destroy runqueue etc associated with a task group */
8725 void sched_destroy_group(struct task_group *tg) 8725 void sched_destroy_group(struct task_group *tg)
8726 { 8726 {
8727 unsigned long flags; 8727 unsigned long flags;
8728 int i; 8728 int i;
8729 8729
8730 /* end participation in shares distribution */ 8730 /* end participation in shares distribution */
8731 for_each_possible_cpu(i) 8731 for_each_possible_cpu(i)
8732 unregister_fair_sched_group(tg, i); 8732 unregister_fair_sched_group(tg, i);
8733 8733
8734 spin_lock_irqsave(&task_group_lock, flags); 8734 spin_lock_irqsave(&task_group_lock, flags);
8735 list_del_rcu(&tg->list); 8735 list_del_rcu(&tg->list);
8736 list_del_rcu(&tg->siblings); 8736 list_del_rcu(&tg->siblings);
8737 spin_unlock_irqrestore(&task_group_lock, flags); 8737 spin_unlock_irqrestore(&task_group_lock, flags);
8738 8738
8739 /* wait for possible concurrent references to cfs_rqs to complete */ 8739 /* wait for possible concurrent references to cfs_rqs to complete */
8740 call_rcu(&tg->rcu, free_sched_group_rcu); 8740 call_rcu(&tg->rcu, free_sched_group_rcu);
8741 } 8741 }
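
sched_destroy_group() only unlinks the group with list_del_rcu() and defers the actual kfree() through call_rcu(), so readers that walk task_groups under rcu_read_lock() never dereference freed memory. A minimal sketch of that unlink-then-defer-free pattern, using a hypothetical item type:

/* Hedged sketch of the RCU deferred-free pattern used above. */
struct item {
	struct list_head list;		/* linked on some RCU-protected list */
	struct rcu_head rcu;
};

static void item_free_rcu(struct rcu_head *rhp)
{
	/* runs after a grace period: no rcu_read_lock() reader can still see it */
	kfree(container_of(rhp, struct item, rcu));
}

static void item_destroy(struct item *it)
{
	list_del_rcu(&it->list);	/* unlink; existing readers may still traverse it */
	call_rcu(&it->rcu, item_free_rcu);
}
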
8742 8742
8743 /* Change a task's runqueue when it moves between groups. 8743 /* Change a task's runqueue when it moves between groups.
8744 * The caller of this function should have put the task in its new group 8744 * The caller of this function should have put the task in its new group
8745 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 8745 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8746 * reflect its new group. 8746 * reflect its new group.
8747 */ 8747 */
8748 void sched_move_task(struct task_struct *tsk) 8748 void sched_move_task(struct task_struct *tsk)
8749 { 8749 {
8750 int on_rq, running; 8750 int on_rq, running;
8751 unsigned long flags; 8751 unsigned long flags;
8752 struct rq *rq; 8752 struct rq *rq;
8753 8753
8754 rq = task_rq_lock(tsk, &flags); 8754 rq = task_rq_lock(tsk, &flags);
8755 8755
8756 running = task_current(rq, tsk); 8756 running = task_current(rq, tsk);
8757 on_rq = tsk->on_rq; 8757 on_rq = tsk->on_rq;
8758 8758
8759 if (on_rq) 8759 if (on_rq)
8760 dequeue_task(rq, tsk, 0); 8760 dequeue_task(rq, tsk, 0);
8761 if (unlikely(running)) 8761 if (unlikely(running))
8762 tsk->sched_class->put_prev_task(rq, tsk); 8762 tsk->sched_class->put_prev_task(rq, tsk);
8763 8763
8764 #ifdef CONFIG_FAIR_GROUP_SCHED 8764 #ifdef CONFIG_FAIR_GROUP_SCHED
8765 if (tsk->sched_class->task_move_group) 8765 if (tsk->sched_class->task_move_group)
8766 tsk->sched_class->task_move_group(tsk, on_rq); 8766 tsk->sched_class->task_move_group(tsk, on_rq);
8767 else 8767 else
8768 #endif 8768 #endif
8769 set_task_rq(tsk, task_cpu(tsk)); 8769 set_task_rq(tsk, task_cpu(tsk));
8770 8770
8771 if (unlikely(running)) 8771 if (unlikely(running))
8772 tsk->sched_class->set_curr_task(rq); 8772 tsk->sched_class->set_curr_task(rq);
8773 if (on_rq) 8773 if (on_rq)
8774 enqueue_task(rq, tsk, 0); 8774 enqueue_task(rq, tsk, 0);
8775 8775
8776 task_rq_unlock(rq, tsk, &flags); 8776 task_rq_unlock(rq, tsk, &flags);
8777 } 8777 }
8778 #endif /* CONFIG_CGROUP_SCHED */ 8778 #endif /* CONFIG_CGROUP_SCHED */
8779 8779
8780 #ifdef CONFIG_FAIR_GROUP_SCHED 8780 #ifdef CONFIG_FAIR_GROUP_SCHED
8781 static DEFINE_MUTEX(shares_mutex); 8781 static DEFINE_MUTEX(shares_mutex);
8782 8782
8783 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8783 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8784 { 8784 {
8785 int i; 8785 int i;
8786 unsigned long flags; 8786 unsigned long flags;
8787 8787
8788 /* 8788 /*
8789 * We can't change the weight of the root cgroup. 8789 * We can't change the weight of the root cgroup.
8790 */ 8790 */
8791 if (!tg->se[0]) 8791 if (!tg->se[0])
8792 return -EINVAL; 8792 return -EINVAL;
8793 8793
8794 if (shares < MIN_SHARES) 8794 if (shares < MIN_SHARES)
8795 shares = MIN_SHARES; 8795 shares = MIN_SHARES;
8796 else if (shares > MAX_SHARES) 8796 else if (shares > MAX_SHARES)
8797 shares = MAX_SHARES; 8797 shares = MAX_SHARES;
8798 8798
8799 mutex_lock(&shares_mutex); 8799 mutex_lock(&shares_mutex);
8800 if (tg->shares == shares) 8800 if (tg->shares == shares)
8801 goto done; 8801 goto done;
8802 8802
8803 tg->shares = shares; 8803 tg->shares = shares;
8804 for_each_possible_cpu(i) { 8804 for_each_possible_cpu(i) {
8805 struct rq *rq = cpu_rq(i); 8805 struct rq *rq = cpu_rq(i);
8806 struct sched_entity *se; 8806 struct sched_entity *se;
8807 8807
8808 se = tg->se[i]; 8808 se = tg->se[i];
8809 /* Propagate contribution to hierarchy */ 8809 /* Propagate contribution to hierarchy */
8810 raw_spin_lock_irqsave(&rq->lock, flags); 8810 raw_spin_lock_irqsave(&rq->lock, flags);
8811 for_each_sched_entity(se) 8811 for_each_sched_entity(se)
8812 update_cfs_shares(group_cfs_rq(se)); 8812 update_cfs_shares(group_cfs_rq(se));
8813 raw_spin_unlock_irqrestore(&rq->lock, flags); 8813 raw_spin_unlock_irqrestore(&rq->lock, flags);
8814 } 8814 }
8815 8815
8816 done: 8816 done:
8817 mutex_unlock(&shares_mutex); 8817 mutex_unlock(&shares_mutex);
8818 return 0; 8818 return 0;
8819 } 8819 }
8820 8820
8821 unsigned long sched_group_shares(struct task_group *tg) 8821 unsigned long sched_group_shares(struct task_group *tg)
8822 { 8822 {
8823 return tg->shares; 8823 return tg->shares;
8824 } 8824 }
8825 #endif 8825 #endif
8826 8826
8827 #ifdef CONFIG_RT_GROUP_SCHED 8827 #ifdef CONFIG_RT_GROUP_SCHED
8828 /* 8828 /*
8829 * Ensure that the real time constraints are schedulable. 8829 * Ensure that the real time constraints are schedulable.
8830 */ 8830 */
8831 static DEFINE_MUTEX(rt_constraints_mutex); 8831 static DEFINE_MUTEX(rt_constraints_mutex);
8832 8832
8833 static unsigned long to_ratio(u64 period, u64 runtime) 8833 static unsigned long to_ratio(u64 period, u64 runtime)
8834 { 8834 {
8835 if (runtime == RUNTIME_INF) 8835 if (runtime == RUNTIME_INF)
8836 return 1ULL << 20; 8836 return 1ULL << 20;
8837 8837
8838 return div64_u64(runtime << 20, period); 8838 return div64_u64(runtime << 20, period);
8839 } 8839 }
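
to_ratio() expresses runtime/period as a 20-bit fixed-point fraction, so 1 << 20 (1048576) stands for 100% and an infinite runtime simply maps to the full scale. A short worked example, assuming the usual 1 s period and 950 ms runtime defaults:

/*
 * Worked example (assumed default values):
 *   period  = 1 s    = 1000000000 ns
 *   runtime = 950 ms =  950000000 ns
 *
 *   to_ratio(period, runtime) = (950000000 << 20) / 1000000000
 *                             = 0.95 * 1048576
 *                             ~ 996147      (95% of the 2^20 scale)
 */
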
8840 8840
8841 /* Must be called with tasklist_lock held */ 8841 /* Must be called with tasklist_lock held */
8842 static inline int tg_has_rt_tasks(struct task_group *tg) 8842 static inline int tg_has_rt_tasks(struct task_group *tg)
8843 { 8843 {
8844 struct task_struct *g, *p; 8844 struct task_struct *g, *p;
8845 8845
8846 do_each_thread(g, p) { 8846 do_each_thread(g, p) {
8847 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8847 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8848 return 1; 8848 return 1;
8849 } while_each_thread(g, p); 8849 } while_each_thread(g, p);
8850 8850
8851 return 0; 8851 return 0;
8852 } 8852 }
8853 8853
8854 struct rt_schedulable_data { 8854 struct rt_schedulable_data {
8855 struct task_group *tg; 8855 struct task_group *tg;
8856 u64 rt_period; 8856 u64 rt_period;
8857 u64 rt_runtime; 8857 u64 rt_runtime;
8858 }; 8858 };
8859 8859
8860 static int tg_schedulable(struct task_group *tg, void *data) 8860 static int tg_schedulable(struct task_group *tg, void *data)
8861 { 8861 {
8862 struct rt_schedulable_data *d = data; 8862 struct rt_schedulable_data *d = data;
8863 struct task_group *child; 8863 struct task_group *child;
8864 unsigned long total, sum = 0; 8864 unsigned long total, sum = 0;
8865 u64 period, runtime; 8865 u64 period, runtime;
8866 8866
8867 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8867 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8868 runtime = tg->rt_bandwidth.rt_runtime; 8868 runtime = tg->rt_bandwidth.rt_runtime;
8869 8869
8870 if (tg == d->tg) { 8870 if (tg == d->tg) {
8871 period = d->rt_period; 8871 period = d->rt_period;
8872 runtime = d->rt_runtime; 8872 runtime = d->rt_runtime;
8873 } 8873 }
8874 8874
8875 /* 8875 /*
8876 * Cannot have more runtime than the period. 8876 * Cannot have more runtime than the period.
8877 */ 8877 */
8878 if (runtime > period && runtime != RUNTIME_INF) 8878 if (runtime > period && runtime != RUNTIME_INF)
8879 return -EINVAL; 8879 return -EINVAL;
8880 8880
8881 /* 8881 /*
8882 * Ensure we don't starve existing RT tasks. 8882 * Ensure we don't starve existing RT tasks.
8883 */ 8883 */
8884 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 8884 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8885 return -EBUSY; 8885 return -EBUSY;
8886 8886
8887 total = to_ratio(period, runtime); 8887 total = to_ratio(period, runtime);
8888 8888
8889 /* 8889 /*
8890 * Nobody can have more than the global setting allows. 8890 * Nobody can have more than the global setting allows.
8891 */ 8891 */
8892 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 8892 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8893 return -EINVAL; 8893 return -EINVAL;
8894 8894
8895 /* 8895 /*
8896 * The sum of our children's runtime should not exceed our own. 8896 * The sum of our children's runtime should not exceed our own.
8897 */ 8897 */
8898 list_for_each_entry_rcu(child, &tg->children, siblings) { 8898 list_for_each_entry_rcu(child, &tg->children, siblings) {
8899 period = ktime_to_ns(child->rt_bandwidth.rt_period); 8899 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8900 runtime = child->rt_bandwidth.rt_runtime; 8900 runtime = child->rt_bandwidth.rt_runtime;
8901 8901
8902 if (child == d->tg) { 8902 if (child == d->tg) {
8903 period = d->rt_period; 8903 period = d->rt_period;
8904 runtime = d->rt_runtime; 8904 runtime = d->rt_runtime;
8905 } 8905 }
8906 8906
8907 sum += to_ratio(period, runtime); 8907 sum += to_ratio(period, runtime);
8908 } 8908 }
8909 8909
8910 if (sum > total) 8910 if (sum > total)
8911 return -EINVAL; 8911 return -EINVAL;
8912 8912
8913 return 0; 8913 return 0;
8914 } 8914 }
8915 8915
8916 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8916 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8917 { 8917 {
8918 struct rt_schedulable_data data = { 8918 struct rt_schedulable_data data = {
8919 .tg = tg, 8919 .tg = tg,
8920 .rt_period = period, 8920 .rt_period = period,
8921 .rt_runtime = runtime, 8921 .rt_runtime = runtime,
8922 }; 8922 };
8923 8923
8924 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8924 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8925 } 8925 }
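
walk_tg_tree() applies tg_schedulable() to every group, so a new setting is only accepted if it stays within the global ratio and, at every level, the children's ratios sum to no more than their parent's. A hedged numeric illustration on the same 2^20 scale (the hierarchy and values below are made up):

/*
 * Hedged illustration (hypothetical hierarchy, 1 s periods throughout):
 *   global:  950 ms  -> ~996147   (ceiling for every group)
 *   group G: 300 ms  -> ~314572
 *   child A: 100 ms  -> ~104857
 *   child B: 150 ms  -> ~157286
 *
 *   A + B = ~262143 <= G's ~314572               -> accepted
 *   raising B to 250 ms (~262144) gives ~367001  -> exceeds G, -EINVAL
 */
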
8926 8926
8927 static int tg_set_bandwidth(struct task_group *tg, 8927 static int tg_set_bandwidth(struct task_group *tg,
8928 u64 rt_period, u64 rt_runtime) 8928 u64 rt_period, u64 rt_runtime)
8929 { 8929 {
8930 int i, err = 0; 8930 int i, err = 0;
8931 8931
8932 mutex_lock(&rt_constraints_mutex); 8932 mutex_lock(&rt_constraints_mutex);
8933 read_lock(&tasklist_lock); 8933 read_lock(&tasklist_lock);
8934 err = __rt_schedulable(tg, rt_period, rt_runtime); 8934 err = __rt_schedulable(tg, rt_period, rt_runtime);
8935 if (err) 8935 if (err)
8936 goto unlock; 8936 goto unlock;
8937 8937
8938 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8938 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8939 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8939 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8940 tg->rt_bandwidth.rt_runtime = rt_runtime; 8940 tg->rt_bandwidth.rt_runtime = rt_runtime;
8941 8941
8942 for_each_possible_cpu(i) { 8942 for_each_possible_cpu(i) {
8943 struct rt_rq *rt_rq = tg->rt_rq[i]; 8943 struct rt_rq *rt_rq = tg->rt_rq[i];
8944 8944
8945 raw_spin_lock(&rt_rq->rt_runtime_lock); 8945 raw_spin_lock(&rt_rq->rt_runtime_lock);
8946 rt_rq->rt_runtime = rt_runtime; 8946 rt_rq->rt_runtime = rt_runtime;
8947 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8947 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8948 } 8948 }
8949 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8949 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8950 unlock: 8950 unlock:
8951 read_unlock(&tasklist_lock); 8951 read_unlock(&tasklist_lock);
8952 mutex_unlock(&rt_constraints_mutex); 8952 mutex_unlock(&rt_constraints_mutex);
8953 8953
8954 return err; 8954 return err;
8955 } 8955 }
8956 8956
8957 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8957 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8958 { 8958 {
8959 u64 rt_runtime, rt_period; 8959 u64 rt_runtime, rt_period;
8960 8960
8961 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8961 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8962 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 8962 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8963 if (rt_runtime_us < 0) 8963 if (rt_runtime_us < 0)
8964 rt_runtime = RUNTIME_INF; 8964 rt_runtime = RUNTIME_INF;
8965 8965
8966 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8966 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8967 } 8967 }
8968 8968
8969 long sched_group_rt_runtime(struct task_group *tg) 8969 long sched_group_rt_runtime(struct task_group *tg)
8970 { 8970 {
8971 u64 rt_runtime_us; 8971 u64 rt_runtime_us;
8972 8972
8973 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 8973 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8974 return -1; 8974 return -1;
8975 8975
8976 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 8976 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8977 do_div(rt_runtime_us, NSEC_PER_USEC); 8977 do_div(rt_runtime_us, NSEC_PER_USEC);
8978 return rt_runtime_us; 8978 return rt_runtime_us;
8979 } 8979 }
8980 8980
8981 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 8981 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8982 { 8982 {
8983 u64 rt_runtime, rt_period; 8983 u64 rt_runtime, rt_period;
8984 8984
8985 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8985 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8986 rt_runtime = tg->rt_bandwidth.rt_runtime; 8986 rt_runtime = tg->rt_bandwidth.rt_runtime;
8987 8987
8988 if (rt_period == 0) 8988 if (rt_period == 0)
8989 return -EINVAL; 8989 return -EINVAL;
8990 8990
8991 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8991 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8992 } 8992 }
8993 8993
8994 long sched_group_rt_period(struct task_group *tg) 8994 long sched_group_rt_period(struct task_group *tg)
8995 { 8995 {
8996 u64 rt_period_us; 8996 u64 rt_period_us;
8997 8997
8998 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 8998 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8999 do_div(rt_period_us, NSEC_PER_USEC); 8999 do_div(rt_period_us, NSEC_PER_USEC);
9000 return rt_period_us; 9000 return rt_period_us;
9001 } 9001 }
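
The four helpers above translate between the microsecond values exposed to userspace and the nanosecond/ktime quantities kept in rt_bandwidth; a negative runtime is stored as RUNTIME_INF (no throttling) and reads back as -1. A hedged usage sketch, assuming tg points to a valid, non-root task group and ignoring most error handling:

/* Hedged sketch; 'tg' is assumed valid and not the root group. */
int err;

err = sched_group_set_rt_period(tg, 1000000);		/* period  = 1 s, in us */
if (!err)
	err = sched_group_set_rt_runtime(tg, 200000);	/* runtime = 200 ms, in us */

if (!err)
	pr_info("rt: %ld us every %ld us\n",		/* -1 runtime would mean RUNTIME_INF */
		sched_group_rt_runtime(tg), sched_group_rt_period(tg));
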
9002 9002
9003 static int sched_rt_global_constraints(void) 9003 static int sched_rt_global_constraints(void)
9004 { 9004 {
9005 u64 runtime, period; 9005 u64 runtime, period;
9006 int ret = 0; 9006 int ret = 0;
9007 9007
9008 if (sysctl_sched_rt_period <= 0) 9008 if (sysctl_sched_rt_period <= 0)
9009 return -EINVAL; 9009 return -EINVAL;
9010 9010
9011 runtime = global_rt_runtime(); 9011 runtime = global_rt_runtime();
9012 period = global_rt_period(); 9012 period = global_rt_period();
9013 9013
9014 /* 9014 /*
9015 * Sanity check on the sysctl variables. 9015 * Sanity check on the sysctl variables.
9016 */ 9016 */
9017 if (runtime > period && runtime != RUNTIME_INF) 9017 if (runtime > period && runtime != RUNTIME_INF)
9018 return -EINVAL; 9018 return -EINVAL;
9019 9019
9020 mutex_lock(&rt_constraints_mutex); 9020 mutex_lock(&rt_constraints_mutex);
9021 read_lock(&tasklist_lock); 9021 read_lock(&tasklist_lock);
9022 ret = __rt_schedulable(NULL, 0, 0); 9022 ret = __rt_schedulable(NULL, 0, 0);
9023 read_unlock(&tasklist_lock); 9023 read_unlock(&tasklist_lock);
9024 mutex_unlock(&rt_constraints_mutex); 9024 mutex_unlock(&rt_constraints_mutex);
9025 9025
9026 return ret; 9026 return ret;
9027 } 9027 }
9028 9028
9029 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 9029 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9030 { 9030 {
9031 /* Don't accept realtime tasks when there is no way for them to run */ 9031 /* Don't accept realtime tasks when there is no way for them to run */
9032 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 9032 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9033 return 0; 9033 return 0;
9034 9034
9035 return 1; 9035 return 1;
9036 } 9036 }
9037 9037
9038 #else /* !CONFIG_RT_GROUP_SCHED */ 9038 #else /* !CONFIG_RT_GROUP_SCHED */
9039 static int sched_rt_global_constraints(void) 9039 static int sched_rt_global_constraints(void)
9040 { 9040 {
9041 unsigned long flags; 9041 unsigned long flags;
9042 int i; 9042 int i;
9043 9043
9044 if (sysctl_sched_rt_period <= 0) 9044 if (sysctl_sched_rt_period <= 0)
9045 return -EINVAL; 9045 return -EINVAL;
9046 9046
9047 /* 9047 /*
9048 * There are always some RT tasks in the root group 9048 * There are always some RT tasks in the root group
9049 * -- migration, kstopmachine, etc. 9049 * -- migration, kstopmachine, etc.
9050 */ 9050 */
9051 if (sysctl_sched_rt_runtime == 0) 9051 if (sysctl_sched_rt_runtime == 0)
9052 return -EBUSY; 9052 return -EBUSY;
9053 9053
9054 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9054 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9055 for_each_possible_cpu(i) { 9055 for_each_possible_cpu(i) {
9056 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9056 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9057 9057
9058 raw_spin_lock(&rt_rq->rt_runtime_lock); 9058 raw_spin_lock(&rt_rq->rt_runtime_lock);
9059 rt_rq->rt_runtime = global_rt_runtime(); 9059 rt_rq->rt_runtime = global_rt_runtime();
9060 raw_spin_unlock(&rt_rq->rt_runtime_lock); 9060 raw_spin_unlock(&rt_rq->rt_runtime_lock);
9061 } 9061 }
9062 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 9062 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
9063 9063
9064 return 0; 9064 return 0;
9065 } 9065 }
9066 #endif /* CONFIG_RT_GROUP_SCHED */ 9066 #endif /* CONFIG_RT_GROUP_SCHED */
9067 9067
9068 int sched_rt_handler(struct ctl_table *table, int write, 9068 int sched_rt_handler(struct ctl_table *table, int write,
9069 void __user *buffer, size_t *lenp, 9069 void __user *buffer, size_t *lenp,
9070 loff_t *ppos) 9070 loff_t *ppos)
9071 { 9071 {
9072 int ret; 9072 int ret;
9073 int old_period, old_runtime; 9073 int old_period, old_runtime;
9074 static DEFINE_MUTEX(mutex); 9074 static DEFINE_MUTEX(mutex);
9075 9075
9076 mutex_lock(&mutex); 9076 mutex_lock(&mutex);
9077 old_period = sysctl_sched_rt_period; 9077 old_period = sysctl_sched_rt_period;
9078 old_runtime = sysctl_sched_rt_runtime; 9078 old_runtime = sysctl_sched_rt_runtime;
9079 9079
9080 ret = proc_dointvec(table, write, buffer, lenp, ppos); 9080 ret = proc_dointvec(table, write, buffer, lenp, ppos);
9081 9081
9082 if (!ret && write) { 9082 if (!ret && write) {
9083 ret = sched_rt_global_constraints(); 9083 ret = sched_rt_global_constraints();
9084 if (ret) { 9084 if (ret) {
9085 sysctl_sched_rt_period = old_period; 9085 sysctl_sched_rt_period = old_period;
9086 sysctl_sched_rt_runtime = old_runtime; 9086 sysctl_sched_rt_runtime = old_runtime;
9087 } else { 9087 } else {
9088 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 9088 def_rt_bandwidth.rt_runtime = global_rt_runtime();
9089 def_rt_bandwidth.rt_period = 9089 def_rt_bandwidth.rt_period =
9090 ns_to_ktime(global_rt_period()); 9090 ns_to_ktime(global_rt_period());
9091 } 9091 }
9092 } 9092 }
9093 mutex_unlock(&mutex); 9093 mutex_unlock(&mutex);
9094 9094
9095 return ret; 9095 return ret;
9096 } 9096 }
9097 9097
9098 #ifdef CONFIG_CGROUP_SCHED 9098 #ifdef CONFIG_CGROUP_SCHED
9099 9099
9100 /* return corresponding task_group object of a cgroup */ 9100 /* return corresponding task_group object of a cgroup */
9101 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 9101 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
9102 { 9102 {
9103 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 9103 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
9104 struct task_group, css); 9104 struct task_group, css);
9105 } 9105 }
9106 9106
9107 static struct cgroup_subsys_state * 9107 static struct cgroup_subsys_state *
9108 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 9108 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9109 { 9109 {
9110 struct task_group *tg, *parent; 9110 struct task_group *tg, *parent;
9111 9111
9112 if (!cgrp->parent) { 9112 if (!cgrp->parent) {
9113 /* This is early initialization for the top cgroup */ 9113 /* This is early initialization for the top cgroup */
9114 return &root_task_group.css; 9114 return &root_task_group.css;
9115 } 9115 }
9116 9116
9117 parent = cgroup_tg(cgrp->parent); 9117 parent = cgroup_tg(cgrp->parent);
9118 tg = sched_create_group(parent); 9118 tg = sched_create_group(parent);
9119 if (IS_ERR(tg)) 9119 if (IS_ERR(tg))
9120 return ERR_PTR(-ENOMEM); 9120 return ERR_PTR(-ENOMEM);
9121 9121
9122 return &tg->css; 9122 return &tg->css;
9123 } 9123 }
9124 9124
9125 static void 9125 static void
9126 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 9126 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9127 { 9127 {
9128 struct task_group *tg = cgroup_tg(cgrp); 9128 struct task_group *tg = cgroup_tg(cgrp);
9129 9129
9130 sched_destroy_group(tg); 9130 sched_destroy_group(tg);
9131 } 9131 }
9132 9132
9133 static int 9133 static int
9134 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 9134 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9135 { 9135 {
9136 #ifdef CONFIG_RT_GROUP_SCHED 9136 #ifdef CONFIG_RT_GROUP_SCHED
9137 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 9137 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
9138 return -EINVAL; 9138 return -EINVAL;
9139 #else 9139 #else
9140 /* We don't support RT-tasks being in separate groups */ 9140 /* We don't support RT-tasks being in separate groups */
9141 if (tsk->sched_class != &fair_sched_class) 9141 if (tsk->sched_class != &fair_sched_class)
9142 return -EINVAL; 9142 return -EINVAL;
9143 #endif 9143 #endif
9144 return 0; 9144 return 0;
9145 } 9145 }
9146 9146
9147 static int 9147 static int
9148 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9148 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9149 struct task_struct *tsk, bool threadgroup) 9149 struct task_struct *tsk, bool threadgroup)
9150 { 9150 {
9151 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 9151 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
9152 if (retval) 9152 if (retval)
9153 return retval; 9153 return retval;
9154 if (threadgroup) { 9154 if (threadgroup) {
9155 struct task_struct *c; 9155 struct task_struct *c;
9156 rcu_read_lock(); 9156 rcu_read_lock();
9157 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 9157 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9158 retval = cpu_cgroup_can_attach_task(cgrp, c); 9158 retval = cpu_cgroup_can_attach_task(cgrp, c);
9159 if (retval) { 9159 if (retval) {
9160 rcu_read_unlock(); 9160 rcu_read_unlock();
9161 return retval; 9161 return retval;
9162 } 9162 }
9163 } 9163 }
9164 rcu_read_unlock(); 9164 rcu_read_unlock();
9165 } 9165 }
9166 return 0; 9166 return 0;
9167 } 9167 }
9168 9168
9169 static void 9169 static void
9170 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9170 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9171 struct cgroup *old_cont, struct task_struct *tsk, 9171 struct cgroup *old_cont, struct task_struct *tsk,
9172 bool threadgroup) 9172 bool threadgroup)
9173 { 9173 {
9174 sched_move_task(tsk); 9174 sched_move_task(tsk);
9175 if (threadgroup) { 9175 if (threadgroup) {
9176 struct task_struct *c; 9176 struct task_struct *c;
9177 rcu_read_lock(); 9177 rcu_read_lock();
9178 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 9178 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9179 sched_move_task(c); 9179 sched_move_task(c);
9180 } 9180 }
9181 rcu_read_unlock(); 9181 rcu_read_unlock();
9182 } 9182 }
9183 } 9183 }
9184 9184
9185 static void 9185 static void
9186 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 9186 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9187 struct cgroup *old_cgrp, struct task_struct *task) 9187 struct cgroup *old_cgrp, struct task_struct *task)
9188 { 9188 {
9189 /* 9189 /*
9190 * cgroup_exit() is called in the copy_process() failure path. 9190 * cgroup_exit() is called in the copy_process() failure path.
9191 * Ignore this case since the task hasn't run yet; this avoids 9191 * Ignore this case since the task hasn't run yet; this avoids
9192 * trying to poke a half-freed task state from generic code. 9192 * trying to poke a half-freed task state from generic code.
9193 */ 9193 */
9194 if (!(task->flags & PF_EXITING)) 9194 if (!(task->flags & PF_EXITING))
9195 return; 9195 return;
9196 9196
9197 sched_move_task(task); 9197 sched_move_task(task);
9198 } 9198 }
9199 9199
9200 #ifdef CONFIG_FAIR_GROUP_SCHED 9200 #ifdef CONFIG_FAIR_GROUP_SCHED
9201 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 9201 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9202 u64 shareval) 9202 u64 shareval)
9203 { 9203 {
9204 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9204 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
9205 } 9205 }
9206 9206
9207 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 9207 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9208 { 9208 {
9209 struct task_group *tg = cgroup_tg(cgrp); 9209 struct task_group *tg = cgroup_tg(cgrp);
9210 9210
9211 return (u64) tg->shares; 9211 return (u64) tg->shares;
9212 } 9212 }
9213 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9213 #endif /* CONFIG_FAIR_GROUP_SCHED */
9214 9214
9215 #ifdef CONFIG_RT_GROUP_SCHED 9215 #ifdef CONFIG_RT_GROUP_SCHED
9216 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9216 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9217 s64 val) 9217 s64 val)
9218 { 9218 {
9219 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 9219 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9220 } 9220 }
9221 9221
9222 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 9222 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9223 { 9223 {
9224 return sched_group_rt_runtime(cgroup_tg(cgrp)); 9224 return sched_group_rt_runtime(cgroup_tg(cgrp));
9225 } 9225 }
9226 9226
9227 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 9227 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9228 u64 rt_period_us) 9228 u64 rt_period_us)
9229 { 9229 {
9230 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 9230 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9231 } 9231 }
9232 9232
9233 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 9233 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9234 { 9234 {
9235 return sched_group_rt_period(cgroup_tg(cgrp)); 9235 return sched_group_rt_period(cgroup_tg(cgrp));
9236 } 9236 }
9237 #endif /* CONFIG_RT_GROUP_SCHED */ 9237 #endif /* CONFIG_RT_GROUP_SCHED */
9238 9238
9239 static struct cftype cpu_files[] = { 9239 static struct cftype cpu_files[] = {
9240 #ifdef CONFIG_FAIR_GROUP_SCHED 9240 #ifdef CONFIG_FAIR_GROUP_SCHED
9241 { 9241 {
9242 .name = "shares", 9242 .name = "shares",
9243 .read_u64 = cpu_shares_read_u64, 9243 .read_u64 = cpu_shares_read_u64,
9244 .write_u64 = cpu_shares_write_u64, 9244 .write_u64 = cpu_shares_write_u64,
9245 }, 9245 },
9246 #endif 9246 #endif
9247 #ifdef CONFIG_RT_GROUP_SCHED 9247 #ifdef CONFIG_RT_GROUP_SCHED
9248 { 9248 {
9249 .name = "rt_runtime_us", 9249 .name = "rt_runtime_us",
9250 .read_s64 = cpu_rt_runtime_read, 9250 .read_s64 = cpu_rt_runtime_read,
9251 .write_s64 = cpu_rt_runtime_write, 9251 .write_s64 = cpu_rt_runtime_write,
9252 }, 9252 },
9253 { 9253 {
9254 .name = "rt_period_us", 9254 .name = "rt_period_us",
9255 .read_u64 = cpu_rt_period_read_uint, 9255 .read_u64 = cpu_rt_period_read_uint,
9256 .write_u64 = cpu_rt_period_write_uint, 9256 .write_u64 = cpu_rt_period_write_uint,
9257 }, 9257 },
9258 #endif 9258 #endif
9259 }; 9259 };
9260 9260
9261 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 9261 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9262 { 9262 {
9263 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 9263 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
9264 } 9264 }
9265 9265
9266 struct cgroup_subsys cpu_cgroup_subsys = { 9266 struct cgroup_subsys cpu_cgroup_subsys = {
9267 .name = "cpu", 9267 .name = "cpu",
9268 .create = cpu_cgroup_create, 9268 .create = cpu_cgroup_create,
9269 .destroy = cpu_cgroup_destroy, 9269 .destroy = cpu_cgroup_destroy,
9270 .can_attach = cpu_cgroup_can_attach, 9270 .can_attach = cpu_cgroup_can_attach,
9271 .attach = cpu_cgroup_attach, 9271 .attach = cpu_cgroup_attach,
9272 .exit = cpu_cgroup_exit, 9272 .exit = cpu_cgroup_exit,
9273 .populate = cpu_cgroup_populate, 9273 .populate = cpu_cgroup_populate,
9274 .subsys_id = cpu_cgroup_subsys_id, 9274 .subsys_id = cpu_cgroup_subsys_id,
9275 .early_init = 1, 9275 .early_init = 1,
9276 }; 9276 };
9277 9277
9278 #endif /* CONFIG_CGROUP_SCHED */ 9278 #endif /* CONFIG_CGROUP_SCHED */
9279 9279
9280 #ifdef CONFIG_CGROUP_CPUACCT
9281
9282 /*
9283  * CPU accounting code for task groups.
9284  *
9285  * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
9286  * (balbir@in.ibm.com).
9287  */
9288
9289 /* track cpu usage of a group of tasks and its child groups */
9290 struct cpuacct {
9291         struct cgroup_subsys_state css;
9292         /* cpuusage holds pointer to a u64-type object on every cpu */
9293         u64 __percpu *cpuusage;
9294         struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9295         struct cpuacct *parent;
9296 };
9297
9298 struct cgroup_subsys cpuacct_subsys;
9299
9300 /* return cpu accounting group corresponding to this container */
9301 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9302 {
9303         return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9304                             struct cpuacct, css);
9305 }
9306
9307 /* return cpu accounting group to which this task belongs */
9308 static inline struct cpuacct *task_ca(struct task_struct *tsk)
9309 {
9310         return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9311                             struct cpuacct, css);
9312 }
9313
9314 /* create a new cpu accounting group */
9315 static struct cgroup_subsys_state *cpuacct_create(
9316         struct cgroup_subsys *ss, struct cgroup *cgrp)
9317 {
9318         struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9319         int i;
9320
9321         if (!ca)
9322                 goto out;
9323
9324         ca->cpuusage = alloc_percpu(u64);
9325         if (!ca->cpuusage)
9326                 goto out_free_ca;
9327
9328         for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9329                 if (percpu_counter_init(&ca->cpustat[i], 0))
9330                         goto out_free_counters;
9331
9332         if (cgrp->parent)
9333                 ca->parent = cgroup_ca(cgrp->parent);
9334
9335         return &ca->css;
9336
9337 out_free_counters:
9338         while (--i >= 0)
9339                 percpu_counter_destroy(&ca->cpustat[i]);
9340         free_percpu(ca->cpuusage);
9341 out_free_ca:
9342         kfree(ca);
9343 out:
9344         return ERR_PTR(-ENOMEM);
9345 }
9346
9347 /* destroy an existing cpu accounting group */
9348 static void
9349 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9350 {
9351         struct cpuacct *ca = cgroup_ca(cgrp);
9352         int i;
9353
9354         for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9355                 percpu_counter_destroy(&ca->cpustat[i]);
9356         free_percpu(ca->cpuusage);
9357         kfree(ca);
9358 }
9359
9360 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9361 {
9362         u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9363         u64 data;
9364
9365 #ifndef CONFIG_64BIT
9366         /*
9367          * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9368          */
9369         raw_spin_lock_irq(&cpu_rq(cpu)->lock);
9370         data = *cpuusage;
9371         raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
9372 #else
9373         data = *cpuusage;
9374 #endif
9375
9376         return data;
9377 }
9378
9379 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9380 {
9381         u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9382
9383 #ifndef CONFIG_64BIT
9384         /*
9385          * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9386          */
9387         raw_spin_lock_irq(&cpu_rq(cpu)->lock);
9388         *cpuusage = val;
9389         raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
9390 #else
9391         *cpuusage = val;
9392 #endif
9393 }
9394
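The #ifndef CONFIG_64BIT branches in cpuacct_cpuusage_read() and cpuacct_cpuusage_write() exist because an aligned 64-bit load or store is not a single atomic access on 32-bit machines, so an unlocked reader could observe one half of an in-progress update. Below is a minimal userspace analogue of the same read/write discipline, not kernel code: a pthread mutex stands in for rq->lock and every name in it is made up for the illustration.

/* Userspace analogue only: the mutex plays the role of rq->lock so the
 * 64-bit counter is never seen half-updated, even on 32-bit targets.
 * Build with: cc -pthread demo.c */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t cpuusage;                          /* analogue of *cpuusage */
static pthread_mutex_t usage_lock = PTHREAD_MUTEX_INITIALIZER;

static uint64_t usage_read(void)
{
	uint64_t data;

	pthread_mutex_lock(&usage_lock);           /* cf. raw_spin_lock_irq() */
	data = cpuusage;
	pthread_mutex_unlock(&usage_lock);
	return data;
}

static void usage_write(uint64_t val)
{
	pthread_mutex_lock(&usage_lock);
	cpuusage = val;
	pthread_mutex_unlock(&usage_lock);
}

int main(void)
{
	usage_write(5000000000ULL);                /* needs both 32-bit halves */
	printf("%llu\n", (unsigned long long)usage_read());
	return 0;
}

On 64-bit builds the kernel skips the lock because the plain access is already atomic there; the sketch above locks unconditionally for brevity.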
9395 /* return total cpu usage (in nanoseconds) of a group */
9396 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9397 {
9398         struct cpuacct *ca = cgroup_ca(cgrp);
9399         u64 totalcpuusage = 0;
9400         int i;
9401
9402         for_each_present_cpu(i)
9403                 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9404
9405         return totalcpuusage;
9406 }
9407
9408 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9409                           u64 reset)
9410 {
9411         struct cpuacct *ca = cgroup_ca(cgrp);
9412         int err = 0;
9413         int i;
9414
9415         if (reset) {
9416                 err = -EINVAL;
9417                 goto out;
9418         }
9419
9420         for_each_present_cpu(i)
9421                 cpuacct_cpuusage_write(ca, i, 0);
9422
9423 out:
9424         return err;
9425 }
9426
9427 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9428                                    struct seq_file *m)
9429 {
9430         struct cpuacct *ca = cgroup_ca(cgroup);
9431         u64 percpu;
9432         int i;
9433
9434         for_each_present_cpu(i) {
9435                 percpu = cpuacct_cpuusage_read(ca, i);
9436                 seq_printf(m, "%llu ", (unsigned long long) percpu);
9437         }
9438         seq_printf(m, "\n");
9439         return 0;
9440 }
9441
9442 static const char *cpuacct_stat_desc[] = {
9443         [CPUACCT_STAT_USER] = "user",
9444         [CPUACCT_STAT_SYSTEM] = "system",
9445 };
9446
9447 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9448                               struct cgroup_map_cb *cb)
9449 {
9450         struct cpuacct *ca = cgroup_ca(cgrp);
9451         int i;
9452
9453         for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
9454                 s64 val = percpu_counter_read(&ca->cpustat[i]);
9455                 val = cputime64_to_clock_t(val);
9456                 cb->fill(cb, cpuacct_stat_desc[i], val);
9457         }
9458         return 0;
9459 }
9460
9461 static struct cftype files[] = {
9462         {
9463                 .name = "usage",
9464                 .read_u64 = cpuusage_read,
9465                 .write_u64 = cpuusage_write,
9466         },
9467         {
9468                 .name = "usage_percpu",
9469                 .read_seq_string = cpuacct_percpu_seq_read,
9470         },
9471         {
9472                 .name = "stat",
9473                 .read_map = cpuacct_stats_show,
9474         },
9475 };
9476
9477 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9478 {
9479         return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9480 }
9481
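cpuacct_populate() registers the three files above, so every cpuacct group directory exposes cpuacct.usage (total run time in nanoseconds), cpuacct.usage_percpu (one value per present CPU) and cpuacct.stat (user/system time in USER_HZ ticks). A hedged sketch of reading them from userspace follows; the /cgroup mount point is an assumption for the illustration, not something this code dictates.

/* Hedged userspace sketch, assuming:  mount -t cgroup -o cpuacct none /cgroup
 * The paths below are illustrative only. */
#include <stdio.h>

static void dump(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	while (fgets(line, sizeof(line), f))
		printf("%s: %s", path, line);
	fclose(f);
}

int main(void)
{
	dump("/cgroup/cpuacct.usage");          /* summed over CPUs by cpuusage_read() */
	dump("/cgroup/cpuacct.usage_percpu");   /* per-CPU values on one line */
	dump("/cgroup/cpuacct.stat");           /* "user N" and "system N" lines */
	return 0;
}

As cpuusage_write() above shows, writing 0 to cpuacct.usage resets the per-CPU counters; any other value is rejected with -EINVAL.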
9482 /*
9483  * charge this task's execution time to its accounting group.
9484  *
9485  * called with rq->lock held.
9486  */
9487 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9488 {
9489         struct cpuacct *ca;
9490         int cpu;
9491
9492         if (unlikely(!cpuacct_subsys.active))
9493                 return;
9494
9495         cpu = task_cpu(tsk);
9496
9497         rcu_read_lock();
9498
9499         ca = task_ca(tsk);
9500
9501         for (; ca; ca = ca->parent) {
9502                 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9503                 *cpuusage += cputime;
9504         }
9505
9506         rcu_read_unlock();
9507 }
9508
9509 /*
9510  * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9511  * in cputime_t units. As a result, cpuacct_update_stats calls
9512  * percpu_counter_add with values large enough to always overflow the
9513  * per cpu batch limit causing bad SMP scalability.
9514  *
9515  * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9516  * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9517  * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9518  */
9519 #ifdef CONFIG_SMP
9520 #define CPUACCT_BATCH \
9521         min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9522 #else
9523 #define CPUACCT_BATCH 0
9524 #endif
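To make the batch scaling above concrete, here is an illustrative calculation only; the percpu_counter_batch value and the fine-grained cputime_one_jiffy figure are assumptions picked for the example, not taken from any particular architecture.

/* Illustration of the CPUACCT_BATCH arithmetic with assumed inputs. */
#include <limits.h>
#include <stdio.h>

#define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
	long long percpu_counter_batch = 128;   /* assumed batch on a large SMP box */
	long long cputime_one_jiffy;
	long long batch;

	/* Tick-based accounting: one jiffy is one cputime unit, so the
	 * effective batch stays small. */
	cputime_one_jiffy = 1;
	batch = min_t(long long, percpu_counter_batch * cputime_one_jiffy, INT_MAX);
	printf("tick-based accounting:   batch = %lld\n", batch);   /* 128 */

	/* Fine-grained accounting: one jiffy may be tens of millions of
	 * cputime units (4e7 assumed here), so the product overshoots
	 * INT_MAX and the cap applies. */
	cputime_one_jiffy = 40000000;
	batch = min_t(long long, percpu_counter_batch * cputime_one_jiffy, INT_MAX);
	printf("fine-grained accounting: batch = %lld\n", batch);   /* INT_MAX */
	return 0;
}

Either way the batch only bounds when a CPU folds its local delta into the shared counter; a larger batch means fewer cross-CPU updates in cpuacct_update_stats() at the cost of more slack in percpu_counter_read().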
9525
9526 /*
9527  * Charge the system/user time to the task's accounting group.
9528  */
9529 static void cpuacct_update_stats(struct task_struct *tsk,
9530                                  enum cpuacct_stat_index idx, cputime_t val)
9531 {
9532         struct cpuacct *ca;
9533         int batch = CPUACCT_BATCH;
9534
9535         if (unlikely(!cpuacct_subsys.active))
9536                 return;
9537
9538         rcu_read_lock();
9539         ca = task_ca(tsk);
9540
9541         do {
9542                 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9543                 ca = ca->parent;
9544         } while (ca);
9545         rcu_read_unlock();
9546 }
9547
9548 struct cgroup_subsys cpuacct_subsys = {
9549         .name = "cpuacct",
9550         .create = cpuacct_create,
9551         .destroy = cpuacct_destroy,
9552         .populate = cpuacct_populate,
9553         .subsys_id = cpuacct_subsys_id,
9554 };
9555 #endif /* CONFIG_CGROUP_CPUACCT */
9556
9557