Commit 95c354fe9f7d6decc08a92aa26eb233ecc2155bf

Authored by Nick Piggin
Committed by Ingo Molnar
Parent: a95d67f87e

spinlock: lockbreak cleanup

The break_lock data structure and code for spinlocks are quite nasty.
Not only does break_lock double the size of a spinlock, but it also
changes locking to a potentially less optimal trylock.
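
For reference, the pre-cleanup scheme looked roughly like the sketch below
(simplified, not verbatim kernel code): every spinlock_t carries an extra
break_lock word, and the generic slow path is built around trylock so that
a waiter can advertise itself to the lock holder.

typedef struct {
	raw_spinlock_t	raw_lock;
	unsigned int	break_lock;	/* the extra word that doubles the lock */
	/* debug/lockdep fields elided */
} spinlock_t;

/* Simplified sketch of the old generic slow path, not the verbatim code. */
void _spin_lock(spinlock_t *lock)
{
	for (;;) {
		preempt_disable();
		if (likely(_raw_spin_trylock(lock)))
			return;			/* got it, return with preemption off */
		preempt_enable();
		lock->break_lock = 1;		/* tell the holder somebody is waiting */
		/* spin outside the lock until it looks free, then retry trylock */
		while (!__raw_spin_can_lock(&lock->raw_lock) && lock->break_lock)
			cpu_relax();
	}
}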

Put all of that under CONFIG_GENERIC_LOCKBREAK, and introduce a
__raw_spin_is_contended that uses the lock data itself to determine whether
there are waiters on the lock, to be used if CONFIG_GENERIC_LOCKBREAK is
not set.
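
A minimal sketch of how that split might look (names taken from the
description above; the ticket-style helper is purely illustrative and not
from this commit):

#ifdef CONFIG_GENERIC_LOCKBREAK
/* Old behaviour: contention is whatever waiters recorded in break_lock. */
#define spin_is_contended(lock)	((lock)->break_lock)
#else
/* New behaviour: let the architecture read it out of the lock word itself. */
#define spin_is_contended(lock)	__raw_spin_is_contended(&(lock)->raw_lock)
#endif

/*
 * Purely illustrative: with a hypothetical ticket-style raw lock (standing
 * in for raw_spinlock_t on such an architecture), whether anyone is waiting
 * falls straight out of the lock data.
 */
typedef struct {
	unsigned short owner;	/* ticket currently being served */
	unsigned short next;	/* next ticket to hand out */
} raw_ticketlock_t;

static inline int __raw_spin_is_contended(raw_ticketlock_t *lock)
{
	/* More than one ticket outstanding means at least one waiter. */
	return (unsigned short)(lock->next - lock->owner) > 1;
}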

Rename need_lockbreak to spin_needbreak, make it use spin_is_contended to
decouple it from the spinlock implementation, and make it typesafe (rwlocks
do not have any need_lockbreak sites -- why do they even get bloated up
with that break_lock then?).
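
Based on that description, the renamed helper might look roughly like this
(a sketch, not the verbatim patch; the CONFIG_PREEMPT guard is an assumption
consistent with the Kconfig dependency added below):

/* Sketch only: typesafe because it accepts nothing but a spinlock_t *. */
static inline int spin_needbreak(spinlock_t *lock)
{
#ifdef CONFIG_PREEMPT	/* assumption: lock breaking only matters when preemptible */
	return spin_is_contended(lock);
#else
	return 0;
#endif
}

Passing an rwlock_t to this is now a compile error, which is exactly the
typesafety the parenthetical above is after.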

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Showing 19 changed files with 72 additions and 37 deletions

1 # 1 #
2 # For a description of the syntax of this configuration file, 2 # For a description of the syntax of this configuration file,
3 # see Documentation/kbuild/kconfig-language.txt. 3 # see Documentation/kbuild/kconfig-language.txt.
4 # 4 #
5 5
6 mainmenu "Linux Kernel Configuration" 6 mainmenu "Linux Kernel Configuration"
7 7
8 config ARM 8 config ARM
9 bool 9 bool
10 default y 10 default y
11 select RTC_LIB 11 select RTC_LIB
12 select SYS_SUPPORTS_APM_EMULATION 12 select SYS_SUPPORTS_APM_EMULATION
13 help 13 help
14 The ARM series is a line of low-power-consumption RISC chip designs 14 The ARM series is a line of low-power-consumption RISC chip designs
15 licensed by ARM Ltd and targeted at embedded applications and 15 licensed by ARM Ltd and targeted at embedded applications and
16 handhelds such as the Compaq IPAQ. ARM-based PCs are no longer 16 handhelds such as the Compaq IPAQ. ARM-based PCs are no longer
17 manufactured, but legacy ARM-based PC hardware remains popular in 17 manufactured, but legacy ARM-based PC hardware remains popular in
18 Europe. There is an ARM Linux project with a web page at 18 Europe. There is an ARM Linux project with a web page at
19 <http://www.arm.linux.org.uk/>. 19 <http://www.arm.linux.org.uk/>.
20 20
21 config SYS_SUPPORTS_APM_EMULATION 21 config SYS_SUPPORTS_APM_EMULATION
22 bool 22 bool
23 23
24 config GENERIC_GPIO 24 config GENERIC_GPIO
25 bool 25 bool
26 default n 26 default n
27 27
28 config GENERIC_TIME 28 config GENERIC_TIME
29 bool 29 bool
30 default n 30 default n
31 31
32 config GENERIC_CLOCKEVENTS 32 config GENERIC_CLOCKEVENTS
33 bool 33 bool
34 default n 34 default n
35 35
36 config MMU 36 config MMU
37 bool 37 bool
38 default y 38 default y
39 39
40 config NO_IOPORT 40 config NO_IOPORT
41 bool 41 bool
42 default n 42 default n
43 43
44 config EISA 44 config EISA
45 bool 45 bool
46 ---help--- 46 ---help---
47 The Extended Industry Standard Architecture (EISA) bus was 47 The Extended Industry Standard Architecture (EISA) bus was
48 developed as an open alternative to the IBM MicroChannel bus. 48 developed as an open alternative to the IBM MicroChannel bus.
49 49
50 The EISA bus provided some of the features of the IBM MicroChannel 50 The EISA bus provided some of the features of the IBM MicroChannel
51 bus while maintaining backward compatibility with cards made for 51 bus while maintaining backward compatibility with cards made for
52 the older ISA bus. The EISA bus saw limited use between 1988 and 52 the older ISA bus. The EISA bus saw limited use between 1988 and
53 1995 when it was made obsolete by the PCI bus. 53 1995 when it was made obsolete by the PCI bus.
54 54
55 Say Y here if you are building a kernel for an EISA-based machine. 55 Say Y here if you are building a kernel for an EISA-based machine.
56 56
57 Otherwise, say N. 57 Otherwise, say N.
58 58
59 config SBUS 59 config SBUS
60 bool 60 bool
61 61
62 config MCA 62 config MCA
63 bool 63 bool
64 help 64 help
65 MicroChannel Architecture is found in some IBM PS/2 machines and 65 MicroChannel Architecture is found in some IBM PS/2 machines and
66 laptops. It is a bus system similar to PCI or ISA. See 66 laptops. It is a bus system similar to PCI or ISA. See
67 <file:Documentation/mca.txt> (and especially the web page given 67 <file:Documentation/mca.txt> (and especially the web page given
68 there) before attempting to build an MCA bus kernel. 68 there) before attempting to build an MCA bus kernel.
69 69
70 config GENERIC_HARDIRQS 70 config GENERIC_HARDIRQS
71 bool 71 bool
72 default y 72 default y
73 73
74 config STACKTRACE_SUPPORT 74 config STACKTRACE_SUPPORT
75 bool 75 bool
76 default y 76 default y
77 77
78 config LOCKDEP_SUPPORT 78 config LOCKDEP_SUPPORT
79 bool 79 bool
80 default y 80 default y
81 81
82 config TRACE_IRQFLAGS_SUPPORT 82 config TRACE_IRQFLAGS_SUPPORT
83 bool 83 bool
84 default y 84 default y
85 85
86 config HARDIRQS_SW_RESEND 86 config HARDIRQS_SW_RESEND
87 bool 87 bool
88 default y 88 default y
89 89
90 config GENERIC_IRQ_PROBE 90 config GENERIC_IRQ_PROBE
91 bool 91 bool
92 default y 92 default y
93 93
94 config GENERIC_LOCKBREAK
95 bool
96 default y
97 depends on SMP && PREEMPT
98
94 config RWSEM_GENERIC_SPINLOCK 99 config RWSEM_GENERIC_SPINLOCK
95 bool 100 bool
96 default y 101 default y
97 102
98 config RWSEM_XCHGADD_ALGORITHM 103 config RWSEM_XCHGADD_ALGORITHM
99 bool 104 bool
100 105
101 config ARCH_HAS_ILOG2_U32 106 config ARCH_HAS_ILOG2_U32
102 bool 107 bool
103 default n 108 default n
104 109
105 config ARCH_HAS_ILOG2_U64 110 config ARCH_HAS_ILOG2_U64
106 bool 111 bool
107 default n 112 default n
108 113
109 config GENERIC_HWEIGHT 114 config GENERIC_HWEIGHT
110 bool 115 bool
111 default y 116 default y
112 117
113 config GENERIC_CALIBRATE_DELAY 118 config GENERIC_CALIBRATE_DELAY
114 bool 119 bool
115 default y 120 default y
116 121
117 config ARCH_MAY_HAVE_PC_FDC 122 config ARCH_MAY_HAVE_PC_FDC
118 bool 123 bool
119 124
120 config ZONE_DMA 125 config ZONE_DMA
121 bool 126 bool
122 default y 127 default y
123 128
124 config GENERIC_ISA_DMA 129 config GENERIC_ISA_DMA
125 bool 130 bool
126 131
127 config FIQ 132 config FIQ
128 bool 133 bool
129 134
130 config ARCH_MTD_XIP 135 config ARCH_MTD_XIP
131 bool 136 bool
132 137
133 config VECTORS_BASE 138 config VECTORS_BASE
134 hex 139 hex
135 default 0xffff0000 if MMU || CPU_HIGH_VECTOR 140 default 0xffff0000 if MMU || CPU_HIGH_VECTOR
136 default DRAM_BASE if REMAP_VECTORS_TO_RAM 141 default DRAM_BASE if REMAP_VECTORS_TO_RAM
137 default 0x00000000 142 default 0x00000000
138 help 143 help
139 The base address of exception vectors. 144 The base address of exception vectors.
140 145
141 source "init/Kconfig" 146 source "init/Kconfig"
142 147
143 menu "System Type" 148 menu "System Type"
144 149
145 choice 150 choice
146 prompt "ARM system type" 151 prompt "ARM system type"
147 default ARCH_VERSATILE 152 default ARCH_VERSATILE
148 153
149 config ARCH_AAEC2000 154 config ARCH_AAEC2000
150 bool "Agilent AAEC-2000 based" 155 bool "Agilent AAEC-2000 based"
151 select ARM_AMBA 156 select ARM_AMBA
152 help 157 help
153 This enables support for systems based on the Agilent AAEC-2000 158 This enables support for systems based on the Agilent AAEC-2000
154 159
155 config ARCH_INTEGRATOR 160 config ARCH_INTEGRATOR
156 bool "ARM Ltd. Integrator family" 161 bool "ARM Ltd. Integrator family"
157 select ARM_AMBA 162 select ARM_AMBA
158 select ICST525 163 select ICST525
159 help 164 help
160 Support for ARM's Integrator platform. 165 Support for ARM's Integrator platform.
161 166
162 config ARCH_REALVIEW 167 config ARCH_REALVIEW
163 bool "ARM Ltd. RealView family" 168 bool "ARM Ltd. RealView family"
164 select ARM_AMBA 169 select ARM_AMBA
165 select ICST307 170 select ICST307
166 help 171 help
167 This enables support for ARM Ltd RealView boards. 172 This enables support for ARM Ltd RealView boards.
168 173
169 config ARCH_VERSATILE 174 config ARCH_VERSATILE
170 bool "ARM Ltd. Versatile family" 175 bool "ARM Ltd. Versatile family"
171 select ARM_AMBA 176 select ARM_AMBA
172 select ARM_VIC 177 select ARM_VIC
173 select ICST307 178 select ICST307
174 select GENERIC_TIME 179 select GENERIC_TIME
175 select GENERIC_CLOCKEVENTS 180 select GENERIC_CLOCKEVENTS
176 help 181 help
177 This enables support for ARM Ltd Versatile board. 182 This enables support for ARM Ltd Versatile board.
178 183
179 config ARCH_AT91 184 config ARCH_AT91
180 bool "Atmel AT91" 185 bool "Atmel AT91"
181 select GENERIC_GPIO 186 select GENERIC_GPIO
182 help 187 help
183 This enables support for systems based on the Atmel AT91RM9200, 188 This enables support for systems based on the Atmel AT91RM9200,
184 AT91SAM9 and AT91CAP9 processors. 189 AT91SAM9 and AT91CAP9 processors.
185 190
186 config ARCH_CLPS7500 191 config ARCH_CLPS7500
187 bool "Cirrus CL-PS7500FE" 192 bool "Cirrus CL-PS7500FE"
188 select TIMER_ACORN 193 select TIMER_ACORN
189 select ISA 194 select ISA
190 select NO_IOPORT 195 select NO_IOPORT
191 help 196 help
192 Support for the Cirrus Logic PS7500FE system-on-a-chip. 197 Support for the Cirrus Logic PS7500FE system-on-a-chip.
193 198
194 config ARCH_CLPS711X 199 config ARCH_CLPS711X
195 bool "Cirrus Logic CLPS711x/EP721x-based" 200 bool "Cirrus Logic CLPS711x/EP721x-based"
196 help 201 help
197 Support for Cirrus Logic 711x/721x based boards. 202 Support for Cirrus Logic 711x/721x based boards.
198 203
199 config ARCH_CO285 204 config ARCH_CO285
200 bool "Co-EBSA285" 205 bool "Co-EBSA285"
201 select FOOTBRIDGE 206 select FOOTBRIDGE
202 select FOOTBRIDGE_ADDIN 207 select FOOTBRIDGE_ADDIN
203 help 208 help
204 Support for Intel's EBSA285 companion chip. 209 Support for Intel's EBSA285 companion chip.
205 210
206 config ARCH_EBSA110 211 config ARCH_EBSA110
207 bool "EBSA-110" 212 bool "EBSA-110"
208 select ISA 213 select ISA
209 select NO_IOPORT 214 select NO_IOPORT
210 help 215 help
211 This is an evaluation board for the StrongARM processor available 216 This is an evaluation board for the StrongARM processor available
212 from Digital. It has limited hardware on-board, including an 217 from Digital. It has limited hardware on-board, including an
213 Ethernet interface, two PCMCIA sockets, two serial ports and a 218 Ethernet interface, two PCMCIA sockets, two serial ports and a
214 parallel port. 219 parallel port.
215 220
216 config ARCH_EP93XX 221 config ARCH_EP93XX
217 bool "EP93xx-based" 222 bool "EP93xx-based"
218 select ARM_AMBA 223 select ARM_AMBA
219 select ARM_VIC 224 select ARM_VIC
220 select GENERIC_GPIO 225 select GENERIC_GPIO
221 help 226 help
222 This enables support for the Cirrus EP93xx series of CPUs. 227 This enables support for the Cirrus EP93xx series of CPUs.
223 228
224 config ARCH_FOOTBRIDGE 229 config ARCH_FOOTBRIDGE
225 bool "FootBridge" 230 bool "FootBridge"
226 select FOOTBRIDGE 231 select FOOTBRIDGE
227 help 232 help
228 Support for systems based on the DC21285 companion chip 233 Support for systems based on the DC21285 companion chip
229 ("FootBridge"), such as the Simtec CATS and the Rebel NetWinder. 234 ("FootBridge"), such as the Simtec CATS and the Rebel NetWinder.
230 235
231 config ARCH_NETX 236 config ARCH_NETX
232 bool "Hilscher NetX based" 237 bool "Hilscher NetX based"
233 select ARM_VIC 238 select ARM_VIC
234 help 239 help
235 This enables support for systems based on the Hilscher NetX Soc 240 This enables support for systems based on the Hilscher NetX Soc
236 241
237 config ARCH_H720X 242 config ARCH_H720X
238 bool "Hynix HMS720x-based" 243 bool "Hynix HMS720x-based"
239 select ISA_DMA_API 244 select ISA_DMA_API
240 help 245 help
241 This enables support for systems based on the Hynix HMS720x 246 This enables support for systems based on the Hynix HMS720x
242 247
243 config ARCH_IMX 248 config ARCH_IMX
244 bool "IMX" 249 bool "IMX"
245 select GENERIC_GPIO 250 select GENERIC_GPIO
246 select GENERIC_TIME 251 select GENERIC_TIME
247 select GENERIC_CLOCKEVENTS 252 select GENERIC_CLOCKEVENTS
248 help 253 help
249 Support for Motorola's i.MX family of processors (MX1, MXL). 254 Support for Motorola's i.MX family of processors (MX1, MXL).
250 255
251 config ARCH_IOP13XX 256 config ARCH_IOP13XX
252 bool "IOP13xx-based" 257 bool "IOP13xx-based"
253 depends on MMU 258 depends on MMU
254 select PLAT_IOP 259 select PLAT_IOP
255 select PCI 260 select PCI
256 select ARCH_SUPPORTS_MSI 261 select ARCH_SUPPORTS_MSI
257 help 262 help
258 Support for Intel's IOP13XX (XScale) family of processors. 263 Support for Intel's IOP13XX (XScale) family of processors.
259 264
260 config ARCH_IOP32X 265 config ARCH_IOP32X
261 bool "IOP32x-based" 266 bool "IOP32x-based"
262 depends on MMU 267 depends on MMU
263 select PLAT_IOP 268 select PLAT_IOP
264 select PCI 269 select PCI
265 help 270 help
266 Support for Intel's 80219 and IOP32X (XScale) family of 271 Support for Intel's 80219 and IOP32X (XScale) family of
267 processors. 272 processors.
268 273
269 config ARCH_IOP33X 274 config ARCH_IOP33X
270 bool "IOP33x-based" 275 bool "IOP33x-based"
271 depends on MMU 276 depends on MMU
272 select PLAT_IOP 277 select PLAT_IOP
273 select PCI 278 select PCI
274 help 279 help
275 Support for Intel's IOP33X (XScale) family of processors. 280 Support for Intel's IOP33X (XScale) family of processors.
276 281
277 config ARCH_IXP23XX 282 config ARCH_IXP23XX
278 bool "IXP23XX-based" 283 bool "IXP23XX-based"
279 depends on MMU 284 depends on MMU
280 select PCI 285 select PCI
281 help 286 help
282 Support for Intel's IXP23xx (XScale) family of processors. 287 Support for Intel's IXP23xx (XScale) family of processors.
283 288
284 config ARCH_IXP2000 289 config ARCH_IXP2000
285 bool "IXP2400/2800-based" 290 bool "IXP2400/2800-based"
286 depends on MMU 291 depends on MMU
287 select PCI 292 select PCI
288 help 293 help
289 Support for Intel's IXP2400/2800 (XScale) family of processors. 294 Support for Intel's IXP2400/2800 (XScale) family of processors.
290 295
291 config ARCH_IXP4XX 296 config ARCH_IXP4XX
292 bool "IXP4xx-based" 297 bool "IXP4xx-based"
293 depends on MMU 298 depends on MMU
294 select GENERIC_GPIO 299 select GENERIC_GPIO
295 select GENERIC_TIME 300 select GENERIC_TIME
296 select GENERIC_CLOCKEVENTS 301 select GENERIC_CLOCKEVENTS
297 help 302 help
298 Support for Intel's IXP4XX (XScale) family of processors. 303 Support for Intel's IXP4XX (XScale) family of processors.
299 304
300 config ARCH_L7200 305 config ARCH_L7200
301 bool "LinkUp-L7200" 306 bool "LinkUp-L7200"
302 select FIQ 307 select FIQ
303 help 308 help
304 Say Y here if you intend to run this kernel on a LinkUp Systems 309 Say Y here if you intend to run this kernel on a LinkUp Systems
305 L7200 Software Development Board which uses an ARM720T processor. 310 L7200 Software Development Board which uses an ARM720T processor.
306 Information on this board can be obtained at: 311 Information on this board can be obtained at:
307 312
308 <http://www.linkupsys.com/> 313 <http://www.linkupsys.com/>
309 314
310 If you have any questions or comments about the Linux kernel port 315 If you have any questions or comments about the Linux kernel port
311 to this board, send e-mail to <sjhill@cotw.com>. 316 to this board, send e-mail to <sjhill@cotw.com>.
312 317
313 config ARCH_KS8695 318 config ARCH_KS8695
314 bool "Micrel/Kendin KS8695" 319 bool "Micrel/Kendin KS8695"
315 select GENERIC_GPIO 320 select GENERIC_GPIO
316 help 321 help
317 Support for Micrel/Kendin KS8695 "Centaur" (ARM922T) based 322 Support for Micrel/Kendin KS8695 "Centaur" (ARM922T) based
318 System-on-Chip devices. 323 System-on-Chip devices.
319 324
320 config ARCH_NS9XXX 325 config ARCH_NS9XXX
321 bool "NetSilicon NS9xxx" 326 bool "NetSilicon NS9xxx"
322 select GENERIC_GPIO 327 select GENERIC_GPIO
323 select GENERIC_TIME 328 select GENERIC_TIME
324 select GENERIC_CLOCKEVENTS 329 select GENERIC_CLOCKEVENTS
325 help 330 help
326 Say Y here if you intend to run this kernel on a NetSilicon NS9xxx 331 Say Y here if you intend to run this kernel on a NetSilicon NS9xxx
327 System. 332 System.
328 333
329 <http://www.digi.com/products/microprocessors/index.jsp> 334 <http://www.digi.com/products/microprocessors/index.jsp>
330 335
331 config ARCH_MXC 336 config ARCH_MXC
332 bool "Freescale MXC/iMX-based" 337 bool "Freescale MXC/iMX-based"
333 select ARCH_MTD_XIP 338 select ARCH_MTD_XIP
334 help 339 help
335 Support for Freescale MXC/iMX-based family of processors 340 Support for Freescale MXC/iMX-based family of processors
336 341
337 config ARCH_ORION 342 config ARCH_ORION
338 bool "Marvell Orion" 343 bool "Marvell Orion"
339 depends on MMU 344 depends on MMU
340 select PCI 345 select PCI
341 select GENERIC_GPIO 346 select GENERIC_GPIO
342 select GENERIC_TIME 347 select GENERIC_TIME
343 select GENERIC_CLOCKEVENTS 348 select GENERIC_CLOCKEVENTS
344 help 349 help
345 Support for Marvell Orion System on Chip family. 350 Support for Marvell Orion System on Chip family.
346 351
347 config ARCH_PNX4008 352 config ARCH_PNX4008
348 bool "Philips Nexperia PNX4008 Mobile" 353 bool "Philips Nexperia PNX4008 Mobile"
349 help 354 help
350 This enables support for Philips PNX4008 mobile platform. 355 This enables support for Philips PNX4008 mobile platform.
351 356
352 config ARCH_PXA 357 config ARCH_PXA
353 bool "PXA2xx/PXA3xx-based" 358 bool "PXA2xx/PXA3xx-based"
354 depends on MMU 359 depends on MMU
355 select ARCH_MTD_XIP 360 select ARCH_MTD_XIP
356 select GENERIC_GPIO 361 select GENERIC_GPIO
357 select GENERIC_TIME 362 select GENERIC_TIME
358 select GENERIC_CLOCKEVENTS 363 select GENERIC_CLOCKEVENTS
359 select TICK_ONESHOT 364 select TICK_ONESHOT
360 help 365 help
361 Support for Intel/Marvell's PXA2xx/PXA3xx processor line. 366 Support for Intel/Marvell's PXA2xx/PXA3xx processor line.
362 367
363 config ARCH_RPC 368 config ARCH_RPC
364 bool "RiscPC" 369 bool "RiscPC"
365 select ARCH_ACORN 370 select ARCH_ACORN
366 select FIQ 371 select FIQ
367 select TIMER_ACORN 372 select TIMER_ACORN
368 select ARCH_MAY_HAVE_PC_FDC 373 select ARCH_MAY_HAVE_PC_FDC
369 select ISA_DMA_API 374 select ISA_DMA_API
370 select NO_IOPORT 375 select NO_IOPORT
371 help 376 help
372 On the Acorn Risc-PC, Linux can support the internal IDE disk and 377 On the Acorn Risc-PC, Linux can support the internal IDE disk and
373 CD-ROM interface, serial and parallel port, and the floppy drive. 378 CD-ROM interface, serial and parallel port, and the floppy drive.
374 379
375 config ARCH_SA1100 380 config ARCH_SA1100
376 bool "SA1100-based" 381 bool "SA1100-based"
377 select ISA 382 select ISA
378 select ARCH_DISCONTIGMEM_ENABLE 383 select ARCH_DISCONTIGMEM_ENABLE
379 select ARCH_MTD_XIP 384 select ARCH_MTD_XIP
380 select GENERIC_GPIO 385 select GENERIC_GPIO
381 select GENERIC_TIME 386 select GENERIC_TIME
382 help 387 help
383 Support for StrongARM 11x0 based boards. 388 Support for StrongARM 11x0 based boards.
384 389
385 config ARCH_S3C2410 390 config ARCH_S3C2410
386 bool "Samsung S3C2410, S3C2412, S3C2413, S3C2440, S3C2442, S3C2443" 391 bool "Samsung S3C2410, S3C2412, S3C2413, S3C2440, S3C2442, S3C2443"
387 select GENERIC_GPIO 392 select GENERIC_GPIO
388 help 393 help
389 Samsung S3C2410X CPU based systems, such as the Simtec Electronics 394 Samsung S3C2410X CPU based systems, such as the Simtec Electronics
390 BAST (<http://www.simtec.co.uk/products/EB110ITX/>), the IPAQ 1940 or 395 BAST (<http://www.simtec.co.uk/products/EB110ITX/>), the IPAQ 1940 or
391 the Samsung SMDK2410 development board (and derivatives). 396 the Samsung SMDK2410 development board (and derivatives).
392 397
393 config ARCH_SHARK 398 config ARCH_SHARK
394 bool "Shark" 399 bool "Shark"
395 select ISA 400 select ISA
396 select ISA_DMA 401 select ISA_DMA
397 select PCI 402 select PCI
398 help 403 help
399 Support for the StrongARM based Digital DNARD machine, also known 404 Support for the StrongARM based Digital DNARD machine, also known
400 as "Shark" (<http://www.shark-linux.de/shark.html>). 405 as "Shark" (<http://www.shark-linux.de/shark.html>).
401 406
402 config ARCH_LH7A40X 407 config ARCH_LH7A40X
403 bool "Sharp LH7A40X" 408 bool "Sharp LH7A40X"
404 help 409 help
405 Say Y here for systems based on one of the Sharp LH7A40X 410 Say Y here for systems based on one of the Sharp LH7A40X
406 System on a Chip processors. These CPUs include an ARM922T 411 System on a Chip processors. These CPUs include an ARM922T
407 core with a wide array of integrated devices for 412 core with a wide array of integrated devices for
408 hand-held and low-power applications. 413 hand-held and low-power applications.
409 414
410 config ARCH_DAVINCI 415 config ARCH_DAVINCI
411 bool "TI DaVinci" 416 bool "TI DaVinci"
412 select GENERIC_TIME 417 select GENERIC_TIME
413 select GENERIC_CLOCKEVENTS 418 select GENERIC_CLOCKEVENTS
414 select GENERIC_GPIO 419 select GENERIC_GPIO
415 help 420 help
416 Support for TI's DaVinci platform. 421 Support for TI's DaVinci platform.
417 422
418 config ARCH_OMAP 423 config ARCH_OMAP
419 bool "TI OMAP" 424 bool "TI OMAP"
420 select GENERIC_GPIO 425 select GENERIC_GPIO
421 select GENERIC_TIME 426 select GENERIC_TIME
422 help 427 help
423 Support for TI's OMAP platform (OMAP1 and OMAP2). 428 Support for TI's OMAP platform (OMAP1 and OMAP2).
424 429
425 config ARCH_MSM7X00A 430 config ARCH_MSM7X00A
426 bool "Qualcomm MSM7X00A" 431 bool "Qualcomm MSM7X00A"
427 select GENERIC_TIME 432 select GENERIC_TIME
428 select GENERIC_CLOCKEVENTS 433 select GENERIC_CLOCKEVENTS
429 help 434 help
430 Support for Qualcomm MSM7X00A based systems. This runs on the ARM11 435 Support for Qualcomm MSM7X00A based systems. This runs on the ARM11
431 apps processor of the MSM7X00A and depends on a shared memory 436 apps processor of the MSM7X00A and depends on a shared memory
432 interface to the ARM9 modem processor which runs the baseband stack 437 interface to the ARM9 modem processor which runs the baseband stack
433 and controls some vital subsystems (clock and power control, etc). 438 and controls some vital subsystems (clock and power control, etc).
434 <http://www.cdmatech.com/products/msm7200_chipset_solution.jsp> 439 <http://www.cdmatech.com/products/msm7200_chipset_solution.jsp>
435 440
436 endchoice 441 endchoice
437 442
438 source "arch/arm/mach-clps711x/Kconfig" 443 source "arch/arm/mach-clps711x/Kconfig"
439 444
440 source "arch/arm/mach-ep93xx/Kconfig" 445 source "arch/arm/mach-ep93xx/Kconfig"
441 446
442 source "arch/arm/mach-footbridge/Kconfig" 447 source "arch/arm/mach-footbridge/Kconfig"
443 448
444 source "arch/arm/mach-integrator/Kconfig" 449 source "arch/arm/mach-integrator/Kconfig"
445 450
446 source "arch/arm/mach-iop32x/Kconfig" 451 source "arch/arm/mach-iop32x/Kconfig"
447 452
448 source "arch/arm/mach-iop33x/Kconfig" 453 source "arch/arm/mach-iop33x/Kconfig"
449 454
450 source "arch/arm/mach-iop13xx/Kconfig" 455 source "arch/arm/mach-iop13xx/Kconfig"
451 456
452 source "arch/arm/mach-ixp4xx/Kconfig" 457 source "arch/arm/mach-ixp4xx/Kconfig"
453 458
454 source "arch/arm/mach-ixp2000/Kconfig" 459 source "arch/arm/mach-ixp2000/Kconfig"
455 460
456 source "arch/arm/mach-ixp23xx/Kconfig" 461 source "arch/arm/mach-ixp23xx/Kconfig"
457 462
458 source "arch/arm/mach-pxa/Kconfig" 463 source "arch/arm/mach-pxa/Kconfig"
459 464
460 source "arch/arm/mach-sa1100/Kconfig" 465 source "arch/arm/mach-sa1100/Kconfig"
461 466
462 source "arch/arm/plat-omap/Kconfig" 467 source "arch/arm/plat-omap/Kconfig"
463 468
464 source "arch/arm/mach-omap1/Kconfig" 469 source "arch/arm/mach-omap1/Kconfig"
465 470
466 source "arch/arm/mach-omap2/Kconfig" 471 source "arch/arm/mach-omap2/Kconfig"
467 472
468 source "arch/arm/mach-orion/Kconfig" 473 source "arch/arm/mach-orion/Kconfig"
469 474
470 source "arch/arm/plat-s3c24xx/Kconfig" 475 source "arch/arm/plat-s3c24xx/Kconfig"
471 source "arch/arm/plat-s3c/Kconfig" 476 source "arch/arm/plat-s3c/Kconfig"
472 477
473 if ARCH_S3C2410 478 if ARCH_S3C2410
474 source "arch/arm/mach-s3c2400/Kconfig" 479 source "arch/arm/mach-s3c2400/Kconfig"
475 source "arch/arm/mach-s3c2410/Kconfig" 480 source "arch/arm/mach-s3c2410/Kconfig"
476 source "arch/arm/mach-s3c2412/Kconfig" 481 source "arch/arm/mach-s3c2412/Kconfig"
477 source "arch/arm/mach-s3c2440/Kconfig" 482 source "arch/arm/mach-s3c2440/Kconfig"
478 source "arch/arm/mach-s3c2442/Kconfig" 483 source "arch/arm/mach-s3c2442/Kconfig"
479 source "arch/arm/mach-s3c2443/Kconfig" 484 source "arch/arm/mach-s3c2443/Kconfig"
480 endif 485 endif
481 486
482 source "arch/arm/mach-lh7a40x/Kconfig" 487 source "arch/arm/mach-lh7a40x/Kconfig"
483 488
484 source "arch/arm/mach-imx/Kconfig" 489 source "arch/arm/mach-imx/Kconfig"
485 490
486 source "arch/arm/mach-h720x/Kconfig" 491 source "arch/arm/mach-h720x/Kconfig"
487 492
488 source "arch/arm/mach-versatile/Kconfig" 493 source "arch/arm/mach-versatile/Kconfig"
489 494
490 source "arch/arm/mach-aaec2000/Kconfig" 495 source "arch/arm/mach-aaec2000/Kconfig"
491 496
492 source "arch/arm/mach-realview/Kconfig" 497 source "arch/arm/mach-realview/Kconfig"
493 498
494 source "arch/arm/mach-at91/Kconfig" 499 source "arch/arm/mach-at91/Kconfig"
495 500
496 source "arch/arm/plat-mxc/Kconfig" 501 source "arch/arm/plat-mxc/Kconfig"
497 502
498 source "arch/arm/mach-netx/Kconfig" 503 source "arch/arm/mach-netx/Kconfig"
499 504
500 source "arch/arm/mach-ns9xxx/Kconfig" 505 source "arch/arm/mach-ns9xxx/Kconfig"
501 506
502 source "arch/arm/mach-davinci/Kconfig" 507 source "arch/arm/mach-davinci/Kconfig"
503 508
504 source "arch/arm/mach-ks8695/Kconfig" 509 source "arch/arm/mach-ks8695/Kconfig"
505 510
506 source "arch/arm/mach-msm/Kconfig" 511 source "arch/arm/mach-msm/Kconfig"
507 512
508 # Definitions to make life easier 513 # Definitions to make life easier
509 config ARCH_ACORN 514 config ARCH_ACORN
510 bool 515 bool
511 516
512 config PLAT_IOP 517 config PLAT_IOP
513 bool 518 bool
514 519
515 source arch/arm/mm/Kconfig 520 source arch/arm/mm/Kconfig
516 521
517 config IWMMXT 522 config IWMMXT
518 bool "Enable iWMMXt support" 523 bool "Enable iWMMXt support"
519 depends on CPU_XSCALE || CPU_XSC3 524 depends on CPU_XSCALE || CPU_XSC3
520 default y if PXA27x || PXA3xx 525 default y if PXA27x || PXA3xx
521 help 526 help
522 Enable support for iWMMXt context switching at run time if 527 Enable support for iWMMXt context switching at run time if
523 running on a CPU that supports it. 528 running on a CPU that supports it.
524 529
525 # bool 'Use XScale PMU as timer source' CONFIG_XSCALE_PMU_TIMER 530 # bool 'Use XScale PMU as timer source' CONFIG_XSCALE_PMU_TIMER
526 config XSCALE_PMU 531 config XSCALE_PMU
527 bool 532 bool
528 depends on CPU_XSCALE && !XSCALE_PMU_TIMER 533 depends on CPU_XSCALE && !XSCALE_PMU_TIMER
529 default y 534 default y
530 535
531 if !MMU 536 if !MMU
532 source "arch/arm/Kconfig-nommu" 537 source "arch/arm/Kconfig-nommu"
533 endif 538 endif
534 539
535 endmenu 540 endmenu
536 541
537 source "arch/arm/common/Kconfig" 542 source "arch/arm/common/Kconfig"
538 543
539 config FORCE_MAX_ZONEORDER 544 config FORCE_MAX_ZONEORDER
540 int 545 int
541 depends on SA1111 546 depends on SA1111
542 default "9" 547 default "9"
543 548
544 menu "Bus support" 549 menu "Bus support"
545 550
546 config ARM_AMBA 551 config ARM_AMBA
547 bool 552 bool
548 553
549 config ISA 554 config ISA
550 bool 555 bool
551 help 556 help
552 Find out whether you have ISA slots on your motherboard. ISA is the 557 Find out whether you have ISA slots on your motherboard. ISA is the
553 name of a bus system, i.e. the way the CPU talks to the other stuff 558 name of a bus system, i.e. the way the CPU talks to the other stuff
554 inside your box. Other bus systems are PCI, EISA, MicroChannel 559 inside your box. Other bus systems are PCI, EISA, MicroChannel
555 (MCA) or VESA. ISA is an older system, now being displaced by PCI; 560 (MCA) or VESA. ISA is an older system, now being displaced by PCI;
556 newer boards don't support it. If you have ISA, say Y, otherwise N. 561 newer boards don't support it. If you have ISA, say Y, otherwise N.
557 562
558 # Select ISA DMA controller support 563 # Select ISA DMA controller support
559 config ISA_DMA 564 config ISA_DMA
560 bool 565 bool
561 select ISA_DMA_API 566 select ISA_DMA_API
562 567
563 # Select ISA DMA interface 568 # Select ISA DMA interface
564 config ISA_DMA_API 569 config ISA_DMA_API
565 bool 570 bool
566 571
567 config PCI 572 config PCI
568 bool "PCI support" if ARCH_INTEGRATOR_AP || ARCH_VERSATILE_PB || ARCH_IXP4XX || ARCH_KS8695 || MACH_ARMCORE 573 bool "PCI support" if ARCH_INTEGRATOR_AP || ARCH_VERSATILE_PB || ARCH_IXP4XX || ARCH_KS8695 || MACH_ARMCORE
569 help 574 help
570 Find out whether you have a PCI motherboard. PCI is the name of a 575 Find out whether you have a PCI motherboard. PCI is the name of a
571 bus system, i.e. the way the CPU talks to the other stuff inside 576 bus system, i.e. the way the CPU talks to the other stuff inside
572 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or 577 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
573 VESA. If you have PCI, say Y, otherwise N. 578 VESA. If you have PCI, say Y, otherwise N.
574 579
575 The PCI-HOWTO, available from 580 The PCI-HOWTO, available from
576 <http://www.tldp.org/docs.html#howto>, contains valuable 581 <http://www.tldp.org/docs.html#howto>, contains valuable
577 information about which PCI hardware does work under Linux and which 582 information about which PCI hardware does work under Linux and which
578 doesn't. 583 doesn't.
579 584
580 config PCI_SYSCALL 585 config PCI_SYSCALL
581 def_bool PCI 586 def_bool PCI
582 587
583 # Select the host bridge type 588 # Select the host bridge type
584 config PCI_HOST_VIA82C505 589 config PCI_HOST_VIA82C505
585 bool 590 bool
586 depends on PCI && ARCH_SHARK 591 depends on PCI && ARCH_SHARK
587 default y 592 default y
588 593
589 config PCI_HOST_ITE8152 594 config PCI_HOST_ITE8152
590 bool 595 bool
591 depends on PCI && MACH_ARMCORE 596 depends on PCI && MACH_ARMCORE
592 default y 597 default y
593 select DMABOUNCE 598 select DMABOUNCE
594 599
595 source "drivers/pci/Kconfig" 600 source "drivers/pci/Kconfig"
596 601
597 source "drivers/pcmcia/Kconfig" 602 source "drivers/pcmcia/Kconfig"
598 603
599 endmenu 604 endmenu
600 605
601 menu "Kernel Features" 606 menu "Kernel Features"
602 607
603 source "kernel/time/Kconfig" 608 source "kernel/time/Kconfig"
604 609
605 config SMP 610 config SMP
606 bool "Symmetric Multi-Processing (EXPERIMENTAL)" 611 bool "Symmetric Multi-Processing (EXPERIMENTAL)"
607 depends on EXPERIMENTAL && REALVIEW_MPCORE 612 depends on EXPERIMENTAL && REALVIEW_MPCORE
608 help 613 help
609 This enables support for systems with more than one CPU. If you have 614 This enables support for systems with more than one CPU. If you have
610 a system with only one CPU, like most personal computers, say N. If 615 a system with only one CPU, like most personal computers, say N. If
611 you have a system with more than one CPU, say Y. 616 you have a system with more than one CPU, say Y.
612 617
613 If you say N here, the kernel will run on single and multiprocessor 618 If you say N here, the kernel will run on single and multiprocessor
614 machines, but will use only one CPU of a multiprocessor machine. If 619 machines, but will use only one CPU of a multiprocessor machine. If
615 you say Y here, the kernel will run on many, but not all, single 620 you say Y here, the kernel will run on many, but not all, single
616 processor machines. On a single processor machine, the kernel will 621 processor machines. On a single processor machine, the kernel will
617 run faster if you say N here. 622 run faster if you say N here.
618 623
619 See also the <file:Documentation/smp.txt>, 624 See also the <file:Documentation/smp.txt>,
620 <file:Documentation/i386/IO-APIC.txt>, 625 <file:Documentation/i386/IO-APIC.txt>,
621 <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at 626 <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
622 <http://www.linuxdoc.org/docs.html#howto>. 627 <http://www.linuxdoc.org/docs.html#howto>.
623 628
624 If you don't know what to do here, say N. 629 If you don't know what to do here, say N.
625 630
626 config NR_CPUS 631 config NR_CPUS
627 int "Maximum number of CPUs (2-32)" 632 int "Maximum number of CPUs (2-32)"
628 range 2 32 633 range 2 32
629 depends on SMP 634 depends on SMP
630 default "4" 635 default "4"
631 636
632 config HOTPLUG_CPU 637 config HOTPLUG_CPU
633 bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" 638 bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
634 depends on SMP && HOTPLUG && EXPERIMENTAL 639 depends on SMP && HOTPLUG && EXPERIMENTAL
635 help 640 help
636 Say Y here to experiment with turning CPUs off and on. CPUs 641 Say Y here to experiment with turning CPUs off and on. CPUs
637 can be controlled through /sys/devices/system/cpu. 642 can be controlled through /sys/devices/system/cpu.
638 643
639 config LOCAL_TIMERS 644 config LOCAL_TIMERS
640 bool "Use local timer interrupts" 645 bool "Use local timer interrupts"
641 depends on SMP && REALVIEW_MPCORE 646 depends on SMP && REALVIEW_MPCORE
642 default y 647 default y
643 help 648 help
644 Enable support for local timers on SMP platforms, rather then the 649 Enable support for local timers on SMP platforms, rather then the
645 legacy IPI broadcast method. Local timers allows the system 650 legacy IPI broadcast method. Local timers allows the system
646 accounting to be spread across the timer interval, preventing a 651 accounting to be spread across the timer interval, preventing a
647 "thundering herd" at every timer tick. 652 "thundering herd" at every timer tick.
648 653
649 config PREEMPT 654 config PREEMPT
650 bool "Preemptible Kernel (EXPERIMENTAL)" 655 bool "Preemptible Kernel (EXPERIMENTAL)"
651 depends on EXPERIMENTAL 656 depends on EXPERIMENTAL
652 help 657 help
653 This option reduces the latency of the kernel when reacting to 658 This option reduces the latency of the kernel when reacting to
654 real-time or interactive events by allowing a low priority process to 659 real-time or interactive events by allowing a low priority process to
655 be preempted even if it is in kernel mode executing a system call. 660 be preempted even if it is in kernel mode executing a system call.
656 This allows applications to run more reliably even when the system is 661 This allows applications to run more reliably even when the system is
657 under load. 662 under load.
658 663
659 Say Y here if you are building a kernel for a desktop, embedded 664 Say Y here if you are building a kernel for a desktop, embedded
660 or real-time system. Say N if you are unsure. 665 or real-time system. Say N if you are unsure.
661 666
662 config NO_IDLE_HZ 667 config NO_IDLE_HZ
663 bool "Dynamic tick timer" 668 bool "Dynamic tick timer"
664 depends on !GENERIC_CLOCKEVENTS 669 depends on !GENERIC_CLOCKEVENTS
665 help 670 help
666 Select this option if you want to disable continuous timer ticks 671 Select this option if you want to disable continuous timer ticks
667 and have them programmed to occur as required. This option saves 672 and have them programmed to occur as required. This option saves
668 power as the system can remain in idle state for longer. 673 power as the system can remain in idle state for longer.
669 674
670 By default dynamic tick is disabled during the boot, and can be 675 By default dynamic tick is disabled during the boot, and can be
671 manually enabled with: 676 manually enabled with:
672 677
673 echo 1 > /sys/devices/system/timer/timer0/dyn_tick 678 echo 1 > /sys/devices/system/timer/timer0/dyn_tick
674 679
675 Alternatively, if you want dynamic tick automatically enabled 680 Alternatively, if you want dynamic tick automatically enabled
676 during boot, pass "dyntick=enable" via the kernel command string. 681 during boot, pass "dyntick=enable" via the kernel command string.
677 682
678 Please note that dynamic tick may affect the accuracy of 683 Please note that dynamic tick may affect the accuracy of
679 timekeeping on some platforms depending on the implementation. 684 timekeeping on some platforms depending on the implementation.
680 Currently at least OMAP, PXA2xx and SA11x0 platforms are known 685 Currently at least OMAP, PXA2xx and SA11x0 platforms are known
681 to have accurate timekeeping with dynamic tick. 686 to have accurate timekeeping with dynamic tick.
682 687
683 config HZ 688 config HZ
684 int 689 int
685 default 128 if ARCH_L7200 690 default 128 if ARCH_L7200
686 default 200 if ARCH_EBSA110 || ARCH_S3C2410 691 default 200 if ARCH_EBSA110 || ARCH_S3C2410
687 default OMAP_32K_TIMER_HZ if ARCH_OMAP && OMAP_32K_TIMER 692 default OMAP_32K_TIMER_HZ if ARCH_OMAP && OMAP_32K_TIMER
688 default AT91_TIMER_HZ if ARCH_AT91 693 default AT91_TIMER_HZ if ARCH_AT91
689 default 100 694 default 100
690 695
691 config AEABI 696 config AEABI
692 bool "Use the ARM EABI to compile the kernel" 697 bool "Use the ARM EABI to compile the kernel"
693 help 698 help
694 This option allows for the kernel to be compiled using the latest 699 This option allows for the kernel to be compiled using the latest
695 ARM ABI (aka EABI). This is only useful if you are using a user 700 ARM ABI (aka EABI). This is only useful if you are using a user
696 space environment that is also compiled with EABI. 701 space environment that is also compiled with EABI.
697 702
698 Since there are major incompatibilities between the legacy ABI and 703 Since there are major incompatibilities between the legacy ABI and
699 EABI, especially with regard to structure member alignment, this 704 EABI, especially with regard to structure member alignment, this
700 option also changes the kernel syscall calling convention to 705 option also changes the kernel syscall calling convention to
701 disambiguate both ABIs and allow for backward compatibility support 706 disambiguate both ABIs and allow for backward compatibility support
702 (selected with CONFIG_OABI_COMPAT). 707 (selected with CONFIG_OABI_COMPAT).
703 708
704 To use this you need GCC version 4.0.0 or later. 709 To use this you need GCC version 4.0.0 or later.
705 710
706 config OABI_COMPAT 711 config OABI_COMPAT
707 bool "Allow old ABI binaries to run with this kernel (EXPERIMENTAL)" 712 bool "Allow old ABI binaries to run with this kernel (EXPERIMENTAL)"
708 depends on AEABI && EXPERIMENTAL 713 depends on AEABI && EXPERIMENTAL
709 default y 714 default y
710 help 715 help
711 This option preserves the old syscall interface along with the 716 This option preserves the old syscall interface along with the
712 new (ARM EABI) one. It also provides a compatibility layer to 717 new (ARM EABI) one. It also provides a compatibility layer to
713 intercept syscalls that have structure arguments which layout 718 intercept syscalls that have structure arguments which layout
714 in memory differs between the legacy ABI and the new ARM EABI 719 in memory differs between the legacy ABI and the new ARM EABI
715 (only for non "thumb" binaries). This option adds a tiny 720 (only for non "thumb" binaries). This option adds a tiny
716 overhead to all syscalls and produces a slightly larger kernel. 721 overhead to all syscalls and produces a slightly larger kernel.
717 If you know you'll be using only pure EABI user space then you 722 If you know you'll be using only pure EABI user space then you
718 can say N here. If this option is not selected and you attempt 723 can say N here. If this option is not selected and you attempt
719 to execute a legacy ABI binary then the result will be 724 to execute a legacy ABI binary then the result will be
720 UNPREDICTABLE (in fact it can be predicted that it won't work 725 UNPREDICTABLE (in fact it can be predicted that it won't work
721 at all). If in doubt say Y. 726 at all). If in doubt say Y.
722 727
723 config ARCH_DISCONTIGMEM_ENABLE 728 config ARCH_DISCONTIGMEM_ENABLE
724 bool 729 bool
725 default (ARCH_LH7A40X && !LH7A40X_CONTIGMEM) 730 default (ARCH_LH7A40X && !LH7A40X_CONTIGMEM)
726 help 731 help
727 Say Y to support efficient handling of discontiguous physical memory, 732 Say Y to support efficient handling of discontiguous physical memory,
728 for architectures which are either NUMA (Non-Uniform Memory Access) 733 for architectures which are either NUMA (Non-Uniform Memory Access)
729 or have huge holes in the physical address space for other reasons. 734 or have huge holes in the physical address space for other reasons.
730 See <file:Documentation/vm/numa> for more. 735 See <file:Documentation/vm/numa> for more.
731 736
732 config NODES_SHIFT 737 config NODES_SHIFT
733 int 738 int
734 default "4" if ARCH_LH7A40X 739 default "4" if ARCH_LH7A40X
735 default "2" 740 default "2"
736 depends on NEED_MULTIPLE_NODES 741 depends on NEED_MULTIPLE_NODES
737 742
738 source "mm/Kconfig" 743 source "mm/Kconfig"
739 744
740 config LEDS 745 config LEDS
741 bool "Timer and CPU usage LEDs" 746 bool "Timer and CPU usage LEDs"
742 depends on ARCH_CDB89712 || ARCH_CO285 || ARCH_EBSA110 || \ 747 depends on ARCH_CDB89712 || ARCH_CO285 || ARCH_EBSA110 || \
743 ARCH_EBSA285 || ARCH_IMX || ARCH_INTEGRATOR || \ 748 ARCH_EBSA285 || ARCH_IMX || ARCH_INTEGRATOR || \
744 ARCH_LUBBOCK || MACH_MAINSTONE || ARCH_NETWINDER || \ 749 ARCH_LUBBOCK || MACH_MAINSTONE || ARCH_NETWINDER || \
745 ARCH_OMAP || ARCH_P720T || ARCH_PXA_IDP || \ 750 ARCH_OMAP || ARCH_P720T || ARCH_PXA_IDP || \
746 ARCH_SA1100 || ARCH_SHARK || ARCH_VERSATILE || \ 751 ARCH_SA1100 || ARCH_SHARK || ARCH_VERSATILE || \
747 ARCH_AT91 || MACH_TRIZEPS4 || ARCH_DAVINCI || \ 752 ARCH_AT91 || MACH_TRIZEPS4 || ARCH_DAVINCI || \
748 ARCH_KS8695 || MACH_RD88F5182 753 ARCH_KS8695 || MACH_RD88F5182
749 help 754 help
750 If you say Y here, the LEDs on your machine will be used 755 If you say Y here, the LEDs on your machine will be used
751 to provide useful information about your current system status. 756 to provide useful information about your current system status.
752 757
753 If you are compiling a kernel for a NetWinder or EBSA-285, you will 758 If you are compiling a kernel for a NetWinder or EBSA-285, you will
754 be able to select which LEDs are active using the options below. If 759 be able to select which LEDs are active using the options below. If
755 you are compiling a kernel for the EBSA-110 or the LART however, the 760 you are compiling a kernel for the EBSA-110 or the LART however, the
756 red LED will simply flash regularly to indicate that the system is 761 red LED will simply flash regularly to indicate that the system is
757 still functional. It is safe to say Y here if you have a CATS 762 still functional. It is safe to say Y here if you have a CATS
758 system, but the driver will do nothing. 763 system, but the driver will do nothing.
759 764
760 config LEDS_TIMER 765 config LEDS_TIMER
761 bool "Timer LED" if (!ARCH_CDB89712 && !ARCH_OMAP) || \ 766 bool "Timer LED" if (!ARCH_CDB89712 && !ARCH_OMAP) || \
762 OMAP_OSK_MISTRAL || MACH_OMAP_H2 \ 767 OMAP_OSK_MISTRAL || MACH_OMAP_H2 \
763 || MACH_OMAP_PERSEUS2 768 || MACH_OMAP_PERSEUS2
764 depends on LEDS 769 depends on LEDS
765 depends on !GENERIC_CLOCKEVENTS 770 depends on !GENERIC_CLOCKEVENTS
766 default y if ARCH_EBSA110 771 default y if ARCH_EBSA110
767 help 772 help
768 If you say Y here, one of the system LEDs (the green one on the 773 If you say Y here, one of the system LEDs (the green one on the
769 NetWinder, the amber one on the EBSA285, or the red one on the LART) 774 NetWinder, the amber one on the EBSA285, or the red one on the LART)
770 will flash regularly to indicate that the system is still 775 will flash regularly to indicate that the system is still
771 operational. This is mainly useful to kernel hackers who are 776 operational. This is mainly useful to kernel hackers who are
772 debugging unstable kernels. 777 debugging unstable kernels.
773 778
774 The LART uses the same LED for both Timer LED and CPU usage LED 779 The LART uses the same LED for both Timer LED and CPU usage LED
775 functions. You may choose to use both, but the Timer LED function 780 functions. You may choose to use both, but the Timer LED function
776 will overrule the CPU usage LED. 781 will overrule the CPU usage LED.
777 782
778 config LEDS_CPU 783 config LEDS_CPU
779 bool "CPU usage LED" if (!ARCH_CDB89712 && !ARCH_EBSA110 && \ 784 bool "CPU usage LED" if (!ARCH_CDB89712 && !ARCH_EBSA110 && \
780 !ARCH_OMAP) \ 785 !ARCH_OMAP) \
781 || OMAP_OSK_MISTRAL || MACH_OMAP_H2 \ 786 || OMAP_OSK_MISTRAL || MACH_OMAP_H2 \
782 || MACH_OMAP_PERSEUS2 787 || MACH_OMAP_PERSEUS2
783 depends on LEDS 788 depends on LEDS
784 help 789 help
785 If you say Y here, the red LED will be used to give a good real 790 If you say Y here, the red LED will be used to give a good real
786 time indication of CPU usage, by lighting whenever the idle task 791 time indication of CPU usage, by lighting whenever the idle task
787 is not currently executing. 792 is not currently executing.
788 793
789 The LART uses the same LED for both Timer LED and CPU usage LED 794 The LART uses the same LED for both Timer LED and CPU usage LED
790 functions. You may choose to use both, but the Timer LED function 795 functions. You may choose to use both, but the Timer LED function
791 will overrule the CPU usage LED. 796 will overrule the CPU usage LED.
792 797
793 config ALIGNMENT_TRAP 798 config ALIGNMENT_TRAP
794 bool 799 bool
795 depends on CPU_CP15_MMU 800 depends on CPU_CP15_MMU
796 default y if !ARCH_EBSA110 801 default y if !ARCH_EBSA110
797 help 802 help
798 ARM processors cannot fetch/store information which is not 803 ARM processors cannot fetch/store information which is not
799 naturally aligned on the bus, i.e., a 4 byte fetch must start at an 804 naturally aligned on the bus, i.e., a 4 byte fetch must start at an
800 address divisible by 4. On 32-bit ARM processors, these non-aligned 805 address divisible by 4. On 32-bit ARM processors, these non-aligned
801 fetch/store instructions will be emulated in software if you say 806 fetch/store instructions will be emulated in software if you say
802 here, which has a severe performance impact. This is necessary for 807 here, which has a severe performance impact. This is necessary for
803 correct operation of some network protocols. With an IP-only 808 correct operation of some network protocols. With an IP-only
804 configuration it is safe to say N, otherwise say Y. 809 configuration it is safe to say N, otherwise say Y.
805 810
806 endmenu 811 endmenu
807 812
808 menu "Boot options" 813 menu "Boot options"
809 814
810 # Compressed boot loader in ROM. Yes, we really want to ask about 815 # Compressed boot loader in ROM. Yes, we really want to ask about
811 # TEXT and BSS so we preserve their values in the config files. 816 # TEXT and BSS so we preserve their values in the config files.
812 config ZBOOT_ROM_TEXT 817 config ZBOOT_ROM_TEXT
813 hex "Compressed ROM boot loader base address" 818 hex "Compressed ROM boot loader base address"
814 default "0" 819 default "0"
815 help 820 help
816 The physical address at which the ROM-able zImage is to be 821 The physical address at which the ROM-able zImage is to be
817 placed in the target. Platforms which normally make use of 822 placed in the target. Platforms which normally make use of
818 ROM-able zImage formats normally set this to a suitable 823 ROM-able zImage formats normally set this to a suitable
819 value in their defconfig file. 824 value in their defconfig file.
820 825
821 If ZBOOT_ROM is not enabled, this has no effect. 826 If ZBOOT_ROM is not enabled, this has no effect.
822 827
823 config ZBOOT_ROM_BSS 828 config ZBOOT_ROM_BSS
824 hex "Compressed ROM boot loader BSS address" 829 hex "Compressed ROM boot loader BSS address"
825 default "0" 830 default "0"
826 help 831 help
827 The base address of an area of read/write memory in the target 832 The base address of an area of read/write memory in the target
828 for the ROM-able zImage which must be available while the 833 for the ROM-able zImage which must be available while the
829 decompressor is running. It must be large enough to hold the 834 decompressor is running. It must be large enough to hold the
830 entire decompressed kernel plus an additional 128 KiB. 835 entire decompressed kernel plus an additional 128 KiB.
831 Platforms which normally make use of ROM-able zImage formats 836 Platforms which normally make use of ROM-able zImage formats
832 normally set this to a suitable value in their defconfig file. 837 normally set this to a suitable value in their defconfig file.
833 838
834 If ZBOOT_ROM is not enabled, this has no effect. 839 If ZBOOT_ROM is not enabled, this has no effect.
835 840
836 config ZBOOT_ROM 841 config ZBOOT_ROM
837 bool "Compressed boot loader in ROM/flash" 842 bool "Compressed boot loader in ROM/flash"
838 depends on ZBOOT_ROM_TEXT != ZBOOT_ROM_BSS 843 depends on ZBOOT_ROM_TEXT != ZBOOT_ROM_BSS
839 help 844 help
840 Say Y here if you intend to execute your compressed kernel image 845 Say Y here if you intend to execute your compressed kernel image
841 (zImage) directly from ROM or flash. If unsure, say N. 846 (zImage) directly from ROM or flash. If unsure, say N.
842 847
843 config CMDLINE 848 config CMDLINE
844 string "Default kernel command string" 849 string "Default kernel command string"
845 default "" 850 default ""
846 help 851 help
847 On some architectures (EBSA110 and CATS), there is currently no way 852 On some architectures (EBSA110 and CATS), there is currently no way
848 for the boot loader to pass arguments to the kernel. For these 853 for the boot loader to pass arguments to the kernel. For these
849 architectures, you should supply some command-line options at build 854 architectures, you should supply some command-line options at build
850 time by entering them here. As a minimum, you should specify the 855 time by entering them here. As a minimum, you should specify the
851 memory size and the root device (e.g., mem=64M root=/dev/nfs). 856 memory size and the root device (e.g., mem=64M root=/dev/nfs).
852 857
853 config XIP_KERNEL 858 config XIP_KERNEL
854 bool "Kernel Execute-In-Place from ROM" 859 bool "Kernel Execute-In-Place from ROM"
855 depends on !ZBOOT_ROM 860 depends on !ZBOOT_ROM
856 help 861 help
857 Execute-In-Place allows the kernel to run from non-volatile storage 862 Execute-In-Place allows the kernel to run from non-volatile storage
858 directly addressable by the CPU, such as NOR flash. This saves RAM 863 directly addressable by the CPU, such as NOR flash. This saves RAM
859 space since the text section of the kernel is not loaded from flash 864 space since the text section of the kernel is not loaded from flash
860 to RAM. Read-write sections, such as the data section and stack, 865 to RAM. Read-write sections, such as the data section and stack,
861 are still copied to RAM. The XIP kernel is not compressed since 866 are still copied to RAM. The XIP kernel is not compressed since
862 it has to run directly from flash, so it will take more space to 867 it has to run directly from flash, so it will take more space to
863 store it. The flash address used to link the kernel object files, 868 store it. The flash address used to link the kernel object files,
864 and for storing it, is configuration dependent. Therefore, if you 869 and for storing it, is configuration dependent. Therefore, if you
865 say Y here, you must know the proper physical address where to 870 say Y here, you must know the proper physical address where to
866 store the kernel image depending on your own flash memory usage. 871 store the kernel image depending on your own flash memory usage.
867 872
868 Also note that the make target becomes "make xipImage" rather than 873 Also note that the make target becomes "make xipImage" rather than
869 "make zImage" or "make Image". The final kernel binary to put in 874 "make zImage" or "make Image". The final kernel binary to put in
870 ROM memory will be arch/arm/boot/xipImage. 875 ROM memory will be arch/arm/boot/xipImage.
871 876
872 If unsure, say N. 877 If unsure, say N.
873 878
874 config XIP_PHYS_ADDR 879 config XIP_PHYS_ADDR
875 hex "XIP Kernel Physical Location" 880 hex "XIP Kernel Physical Location"
876 depends on XIP_KERNEL 881 depends on XIP_KERNEL
877 default "0x00080000" 882 default "0x00080000"
878 help 883 help
879 This is the physical address in your flash memory the kernel will 884 This is the physical address in your flash memory the kernel will
880 be linked for and stored to. This address is dependent on your 885 be linked for and stored to. This address is dependent on your
881 own flash usage. 886 own flash usage.
882 887
883 config KEXEC 888 config KEXEC
884 bool "Kexec system call (EXPERIMENTAL)" 889 bool "Kexec system call (EXPERIMENTAL)"
885 depends on EXPERIMENTAL 890 depends on EXPERIMENTAL
886 help 891 help
887 kexec is a system call that implements the ability to shutdown your 892 kexec is a system call that implements the ability to shutdown your
888 current kernel, and to start another kernel. It is like a reboot 893 current kernel, and to start another kernel. It is like a reboot
889 but it is independent of the system firmware. And like a reboot 894 but it is independent of the system firmware. And like a reboot
890 you can start any kernel with it, not just Linux. 895 you can start any kernel with it, not just Linux.
891 896
892 It is an ongoing process to be certain the hardware in a machine 897 It is an ongoing process to be certain the hardware in a machine
893 is properly shutdown, so do not be surprised if this code does not 898 is properly shutdown, so do not be surprised if this code does not
894 initially work for you. It may help to enable device hotplugging 899 initially work for you. It may help to enable device hotplugging
895 support. 900 support.
896 901
897 endmenu 902 endmenu
898 903
899 if (ARCH_SA1100 || ARCH_INTEGRATOR || ARCH_OMAP || ARCH_IMX || ARCH_PXA) 904 if (ARCH_SA1100 || ARCH_INTEGRATOR || ARCH_OMAP || ARCH_IMX || ARCH_PXA)
900 905
901 menu "CPU Frequency scaling" 906 menu "CPU Frequency scaling"
902 907
903 source "drivers/cpufreq/Kconfig" 908 source "drivers/cpufreq/Kconfig"
904 909
905 config CPU_FREQ_SA1100 910 config CPU_FREQ_SA1100
906 bool 911 bool
907 depends on CPU_FREQ && (SA1100_H3100 || SA1100_H3600 || SA1100_H3800 || SA1100_LART || SA1100_PLEB || SA1100_BADGE4 || SA1100_HACKKIT) 912 depends on CPU_FREQ && (SA1100_H3100 || SA1100_H3600 || SA1100_H3800 || SA1100_LART || SA1100_PLEB || SA1100_BADGE4 || SA1100_HACKKIT)
908 default y 913 default y
909 914
910 config CPU_FREQ_SA1110 915 config CPU_FREQ_SA1110
911 bool 916 bool
912 depends on CPU_FREQ && (SA1100_ASSABET || SA1100_CERF || SA1100_PT_SYSTEM3) 917 depends on CPU_FREQ && (SA1100_ASSABET || SA1100_CERF || SA1100_PT_SYSTEM3)
913 default y 918 default y
914 919
915 config CPU_FREQ_INTEGRATOR 920 config CPU_FREQ_INTEGRATOR
916 tristate "CPUfreq driver for ARM Integrator CPUs" 921 tristate "CPUfreq driver for ARM Integrator CPUs"
917 depends on ARCH_INTEGRATOR && CPU_FREQ 922 depends on ARCH_INTEGRATOR && CPU_FREQ
918 default y 923 default y
919 help 924 help
920 This enables the CPUfreq driver for ARM Integrator CPUs. 925 This enables the CPUfreq driver for ARM Integrator CPUs.
921 926
922 For details, take a look at <file:Documentation/cpu-freq>. 927 For details, take a look at <file:Documentation/cpu-freq>.
923 928
924 If in doubt, say Y. 929 If in doubt, say Y.
925 930
926 config CPU_FREQ_IMX 931 config CPU_FREQ_IMX
927 tristate "CPUfreq driver for i.MX CPUs" 932 tristate "CPUfreq driver for i.MX CPUs"
928 depends on ARCH_IMX && CPU_FREQ 933 depends on ARCH_IMX && CPU_FREQ
929 default n 934 default n
930 help 935 help
931 This enables the CPUfreq driver for i.MX CPUs. 936 This enables the CPUfreq driver for i.MX CPUs.
932 937
933 If in doubt, say N. 938 If in doubt, say N.
934 939
935 config CPU_FREQ_PXA 940 config CPU_FREQ_PXA
936 bool 941 bool
937 depends on CPU_FREQ && ARCH_PXA && PXA25x 942 depends on CPU_FREQ && ARCH_PXA && PXA25x
938 default y 943 default y
939 select CPU_FREQ_DEFAULT_GOV_USERSPACE 944 select CPU_FREQ_DEFAULT_GOV_USERSPACE
940 945
941 endmenu 946 endmenu
942 947
943 endif 948 endif
944 949
945 menu "Floating point emulation" 950 menu "Floating point emulation"
946 951
947 comment "At least one emulation must be selected" 952 comment "At least one emulation must be selected"
948 953
949 config FPE_NWFPE 954 config FPE_NWFPE
950 bool "NWFPE math emulation" 955 bool "NWFPE math emulation"
951 depends on !AEABI || OABI_COMPAT 956 depends on !AEABI || OABI_COMPAT
952 ---help--- 957 ---help---
953 Say Y to include the NWFPE floating point emulator in the kernel. 958 Say Y to include the NWFPE floating point emulator in the kernel.
954 This is necessary to run most binaries. Linux does not currently 959 This is necessary to run most binaries. Linux does not currently
955 support floating point hardware so you need to say Y here even if 960 support floating point hardware so you need to say Y here even if
956 your machine has an FPA or floating point co-processor podule. 961 your machine has an FPA or floating point co-processor podule.
957 962
958 You may say N here if you are going to load the Acorn FPEmulator 963 You may say N here if you are going to load the Acorn FPEmulator
959 early in the bootup. 964 early in the bootup.
960 965
961 config FPE_NWFPE_XP 966 config FPE_NWFPE_XP
962 bool "Support extended precision" 967 bool "Support extended precision"
963 depends on FPE_NWFPE 968 depends on FPE_NWFPE
964 help 969 help
965 Say Y to include 80-bit support in the kernel floating-point 970 Say Y to include 80-bit support in the kernel floating-point
966 emulator. Otherwise, only 32 and 64-bit support is compiled in. 971 emulator. Otherwise, only 32 and 64-bit support is compiled in.
967 Note that gcc does not generate 80-bit operations by default, 972 Note that gcc does not generate 80-bit operations by default,
968 so in most cases this option only enlarges the size of the 973 so in most cases this option only enlarges the size of the
969 floating point emulator without any good reason. 974 floating point emulator without any good reason.
970 975
971 You almost surely want to say N here. 976 You almost surely want to say N here.
972 977
973 config FPE_FASTFPE 978 config FPE_FASTFPE
974 bool "FastFPE math emulation (EXPERIMENTAL)" 979 bool "FastFPE math emulation (EXPERIMENTAL)"
975 depends on (!AEABI || OABI_COMPAT) && !CPU_32v3 && EXPERIMENTAL 980 depends on (!AEABI || OABI_COMPAT) && !CPU_32v3 && EXPERIMENTAL
976 ---help--- 981 ---help---
977 Say Y here to include the FAST floating point emulator in the kernel. 982 Say Y here to include the FAST floating point emulator in the kernel.
978 This is an experimental, much faster emulator which now also has full 983 This is an experimental, much faster emulator which now also has full
979 precision for the mantissa. It does not support any exceptions. 984 precision for the mantissa. It does not support any exceptions.
980 It is very simple, and approximately 3-6 times faster than NWFPE. 985 It is very simple, and approximately 3-6 times faster than NWFPE.
981 986
982 It should be sufficient for most programs. It may not be suitable 987 It should be sufficient for most programs. It may not be suitable
983 for scientific calculations, but you have to check this for yourself. 988 for scientific calculations, but you have to check this for yourself.
984 If you do not feel you need faster FP emulation, you should choose 989 If you do not feel you need faster FP emulation, you should choose
985 NWFPE instead. 990 NWFPE instead.
986 991
987 config VFP 992 config VFP
988 bool "VFP-format floating point maths" 993 bool "VFP-format floating point maths"
989 depends on CPU_V6 || CPU_ARM926T || CPU_V7 || CPU_FEROCEON 994 depends on CPU_V6 || CPU_ARM926T || CPU_V7 || CPU_FEROCEON
990 help 995 help
991 Say Y to include VFP support code in the kernel. This is needed 996 Say Y to include VFP support code in the kernel. This is needed
992 if your hardware includes a VFP unit. 997 if your hardware includes a VFP unit.
993 998
994 Please see <file:Documentation/arm/VFP/release-notes.txt> for 999 Please see <file:Documentation/arm/VFP/release-notes.txt> for
995 release notes and additional status information. 1000 release notes and additional status information.
996 1001
997 Say N if your target does not have VFP hardware. 1002 Say N if your target does not have VFP hardware.
998 1003
999 config VFPv3 1004 config VFPv3
1000 bool 1005 bool
1001 depends on VFP 1006 depends on VFP
1002 default y if CPU_V7 1007 default y if CPU_V7
1003 1008
1004 config NEON 1009 config NEON
1005 bool "Advanced SIMD (NEON) Extension support" 1010 bool "Advanced SIMD (NEON) Extension support"
1006 depends on VFPv3 && CPU_V7 1011 depends on VFPv3 && CPU_V7
1007 help 1012 help
1008 Say Y to include support code for NEON, the ARMv7 Advanced SIMD 1013 Say Y to include support code for NEON, the ARMv7 Advanced SIMD
1009 Extension. 1014 Extension.
1010 1015
1011 endmenu 1016 endmenu
1012 1017
1013 menu "Userspace binary formats" 1018 menu "Userspace binary formats"
1014 1019
1015 source "fs/Kconfig.binfmt" 1020 source "fs/Kconfig.binfmt"
1016 1021
1017 config ARTHUR 1022 config ARTHUR
1018 tristate "RISC OS personality" 1023 tristate "RISC OS personality"
1019 depends on !AEABI 1024 depends on !AEABI
1020 help 1025 help
1021 Say Y here to include the kernel code necessary if you want to run 1026 Say Y here to include the kernel code necessary if you want to run
1022 Acorn RISC OS/Arthur binaries under Linux. This code is still very 1027 Acorn RISC OS/Arthur binaries under Linux. This code is still very
1023 experimental; if this sounds frightening, say N and sleep in peace. 1028 experimental; if this sounds frightening, say N and sleep in peace.
1024 You can also say M here to compile this support as a module (which 1029 You can also say M here to compile this support as a module (which
1025 will be called arthur). 1030 will be called arthur).
1026 1031
1027 endmenu 1032 endmenu
1028 1033
1029 menu "Power management options" 1034 menu "Power management options"
1030 1035
1031 source "kernel/power/Kconfig" 1036 source "kernel/power/Kconfig"
1032 1037
1033 endmenu 1038 endmenu
1034 1039
1035 source "net/Kconfig" 1040 source "net/Kconfig"
1036 1041
1037 menu "Device Drivers" 1042 menu "Device Drivers"
1038 1043
1039 source "drivers/base/Kconfig" 1044 source "drivers/base/Kconfig"
1040 1045
1041 source "drivers/connector/Kconfig" 1046 source "drivers/connector/Kconfig"
1042 1047
1043 if ALIGNMENT_TRAP || !CPU_CP15_MMU 1048 if ALIGNMENT_TRAP || !CPU_CP15_MMU
1044 source "drivers/mtd/Kconfig" 1049 source "drivers/mtd/Kconfig"
1045 endif 1050 endif
1046 1051
1047 source "drivers/parport/Kconfig" 1052 source "drivers/parport/Kconfig"
1048 1053
1049 source "drivers/pnp/Kconfig" 1054 source "drivers/pnp/Kconfig"
1050 1055
1051 source "drivers/block/Kconfig" 1056 source "drivers/block/Kconfig"
1052 1057
1053 # misc before ide - BLK_DEV_SGIIOC4 depends on SGI_IOC4 1058 # misc before ide - BLK_DEV_SGIIOC4 depends on SGI_IOC4
1054 1059
1055 source "drivers/misc/Kconfig" 1060 source "drivers/misc/Kconfig"
1056 1061
1057 if PCMCIA || ARCH_CLPS7500 || ARCH_IOP32X || ARCH_IOP33X || ARCH_IXP4XX \ 1062 if PCMCIA || ARCH_CLPS7500 || ARCH_IOP32X || ARCH_IOP33X || ARCH_IXP4XX \
1058 || ARCH_L7200 || ARCH_LH7A40X || ARCH_PXA || ARCH_RPC \ 1063 || ARCH_L7200 || ARCH_LH7A40X || ARCH_PXA || ARCH_RPC \
1059 || ARCH_S3C2410 || ARCH_SA1100 || ARCH_SHARK || FOOTBRIDGE \ 1064 || ARCH_S3C2410 || ARCH_SA1100 || ARCH_SHARK || FOOTBRIDGE \
1060 || ARCH_IXP23XX 1065 || ARCH_IXP23XX
1061 source "drivers/ide/Kconfig" 1066 source "drivers/ide/Kconfig"
1062 endif 1067 endif
1063 1068
1064 source "drivers/scsi/Kconfig" 1069 source "drivers/scsi/Kconfig"
1065 1070
1066 source "drivers/ata/Kconfig" 1071 source "drivers/ata/Kconfig"
1067 1072
1068 source "drivers/md/Kconfig" 1073 source "drivers/md/Kconfig"
1069 1074
1070 source "drivers/message/fusion/Kconfig" 1075 source "drivers/message/fusion/Kconfig"
1071 1076
1072 source "drivers/ieee1394/Kconfig" 1077 source "drivers/ieee1394/Kconfig"
1073 1078
1074 source "drivers/message/i2o/Kconfig" 1079 source "drivers/message/i2o/Kconfig"
1075 1080
1076 source "drivers/net/Kconfig" 1081 source "drivers/net/Kconfig"
1077 1082
1078 source "drivers/isdn/Kconfig" 1083 source "drivers/isdn/Kconfig"
1079 1084
1080 # input before char - char/joystick depends on it. As does USB. 1085 # input before char - char/joystick depends on it. As does USB.
1081 1086
1082 source "drivers/input/Kconfig" 1087 source "drivers/input/Kconfig"
1083 1088
1084 source "drivers/char/Kconfig" 1089 source "drivers/char/Kconfig"
1085 1090
1086 source "drivers/i2c/Kconfig" 1091 source "drivers/i2c/Kconfig"
1087 1092
1088 source "drivers/spi/Kconfig" 1093 source "drivers/spi/Kconfig"
1089 1094
1090 source "drivers/w1/Kconfig" 1095 source "drivers/w1/Kconfig"
1091 1096
1092 source "drivers/power/Kconfig" 1097 source "drivers/power/Kconfig"
1093 1098
1094 source "drivers/hwmon/Kconfig" 1099 source "drivers/hwmon/Kconfig"
1095 1100
1096 source "drivers/watchdog/Kconfig" 1101 source "drivers/watchdog/Kconfig"
1097 1102
1098 source "drivers/ssb/Kconfig" 1103 source "drivers/ssb/Kconfig"
1099 1104
1100 #source "drivers/l3/Kconfig" 1105 #source "drivers/l3/Kconfig"
1101 1106
1102 source "drivers/mfd/Kconfig" 1107 source "drivers/mfd/Kconfig"
1103 1108
1104 source "drivers/media/Kconfig" 1109 source "drivers/media/Kconfig"
1105 1110
1106 source "drivers/video/Kconfig" 1111 source "drivers/video/Kconfig"
1107 1112
1108 source "sound/Kconfig" 1113 source "sound/Kconfig"
1109 1114
1110 source "drivers/hid/Kconfig" 1115 source "drivers/hid/Kconfig"
1111 1116
1112 source "drivers/usb/Kconfig" 1117 source "drivers/usb/Kconfig"
1113 1118
1114 source "drivers/mmc/Kconfig" 1119 source "drivers/mmc/Kconfig"
1115 1120
1116 source "drivers/leds/Kconfig" 1121 source "drivers/leds/Kconfig"
1117 1122
1118 source "drivers/rtc/Kconfig" 1123 source "drivers/rtc/Kconfig"
1119 1124
1120 source "drivers/dma/Kconfig" 1125 source "drivers/dma/Kconfig"
1121 1126
1122 source "drivers/dca/Kconfig" 1127 source "drivers/dca/Kconfig"
1123 1128
1124 endmenu 1129 endmenu
1125 1130
1126 source "fs/Kconfig" 1131 source "fs/Kconfig"
1127 1132
1128 source "arch/arm/Kconfig.instrumentation" 1133 source "arch/arm/Kconfig.instrumentation"
1129 1134
1130 source "arch/arm/Kconfig.debug" 1135 source "arch/arm/Kconfig.debug"
1131 1136
1132 source "security/Kconfig" 1137 source "security/Kconfig"
1133 1138
1134 source "crypto/Kconfig" 1139 source "crypto/Kconfig"
1135 1140
1136 source "lib/Kconfig" 1141 source "lib/Kconfig"
1137 1142
1 # 1 #
2 # For a description of the syntax of this configuration file, 2 # For a description of the syntax of this configuration file,
3 # see Documentation/kbuild/kconfig-language.txt. 3 # see Documentation/kbuild/kconfig-language.txt.
4 # 4 #
5 5
6 mainmenu "IA-64 Linux Kernel Configuration" 6 mainmenu "IA-64 Linux Kernel Configuration"
7 7
8 source "init/Kconfig" 8 source "init/Kconfig"
9 9
10 menu "Processor type and features" 10 menu "Processor type and features"
11 11
12 config IA64 12 config IA64
13 bool 13 bool
14 select PCI if (!IA64_HP_SIM) 14 select PCI if (!IA64_HP_SIM)
15 select ACPI if (!IA64_HP_SIM) 15 select ACPI if (!IA64_HP_SIM)
16 select PM if (!IA64_HP_SIM) 16 select PM if (!IA64_HP_SIM)
17 select ARCH_SUPPORTS_MSI 17 select ARCH_SUPPORTS_MSI
18 default y 18 default y
19 help 19 help
20 The Itanium Processor Family is Intel's 64-bit successor to 20 The Itanium Processor Family is Intel's 64-bit successor to
21 the 32-bit X86 line. The IA-64 Linux project has a home 21 the 32-bit X86 line. The IA-64 Linux project has a home
22 page at <http://www.linuxia64.org/> and a mailing list at 22 page at <http://www.linuxia64.org/> and a mailing list at
23 <linux-ia64@vger.kernel.org>. 23 <linux-ia64@vger.kernel.org>.
24 24
25 config 64BIT 25 config 64BIT
26 bool 26 bool
27 select ATA_NONSTANDARD if ATA 27 select ATA_NONSTANDARD if ATA
28 default y 28 default y
29 29
30 config ZONE_DMA 30 config ZONE_DMA
31 def_bool y 31 def_bool y
32 depends on !IA64_SGI_SN2 32 depends on !IA64_SGI_SN2
33 33
34 config QUICKLIST 34 config QUICKLIST
35 bool 35 bool
36 default y 36 default y
37 37
38 config MMU 38 config MMU
39 bool 39 bool
40 default y 40 default y
41 41
42 config SWIOTLB 42 config SWIOTLB
43 bool 43 bool
44 44
45 config GENERIC_LOCKBREAK
46 bool
47 default y
48 depends on SMP && PREEMPT
49
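GENERIC_LOCKBREAK is only set for SMP && PREEMPT kernels, i.e. the configurations in which breaking out of a held spinlock can actually help. A minimal C sketch, assuming the generic spinlock_t layout where break_lock exists only under this option, of how spin_is_contended() and spin_needbreak() can be layered on top of it (illustrative, not a verbatim copy of the kernel headers):

    #ifdef CONFIG_GENERIC_LOCKBREAK
    /* break_lock is a separate "someone is waiting" flag kept next to the lock */
    #define spin_is_contended(lock)	((lock)->break_lock)
    #else
    /* otherwise ask the raw lock word itself whether there are waiters */
    #define spin_is_contended(lock)	__raw_spin_is_contended(&(lock)->raw_lock)
    #endif

    static inline int spin_needbreak(spinlock_t *lock)
    {
    #ifdef CONFIG_PREEMPT
    	return spin_is_contended(lock);
    #else
    	return 0;	/* never break a lock on non-preemptible kernels */
    #endif
    }

Architectures that leave GENERIC_LOCKBREAK unset avoid carrying the extra break_lock field and instead report contention from the raw lock data itself.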
45 config RWSEM_XCHGADD_ALGORITHM 50 config RWSEM_XCHGADD_ALGORITHM
46 bool 51 bool
47 default y 52 default y
48 53
49 config ARCH_HAS_ILOG2_U32 54 config ARCH_HAS_ILOG2_U32
50 bool 55 bool
51 default n 56 default n
52 57
53 config ARCH_HAS_ILOG2_U64 58 config ARCH_HAS_ILOG2_U64
54 bool 59 bool
55 default n 60 default n
56 61
57 config HUGETLB_PAGE_SIZE_VARIABLE 62 config HUGETLB_PAGE_SIZE_VARIABLE
58 bool 63 bool
59 depends on HUGETLB_PAGE 64 depends on HUGETLB_PAGE
60 default y 65 default y
61 66
62 config GENERIC_FIND_NEXT_BIT 67 config GENERIC_FIND_NEXT_BIT
63 bool 68 bool
64 default y 69 default y
65 70
66 config GENERIC_CALIBRATE_DELAY 71 config GENERIC_CALIBRATE_DELAY
67 bool 72 bool
68 default y 73 default y
69 74
70 config GENERIC_TIME 75 config GENERIC_TIME
71 bool 76 bool
72 default y 77 default y
73 78
74 config GENERIC_TIME_VSYSCALL 79 config GENERIC_TIME_VSYSCALL
75 bool 80 bool
76 default y 81 default y
77 82
78 config DMI 83 config DMI
79 bool 84 bool
80 default y 85 default y
81 86
82 config EFI 87 config EFI
83 bool 88 bool
84 default y 89 default y
85 90
86 config GENERIC_IOMAP 91 config GENERIC_IOMAP
87 bool 92 bool
88 default y 93 default y
89 94
90 config SCHED_NO_NO_OMIT_FRAME_POINTER 95 config SCHED_NO_NO_OMIT_FRAME_POINTER
91 bool 96 bool
92 default y 97 default y
93 98
94 config IA64_UNCACHED_ALLOCATOR 99 config IA64_UNCACHED_ALLOCATOR
95 bool 100 bool
96 select GENERIC_ALLOCATOR 101 select GENERIC_ALLOCATOR
97 102
98 config AUDIT_ARCH 103 config AUDIT_ARCH
99 bool 104 bool
100 default y 105 default y
101 106
102 choice 107 choice
103 prompt "System type" 108 prompt "System type"
104 default IA64_GENERIC 109 default IA64_GENERIC
105 110
106 config IA64_GENERIC 111 config IA64_GENERIC
107 bool "generic" 112 bool "generic"
108 select NUMA 113 select NUMA
109 select ACPI_NUMA 114 select ACPI_NUMA
110 select SWIOTLB 115 select SWIOTLB
111 help 116 help
112 This selects the system type of your hardware. A "generic" kernel 117 This selects the system type of your hardware. A "generic" kernel
113 will run on any supported IA-64 system. However, if you configure 118 will run on any supported IA-64 system. However, if you configure
114 a kernel for your specific system, it will be faster and smaller. 119 a kernel for your specific system, it will be faster and smaller.
115 120
116 generic For any supported IA-64 system 121 generic For any supported IA-64 system
117 DIG-compliant For DIG ("Developer's Interface Guide") compliant systems 122 DIG-compliant For DIG ("Developer's Interface Guide") compliant systems
118 HP-zx1/sx1000 For HP systems 123 HP-zx1/sx1000 For HP systems
119 HP-zx1/sx1000+swiotlb For HP systems with (broken) DMA-constrained devices. 124 HP-zx1/sx1000+swiotlb For HP systems with (broken) DMA-constrained devices.
120 SGI-SN2 For SGI Altix systems 125 SGI-SN2 For SGI Altix systems
121 Ski-simulator For the HP simulator <http://www.hpl.hp.com/research/linux/ski/> 126 Ski-simulator For the HP simulator <http://www.hpl.hp.com/research/linux/ski/>
122 127
123 If you don't know what to do, choose "generic". 128 If you don't know what to do, choose "generic".
124 129
125 config IA64_DIG 130 config IA64_DIG
126 bool "DIG-compliant" 131 bool "DIG-compliant"
127 select SWIOTLB 132 select SWIOTLB
128 133
129 config IA64_HP_ZX1 134 config IA64_HP_ZX1
130 bool "HP-zx1/sx1000" 135 bool "HP-zx1/sx1000"
131 help 136 help
132 Build a kernel that runs on HP zx1 and sx1000 systems. This adds 137 Build a kernel that runs on HP zx1 and sx1000 systems. This adds
133 support for the HP I/O MMU. 138 support for the HP I/O MMU.
134 139
135 config IA64_HP_ZX1_SWIOTLB 140 config IA64_HP_ZX1_SWIOTLB
136 bool "HP-zx1/sx1000 with software I/O TLB" 141 bool "HP-zx1/sx1000 with software I/O TLB"
137 select SWIOTLB 142 select SWIOTLB
138 help 143 help
139 Build a kernel that runs on HP zx1 and sx1000 systems even when they 144 Build a kernel that runs on HP zx1 and sx1000 systems even when they
140 have broken PCI devices which cannot DMA to full 32 bits. Apart 145 have broken PCI devices which cannot DMA to full 32 bits. Apart
141 from support for the HP I/O MMU, this includes support for the software 146 from support for the HP I/O MMU, this includes support for the software
142 I/O TLB, which allows supporting the broken devices at the expense of 147 I/O TLB, which allows supporting the broken devices at the expense of
143 wasting some kernel memory (about 2MB by default). 148 wasting some kernel memory (about 2MB by default).
144 149
145 config IA64_SGI_SN2 150 config IA64_SGI_SN2
146 bool "SGI-SN2" 151 bool "SGI-SN2"
147 help 152 help
148 Selecting this option will optimize the kernel for use on sn2 based 153 Selecting this option will optimize the kernel for use on sn2 based
149 systems, but the resulting kernel binary will not run on other 154 systems, but the resulting kernel binary will not run on other
150 types of ia64 systems. If you have an SGI Altix system, it's safe 155 types of ia64 systems. If you have an SGI Altix system, it's safe
151 to select this option. If in doubt, select ia64 generic support 156 to select this option. If in doubt, select ia64 generic support
152 instead. 157 instead.
153 158
154 config IA64_HP_SIM 159 config IA64_HP_SIM
155 bool "Ski-simulator" 160 bool "Ski-simulator"
156 select SWIOTLB 161 select SWIOTLB
157 162
158 endchoice 163 endchoice
159 164
160 choice 165 choice
161 prompt "Processor type" 166 prompt "Processor type"
162 default ITANIUM 167 default ITANIUM
163 168
164 config ITANIUM 169 config ITANIUM
165 bool "Itanium" 170 bool "Itanium"
166 help 171 help
167 Select your IA-64 processor type. The default is Itanium. 172 Select your IA-64 processor type. The default is Itanium.
168 This choice is safe for all IA-64 systems, but may not perform 173 This choice is safe for all IA-64 systems, but may not perform
169 optimally on systems with, say, Itanium 2 or newer processors. 174 optimally on systems with, say, Itanium 2 or newer processors.
170 175
171 config MCKINLEY 176 config MCKINLEY
172 bool "Itanium 2" 177 bool "Itanium 2"
173 help 178 help
174 Select this to configure for an Itanium 2 (McKinley) processor. 179 Select this to configure for an Itanium 2 (McKinley) processor.
175 180
176 endchoice 181 endchoice
177 182
178 choice 183 choice
179 prompt "Kernel page size" 184 prompt "Kernel page size"
180 default IA64_PAGE_SIZE_16KB 185 default IA64_PAGE_SIZE_16KB
181 186
182 config IA64_PAGE_SIZE_4KB 187 config IA64_PAGE_SIZE_4KB
183 bool "4KB" 188 bool "4KB"
184 help 189 help
185 This lets you select the page size of the kernel. For best IA-64 190 This lets you select the page size of the kernel. For best IA-64
186 performance, a page size of 8KB or 16KB is recommended. For best 191 performance, a page size of 8KB or 16KB is recommended. For best
187 IA-32 compatibility, a page size of 4KB should be selected (the vast 192 IA-32 compatibility, a page size of 4KB should be selected (the vast
188 majority of IA-32 binaries work perfectly fine with a larger page 193 majority of IA-32 binaries work perfectly fine with a larger page
189 size). For Itanium 2 or newer systems, a page size of 64KB can also 194 size). For Itanium 2 or newer systems, a page size of 64KB can also
190 be selected. 195 be selected.
191 196
192 4KB For best IA-32 compatibility 197 4KB For best IA-32 compatibility
193 8KB For best IA-64 performance 198 8KB For best IA-64 performance
194 16KB For best IA-64 performance 199 16KB For best IA-64 performance
195 64KB Requires Itanium 2 or newer processor. 200 64KB Requires Itanium 2 or newer processor.
196 201
197 If you don't know what to do, choose 16KB. 202 If you don't know what to do, choose 16KB.
198 203
199 config IA64_PAGE_SIZE_8KB 204 config IA64_PAGE_SIZE_8KB
200 bool "8KB" 205 bool "8KB"
201 206
202 config IA64_PAGE_SIZE_16KB 207 config IA64_PAGE_SIZE_16KB
203 bool "16KB" 208 bool "16KB"
204 209
205 config IA64_PAGE_SIZE_64KB 210 config IA64_PAGE_SIZE_64KB
206 depends on !ITANIUM 211 depends on !ITANIUM
207 bool "64KB" 212 bool "64KB"
208 213
209 endchoice 214 endchoice
210 215
211 choice 216 choice
212 prompt "Page Table Levels" 217 prompt "Page Table Levels"
213 default PGTABLE_3 218 default PGTABLE_3
214 219
215 config PGTABLE_3 220 config PGTABLE_3
216 bool "3 Levels" 221 bool "3 Levels"
217 222
218 config PGTABLE_4 223 config PGTABLE_4
219 depends on !IA64_PAGE_SIZE_64KB 224 depends on !IA64_PAGE_SIZE_64KB
220 bool "4 Levels" 225 bool "4 Levels"
221 226
222 endchoice 227 endchoice
223 228
224 source kernel/Kconfig.hz 229 source kernel/Kconfig.hz
225 230
226 config IA64_BRL_EMU 231 config IA64_BRL_EMU
227 bool 232 bool
228 depends on ITANIUM 233 depends on ITANIUM
229 default y 234 default y
230 235
231 # align cache-sensitive data to 128 bytes 236 # align cache-sensitive data to 128 bytes
232 config IA64_L1_CACHE_SHIFT 237 config IA64_L1_CACHE_SHIFT
233 int 238 int
234 default "7" if MCKINLEY 239 default "7" if MCKINLEY
235 default "6" if ITANIUM 240 default "6" if ITANIUM
236 241
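IA64_L1_CACHE_SHIFT is a plain shift count (7 for McKinley, 6 for Itanium), so the "align cache-sensitive data to 128 bytes" comment above is just 1 << 7. A small sketch of the conventional way such a value is consumed; the struct is a made-up example and only the shift pattern is the point:

    /* the usual <asm/cache.h> pattern, shown here for illustration */
    #define L1_CACHE_SHIFT	CONFIG_IA64_L1_CACHE_SHIFT	/* 7 -> 128 bytes, 6 -> 64 bytes */
    #define L1_CACHE_BYTES	(1 << L1_CACHE_SHIFT)

    /* hypothetical cache-sensitive structure padded out to a full line */
    struct hot_counter {
    	unsigned long value;
    } __attribute__((__aligned__(L1_CACHE_BYTES)));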
237 config IA64_CYCLONE 242 config IA64_CYCLONE
238 bool "Cyclone (EXA) Time Source support" 243 bool "Cyclone (EXA) Time Source support"
239 help 244 help
240 Say Y here to enable support for IBM EXA Cyclone time source. 245 Say Y here to enable support for IBM EXA Cyclone time source.
241 If you're unsure, answer N. 246 If you're unsure, answer N.
242 247
243 config IOSAPIC 248 config IOSAPIC
244 bool 249 bool
245 depends on !IA64_HP_SIM 250 depends on !IA64_HP_SIM
246 default y 251 default y
247 252
248 config IA64_SGI_SN_XP 253 config IA64_SGI_SN_XP
249 tristate "Support communication between SGI SSIs" 254 tristate "Support communication between SGI SSIs"
250 depends on IA64_GENERIC || IA64_SGI_SN2 255 depends on IA64_GENERIC || IA64_SGI_SN2
251 select IA64_UNCACHED_ALLOCATOR 256 select IA64_UNCACHED_ALLOCATOR
252 help 257 help
253 An SGI machine can be divided into multiple Single System 258 An SGI machine can be divided into multiple Single System
254 Images which act independently of each other and have 259 Images which act independently of each other and have
255 hardware based memory protection from the others. Enabling 260 hardware based memory protection from the others. Enabling
256 this feature will allow for direct communication between SSIs 261 this feature will allow for direct communication between SSIs
257 based on a network adapter and DMA messaging. 262 based on a network adapter and DMA messaging.
258 263
259 config FORCE_MAX_ZONEORDER 264 config FORCE_MAX_ZONEORDER
260 int "MAX_ORDER (11 - 17)" if !HUGETLB_PAGE 265 int "MAX_ORDER (11 - 17)" if !HUGETLB_PAGE
261 range 11 17 if !HUGETLB_PAGE 266 range 11 17 if !HUGETLB_PAGE
262 default "17" if HUGETLB_PAGE 267 default "17" if HUGETLB_PAGE
263 default "11" 268 default "11"
264 269
265 config SMP 270 config SMP
266 bool "Symmetric multi-processing support" 271 bool "Symmetric multi-processing support"
267 help 272 help
268 This enables support for systems with more than one CPU. If you have 273 This enables support for systems with more than one CPU. If you have
269 a system with only one CPU, say N. If you have a system with more 274 a system with only one CPU, say N. If you have a system with more
270 than one CPU, say Y. 275 than one CPU, say Y.
271 276
272 If you say N here, the kernel will run on single and multiprocessor 277 If you say N here, the kernel will run on single and multiprocessor
273 systems, but will use only one CPU of a multiprocessor system. If 278 systems, but will use only one CPU of a multiprocessor system. If
274 you say Y here, the kernel will run on many, but not all, 279 you say Y here, the kernel will run on many, but not all,
275 single processor systems. On a single processor system, the kernel 280 single processor systems. On a single processor system, the kernel
276 will run faster if you say N here. 281 will run faster if you say N here.
277 282
278 See also the <file:Documentation/smp.txt> and the SMP-HOWTO 283 See also the <file:Documentation/smp.txt> and the SMP-HOWTO
279 available at <http://www.tldp.org/docs.html#howto>. 284 available at <http://www.tldp.org/docs.html#howto>.
280 285
281 If you don't know what to do here, say N. 286 If you don't know what to do here, say N.
282 287
283 config NR_CPUS 288 config NR_CPUS
284 int "Maximum number of CPUs (2-1024)" 289 int "Maximum number of CPUs (2-1024)"
285 range 2 1024 290 range 2 1024
286 depends on SMP 291 depends on SMP
287 default "1024" 292 default "1024"
288 help 293 help
289 You should set this to the number of CPUs in your system, but 294 You should set this to the number of CPUs in your system, but
290 keep in mind that a kernel compiled for, e.g., 2 CPUs will boot but 295 keep in mind that a kernel compiled for, e.g., 2 CPUs will boot but
291 only use 2 CPUs on a >2 CPU system. Setting this to a value larger 296 only use 2 CPUs on a >2 CPU system. Setting this to a value larger
292 than 64 will cause the use of a CPU mask array, causing a small 297 than 64 will cause the use of a CPU mask array, causing a small
293 performance hit. 298 performance hit.
294 299
295 config HOTPLUG_CPU 300 config HOTPLUG_CPU
296 bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" 301 bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
297 depends on SMP && EXPERIMENTAL 302 depends on SMP && EXPERIMENTAL
298 select HOTPLUG 303 select HOTPLUG
299 default n 304 default n
300 ---help--- 305 ---help---
301 Say Y here to experiment with turning CPUs off and on. CPUs 306 Say Y here to experiment with turning CPUs off and on. CPUs
302 can be controlled through /sys/devices/system/cpu/cpu#. 307 can be controlled through /sys/devices/system/cpu/cpu#.
303 Say N if you want to disable CPU hotplug. 308 Say N if you want to disable CPU hotplug.
304 309
305 config ARCH_ENABLE_MEMORY_HOTPLUG 310 config ARCH_ENABLE_MEMORY_HOTPLUG
306 def_bool y 311 def_bool y
307 312
308 config ARCH_ENABLE_MEMORY_HOTREMOVE 313 config ARCH_ENABLE_MEMORY_HOTREMOVE
309 def_bool y 314 def_bool y
310 315
311 config SCHED_SMT 316 config SCHED_SMT
312 bool "SMT scheduler support" 317 bool "SMT scheduler support"
313 depends on SMP 318 depends on SMP
314 help 319 help
315 Improves the CPU scheduler's decision making when dealing with 320 Improves the CPU scheduler's decision making when dealing with
316 Intel IA64 chips with MultiThreading at a cost of slightly increased 321 Intel IA64 chips with MultiThreading at a cost of slightly increased
317 overhead in some places. If unsure say N here. 322 overhead in some places. If unsure say N here.
318 323
319 config PERMIT_BSP_REMOVE 324 config PERMIT_BSP_REMOVE
320 bool "Support removal of Bootstrap Processor" 325 bool "Support removal of Bootstrap Processor"
321 depends on HOTPLUG_CPU 326 depends on HOTPLUG_CPU
322 default n 327 default n
323 ---help--- 328 ---help---
324 Say Y here if your platform SAL supports removal of the BSP with HOTPLUG_CPU 329 Say Y here if your platform SAL supports removal of the BSP with HOTPLUG_CPU
325 support. 330 support.
326 331
327 config FORCE_CPEI_RETARGET 332 config FORCE_CPEI_RETARGET
328 bool "Force assumption that CPEI can be re-targetted" 333 bool "Force assumption that CPEI can be re-targetted"
329 depends on PERMIT_BSP_REMOVE 334 depends on PERMIT_BSP_REMOVE
330 default n 335 default n
331 ---help--- 336 ---help---
332 Say Y if you need to force the assumption that CPEI can be re-targeted to 337 Say Y if you need to force the assumption that CPEI can be re-targeted to
333 any CPU in the system. This hint is available via the ACPI 3.0 specification. 338 any CPU in the system. This hint is available via the ACPI 3.0 specification.
334 Tiger4 systems are capable of re-directing CPEI to any CPU other than the BSP. 339 Tiger4 systems are capable of re-directing CPEI to any CPU other than the BSP.
335 This option is useful to enable this feature on older BIOSes as well. 340 This option is useful to enable this feature on older BIOSes as well.
336 You can also enable this by using the boot command line option force_cpei=1. 341 You can also enable this by using the boot command line option force_cpei=1.
337 342
338 source "kernel/Kconfig.preempt" 343 source "kernel/Kconfig.preempt"
339 344
340 source "mm/Kconfig" 345 source "mm/Kconfig"
341 346
342 config ARCH_SELECT_MEMORY_MODEL 347 config ARCH_SELECT_MEMORY_MODEL
343 def_bool y 348 def_bool y
344 349
345 config ARCH_DISCONTIGMEM_ENABLE 350 config ARCH_DISCONTIGMEM_ENABLE
346 def_bool y 351 def_bool y
347 help 352 help
348 Say Y to support efficient handling of discontiguous physical memory, 353 Say Y to support efficient handling of discontiguous physical memory,
349 for architectures which are either NUMA (Non-Uniform Memory Access) 354 for architectures which are either NUMA (Non-Uniform Memory Access)
350 or have huge holes in the physical address space for other reasons. 355 or have huge holes in the physical address space for other reasons.
351 See <file:Documentation/vm/numa> for more. 356 See <file:Documentation/vm/numa> for more.
352 357
353 config ARCH_FLATMEM_ENABLE 358 config ARCH_FLATMEM_ENABLE
354 def_bool y 359 def_bool y
355 360
356 config ARCH_SPARSEMEM_ENABLE 361 config ARCH_SPARSEMEM_ENABLE
357 def_bool y 362 def_bool y
358 depends on ARCH_DISCONTIGMEM_ENABLE 363 depends on ARCH_DISCONTIGMEM_ENABLE
359 select SPARSEMEM_VMEMMAP_ENABLE 364 select SPARSEMEM_VMEMMAP_ENABLE
360 365
361 config ARCH_DISCONTIGMEM_DEFAULT 366 config ARCH_DISCONTIGMEM_DEFAULT
362 def_bool y if (IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB) 367 def_bool y if (IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB)
363 depends on ARCH_DISCONTIGMEM_ENABLE 368 depends on ARCH_DISCONTIGMEM_ENABLE
364 369
365 config NUMA 370 config NUMA
366 bool "NUMA support" 371 bool "NUMA support"
367 depends on !IA64_HP_SIM && !FLATMEM 372 depends on !IA64_HP_SIM && !FLATMEM
368 default y if IA64_SGI_SN2 373 default y if IA64_SGI_SN2
369 select ACPI_NUMA if ACPI 374 select ACPI_NUMA if ACPI
370 help 375 help
371 Say Y to compile the kernel to support NUMA (Non-Uniform Memory 376 Say Y to compile the kernel to support NUMA (Non-Uniform Memory
372 Access). This option is for configuring high-end multiprocessor 377 Access). This option is for configuring high-end multiprocessor
373 server systems. If in doubt, say N. 378 server systems. If in doubt, say N.
374 379
375 config NODES_SHIFT 380 config NODES_SHIFT
376 int "Max num nodes shift(3-10)" 381 int "Max num nodes shift(3-10)"
377 range 3 10 382 range 3 10
378 default "10" 383 default "10"
379 depends on NEED_MULTIPLE_NODES 384 depends on NEED_MULTIPLE_NODES
380 help 385 help
381 This option specifies the maximum number of nodes in your SSI system. 386 This option specifies the maximum number of nodes in your SSI system.
382 MAX_NUMNODES will be 2^(This value). 387 MAX_NUMNODES will be 2^(This value).
383 If in doubt, use the default. 388 If in doubt, use the default.
384 389
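NODES_SHIFT feeds directly into the node-count ceiling: with the default of 10, up to 1 << 10 = 1024 nodes. A sketch of the usual derivation, with macro names following the common <linux/numa.h> pattern (shown for illustration, not quoted from this tree):

    #ifdef CONFIG_NODES_SHIFT
    #define NODES_SHIFT	CONFIG_NODES_SHIFT
    #else
    #define NODES_SHIFT	0
    #endif
    #define MAX_NUMNODES	(1 << NODES_SHIFT)	/* default 10 -> 1024 nodes */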
385 config ARCH_POPULATES_NODE_MAP 390 config ARCH_POPULATES_NODE_MAP
386 def_bool y 391 def_bool y
387 392
388 # VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent. 393 # VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent.
389 # VIRTUAL_MEM_MAP has been retained for historical reasons. 394 # VIRTUAL_MEM_MAP has been retained for historical reasons.
390 config VIRTUAL_MEM_MAP 395 config VIRTUAL_MEM_MAP
391 bool "Virtual mem map" 396 bool "Virtual mem map"
392 depends on !SPARSEMEM 397 depends on !SPARSEMEM
393 default y if !IA64_HP_SIM 398 default y if !IA64_HP_SIM
394 help 399 help
395 Say Y to compile the kernel with support for a virtual mem map. 400 Say Y to compile the kernel with support for a virtual mem map.
396 This code only takes effect if a memory hole larger than 401 This code only takes effect if a memory hole larger than
397 1 GB is found during boot. You must turn this option on if you 402 1 GB is found during boot. You must turn this option on if you
398 require the DISCONTIGMEM option for your machine. If you are 403 require the DISCONTIGMEM option for your machine. If you are
399 unsure, say Y. 404 unsure, say Y.
400 405
401 config HOLES_IN_ZONE 406 config HOLES_IN_ZONE
402 bool 407 bool
403 default y if VIRTUAL_MEM_MAP 408 default y if VIRTUAL_MEM_MAP
404 409
405 config HAVE_ARCH_EARLY_PFN_TO_NID 410 config HAVE_ARCH_EARLY_PFN_TO_NID
406 def_bool y 411 def_bool y
407 depends on NEED_MULTIPLE_NODES 412 depends on NEED_MULTIPLE_NODES
408 413
409 config HAVE_ARCH_NODEDATA_EXTENSION 414 config HAVE_ARCH_NODEDATA_EXTENSION
410 def_bool y 415 def_bool y
411 depends on NUMA 416 depends on NUMA
412 417
413 config IA32_SUPPORT 418 config IA32_SUPPORT
414 bool "Support for Linux/x86 binaries" 419 bool "Support for Linux/x86 binaries"
415 help 420 help
416 IA-64 processors can execute IA-32 (X86) instructions. By 421 IA-64 processors can execute IA-32 (X86) instructions. By
417 saying Y here, the kernel will include IA-32 system call 422 saying Y here, the kernel will include IA-32 system call
418 emulation support which makes it possible to transparently 423 emulation support which makes it possible to transparently
419 run IA-32 Linux binaries on an IA-64 Linux system. 424 run IA-32 Linux binaries on an IA-64 Linux system.
420 If in doubt, say Y. 425 If in doubt, say Y.
421 426
422 config COMPAT 427 config COMPAT
423 bool 428 bool
424 depends on IA32_SUPPORT 429 depends on IA32_SUPPORT
425 default y 430 default y
426 431
427 config COMPAT_FOR_U64_ALIGNMENT 432 config COMPAT_FOR_U64_ALIGNMENT
428 def_bool COMPAT 433 def_bool COMPAT
429 434
430 config IA64_MCA_RECOVERY 435 config IA64_MCA_RECOVERY
431 tristate "MCA recovery from errors other than TLB." 436 tristate "MCA recovery from errors other than TLB."
432 437
433 config PERFMON 438 config PERFMON
434 bool "Performance monitor support" 439 bool "Performance monitor support"
435 help 440 help
436 Selects whether support for the IA-64 performance monitor hardware 441 Selects whether support for the IA-64 performance monitor hardware
437 is included in the kernel. This makes some kernel data-structures a 442 is included in the kernel. This makes some kernel data-structures a
438 little bigger and slows down execution a bit, but it is generally 443 little bigger and slows down execution a bit, but it is generally
439 a good idea to turn this on. If you're unsure, say Y. 444 a good idea to turn this on. If you're unsure, say Y.
440 445
441 config IA64_PALINFO 446 config IA64_PALINFO
442 tristate "/proc/pal support" 447 tristate "/proc/pal support"
443 help 448 help
444 If you say Y here, you are able to get PAL (Processor Abstraction 449 If you say Y here, you are able to get PAL (Processor Abstraction
445 Layer) information in /proc/pal. This contains useful information 450 Layer) information in /proc/pal. This contains useful information
446 about the processors in your systems, such as cache and TLB sizes 451 about the processors in your systems, such as cache and TLB sizes
447 and the PAL firmware version in use. 452 and the PAL firmware version in use.
448 453
449 To use this option, you have to ensure that the "/proc file system 454 To use this option, you have to ensure that the "/proc file system
450 support" (CONFIG_PROC_FS) is enabled, too. 455 support" (CONFIG_PROC_FS) is enabled, too.
451 456
452 config IA64_MC_ERR_INJECT 457 config IA64_MC_ERR_INJECT
453 tristate "MC error injection support" 458 tristate "MC error injection support"
454 help 459 help
455 Adds support for MC error injection. If enabled, the kernel 460 Adds support for MC error injection. If enabled, the kernel
456 will provide a sysfs interface for user applications to 461 will provide a sysfs interface for user applications to
457 call MC error injection PAL procedures to inject various errors. 462 call MC error injection PAL procedures to inject various errors.
458 This is a useful tool for MCA testing. 463 This is a useful tool for MCA testing.
459 464
460 If you're unsure, do not select this option. 465 If you're unsure, do not select this option.
461 466
462 config SGI_SN 467 config SGI_SN
463 def_bool y if (IA64_SGI_SN2 || IA64_GENERIC) 468 def_bool y if (IA64_SGI_SN2 || IA64_GENERIC)
464 469
465 config IA64_ESI 470 config IA64_ESI
466 bool "ESI (Extensible SAL Interface) support" 471 bool "ESI (Extensible SAL Interface) support"
467 help 472 help
468 If you say Y here, support is built into the kernel to 473 If you say Y here, support is built into the kernel to
469 make ESI calls. ESI calls are used to support vendor-specific 474 make ESI calls. ESI calls are used to support vendor-specific
470 firmware extensions, such as the ability to inject memory errors 475 firmware extensions, such as the ability to inject memory errors
471 for test purposes. If you're unsure, say N. 476 for test purposes. If you're unsure, say N.
472 477
473 config IA64_HP_AML_NFW 478 config IA64_HP_AML_NFW
474 bool "Support ACPI AML calls to native firmware" 479 bool "Support ACPI AML calls to native firmware"
475 help 480 help
476 This driver installs a global ACPI Operation Region handler for 481 This driver installs a global ACPI Operation Region handler for
477 region 0xA1. AML methods can use this OpRegion to call arbitrary 482 region 0xA1. AML methods can use this OpRegion to call arbitrary
478 native firmware functions. The driver installs the OpRegion 483 native firmware functions. The driver installs the OpRegion
479 handler if there is an HPQ5001 device or if the user supplies 484 handler if there is an HPQ5001 device or if the user supplies
480 the "force" module parameter, e.g., with the "aml_nfw.force" 485 the "force" module parameter, e.g., with the "aml_nfw.force"
481 kernel command line option. 486 kernel command line option.
482 487
483 source "drivers/sn/Kconfig" 488 source "drivers/sn/Kconfig"
484 489
485 config KEXEC 490 config KEXEC
486 bool "kexec system call (EXPERIMENTAL)" 491 bool "kexec system call (EXPERIMENTAL)"
487 depends on EXPERIMENTAL && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) 492 depends on EXPERIMENTAL && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
488 help 493 help
489 kexec is a system call that implements the ability to shut down your 494 kexec is a system call that implements the ability to shut down your
490 current kernel, and to start another kernel. It is like a reboot 495 current kernel, and to start another kernel. It is like a reboot
491 but it is independent of the system firmware. And like a reboot 496 but it is independent of the system firmware. And like a reboot
492 you can start any kernel with it, not just Linux. 497 you can start any kernel with it, not just Linux.
493 498
494 The name comes from the similarity to the exec system call. 499 The name comes from the similarity to the exec system call.
495 500
496 It is an ongoing process to be certain the hardware in a machine 501 It is an ongoing process to be certain the hardware in a machine
497 is properly shut down, so do not be surprised if this code does not 502 is properly shut down, so do not be surprised if this code does not
498 initially work for you. It may help to enable device hotplugging 503 initially work for you. It may help to enable device hotplugging
499 support. As of this writing the exact hardware interface is 504 support. As of this writing the exact hardware interface is
500 strongly in flux, so no good recommendation can be made. 505 strongly in flux, so no good recommendation can be made.
501 506
502 config CRASH_DUMP 507 config CRASH_DUMP
503 bool "kernel crash dumps (EXPERIMENTAL)" 508 bool "kernel crash dumps (EXPERIMENTAL)"
504 depends on EXPERIMENTAL && IA64_MCA_RECOVERY && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) 509 depends on EXPERIMENTAL && IA64_MCA_RECOVERY && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
505 help 510 help
506 Generate crash dump after being started by kexec. 511 Generate crash dump after being started by kexec.
507 512
508 source "drivers/firmware/Kconfig" 513 source "drivers/firmware/Kconfig"
509 514
510 source "fs/Kconfig.binfmt" 515 source "fs/Kconfig.binfmt"
511 516
512 endmenu 517 endmenu
513 518
514 menu "Power management and ACPI" 519 menu "Power management and ACPI"
515 520
516 source "kernel/power/Kconfig" 521 source "kernel/power/Kconfig"
517 522
518 source "drivers/acpi/Kconfig" 523 source "drivers/acpi/Kconfig"
519 524
520 if PM 525 if PM
521 526
522 source "arch/ia64/kernel/cpufreq/Kconfig" 527 source "arch/ia64/kernel/cpufreq/Kconfig"
523 528
524 endif 529 endif
525 530
526 endmenu 531 endmenu
527 532
528 if !IA64_HP_SIM 533 if !IA64_HP_SIM
529 534
530 menu "Bus options (PCI, PCMCIA)" 535 menu "Bus options (PCI, PCMCIA)"
531 536
532 config PCI 537 config PCI
533 bool "PCI support" 538 bool "PCI support"
534 help 539 help
535 Real IA-64 machines all have PCI/PCI-X/PCI Express busses. Say Y 540 Real IA-64 machines all have PCI/PCI-X/PCI Express busses. Say Y
536 here unless you are using a simulator without PCI support. 541 here unless you are using a simulator without PCI support.
537 542
538 config PCI_DOMAINS 543 config PCI_DOMAINS
539 def_bool PCI 544 def_bool PCI
540 545
541 config PCI_SYSCALL 546 config PCI_SYSCALL
542 def_bool PCI 547 def_bool PCI
543 548
544 source "drivers/pci/pcie/Kconfig" 549 source "drivers/pci/pcie/Kconfig"
545 550
546 source "drivers/pci/Kconfig" 551 source "drivers/pci/Kconfig"
547 552
548 source "drivers/pci/hotplug/Kconfig" 553 source "drivers/pci/hotplug/Kconfig"
549 554
550 source "drivers/pcmcia/Kconfig" 555 source "drivers/pcmcia/Kconfig"
551 556
552 endmenu 557 endmenu
553 558
554 endif 559 endif
555 560
556 source "net/Kconfig" 561 source "net/Kconfig"
557 562
558 source "drivers/Kconfig" 563 source "drivers/Kconfig"
559 564
560 config MSPEC 565 config MSPEC
561 tristate "Memory special operations driver" 566 tristate "Memory special operations driver"
562 depends on IA64 567 depends on IA64
563 select IA64_UNCACHED_ALLOCATOR 568 select IA64_UNCACHED_ALLOCATOR
564 help 569 help
565 If you have an ia64 and you want to enable memory special 570 If you have an ia64 and you want to enable memory special
566 operations support (formerly known as fetchop), say Y here; 571 operations support (formerly known as fetchop), say Y here;
567 otherwise say N. 572 otherwise say N.
568 573
569 source "fs/Kconfig" 574 source "fs/Kconfig"
570 575
571 source "lib/Kconfig" 576 source "lib/Kconfig"
572 577
573 # 578 #
574 # Use the generic interrupt handling code in kernel/irq/: 579 # Use the generic interrupt handling code in kernel/irq/:
575 # 580 #
576 config GENERIC_HARDIRQS 581 config GENERIC_HARDIRQS
577 bool 582 bool
578 default y 583 default y
579 584
580 config GENERIC_IRQ_PROBE 585 config GENERIC_IRQ_PROBE
581 bool 586 bool
582 default y 587 default y
583 588
584 config GENERIC_PENDING_IRQ 589 config GENERIC_PENDING_IRQ
585 bool 590 bool
586 depends on GENERIC_HARDIRQS && SMP 591 depends on GENERIC_HARDIRQS && SMP
587 default y 592 default y
588 593
589 config IRQ_PER_CPU 594 config IRQ_PER_CPU
590 bool 595 bool
591 default y 596 default y
592 597
593 source "arch/ia64/hp/sim/Kconfig" 598 source "arch/ia64/hp/sim/Kconfig"
594 599
595 source "kernel/Kconfig.instrumentation" 600 source "kernel/Kconfig.instrumentation"
596 601
597 source "arch/ia64/Kconfig.debug" 602 source "arch/ia64/Kconfig.debug"
598 603
599 source "security/Kconfig" 604 source "security/Kconfig"
600 605
601 source "crypto/Kconfig" 606 source "crypto/Kconfig"
602 607
1 # 1 #
2 # For a description of the syntax of this configuration file, 2 # For a description of the syntax of this configuration file,
3 # see Documentation/kbuild/kconfig-language.txt. 3 # see Documentation/kbuild/kconfig-language.txt.
4 # 4 #
5 5
6 mainmenu "Linux/M32R Kernel Configuration" 6 mainmenu "Linux/M32R Kernel Configuration"
7 7
8 config M32R 8 config M32R
9 bool 9 bool
10 default y 10 default y
11 11
12 config SBUS 12 config SBUS
13 bool 13 bool
14 14
15 config GENERIC_ISA_DMA 15 config GENERIC_ISA_DMA
16 bool 16 bool
17 default y 17 default y
18 18
19 config ZONE_DMA 19 config ZONE_DMA
20 bool 20 bool
21 default y 21 default y
22 22
23 config GENERIC_HARDIRQS 23 config GENERIC_HARDIRQS
24 bool 24 bool
25 default y 25 default y
26 26
27 config GENERIC_IRQ_PROBE 27 config GENERIC_IRQ_PROBE
28 bool 28 bool
29 default y 29 default y
30 30
31 config NO_IOPORT 31 config NO_IOPORT
32 def_bool y 32 def_bool y
33 33
34 config NO_DMA 34 config NO_DMA
35 def_bool y 35 def_bool y
36 36
37 source "init/Kconfig" 37 source "init/Kconfig"
38 38
39 39
40 menu "Processor type and features" 40 menu "Processor type and features"
41 41
42 choice 42 choice
43 prompt "Platform Type" 43 prompt "Platform Type"
44 default PLAT_MAPPI 44 default PLAT_MAPPI
45 45
46 config PLAT_MAPPI 46 config PLAT_MAPPI
47 bool "Mappi-I" 47 bool "Mappi-I"
48 help 48 help
49 The Mappi-I is an FPGA board for SOC (System-On-a-Chip) prototyping. 49 The Mappi-I is an FPGA board for SOC (System-On-a-Chip) prototyping.
50 You can operate a Linux system on this board by using an M32R 50 You can operate a Linux system on this board by using an M32R
51 softmacro core, which is a fully-synthesizable functional model 51 softmacro core, which is a fully-synthesizable functional model
52 described in Verilog-HDL. 52 described in Verilog-HDL.
53 53
54 The Mappi-I board was the first platform used 54 The Mappi-I board was the first platform used
55 to port and develop a Linux system for the M32R processor. 55 to port and develop a Linux system for the M32R processor.
56 Currently, the Mappi-II, a successor to the Mappi-I, is available. 56 Currently, the Mappi-II, a successor to the Mappi-I, is available.
57 57
58 config PLAT_USRV 58 config PLAT_USRV
59 bool "uServer" 59 bool "uServer"
60 select PLAT_HAS_INT1ICU 60 select PLAT_HAS_INT1ICU
61 61
62 config PLAT_M32700UT 62 config PLAT_M32700UT
63 bool "M32700UT" 63 bool "M32700UT"
64 select PLAT_HAS_INT0ICU 64 select PLAT_HAS_INT0ICU
65 select PLAT_HAS_INT1ICU 65 select PLAT_HAS_INT1ICU
66 select PLAT_HAS_INT2ICU 66 select PLAT_HAS_INT2ICU
67 help 67 help
68 The M3T-M32700UT is an evaluation board based on the uT-Engine 68 The M3T-M32700UT is an evaluation board based on the uT-Engine
69 specification. This board has an M32700 (Chaos) evaluation chip. 69 specification. This board has an M32700 (Chaos) evaluation chip.
70 You can say Y for SMP, because the M32700 is a single chip 70 You can say Y for SMP, because the M32700 is a single chip
71 multiprocessor. 71 multiprocessor.
72 72
73 config PLAT_OPSPUT 73 config PLAT_OPSPUT
74 bool "OPSPUT" 74 bool "OPSPUT"
75 select PLAT_HAS_INT0ICU 75 select PLAT_HAS_INT0ICU
76 select PLAT_HAS_INT1ICU 76 select PLAT_HAS_INT1ICU
77 select PLAT_HAS_INT2ICU 77 select PLAT_HAS_INT2ICU
78 help 78 help
79 The OPSPUT is an evaluation board based on the uT-Engine 79 The OPSPUT is an evaluation board based on the uT-Engine
80 specification. This board has an OPSP-REP chip. 80 specification. This board has an OPSP-REP chip.
81 81
82 config PLAT_OAKS32R 82 config PLAT_OAKS32R
83 bool "OAKS32R" 83 bool "OAKS32R"
84 help 84 help
85 The OAKS32R is a tiny, inexpensive evaluation board. 85 The OAKS32R is a tiny, inexpensive evaluation board.
86 Please note that if you say Y here and choose chip "M32102", 86 Please note that if you say Y here and choose chip "M32102",
87 say N for MMU and select a no-MMU version kernel; otherwise 87 say N for MMU and select a no-MMU version kernel; otherwise
88 a kernel with MMU support will not work, because the M32102 88 a kernel with MMU support will not work, because the M32102
89 is a microcontroller for embedded systems and has no MMU. 89 is a microcontroller for embedded systems and has no MMU.
90 90
91 config PLAT_MAPPI2 91 config PLAT_MAPPI2
92 bool "Mappi-II(M3A-ZA36/M3A-ZA52)" 92 bool "Mappi-II(M3A-ZA36/M3A-ZA52)"
93 93
94 config PLAT_MAPPI3 94 config PLAT_MAPPI3
95 bool "Mappi-III(M3A-2170)" 95 bool "Mappi-III(M3A-2170)"
96 96
97 config PLAT_M32104UT 97 config PLAT_M32104UT
98 bool "M32104UT" 98 bool "M32104UT"
99 select PLAT_HAS_INT1ICU 99 select PLAT_HAS_INT1ICU
100 help 100 help
101 The M3T-M32104UT is a reference board based on the uT-Engine 101 The M3T-M32104UT is a reference board based on the uT-Engine
102 specification. This board has an M32104 chip. 102 specification. This board has an M32104 chip.
103 103
104 endchoice 104 endchoice
105 105
106 choice 106 choice
107 prompt "Processor family" 107 prompt "Processor family"
108 default CHIP_M32700 108 default CHIP_M32700
109 109
110 config CHIP_M32700 110 config CHIP_M32700
111 bool "M32700 (Chaos)" 111 bool "M32700 (Chaos)"
112 112
113 config CHIP_M32102 113 config CHIP_M32102
114 bool "M32102" 114 bool "M32102"
115 115
116 config CHIP_M32104 116 config CHIP_M32104
117 bool "M32104" 117 bool "M32104"
118 depends on PLAT_M32104UT 118 depends on PLAT_M32104UT
119 119
120 config CHIP_VDEC2 120 config CHIP_VDEC2
121 bool "VDEC2" 121 bool "VDEC2"
122 122
123 config CHIP_OPSP 123 config CHIP_OPSP
124 bool "OPSP" 124 bool "OPSP"
125 125
126 endchoice 126 endchoice
127 127
128 config MMU 128 config MMU
129 bool "Support for memory management hardware" 129 bool "Support for memory management hardware"
130 depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP 130 depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP
131 default y 131 default y
132 132
133 config TLB_ENTRIES 133 config TLB_ENTRIES
134 int "TLB Entries" 134 int "TLB Entries"
135 depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP 135 depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP
136 default 32 if CHIP_M32700 || CHIP_OPSP 136 default 32 if CHIP_M32700 || CHIP_OPSP
137 default 16 if CHIP_VDEC2 137 default 16 if CHIP_VDEC2
138 138
139 139
140 config ISA_M32R 140 config ISA_M32R
141 bool 141 bool
142 depends on CHIP_M32102 || CHIP_M32104 142 depends on CHIP_M32102 || CHIP_M32104
143 default y 143 default y
144 144
145 config ISA_M32R2 145 config ISA_M32R2
146 bool 146 bool
147 depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP 147 depends on CHIP_M32700 || CHIP_VDEC2 || CHIP_OPSP
148 default y 148 default y
149 149
150 config ISA_DSP_LEVEL2 150 config ISA_DSP_LEVEL2
151 bool 151 bool
152 depends on CHIP_M32700 || CHIP_OPSP 152 depends on CHIP_M32700 || CHIP_OPSP
153 default y 153 default y
154 154
155 config ISA_DUAL_ISSUE 155 config ISA_DUAL_ISSUE
156 bool 156 bool
157 depends on CHIP_M32700 || CHIP_OPSP 157 depends on CHIP_M32700 || CHIP_OPSP
158 default y 158 default y
159 159
160 config PLAT_HAS_INT0ICU 160 config PLAT_HAS_INT0ICU
161 bool 161 bool
162 default n 162 default n
163 163
164 config PLAT_HAS_INT1ICU 164 config PLAT_HAS_INT1ICU
165 bool 165 bool
166 default n 166 default n
167 167
168 config PLAT_HAS_INT2ICU 168 config PLAT_HAS_INT2ICU
169 bool 169 bool
170 default n 170 default n
171 171
172 config BUS_CLOCK 172 config BUS_CLOCK
173 int "Bus Clock [Hz] (integer)" 173 int "Bus Clock [Hz] (integer)"
174 default "70000000" if PLAT_MAPPI 174 default "70000000" if PLAT_MAPPI
175 default "25000000" if PLAT_USRV 175 default "25000000" if PLAT_USRV
176 default "50000000" if PLAT_MAPPI3 176 default "50000000" if PLAT_MAPPI3
177 default "50000000" if PLAT_M32700UT 177 default "50000000" if PLAT_M32700UT
178 default "50000000" if PLAT_OPSPUT 178 default "50000000" if PLAT_OPSPUT
179 default "54000000" if PLAT_M32104UT 179 default "54000000" if PLAT_M32104UT
180 default "33333333" if PLAT_OAKS32R 180 default "33333333" if PLAT_OAKS32R
181 default "20000000" if PLAT_MAPPI2 181 default "20000000" if PLAT_MAPPI2
182 182
183 config TIMER_DIVIDE 183 config TIMER_DIVIDE
184 int "Timer divider (integer)" 184 int "Timer divider (integer)"
185 default "128" 185 default "128"
186 186
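BUS_CLOCK and TIMER_DIVIDE together determine how many divided bus-clock counts make up one timer tick. A hedged sketch of that arithmetic; the helper and its rounding are hypothetical, and the real computation lives in the arch timer setup code:

    #include <linux/param.h>	/* HZ */

    /* hypothetical helper: counts per timer tick from the two Kconfig values */
    static unsigned long timer_latch(void)
    {
    	/* e.g. 50000000 / 128 = 390625 counts/s; at HZ=100 that is about 3906 per tick */
    	return (CONFIG_BUS_CLOCK / CONFIG_TIMER_DIVIDE + HZ / 2) / HZ;
    }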
187 config CPU_LITTLE_ENDIAN 187 config CPU_LITTLE_ENDIAN
188 bool "Generate little endian code" 188 bool "Generate little endian code"
189 default n 189 default n
190 190
191 config MEMORY_START 191 config MEMORY_START
192 hex "Physical memory start address (hex)" 192 hex "Physical memory start address (hex)"
193 default "08000000" if PLAT_MAPPI || PLAT_MAPPI2 || PLAT_MAPPI3 193 default "08000000" if PLAT_MAPPI || PLAT_MAPPI2 || PLAT_MAPPI3
194 default "08000000" if PLAT_USRV 194 default "08000000" if PLAT_USRV
195 default "08000000" if PLAT_M32700UT 195 default "08000000" if PLAT_M32700UT
196 default "08000000" if PLAT_OPSPUT 196 default "08000000" if PLAT_OPSPUT
197 default "04000000" if PLAT_M32104UT 197 default "04000000" if PLAT_M32104UT
198 default "01000000" if PLAT_OAKS32R 198 default "01000000" if PLAT_OAKS32R
199 199
200 config MEMORY_SIZE 200 config MEMORY_SIZE
201 hex "Physical memory size (hex)" 201 hex "Physical memory size (hex)"
202 default "08000000" if PLAT_MAPPI3 202 default "08000000" if PLAT_MAPPI3
203 default "04000000" if PLAT_MAPPI || PLAT_MAPPI2 203 default "04000000" if PLAT_MAPPI || PLAT_MAPPI2
204 default "02000000" if PLAT_USRV 204 default "02000000" if PLAT_USRV
205 default "01000000" if PLAT_M32700UT 205 default "01000000" if PLAT_M32700UT
206 default "01000000" if PLAT_OPSPUT 206 default "01000000" if PLAT_OPSPUT
207 default "01000000" if PLAT_M32104UT 207 default "01000000" if PLAT_M32104UT
208 default "00800000" if PLAT_OAKS32R 208 default "00800000" if PLAT_OAKS32R
209 209
210 config NOHIGHMEM 210 config NOHIGHMEM
211 bool 211 bool
212 default y 212 default y
213 213
214 config ARCH_DISCONTIGMEM_ENABLE 214 config ARCH_DISCONTIGMEM_ENABLE
215 bool "Internal RAM Support" 215 bool "Internal RAM Support"
216 depends on CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104 216 depends on CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104
217 default y 217 default y
218 218
219 source "mm/Kconfig" 219 source "mm/Kconfig"
220 220
221 config IRAM_START 221 config IRAM_START
222 hex "Internal memory start address (hex)" 222 hex "Internal memory start address (hex)"
223 default "00f00000" if !CHIP_M32104 223 default "00f00000" if !CHIP_M32104
224 default "00700000" if CHIP_M32104 224 default "00700000" if CHIP_M32104
225 depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM 225 depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM
226 226
227 config IRAM_SIZE 227 config IRAM_SIZE
228 hex "Internal memory size (hex)" 228 hex "Internal memory size (hex)"
229 depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM 229 depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM
230 default "00080000" if CHIP_M32700 230 default "00080000" if CHIP_M32700
231 default "00010000" if CHIP_M32102 || CHIP_OPSP || CHIP_M32104 231 default "00010000" if CHIP_M32102 || CHIP_OPSP || CHIP_M32104
232 default "00008000" if CHIP_VDEC2 232 default "00008000" if CHIP_VDEC2
233 233
234 # 234 #
235 # Define implied options from the CPU selection here 235 # Define implied options from the CPU selection here
236 # 236 #
237 237
238 config GENERIC_LOCKBREAK
239 bool
240 default y
241 depends on SMP && PREEMPT
242
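The same GENERIC_LOCKBREAK symbol shows up here for M32R, again only under SMP && PREEMPT. A hedged sketch of the usage pattern that motivates that dependency: a long critical section voluntarily drops a contended lock so a waiter or a pending reschedule gets a turn. Only spin_lock()/spin_unlock(), spin_needbreak(), need_resched() and cond_resched() are assumed kernel APIs; the function, its parameters and the table it clears are hypothetical:

    #include <linux/spinlock.h>
    #include <linux/sched.h>

    /* hypothetical example; assumes concurrent writers to 'table' are harmless */
    static void zero_table(unsigned long *table, unsigned long entries,
    		       spinlock_t *lock)
    {
    	unsigned long i;

    	spin_lock(lock);
    	for (i = 0; i < entries; i++) {
    		table[i] = 0;
    		/* give waiters or a pending reschedule a chance */
    		if (need_resched() || spin_needbreak(lock)) {
    			spin_unlock(lock);
    			cond_resched();
    			spin_lock(lock);
    		}
    	}
    	spin_unlock(lock);
    }

With !SMP a spinlock never has waiters, and with !PREEMPT spin_needbreak() evaluates to 0, so the option would buy nothing outside SMP && PREEMPT.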
238 config RWSEM_GENERIC_SPINLOCK 243 config RWSEM_GENERIC_SPINLOCK
239 bool 244 bool
240 depends on M32R 245 depends on M32R
241 default y 246 default y
242 247
243 config RWSEM_XCHGADD_ALGORITHM 248 config RWSEM_XCHGADD_ALGORITHM
244 bool 249 bool
245 default n 250 default n
246 251
247 config ARCH_HAS_ILOG2_U32 252 config ARCH_HAS_ILOG2_U32
248 bool 253 bool
249 default n 254 default n
250 255
251 config ARCH_HAS_ILOG2_U64 256 config ARCH_HAS_ILOG2_U64
252 bool 257 bool
253 default n 258 default n
254 259
255 config GENERIC_FIND_NEXT_BIT 260 config GENERIC_FIND_NEXT_BIT
256 bool 261 bool
257 default y 262 default y
258 263
259 config GENERIC_HWEIGHT 264 config GENERIC_HWEIGHT
260 bool 265 bool
261 default y 266 default y
262 267
263 config GENERIC_CALIBRATE_DELAY 268 config GENERIC_CALIBRATE_DELAY
264 bool 269 bool
265 default y 270 default y
266 271
267 config SCHED_NO_NO_OMIT_FRAME_POINTER 272 config SCHED_NO_NO_OMIT_FRAME_POINTER
268 bool 273 bool
269 default y 274 default y
270 275
271 config PREEMPT 276 config PREEMPT
272 bool "Preemptible Kernel" 277 bool "Preemptible Kernel"
273 help 278 help
274 This option reduces the latency of the kernel when reacting to 279 This option reduces the latency of the kernel when reacting to
275 real-time or interactive events by allowing a low priority process to 280 real-time or interactive events by allowing a low priority process to
276 be preempted even if it is in kernel mode executing a system call. 281 be preempted even if it is in kernel mode executing a system call.
277 This allows applications to run more reliably even when the system is 282 This allows applications to run more reliably even when the system is
278 under load. 283 under load.
279 284
280 Say Y here if you are building a kernel for a desktop, embedded 285 Say Y here if you are building a kernel for a desktop, embedded
281 or real-time system. Say N if you are unsure. 286 or real-time system. Say N if you are unsure.
282 287
283 config SMP 288 config SMP
284 bool "Symmetric multi-processing support" 289 bool "Symmetric multi-processing support"
285 ---help--- 290 ---help---
286 This enables support for systems with more than one CPU. If you have 291 This enables support for systems with more than one CPU. If you have
287 a system with only one CPU, like most personal computers, say N. If 292 a system with only one CPU, like most personal computers, say N. If
288 you have a system with more than one CPU, say Y. 293 you have a system with more than one CPU, say Y.
289 294
290 If you say N here, the kernel will run on single and multiprocessor 295 If you say N here, the kernel will run on single and multiprocessor
291 machines, but will use only one CPU of a multiprocessor machine. If 296 machines, but will use only one CPU of a multiprocessor machine. If
292 you say Y here, the kernel will run on many, but not all, 297 you say Y here, the kernel will run on many, but not all,
293 single-processor machines. On a single-processor machine, the kernel 298 single-processor machines. On a single-processor machine, the kernel
294 will run faster if you say N here. 299 will run faster if you say N here.
295 300
296 People using multiprocessor machines who say Y here should also say 301 People using multiprocessor machines who say Y here should also say
297 Y to "Enhanced Real Time Clock Support", below. The "Advanced Power 302 Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
298 Management" code will be disabled if you say Y here. 303 Management" code will be disabled if you say Y here.
299 304
300 See also the <file:Documentation/smp.txt>, 305 See also the <file:Documentation/smp.txt>,
301 and the SMP-HOWTO available at 306 and the SMP-HOWTO available at
302 <http://www.linuxdoc.org/docs.html#howto>. 307 <http://www.linuxdoc.org/docs.html#howto>.
303 308
304 If you don't know what to do here, say N. 309 If you don't know what to do here, say N.
305 310
306 config CHIP_M32700_TS1 311 config CHIP_M32700_TS1
307 bool "Workaround code for the M32700 TS1 chip's bug" 312 bool "Workaround code for the M32700 TS1 chip's bug"
308 depends on (CHIP_M32700 && SMP) 313 depends on (CHIP_M32700 && SMP)
309 default n 314 default n
310 315
311 config NR_CPUS 316 config NR_CPUS
312 int "Maximum number of CPUs (2-32)" 317 int "Maximum number of CPUs (2-32)"
313 range 2 32 318 range 2 32
314 depends on SMP 319 depends on SMP
315 default "2" 320 default "2"
316 help 321 help
317 This allows you to specify the maximum number of CPUs which this 322 This allows you to specify the maximum number of CPUs which this
318 kernel will support. The maximum supported value is 32 and the 323 kernel will support. The maximum supported value is 32 and the
319 minimum value which makes sense is 2. 324 minimum value which makes sense is 2.
320 325
321 This is purely to save memory - each supported CPU adds 326 This is purely to save memory - each supported CPU adds
322 approximately eight kilobytes to the kernel image. 327 approximately eight kilobytes to the kernel image.
323 328
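Because NR_CPUS is an int symbol, the value chosen here becomes an ordinary integer macro in the generated configuration, which is what makes the per-CPU memory cost described above static. A hedged sketch of the usual pattern (the array and its name are illustrative, not from this tree):

    /* Sketch: CONFIG_NR_CPUS bounds statically sized per-CPU data,
     * so every extra CPU slot costs kernel image size up front. */
    static unsigned long cpu_boot_stamp[CONFIG_NR_CPUS];    /* 2..32 entries here */
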
324 # Common NUMA Features 329 # Common NUMA Features
325 config NUMA 330 config NUMA
326 bool "Numa Memory Allocation Support" 331 bool "Numa Memory Allocation Support"
327 depends on SMP && BROKEN 332 depends on SMP && BROKEN
328 default n 333 default n
329 334
330 config NODES_SHIFT 335 config NODES_SHIFT
331 int 336 int
332 default "1" 337 default "1"
333 depends on NEED_MULTIPLE_NODES 338 depends on NEED_MULTIPLE_NODES
334 339
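NODES_SHIFT encodes the maximum node count as a power of two, so the default of 1 allows at most two memory nodes. A sketch of the conventional derivation (the macro names follow the common kernel pattern and are assumptions here):

    /* Sketch: maximum number of NUMA nodes derived from the shift above. */
    #define NODES_SHIFT   CONFIG_NODES_SHIFT     /* 1 on this platform */
    #define MAX_NUMNODES  (1 << NODES_SHIFT)     /* => 2 nodes */
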
335 # turning this on wastes a bunch of space. 340 # turning this on wastes a bunch of space.
336 # Summit needs it only when NUMA is on 341 # Summit needs it only when NUMA is on
337 config BOOT_IOREMAP 342 config BOOT_IOREMAP
338 bool 343 bool
339 depends on NUMA 344 depends on NUMA
340 default n 345 default n
341 346
342 endmenu 347 endmenu
343 348
344 349
345 menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" 350 menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)"
346 351
347 config PCI 352 config PCI
348 bool "PCI support" 353 bool "PCI support"
349 depends on BROKEN 354 depends on BROKEN
350 default n 355 default n
351 help 356 help
352 Find out whether you have a PCI motherboard. PCI is the name of a 357 Find out whether you have a PCI motherboard. PCI is the name of a
353 bus system, i.e. the way the CPU talks to the other stuff inside 358 bus system, i.e. the way the CPU talks to the other stuff inside
354 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or 359 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
355 VESA. If you have PCI, say Y, otherwise N. 360 VESA. If you have PCI, say Y, otherwise N.
356 361
357 The PCI-HOWTO, available from 362 The PCI-HOWTO, available from
358 <http://www.linuxdoc.org/docs.html#howto>, contains valuable 363 <http://www.linuxdoc.org/docs.html#howto>, contains valuable
359 information about which PCI hardware does work under Linux and which 364 information about which PCI hardware does work under Linux and which
360 doesn't. 365 doesn't.
361 366
362 choice 367 choice
363 prompt "PCI access mode" 368 prompt "PCI access mode"
364 depends on PCI 369 depends on PCI
365 default PCI_GOANY 370 default PCI_GOANY
366 371
367 config PCI_GOBIOS 372 config PCI_GOBIOS
368 bool "BIOS" 373 bool "BIOS"
369 ---help--- 374 ---help---
370 On PCI systems, the BIOS can be used to detect the PCI devices and 375 On PCI systems, the BIOS can be used to detect the PCI devices and
371 determine their configuration. However, some old PCI motherboards 376 determine their configuration. However, some old PCI motherboards
372 have BIOS bugs and may crash if this is done. Also, some embedded 377 have BIOS bugs and may crash if this is done. Also, some embedded
373 PCI-based systems don't have any BIOS at all. Linux can also try to 378 PCI-based systems don't have any BIOS at all. Linux can also try to
374 detect the PCI hardware directly without using the BIOS. 379 detect the PCI hardware directly without using the BIOS.
375 380
376 With this option, you can specify how Linux should detect the PCI 381 With this option, you can specify how Linux should detect the PCI
377 devices. If you choose "BIOS", the BIOS will be used, if you choose 382 devices. If you choose "BIOS", the BIOS will be used, if you choose
378 "Direct", the BIOS won't be used, and if you choose "Any", the 383 "Direct", the BIOS won't be used, and if you choose "Any", the
379 kernel will try the direct access method and fall back to the BIOS 384 kernel will try the direct access method and fall back to the BIOS
380 if that doesn't work. If unsure, go with the default, which is 385 if that doesn't work. If unsure, go with the default, which is
381 "Any". 386 "Any".
382 387
383 config PCI_GODIRECT 388 config PCI_GODIRECT
384 bool "Direct" 389 bool "Direct"
385 390
386 config PCI_GOANY 391 config PCI_GOANY
387 bool "Any" 392 bool "Any"
388 393
389 endchoice 394 endchoice
390 395
391 config PCI_BIOS 396 config PCI_BIOS
392 bool 397 bool
393 depends on PCI && (PCI_GOBIOS || PCI_GOANY) 398 depends on PCI && (PCI_GOBIOS || PCI_GOANY)
394 default y 399 default y
395 400
396 config PCI_DIRECT 401 config PCI_DIRECT
397 bool 402 bool
398 depends on PCI && (PCI_GODIRECT || PCI_GOANY) 403 depends on PCI && (PCI_GODIRECT || PCI_GOANY)
399 default y 404 default y
400 405
401 source "drivers/pci/Kconfig" 406 source "drivers/pci/Kconfig"
402 407
403 config ISA 408 config ISA
404 bool "ISA support" 409 bool "ISA support"
405 help 410 help
406 Find out whether you have ISA slots on your motherboard. ISA is the 411 Find out whether you have ISA slots on your motherboard. ISA is the
407 name of a bus system, i.e. the way the CPU talks to the other stuff 412 name of a bus system, i.e. the way the CPU talks to the other stuff
408 inside your box. If you have ISA, say Y, otherwise N. 413 inside your box. If you have ISA, say Y, otherwise N.
409 414
410 source "drivers/pcmcia/Kconfig" 415 source "drivers/pcmcia/Kconfig"
411 416
412 source "drivers/pci/hotplug/Kconfig" 417 source "drivers/pci/hotplug/Kconfig"
413 418
414 endmenu 419 endmenu
415 420
416 421
417 menu "Executable file formats" 422 menu "Executable file formats"
418 423
419 source "fs/Kconfig.binfmt" 424 source "fs/Kconfig.binfmt"
420 425
421 endmenu 426 endmenu
422 427
423 source "net/Kconfig" 428 source "net/Kconfig"
424 429
425 source "drivers/Kconfig" 430 source "drivers/Kconfig"
426 431
427 source "fs/Kconfig" 432 source "fs/Kconfig"
428 433
429 source "kernel/Kconfig.instrumentation" 434 source "kernel/Kconfig.instrumentation"
430 435
431 source "arch/m32r/Kconfig.debug" 436 source "arch/m32r/Kconfig.debug"
432 437
433 source "security/Kconfig" 438 source "security/Kconfig"
434 439
435 source "crypto/Kconfig" 440 source "crypto/Kconfig"
436 441
437 source "lib/Kconfig" 442 source "lib/Kconfig"
438 443
1 config MIPS 1 config MIPS
2 bool 2 bool
3 default y 3 default y
4 # Horrible source of confusion. Die, die, die ... 4 # Horrible source of confusion. Die, die, die ...
5 select EMBEDDED 5 select EMBEDDED
6 select RTC_LIB 6 select RTC_LIB
7 7
8 mainmenu "Linux/MIPS Kernel Configuration" 8 mainmenu "Linux/MIPS Kernel Configuration"
9 9
10 menu "Machine selection" 10 menu "Machine selection"
11 11
12 config ZONE_DMA 12 config ZONE_DMA
13 bool 13 bool
14 14
15 choice 15 choice
16 prompt "System type" 16 prompt "System type"
17 default SGI_IP22 17 default SGI_IP22
18 18
19 config MACH_ALCHEMY 19 config MACH_ALCHEMY
20 bool "Alchemy processor based machines" 20 bool "Alchemy processor based machines"
21 21
22 config BASLER_EXCITE 22 config BASLER_EXCITE
23 bool "Basler eXcite smart camera" 23 bool "Basler eXcite smart camera"
24 select CEVT_R4K 24 select CEVT_R4K
25 select CSRC_R4K 25 select CSRC_R4K
26 select DMA_COHERENT 26 select DMA_COHERENT
27 select HW_HAS_PCI 27 select HW_HAS_PCI
28 select IRQ_CPU 28 select IRQ_CPU
29 select IRQ_CPU_RM7K 29 select IRQ_CPU_RM7K
30 select IRQ_CPU_RM9K 30 select IRQ_CPU_RM9K
31 select MIPS_RM9122 31 select MIPS_RM9122
32 select SYS_HAS_CPU_RM9000 32 select SYS_HAS_CPU_RM9000
33 select SYS_SUPPORTS_32BIT_KERNEL 33 select SYS_SUPPORTS_32BIT_KERNEL
34 select SYS_SUPPORTS_BIG_ENDIAN 34 select SYS_SUPPORTS_BIG_ENDIAN
35 select SYS_SUPPORTS_KGDB 35 select SYS_SUPPORTS_KGDB
36 help 36 help
37 The eXcite is a smart camera platform manufactured by 37 The eXcite is a smart camera platform manufactured by
38 Basler Vision Technologies AG. 38 Basler Vision Technologies AG.
39 39
40 config BCM47XX 40 config BCM47XX
41 bool "BCM47XX based boards" 41 bool "BCM47XX based boards"
42 select CEVT_R4K 42 select CEVT_R4K
43 select CSRC_R4K 43 select CSRC_R4K
44 select DMA_NONCOHERENT 44 select DMA_NONCOHERENT
45 select HW_HAS_PCI 45 select HW_HAS_PCI
46 select IRQ_CPU 46 select IRQ_CPU
47 select SYS_HAS_CPU_MIPS32_R1 47 select SYS_HAS_CPU_MIPS32_R1
48 select SYS_SUPPORTS_32BIT_KERNEL 48 select SYS_SUPPORTS_32BIT_KERNEL
49 select SYS_SUPPORTS_LITTLE_ENDIAN 49 select SYS_SUPPORTS_LITTLE_ENDIAN
50 select SSB 50 select SSB
51 select SSB_DRIVER_MIPS 51 select SSB_DRIVER_MIPS
52 select SSB_DRIVER_EXTIF 52 select SSB_DRIVER_EXTIF
53 select SSB_PCICORE_HOSTMODE if PCI 53 select SSB_PCICORE_HOSTMODE if PCI
54 select GENERIC_GPIO 54 select GENERIC_GPIO
55 select SYS_HAS_EARLY_PRINTK 55 select SYS_HAS_EARLY_PRINTK
56 select CFE 56 select CFE
57 help 57 help
58 Support for BCM47XX-based boards. 58 Support for BCM47XX-based boards.
59 59
60 config MIPS_COBALT 60 config MIPS_COBALT
61 bool "Cobalt Server" 61 bool "Cobalt Server"
62 select CEVT_R4K 62 select CEVT_R4K
63 select CSRC_R4K 63 select CSRC_R4K
64 select CEVT_GT641XX 64 select CEVT_GT641XX
65 select DMA_NONCOHERENT 65 select DMA_NONCOHERENT
66 select HW_HAS_PCI 66 select HW_HAS_PCI
67 select I8253 67 select I8253
68 select I8259 68 select I8259
69 select IRQ_CPU 69 select IRQ_CPU
70 select IRQ_GT641XX 70 select IRQ_GT641XX
71 select PCI_GT64XXX_PCI0 71 select PCI_GT64XXX_PCI0
72 select SYS_HAS_CPU_NEVADA 72 select SYS_HAS_CPU_NEVADA
73 select SYS_HAS_EARLY_PRINTK 73 select SYS_HAS_EARLY_PRINTK
74 select SYS_SUPPORTS_32BIT_KERNEL 74 select SYS_SUPPORTS_32BIT_KERNEL
75 select SYS_SUPPORTS_64BIT_KERNEL 75 select SYS_SUPPORTS_64BIT_KERNEL
76 select SYS_SUPPORTS_LITTLE_ENDIAN 76 select SYS_SUPPORTS_LITTLE_ENDIAN
77 select GENERIC_HARDIRQS_NO__DO_IRQ 77 select GENERIC_HARDIRQS_NO__DO_IRQ
78 78
79 config MACH_DECSTATION 79 config MACH_DECSTATION
80 bool "DECstations" 80 bool "DECstations"
81 select BOOT_ELF32 81 select BOOT_ELF32
82 select CEVT_R4K 82 select CEVT_R4K
83 select CSRC_R4K 83 select CSRC_R4K
84 select CPU_DADDI_WORKAROUNDS if 64BIT 84 select CPU_DADDI_WORKAROUNDS if 64BIT
85 select CPU_R4000_WORKAROUNDS if 64BIT 85 select CPU_R4000_WORKAROUNDS if 64BIT
86 select CPU_R4400_WORKAROUNDS if 64BIT 86 select CPU_R4400_WORKAROUNDS if 64BIT
87 select DMA_NONCOHERENT 87 select DMA_NONCOHERENT
88 select NO_IOPORT 88 select NO_IOPORT
89 select IRQ_CPU 89 select IRQ_CPU
90 select SYS_HAS_CPU_R3000 90 select SYS_HAS_CPU_R3000
91 select SYS_HAS_CPU_R4X00 91 select SYS_HAS_CPU_R4X00
92 select SYS_SUPPORTS_32BIT_KERNEL 92 select SYS_SUPPORTS_32BIT_KERNEL
93 select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL 93 select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL
94 select SYS_SUPPORTS_LITTLE_ENDIAN 94 select SYS_SUPPORTS_LITTLE_ENDIAN
95 select SYS_SUPPORTS_128HZ 95 select SYS_SUPPORTS_128HZ
96 select SYS_SUPPORTS_256HZ 96 select SYS_SUPPORTS_256HZ
97 select SYS_SUPPORTS_1024HZ 97 select SYS_SUPPORTS_1024HZ
98 help 98 help
99 This enables support for DEC's MIPS based workstations. For details 99 This enables support for DEC's MIPS based workstations. For details
100 see the Linux/MIPS FAQ on <http://www.linux-mips.org/> and the 100 see the Linux/MIPS FAQ on <http://www.linux-mips.org/> and the
101 DECstation porting pages on <http://decstation.unix-ag.org/>. 101 DECstation porting pages on <http://decstation.unix-ag.org/>.
102 102
103 If you have one of the following DECstation Models you definitely 103 If you have one of the following DECstation Models you definitely
104 want to choose R4xx0 for the CPU Type: 104 want to choose R4xx0 for the CPU Type:
105 105
106 DECstation 5000/50 106 DECstation 5000/50
107 DECstation 5000/150 107 DECstation 5000/150
108 DECstation 5000/260 108 DECstation 5000/260
109 DECsystem 5900/260 109 DECsystem 5900/260
110 110
111 otherwise choose R3000. 111 otherwise choose R3000.
112 112
113 config MACH_JAZZ 113 config MACH_JAZZ
114 bool "Jazz family of machines" 114 bool "Jazz family of machines"
115 select ARC 115 select ARC
116 select ARC32 116 select ARC32
117 select ARCH_MAY_HAVE_PC_FDC 117 select ARCH_MAY_HAVE_PC_FDC
118 select CEVT_R4K 118 select CEVT_R4K
119 select CSRC_R4K 119 select CSRC_R4K
120 select DEFAULT_SGI_PARTITION if CPU_BIG_ENDIAN 120 select DEFAULT_SGI_PARTITION if CPU_BIG_ENDIAN
121 select GENERIC_ISA_DMA 121 select GENERIC_ISA_DMA
122 select IRQ_CPU 122 select IRQ_CPU
123 select I8253 123 select I8253
124 select I8259 124 select I8259
125 select ISA 125 select ISA
126 select SYS_HAS_CPU_R4X00 126 select SYS_HAS_CPU_R4X00
127 select SYS_SUPPORTS_32BIT_KERNEL 127 select SYS_SUPPORTS_32BIT_KERNEL
128 select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL 128 select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL
129 select SYS_SUPPORTS_100HZ 129 select SYS_SUPPORTS_100HZ
130 select GENERIC_HARDIRQS_NO__DO_IRQ 130 select GENERIC_HARDIRQS_NO__DO_IRQ
131 help 131 help
132 This is a family of machines based on the MIPS R4030 chipset, which was 132 This is a family of machines based on the MIPS R4030 chipset, which was
133 used by several vendors to build RISC/os and Windows NT workstations. 133 used by several vendors to build RISC/os and Windows NT workstations.
134 Members include the Acer PICA, MIPS Magnum 4000, MIPS Millennium and 134 Members include the Acer PICA, MIPS Magnum 4000, MIPS Millennium and
135 Olivetti M700-10 workstations. 135 Olivetti M700-10 workstations.
136 136
137 config LASAT 137 config LASAT
138 bool "LASAT Networks platforms" 138 bool "LASAT Networks platforms"
139 select CEVT_R4K 139 select CEVT_R4K
140 select CSRC_R4K 140 select CSRC_R4K
141 select DMA_NONCOHERENT 141 select DMA_NONCOHERENT
142 select SYS_HAS_EARLY_PRINTK 142 select SYS_HAS_EARLY_PRINTK
143 select HW_HAS_PCI 143 select HW_HAS_PCI
144 select IRQ_CPU 144 select IRQ_CPU
145 select PCI_GT64XXX_PCI0 145 select PCI_GT64XXX_PCI0
146 select MIPS_NILE4 146 select MIPS_NILE4
147 select R5000_CPU_SCACHE 147 select R5000_CPU_SCACHE
148 select SYS_HAS_CPU_R5000 148 select SYS_HAS_CPU_R5000
149 select SYS_SUPPORTS_32BIT_KERNEL 149 select SYS_SUPPORTS_32BIT_KERNEL
150 select SYS_SUPPORTS_64BIT_KERNEL if BROKEN 150 select SYS_SUPPORTS_64BIT_KERNEL if BROKEN
151 select SYS_SUPPORTS_LITTLE_ENDIAN 151 select SYS_SUPPORTS_LITTLE_ENDIAN
152 select GENERIC_HARDIRQS_NO__DO_IRQ 152 select GENERIC_HARDIRQS_NO__DO_IRQ
153 153
154 config LEMOTE_FULONG 154 config LEMOTE_FULONG
155 bool "Lemote Fulong mini-PC" 155 bool "Lemote Fulong mini-PC"
156 select ARCH_SPARSEMEM_ENABLE 156 select ARCH_SPARSEMEM_ENABLE
157 select CEVT_R4K 157 select CEVT_R4K
158 select CSRC_R4K 158 select CSRC_R4K
159 select SYS_HAS_CPU_LOONGSON2 159 select SYS_HAS_CPU_LOONGSON2
160 select DMA_NONCOHERENT 160 select DMA_NONCOHERENT
161 select BOOT_ELF32 161 select BOOT_ELF32
162 select BOARD_SCACHE 162 select BOARD_SCACHE
163 select HAVE_STD_PC_SERIAL_PORT 163 select HAVE_STD_PC_SERIAL_PORT
164 select HW_HAS_PCI 164 select HW_HAS_PCI
165 select I8259 165 select I8259
166 select ISA 166 select ISA
167 select IRQ_CPU 167 select IRQ_CPU
168 select SYS_SUPPORTS_32BIT_KERNEL 168 select SYS_SUPPORTS_32BIT_KERNEL
169 select SYS_SUPPORTS_64BIT_KERNEL 169 select SYS_SUPPORTS_64BIT_KERNEL
170 select SYS_SUPPORTS_LITTLE_ENDIAN 170 select SYS_SUPPORTS_LITTLE_ENDIAN
171 select SYS_SUPPORTS_HIGHMEM 171 select SYS_SUPPORTS_HIGHMEM
172 select SYS_HAS_EARLY_PRINTK 172 select SYS_HAS_EARLY_PRINTK
173 select GENERIC_HARDIRQS_NO__DO_IRQ 173 select GENERIC_HARDIRQS_NO__DO_IRQ
174 select GENERIC_ISA_DMA_SUPPORT_BROKEN 174 select GENERIC_ISA_DMA_SUPPORT_BROKEN
175 select CPU_HAS_WB 175 select CPU_HAS_WB
176 help 176 help
177 Lemote Fulong mini-PC board based on the Chinese Loongson-2E CPU and 177 Lemote Fulong mini-PC board based on the Chinese Loongson-2E CPU and
178 an FPGA northbridge. 178 an FPGA northbridge.
179 179
180 config MIPS_ATLAS 180 config MIPS_ATLAS
181 bool "MIPS Atlas board" 181 bool "MIPS Atlas board"
182 select BOOT_ELF32 182 select BOOT_ELF32
183 select BOOT_RAW 183 select BOOT_RAW
184 select CEVT_R4K 184 select CEVT_R4K
185 select CSRC_R4K 185 select CSRC_R4K
186 select DMA_NONCOHERENT 186 select DMA_NONCOHERENT
187 select SYS_HAS_EARLY_PRINTK 187 select SYS_HAS_EARLY_PRINTK
188 select IRQ_CPU 188 select IRQ_CPU
189 select HW_HAS_PCI 189 select HW_HAS_PCI
190 select MIPS_BOARDS_GEN 190 select MIPS_BOARDS_GEN
191 select MIPS_BONITO64 191 select MIPS_BONITO64
192 select PCI_GT64XXX_PCI0 192 select PCI_GT64XXX_PCI0
193 select MIPS_MSC 193 select MIPS_MSC
194 select RM7000_CPU_SCACHE 194 select RM7000_CPU_SCACHE
195 select SWAP_IO_SPACE 195 select SWAP_IO_SPACE
196 select SYS_HAS_CPU_MIPS32_R1 196 select SYS_HAS_CPU_MIPS32_R1
197 select SYS_HAS_CPU_MIPS32_R2 197 select SYS_HAS_CPU_MIPS32_R2
198 select SYS_HAS_CPU_MIPS64_R1 198 select SYS_HAS_CPU_MIPS64_R1
199 select SYS_HAS_CPU_NEVADA 199 select SYS_HAS_CPU_NEVADA
200 select SYS_HAS_CPU_RM7000 200 select SYS_HAS_CPU_RM7000
201 select SYS_SUPPORTS_32BIT_KERNEL 201 select SYS_SUPPORTS_32BIT_KERNEL
202 select SYS_SUPPORTS_64BIT_KERNEL 202 select SYS_SUPPORTS_64BIT_KERNEL
203 select SYS_SUPPORTS_BIG_ENDIAN 203 select SYS_SUPPORTS_BIG_ENDIAN
204 select SYS_SUPPORTS_LITTLE_ENDIAN 204 select SYS_SUPPORTS_LITTLE_ENDIAN
205 select SYS_SUPPORTS_MULTITHREADING if EXPERIMENTAL 205 select SYS_SUPPORTS_MULTITHREADING if EXPERIMENTAL
206 select SYS_SUPPORTS_SMARTMIPS 206 select SYS_SUPPORTS_SMARTMIPS
207 select GENERIC_HARDIRQS_NO__DO_IRQ 207 select GENERIC_HARDIRQS_NO__DO_IRQ
208 help 208 help
209 This enables support for the MIPS Technologies Atlas evaluation 209 This enables support for the MIPS Technologies Atlas evaluation
210 board. 210 board.
211 211
212 config MIPS_MALTA 212 config MIPS_MALTA
213 bool "MIPS Malta board" 213 bool "MIPS Malta board"
214 select ARCH_MAY_HAVE_PC_FDC 214 select ARCH_MAY_HAVE_PC_FDC
215 select BOOT_ELF32 215 select BOOT_ELF32
216 select BOOT_RAW 216 select BOOT_RAW
217 select CEVT_R4K 217 select CEVT_R4K
218 select CSRC_R4K 218 select CSRC_R4K
219 select DMA_NONCOHERENT 219 select DMA_NONCOHERENT
220 select GENERIC_ISA_DMA 220 select GENERIC_ISA_DMA
221 select IRQ_CPU 221 select IRQ_CPU
222 select HW_HAS_PCI 222 select HW_HAS_PCI
223 select I8253 223 select I8253
224 select I8259 224 select I8259
225 select MIPS_BOARDS_GEN 225 select MIPS_BOARDS_GEN
226 select MIPS_BONITO64 226 select MIPS_BONITO64
227 select MIPS_CPU_SCACHE 227 select MIPS_CPU_SCACHE
228 select PCI_GT64XXX_PCI0 228 select PCI_GT64XXX_PCI0
229 select MIPS_MSC 229 select MIPS_MSC
230 select SWAP_IO_SPACE 230 select SWAP_IO_SPACE
231 select SYS_HAS_CPU_MIPS32_R1 231 select SYS_HAS_CPU_MIPS32_R1
232 select SYS_HAS_CPU_MIPS32_R2 232 select SYS_HAS_CPU_MIPS32_R2
233 select SYS_HAS_CPU_MIPS64_R1 233 select SYS_HAS_CPU_MIPS64_R1
234 select SYS_HAS_CPU_NEVADA 234 select SYS_HAS_CPU_NEVADA
235 select SYS_HAS_CPU_RM7000 235 select SYS_HAS_CPU_RM7000
236 select SYS_HAS_EARLY_PRINTK 236 select SYS_HAS_EARLY_PRINTK
237 select SYS_SUPPORTS_32BIT_KERNEL 237 select SYS_SUPPORTS_32BIT_KERNEL
238 select SYS_SUPPORTS_64BIT_KERNEL 238 select SYS_SUPPORTS_64BIT_KERNEL
239 select SYS_SUPPORTS_BIG_ENDIAN 239 select SYS_SUPPORTS_BIG_ENDIAN
240 select SYS_SUPPORTS_LITTLE_ENDIAN 240 select SYS_SUPPORTS_LITTLE_ENDIAN
241 select SYS_SUPPORTS_MULTITHREADING 241 select SYS_SUPPORTS_MULTITHREADING
242 select SYS_SUPPORTS_SMARTMIPS 242 select SYS_SUPPORTS_SMARTMIPS
243 help 243 help
244 This enables support for the MIPS Technologies Malta evaluation 244 This enables support for the MIPS Technologies Malta evaluation
245 board. 245 board.
246 246
247 config MIPS_SEAD 247 config MIPS_SEAD
248 bool "MIPS SEAD board" 248 bool "MIPS SEAD board"
249 select CEVT_R4K 249 select CEVT_R4K
250 select CSRC_R4K 250 select CSRC_R4K
251 select IRQ_CPU 251 select IRQ_CPU
252 select DMA_NONCOHERENT 252 select DMA_NONCOHERENT
253 select SYS_HAS_EARLY_PRINTK 253 select SYS_HAS_EARLY_PRINTK
254 select MIPS_BOARDS_GEN 254 select MIPS_BOARDS_GEN
255 select SYS_HAS_CPU_MIPS32_R1 255 select SYS_HAS_CPU_MIPS32_R1
256 select SYS_HAS_CPU_MIPS32_R2 256 select SYS_HAS_CPU_MIPS32_R2
257 select SYS_HAS_CPU_MIPS64_R1 257 select SYS_HAS_CPU_MIPS64_R1
258 select SYS_SUPPORTS_32BIT_KERNEL 258 select SYS_SUPPORTS_32BIT_KERNEL
259 select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL 259 select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL
260 select SYS_SUPPORTS_BIG_ENDIAN 260 select SYS_SUPPORTS_BIG_ENDIAN
261 select SYS_SUPPORTS_LITTLE_ENDIAN 261 select SYS_SUPPORTS_LITTLE_ENDIAN
262 select SYS_SUPPORTS_SMARTMIPS 262 select SYS_SUPPORTS_SMARTMIPS
263 help 263 help
264 This enables support for the MIPS Technologies SEAD evaluation 264 This enables support for the MIPS Technologies SEAD evaluation
265 board. 265 board.
266 266
267 config MIPS_SIM 267 config MIPS_SIM
268 bool 'MIPS simulator (MIPSsim)' 268 bool 'MIPS simulator (MIPSsim)'
269 select CEVT_R4K 269 select CEVT_R4K
270 select CSRC_R4K 270 select CSRC_R4K
271 select DMA_NONCOHERENT 271 select DMA_NONCOHERENT
272 select SYS_HAS_EARLY_PRINTK 272 select SYS_HAS_EARLY_PRINTK
273 select IRQ_CPU 273 select IRQ_CPU
274 select BOOT_RAW 274 select BOOT_RAW
275 select SYS_HAS_CPU_MIPS32_R1 275 select SYS_HAS_CPU_MIPS32_R1
276 select SYS_HAS_CPU_MIPS32_R2 276 select SYS_HAS_CPU_MIPS32_R2
277 select SYS_HAS_EARLY_PRINTK 277 select SYS_HAS_EARLY_PRINTK
278 select SYS_SUPPORTS_32BIT_KERNEL 278 select SYS_SUPPORTS_32BIT_KERNEL
279 select SYS_SUPPORTS_BIG_ENDIAN 279 select SYS_SUPPORTS_BIG_ENDIAN
280 select SYS_SUPPORTS_MULTITHREADING 280 select SYS_SUPPORTS_MULTITHREADING
281 select SYS_SUPPORTS_LITTLE_ENDIAN 281 select SYS_SUPPORTS_LITTLE_ENDIAN
282 help 282 help
283 This option enables support for the MIPS Technologies MIPSsim 283 This option enables support for the MIPS Technologies MIPSsim
284 software emulator. 284 software emulator.
285 285
286 config MARKEINS 286 config MARKEINS
287 bool "NEC EMMA2RH Mark-eins" 287 bool "NEC EMMA2RH Mark-eins"
288 select CEVT_R4K 288 select CEVT_R4K
289 select CSRC_R4K 289 select CSRC_R4K
290 select DMA_NONCOHERENT 290 select DMA_NONCOHERENT
291 select HW_HAS_PCI 291 select HW_HAS_PCI
292 select IRQ_CPU 292 select IRQ_CPU
293 select SWAP_IO_SPACE 293 select SWAP_IO_SPACE
294 select SYS_SUPPORTS_32BIT_KERNEL 294 select SYS_SUPPORTS_32BIT_KERNEL
295 select SYS_SUPPORTS_BIG_ENDIAN 295 select SYS_SUPPORTS_BIG_ENDIAN
296 select SYS_SUPPORTS_LITTLE_ENDIAN 296 select SYS_SUPPORTS_LITTLE_ENDIAN
297 select SYS_HAS_CPU_R5000 297 select SYS_HAS_CPU_R5000
298 help 298 help
299 This enables support for the R5432-based NEC Mark-eins 299 This enables support for the R5432-based NEC Mark-eins
300 boards with an R5500 CPU. 300 boards with an R5500 CPU.
301 301
302 config MACH_VR41XX 302 config MACH_VR41XX
303 bool "NEC VR4100 series based machines" 303 bool "NEC VR4100 series based machines"
304 select CEVT_R4K 304 select CEVT_R4K
305 select CSRC_R4K 305 select CSRC_R4K
306 select SYS_HAS_CPU_VR41XX 306 select SYS_HAS_CPU_VR41XX
307 select GENERIC_HARDIRQS_NO__DO_IRQ 307 select GENERIC_HARDIRQS_NO__DO_IRQ
308 308
309 config PNX8550_JBS 309 config PNX8550_JBS
310 bool "Philips PNX8550 based JBS board" 310 bool "Philips PNX8550 based JBS board"
311 select PNX8550 311 select PNX8550
312 select SYS_SUPPORTS_LITTLE_ENDIAN 312 select SYS_SUPPORTS_LITTLE_ENDIAN
313 313
314 config PNX8550_STB810 314 config PNX8550_STB810
315 bool "Philips PNX8550 based STB810 board" 315 bool "Philips PNX8550 based STB810 board"
316 select PNX8550 316 select PNX8550
317 select SYS_SUPPORTS_LITTLE_ENDIAN 317 select SYS_SUPPORTS_LITTLE_ENDIAN
318 318
319 config PMC_MSP 319 config PMC_MSP
320 bool "PMC-Sierra MSP chipsets" 320 bool "PMC-Sierra MSP chipsets"
321 depends on EXPERIMENTAL 321 depends on EXPERIMENTAL
322 select DMA_NONCOHERENT 322 select DMA_NONCOHERENT
323 select SWAP_IO_SPACE 323 select SWAP_IO_SPACE
324 select NO_EXCEPT_FILL 324 select NO_EXCEPT_FILL
325 select BOOT_RAW 325 select BOOT_RAW
326 select SYS_HAS_CPU_MIPS32_R1 326 select SYS_HAS_CPU_MIPS32_R1
327 select SYS_HAS_CPU_MIPS32_R2 327 select SYS_HAS_CPU_MIPS32_R2
328 select SYS_SUPPORTS_32BIT_KERNEL 328 select SYS_SUPPORTS_32BIT_KERNEL
329 select SYS_SUPPORTS_BIG_ENDIAN 329 select SYS_SUPPORTS_BIG_ENDIAN
330 select SYS_SUPPORTS_KGDB 330 select SYS_SUPPORTS_KGDB
331 select IRQ_CPU 331 select IRQ_CPU
332 select SERIAL_8250 332 select SERIAL_8250
333 select SERIAL_8250_CONSOLE 333 select SERIAL_8250_CONSOLE
334 help 334 help
335 This adds support for the PMC-Sierra family of Multi-Service 335 This adds support for the PMC-Sierra family of Multi-Service
336 Processor System-On-A-Chips. These parts include a number 336 Processor System-On-A-Chips. These parts include a number
337 of integrated peripherals, interfaces and DSPs in addition to 337 of integrated peripherals, interfaces and DSPs in addition to
338 a variety of MIPS cores. 338 a variety of MIPS cores.
339 339
340 config PMC_YOSEMITE 340 config PMC_YOSEMITE
341 bool "PMC-Sierra Yosemite eval board" 341 bool "PMC-Sierra Yosemite eval board"
342 select CEVT_R4K 342 select CEVT_R4K
343 select CSRC_R4K 343 select CSRC_R4K
344 select DMA_COHERENT 344 select DMA_COHERENT
345 select HW_HAS_PCI 345 select HW_HAS_PCI
346 select IRQ_CPU 346 select IRQ_CPU
347 select IRQ_CPU_RM7K 347 select IRQ_CPU_RM7K
348 select IRQ_CPU_RM9K 348 select IRQ_CPU_RM9K
349 select SWAP_IO_SPACE 349 select SWAP_IO_SPACE
350 select SYS_HAS_CPU_RM9000 350 select SYS_HAS_CPU_RM9000
351 select SYS_HAS_EARLY_PRINTK 351 select SYS_HAS_EARLY_PRINTK
352 select SYS_SUPPORTS_32BIT_KERNEL 352 select SYS_SUPPORTS_32BIT_KERNEL
353 select SYS_SUPPORTS_64BIT_KERNEL 353 select SYS_SUPPORTS_64BIT_KERNEL
354 select SYS_SUPPORTS_BIG_ENDIAN 354 select SYS_SUPPORTS_BIG_ENDIAN
355 select SYS_SUPPORTS_HIGHMEM 355 select SYS_SUPPORTS_HIGHMEM
356 select SYS_SUPPORTS_KGDB 356 select SYS_SUPPORTS_KGDB
357 select SYS_SUPPORTS_SMP 357 select SYS_SUPPORTS_SMP
358 help 358 help
359 Yosemite is an evaluation board for the RM9000x2 processor 359 Yosemite is an evaluation board for the RM9000x2 processor
360 manufactured by PMC-Sierra. 360 manufactured by PMC-Sierra.
361 361
362 config SGI_IP22 362 config SGI_IP22
363 bool "SGI IP22 (Indy/Indigo2)" 363 bool "SGI IP22 (Indy/Indigo2)"
364 select ARC 364 select ARC
365 select ARC32 365 select ARC32
366 select BOOT_ELF32 366 select BOOT_ELF32
367 select CEVT_R4K 367 select CEVT_R4K
368 select CSRC_R4K 368 select CSRC_R4K
369 select DEFAULT_SGI_PARTITION 369 select DEFAULT_SGI_PARTITION
370 select DMA_NONCOHERENT 370 select DMA_NONCOHERENT
371 select HW_HAS_EISA 371 select HW_HAS_EISA
372 select I8253 372 select I8253
373 select I8259 373 select I8259
374 select IP22_CPU_SCACHE 374 select IP22_CPU_SCACHE
375 select IRQ_CPU 375 select IRQ_CPU
376 select GENERIC_ISA_DMA_SUPPORT_BROKEN 376 select GENERIC_ISA_DMA_SUPPORT_BROKEN
377 select SGI_HAS_DS1286 377 select SGI_HAS_DS1286
378 select SGI_HAS_I8042 378 select SGI_HAS_I8042
379 select SGI_HAS_INDYDOG 379 select SGI_HAS_INDYDOG
380 select SGI_HAS_SEEQ 380 select SGI_HAS_SEEQ
381 select SGI_HAS_WD93 381 select SGI_HAS_WD93
382 select SGI_HAS_ZILOG 382 select SGI_HAS_ZILOG
383 select SWAP_IO_SPACE 383 select SWAP_IO_SPACE
384 select SYS_HAS_CPU_R4X00 384 select SYS_HAS_CPU_R4X00
385 select SYS_HAS_CPU_R5000 385 select SYS_HAS_CPU_R5000
386 select SYS_HAS_EARLY_PRINTK 386 select SYS_HAS_EARLY_PRINTK
387 select SYS_SUPPORTS_32BIT_KERNEL 387 select SYS_SUPPORTS_32BIT_KERNEL
388 select SYS_SUPPORTS_64BIT_KERNEL 388 select SYS_SUPPORTS_64BIT_KERNEL
389 select SYS_SUPPORTS_BIG_ENDIAN 389 select SYS_SUPPORTS_BIG_ENDIAN
390 help 390 help
391 These are the SGI Indy, Challenge S and Indigo2, as well as certain 391 These are the SGI Indy, Challenge S and Indigo2, as well as certain
392 OEM variants like the Tandem CMN B006S. To compile a Linux kernel 392 OEM variants like the Tandem CMN B006S. To compile a Linux kernel
393 that runs on these, say Y here. 393 that runs on these, say Y here.
394 394
395 config SGI_IP27 395 config SGI_IP27
396 bool "SGI IP27 (Origin200/2000)" 396 bool "SGI IP27 (Origin200/2000)"
397 select ARC 397 select ARC
398 select ARC64 398 select ARC64
399 select BOOT_ELF64 399 select BOOT_ELF64
400 select DEFAULT_SGI_PARTITION 400 select DEFAULT_SGI_PARTITION
401 select DMA_IP27 401 select DMA_IP27
402 select SYS_HAS_EARLY_PRINTK 402 select SYS_HAS_EARLY_PRINTK
403 select HW_HAS_PCI 403 select HW_HAS_PCI
404 select NR_CPUS_DEFAULT_64 404 select NR_CPUS_DEFAULT_64
405 select SYS_HAS_CPU_R10000 405 select SYS_HAS_CPU_R10000
406 select SYS_SUPPORTS_64BIT_KERNEL 406 select SYS_SUPPORTS_64BIT_KERNEL
407 select SYS_SUPPORTS_BIG_ENDIAN 407 select SYS_SUPPORTS_BIG_ENDIAN
408 select SYS_SUPPORTS_KGDB 408 select SYS_SUPPORTS_KGDB
409 select SYS_SUPPORTS_NUMA 409 select SYS_SUPPORTS_NUMA
410 select SYS_SUPPORTS_SMP 410 select SYS_SUPPORTS_SMP
411 select GENERIC_HARDIRQS_NO__DO_IRQ 411 select GENERIC_HARDIRQS_NO__DO_IRQ
412 help 412 help
413 These are the SGI Origin 200, Origin 2000 and Onyx 2 Graphics 413 These are the SGI Origin 200, Origin 2000 and Onyx 2 Graphics
414 workstations. To compile a Linux kernel that runs on these, say Y 414 workstations. To compile a Linux kernel that runs on these, say Y
415 here. 415 here.
416 416
417 config SGI_IP28 417 config SGI_IP28
418 bool "SGI IP28 (Indigo2 R10k) (EXPERIMENTAL)" 418 bool "SGI IP28 (Indigo2 R10k) (EXPERIMENTAL)"
419 depends on EXPERIMENTAL 419 depends on EXPERIMENTAL
420 select ARC 420 select ARC
421 select ARC64 421 select ARC64
422 select BOOT_ELF64 422 select BOOT_ELF64
423 select CEVT_R4K 423 select CEVT_R4K
424 select CSRC_R4K 424 select CSRC_R4K
425 select DEFAULT_SGI_PARTITION 425 select DEFAULT_SGI_PARTITION
426 select DMA_NONCOHERENT 426 select DMA_NONCOHERENT
427 select GENERIC_ISA_DMA_SUPPORT_BROKEN 427 select GENERIC_ISA_DMA_SUPPORT_BROKEN
428 select IRQ_CPU 428 select IRQ_CPU
429 select HW_HAS_EISA 429 select HW_HAS_EISA
430 select I8253 430 select I8253
431 select I8259 431 select I8259
432 select SGI_HAS_DS1286 432 select SGI_HAS_DS1286
433 select SGI_HAS_I8042 433 select SGI_HAS_I8042
434 select SGI_HAS_INDYDOG 434 select SGI_HAS_INDYDOG
435 select SGI_HAS_SEEQ 435 select SGI_HAS_SEEQ
436 select SGI_HAS_WD93 436 select SGI_HAS_WD93
437 select SGI_HAS_ZILOG 437 select SGI_HAS_ZILOG
438 select SWAP_IO_SPACE 438 select SWAP_IO_SPACE
439 select SYS_HAS_CPU_R10000 439 select SYS_HAS_CPU_R10000
440 select SYS_HAS_EARLY_PRINTK 440 select SYS_HAS_EARLY_PRINTK
441 select SYS_SUPPORTS_64BIT_KERNEL 441 select SYS_SUPPORTS_64BIT_KERNEL
442 select SYS_SUPPORTS_BIG_ENDIAN 442 select SYS_SUPPORTS_BIG_ENDIAN
443 help 443 help
444 This is the SGI Indigo2 with an R10000 processor. To compile a Linux 444 This is the SGI Indigo2 with an R10000 processor. To compile a Linux
445 kernel that runs on it, say Y here. 445 kernel that runs on it, say Y here.
446 446
447 config SGI_IP32 447 config SGI_IP32
448 bool "SGI IP32 (O2)" 448 bool "SGI IP32 (O2)"
449 select ARC 449 select ARC
450 select ARC32 450 select ARC32
451 select BOOT_ELF32 451 select BOOT_ELF32
452 select CEVT_R4K 452 select CEVT_R4K
453 select CSRC_R4K 453 select CSRC_R4K
454 select DMA_NONCOHERENT 454 select DMA_NONCOHERENT
455 select HW_HAS_PCI 455 select HW_HAS_PCI
456 select IRQ_CPU 456 select IRQ_CPU
457 select R5000_CPU_SCACHE 457 select R5000_CPU_SCACHE
458 select RM7000_CPU_SCACHE 458 select RM7000_CPU_SCACHE
459 select SYS_HAS_CPU_R5000 459 select SYS_HAS_CPU_R5000
460 select SYS_HAS_CPU_R10000 if BROKEN 460 select SYS_HAS_CPU_R10000 if BROKEN
461 select SYS_HAS_CPU_RM7000 461 select SYS_HAS_CPU_RM7000
462 select SYS_HAS_CPU_NEVADA 462 select SYS_HAS_CPU_NEVADA
463 select SYS_SUPPORTS_64BIT_KERNEL 463 select SYS_SUPPORTS_64BIT_KERNEL
464 select SYS_SUPPORTS_BIG_ENDIAN 464 select SYS_SUPPORTS_BIG_ENDIAN
465 help 465 help
466 If you want this kernel to run on an SGI O2 workstation, say Y here. 466 If you want this kernel to run on an SGI O2 workstation, say Y here.
467 467
468 config SIBYTE_CRHINE 468 config SIBYTE_CRHINE
469 bool "Sibyte BCM91120C-CRhine" 469 bool "Sibyte BCM91120C-CRhine"
470 depends on EXPERIMENTAL 470 depends on EXPERIMENTAL
471 select BOOT_ELF32 471 select BOOT_ELF32
472 select DMA_COHERENT 472 select DMA_COHERENT
473 select SIBYTE_BCM1120 473 select SIBYTE_BCM1120
474 select SWAP_IO_SPACE 474 select SWAP_IO_SPACE
475 select SYS_HAS_CPU_SB1 475 select SYS_HAS_CPU_SB1
476 select SYS_SUPPORTS_BIG_ENDIAN 476 select SYS_SUPPORTS_BIG_ENDIAN
477 select SYS_SUPPORTS_LITTLE_ENDIAN 477 select SYS_SUPPORTS_LITTLE_ENDIAN
478 478
479 config SIBYTE_CARMEL 479 config SIBYTE_CARMEL
480 bool "Sibyte BCM91120x-Carmel" 480 bool "Sibyte BCM91120x-Carmel"
481 depends on EXPERIMENTAL 481 depends on EXPERIMENTAL
482 select BOOT_ELF32 482 select BOOT_ELF32
483 select DMA_COHERENT 483 select DMA_COHERENT
484 select SIBYTE_BCM1120 484 select SIBYTE_BCM1120
485 select SWAP_IO_SPACE 485 select SWAP_IO_SPACE
486 select SYS_HAS_CPU_SB1 486 select SYS_HAS_CPU_SB1
487 select SYS_SUPPORTS_BIG_ENDIAN 487 select SYS_SUPPORTS_BIG_ENDIAN
488 select SYS_SUPPORTS_LITTLE_ENDIAN 488 select SYS_SUPPORTS_LITTLE_ENDIAN
489 489
490 config SIBYTE_CRHONE 490 config SIBYTE_CRHONE
491 bool "Sibyte BCM91125C-CRhone" 491 bool "Sibyte BCM91125C-CRhone"
492 depends on EXPERIMENTAL 492 depends on EXPERIMENTAL
493 select BOOT_ELF32 493 select BOOT_ELF32
494 select DMA_COHERENT 494 select DMA_COHERENT
495 select SIBYTE_BCM1125 495 select SIBYTE_BCM1125
496 select SWAP_IO_SPACE 496 select SWAP_IO_SPACE
497 select SYS_HAS_CPU_SB1 497 select SYS_HAS_CPU_SB1
498 select SYS_SUPPORTS_BIG_ENDIAN 498 select SYS_SUPPORTS_BIG_ENDIAN
499 select SYS_SUPPORTS_HIGHMEM 499 select SYS_SUPPORTS_HIGHMEM
500 select SYS_SUPPORTS_LITTLE_ENDIAN 500 select SYS_SUPPORTS_LITTLE_ENDIAN
501 501
502 config SIBYTE_RHONE 502 config SIBYTE_RHONE
503 bool "Sibyte BCM91125E-Rhone" 503 bool "Sibyte BCM91125E-Rhone"
504 depends on EXPERIMENTAL 504 depends on EXPERIMENTAL
505 select BOOT_ELF32 505 select BOOT_ELF32
506 select DMA_COHERENT 506 select DMA_COHERENT
507 select SIBYTE_BCM1125H 507 select SIBYTE_BCM1125H
508 select SWAP_IO_SPACE 508 select SWAP_IO_SPACE
509 select SYS_HAS_CPU_SB1 509 select SYS_HAS_CPU_SB1
510 select SYS_SUPPORTS_BIG_ENDIAN 510 select SYS_SUPPORTS_BIG_ENDIAN
511 select SYS_SUPPORTS_LITTLE_ENDIAN 511 select SYS_SUPPORTS_LITTLE_ENDIAN
512 512
513 config SIBYTE_SWARM 513 config SIBYTE_SWARM
514 bool "Sibyte BCM91250A-SWARM" 514 bool "Sibyte BCM91250A-SWARM"
515 select BOOT_ELF32 515 select BOOT_ELF32
516 select DMA_COHERENT 516 select DMA_COHERENT
517 select NR_CPUS_DEFAULT_2 517 select NR_CPUS_DEFAULT_2
518 select SIBYTE_SB1250 518 select SIBYTE_SB1250
519 select SWAP_IO_SPACE 519 select SWAP_IO_SPACE
520 select SYS_HAS_CPU_SB1 520 select SYS_HAS_CPU_SB1
521 select SYS_SUPPORTS_BIG_ENDIAN 521 select SYS_SUPPORTS_BIG_ENDIAN
522 select SYS_SUPPORTS_HIGHMEM 522 select SYS_SUPPORTS_HIGHMEM
523 select SYS_SUPPORTS_KGDB 523 select SYS_SUPPORTS_KGDB
524 select SYS_SUPPORTS_LITTLE_ENDIAN 524 select SYS_SUPPORTS_LITTLE_ENDIAN
525 select ZONE_DMA32 if 64BIT 525 select ZONE_DMA32 if 64BIT
526 526
527 config SIBYTE_LITTLESUR 527 config SIBYTE_LITTLESUR
528 bool "Sibyte BCM91250C2-LittleSur" 528 bool "Sibyte BCM91250C2-LittleSur"
529 depends on EXPERIMENTAL 529 depends on EXPERIMENTAL
530 select BOOT_ELF32 530 select BOOT_ELF32
531 select DMA_COHERENT 531 select DMA_COHERENT
532 select NR_CPUS_DEFAULT_2 532 select NR_CPUS_DEFAULT_2
533 select SIBYTE_SB1250 533 select SIBYTE_SB1250
534 select SWAP_IO_SPACE 534 select SWAP_IO_SPACE
535 select SYS_HAS_CPU_SB1 535 select SYS_HAS_CPU_SB1
536 select SYS_SUPPORTS_BIG_ENDIAN 536 select SYS_SUPPORTS_BIG_ENDIAN
537 select SYS_SUPPORTS_HIGHMEM 537 select SYS_SUPPORTS_HIGHMEM
538 select SYS_SUPPORTS_LITTLE_ENDIAN 538 select SYS_SUPPORTS_LITTLE_ENDIAN
539 539
540 config SIBYTE_SENTOSA 540 config SIBYTE_SENTOSA
541 bool "Sibyte BCM91250E-Sentosa" 541 bool "Sibyte BCM91250E-Sentosa"
542 depends on EXPERIMENTAL 542 depends on EXPERIMENTAL
543 select BOOT_ELF32 543 select BOOT_ELF32
544 select DMA_COHERENT 544 select DMA_COHERENT
545 select NR_CPUS_DEFAULT_2 545 select NR_CPUS_DEFAULT_2
546 select SIBYTE_SB1250 546 select SIBYTE_SB1250
547 select SWAP_IO_SPACE 547 select SWAP_IO_SPACE
548 select SYS_HAS_CPU_SB1 548 select SYS_HAS_CPU_SB1
549 select SYS_SUPPORTS_BIG_ENDIAN 549 select SYS_SUPPORTS_BIG_ENDIAN
550 select SYS_SUPPORTS_LITTLE_ENDIAN 550 select SYS_SUPPORTS_LITTLE_ENDIAN
551 551
552 config SIBYTE_BIGSUR 552 config SIBYTE_BIGSUR
553 bool "Sibyte BCM91480B-BigSur" 553 bool "Sibyte BCM91480B-BigSur"
554 select BOOT_ELF32 554 select BOOT_ELF32
555 select DMA_COHERENT 555 select DMA_COHERENT
556 select NR_CPUS_DEFAULT_4 556 select NR_CPUS_DEFAULT_4
557 select SIBYTE_BCM1x80 557 select SIBYTE_BCM1x80
558 select SWAP_IO_SPACE 558 select SWAP_IO_SPACE
559 select SYS_HAS_CPU_SB1 559 select SYS_HAS_CPU_SB1
560 select SYS_SUPPORTS_BIG_ENDIAN 560 select SYS_SUPPORTS_BIG_ENDIAN
561 select SYS_SUPPORTS_HIGHMEM 561 select SYS_SUPPORTS_HIGHMEM
562 select SYS_SUPPORTS_LITTLE_ENDIAN 562 select SYS_SUPPORTS_LITTLE_ENDIAN
563 select ZONE_DMA32 if 64BIT 563 select ZONE_DMA32 if 64BIT
564 564
565 config SNI_RM 565 config SNI_RM
566 bool "SNI RM200/300/400" 566 bool "SNI RM200/300/400"
567 select ARC if CPU_LITTLE_ENDIAN 567 select ARC if CPU_LITTLE_ENDIAN
568 select ARC32 if CPU_LITTLE_ENDIAN 568 select ARC32 if CPU_LITTLE_ENDIAN
569 select SNIPROM if CPU_BIG_ENDIAN 569 select SNIPROM if CPU_BIG_ENDIAN
570 select ARCH_MAY_HAVE_PC_FDC 570 select ARCH_MAY_HAVE_PC_FDC
571 select BOOT_ELF32 571 select BOOT_ELF32
572 select CEVT_R4K 572 select CEVT_R4K
573 select CSRC_R4K 573 select CSRC_R4K
574 select DEFAULT_SGI_PARTITION if CPU_BIG_ENDIAN 574 select DEFAULT_SGI_PARTITION if CPU_BIG_ENDIAN
575 select DMA_NONCOHERENT 575 select DMA_NONCOHERENT
576 select GENERIC_ISA_DMA 576 select GENERIC_ISA_DMA
577 select HW_HAS_EISA 577 select HW_HAS_EISA
578 select HW_HAS_PCI 578 select HW_HAS_PCI
579 select IRQ_CPU 579 select IRQ_CPU
580 select I8253 580 select I8253
581 select I8259 581 select I8259
582 select ISA 582 select ISA
583 select SWAP_IO_SPACE if CPU_BIG_ENDIAN 583 select SWAP_IO_SPACE if CPU_BIG_ENDIAN
584 select SYS_HAS_CPU_R4X00 584 select SYS_HAS_CPU_R4X00
585 select SYS_HAS_CPU_R5000 585 select SYS_HAS_CPU_R5000
586 select SYS_HAS_CPU_R10000 586 select SYS_HAS_CPU_R10000
587 select R5000_CPU_SCACHE 587 select R5000_CPU_SCACHE
588 select SYS_HAS_EARLY_PRINTK 588 select SYS_HAS_EARLY_PRINTK
589 select SYS_SUPPORTS_32BIT_KERNEL 589 select SYS_SUPPORTS_32BIT_KERNEL
590 select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL 590 select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL
591 select SYS_SUPPORTS_BIG_ENDIAN 591 select SYS_SUPPORTS_BIG_ENDIAN
592 select SYS_SUPPORTS_HIGHMEM 592 select SYS_SUPPORTS_HIGHMEM
593 select SYS_SUPPORTS_LITTLE_ENDIAN 593 select SYS_SUPPORTS_LITTLE_ENDIAN
594 help 594 help
595 The SNI RM200/300/400 are MIPS-based machines manufactured by 595 The SNI RM200/300/400 are MIPS-based machines manufactured by
596 Siemens Nixdorf Informationssysteme (SNI), parent company of Pyramid 596 Siemens Nixdorf Informationssysteme (SNI), parent company of Pyramid
597 Technology and now in turn merged with Fujitsu. Say Y here to 597 Technology and now in turn merged with Fujitsu. Say Y here to
598 support this machine type. 598 support this machine type.
599 599
600 config TOSHIBA_JMR3927 600 config TOSHIBA_JMR3927
601 bool "Toshiba JMR-TX3927 board" 601 bool "Toshiba JMR-TX3927 board"
602 select CEVT_TXX9 602 select CEVT_TXX9
603 select DMA_NONCOHERENT 603 select DMA_NONCOHERENT
604 select HW_HAS_PCI 604 select HW_HAS_PCI
605 select MIPS_TX3927 605 select MIPS_TX3927
606 select IRQ_TXX9 606 select IRQ_TXX9
607 select SWAP_IO_SPACE 607 select SWAP_IO_SPACE
608 select SYS_HAS_CPU_TX39XX 608 select SYS_HAS_CPU_TX39XX
609 select SYS_SUPPORTS_32BIT_KERNEL 609 select SYS_SUPPORTS_32BIT_KERNEL
610 select SYS_SUPPORTS_LITTLE_ENDIAN 610 select SYS_SUPPORTS_LITTLE_ENDIAN
611 select SYS_SUPPORTS_BIG_ENDIAN 611 select SYS_SUPPORTS_BIG_ENDIAN
612 select GENERIC_HARDIRQS_NO__DO_IRQ 612 select GENERIC_HARDIRQS_NO__DO_IRQ
613 613
614 config TOSHIBA_RBTX4927 614 config TOSHIBA_RBTX4927
615 bool "Toshiba RBTX49[23]7 board" 615 bool "Toshiba RBTX49[23]7 board"
616 select CEVT_R4K 616 select CEVT_R4K
617 select CSRC_R4K 617 select CSRC_R4K
618 select CEVT_TXX9 618 select CEVT_TXX9
619 select DMA_NONCOHERENT 619 select DMA_NONCOHERENT
620 select HAS_TXX9_SERIAL 620 select HAS_TXX9_SERIAL
621 select HW_HAS_PCI 621 select HW_HAS_PCI
622 select IRQ_CPU 622 select IRQ_CPU
623 select IRQ_TXX9 623 select IRQ_TXX9
624 select I8259 if TOSHIBA_FPCIB0 624 select I8259 if TOSHIBA_FPCIB0
625 select SWAP_IO_SPACE 625 select SWAP_IO_SPACE
626 select SYS_HAS_CPU_TX49XX 626 select SYS_HAS_CPU_TX49XX
627 select SYS_SUPPORTS_32BIT_KERNEL 627 select SYS_SUPPORTS_32BIT_KERNEL
628 select SYS_SUPPORTS_64BIT_KERNEL 628 select SYS_SUPPORTS_64BIT_KERNEL
629 select SYS_SUPPORTS_LITTLE_ENDIAN 629 select SYS_SUPPORTS_LITTLE_ENDIAN
630 select SYS_SUPPORTS_BIG_ENDIAN 630 select SYS_SUPPORTS_BIG_ENDIAN
631 select SYS_SUPPORTS_KGDB 631 select SYS_SUPPORTS_KGDB
632 select GENERIC_HARDIRQS_NO__DO_IRQ 632 select GENERIC_HARDIRQS_NO__DO_IRQ
633 help 633 help
634 This Toshiba board is based on the TX4927 processor. Say Y here to 634 This Toshiba board is based on the TX4927 processor. Say Y here to
635 support this machine type. 635 support this machine type.
636 636
637 config TOSHIBA_RBTX4938 637 config TOSHIBA_RBTX4938
638 bool "Toshiba RBTX4938 board" 638 bool "Toshiba RBTX4938 board"
639 select CEVT_R4K 639 select CEVT_R4K
640 select CSRC_R4K 640 select CSRC_R4K
641 select CEVT_TXX9 641 select CEVT_TXX9
642 select DMA_NONCOHERENT 642 select DMA_NONCOHERENT
643 select HAS_TXX9_SERIAL 643 select HAS_TXX9_SERIAL
644 select HW_HAS_PCI 644 select HW_HAS_PCI
645 select IRQ_CPU 645 select IRQ_CPU
646 select IRQ_TXX9 646 select IRQ_TXX9
647 select SWAP_IO_SPACE 647 select SWAP_IO_SPACE
648 select SYS_HAS_CPU_TX49XX 648 select SYS_HAS_CPU_TX49XX
649 select SYS_SUPPORTS_32BIT_KERNEL 649 select SYS_SUPPORTS_32BIT_KERNEL
650 select SYS_SUPPORTS_LITTLE_ENDIAN 650 select SYS_SUPPORTS_LITTLE_ENDIAN
651 select SYS_SUPPORTS_BIG_ENDIAN 651 select SYS_SUPPORTS_BIG_ENDIAN
652 select SYS_SUPPORTS_KGDB 652 select SYS_SUPPORTS_KGDB
653 select GENERIC_HARDIRQS_NO__DO_IRQ 653 select GENERIC_HARDIRQS_NO__DO_IRQ
654 select GENERIC_GPIO 654 select GENERIC_GPIO
655 help 655 help
656 This Toshiba board is based on the TX4938 processor. Say Y here to 656 This Toshiba board is based on the TX4938 processor. Say Y here to
657 support this machine type. 657 support this machine type.
658 658
659 config WR_PPMC 659 config WR_PPMC
660 bool "Wind River PPMC board" 660 bool "Wind River PPMC board"
661 select CEVT_R4K 661 select CEVT_R4K
662 select CSRC_R4K 662 select CSRC_R4K
663 select IRQ_CPU 663 select IRQ_CPU
664 select BOOT_ELF32 664 select BOOT_ELF32
665 select DMA_NONCOHERENT 665 select DMA_NONCOHERENT
666 select HW_HAS_PCI 666 select HW_HAS_PCI
667 select PCI_GT64XXX_PCI0 667 select PCI_GT64XXX_PCI0
668 select SWAP_IO_SPACE 668 select SWAP_IO_SPACE
669 select SYS_HAS_CPU_MIPS32_R1 669 select SYS_HAS_CPU_MIPS32_R1
670 select SYS_HAS_CPU_MIPS32_R2 670 select SYS_HAS_CPU_MIPS32_R2
671 select SYS_HAS_CPU_MIPS64_R1 671 select SYS_HAS_CPU_MIPS64_R1
672 select SYS_HAS_CPU_NEVADA 672 select SYS_HAS_CPU_NEVADA
673 select SYS_HAS_CPU_RM7000 673 select SYS_HAS_CPU_RM7000
674 select SYS_SUPPORTS_32BIT_KERNEL 674 select SYS_SUPPORTS_32BIT_KERNEL
675 select SYS_SUPPORTS_64BIT_KERNEL 675 select SYS_SUPPORTS_64BIT_KERNEL
676 select SYS_SUPPORTS_BIG_ENDIAN 676 select SYS_SUPPORTS_BIG_ENDIAN
677 select SYS_SUPPORTS_LITTLE_ENDIAN 677 select SYS_SUPPORTS_LITTLE_ENDIAN
678 help 678 help
679 This enables support for the Wind River MIPS32 4KC PPMC evaluation 679 This enables support for the Wind River MIPS32 4KC PPMC evaluation
680 board, which is based on the GT64120 bridge chip. 680 board, which is based on the GT64120 bridge chip.
681 681
682 endchoice 682 endchoice
683 683
684 source "arch/mips/au1000/Kconfig" 684 source "arch/mips/au1000/Kconfig"
685 source "arch/mips/basler/excite/Kconfig" 685 source "arch/mips/basler/excite/Kconfig"
686 source "arch/mips/jazz/Kconfig" 686 source "arch/mips/jazz/Kconfig"
687 source "arch/mips/lasat/Kconfig" 687 source "arch/mips/lasat/Kconfig"
688 source "arch/mips/pmc-sierra/Kconfig" 688 source "arch/mips/pmc-sierra/Kconfig"
689 source "arch/mips/sgi-ip27/Kconfig" 689 source "arch/mips/sgi-ip27/Kconfig"
690 source "arch/mips/sibyte/Kconfig" 690 source "arch/mips/sibyte/Kconfig"
691 source "arch/mips/tx4927/Kconfig" 691 source "arch/mips/tx4927/Kconfig"
692 source "arch/mips/tx4938/Kconfig" 692 source "arch/mips/tx4938/Kconfig"
693 source "arch/mips/vr41xx/Kconfig" 693 source "arch/mips/vr41xx/Kconfig"
694 694
695 endmenu 695 endmenu
696 696
697 config GENERIC_LOCKBREAK
698 bool
699 default y
700 depends on SMP && PREEMPT
701
697 config RWSEM_GENERIC_SPINLOCK 702 config RWSEM_GENERIC_SPINLOCK
698 bool 703 bool
699 default y 704 default y
700 705
701 config RWSEM_XCHGADD_ALGORITHM 706 config RWSEM_XCHGADD_ALGORITHM
702 bool 707 bool
703 708
704 config ARCH_HAS_ILOG2_U32 709 config ARCH_HAS_ILOG2_U32
705 bool 710 bool
706 default n 711 default n
707 712
708 config ARCH_HAS_ILOG2_U64 713 config ARCH_HAS_ILOG2_U64
709 bool 714 bool
710 default n 715 default n
711 716
712 config ARCH_SUPPORTS_OPROFILE 717 config ARCH_SUPPORTS_OPROFILE
713 bool 718 bool
714 default y if !MIPS_MT_SMTC 719 default y if !MIPS_MT_SMTC
715 720
716 config GENERIC_FIND_NEXT_BIT 721 config GENERIC_FIND_NEXT_BIT
717 bool 722 bool
718 default y 723 default y
719 724
720 config GENERIC_HWEIGHT 725 config GENERIC_HWEIGHT
721 bool 726 bool
722 default y 727 default y
723 728
724 config GENERIC_CALIBRATE_DELAY 729 config GENERIC_CALIBRATE_DELAY
725 bool 730 bool
726 default y 731 default y
727 732
728 config GENERIC_CLOCKEVENTS 733 config GENERIC_CLOCKEVENTS
729 bool 734 bool
730 default y 735 default y
731 736
732 config GENERIC_TIME 737 config GENERIC_TIME
733 bool 738 bool
734 default y 739 default y
735 740
736 config GENERIC_CMOS_UPDATE 741 config GENERIC_CMOS_UPDATE
737 bool 742 bool
738 default y 743 default y
739 744
740 config SCHED_NO_NO_OMIT_FRAME_POINTER 745 config SCHED_NO_NO_OMIT_FRAME_POINTER
741 bool 746 bool
742 default y 747 default y
743 748
744 config GENERIC_HARDIRQS_NO__DO_IRQ 749 config GENERIC_HARDIRQS_NO__DO_IRQ
745 bool 750 bool
746 default n 751 default n
747 752
748 # 753 #
749 # Select some configuration options automatically based on user selections. 754 # Select some configuration options automatically based on user selections.
750 # 755 #
751 config ARC 756 config ARC
752 bool 757 bool
753 758
754 config ARCH_MAY_HAVE_PC_FDC 759 config ARCH_MAY_HAVE_PC_FDC
755 bool 760 bool
756 761
757 config BOOT_RAW 762 config BOOT_RAW
758 bool 763 bool
759 764
760 config CEVT_BCM1480 765 config CEVT_BCM1480
761 bool 766 bool
762 767
763 config CEVT_GT641XX 768 config CEVT_GT641XX
764 bool 769 bool
765 770
766 config CEVT_R4K 771 config CEVT_R4K
767 bool 772 bool
768 773
769 config CEVT_SB1250 774 config CEVT_SB1250
770 bool 775 bool
771 776
772 config CEVT_TXX9 777 config CEVT_TXX9
773 bool 778 bool
774 779
775 config CSRC_BCM1480 780 config CSRC_BCM1480
776 bool 781 bool
777 782
778 config CSRC_R4K 783 config CSRC_R4K
779 bool 784 bool
780 785
781 config CSRC_SB1250 786 config CSRC_SB1250
782 bool 787 bool
783 788
784 config CFE 789 config CFE
785 bool 790 bool
786 791
787 config DMA_COHERENT 792 config DMA_COHERENT
788 bool 793 bool
789 794
790 config DMA_IP27 795 config DMA_IP27
791 bool 796 bool
792 797
793 config DMA_NONCOHERENT 798 config DMA_NONCOHERENT
794 bool 799 bool
795 select DMA_NEED_PCI_MAP_STATE 800 select DMA_NEED_PCI_MAP_STATE
796 801
797 config DMA_NEED_PCI_MAP_STATE 802 config DMA_NEED_PCI_MAP_STATE
798 bool 803 bool
799 804
800 config EARLY_PRINTK 805 config EARLY_PRINTK
801 bool "Early printk" if EMBEDDED && DEBUG_KERNEL 806 bool "Early printk" if EMBEDDED && DEBUG_KERNEL
802 depends on SYS_HAS_EARLY_PRINTK 807 depends on SYS_HAS_EARLY_PRINTK
803 default y 808 default y
804 help 809 help
805 This option enables special console drivers which allow the kernel 810 This option enables special console drivers which allow the kernel
806 to print messages very early in the bootup process. 811 to print messages very early in the bootup process.
807 812
808 This is useful for kernel debugging when your machine crashes very 813 This is useful for kernel debugging when your machine crashes very
809 early before the console code is initialized. For normal operation, 814 early before the console code is initialized. For normal operation,
810 it is not recommended because it looks ugly on some machines and 815 it is not recommended because it looks ugly on some machines and
811 doesn't cooperate with an X server. You should normally say N here, 816 doesn't cooperate with an X server. You should normally say N here,
812 unless you want to debug such a crash. 817 unless you want to debug such a crash.
813 818
814 config SYS_HAS_EARLY_PRINTK 819 config SYS_HAS_EARLY_PRINTK
815 bool 820 bool
816 821
817 config HOTPLUG_CPU 822 config HOTPLUG_CPU
818 bool 823 bool
819 default n 824 default n
820 825
821 config I8259 826 config I8259
822 bool 827 bool
823 828
824 config MIPS_BONITO64 829 config MIPS_BONITO64
825 bool 830 bool
826 831
827 config MIPS_MSC 832 config MIPS_MSC
828 bool 833 bool
829 834
830 config MIPS_NILE4 835 config MIPS_NILE4
831 bool 836 bool
832 837
833 config MIPS_DISABLE_OBSOLETE_IDE 838 config MIPS_DISABLE_OBSOLETE_IDE
834 bool 839 bool
835 840
836 config NO_IOPORT 841 config NO_IOPORT
837 def_bool n 842 def_bool n
838 843
839 config GENERIC_ISA_DMA 844 config GENERIC_ISA_DMA
840 bool 845 bool
841 select ZONE_DMA if GENERIC_ISA_DMA_SUPPORT_BROKEN=n 846 select ZONE_DMA if GENERIC_ISA_DMA_SUPPORT_BROKEN=n
842 847
843 config GENERIC_ISA_DMA_SUPPORT_BROKEN 848 config GENERIC_ISA_DMA_SUPPORT_BROKEN
844 bool 849 bool
845 select GENERIC_ISA_DMA 850 select GENERIC_ISA_DMA
846 851
847 config GENERIC_GPIO 852 config GENERIC_GPIO
848 bool 853 bool
849 854
850 # 855 #
851 # Endianness selection. Sufficiently obscure that many users don't know what to 856 # Endianness selection. Sufficiently obscure that many users don't know what to
852 # answer, so we try hard to limit the available choices. Also the use of a 857 # answer, so we try hard to limit the available choices. Also the use of a
853 # choice statement should be more obvious to the user. 858 # choice statement should be more obvious to the user.
854 # 859 #
855 choice 860 choice
856 prompt "Endianess selection" 861 prompt "Endianess selection"
857 help 862 help
858 Some MIPS machines can be configured for either little or big endian 863 Some MIPS machines can be configured for either little or big endian
859 byte order. These modes require different kernels and a different 864 byte order. These modes require different kernels and a different
860 Linux distribution. In general there is one preferred byte order for a 865 Linux distribution. In general there is one preferred byte order for a
861 particular system, but some systems are just as commonly used in 866 particular system, but some systems are just as commonly used in
862 either endianness. 867 either endianness.
863 868
864 config CPU_BIG_ENDIAN 869 config CPU_BIG_ENDIAN
865 bool "Big endian" 870 bool "Big endian"
866 depends on SYS_SUPPORTS_BIG_ENDIAN 871 depends on SYS_SUPPORTS_BIG_ENDIAN
867 872
868 config CPU_LITTLE_ENDIAN 873 config CPU_LITTLE_ENDIAN
869 bool "Little endian" 874 bool "Little endian"
870 depends on SYS_SUPPORTS_LITTLE_ENDIAN 875 depends on SYS_SUPPORTS_LITTLE_ENDIAN
871 help 876 help
872 877
873 endchoice 878 endchoice
874 879
875 config SYS_SUPPORTS_APM_EMULATION 880 config SYS_SUPPORTS_APM_EMULATION
876 bool 881 bool
877 882
878 config SYS_SUPPORTS_BIG_ENDIAN 883 config SYS_SUPPORTS_BIG_ENDIAN
879 bool 884 bool
880 885
881 config SYS_SUPPORTS_LITTLE_ENDIAN 886 config SYS_SUPPORTS_LITTLE_ENDIAN
882 bool 887 bool
883 888
884 config IRQ_CPU 889 config IRQ_CPU
885 bool 890 bool
886 891
887 config IRQ_CPU_RM7K 892 config IRQ_CPU_RM7K
888 bool 893 bool
889 894
890 config IRQ_CPU_RM9K 895 config IRQ_CPU_RM9K
891 bool 896 bool
892 897
893 config IRQ_MSP_SLP 898 config IRQ_MSP_SLP
894 bool 899 bool
895 900
896 config IRQ_MSP_CIC 901 config IRQ_MSP_CIC
897 bool 902 bool
898 903
899 config IRQ_TXX9 904 config IRQ_TXX9
900 bool 905 bool
901 906
902 config IRQ_GT641XX 907 config IRQ_GT641XX
903 bool 908 bool
904 909
905 config MIPS_BOARDS_GEN 910 config MIPS_BOARDS_GEN
906 bool 911 bool
907 912
908 config PCI_GT64XXX_PCI0 913 config PCI_GT64XXX_PCI0
909 bool 914 bool
910 915
911 config NO_EXCEPT_FILL 916 config NO_EXCEPT_FILL
912 bool 917 bool
913 918
914 config MIPS_TX3927 919 config MIPS_TX3927
915 bool 920 bool
916 select HAS_TXX9_SERIAL 921 select HAS_TXX9_SERIAL
917 922
918 config MIPS_RM9122 923 config MIPS_RM9122
919 bool 924 bool
920 select SERIAL_RM9000 925 select SERIAL_RM9000
921 926
922 config PNX8550 927 config PNX8550
923 bool 928 bool
924 select SOC_PNX8550 929 select SOC_PNX8550
925 930
926 config SOC_PNX8550 931 config SOC_PNX8550
927 bool 932 bool
928 select DMA_NONCOHERENT 933 select DMA_NONCOHERENT
929 select HW_HAS_PCI 934 select HW_HAS_PCI
930 select SYS_HAS_CPU_MIPS32_R1 935 select SYS_HAS_CPU_MIPS32_R1
931 select SYS_HAS_EARLY_PRINTK 936 select SYS_HAS_EARLY_PRINTK
932 select SYS_SUPPORTS_32BIT_KERNEL 937 select SYS_SUPPORTS_32BIT_KERNEL
933 select GENERIC_HARDIRQS_NO__DO_IRQ 938 select GENERIC_HARDIRQS_NO__DO_IRQ
934 select SYS_SUPPORTS_KGDB 939 select SYS_SUPPORTS_KGDB
935 select GENERIC_GPIO 940 select GENERIC_GPIO
936 941
937 config SWAP_IO_SPACE 942 config SWAP_IO_SPACE
938 bool 943 bool
939 944
940 config EMMA2RH 945 config EMMA2RH
941 bool 946 bool
942 depends on MARKEINS 947 depends on MARKEINS
943 default y 948 default y
944 949
945 config SERIAL_RM9000 950 config SERIAL_RM9000
946 bool 951 bool
947 952
948 config SGI_HAS_DS1286 953 config SGI_HAS_DS1286
949 bool 954 bool
950 955
951 config SGI_HAS_INDYDOG 956 config SGI_HAS_INDYDOG
952 bool 957 bool
953 958
954 config SGI_HAS_SEEQ 959 config SGI_HAS_SEEQ
955 bool 960 bool
956 961
957 config SGI_HAS_WD93 962 config SGI_HAS_WD93
958 bool 963 bool
959 964
960 config SGI_HAS_ZILOG 965 config SGI_HAS_ZILOG
961 bool 966 bool
962 967
963 config SGI_HAS_I8042 968 config SGI_HAS_I8042
964 bool 969 bool
965 970
966 config DEFAULT_SGI_PARTITION 971 config DEFAULT_SGI_PARTITION
967 bool 972 bool
968 973
969 config ARC32 974 config ARC32
970 bool 975 bool
971 976
972 config SNIPROM 977 config SNIPROM
973 bool 978 bool
974 979
975 config BOOT_ELF32 980 config BOOT_ELF32
976 bool 981 bool
977 982
978 config MIPS_L1_CACHE_SHIFT 983 config MIPS_L1_CACHE_SHIFT
979 int 984 int
980 default "4" if MACH_DECSTATION 985 default "4" if MACH_DECSTATION
981 default "7" if SGI_IP27 || SGI_IP28 || SNI_RM 986 default "7" if SGI_IP27 || SGI_IP28 || SNI_RM
982 default "4" if PMC_MSP4200_EVAL 987 default "4" if PMC_MSP4200_EVAL
983 default "5" 988 default "5"
984 989
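The shift above is the base-2 logarithm of the L1 cache line size in bytes. A minimal sketch of the arithmetic, assuming the conventional L1_CACHE_SHIFT/L1_CACHE_BYTES macro names (illustrative only, not part of this patch):

    /* Illustrative: a shift of 5 means 32-byte L1 cache lines,
     * a shift of 7 (SGI_IP27 and friends above) means 128-byte lines. */
    #define L1_CACHE_SHIFT  5
    #define L1_CACHE_BYTES  (1 << L1_CACHE_SHIFT)   /* 32 bytes */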
985 config HAVE_STD_PC_SERIAL_PORT 990 config HAVE_STD_PC_SERIAL_PORT
986 bool 991 bool
987 992
988 config ARC_CONSOLE 993 config ARC_CONSOLE
989 bool "ARC console support" 994 bool "ARC console support"
990 depends on SGI_IP22 || SGI_IP28 || (SNI_RM && CPU_LITTLE_ENDIAN) 995 depends on SGI_IP22 || SGI_IP28 || (SNI_RM && CPU_LITTLE_ENDIAN)
991 996
992 config ARC_MEMORY 997 config ARC_MEMORY
993 bool 998 bool
994 depends on MACH_JAZZ || SNI_RM || SGI_IP32 999 depends on MACH_JAZZ || SNI_RM || SGI_IP32
995 default y 1000 default y
996 1001
997 config ARC_PROMLIB 1002 config ARC_PROMLIB
998 bool 1003 bool
999 depends on MACH_JAZZ || SNI_RM || SGI_IP22 || SGI_IP28 || SGI_IP32 1004 depends on MACH_JAZZ || SNI_RM || SGI_IP22 || SGI_IP28 || SGI_IP32
1000 default y 1005 default y
1001 1006
1002 config ARC64 1007 config ARC64
1003 bool 1008 bool
1004 1009
1005 config BOOT_ELF64 1010 config BOOT_ELF64
1006 bool 1011 bool
1007 1012
1008 menu "CPU selection" 1013 menu "CPU selection"
1009 1014
1010 choice 1015 choice
1011 prompt "CPU type" 1016 prompt "CPU type"
1012 default CPU_R4X00 1017 default CPU_R4X00
1013 1018
1014 config CPU_LOONGSON2 1019 config CPU_LOONGSON2
1015 bool "Loongson 2" 1020 bool "Loongson 2"
1016 depends on SYS_HAS_CPU_LOONGSON2 1021 depends on SYS_HAS_CPU_LOONGSON2
1017 select CPU_SUPPORTS_32BIT_KERNEL 1022 select CPU_SUPPORTS_32BIT_KERNEL
1018 select CPU_SUPPORTS_64BIT_KERNEL 1023 select CPU_SUPPORTS_64BIT_KERNEL
1019 select CPU_SUPPORTS_HIGHMEM 1024 select CPU_SUPPORTS_HIGHMEM
1020 help 1025 help
1021 The Loongson 2E processor implements the MIPS III instruction set 1026 The Loongson 2E processor implements the MIPS III instruction set
1022 with many extensions. 1027 with many extensions.
1023 1028
1024 config CPU_MIPS32_R1 1029 config CPU_MIPS32_R1
1025 bool "MIPS32 Release 1" 1030 bool "MIPS32 Release 1"
1026 depends on SYS_HAS_CPU_MIPS32_R1 1031 depends on SYS_HAS_CPU_MIPS32_R1
1027 select CPU_HAS_LLSC 1032 select CPU_HAS_LLSC
1028 select CPU_HAS_PREFETCH 1033 select CPU_HAS_PREFETCH
1029 select CPU_SUPPORTS_32BIT_KERNEL 1034 select CPU_SUPPORTS_32BIT_KERNEL
1030 select CPU_SUPPORTS_HIGHMEM 1035 select CPU_SUPPORTS_HIGHMEM
1031 help 1036 help
1032 Choose this option to build a kernel for release 1 or later of the 1037 Choose this option to build a kernel for release 1 or later of the
1033 MIPS32 architecture. Most modern embedded systems with a 32-bit 1038 MIPS32 architecture. Most modern embedded systems with a 32-bit
1034 MIPS processor are based on a MIPS32 processor. If you know the 1039 MIPS processor are based on a MIPS32 processor. If you know the
1035 specific type of processor in your system, choose that one; 1040 specific type of processor in your system, choose that one;
1036 otherwise CPU_MIPS32_R1 is a safe bet for any MIPS32 system. 1041 otherwise CPU_MIPS32_R1 is a safe bet for any MIPS32 system.
1037 Release 2 of the MIPS32 architecture has been available for several 1042 Release 2 of the MIPS32 architecture has been available for several
1038 years now, so chances are you have a MIPS32 Release 2 processor, 1043 years now, so chances are you have a MIPS32 Release 2 processor,
1039 in which case you should choose CPU_MIPS32_R2 instead for better 1044 in which case you should choose CPU_MIPS32_R2 instead for better
1040 performance. 1045 performance.
1041 1046
1042 config CPU_MIPS32_R2 1047 config CPU_MIPS32_R2
1043 bool "MIPS32 Release 2" 1048 bool "MIPS32 Release 2"
1044 depends on SYS_HAS_CPU_MIPS32_R2 1049 depends on SYS_HAS_CPU_MIPS32_R2
1045 select CPU_HAS_LLSC 1050 select CPU_HAS_LLSC
1046 select CPU_HAS_PREFETCH 1051 select CPU_HAS_PREFETCH
1047 select CPU_SUPPORTS_32BIT_KERNEL 1052 select CPU_SUPPORTS_32BIT_KERNEL
1048 select CPU_SUPPORTS_HIGHMEM 1053 select CPU_SUPPORTS_HIGHMEM
1049 help 1054 help
1050 Choose this option to build a kernel for release 2 or later of the 1055 Choose this option to build a kernel for release 2 or later of the
1051 MIPS32 architecture. Most modern embedded systems with a 32-bit 1056 MIPS32 architecture. Most modern embedded systems with a 32-bit
1052 MIPS processor are based on a MIPS32 processor. If you know the 1057 MIPS processor are based on a MIPS32 processor. If you know the
1053 specific type of processor in your system, choose that one; 1058 specific type of processor in your system, choose that one;
1054 otherwise CPU_MIPS32_R1 is a safe bet for any MIPS32 system. 1059 otherwise CPU_MIPS32_R1 is a safe bet for any MIPS32 system.
1055 1060
1056 config CPU_MIPS64_R1 1061 config CPU_MIPS64_R1
1057 bool "MIPS64 Release 1" 1062 bool "MIPS64 Release 1"
1058 depends on SYS_HAS_CPU_MIPS64_R1 1063 depends on SYS_HAS_CPU_MIPS64_R1
1059 select CPU_HAS_LLSC 1064 select CPU_HAS_LLSC
1060 select CPU_HAS_PREFETCH 1065 select CPU_HAS_PREFETCH
1061 select CPU_SUPPORTS_32BIT_KERNEL 1066 select CPU_SUPPORTS_32BIT_KERNEL
1062 select CPU_SUPPORTS_64BIT_KERNEL 1067 select CPU_SUPPORTS_64BIT_KERNEL
1063 select CPU_SUPPORTS_HIGHMEM 1068 select CPU_SUPPORTS_HIGHMEM
1064 help 1069 help
1065 Choose this option to build a kernel for release 1 or later of the 1070 Choose this option to build a kernel for release 1 or later of the
1066 MIPS64 architecture. Many modern embedded systems with a 64-bit 1071 MIPS64 architecture. Many modern embedded systems with a 64-bit
1067 MIPS processor are based on a MIPS64 processor. If you know the 1072 MIPS processor are based on a MIPS64 processor. If you know the
1068 specific type of processor in your system, choose that one; 1073 specific type of processor in your system, choose that one;
1069 otherwise CPU_MIPS64_R1 is a safe bet for any MIPS64 system. 1074 otherwise CPU_MIPS64_R1 is a safe bet for any MIPS64 system.
1070 Release 2 of the MIPS64 architecture has been available for several 1075 Release 2 of the MIPS64 architecture has been available for several
1071 years now, so chances are you have a MIPS64 Release 2 processor, 1076 years now, so chances are you have a MIPS64 Release 2 processor,
1072 in which case you should choose CPU_MIPS64_R2 instead for better 1077 in which case you should choose CPU_MIPS64_R2 instead for better
1073 performance. 1078 performance.
1074 1079
1075 config CPU_MIPS64_R2 1080 config CPU_MIPS64_R2
1076 bool "MIPS64 Release 2" 1081 bool "MIPS64 Release 2"
1077 depends on SYS_HAS_CPU_MIPS64_R2 1082 depends on SYS_HAS_CPU_MIPS64_R2
1078 select CPU_HAS_LLSC 1083 select CPU_HAS_LLSC
1079 select CPU_HAS_PREFETCH 1084 select CPU_HAS_PREFETCH
1080 select CPU_SUPPORTS_32BIT_KERNEL 1085 select CPU_SUPPORTS_32BIT_KERNEL
1081 select CPU_SUPPORTS_64BIT_KERNEL 1086 select CPU_SUPPORTS_64BIT_KERNEL
1082 select CPU_SUPPORTS_HIGHMEM 1087 select CPU_SUPPORTS_HIGHMEM
1083 help 1088 help
1084 Choose this option to build a kernel for release 2 or later of the 1089 Choose this option to build a kernel for release 2 or later of the
1085 MIPS64 architecture. Many modern embedded systems with a 64-bit 1090 MIPS64 architecture. Many modern embedded systems with a 64-bit
1086 MIPS processor are based on a MIPS64 processor. If you know the 1091 MIPS processor are based on a MIPS64 processor. If you know the
1087 specific type of processor in your system, choose that one; 1092 specific type of processor in your system, choose that one;
1088 otherwise CPU_MIPS64_R1 is a safe bet for any MIPS64 system. 1093 otherwise CPU_MIPS64_R1 is a safe bet for any MIPS64 system.
1089 1094
1090 config CPU_R3000 1095 config CPU_R3000
1091 bool "R3000" 1096 bool "R3000"
1092 depends on SYS_HAS_CPU_R3000 1097 depends on SYS_HAS_CPU_R3000
1093 select CPU_HAS_WB 1098 select CPU_HAS_WB
1094 select CPU_SUPPORTS_32BIT_KERNEL 1099 select CPU_SUPPORTS_32BIT_KERNEL
1095 select CPU_SUPPORTS_HIGHMEM 1100 select CPU_SUPPORTS_HIGHMEM
1096 help 1101 help
1097 Please make sure to pick the right CPU type. Linux/MIPS is not 1102 Please make sure to pick the right CPU type. Linux/MIPS is not
1098 designed to be generic, i.e. kernels compiled for R3000 CPUs will 1103 designed to be generic, i.e. kernels compiled for R3000 CPUs will
1099 *not* work on R4000 machines and vice versa. However, since most 1104 *not* work on R4000 machines and vice versa. However, since most
1100 of the supported machines have an R4000 (or similar) CPU, R4x00 1105 of the supported machines have an R4000 (or similar) CPU, R4x00
1101 might be a safe bet. If the resulting kernel does not work, 1106 might be a safe bet. If the resulting kernel does not work,
1102 try to recompile with R3000. 1107 try to recompile with R3000.
1103 1108
1104 config CPU_TX39XX 1109 config CPU_TX39XX
1105 bool "R39XX" 1110 bool "R39XX"
1106 depends on SYS_HAS_CPU_TX39XX 1111 depends on SYS_HAS_CPU_TX39XX
1107 select CPU_SUPPORTS_32BIT_KERNEL 1112 select CPU_SUPPORTS_32BIT_KERNEL
1108 1113
1109 config CPU_VR41XX 1114 config CPU_VR41XX
1110 bool "R41xx" 1115 bool "R41xx"
1111 depends on SYS_HAS_CPU_VR41XX 1116 depends on SYS_HAS_CPU_VR41XX
1112 select CPU_SUPPORTS_32BIT_KERNEL 1117 select CPU_SUPPORTS_32BIT_KERNEL
1113 select CPU_SUPPORTS_64BIT_KERNEL 1118 select CPU_SUPPORTS_64BIT_KERNEL
1114 help 1119 help
1115 This option selects support for the NEC VR4100 series of processors. 1120 This option selects support for the NEC VR4100 series of processors.
1116 Only choose this option if you have one of these processors, as a 1121 Only choose this option if you have one of these processors, as a
1117 kernel built with this option will not run on any other type of 1122 kernel built with this option will not run on any other type of
1118 processor, or vice versa. 1123 processor, or vice versa.
1119 1124
1120 config CPU_R4300 1125 config CPU_R4300
1121 bool "R4300" 1126 bool "R4300"
1122 depends on SYS_HAS_CPU_R4300 1127 depends on SYS_HAS_CPU_R4300
1123 select CPU_HAS_LLSC 1128 select CPU_HAS_LLSC
1124 select CPU_SUPPORTS_32BIT_KERNEL 1129 select CPU_SUPPORTS_32BIT_KERNEL
1125 select CPU_SUPPORTS_64BIT_KERNEL 1130 select CPU_SUPPORTS_64BIT_KERNEL
1126 help 1131 help
1127 MIPS Technologies R4300-series processors. 1132 MIPS Technologies R4300-series processors.
1128 1133
1129 config CPU_R4X00 1134 config CPU_R4X00
1130 bool "R4x00" 1135 bool "R4x00"
1131 depends on SYS_HAS_CPU_R4X00 1136 depends on SYS_HAS_CPU_R4X00
1132 select CPU_HAS_LLSC 1137 select CPU_HAS_LLSC
1133 select CPU_SUPPORTS_32BIT_KERNEL 1138 select CPU_SUPPORTS_32BIT_KERNEL
1134 select CPU_SUPPORTS_64BIT_KERNEL 1139 select CPU_SUPPORTS_64BIT_KERNEL
1135 help 1140 help
1136 MIPS Technologies R4000-series processors other than the R4300, including 1141 MIPS Technologies R4000-series processors other than the R4300, including
1137 the R4000, R4400, R4600 and R4700. 1142 the R4000, R4400, R4600 and R4700.
1138 1143
1139 config CPU_TX49XX 1144 config CPU_TX49XX
1140 bool "R49XX" 1145 bool "R49XX"
1141 depends on SYS_HAS_CPU_TX49XX 1146 depends on SYS_HAS_CPU_TX49XX
1142 select CPU_HAS_LLSC 1147 select CPU_HAS_LLSC
1143 select CPU_HAS_PREFETCH 1148 select CPU_HAS_PREFETCH
1144 select CPU_SUPPORTS_32BIT_KERNEL 1149 select CPU_SUPPORTS_32BIT_KERNEL
1145 select CPU_SUPPORTS_64BIT_KERNEL 1150 select CPU_SUPPORTS_64BIT_KERNEL
1146 1151
1147 config CPU_R5000 1152 config CPU_R5000
1148 bool "R5000" 1153 bool "R5000"
1149 depends on SYS_HAS_CPU_R5000 1154 depends on SYS_HAS_CPU_R5000
1150 select CPU_HAS_LLSC 1155 select CPU_HAS_LLSC
1151 select CPU_SUPPORTS_32BIT_KERNEL 1156 select CPU_SUPPORTS_32BIT_KERNEL
1152 select CPU_SUPPORTS_64BIT_KERNEL 1157 select CPU_SUPPORTS_64BIT_KERNEL
1153 help 1158 help
1154 MIPS Technologies R5000-series processors other than the Nevada. 1159 MIPS Technologies R5000-series processors other than the Nevada.
1155 1160
1156 config CPU_R5432 1161 config CPU_R5432
1157 bool "R5432" 1162 bool "R5432"
1158 depends on SYS_HAS_CPU_R5432 1163 depends on SYS_HAS_CPU_R5432
1159 select CPU_HAS_LLSC 1164 select CPU_HAS_LLSC
1160 select CPU_SUPPORTS_32BIT_KERNEL 1165 select CPU_SUPPORTS_32BIT_KERNEL
1161 select CPU_SUPPORTS_64BIT_KERNEL 1166 select CPU_SUPPORTS_64BIT_KERNEL
1162 1167
1163 config CPU_R6000 1168 config CPU_R6000
1164 bool "R6000" 1169 bool "R6000"
1165 depends on EXPERIMENTAL 1170 depends on EXPERIMENTAL
1166 select CPU_HAS_LLSC 1171 select CPU_HAS_LLSC
1167 depends on SYS_HAS_CPU_R6000 1172 depends on SYS_HAS_CPU_R6000
1168 select CPU_SUPPORTS_32BIT_KERNEL 1173 select CPU_SUPPORTS_32BIT_KERNEL
1169 help 1174 help
1170 MIPS Technologies R6000 and R6000A series processors. Note these 1175 MIPS Technologies R6000 and R6000A series processors. Note these
1171 processors are extremely rare and the support for them is incomplete. 1176 processors are extremely rare and the support for them is incomplete.
1172 1177
1173 config CPU_NEVADA 1178 config CPU_NEVADA
1174 bool "RM52xx" 1179 bool "RM52xx"
1175 depends on SYS_HAS_CPU_NEVADA 1180 depends on SYS_HAS_CPU_NEVADA
1176 select CPU_HAS_LLSC 1181 select CPU_HAS_LLSC
1177 select CPU_SUPPORTS_32BIT_KERNEL 1182 select CPU_SUPPORTS_32BIT_KERNEL
1178 select CPU_SUPPORTS_64BIT_KERNEL 1183 select CPU_SUPPORTS_64BIT_KERNEL
1179 help 1184 help
1180 QED / PMC-Sierra RM52xx-series ("Nevada") processors. 1185 QED / PMC-Sierra RM52xx-series ("Nevada") processors.
1181 1186
1182 config CPU_R8000 1187 config CPU_R8000
1183 bool "R8000" 1188 bool "R8000"
1184 depends on EXPERIMENTAL 1189 depends on EXPERIMENTAL
1185 depends on SYS_HAS_CPU_R8000 1190 depends on SYS_HAS_CPU_R8000
1186 select CPU_HAS_LLSC 1191 select CPU_HAS_LLSC
1187 select CPU_HAS_PREFETCH 1192 select CPU_HAS_PREFETCH
1188 select CPU_SUPPORTS_64BIT_KERNEL 1193 select CPU_SUPPORTS_64BIT_KERNEL
1189 help 1194 help
1190 MIPS Technologies R8000 processors. Note these processors are 1195 MIPS Technologies R8000 processors. Note these processors are
1191 uncommon and the support for them is incomplete. 1196 uncommon and the support for them is incomplete.
1192 1197
1193 config CPU_R10000 1198 config CPU_R10000
1194 bool "R10000" 1199 bool "R10000"
1195 depends on SYS_HAS_CPU_R10000 1200 depends on SYS_HAS_CPU_R10000
1196 select CPU_HAS_LLSC 1201 select CPU_HAS_LLSC
1197 select CPU_HAS_PREFETCH 1202 select CPU_HAS_PREFETCH
1198 select CPU_SUPPORTS_32BIT_KERNEL 1203 select CPU_SUPPORTS_32BIT_KERNEL
1199 select CPU_SUPPORTS_64BIT_KERNEL 1204 select CPU_SUPPORTS_64BIT_KERNEL
1200 select CPU_SUPPORTS_HIGHMEM 1205 select CPU_SUPPORTS_HIGHMEM
1201 help 1206 help
1202 MIPS Technologies R10000-series processors. 1207 MIPS Technologies R10000-series processors.
1203 1208
1204 config CPU_RM7000 1209 config CPU_RM7000
1205 bool "RM7000" 1210 bool "RM7000"
1206 depends on SYS_HAS_CPU_RM7000 1211 depends on SYS_HAS_CPU_RM7000
1207 select CPU_HAS_LLSC 1212 select CPU_HAS_LLSC
1208 select CPU_HAS_PREFETCH 1213 select CPU_HAS_PREFETCH
1209 select CPU_SUPPORTS_32BIT_KERNEL 1214 select CPU_SUPPORTS_32BIT_KERNEL
1210 select CPU_SUPPORTS_64BIT_KERNEL 1215 select CPU_SUPPORTS_64BIT_KERNEL
1211 select CPU_SUPPORTS_HIGHMEM 1216 select CPU_SUPPORTS_HIGHMEM
1212 1217
1213 config CPU_RM9000 1218 config CPU_RM9000
1214 bool "RM9000" 1219 bool "RM9000"
1215 depends on SYS_HAS_CPU_RM9000 1220 depends on SYS_HAS_CPU_RM9000
1216 select CPU_HAS_LLSC 1221 select CPU_HAS_LLSC
1217 select CPU_HAS_PREFETCH 1222 select CPU_HAS_PREFETCH
1218 select CPU_SUPPORTS_32BIT_KERNEL 1223 select CPU_SUPPORTS_32BIT_KERNEL
1219 select CPU_SUPPORTS_64BIT_KERNEL 1224 select CPU_SUPPORTS_64BIT_KERNEL
1220 select CPU_SUPPORTS_HIGHMEM 1225 select CPU_SUPPORTS_HIGHMEM
1221 select WEAK_ORDERING 1226 select WEAK_ORDERING
1222 1227
1223 config CPU_SB1 1228 config CPU_SB1
1224 bool "SB1" 1229 bool "SB1"
1225 depends on SYS_HAS_CPU_SB1 1230 depends on SYS_HAS_CPU_SB1
1226 select CPU_HAS_LLSC 1231 select CPU_HAS_LLSC
1227 select CPU_SUPPORTS_32BIT_KERNEL 1232 select CPU_SUPPORTS_32BIT_KERNEL
1228 select CPU_SUPPORTS_64BIT_KERNEL 1233 select CPU_SUPPORTS_64BIT_KERNEL
1229 select CPU_SUPPORTS_HIGHMEM 1234 select CPU_SUPPORTS_HIGHMEM
1230 select WEAK_ORDERING 1235 select WEAK_ORDERING
1231 1236
1232 endchoice 1237 endchoice
1233 1238
1234 config SYS_HAS_CPU_LOONGSON2 1239 config SYS_HAS_CPU_LOONGSON2
1235 bool 1240 bool
1236 1241
1237 config SYS_HAS_CPU_MIPS32_R1 1242 config SYS_HAS_CPU_MIPS32_R1
1238 bool 1243 bool
1239 1244
1240 config SYS_HAS_CPU_MIPS32_R2 1245 config SYS_HAS_CPU_MIPS32_R2
1241 bool 1246 bool
1242 1247
1243 config SYS_HAS_CPU_MIPS64_R1 1248 config SYS_HAS_CPU_MIPS64_R1
1244 bool 1249 bool
1245 1250
1246 config SYS_HAS_CPU_MIPS64_R2 1251 config SYS_HAS_CPU_MIPS64_R2
1247 bool 1252 bool
1248 1253
1249 config SYS_HAS_CPU_R3000 1254 config SYS_HAS_CPU_R3000
1250 bool 1255 bool
1251 1256
1252 config SYS_HAS_CPU_TX39XX 1257 config SYS_HAS_CPU_TX39XX
1253 bool 1258 bool
1254 1259
1255 config SYS_HAS_CPU_VR41XX 1260 config SYS_HAS_CPU_VR41XX
1256 bool 1261 bool
1257 1262
1258 config SYS_HAS_CPU_R4300 1263 config SYS_HAS_CPU_R4300
1259 bool 1264 bool
1260 1265
1261 config SYS_HAS_CPU_R4X00 1266 config SYS_HAS_CPU_R4X00
1262 bool 1267 bool
1263 1268
1264 config SYS_HAS_CPU_TX49XX 1269 config SYS_HAS_CPU_TX49XX
1265 bool 1270 bool
1266 1271
1267 config SYS_HAS_CPU_R5000 1272 config SYS_HAS_CPU_R5000
1268 bool 1273 bool
1269 1274
1270 config SYS_HAS_CPU_R5432 1275 config SYS_HAS_CPU_R5432
1271 bool 1276 bool
1272 1277
1273 config SYS_HAS_CPU_R6000 1278 config SYS_HAS_CPU_R6000
1274 bool 1279 bool
1275 1280
1276 config SYS_HAS_CPU_NEVADA 1281 config SYS_HAS_CPU_NEVADA
1277 bool 1282 bool
1278 1283
1279 config SYS_HAS_CPU_R8000 1284 config SYS_HAS_CPU_R8000
1280 bool 1285 bool
1281 1286
1282 config SYS_HAS_CPU_R10000 1287 config SYS_HAS_CPU_R10000
1283 bool 1288 bool
1284 1289
1285 config SYS_HAS_CPU_RM7000 1290 config SYS_HAS_CPU_RM7000
1286 bool 1291 bool
1287 1292
1288 config SYS_HAS_CPU_RM9000 1293 config SYS_HAS_CPU_RM9000
1289 bool 1294 bool
1290 1295
1291 config SYS_HAS_CPU_SB1 1296 config SYS_HAS_CPU_SB1
1292 bool 1297 bool
1293 1298
1294 # 1299 #
1295 # CPU may reorder R->R, R->W, W->R, W->W 1300 # CPU may reorder R->R, R->W, W->R, W->W
1296 # Reordering beyond LL and SC is handled in WEAK_REORDERING_BEYOND_LLSC 1301 # Reordering beyond LL and SC is handled in WEAK_REORDERING_BEYOND_LLSC
1297 # 1302 #
1298 config WEAK_ORDERING 1303 config WEAK_ORDERING
1299 bool 1304 bool
1300 1305
1301 # 1306 #
1302 # CPU may reorder reads and writes beyond LL/SC 1307 # CPU may reorder reads and writes beyond LL/SC
1303 # CPU may reorder R->LL, R->LL, W->LL, W->LL, R->SC, R->SC, W->SC, W->SC 1308 # CPU may reorder R->LL, R->LL, W->LL, W->LL, R->SC, R->SC, W->SC, W->SC
1304 # 1309 #
1305 config WEAK_REORDERING_BEYOND_LLSC 1310 config WEAK_REORDERING_BEYOND_LLSC
1306 bool 1311 bool
1307 endmenu 1312 endmenu
1308 1313
1309 # 1314 #
1310 # These two indicate any level of the MIPS32 and MIPS64 architecture 1315 # These two indicate any level of the MIPS32 and MIPS64 architecture
1311 # 1316 #
1312 config CPU_MIPS32 1317 config CPU_MIPS32
1313 bool 1318 bool
1314 default y if CPU_MIPS32_R1 || CPU_MIPS32_R2 1319 default y if CPU_MIPS32_R1 || CPU_MIPS32_R2
1315 1320
1316 config CPU_MIPS64 1321 config CPU_MIPS64
1317 bool 1322 bool
1318 default y if CPU_MIPS64_R1 || CPU_MIPS64_R2 1323 default y if CPU_MIPS64_R1 || CPU_MIPS64_R2
1319 1324
1320 # 1325 #
1321 # These two indicate the revision of the architecture, either Release 1 or Release 2 1326 # These two indicate the revision of the architecture, either Release 1 or Release 2
1322 # 1327 #
1323 config CPU_MIPSR1 1328 config CPU_MIPSR1
1324 bool 1329 bool
1325 default y if CPU_MIPS32_R1 || CPU_MIPS64_R1 1330 default y if CPU_MIPS32_R1 || CPU_MIPS64_R1
1326 1331
1327 config CPU_MIPSR2 1332 config CPU_MIPSR2
1328 bool 1333 bool
1329 default y if CPU_MIPS32_R2 || CPU_MIPS64_R2 1334 default y if CPU_MIPS32_R2 || CPU_MIPS64_R2
1330 1335
1331 config SYS_SUPPORTS_32BIT_KERNEL 1336 config SYS_SUPPORTS_32BIT_KERNEL
1332 bool 1337 bool
1333 config SYS_SUPPORTS_64BIT_KERNEL 1338 config SYS_SUPPORTS_64BIT_KERNEL
1334 bool 1339 bool
1335 config CPU_SUPPORTS_32BIT_KERNEL 1340 config CPU_SUPPORTS_32BIT_KERNEL
1336 bool 1341 bool
1337 config CPU_SUPPORTS_64BIT_KERNEL 1342 config CPU_SUPPORTS_64BIT_KERNEL
1338 bool 1343 bool
1339 1344
1340 menu "Kernel type" 1345 menu "Kernel type"
1341 1346
1342 choice 1347 choice
1343 1348
1344 prompt "Kernel code model" 1349 prompt "Kernel code model"
1345 help 1350 help
1346 You should only select this option if you have a workload that 1351 You should only select this option if you have a workload that
1347 actually benefits from 64-bit processing or if your machine has 1352 actually benefits from 64-bit processing or if your machine has
1348 large memory. You will only be presented with a single option in this 1353 large memory. You will only be presented with a single option in this
1349 menu if your system does not support both 32-bit and 64-bit kernels. 1354 menu if your system does not support both 32-bit and 64-bit kernels.
1350 1355
1351 config 32BIT 1356 config 32BIT
1352 bool "32-bit kernel" 1357 bool "32-bit kernel"
1353 depends on CPU_SUPPORTS_32BIT_KERNEL && SYS_SUPPORTS_32BIT_KERNEL 1358 depends on CPU_SUPPORTS_32BIT_KERNEL && SYS_SUPPORTS_32BIT_KERNEL
1354 select TRAD_SIGNALS 1359 select TRAD_SIGNALS
1355 help 1360 help
1356 Select this option if you want to build a 32-bit kernel. 1361 Select this option if you want to build a 32-bit kernel.
1357 config 64BIT 1362 config 64BIT
1358 bool "64-bit kernel" 1363 bool "64-bit kernel"
1359 depends on CPU_SUPPORTS_64BIT_KERNEL && SYS_SUPPORTS_64BIT_KERNEL 1364 depends on CPU_SUPPORTS_64BIT_KERNEL && SYS_SUPPORTS_64BIT_KERNEL
1360 help 1365 help
1361 Select this option if you want to build a 64-bit kernel. 1366 Select this option if you want to build a 64-bit kernel.
1362 1367
1363 endchoice 1368 endchoice
1364 1369
1365 choice 1370 choice
1366 prompt "Kernel page size" 1371 prompt "Kernel page size"
1367 default PAGE_SIZE_4KB 1372 default PAGE_SIZE_4KB
1368 1373
1369 config PAGE_SIZE_4KB 1374 config PAGE_SIZE_4KB
1370 bool "4kB" 1375 bool "4kB"
1371 help 1376 help
1372 This option selects the standard 4kB Linux page size. On some 1377 This option selects the standard 4kB Linux page size. On some
1373 R3000-family processors this is the only available page size. Using 1378 R3000-family processors this is the only available page size. Using
1374 a 4kB page size will minimize memory consumption and is therefore 1379 a 4kB page size will minimize memory consumption and is therefore
1375 recommended for low memory systems. 1380 recommended for low memory systems.
1376 1381
1377 config PAGE_SIZE_8KB 1382 config PAGE_SIZE_8KB
1378 bool "8kB" 1383 bool "8kB"
1379 depends on EXPERIMENTAL && CPU_R8000 1384 depends on EXPERIMENTAL && CPU_R8000
1380 help 1385 help
1381 Using an 8kB page size will result in a higher performance kernel at 1386 Using an 8kB page size will result in a higher performance kernel at
1382 the price of higher memory consumption. This option is available 1387 the price of higher memory consumption. This option is available
1383 only on the R8000 processor. Note that at the time of this writing 1388 only on the R8000 processor. Note that at the time of this writing
1384 this option is still highly experimental; there are also issues with 1389 this option is still highly experimental; there are also issues with
1385 compatibility of user applications. 1390 compatibility of user applications.
1386 1391
1387 config PAGE_SIZE_16KB 1392 config PAGE_SIZE_16KB
1388 bool "16kB" 1393 bool "16kB"
1389 depends on !CPU_R3000 && !CPU_TX39XX 1394 depends on !CPU_R3000 && !CPU_TX39XX
1390 help 1395 help
1391 Using a 16kB page size will result in a higher performance kernel at 1396 Using a 16kB page size will result in a higher performance kernel at
1392 the price of higher memory consumption. This option is available on 1397 the price of higher memory consumption. This option is available on
1393 all non-R3000 family processors. Note that you will need a suitable 1398 all non-R3000 family processors. Note that you will need a suitable
1394 Linux distribution to support this. 1399 Linux distribution to support this.
1395 1400
1396 config PAGE_SIZE_64KB 1401 config PAGE_SIZE_64KB
1397 bool "64kB" 1402 bool "64kB"
1398 depends on EXPERIMENTAL && !CPU_R3000 && !CPU_TX39XX 1403 depends on EXPERIMENTAL && !CPU_R3000 && !CPU_TX39XX
1399 help 1404 help
1400 Using a 64kB page size will result in a higher performance kernel at 1405 Using a 64kB page size will result in a higher performance kernel at
1401 the price of higher memory consumption. This option is available on 1406 the price of higher memory consumption. This option is available on
1402 all non-R3000 family processors. Note that at the time of this 1407 all non-R3000 family processors. Note that at the time of this
1403 writing this option is still highly experimental. 1408 writing this option is still highly experimental.
1404 1409
1405 endchoice 1410 endchoice
1406 1411
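Each page size option corresponds to a power-of-two page shift; a small sketch of the arithmetic, assuming the usual PAGE_SHIFT/PAGE_SIZE macro convention (illustrative, not part of this change):

    /* Illustrative: 4kB -> shift 12, 8kB -> 13, 16kB -> 14, 64kB -> 16. */
    #define PAGE_SHIFT  12
    #define PAGE_SIZE   (1UL << PAGE_SHIFT)     /* 4096 bytes */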
1407 config BOARD_SCACHE 1412 config BOARD_SCACHE
1408 bool 1413 bool
1409 1414
1410 config IP22_CPU_SCACHE 1415 config IP22_CPU_SCACHE
1411 bool 1416 bool
1412 select BOARD_SCACHE 1417 select BOARD_SCACHE
1413 1418
1414 # 1419 #
1415 # Support for a MIPS32 / MIPS64 style S-caches 1420 # Support for a MIPS32 / MIPS64 style S-caches
1416 # 1421 #
1417 config MIPS_CPU_SCACHE 1422 config MIPS_CPU_SCACHE
1418 bool 1423 bool
1419 select BOARD_SCACHE 1424 select BOARD_SCACHE
1420 1425
1421 config R5000_CPU_SCACHE 1426 config R5000_CPU_SCACHE
1422 bool 1427 bool
1423 select BOARD_SCACHE 1428 select BOARD_SCACHE
1424 1429
1425 config RM7000_CPU_SCACHE 1430 config RM7000_CPU_SCACHE
1426 bool 1431 bool
1427 select BOARD_SCACHE 1432 select BOARD_SCACHE
1428 1433
1429 config SIBYTE_DMA_PAGEOPS 1434 config SIBYTE_DMA_PAGEOPS
1430 bool "Use DMA to clear/copy pages" 1435 bool "Use DMA to clear/copy pages"
1431 depends on CPU_SB1 1436 depends on CPU_SB1
1432 help 1437 help
1433 Instead of using the CPU to zero and copy pages, use a Data Mover 1438 Instead of using the CPU to zero and copy pages, use a Data Mover
1434 channel. These DMA channels are otherwise unused by the standard 1439 channel. These DMA channels are otherwise unused by the standard
1435 SiByte Linux port. Seems to give a small performance benefit. 1440 SiByte Linux port. Seems to give a small performance benefit.
1436 1441
1437 config CPU_HAS_PREFETCH 1442 config CPU_HAS_PREFETCH
1438 bool 1443 bool
1439 1444
1440 choice 1445 choice
1441 prompt "MIPS MT options" 1446 prompt "MIPS MT options"
1442 1447
1443 config MIPS_MT_DISABLED 1448 config MIPS_MT_DISABLED
1444 bool "Disable multithreading support." 1449 bool "Disable multithreading support."
1445 help 1450 help
1446 Use this option if your workload can't take advantage of 1451 Use this option if your workload can't take advantage of
1447 MIPS hardware multithreading support. On systems that don't have 1452 MIPS hardware multithreading support. On systems that don't have
1448 the option of an MT-enabled processor this option will be the only 1453 the option of an MT-enabled processor this option will be the only
1449 option in this menu. 1454 option in this menu.
1450 1455
1451 config MIPS_MT_SMP 1456 config MIPS_MT_SMP
1452 bool "Use 1 TC on each available VPE for SMP" 1457 bool "Use 1 TC on each available VPE for SMP"
1453 depends on SYS_SUPPORTS_MULTITHREADING 1458 depends on SYS_SUPPORTS_MULTITHREADING
1454 select CPU_MIPSR2_IRQ_VI 1459 select CPU_MIPSR2_IRQ_VI
1455 select CPU_MIPSR2_IRQ_EI 1460 select CPU_MIPSR2_IRQ_EI
1456 select MIPS_MT 1461 select MIPS_MT
1457 select NR_CPUS_DEFAULT_2 1462 select NR_CPUS_DEFAULT_2
1458 select SMP 1463 select SMP
1459 select SYS_SUPPORTS_SCHED_SMT if SMP 1464 select SYS_SUPPORTS_SCHED_SMT if SMP
1460 select SYS_SUPPORTS_SMP 1465 select SYS_SUPPORTS_SMP
1461 select SMP_UP 1466 select SMP_UP
1462 help 1467 help
1463 This is a kernel model which is also known as VSMP, or lately 1468 This is a kernel model which is also known as VSMP, or lately
1464 has been marketed as SMVP. 1469 has been marketed as SMVP.
1465 1470
1466 config MIPS_MT_SMTC 1471 config MIPS_MT_SMTC
1467 bool "SMTC: Use all TCs on all VPEs for SMP" 1472 bool "SMTC: Use all TCs on all VPEs for SMP"
1468 depends on CPU_MIPS32_R2 1473 depends on CPU_MIPS32_R2
1469 #depends on CPU_MIPS64_R2 # once there is hardware ... 1474 #depends on CPU_MIPS64_R2 # once there is hardware ...
1470 depends on SYS_SUPPORTS_MULTITHREADING 1475 depends on SYS_SUPPORTS_MULTITHREADING
1471 select GENERIC_CLOCKEVENTS_BROADCAST 1476 select GENERIC_CLOCKEVENTS_BROADCAST
1472 select CPU_MIPSR2_IRQ_VI 1477 select CPU_MIPSR2_IRQ_VI
1473 select CPU_MIPSR2_IRQ_EI 1478 select CPU_MIPSR2_IRQ_EI
1474 select MIPS_MT 1479 select MIPS_MT
1475 select NR_CPUS_DEFAULT_8 1480 select NR_CPUS_DEFAULT_8
1476 select SMP 1481 select SMP
1477 select SYS_SUPPORTS_SMP 1482 select SYS_SUPPORTS_SMP
1478 select SMP_UP 1483 select SMP_UP
1479 help 1484 help
1480 This is a kernel model which is known as SMTC, or lately has been 1485 This is a kernel model which is known as SMTC, or lately has been
1481 marketed as SMVP. 1486 marketed as SMVP.
1482 1487
1483 endchoice 1488 endchoice
1484 1489
1485 config MIPS_MT 1490 config MIPS_MT
1486 bool 1491 bool
1487 1492
1488 config SCHED_SMT 1493 config SCHED_SMT
1489 bool "SMT (multithreading) scheduler support" 1494 bool "SMT (multithreading) scheduler support"
1490 depends on SYS_SUPPORTS_SCHED_SMT 1495 depends on SYS_SUPPORTS_SCHED_SMT
1491 default n 1496 default n
1492 help 1497 help
1493 SMT scheduler support improves the CPU scheduler's decision making 1498 SMT scheduler support improves the CPU scheduler's decision making
1494 when dealing with MIPS MT enabled cores at a cost of slightly 1499 when dealing with MIPS MT enabled cores at a cost of slightly
1495 increased overhead in some places. If unsure say N here. 1500 increased overhead in some places. If unsure say N here.
1496 1501
1497 config SYS_SUPPORTS_SCHED_SMT 1502 config SYS_SUPPORTS_SCHED_SMT
1498 bool 1503 bool
1499 1504
1500 1505
1501 config SYS_SUPPORTS_MULTITHREADING 1506 config SYS_SUPPORTS_MULTITHREADING
1502 bool 1507 bool
1503 1508
1504 config MIPS_MT_FPAFF 1509 config MIPS_MT_FPAFF
1505 bool "Dynamic FPU affinity for FP-intensive threads" 1510 bool "Dynamic FPU affinity for FP-intensive threads"
1506 default y 1511 default y
1507 depends on MIPS_MT_SMP || MIPS_MT_SMTC 1512 depends on MIPS_MT_SMP || MIPS_MT_SMTC
1508 1513
1509 config MIPS_VPE_LOADER 1514 config MIPS_VPE_LOADER
1510 bool "VPE loader support." 1515 bool "VPE loader support."
1511 depends on SYS_SUPPORTS_MULTITHREADING 1516 depends on SYS_SUPPORTS_MULTITHREADING
1512 select CPU_MIPSR2_IRQ_VI 1517 select CPU_MIPSR2_IRQ_VI
1513 select CPU_MIPSR2_IRQ_EI 1518 select CPU_MIPSR2_IRQ_EI
1514 select MIPS_MT 1519 select MIPS_MT
1515 help 1520 help
1516 Includes a loader for loading an ELF relocatable object 1521 Includes a loader for loading an ELF relocatable object
1517 onto another VPE and running it. 1522 onto another VPE and running it.
1518 1523
1519 config MIPS_MT_SMTC_INSTANT_REPLAY 1524 config MIPS_MT_SMTC_INSTANT_REPLAY
1520 bool "Low-latency Dispatch of Deferred SMTC IPIs" 1525 bool "Low-latency Dispatch of Deferred SMTC IPIs"
1521 depends on MIPS_MT_SMTC && !PREEMPT 1526 depends on MIPS_MT_SMTC && !PREEMPT
1522 default y 1527 default y
1523 help 1528 help
1524 SMTC pseudo-interrupts between TCs are deferred and queued 1529 SMTC pseudo-interrupts between TCs are deferred and queued
1525 if the target TC is interrupt-inhibited (IXMT). In the first 1530 if the target TC is interrupt-inhibited (IXMT). In the first
1526 SMTC prototypes, these queued IPIs were serviced on return 1531 SMTC prototypes, these queued IPIs were serviced on return
1527 to user mode, or on entry into the kernel idle loop. The 1532 to user mode, or on entry into the kernel idle loop. The
1528 INSTANT_REPLAY option dispatches them as part of local_irq_restore() 1533 INSTANT_REPLAY option dispatches them as part of local_irq_restore()
1529 processing, which adds runtime overhead (hence the option to turn 1534 processing, which adds runtime overhead (hence the option to turn
1530 it off), but ensures that IPIs are handled promptly even under 1535 it off), but ensures that IPIs are handled promptly even under
1531 heavy I/O interrupt load. 1536 heavy I/O interrupt load.
1532 1537
1533 config MIPS_MT_SMTC_IM_BACKSTOP 1538 config MIPS_MT_SMTC_IM_BACKSTOP
1534 bool "Use per-TC register bits as backstop for inhibited IM bits" 1539 bool "Use per-TC register bits as backstop for inhibited IM bits"
1535 depends on MIPS_MT_SMTC 1540 depends on MIPS_MT_SMTC
1536 default y 1541 default y
1537 help 1542 help
1538 To support multiple TC microthreads acting as "CPUs" within 1543 To support multiple TC microthreads acting as "CPUs" within
1539 a VPE, VPE-wide interrupt mask bits must be specially manipulated 1544 a VPE, VPE-wide interrupt mask bits must be specially manipulated
1540 during interrupt handling. To support legacy drivers and interrupt 1545 during interrupt handling. To support legacy drivers and interrupt
1541 controller management code, SMTC has a "backstop" to track and 1546 controller management code, SMTC has a "backstop" to track and
1542 if necessary restore the interrupt mask. This has some performance 1547 if necessary restore the interrupt mask. This has some performance
1543 impact on interrupt service overhead. Disable it only if you know 1548 impact on interrupt service overhead. Disable it only if you know
1544 what you are doing. 1549 what you are doing.
1545 1550
1546 config MIPS_MT_SMTC_IRQAFF 1551 config MIPS_MT_SMTC_IRQAFF
1547 bool "Support IRQ affinity API" 1552 bool "Support IRQ affinity API"
1548 depends on MIPS_MT_SMTC 1553 depends on MIPS_MT_SMTC
1549 default n 1554 default n
1550 help 1555 help
1551 Enables SMP IRQ affinity API (/proc/irq/*/smp_affinity, etc.) 1556 Enables SMP IRQ affinity API (/proc/irq/*/smp_affinity, etc.)
1552 for SMTC Linux kernel. Requires platform support, of which 1557 for SMTC Linux kernel. Requires platform support, of which
1553 an example can be found in the MIPS kernel i8259 and Malta 1558 an example can be found in the MIPS kernel i8259 and Malta
1554 platform code. It is recommended that MIPS_MT_SMTC_INSTANT_REPLAY 1559 platform code. It is recommended that MIPS_MT_SMTC_INSTANT_REPLAY
1555 be enabled if MIPS_MT_SMTC_IRQAFF is used. This adds overhead to 1560 be enabled if MIPS_MT_SMTC_IRQAFF is used. This adds overhead to
1556 interrupt dispatch, and should be used only if you know what 1561 interrupt dispatch, and should be used only if you know what
1557 you are doing. 1562 you are doing.
1558 1563
1559 config MIPS_VPE_LOADER_TOM 1564 config MIPS_VPE_LOADER_TOM
1560 bool "Load VPE program into memory hidden from linux" 1565 bool "Load VPE program into memory hidden from linux"
1561 depends on MIPS_VPE_LOADER 1566 depends on MIPS_VPE_LOADER
1562 default y 1567 default y
1563 help 1568 help
1564 The loader can use memory that is present but has been hidden from 1569 The loader can use memory that is present but has been hidden from
1565 Linux using the kernel command line option "mem=xxMB". It's up to 1570 Linux using the kernel command line option "mem=xxMB". It's up to
1566 you to ensure the amount you put in the option and the space your 1571 you to ensure the amount you put in the option and the space your
1567 program requires is less than or equal to the amount physically present. 1572 program requires is less than or equal to the amount physically present.
1568 1573
1569 # this should possibly be in drivers/char, but it is rather cpu related. Hmmm 1574 # this should possibly be in drivers/char, but it is rather cpu related. Hmmm
1570 config MIPS_VPE_APSP_API 1575 config MIPS_VPE_APSP_API
1571 bool "Enable support for AP/SP API (RTLX)" 1576 bool "Enable support for AP/SP API (RTLX)"
1572 depends on MIPS_VPE_LOADER 1577 depends on MIPS_VPE_LOADER
1573 help 1578 help
1574 1579
1575 config MIPS_APSP_KSPD 1580 config MIPS_APSP_KSPD
1576 bool "Enable KSPD" 1581 bool "Enable KSPD"
1577 depends on MIPS_VPE_APSP_API 1582 depends on MIPS_VPE_APSP_API
1578 default y 1583 default y
1579 help 1584 help
1580 KSPD is a kernel daemon that accepts syscall requests from the SP 1585 KSPD is a kernel daemon that accepts syscall requests from the SP
1581 side, actions them and returns the results. It also handles the 1586 side, actions them and returns the results. It also handles the
1582 "exit" syscall, notifying other kernel modules that the SP program is 1587 "exit" syscall, notifying other kernel modules that the SP program is
1583 exiting. You probably want to say yes here. 1588 exiting. You probably want to say yes here.
1584 1589
1585 config SB1_PASS_1_WORKAROUNDS 1590 config SB1_PASS_1_WORKAROUNDS
1586 bool 1591 bool
1587 depends on CPU_SB1_PASS_1 1592 depends on CPU_SB1_PASS_1
1588 default y 1593 default y
1589 1594
1590 config SB1_PASS_2_WORKAROUNDS 1595 config SB1_PASS_2_WORKAROUNDS
1591 bool 1596 bool
1592 depends on CPU_SB1 && (CPU_SB1_PASS_2_2 || CPU_SB1_PASS_2) 1597 depends on CPU_SB1 && (CPU_SB1_PASS_2_2 || CPU_SB1_PASS_2)
1593 default y 1598 default y
1594 1599
1595 config SB1_PASS_2_1_WORKAROUNDS 1600 config SB1_PASS_2_1_WORKAROUNDS
1596 bool 1601 bool
1597 depends on CPU_SB1 && CPU_SB1_PASS_2 1602 depends on CPU_SB1 && CPU_SB1_PASS_2
1598 default y 1603 default y
1599 1604
1600 config 64BIT_PHYS_ADDR 1605 config 64BIT_PHYS_ADDR
1601 bool 1606 bool
1602 1607
1603 config CPU_HAS_LLSC 1608 config CPU_HAS_LLSC
1604 bool 1609 bool
1605 1610
1606 config CPU_HAS_SMARTMIPS 1611 config CPU_HAS_SMARTMIPS
1607 depends on SYS_SUPPORTS_SMARTMIPS 1612 depends on SYS_SUPPORTS_SMARTMIPS
1608 bool "Support for the SmartMIPS ASE" 1613 bool "Support for the SmartMIPS ASE"
1609 help 1614 help
1610 SmartMIPS is an extension of the MIPS32 architecture aimed at 1615 SmartMIPS is an extension of the MIPS32 architecture aimed at
1611 increased security at both the hardware and software level for 1616 increased security at both the hardware and software level for
1612 smartcards. Enabling this option will allow proper use of the 1617 smartcards. Enabling this option will allow proper use of the
1613 SmartMIPS instructions by Linux applications. However, a kernel with 1618 SmartMIPS instructions by Linux applications. However, a kernel with
1614 this option will not work on a MIPS core without the SmartMIPS ASE. If 1619 this option will not work on a MIPS core without the SmartMIPS ASE. If
1615 you don't know, you probably don't have SmartMIPS and should say N 1620 you don't know, you probably don't have SmartMIPS and should say N
1616 here. 1621 here.
1617 1622
1618 config CPU_HAS_WB 1623 config CPU_HAS_WB
1619 bool 1624 bool
1620 1625
1621 # 1626 #
1622 # Vectored interrupt mode is an R2 feature 1627 # Vectored interrupt mode is an R2 feature
1623 # 1628 #
1624 config CPU_MIPSR2_IRQ_VI 1629 config CPU_MIPSR2_IRQ_VI
1625 bool 1630 bool
1626 1631
1627 # 1632 #
1628 # Extended interrupt mode is an R2 feature 1633 # Extended interrupt mode is an R2 feature
1629 # 1634 #
1630 config CPU_MIPSR2_IRQ_EI 1635 config CPU_MIPSR2_IRQ_EI
1631 bool 1636 bool
1632 1637
1633 config CPU_HAS_SYNC 1638 config CPU_HAS_SYNC
1634 bool 1639 bool
1635 depends on !CPU_R3000 1640 depends on !CPU_R3000
1636 default y 1641 default y
1637 1642
1638 config GENERIC_CLOCKEVENTS_BROADCAST 1643 config GENERIC_CLOCKEVENTS_BROADCAST
1639 bool 1644 bool
1640 1645
1641 # 1646 #
1642 # CPU non-features 1647 # CPU non-features
1643 # 1648 #
1644 config CPU_DADDI_WORKAROUNDS 1649 config CPU_DADDI_WORKAROUNDS
1645 bool 1650 bool
1646 1651
1647 config CPU_R4000_WORKAROUNDS 1652 config CPU_R4000_WORKAROUNDS
1648 bool 1653 bool
1649 select CPU_R4400_WORKAROUNDS 1654 select CPU_R4400_WORKAROUNDS
1650 1655
1651 config CPU_R4400_WORKAROUNDS 1656 config CPU_R4400_WORKAROUNDS
1652 bool 1657 bool
1653 1658
1654 # 1659 #
1655 # Use the generic interrupt handling code in kernel/irq/: 1660 # Use the generic interrupt handling code in kernel/irq/:
1656 # 1661 #
1657 config GENERIC_HARDIRQS 1662 config GENERIC_HARDIRQS
1658 bool 1663 bool
1659 default y 1664 default y
1660 1665
1661 config GENERIC_IRQ_PROBE 1666 config GENERIC_IRQ_PROBE
1662 bool 1667 bool
1663 default y 1668 default y
1664 1669
1665 config IRQ_PER_CPU 1670 config IRQ_PER_CPU
1666 bool 1671 bool
1667 1672
1668 # 1673 #
1669 # - Highmem only makes sense for the 32-bit kernel. 1674 # - Highmem only makes sense for the 32-bit kernel.
1670 # - The current highmem code will only work properly on physically indexed 1675 # - The current highmem code will only work properly on physically indexed
1671 # caches such as R3000, SB1, R7000 or those that look like they're virtually 1676 # caches such as R3000, SB1, R7000 or those that look like they're virtually
1672 # indexed such as R4000/R4400 SC and MC versions or R10000. So for the 1677 # indexed such as R4000/R4400 SC and MC versions or R10000. So for the
1673 # moment we protect the user and offer the highmem option only on machines 1678 # moment we protect the user and offer the highmem option only on machines
1674 # where it's known to be safe. This will not offer highmem on a few systems 1679 # where it's known to be safe. This will not offer highmem on a few systems
1675 # such as MIPS32 and MIPS64 CPUs which may have virtually and physically 1680 # such as MIPS32 and MIPS64 CPUs which may have virtually and physically
1676 # indexed caches, but we're playing safe. 1681 # indexed caches, but we're playing safe.
1677 # - We use SYS_SUPPORTS_HIGHMEM to offer highmem only for systems where we 1682 # - We use SYS_SUPPORTS_HIGHMEM to offer highmem only for systems where we
1678 # know they might have memory configurations that could make use of highmem 1683 # know they might have memory configurations that could make use of highmem
1679 # support. 1684 # support.
1680 # 1685 #
1681 config HIGHMEM 1686 config HIGHMEM
1682 bool "High Memory Support" 1687 bool "High Memory Support"
1683 depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM 1688 depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM
1684 1689
1685 config CPU_SUPPORTS_HIGHMEM 1690 config CPU_SUPPORTS_HIGHMEM
1686 bool 1691 bool
1687 1692
1688 config SYS_SUPPORTS_HIGHMEM 1693 config SYS_SUPPORTS_HIGHMEM
1689 bool 1694 bool
1690 1695
1691 config SYS_SUPPORTS_SMARTMIPS 1696 config SYS_SUPPORTS_SMARTMIPS
1692 bool 1697 bool
1693 1698
1694 config ARCH_FLATMEM_ENABLE 1699 config ARCH_FLATMEM_ENABLE
1695 def_bool y 1700 def_bool y
1696 depends on !NUMA 1701 depends on !NUMA
1697 1702
1698 config ARCH_DISCONTIGMEM_ENABLE 1703 config ARCH_DISCONTIGMEM_ENABLE
1699 bool 1704 bool
1700 default y if SGI_IP27 1705 default y if SGI_IP27
1701 help 1706 help
1702 Say Y to support efficient handling of discontiguous physical memory, 1707 Say Y to support efficient handling of discontiguous physical memory,
1703 for architectures which are either NUMA (Non-Uniform Memory Access) 1708 for architectures which are either NUMA (Non-Uniform Memory Access)
1704 or have huge holes in the physical address space for other reasons. 1709 or have huge holes in the physical address space for other reasons.
1705 See <file:Documentation/vm/numa> for more. 1710 See <file:Documentation/vm/numa> for more.
1706 1711
1707 config ARCH_POPULATES_NODE_MAP 1712 config ARCH_POPULATES_NODE_MAP
1708 def_bool y 1713 def_bool y
1709 1714
1710 config ARCH_SPARSEMEM_ENABLE 1715 config ARCH_SPARSEMEM_ENABLE
1711 bool 1716 bool
1712 select SPARSEMEM_STATIC 1717 select SPARSEMEM_STATIC
1713 1718
1714 config NUMA 1719 config NUMA
1715 bool "NUMA Support" 1720 bool "NUMA Support"
1716 depends on SYS_SUPPORTS_NUMA 1721 depends on SYS_SUPPORTS_NUMA
1717 help 1722 help
1718 Say Y to compile the kernel to support NUMA (Non-Uniform Memory 1723 Say Y to compile the kernel to support NUMA (Non-Uniform Memory
1719 Access). This option improves performance on systems with more 1724 Access). This option improves performance on systems with more
1720 than two nodes; on two node systems it is generally better to 1725 than two nodes; on two node systems it is generally better to
1721 leave it disabled; on single node systems leave this option 1726 leave it disabled; on single node systems leave this option
1722 disabled. 1727 disabled.
1723 1728
1724 config SYS_SUPPORTS_NUMA 1729 config SYS_SUPPORTS_NUMA
1725 bool 1730 bool
1726 1731
1727 config NODES_SHIFT 1732 config NODES_SHIFT
1728 int 1733 int
1729 default "6" 1734 default "6"
1730 depends on NEED_MULTIPLE_NODES 1735 depends on NEED_MULTIPLE_NODES
1731 1736
1732 source "mm/Kconfig" 1737 source "mm/Kconfig"
1733 1738
1734 config SMP 1739 config SMP
1735 bool "Multi-Processing support" 1740 bool "Multi-Processing support"
1736 depends on SYS_SUPPORTS_SMP 1741 depends on SYS_SUPPORTS_SMP
1737 select IRQ_PER_CPU 1742 select IRQ_PER_CPU
1738 help 1743 help
1739 This enables support for systems with more than one CPU. If you have 1744 This enables support for systems with more than one CPU. If you have
1740 a system with only one CPU, like most personal computers, say N. If 1745 a system with only one CPU, like most personal computers, say N. If
1741 you have a system with more than one CPU, say Y. 1746 you have a system with more than one CPU, say Y.
1742 1747
1743 If you say N here, the kernel will run on single and multiprocessor 1748 If you say N here, the kernel will run on single and multiprocessor
1744 machines, but will use only one CPU of a multiprocessor machine. If 1749 machines, but will use only one CPU of a multiprocessor machine. If
1745 you say Y here, the kernel will run on many, but not all, 1750 you say Y here, the kernel will run on many, but not all,
1746 singleprocessor machines. On a singleprocessor machine, the kernel 1751 singleprocessor machines. On a singleprocessor machine, the kernel
1747 will run faster if you say N here. 1752 will run faster if you say N here.
1748 1753
1749 People using multiprocessor machines who say Y here should also say 1754 People using multiprocessor machines who say Y here should also say
1750 Y to "Enhanced Real Time Clock Support", below. 1755 Y to "Enhanced Real Time Clock Support", below.
1751 1756
1752 See also the <file:Documentation/smp.txt> and the SMP-HOWTO 1757 See also the <file:Documentation/smp.txt> and the SMP-HOWTO
1753 available at <http://www.tldp.org/docs.html#howto>. 1758 available at <http://www.tldp.org/docs.html#howto>.
1754 1759
1755 If you don't know what to do here, say N. 1760 If you don't know what to do here, say N.
1756 1761
1757 config SMP_UP 1762 config SMP_UP
1758 bool 1763 bool
1759 1764
1760 config SYS_SUPPORTS_SMP 1765 config SYS_SUPPORTS_SMP
1761 bool 1766 bool
1762 1767
1763 config NR_CPUS_DEFAULT_1 1768 config NR_CPUS_DEFAULT_1
1764 bool 1769 bool
1765 1770
1766 config NR_CPUS_DEFAULT_2 1771 config NR_CPUS_DEFAULT_2
1767 bool 1772 bool
1768 1773
1769 config NR_CPUS_DEFAULT_4 1774 config NR_CPUS_DEFAULT_4
1770 bool 1775 bool
1771 1776
1772 config NR_CPUS_DEFAULT_8 1777 config NR_CPUS_DEFAULT_8
1773 bool 1778 bool
1774 1779
1775 config NR_CPUS_DEFAULT_16 1780 config NR_CPUS_DEFAULT_16
1776 bool 1781 bool
1777 1782
1778 config NR_CPUS_DEFAULT_32 1783 config NR_CPUS_DEFAULT_32
1779 bool 1784 bool
1780 1785
1781 config NR_CPUS_DEFAULT_64 1786 config NR_CPUS_DEFAULT_64
1782 bool 1787 bool
1783 1788
1784 config NR_CPUS 1789 config NR_CPUS
1785 int "Maximum number of CPUs (2-64)" 1790 int "Maximum number of CPUs (2-64)"
1786 range 1 64 if NR_CPUS_DEFAULT_1 1791 range 1 64 if NR_CPUS_DEFAULT_1
1787 depends on SMP 1792 depends on SMP
1788 default "1" if NR_CPUS_DEFAULT_1 1793 default "1" if NR_CPUS_DEFAULT_1
1789 default "2" if NR_CPUS_DEFAULT_2 1794 default "2" if NR_CPUS_DEFAULT_2
1790 default "4" if NR_CPUS_DEFAULT_4 1795 default "4" if NR_CPUS_DEFAULT_4
1791 default "8" if NR_CPUS_DEFAULT_8 1796 default "8" if NR_CPUS_DEFAULT_8
1792 default "16" if NR_CPUS_DEFAULT_16 1797 default "16" if NR_CPUS_DEFAULT_16
1793 default "32" if NR_CPUS_DEFAULT_32 1798 default "32" if NR_CPUS_DEFAULT_32
1794 default "64" if NR_CPUS_DEFAULT_64 1799 default "64" if NR_CPUS_DEFAULT_64
1795 help 1800 help
1796 This allows you to specify the maximum number of CPUs which this 1801 This allows you to specify the maximum number of CPUs which this
1797 kernel will support. The maximum supported value is 32 for 32-bit 1802 kernel will support. The maximum supported value is 32 for 32-bit
1798 kernels and 64 for 64-bit kernels; the minimum value which makes 1803 kernels and 64 for 64-bit kernels; the minimum value which makes
1799 sense is 1 for Qemu (useful only for kernel debugging purposes) 1804 sense is 1 for Qemu (useful only for kernel debugging purposes)
1800 and 2 for all others. 1805 and 2 for all others.
1801 1806
1802 This is purely to save memory - each supported CPU adds 1807 This is purely to save memory - each supported CPU adds
1803 approximately eight kilobytes to the kernel image. For best 1808 approximately eight kilobytes to the kernel image. For best
1804 performance you should round up your number of processors to the next 1809 performance you should round up your number of processors to the next
1805 power of two. 1810 power of two.
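As a rough worked example of the figure above: with NR_CPUS=64, that is approximately 64 * 8 kB = 512 kB of additional kernel image size.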
1806 1811
1807 source "kernel/time/Kconfig" 1812 source "kernel/time/Kconfig"
1808 1813
1809 # 1814 #
1810 # Timer Interrupt Frequency Configuration 1815 # Timer Interrupt Frequency Configuration
1811 # 1816 #
1812 1817
1813 choice 1818 choice
1814 prompt "Timer frequency" 1819 prompt "Timer frequency"
1815 default HZ_250 1820 default HZ_250
1816 help 1821 help
1817 Allows the configuration of the timer frequency. 1822 Allows the configuration of the timer frequency.
1818 1823
1819 config HZ_48 1824 config HZ_48
1820 bool "48 HZ" if SYS_SUPPORTS_48HZ 1825 bool "48 HZ" if SYS_SUPPORTS_48HZ
1821 1826
1822 config HZ_100 1827 config HZ_100
1823 bool "100 HZ" if SYS_SUPPORTS_100HZ || SYS_SUPPORTS_ARBIT_HZ 1828 bool "100 HZ" if SYS_SUPPORTS_100HZ || SYS_SUPPORTS_ARBIT_HZ
1824 1829
1825 config HZ_128 1830 config HZ_128
1826 bool "128 HZ" if SYS_SUPPORTS_128HZ || SYS_SUPPORTS_ARBIT_HZ 1831 bool "128 HZ" if SYS_SUPPORTS_128HZ || SYS_SUPPORTS_ARBIT_HZ
1827 1832
1828 config HZ_250 1833 config HZ_250
1829 bool "250 HZ" if SYS_SUPPORTS_250HZ || SYS_SUPPORTS_ARBIT_HZ 1834 bool "250 HZ" if SYS_SUPPORTS_250HZ || SYS_SUPPORTS_ARBIT_HZ
1830 1835
1831 config HZ_256 1836 config HZ_256
1832 bool "256 HZ" if SYS_SUPPORTS_256HZ || SYS_SUPPORTS_ARBIT_HZ 1837 bool "256 HZ" if SYS_SUPPORTS_256HZ || SYS_SUPPORTS_ARBIT_HZ
1833 1838
1834 config HZ_1000 1839 config HZ_1000
1835 bool "1000 HZ" if SYS_SUPPORTS_1000HZ || SYS_SUPPORTS_ARBIT_HZ 1840 bool "1000 HZ" if SYS_SUPPORTS_1000HZ || SYS_SUPPORTS_ARBIT_HZ
1836 1841
1837 config HZ_1024 1842 config HZ_1024
1838 bool "1024 HZ" if SYS_SUPPORTS_1024HZ || SYS_SUPPORTS_ARBIT_HZ 1843 bool "1024 HZ" if SYS_SUPPORTS_1024HZ || SYS_SUPPORTS_ARBIT_HZ
1839 1844
1840 endchoice 1845 endchoice
1841 1846
1842 config SYS_SUPPORTS_48HZ 1847 config SYS_SUPPORTS_48HZ
1843 bool 1848 bool
1844 1849
1845 config SYS_SUPPORTS_100HZ 1850 config SYS_SUPPORTS_100HZ
1846 bool 1851 bool
1847 1852
1848 config SYS_SUPPORTS_128HZ 1853 config SYS_SUPPORTS_128HZ
1849 bool 1854 bool
1850 1855
1851 config SYS_SUPPORTS_250HZ 1856 config SYS_SUPPORTS_250HZ
1852 bool 1857 bool
1853 1858
1854 config SYS_SUPPORTS_256HZ 1859 config SYS_SUPPORTS_256HZ
1855 bool 1860 bool
1856 1861
1857 config SYS_SUPPORTS_1000HZ 1862 config SYS_SUPPORTS_1000HZ
1858 bool 1863 bool
1859 1864
1860 config SYS_SUPPORTS_1024HZ 1865 config SYS_SUPPORTS_1024HZ
1861 bool 1866 bool
1862 1867
1863 config SYS_SUPPORTS_ARBIT_HZ 1868 config SYS_SUPPORTS_ARBIT_HZ
1864 bool 1869 bool
1865 default y if !SYS_SUPPORTS_48HZ && !SYS_SUPPORTS_100HZ && \ 1870 default y if !SYS_SUPPORTS_48HZ && !SYS_SUPPORTS_100HZ && \
1866 !SYS_SUPPORTS_128HZ && !SYS_SUPPORTS_250HZ && \ 1871 !SYS_SUPPORTS_128HZ && !SYS_SUPPORTS_250HZ && \
1867 !SYS_SUPPORTS_256HZ && !SYS_SUPPORTS_1000HZ && \ 1872 !SYS_SUPPORTS_256HZ && !SYS_SUPPORTS_1000HZ && \
1868 !SYS_SUPPORTS_1024HZ 1873 !SYS_SUPPORTS_1024HZ
1869 1874
1870 config HZ 1875 config HZ
1871 int 1876 int
1872 default 48 if HZ_48 1877 default 48 if HZ_48
1873 default 100 if HZ_100 1878 default 100 if HZ_100
1874 default 128 if HZ_128 1879 default 128 if HZ_128
1875 default 250 if HZ_250 1880 default 250 if HZ_250
1876 default 256 if HZ_256 1881 default 256 if HZ_256
1877 default 1000 if HZ_1000 1882 default 1000 if HZ_1000
1878 default 1024 if HZ_1024 1883 default 1024 if HZ_1024
1879 1884
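HZ is the number of timer interrupts per second, so one tick lasts 1/HZ seconds; a minimal sketch of the resulting conversion, using a hypothetical ms_to_ticks() helper (illustrative only):

    /* Illustrative: at HZ=250 a tick is 4 ms, at HZ=1000 it is 1 ms. */
    #define HZ 250
    static inline unsigned long ms_to_ticks(unsigned long ms)
    {
            return (ms * HZ) / 1000;    /* e.g. 100 ms -> 25 ticks */
    }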
1880 source "kernel/Kconfig.preempt" 1885 source "kernel/Kconfig.preempt"
1881 1886
1882 config MIPS_INSANE_LARGE 1887 config MIPS_INSANE_LARGE
1883 bool "Support for large 64-bit configurations" 1888 bool "Support for large 64-bit configurations"
1884 depends on CPU_R10000 && 64BIT 1889 depends on CPU_R10000 && 64BIT
1885 help 1890 help
1886 MIPS R10000 does support a 44 bit / 16TB address space as opposed to 1891 MIPS R10000 does support a 44 bit / 16TB address space as opposed to
1887 previous 64-bit processors which only supported 40 bit / 1TB. If you 1892 previous 64-bit processors which only supported 40 bit / 1TB. If you
1888 need processes of more than 1TB virtual address space, say Y here. 1893 need processes of more than 1TB virtual address space, say Y here.
1889 This will result in additional memory usage, so it is not 1894 This will result in additional memory usage, so it is not
1890 recommended for normal users. 1895 recommended for normal users.
1891 1896
1892 config KEXEC 1897 config KEXEC
1893 bool "Kexec system call (EXPERIMENTAL)" 1898 bool "Kexec system call (EXPERIMENTAL)"
1894 depends on EXPERIMENTAL 1899 depends on EXPERIMENTAL
1895 help 1900 help
1896 kexec is a system call that implements the ability to shut down your 1901 kexec is a system call that implements the ability to shut down your
1897 current kernel, and to start another kernel. It is like a reboot 1902 current kernel, and to start another kernel. It is like a reboot
1898 but it is independent of the system firmware. And like a reboot 1903 but it is independent of the system firmware. And like a reboot
1899 you can start any kernel with it, not just Linux. 1904 you can start any kernel with it, not just Linux.
1900 1905
1901 The name comes from the similarity to the exec system call. 1906 The name comes from the similarity to the exec system call.
1902 1907
1903 It is an ongoing process to be certain the hardware in a machine 1908 It is an ongoing process to be certain the hardware in a machine
1904 is properly shut down, so do not be surprised if this code does not 1909 is properly shut down, so do not be surprised if this code does not
1905 initially work for you. It may help to enable device hotplugging 1910 initially work for you. It may help to enable device hotplugging
1906 support. As of this writing the exact hardware interface is 1911 support. As of this writing the exact hardware interface is
1907 strongly in flux, so no good recommendation can be made. 1912 strongly in flux, so no good recommendation can be made.
1908 1913
1909 config SECCOMP 1914 config SECCOMP
1910 bool "Enable seccomp to safely compute untrusted bytecode" 1915 bool "Enable seccomp to safely compute untrusted bytecode"
1911 depends on PROC_FS 1916 depends on PROC_FS
1912 default y 1917 default y
1913 help 1918 help
1914 This kernel feature is useful for number crunching applications 1919 This kernel feature is useful for number crunching applications
1915 that may need to compute untrusted bytecode during their 1920 that may need to compute untrusted bytecode during their
1916 execution. By using pipes or other transports made available to 1921 execution. By using pipes or other transports made available to
1917 the process as file descriptors supporting the read/write 1922 the process as file descriptors supporting the read/write
1918 syscalls, it's possible to isolate those applications in 1923 syscalls, it's possible to isolate those applications in
1919 their own address space using seccomp. Once seccomp is 1924 their own address space using seccomp. Once seccomp is
1920 enabled via /proc/<pid>/seccomp, it cannot be disabled 1925 enabled via /proc/<pid>/seccomp, it cannot be disabled
1921 and the task is only allowed to execute a few safe syscalls 1926 and the task is only allowed to execute a few safe syscalls
1922 defined by each seccomp mode. 1927 defined by each seccomp mode.
1923 1928
1924 If unsure, say Y. Only embedded systems should say N here. 1929 If unsure, say Y. Only embedded systems should say N here.
1925 1930
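A minimal userspace sketch of the /proc interface described above (seccomp mode 1). After the write succeeds, only read(2), write(2), exit(2) and sigreturn(2) remain usable, so the sketch does not even close the file descriptor:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
            int fd = open("/proc/self/seccomp", O_WRONLY);

            if (fd < 0)
                    return 1;       /* CONFIG_SECCOMP=n or /proc not mounted */
            write(fd, "1", 1);      /* strict mode takes effect on success */
            /* From here on only read(2), write(2), exit(2) and sigreturn(2)
             * are allowed; even close(fd) would be fatal, so leave via the
             * raw exit syscall (glibc's _exit() may use exit_group()). */
            syscall(SYS_exit, 0);
            return 0;               /* not reached */
    }
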
1926 endmenu 1931 endmenu
1927 1932
1928 config RWSEM_GENERIC_SPINLOCK 1933 config RWSEM_GENERIC_SPINLOCK
1929 bool 1934 bool
1930 default y 1935 default y
1931 1936
1932 config LOCKDEP_SUPPORT 1937 config LOCKDEP_SUPPORT
1933 bool 1938 bool
1934 default y 1939 default y
1935 1940
1936 config STACKTRACE_SUPPORT 1941 config STACKTRACE_SUPPORT
1937 bool 1942 bool
1938 default y 1943 default y
1939 1944
1940 source "init/Kconfig" 1945 source "init/Kconfig"
1941 1946
1942 menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" 1947 menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)"
1943 1948
1944 config HW_HAS_EISA 1949 config HW_HAS_EISA
1945 bool 1950 bool
1946 config HW_HAS_PCI 1951 config HW_HAS_PCI
1947 bool 1952 bool
1948 1953
1949 config PCI 1954 config PCI
1950 bool "Support for PCI controller" 1955 bool "Support for PCI controller"
1951 depends on HW_HAS_PCI 1956 depends on HW_HAS_PCI
1952 select PCI_DOMAINS 1957 select PCI_DOMAINS
1953 help 1958 help
1954 Find out whether you have a PCI motherboard. PCI is the name of a 1959 Find out whether you have a PCI motherboard. PCI is the name of a
1955 bus system, i.e. the way the CPU talks to the other stuff inside 1960 bus system, i.e. the way the CPU talks to the other stuff inside
1956 your box. Other bus systems are ISA, EISA, or VESA. If you have PCI, 1961 your box. Other bus systems are ISA, EISA, or VESA. If you have PCI,
1957 say Y, otherwise N. 1962 say Y, otherwise N.
1958 1963
1959 The PCI-HOWTO, available from 1964 The PCI-HOWTO, available from
1960 <http://www.tldp.org/docs.html#howto>, contains valuable 1965 <http://www.tldp.org/docs.html#howto>, contains valuable
1961 information about which PCI hardware does work under Linux and which 1966 information about which PCI hardware does work under Linux and which
1962 doesn't. 1967 doesn't.
1963 1968
1964 config PCI_DOMAINS 1969 config PCI_DOMAINS
1965 bool 1970 bool
1966 1971
1967 source "drivers/pci/Kconfig" 1972 source "drivers/pci/Kconfig"
1968 1973
1969 # 1974 #
1970 # ISA support is now enabled via select. Too many systems still have one 1975 # ISA support is now enabled via select. Too many systems still have one
1971 # or another ISA chip on the board that users don't know about, so don't expect 1976 # or another ISA chip on the board that users don't know about, so don't expect
1972 # users to choose the right thing ... 1977 # users to choose the right thing ...
1973 # 1978 #
1974 config ISA 1979 config ISA
1975 bool 1980 bool
1976 1981
1977 config EISA 1982 config EISA
1978 bool "EISA support" 1983 bool "EISA support"
1979 depends on HW_HAS_EISA 1984 depends on HW_HAS_EISA
1980 select ISA 1985 select ISA
1981 select GENERIC_ISA_DMA 1986 select GENERIC_ISA_DMA
1982 ---help--- 1987 ---help---
1983 The Extended Industry Standard Architecture (EISA) bus was 1988 The Extended Industry Standard Architecture (EISA) bus was
1984 developed as an open alternative to the IBM MicroChannel bus. 1989 developed as an open alternative to the IBM MicroChannel bus.
1985 1990
1986 The EISA bus provided some of the features of the IBM MicroChannel 1991 The EISA bus provided some of the features of the IBM MicroChannel
1987 bus while maintaining backward compatibility with cards made for 1992 bus while maintaining backward compatibility with cards made for
1988 the older ISA bus. The EISA bus saw limited use between 1988 and 1993 the older ISA bus. The EISA bus saw limited use between 1988 and
1989 1995 when it was made obsolete by the PCI bus. 1994 1995 when it was made obsolete by the PCI bus.
1990 1995
1991 Say Y here if you are building a kernel for an EISA-based machine. 1996 Say Y here if you are building a kernel for an EISA-based machine.
1992 1997
1993 Otherwise, say N. 1998 Otherwise, say N.
1994 1999
1995 source "drivers/eisa/Kconfig" 2000 source "drivers/eisa/Kconfig"
1996 2001
1997 config TC 2002 config TC
1998 bool "TURBOchannel support" 2003 bool "TURBOchannel support"
1999 depends on MACH_DECSTATION 2004 depends on MACH_DECSTATION
2000 help 2005 help
2001 TurboChannel is a DEC (now Compaq (now HP)) bus for Alpha and MIPS 2006 TurboChannel is a DEC (now Compaq (now HP)) bus for Alpha and MIPS
2002 processors. Documentation on writing device drivers for TurboChannel 2007 processors. Documentation on writing device drivers for TurboChannel
2003 is available at: 2008 is available at:
2004 <http://www.cs.arizona.edu/computer.help/policy/DIGITAL_unix/AA-PS3HD-TET1_html/TITLE.html>. 2009 <http://www.cs.arizona.edu/computer.help/policy/DIGITAL_unix/AA-PS3HD-TET1_html/TITLE.html>.
2005 2010
2006 #config ACCESSBUS 2011 #config ACCESSBUS
2007 # bool "Access.Bus support" 2012 # bool "Access.Bus support"
2008 # depends on TC 2013 # depends on TC
2009 2014
2010 config MMU 2015 config MMU
2011 bool 2016 bool
2012 default y 2017 default y
2013 2018
2014 config I8253 2019 config I8253
2015 bool 2020 bool
2016 2021
2017 config ZONE_DMA32 2022 config ZONE_DMA32
2018 bool 2023 bool
2019 2024
2020 source "drivers/pcmcia/Kconfig" 2025 source "drivers/pcmcia/Kconfig"
2021 2026
2022 source "drivers/pci/hotplug/Kconfig" 2027 source "drivers/pci/hotplug/Kconfig"
2023 2028
2024 endmenu 2029 endmenu
2025 2030
2026 menu "Executable file formats" 2031 menu "Executable file formats"
2027 2032
2028 source "fs/Kconfig.binfmt" 2033 source "fs/Kconfig.binfmt"
2029 2034
2030 config TRAD_SIGNALS 2035 config TRAD_SIGNALS
2031 bool 2036 bool
2032 2037
2033 config BINFMT_IRIX 2038 config BINFMT_IRIX
2034 bool "Include IRIX binary compatibility" 2039 bool "Include IRIX binary compatibility"
2035 depends on CPU_BIG_ENDIAN && 32BIT && BROKEN 2040 depends on CPU_BIG_ENDIAN && 32BIT && BROKEN
2036 2041
2037 config MIPS32_COMPAT 2042 config MIPS32_COMPAT
2038 bool "Kernel support for Linux/MIPS 32-bit binary compatibility" 2043 bool "Kernel support for Linux/MIPS 32-bit binary compatibility"
2039 depends on 64BIT 2044 depends on 64BIT
2040 help 2045 help
2041 Select this option if you want Linux/MIPS 32-bit binary 2046 Select this option if you want Linux/MIPS 32-bit binary
2042 compatibility. Since all software available for Linux/MIPS is 2047 compatibility. Since all software available for Linux/MIPS is
2043 currently 32-bit, you should say Y here. 2048 currently 32-bit, you should say Y here.
2044 2049
2045 config COMPAT 2050 config COMPAT
2046 bool 2051 bool
2047 depends on MIPS32_COMPAT 2052 depends on MIPS32_COMPAT
2048 default y 2053 default y
2049 2054
2050 config SYSVIPC_COMPAT 2055 config SYSVIPC_COMPAT
2051 bool 2056 bool
2052 depends on COMPAT && SYSVIPC 2057 depends on COMPAT && SYSVIPC
2053 default y 2058 default y
2054 2059
2055 config MIPS32_O32 2060 config MIPS32_O32
2056 bool "Kernel support for o32 binaries" 2061 bool "Kernel support for o32 binaries"
2057 depends on MIPS32_COMPAT 2062 depends on MIPS32_COMPAT
2058 help 2063 help
2059 Select this option if you want to run o32 binaries. These are pure 2064 Select this option if you want to run o32 binaries. These are pure
2060 32-bit binaries as used by the 32-bit Linux/MIPS port. Most 2065 32-bit binaries as used by the 32-bit Linux/MIPS port. Most
2061 existing binaries are in this format. 2066 existing binaries are in this format.
2062 2067
2063 If unsure, say Y. 2068 If unsure, say Y.
2064 2069
2065 config MIPS32_N32 2070 config MIPS32_N32
2066 bool "Kernel support for n32 binaries" 2071 bool "Kernel support for n32 binaries"
2067 depends on MIPS32_COMPAT 2072 depends on MIPS32_COMPAT
2068 help 2073 help
2069 Select this option if you want to run n32 binaries. These are 2074 Select this option if you want to run n32 binaries. These are
2070 64-bit binaries using 32-bit quantities for addressing and certain 2075 64-bit binaries using 32-bit quantities for addressing and certain
2071 data that would normally be 64-bit. They are used in special 2076 data that would normally be 64-bit. They are used in special
2072 cases. 2077 cases.
2073 2078
2074 If unsure, say N. 2079 If unsure, say N.
2075 2080
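A trivial illustration of the "pure 32-bit" versus n32 distinction described above: both ABIs use 32-bit C longs and pointers, the difference being that n32 runs on the 64-bit register set and calling convention. Built as o32 or n32 this prints 4/4; built as n64 it prints 8/8:

    #include <stdio.h>

    int main(void)
    {
            /* o32 and n32: 32-bit long and pointer; n64: 64-bit for both. */
            printf("sizeof(long)=%zu sizeof(void *)=%zu\n",
                   sizeof(long), sizeof(void *));
            return 0;
    }
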
2076 config BINFMT_ELF32 2081 config BINFMT_ELF32
2077 bool 2082 bool
2078 default y if MIPS32_O32 || MIPS32_N32 2083 default y if MIPS32_O32 || MIPS32_N32
2079 2084
2080 endmenu 2085 endmenu
2081 2086
2082 menu "Power management options" 2087 menu "Power management options"
2083 2088
2084 source "kernel/power/Kconfig" 2089 source "kernel/power/Kconfig"
2085 2090
2086 endmenu 2091 endmenu
2087 2092
2088 source "net/Kconfig" 2093 source "net/Kconfig"
2089 2094
2090 source "drivers/Kconfig" 2095 source "drivers/Kconfig"
2091 2096
2092 source "fs/Kconfig" 2097 source "fs/Kconfig"
2093 2098
2094 source "kernel/Kconfig.instrumentation" 2099 source "kernel/Kconfig.instrumentation"
2095 2100
2096 source "arch/mips/Kconfig.debug" 2101 source "arch/mips/Kconfig.debug"
2097 2102
2098 source "security/Kconfig" 2103 source "security/Kconfig"
2099 2104
2100 source "crypto/Kconfig" 2105 source "crypto/Kconfig"
2101 2106
2102 source "lib/Kconfig" 2107 source "lib/Kconfig"
2103 2108
arch/parisc/Kconfig
1 # 1 #
2 # For a description of the syntax of this configuration file, 2 # For a description of the syntax of this configuration file,
3 # see Documentation/kbuild/kconfig-language.txt. 3 # see Documentation/kbuild/kconfig-language.txt.
4 # 4 #
5 5
6 mainmenu "Linux/PA-RISC Kernel Configuration" 6 mainmenu "Linux/PA-RISC Kernel Configuration"
7 7
8 config PARISC 8 config PARISC
9 def_bool y 9 def_bool y
10 help 10 help
11 The PA-RISC microprocessor is designed by Hewlett-Packard and used 11 The PA-RISC microprocessor is designed by Hewlett-Packard and used
12 in many of their workstations & servers (HP9000 700 and 800 series, 12 in many of their workstations & servers (HP9000 700 and 800 series,
13 and later HP3000 series). The PA-RISC Linux project home page is 13 and later HP3000 series). The PA-RISC Linux project home page is
14 at <http://www.parisc-linux.org/>. 14 at <http://www.parisc-linux.org/>.
15 15
16 config MMU 16 config MMU
17 def_bool y 17 def_bool y
18 18
19 config STACK_GROWSUP 19 config STACK_GROWSUP
20 def_bool y 20 def_bool y
21 21
22 config GENERIC_LOCKBREAK
23 bool
24 default y
25 depends on SMP && PREEMPT
26
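The GENERIC_LOCKBREAK symbol added here (and in the other architecture Kconfigs in this patch) is enabled only for SMP && PREEMPT builds. A kernel-context sketch of the lock-break pattern involved, where struct my_list (assumed to hold a spinlock_t lock and a list_head head) and my_pop_one() are hypothetical stand-ins while the locking and scheduling calls are the real APIs:

    #include <linux/spinlock.h>
    #include <linux/list.h>
    #include <linux/sched.h>

    /* Drain a list, but drop the lock whenever another CPU is spinning on
     * it or a reschedule is due, instead of hogging it for the whole loop. */
    static void drain_list(struct my_list *l)
    {
            spin_lock(&l->lock);
            while (!list_empty(&l->head)) {
                    my_pop_one(l);                  /* hypothetical helper */
                    if (need_resched() || spin_needbreak(&l->lock))
                            cond_resched_lock(&l->lock);
            }
            spin_unlock(&l->lock);
    }
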
22 config RWSEM_GENERIC_SPINLOCK 27 config RWSEM_GENERIC_SPINLOCK
23 def_bool y 28 def_bool y
24 29
25 config RWSEM_XCHGADD_ALGORITHM 30 config RWSEM_XCHGADD_ALGORITHM
26 bool 31 bool
27 32
28 config ARCH_HAS_ILOG2_U32 33 config ARCH_HAS_ILOG2_U32
29 bool 34 bool
30 default n 35 default n
31 36
32 config ARCH_HAS_ILOG2_U64 37 config ARCH_HAS_ILOG2_U64
33 bool 38 bool
34 default n 39 default n
35 40
36 config GENERIC_FIND_NEXT_BIT 41 config GENERIC_FIND_NEXT_BIT
37 bool 42 bool
38 default y 43 default y
39 44
40 config GENERIC_BUG 45 config GENERIC_BUG
41 bool 46 bool
42 default y 47 default y
43 depends on BUG 48 depends on BUG
44 49
45 config GENERIC_HWEIGHT 50 config GENERIC_HWEIGHT
46 bool 51 bool
47 default y 52 default y
48 53
49 config GENERIC_CALIBRATE_DELAY 54 config GENERIC_CALIBRATE_DELAY
50 bool 55 bool
51 default y 56 default y
52 57
53 config GENERIC_TIME 58 config GENERIC_TIME
54 bool 59 bool
55 default y 60 default y
56 61
57 config TIME_LOW_RES 62 config TIME_LOW_RES
58 bool 63 bool
59 depends on SMP 64 depends on SMP
60 default y 65 default y
61 66
62 config GENERIC_HARDIRQS 67 config GENERIC_HARDIRQS
63 def_bool y 68 def_bool y
64 69
65 config GENERIC_IRQ_PROBE 70 config GENERIC_IRQ_PROBE
66 def_bool y 71 def_bool y
67 72
68 config IRQ_PER_CPU 73 config IRQ_PER_CPU
69 bool 74 bool
70 default y 75 default y
71 76
72 # unless you want to implement ACPI on PA-RISC ... ;-) 77 # unless you want to implement ACPI on PA-RISC ... ;-)
73 config PM 78 config PM
74 bool 79 bool
75 80
76 config ISA_DMA_API 81 config ISA_DMA_API
77 bool 82 bool
78 83
79 config ARCH_MAY_HAVE_PC_FDC 84 config ARCH_MAY_HAVE_PC_FDC
80 bool 85 bool
81 depends on BROKEN 86 depends on BROKEN
82 default y 87 default y
83 88
84 source "init/Kconfig" 89 source "init/Kconfig"
85 90
86 91
87 menu "Processor type and features" 92 menu "Processor type and features"
88 93
89 choice 94 choice
90 prompt "Processor type" 95 prompt "Processor type"
91 default PA7000 96 default PA7000
92 97
93 config PA7000 98 config PA7000
94 bool "PA7000/PA7100" 99 bool "PA7000/PA7100"
95 ---help--- 100 ---help---
96 This is the processor type of your CPU. This information is 101 This is the processor type of your CPU. This information is
97 used for optimization purposes. In order to compile a kernel 102 used for optimization purposes. In order to compile a kernel
98 that can run on all 32-bit PA CPUs (albeit not optimally fast), 103 that can run on all 32-bit PA CPUs (albeit not optimally fast),
99 you can specify "PA7000" here. 104 you can specify "PA7000" here.
100 105
101 Specifying "PA8000" here will allow you to select a 64-bit kernel 106 Specifying "PA8000" here will allow you to select a 64-bit kernel
102 which is required on some machines. 107 which is required on some machines.
103 108
104 config PA7100LC 109 config PA7100LC
105 bool "PA7100LC" 110 bool "PA7100LC"
106 help 111 help
107 Select this option for the PCX-L processor, as used in the 112 Select this option for the PCX-L processor, as used in the
108 712, 715/64, 715/80, 715/100, 715/100XC, 725/100, 743, 748, 113 712, 715/64, 715/80, 715/100, 715/100XC, 725/100, 743, 748,
109 D200, D210, D300, D310 and E-class 114 D200, D210, D300, D310 and E-class
110 115
111 config PA7200 116 config PA7200
112 bool "PA7200" 117 bool "PA7200"
113 help 118 help
114 Select this option for the PCX-T' processor, as used in the 119 Select this option for the PCX-T' processor, as used in the
115 C100, C110, J100, J110, J210XC, D250, D260, D350, D360, 120 C100, C110, J100, J110, J210XC, D250, D260, D350, D360,
116 K100, K200, K210, K220, K400, K410 and K420 121 K100, K200, K210, K220, K400, K410 and K420
117 122
118 config PA7300LC 123 config PA7300LC
119 bool "PA7300LC" 124 bool "PA7300LC"
120 help 125 help
121 Select this option for the PCX-L2 processor, as used in the 126 Select this option for the PCX-L2 processor, as used in the
122 744, A180, B132L, B160L, B180L, C132L, C160L, C180L, 127 744, A180, B132L, B160L, B180L, C132L, C160L, C180L,
123 D220, D230, D320 and D330. 128 D220, D230, D320 and D330.
124 129
125 config PA8X00 130 config PA8X00
126 bool "PA8000 and up" 131 bool "PA8000 and up"
127 help 132 help
128 Select this option for PCX-U to PCX-W2 processors. 133 Select this option for PCX-U to PCX-W2 processors.
129 134
130 endchoice 135 endchoice
131 136
132 # Define implied options from the CPU selection here 137 # Define implied options from the CPU selection here
133 138
134 config PA20 139 config PA20
135 def_bool y 140 def_bool y
136 depends on PA8X00 141 depends on PA8X00
137 142
138 config PA11 143 config PA11
139 def_bool y 144 def_bool y
140 depends on PA7000 || PA7100LC || PA7200 || PA7300LC 145 depends on PA7000 || PA7100LC || PA7200 || PA7300LC
141 146
142 config PREFETCH 147 config PREFETCH
143 def_bool y 148 def_bool y
144 depends on PA8X00 || PA7200 149 depends on PA8X00 || PA7200
145 150
146 config 64BIT 151 config 64BIT
147 bool "64-bit kernel" 152 bool "64-bit kernel"
148 depends on PA8X00 153 depends on PA8X00
149 help 154 help
150 Enable this if you want to support a 64-bit kernel on the PA-RISC platform. 155 Enable this if you want to support a 64-bit kernel on the PA-RISC platform.
151 156
152 At the moment, only people who want to use more than 2GB of RAM, 157 At the moment, only people who want to use more than 2GB of RAM,
153 or who have a 64-bit-only capable PA-RISC machine, should say Y here. 158 or who have a 64-bit-only capable PA-RISC machine, should say Y here.
154 159
155 Since there is no 64-bit userland on PA-RISC, there is no point in 160 Since there is no 64-bit userland on PA-RISC, there is no point in
156 enabling this option otherwise. The 64-bit kernel is significantly bigger 161 enabling this option otherwise. The 64-bit kernel is significantly bigger
157 and slower than the 32-bit one. 162 and slower than the 32-bit one.
158 163
159 choice 164 choice
160 prompt "Kernel page size" 165 prompt "Kernel page size"
161 default PARISC_PAGE_SIZE_4KB if !64BIT 166 default PARISC_PAGE_SIZE_4KB if !64BIT
162 default PARISC_PAGE_SIZE_4KB if 64BIT 167 default PARISC_PAGE_SIZE_4KB if 64BIT
163 # default PARISC_PAGE_SIZE_16KB if 64BIT 168 # default PARISC_PAGE_SIZE_16KB if 64BIT
164 169
165 config PARISC_PAGE_SIZE_4KB 170 config PARISC_PAGE_SIZE_4KB
166 bool "4KB" 171 bool "4KB"
167 help 172 help
168 This lets you select the page size of the kernel. For best 173 This lets you select the page size of the kernel. For best
169 performance, a page size of 16KB is recommended. For best 174 performance, a page size of 16KB is recommended. For best
170 compatibility with 32bit applications, a page size of 4KB should be 175 compatibility with 32bit applications, a page size of 4KB should be
171 selected (the vast majority of 32bit binaries work perfectly fine 176 selected (the vast majority of 32bit binaries work perfectly fine
172 with a larger page size). 177 with a larger page size).
173 178
174 4KB For best 32bit compatibility 179 4KB For best 32bit compatibility
175 16KB For best performance 180 16KB For best performance
176 64KB Potentially the best performance, but may add more overhead. 181 64KB Potentially the best performance, but may add more overhead.
177 182
178 If you don't know what to do, choose 4KB. 183 If you don't know what to do, choose 4KB.
179 184
180 config PARISC_PAGE_SIZE_16KB 185 config PARISC_PAGE_SIZE_16KB
181 bool "16KB (EXPERIMENTAL)" 186 bool "16KB (EXPERIMENTAL)"
182 depends on PA8X00 && EXPERIMENTAL 187 depends on PA8X00 && EXPERIMENTAL
183 188
184 config PARISC_PAGE_SIZE_64KB 189 config PARISC_PAGE_SIZE_64KB
185 bool "64KB (EXPERIMENTAL)" 190 bool "64KB (EXPERIMENTAL)"
186 depends on PA8X00 && EXPERIMENTAL 191 depends on PA8X00 && EXPERIMENTAL
187 192
188 endchoice 193 endchoice
189 194
190 config SMP 195 config SMP
191 bool "Symmetric multi-processing support" 196 bool "Symmetric multi-processing support"
192 ---help--- 197 ---help---
193 This enables support for systems with more than one CPU. If you have 198 This enables support for systems with more than one CPU. If you have
194 a system with only one CPU, like most personal computers, say N. If 199 a system with only one CPU, like most personal computers, say N. If
195 you have a system with more than one CPU, say Y. 200 you have a system with more than one CPU, say Y.
196 201
197 If you say N here, the kernel will run on single and multiprocessor 202 If you say N here, the kernel will run on single and multiprocessor
198 machines, but will use only one CPU of a multiprocessor machine. If 203 machines, but will use only one CPU of a multiprocessor machine. If
199 you say Y here, the kernel will run on many, but not all, 204 you say Y here, the kernel will run on many, but not all,
200 singleprocessor machines. On a singleprocessor machine, the kernel 205 singleprocessor machines. On a singleprocessor machine, the kernel
201 will run faster if you say N here. 206 will run faster if you say N here.
202 207
203 See also the <file:Documentation/smp.txt>, 208 See also the <file:Documentation/smp.txt>,
204 <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available 209 <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available
205 at <http://www.tldp.org/docs.html#howto>. 210 at <http://www.tldp.org/docs.html#howto>.
206 211
207 If you don't know what to do here, say N. 212 If you don't know what to do here, say N.
208 213
209 config HOTPLUG_CPU 214 config HOTPLUG_CPU
210 bool 215 bool
211 default y if SMP 216 default y if SMP
212 select HOTPLUG 217 select HOTPLUG
213 218
214 config ARCH_SELECT_MEMORY_MODEL 219 config ARCH_SELECT_MEMORY_MODEL
215 def_bool y 220 def_bool y
216 depends on 64BIT 221 depends on 64BIT
217 222
218 config ARCH_DISCONTIGMEM_ENABLE 223 config ARCH_DISCONTIGMEM_ENABLE
219 def_bool y 224 def_bool y
220 depends on 64BIT 225 depends on 64BIT
221 226
222 config ARCH_FLATMEM_ENABLE 227 config ARCH_FLATMEM_ENABLE
223 def_bool y 228 def_bool y
224 229
225 config ARCH_DISCONTIGMEM_DEFAULT 230 config ARCH_DISCONTIGMEM_DEFAULT
226 def_bool y 231 def_bool y
227 depends on ARCH_DISCONTIGMEM_ENABLE 232 depends on ARCH_DISCONTIGMEM_ENABLE
228 233
229 config NODES_SHIFT 234 config NODES_SHIFT
230 int 235 int
231 default "3" 236 default "3"
232 depends on NEED_MULTIPLE_NODES 237 depends on NEED_MULTIPLE_NODES
233 238
234 source "kernel/Kconfig.preempt" 239 source "kernel/Kconfig.preempt"
235 source "kernel/Kconfig.hz" 240 source "kernel/Kconfig.hz"
236 source "mm/Kconfig" 241 source "mm/Kconfig"
237 242
238 config COMPAT 243 config COMPAT
239 def_bool y 244 def_bool y
240 depends on 64BIT 245 depends on 64BIT
241 246
242 config HPUX 247 config HPUX
243 bool "Support for HP-UX binaries" 248 bool "Support for HP-UX binaries"
244 depends on !64BIT 249 depends on !64BIT
245 250
246 config NR_CPUS 251 config NR_CPUS
247 int "Maximum number of CPUs (2-32)" 252 int "Maximum number of CPUs (2-32)"
248 range 2 32 253 range 2 32
249 depends on SMP 254 depends on SMP
250 default "32" 255 default "32"
251 256
252 endmenu 257 endmenu
253 258
254 259
255 source "drivers/parisc/Kconfig" 260 source "drivers/parisc/Kconfig"
256 261
257 262
258 menu "Executable file formats" 263 menu "Executable file formats"
259 264
260 source "fs/Kconfig.binfmt" 265 source "fs/Kconfig.binfmt"
261 266
262 endmenu 267 endmenu
263 268
264 source "net/Kconfig" 269 source "net/Kconfig"
265 270
266 source "drivers/Kconfig" 271 source "drivers/Kconfig"
267 272
268 source "fs/Kconfig" 273 source "fs/Kconfig"
269 274
270 source "kernel/Kconfig.instrumentation" 275 source "kernel/Kconfig.instrumentation"
271 276
272 source "arch/parisc/Kconfig.debug" 277 source "arch/parisc/Kconfig.debug"
273 278
274 source "security/Kconfig" 279 source "security/Kconfig"
275 280
276 source "crypto/Kconfig" 281 source "crypto/Kconfig"
277 282
278 source "lib/Kconfig" 283 source "lib/Kconfig"
279 284
arch/powerpc/Kconfig
1 # For a description of the syntax of this configuration file, 1 # For a description of the syntax of this configuration file,
2 # see Documentation/kbuild/kconfig-language.txt. 2 # see Documentation/kbuild/kconfig-language.txt.
3 # 3 #
4 4
5 mainmenu "Linux/PowerPC Kernel Configuration" 5 mainmenu "Linux/PowerPC Kernel Configuration"
6 6
7 source "arch/powerpc/platforms/Kconfig.cputype" 7 source "arch/powerpc/platforms/Kconfig.cputype"
8 8
9 config PPC32 9 config PPC32
10 bool 10 bool
11 default y if !PPC64 11 default y if !PPC64
12 12
13 config 64BIT 13 config 64BIT
14 bool 14 bool
15 default y if PPC64 15 default y if PPC64
16 16
17 config WORD_SIZE 17 config WORD_SIZE
18 int 18 int
19 default 64 if PPC64 19 default 64 if PPC64
20 default 32 if !PPC64 20 default 32 if !PPC64
21 21
22 config PPC_MERGE 22 config PPC_MERGE
23 def_bool y 23 def_bool y
24 24
25 config MMU 25 config MMU
26 bool 26 bool
27 default y 27 default y
28 28
29 config GENERIC_CMOS_UPDATE 29 config GENERIC_CMOS_UPDATE
30 def_bool y 30 def_bool y
31 31
32 config GENERIC_TIME 32 config GENERIC_TIME
33 def_bool y 33 def_bool y
34 34
35 config GENERIC_TIME_VSYSCALL 35 config GENERIC_TIME_VSYSCALL
36 def_bool y 36 def_bool y
37 37
38 config GENERIC_CLOCKEVENTS 38 config GENERIC_CLOCKEVENTS
39 def_bool y 39 def_bool y
40 40
41 config GENERIC_HARDIRQS 41 config GENERIC_HARDIRQS
42 bool 42 bool
43 default y 43 default y
44 44
45 config IRQ_PER_CPU 45 config IRQ_PER_CPU
46 bool 46 bool
47 default y 47 default y
48 48
49 config RWSEM_GENERIC_SPINLOCK 49 config RWSEM_GENERIC_SPINLOCK
50 bool 50 bool
51 51
52 config RWSEM_XCHGADD_ALGORITHM 52 config RWSEM_XCHGADD_ALGORITHM
53 bool 53 bool
54 default y 54 default y
55 55
56 config GENERIC_LOCKBREAK
57 bool
58 default y
59 depends on SMP && PREEMPT
60
56 config ARCH_HAS_ILOG2_U32 61 config ARCH_HAS_ILOG2_U32
57 bool 62 bool
58 default y 63 default y
59 64
60 config ARCH_HAS_ILOG2_U64 65 config ARCH_HAS_ILOG2_U64
61 bool 66 bool
62 default y if 64BIT 67 default y if 64BIT
63 68
64 config GENERIC_HWEIGHT 69 config GENERIC_HWEIGHT
65 bool 70 bool
66 default y 71 default y
67 72
68 config GENERIC_CALIBRATE_DELAY 73 config GENERIC_CALIBRATE_DELAY
69 bool 74 bool
70 default y 75 default y
71 76
72 config GENERIC_FIND_NEXT_BIT 77 config GENERIC_FIND_NEXT_BIT
73 bool 78 bool
74 default y 79 default y
75 80
76 config ARCH_NO_VIRT_TO_BUS 81 config ARCH_NO_VIRT_TO_BUS
77 def_bool PPC64 82 def_bool PPC64
78 83
79 config PPC 84 config PPC
80 bool 85 bool
81 default y 86 default y
82 87
83 config EARLY_PRINTK 88 config EARLY_PRINTK
84 bool 89 bool
85 default y 90 default y
86 91
87 config COMPAT 92 config COMPAT
88 bool 93 bool
89 default y if PPC64 94 default y if PPC64
90 95
91 config SYSVIPC_COMPAT 96 config SYSVIPC_COMPAT
92 bool 97 bool
93 depends on COMPAT && SYSVIPC 98 depends on COMPAT && SYSVIPC
94 default y 99 default y
95 100
96 # All PPC32s use generic nvram driver through ppc_md 101 # All PPC32s use generic nvram driver through ppc_md
97 config GENERIC_NVRAM 102 config GENERIC_NVRAM
98 bool 103 bool
99 default y if PPC32 104 default y if PPC32
100 105
101 config SCHED_NO_NO_OMIT_FRAME_POINTER 106 config SCHED_NO_NO_OMIT_FRAME_POINTER
102 bool 107 bool
103 default y 108 default y
104 109
105 config ARCH_MAY_HAVE_PC_FDC 110 config ARCH_MAY_HAVE_PC_FDC
106 bool 111 bool
107 default !PPC_PSERIES || PCI 112 default !PPC_PSERIES || PCI
108 113
109 config PPC_OF 114 config PPC_OF
110 def_bool y 115 def_bool y
111 116
112 config OF 117 config OF
113 def_bool y 118 def_bool y
114 119
115 config PPC_UDBG_16550 120 config PPC_UDBG_16550
116 bool 121 bool
117 default n 122 default n
118 123
119 config GENERIC_TBSYNC 124 config GENERIC_TBSYNC
120 bool 125 bool
121 default y if PPC32 && SMP 126 default y if PPC32 && SMP
122 default n 127 default n
123 128
124 config AUDIT_ARCH 129 config AUDIT_ARCH
125 bool 130 bool
126 default y 131 default y
127 132
128 config GENERIC_BUG 133 config GENERIC_BUG
129 bool 134 bool
130 default y 135 default y
131 depends on BUG 136 depends on BUG
132 137
133 config SYS_SUPPORTS_APM_EMULATION 138 config SYS_SUPPORTS_APM_EMULATION
134 default y if PMAC_APM_EMU 139 default y if PMAC_APM_EMU
135 bool 140 bool
136 141
137 config DEFAULT_UIMAGE 142 config DEFAULT_UIMAGE
138 bool 143 bool
139 help 144 help
140 Used to allow a board to specify it wants a uImage built by default 145 Used to allow a board to specify it wants a uImage built by default
141 default n 146 default n
142 147
143 config PPC64_SWSUSP 148 config PPC64_SWSUSP
144 bool 149 bool
145 depends on PPC64 && (BROKEN || (PPC_PMAC64 && EXPERIMENTAL)) 150 depends on PPC64 && (BROKEN || (PPC_PMAC64 && EXPERIMENTAL))
146 default y 151 default y
147 152
148 config PPC_DCR_NATIVE 153 config PPC_DCR_NATIVE
149 bool 154 bool
150 default n 155 default n
151 156
152 config PPC_DCR_MMIO 157 config PPC_DCR_MMIO
153 bool 158 bool
154 default n 159 default n
155 160
156 config PPC_DCR 161 config PPC_DCR
157 bool 162 bool
158 depends on PPC_DCR_NATIVE || PPC_DCR_MMIO 163 depends on PPC_DCR_NATIVE || PPC_DCR_MMIO
159 default y 164 default y
160 165
161 config PPC_OF_PLATFORM_PCI 166 config PPC_OF_PLATFORM_PCI
162 bool 167 bool
163 depends on PPC64 # not supported on 32 bits yet 168 depends on PPC64 # not supported on 32 bits yet
164 default n 169 default n
165 170
166 source "init/Kconfig" 171 source "init/Kconfig"
167 172
168 source "arch/powerpc/platforms/Kconfig" 173 source "arch/powerpc/platforms/Kconfig"
169 174
170 menu "Kernel options" 175 menu "Kernel options"
171 176
172 config HIGHMEM 177 config HIGHMEM
173 bool "High memory support" 178 bool "High memory support"
174 depends on PPC32 179 depends on PPC32
175 180
176 source kernel/time/Kconfig 181 source kernel/time/Kconfig
177 source kernel/Kconfig.hz 182 source kernel/Kconfig.hz
178 source kernel/Kconfig.preempt 183 source kernel/Kconfig.preempt
179 source "fs/Kconfig.binfmt" 184 source "fs/Kconfig.binfmt"
180 185
181 # We optimistically allocate largepages from the VM, so make the limit 186 # We optimistically allocate largepages from the VM, so make the limit
182 # large enough (16MB). This badly named config option is actually 187 # large enough (16MB). This badly named config option is actually
183 # max order + 1 188 # max order + 1
184 config FORCE_MAX_ZONEORDER 189 config FORCE_MAX_ZONEORDER
185 int 190 int
186 depends on PPC64 191 depends on PPC64
187 default "9" if PPC_64K_PAGES 192 default "9" if PPC_64K_PAGES
188 default "13" 193 default "13"
189 194
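To make the "max order + 1" comment concrete: the largest buddy allocation is 2^(FORCE_MAX_ZONEORDER - 1) pages, so both defaults come out at the intended 16MB. A standalone arithmetic check (not kernel code):

    #include <stdio.h>

    int main(void)
    {
            /* 64K pages, FORCE_MAX_ZONEORDER 9:  2^8  * 64K == 16 MB */
            printf("%lu MB\n", ((1UL << (9 - 1)) * (64 * 1024UL)) >> 20);
            /* 4K pages,  FORCE_MAX_ZONEORDER 13: 2^12 * 4K  == 16 MB */
            printf("%lu MB\n", ((1UL << (13 - 1)) * (4 * 1024UL)) >> 20);
            return 0;
    }
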
190 config HUGETLB_PAGE_SIZE_VARIABLE 195 config HUGETLB_PAGE_SIZE_VARIABLE
191 bool 196 bool
192 depends on HUGETLB_PAGE 197 depends on HUGETLB_PAGE
193 default y 198 default y
194 199
195 config MATH_EMULATION 200 config MATH_EMULATION
196 bool "Math emulation" 201 bool "Math emulation"
197 depends on 4xx || 8xx || E200 || PPC_MPC832x || E500 202 depends on 4xx || 8xx || E200 || PPC_MPC832x || E500
198 ---help--- 203 ---help---
199 Some PowerPC chips designed for embedded applications do not have 204 Some PowerPC chips designed for embedded applications do not have
200 a floating-point unit and therefore do not implement the 205 a floating-point unit and therefore do not implement the
201 floating-point instructions in the PowerPC instruction set. If you 206 floating-point instructions in the PowerPC instruction set. If you
202 say Y here, the kernel will include code to emulate a floating-point 207 say Y here, the kernel will include code to emulate a floating-point
203 unit, which will allow programs that use floating-point 208 unit, which will allow programs that use floating-point
204 instructions to run. 209 instructions to run.
205 210
206 config 8XX_MINIMAL_FPEMU 211 config 8XX_MINIMAL_FPEMU
207 bool "Minimal math emulation for 8xx" 212 bool "Minimal math emulation for 8xx"
208 depends on 8xx && !MATH_EMULATION 213 depends on 8xx && !MATH_EMULATION
209 help 214 help
210 Older arch/ppc kernels still emulated a few floating point 215 Older arch/ppc kernels still emulated a few floating point
211 instructions such as load and store, even when full math 216 instructions such as load and store, even when full math
212 emulation was disabled. Say "Y" here if you want to preserve 217 emulation was disabled. Say "Y" here if you want to preserve
213 this behavior. 218 this behavior.
214 219
215 It is recommended that you build a soft-float userspace instead. 220 It is recommended that you build a soft-float userspace instead.
216 221
217 config IOMMU_VMERGE 222 config IOMMU_VMERGE
218 bool "Enable IOMMU virtual merging" 223 bool "Enable IOMMU virtual merging"
219 depends on PPC64 224 depends on PPC64
220 default y 225 default y
221 help 226 help
222 Cause IO segments sent to a device for DMA to be merged virtually 227 Cause IO segments sent to a device for DMA to be merged virtually
223 by the IOMMU when they happen to have been allocated contiguously. 228 by the IOMMU when they happen to have been allocated contiguously.
224 This doesn't add pressure to the IOMMU allocator. However, some 229 This doesn't add pressure to the IOMMU allocator. However, some
225 drivers don't support getting large merged segments coming back 230 drivers don't support getting large merged segments coming back
226 from *_map_sg(). 231 from *_map_sg().
227 232
228 Most drivers don't have this problem; it is safe to say Y here. 233 Most drivers don't have this problem; it is safe to say Y here.
229 234
230 config HOTPLUG_CPU 235 config HOTPLUG_CPU
231 bool "Support for enabling/disabling CPUs" 236 bool "Support for enabling/disabling CPUs"
232 depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) 237 depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC)
233 ---help--- 238 ---help---
234 Say Y here to be able to disable and re-enable individual 239 Say Y here to be able to disable and re-enable individual
235 CPUs at runtime on SMP machines. 240 CPUs at runtime on SMP machines.
236 241
237 Say N if you are unsure. 242 Say N if you are unsure.
238 243
239 config ARCH_ENABLE_MEMORY_HOTPLUG 244 config ARCH_ENABLE_MEMORY_HOTPLUG
240 def_bool y 245 def_bool y
241 246
242 config KEXEC 247 config KEXEC
243 bool "kexec system call (EXPERIMENTAL)" 248 bool "kexec system call (EXPERIMENTAL)"
244 depends on (PPC_PRPMC2800 || PPC_MULTIPLATFORM) && EXPERIMENTAL 249 depends on (PPC_PRPMC2800 || PPC_MULTIPLATFORM) && EXPERIMENTAL
245 help 250 help
246 kexec is a system call that implements the ability to shut down your 251 kexec is a system call that implements the ability to shut down your
247 current kernel, and to start another kernel. It is like a reboot 252 current kernel, and to start another kernel. It is like a reboot
248 but it is independent of the system firmware. And like a reboot 253 but it is independent of the system firmware. And like a reboot
249 you can start any kernel with it, not just Linux. 254 you can start any kernel with it, not just Linux.
250 255
251 The name comes from the similarity to the exec system call. 256 The name comes from the similarity to the exec system call.
252 257
253 It is an ongoing process to be certain the hardware in a machine 258 It is an ongoing process to be certain the hardware in a machine
254 is properly shut down, so do not be surprised if this code does not 259 is properly shut down, so do not be surprised if this code does not
255 initially work for you. It may help to enable device hotplugging 260 initially work for you. It may help to enable device hotplugging
256 support. As of this writing the exact hardware interface is 261 support. As of this writing the exact hardware interface is
257 strongly in flux, so no good recommendation can be made. 262 strongly in flux, so no good recommendation can be made.
258 263
259 config CRASH_DUMP 264 config CRASH_DUMP
260 bool "Build a kdump crash kernel (EXPERIMENTAL)" 265 bool "Build a kdump crash kernel (EXPERIMENTAL)"
261 depends on PPC_MULTIPLATFORM && PPC64 && EXPERIMENTAL 266 depends on PPC_MULTIPLATFORM && PPC64 && EXPERIMENTAL
262 help 267 help
263 Build a kernel suitable for use as a kdump capture kernel. 268 Build a kernel suitable for use as a kdump capture kernel.
264 The kernel will be linked at a different address than normal, and 269 The kernel will be linked at a different address than normal, and
265 so can only be used for Kdump. 270 so can only be used for Kdump.
266 271
267 Don't change this unless you know what you are doing. 272 Don't change this unless you know what you are doing.
268 273
269 config PPCBUG_NVRAM 274 config PPCBUG_NVRAM
270 bool "Enable reading PPCBUG NVRAM during boot" if PPLUS || LOPEC 275 bool "Enable reading PPCBUG NVRAM during boot" if PPLUS || LOPEC
271 default y if PPC_PREP 276 default y if PPC_PREP
272 277
273 config IRQ_ALL_CPUS 278 config IRQ_ALL_CPUS
274 bool "Distribute interrupts on all CPUs by default" 279 bool "Distribute interrupts on all CPUs by default"
275 depends on SMP && !MV64360 280 depends on SMP && !MV64360
276 help 281 help
277 This option gives the kernel permission to distribute IRQs across 282 This option gives the kernel permission to distribute IRQs across
278 multiple CPUs. Saying N here will route all IRQs to the first 283 multiple CPUs. Saying N here will route all IRQs to the first
279 CPU. Generally saying Y is safe, although some problems have been 284 CPU. Generally saying Y is safe, although some problems have been
280 reported with SMP Power Macintoshes with this option enabled. 285 reported with SMP Power Macintoshes with this option enabled.
281 286
282 config NUMA 287 config NUMA
283 bool "NUMA support" 288 bool "NUMA support"
284 depends on PPC64 289 depends on PPC64
285 default y if SMP && PPC_PSERIES 290 default y if SMP && PPC_PSERIES
286 291
287 config NODES_SHIFT 292 config NODES_SHIFT
288 int 293 int
289 default "4" 294 default "4"
290 depends on NEED_MULTIPLE_NODES 295 depends on NEED_MULTIPLE_NODES
291 296
292 config ARCH_SELECT_MEMORY_MODEL 297 config ARCH_SELECT_MEMORY_MODEL
293 def_bool y 298 def_bool y
294 depends on PPC64 299 depends on PPC64
295 300
296 config ARCH_FLATMEM_ENABLE 301 config ARCH_FLATMEM_ENABLE
297 def_bool y 302 def_bool y
298 depends on (PPC64 && !NUMA) || PPC32 303 depends on (PPC64 && !NUMA) || PPC32
299 304
300 config ARCH_SPARSEMEM_ENABLE 305 config ARCH_SPARSEMEM_ENABLE
301 def_bool y 306 def_bool y
302 depends on PPC64 307 depends on PPC64
303 select SPARSEMEM_VMEMMAP_ENABLE 308 select SPARSEMEM_VMEMMAP_ENABLE
304 309
305 config ARCH_SPARSEMEM_DEFAULT 310 config ARCH_SPARSEMEM_DEFAULT
306 def_bool y 311 def_bool y
307 depends on (SMP && PPC_PSERIES) || PPC_PS3 312 depends on (SMP && PPC_PSERIES) || PPC_PS3
308 313
309 config ARCH_POPULATES_NODE_MAP 314 config ARCH_POPULATES_NODE_MAP
310 def_bool y 315 def_bool y
311 316
312 source "mm/Kconfig" 317 source "mm/Kconfig"
313 318
314 config ARCH_MEMORY_PROBE 319 config ARCH_MEMORY_PROBE
315 def_bool y 320 def_bool y
316 depends on MEMORY_HOTPLUG 321 depends on MEMORY_HOTPLUG
317 322
318 # Some NUMA nodes have memory ranges that span 323 # Some NUMA nodes have memory ranges that span
319 # other nodes. Even though a pfn is valid and 324 # other nodes. Even though a pfn is valid and
320 # between a node's start and end pfns, it may not 325 # between a node's start and end pfns, it may not
321 # reside on that node. See memmap_init_zone() 326 # reside on that node. See memmap_init_zone()
322 # for details. 327 # for details.
323 config NODES_SPAN_OTHER_NODES 328 config NODES_SPAN_OTHER_NODES
324 def_bool y 329 def_bool y
325 depends on NEED_MULTIPLE_NODES 330 depends on NEED_MULTIPLE_NODES
326 331
327 config PPC_HAS_HASH_64K 332 config PPC_HAS_HASH_64K
328 bool 333 bool
329 depends on PPC64 334 depends on PPC64
330 default n 335 default n
331 336
332 config PPC_64K_PAGES 337 config PPC_64K_PAGES
333 bool "64k page size" 338 bool "64k page size"
334 depends on PPC64 339 depends on PPC64
335 select PPC_HAS_HASH_64K 340 select PPC_HAS_HASH_64K
336 help 341 help
337 This option changes the kernel logical page size to 64k. On machines 342 This option changes the kernel logical page size to 64k. On machines
338 without processor support for 64k pages, the kernel will simulate 343 without processor support for 64k pages, the kernel will simulate
339 them by loading each individual 4k page on demand transparently, 344 them by loading each individual 4k page on demand transparently,
340 while on hardware with such support, the 64k size will be used to map 345 while on hardware with such support, the 64k size will be used to map
341 normal application pages. 346 normal application pages.
342 347
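From userspace the visible effect of this option is simply a larger reported page size; a trivial check:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* Prints 65536 on a PPC_64K_PAGES kernel, 4096 otherwise. */
            printf("page size: %ld\n", sysconf(_SC_PAGESIZE));
            return 0;
    }
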
343 config SCHED_SMT 348 config SCHED_SMT
344 bool "SMT (Hyperthreading) scheduler support" 349 bool "SMT (Hyperthreading) scheduler support"
345 depends on PPC64 && SMP 350 depends on PPC64 && SMP
346 help 351 help
347 SMT scheduler support improves the CPU scheduler's decision making 352 SMT scheduler support improves the CPU scheduler's decision making
348 when dealing with POWER5 cpus at a cost of slightly increased 353 when dealing with POWER5 cpus at a cost of slightly increased
349 overhead in some places. If unsure say N here. 354 overhead in some places. If unsure say N here.
350 355
351 config PROC_DEVICETREE 356 config PROC_DEVICETREE
352 bool "Support for device tree in /proc" 357 bool "Support for device tree in /proc"
353 depends on PROC_FS 358 depends on PROC_FS
354 help 359 help
355 This option adds a device-tree directory under /proc which contains 360 This option adds a device-tree directory under /proc which contains
356 an image of the device tree that the kernel copies from Open 361 an image of the device tree that the kernel copies from Open
357 Firmware or other boot firmware. If unsure, say Y here. 362 Firmware or other boot firmware. If unsure, say Y here.
358 363
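With this option enabled, device-tree properties appear as ordinary files under /proc/device-tree; a small sketch reading the machine model (which properties exist depends on the firmware):

    #include <stdio.h>

    int main(void)
    {
            char model[128] = "";
            FILE *f = fopen("/proc/device-tree/model", "r");

            if (!f)
                    return 1;   /* PROC_DEVICETREE=n or property not present */
            fgets(model, sizeof(model), f);   /* property is a NUL-terminated string */
            fclose(f);
            printf("model: %s\n", model);
            return 0;
    }
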
359 config CMDLINE_BOOL 364 config CMDLINE_BOOL
360 bool "Default bootloader kernel arguments" 365 bool "Default bootloader kernel arguments"
361 366
362 config CMDLINE 367 config CMDLINE
363 string "Initial kernel command string" 368 string "Initial kernel command string"
364 depends on CMDLINE_BOOL 369 depends on CMDLINE_BOOL
365 default "console=ttyS0,9600 console=tty0 root=/dev/sda2" 370 default "console=ttyS0,9600 console=tty0 root=/dev/sda2"
366 help 371 help
367 On some platforms, there is currently no way for the boot loader to 372 On some platforms, there is currently no way for the boot loader to
368 pass arguments to the kernel. For these platforms, you can supply 373 pass arguments to the kernel. For these platforms, you can supply
369 some command-line options at build time by entering them here. In 374 some command-line options at build time by entering them here. In
370 most cases you will need to specify the root device here. 375 most cases you will need to specify the root device here.
371 376
372 if !44x || BROKEN 377 if !44x || BROKEN
373 source kernel/power/Kconfig 378 source kernel/power/Kconfig
374 endif 379 endif
375 380
376 config SECCOMP 381 config SECCOMP
377 bool "Enable seccomp to safely compute untrusted bytecode" 382 bool "Enable seccomp to safely compute untrusted bytecode"
378 depends on PROC_FS 383 depends on PROC_FS
379 default y 384 default y
380 help 385 help
381 This kernel feature is useful for number crunching applications 386 This kernel feature is useful for number crunching applications
382 that may need to compute untrusted bytecode during their 387 that may need to compute untrusted bytecode during their
383 execution. By using pipes or other transports made available to 388 execution. By using pipes or other transports made available to
384 the process as file descriptors supporting the read/write 389 the process as file descriptors supporting the read/write
385 syscalls, it's possible to isolate those applications in 390 syscalls, it's possible to isolate those applications in
386 their own address space using seccomp. Once seccomp is 391 their own address space using seccomp. Once seccomp is
387 enabled via /proc/<pid>/seccomp, it cannot be disabled 392 enabled via /proc/<pid>/seccomp, it cannot be disabled
388 and the task is only allowed to execute a few safe syscalls 393 and the task is only allowed to execute a few safe syscalls
389 defined by each seccomp mode. 394 defined by each seccomp mode.
390 395
391 If unsure, say Y. Only embedded systems should say N here. 396 If unsure, say Y. Only embedded systems should say N here.
392 397
393 config WANT_DEVICE_TREE 398 config WANT_DEVICE_TREE
394 bool 399 bool
395 default n 400 default n
396 401
397 config DEVICE_TREE 402 config DEVICE_TREE
398 string "Static device tree source file" 403 string "Static device tree source file"
399 depends on WANT_DEVICE_TREE 404 depends on WANT_DEVICE_TREE
400 help 405 help
401 This specifies the device tree source (.dts) file to be 406 This specifies the device tree source (.dts) file to be
402 compiled and included when building the bootwrapper. If a 407 compiled and included when building the bootwrapper. If a
403 relative filename is given, then it will be relative to 408 relative filename is given, then it will be relative to
404 arch/powerpc/boot/dts. If you are not using the bootwrapper, 409 arch/powerpc/boot/dts. If you are not using the bootwrapper,
405 or do not need to build a dts into the bootwrapper, this 410 or do not need to build a dts into the bootwrapper, this
406 field is ignored. 411 field is ignored.
407 412
408 For example, this is required when building a cuImage target 413 For example, this is required when building a cuImage target
409 for an older U-Boot, which cannot pass a device tree itself. 414 for an older U-Boot, which cannot pass a device tree itself.
410 Such a kernel will not work with a newer U-Boot that tries to 415 Such a kernel will not work with a newer U-Boot that tries to
411 pass a device tree (unless you tell it not to). If your U-Boot 416 pass a device tree (unless you tell it not to). If your U-Boot
412 does not mention a device tree in "help bootm", then use the 417 does not mention a device tree in "help bootm", then use the
413 cuImage target and specify a device tree here. Otherwise, use 418 cuImage target and specify a device tree here. Otherwise, use
414 the uImage target and leave this field blank. 419 the uImage target and leave this field blank.
415 420
416 endmenu 421 endmenu
417 422
418 config ISA_DMA_API 423 config ISA_DMA_API
419 bool 424 bool
420 default y 425 default y
421 426
422 menu "Bus options" 427 menu "Bus options"
423 428
424 config ISA 429 config ISA
425 bool "Support for ISA-bus hardware" 430 bool "Support for ISA-bus hardware"
426 depends on PPC_PREP || PPC_CHRP 431 depends on PPC_PREP || PPC_CHRP
427 select PPC_I8259 432 select PPC_I8259
428 help 433 help
429 Find out whether you have ISA slots on your motherboard. ISA is the 434 Find out whether you have ISA slots on your motherboard. ISA is the
430 name of a bus system, i.e. the way the CPU talks to the other stuff 435 name of a bus system, i.e. the way the CPU talks to the other stuff
431 inside your box. If you have an Apple machine, say N here; if you 436 inside your box. If you have an Apple machine, say N here; if you
432 have an IBM RS/6000 or pSeries machine or a PReP machine, say Y. If 437 have an IBM RS/6000 or pSeries machine or a PReP machine, say Y. If
433 you have an embedded board, consult your board documentation. 438 you have an embedded board, consult your board documentation.
434 439
435 config ZONE_DMA 440 config ZONE_DMA
436 bool 441 bool
437 default y 442 default y
438 443
439 config GENERIC_ISA_DMA 444 config GENERIC_ISA_DMA
440 bool 445 bool
441 depends on PPC64 || POWER4 || 6xx && !CPM2 446 depends on PPC64 || POWER4 || 6xx && !CPM2
442 default y 447 default y
443 448
444 config PPC_INDIRECT_PCI 449 config PPC_INDIRECT_PCI
445 bool 450 bool
446 depends on PCI 451 depends on PCI
447 default y if 40x || 44x 452 default y if 40x || 44x
448 default n 453 default n
449 454
450 config EISA 455 config EISA
451 bool 456 bool
452 457
453 config SBUS 458 config SBUS
454 bool 459 bool
455 460
456 config FSL_SOC 461 config FSL_SOC
457 bool 462 bool
458 463
459 config FSL_PCI 464 config FSL_PCI
460 bool 465 bool
461 select PPC_INDIRECT_PCI 466 select PPC_INDIRECT_PCI
462 467
463 # Yes MCA RS/6000s exist but Linux-PPC does not currently support any 468 # Yes MCA RS/6000s exist but Linux-PPC does not currently support any
464 config MCA 469 config MCA
465 bool 470 bool
466 471
467 config PCI 472 config PCI
468 bool "PCI support" if 40x || CPM2 || PPC_83xx || PPC_85xx || PPC_86xx \ 473 bool "PCI support" if 40x || CPM2 || PPC_83xx || PPC_85xx || PPC_86xx \
469 || PPC_MPC52xx || (EMBEDDED && (PPC_PSERIES || PPC_ISERIES)) \ 474 || PPC_MPC52xx || (EMBEDDED && (PPC_PSERIES || PPC_ISERIES)) \
470 || PPC_PS3 475 || PPC_PS3
471 default y if !40x && !CPM2 && !8xx && !PPC_83xx \ 476 default y if !40x && !CPM2 && !8xx && !PPC_83xx \
472 && !PPC_85xx && !PPC_86xx 477 && !PPC_85xx && !PPC_86xx
473 default PCI_PERMEDIA if !4xx && !CPM2 && !8xx 478 default PCI_PERMEDIA if !4xx && !CPM2 && !8xx
474 default PCI_QSPAN if !4xx && !CPM2 && 8xx 479 default PCI_QSPAN if !4xx && !CPM2 && 8xx
475 select ARCH_SUPPORTS_MSI 480 select ARCH_SUPPORTS_MSI
476 help 481 help
477 Find out whether your system includes a PCI bus. PCI is the name of 482 Find out whether your system includes a PCI bus. PCI is the name of
478 a bus system, i.e. the way the CPU talks to the other stuff inside 483 a bus system, i.e. the way the CPU talks to the other stuff inside
479 your box. If you say Y here, the kernel will include drivers and 484 your box. If you say Y here, the kernel will include drivers and
480 infrastructure code to support PCI bus devices. 485 infrastructure code to support PCI bus devices.
481 486
482 config PCI_DOMAINS 487 config PCI_DOMAINS
483 def_bool PCI 488 def_bool PCI
484 489
485 config PCI_SYSCALL 490 config PCI_SYSCALL
486 def_bool PCI 491 def_bool PCI
487 492
488 config PCI_QSPAN 493 config PCI_QSPAN
489 bool "QSpan PCI" 494 bool "QSpan PCI"
490 depends on !4xx && !CPM2 && 8xx 495 depends on !4xx && !CPM2 && 8xx
491 select PPC_I8259 496 select PPC_I8259
492 help 497 help
493 Say Y here if you have a system based on a Motorola 8xx-series 498 Say Y here if you have a system based on a Motorola 8xx-series
494 embedded processor with a QSPAN PCI interface, otherwise say N. 499 embedded processor with a QSPAN PCI interface, otherwise say N.
495 500
496 config PCI_8260 501 config PCI_8260
497 bool 502 bool
498 depends on PCI && 8260 503 depends on PCI && 8260
499 select PPC_INDIRECT_PCI 504 select PPC_INDIRECT_PCI
500 default y 505 default y
501 506
502 config 8260_PCI9 507 config 8260_PCI9
503 bool "Enable workaround for MPC826x erratum PCI 9" 508 bool "Enable workaround for MPC826x erratum PCI 9"
504 depends on PCI_8260 && !8272 509 depends on PCI_8260 && !8272
505 default y 510 default y
506 511
507 choice 512 choice
508 prompt "IDMA channel for PCI 9 workaround" 513 prompt "IDMA channel for PCI 9 workaround"
509 depends on 8260_PCI9 514 depends on 8260_PCI9
510 515
511 config 8260_PCI9_IDMA1 516 config 8260_PCI9_IDMA1
512 bool "IDMA1" 517 bool "IDMA1"
513 518
514 config 8260_PCI9_IDMA2 519 config 8260_PCI9_IDMA2
515 bool "IDMA2" 520 bool "IDMA2"
516 521
517 config 8260_PCI9_IDMA3 522 config 8260_PCI9_IDMA3
518 bool "IDMA3" 523 bool "IDMA3"
519 524
520 config 8260_PCI9_IDMA4 525 config 8260_PCI9_IDMA4
521 bool "IDMA4" 526 bool "IDMA4"
522 527
523 endchoice 528 endchoice
524 529
525 source "drivers/pci/pcie/Kconfig" 530 source "drivers/pci/pcie/Kconfig"
526 531
527 source "drivers/pci/Kconfig" 532 source "drivers/pci/Kconfig"
528 533
529 source "drivers/pcmcia/Kconfig" 534 source "drivers/pcmcia/Kconfig"
530 535
531 source "drivers/pci/hotplug/Kconfig" 536 source "drivers/pci/hotplug/Kconfig"
532 537
533 endmenu 538 endmenu
534 539
535 menu "Advanced setup" 540 menu "Advanced setup"
536 depends on PPC32 541 depends on PPC32
537 542
538 config ADVANCED_OPTIONS 543 config ADVANCED_OPTIONS
539 bool "Prompt for advanced kernel configuration options" 544 bool "Prompt for advanced kernel configuration options"
540 help 545 help
541 This option will enable prompting for a variety of advanced kernel 546 This option will enable prompting for a variety of advanced kernel
542 configuration options. These options can cause the kernel to not 547 configuration options. These options can cause the kernel to not
543 work if they are set incorrectly, but can be used to optimize certain 548 work if they are set incorrectly, but can be used to optimize certain
544 aspects of kernel memory management. 549 aspects of kernel memory management.
545 550
546 Unless you know what you are doing, say N here. 551 Unless you know what you are doing, say N here.
547 552
548 comment "Default settings for advanced configuration options are used" 553 comment "Default settings for advanced configuration options are used"
549 depends on !ADVANCED_OPTIONS 554 depends on !ADVANCED_OPTIONS
550 555
551 config HIGHMEM_START_BOOL 556 config HIGHMEM_START_BOOL
552 bool "Set high memory pool address" 557 bool "Set high memory pool address"
553 depends on ADVANCED_OPTIONS && HIGHMEM 558 depends on ADVANCED_OPTIONS && HIGHMEM
554 help 559 help
555 This option allows you to set the base address of the kernel virtual 560 This option allows you to set the base address of the kernel virtual
556 area used to map high memory pages. This can be useful in 561 area used to map high memory pages. This can be useful in
557 optimizing the layout of kernel virtual memory. 562 optimizing the layout of kernel virtual memory.
558 563
559 Say N here unless you know what you are doing. 564 Say N here unless you know what you are doing.
560 565
561 config HIGHMEM_START 566 config HIGHMEM_START
562 hex "Virtual start address of high memory pool" if HIGHMEM_START_BOOL 567 hex "Virtual start address of high memory pool" if HIGHMEM_START_BOOL
563 default "0xfe000000" 568 default "0xfe000000"
564 569
565 config LOWMEM_SIZE_BOOL 570 config LOWMEM_SIZE_BOOL
566 bool "Set maximum low memory" 571 bool "Set maximum low memory"
567 depends on ADVANCED_OPTIONS 572 depends on ADVANCED_OPTIONS
568 help 573 help
569 This option allows you to set the maximum amount of memory which 574 This option allows you to set the maximum amount of memory which
570 will be used as "low memory", that is, memory which the kernel can 575 will be used as "low memory", that is, memory which the kernel can
571 access directly, without having to set up a kernel virtual mapping. 576 access directly, without having to set up a kernel virtual mapping.
572 This can be useful in optimizing the layout of kernel virtual 577 This can be useful in optimizing the layout of kernel virtual
573 memory. 578 memory.
574 579
575 Say N here unless you know what you are doing. 580 Say N here unless you know what you are doing.
576 581
577 config LOWMEM_SIZE 582 config LOWMEM_SIZE
578 hex "Maximum low memory size (in bytes)" if LOWMEM_SIZE_BOOL 583 hex "Maximum low memory size (in bytes)" if LOWMEM_SIZE_BOOL
579 default "0x30000000" 584 default "0x30000000"
580 585
581 config KERNEL_START_BOOL 586 config KERNEL_START_BOOL
582 bool "Set custom kernel base address" 587 bool "Set custom kernel base address"
583 depends on ADVANCED_OPTIONS 588 depends on ADVANCED_OPTIONS
584 help 589 help
585 This option allows you to set the kernel virtual address at which 590 This option allows you to set the kernel virtual address at which
586 the kernel will map low memory (the kernel image will be linked at 591 the kernel will map low memory (the kernel image will be linked at
587 this address). This can be useful in optimizing the virtual memory 592 this address). This can be useful in optimizing the virtual memory
588 layout of the system. 593 layout of the system.
589 594
590 Say N here unless you know what you are doing. 595 Say N here unless you know what you are doing.
591 596
592 config KERNEL_START 597 config KERNEL_START
593 hex "Virtual address of kernel base" if KERNEL_START_BOOL 598 hex "Virtual address of kernel base" if KERNEL_START_BOOL
594 default "0xc0000000" 599 default "0xc0000000"
595 600
596 config TASK_SIZE_BOOL 601 config TASK_SIZE_BOOL
597 bool "Set custom user task size" 602 bool "Set custom user task size"
598 depends on ADVANCED_OPTIONS 603 depends on ADVANCED_OPTIONS
599 help 604 help
600 This option allows you to set the amount of virtual address space 605 This option allows you to set the amount of virtual address space
601 allocated to user tasks. This can be useful in optimizing the 606 allocated to user tasks. This can be useful in optimizing the
602 virtual memory layout of the system. 607 virtual memory layout of the system.
603 608
604 Say N here unless you know what you are doing. 609 Say N here unless you know what you are doing.
605 610
606 config TASK_SIZE 611 config TASK_SIZE
607 hex "Size of user task space" if TASK_SIZE_BOOL 612 hex "Size of user task space" if TASK_SIZE_BOOL
608 default "0x80000000" if PPC_PREP || PPC_8xx 613 default "0x80000000" if PPC_PREP || PPC_8xx
609 default "0xc0000000" 614 default "0xc0000000"
610 615
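To put numbers on the defaults above: KERNEL_START and TASK_SIZE of 0xc0000000 give user tasks the low 3GB of the 32-bit address space, and LOWMEM_SIZE of 0x30000000 caps directly mapped low memory at 768MB. A standalone arithmetic check:

    #include <stdio.h>

    int main(void)
    {
            printf("user space: %lu MB\n", 0xc0000000UL >> 20);  /* 3072 MB */
            printf("low memory: %lu MB\n", 0x30000000UL >> 20);  /* 768 MB  */
            return 0;
    }
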
611 config CONSISTENT_START_BOOL 616 config CONSISTENT_START_BOOL
612 bool "Set custom consistent memory pool address" 617 bool "Set custom consistent memory pool address"
613 depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE 618 depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE
614 help 619 help
615 This option allows you to set the base virtual address 620 This option allows you to set the base virtual address
616 of the consistent memory pool. This pool of virtual 621 of the consistent memory pool. This pool of virtual
617 memory is used to make consistent memory allocations. 622 memory is used to make consistent memory allocations.
618 623
619 config CONSISTENT_START 624 config CONSISTENT_START
620 hex "Base virtual address of consistent memory pool" if CONSISTENT_START_BOOL 625 hex "Base virtual address of consistent memory pool" if CONSISTENT_START_BOOL
621 default "0xfd000000" if (NOT_COHERENT_CACHE && 8xx) 626 default "0xfd000000" if (NOT_COHERENT_CACHE && 8xx)
622 default "0xff100000" if NOT_COHERENT_CACHE 627 default "0xff100000" if NOT_COHERENT_CACHE
623 628
624 config CONSISTENT_SIZE_BOOL 629 config CONSISTENT_SIZE_BOOL
625 bool "Set custom consistent memory pool size" 630 bool "Set custom consistent memory pool size"
626 depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE 631 depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE
627 help 632 help
628 This option allows you to set the size of the 633 This option allows you to set the size of the
629 consistent memory pool. This pool of virtual memory 634 consistent memory pool. This pool of virtual memory
630 is used to make consistent memory allocations. 635 is used to make consistent memory allocations.
631 636
632 config CONSISTENT_SIZE 637 config CONSISTENT_SIZE
633 hex "Size of consistent memory pool" if CONSISTENT_SIZE_BOOL 638 hex "Size of consistent memory pool" if CONSISTENT_SIZE_BOOL
634 default "0x00200000" if NOT_COHERENT_CACHE 639 default "0x00200000" if NOT_COHERENT_CACHE
635 640
636 config BOOT_LOAD_BOOL 641 config BOOT_LOAD_BOOL
637 bool "Set the boot link/load address" 642 bool "Set the boot link/load address"
638 depends on ADVANCED_OPTIONS && !PPC_MULTIPLATFORM 643 depends on ADVANCED_OPTIONS && !PPC_MULTIPLATFORM
639 help 644 help
640 This option allows you to set the initial load address of the zImage 645 This option allows you to set the initial load address of the zImage
641 or zImage.initrd file. This can be useful if you are on a board 646 or zImage.initrd file. This can be useful if you are on a board
642 which has a small amount of memory. 647 which has a small amount of memory.
643 648
644 Say N here unless you know what you are doing. 649 Say N here unless you know what you are doing.
645 650
646 config BOOT_LOAD 651 config BOOT_LOAD
647 hex "Link/load address for booting" if BOOT_LOAD_BOOL 652 hex "Link/load address for booting" if BOOT_LOAD_BOOL
648 default "0x00400000" if 40x || 8xx || 8260 653 default "0x00400000" if 40x || 8xx || 8260
649 default "0x01000000" if 44x 654 default "0x01000000" if 44x
650 default "0x00800000" 655 default "0x00800000"
651 656
652 config PIN_TLB 657 config PIN_TLB
653 bool "Pinned Kernel TLBs (860 ONLY)" 658 bool "Pinned Kernel TLBs (860 ONLY)"
654 depends on ADVANCED_OPTIONS && 8xx 659 depends on ADVANCED_OPTIONS && 8xx
655 endmenu 660 endmenu
656 661
657 if PPC64 662 if PPC64
658 config KERNEL_START 663 config KERNEL_START
659 hex 664 hex
660 default "0xc000000000000000" 665 default "0xc000000000000000"
661 endif 666 endif
662 667
663 source "net/Kconfig" 668 source "net/Kconfig"
664 669
665 source "drivers/Kconfig" 670 source "drivers/Kconfig"
666 671
667 source "fs/Kconfig" 672 source "fs/Kconfig"
668 673
669 # XXX source "arch/ppc/8xx_io/Kconfig" 674 # XXX source "arch/ppc/8xx_io/Kconfig"
670 675
671 # XXX source "arch/ppc/8260_io/Kconfig" 676 # XXX source "arch/ppc/8260_io/Kconfig"
672 677
673 source "arch/powerpc/sysdev/qe_lib/Kconfig" 678 source "arch/powerpc/sysdev/qe_lib/Kconfig"
674 679
675 source "lib/Kconfig" 680 source "lib/Kconfig"
676 681
677 source "kernel/Kconfig.instrumentation" 682 source "kernel/Kconfig.instrumentation"
678 683
679 source "arch/powerpc/Kconfig.debug" 684 source "arch/powerpc/Kconfig.debug"
680 685
681 source "security/Kconfig" 686 source "security/Kconfig"
682 687
683 config KEYS_COMPAT 688 config KEYS_COMPAT
684 bool 689 bool
685 depends on COMPAT && KEYS 690 depends on COMPAT && KEYS
686 default y 691 default y
687 692
688 source "crypto/Kconfig" 693 source "crypto/Kconfig"
689 694
690 config PPC_CLOCK 695 config PPC_CLOCK
691 bool 696 bool
692 default n 697 default n
693 698
694 config PPC_LIB_RHEAP 699 config PPC_LIB_RHEAP
695 bool 700 bool
696 701
697 702
arch/sparc64/Kconfig
1 # $Id: config.in,v 1.158 2002/01/24 22:14:44 davem Exp $ 1 # $Id: config.in,v 1.158 2002/01/24 22:14:44 davem Exp $
2 # For a description of the syntax of this configuration file, 2 # For a description of the syntax of this configuration file,
3 # see the Configure script. 3 # see the Configure script.
4 # 4 #
5 5
6 mainmenu "Linux/UltraSPARC Kernel Configuration" 6 mainmenu "Linux/UltraSPARC Kernel Configuration"
7 7
8 config SPARC 8 config SPARC
9 bool 9 bool
10 default y 10 default y
11 11
12 config SPARC64 12 config SPARC64
13 bool 13 bool
14 default y 14 default y
15 help 15 help
16 SPARC is a family of RISC microprocessors designed and marketed by 16 SPARC is a family of RISC microprocessors designed and marketed by
17 Sun Microsystems, incorporated. This port covers the newer 64-bit 17 Sun Microsystems, incorporated. This port covers the newer 64-bit
18 UltraSPARC. The UltraLinux project maintains both the SPARC32 and 18 UltraSPARC. The UltraLinux project maintains both the SPARC32 and
19 SPARC64 ports; its web page is available at 19 SPARC64 ports; its web page is available at
20 <http://www.ultralinux.org/>. 20 <http://www.ultralinux.org/>.
21 21
22 config GENERIC_TIME 22 config GENERIC_TIME
23 bool 23 bool
24 default y 24 default y
25 25
26 config GENERIC_CMOS_UPDATE 26 config GENERIC_CMOS_UPDATE
27 bool 27 bool
28 default y 28 default y
29 29
30 config GENERIC_CLOCKEVENTS 30 config GENERIC_CLOCKEVENTS
31 bool 31 bool
32 default y 32 default y
33 33
34 config 64BIT 34 config 64BIT
35 def_bool y 35 def_bool y
36 36
37 config MMU 37 config MMU
38 bool 38 bool
39 default y 39 default y
40 40
41 config QUICKLIST 41 config QUICKLIST
42 bool 42 bool
43 default y 43 default y
44 44
45 config STACKTRACE_SUPPORT 45 config STACKTRACE_SUPPORT
46 bool 46 bool
47 default y 47 default y
48 48
49 config LOCKDEP_SUPPORT 49 config LOCKDEP_SUPPORT
50 bool 50 bool
51 default y 51 default y
52 52
53 config ARCH_MAY_HAVE_PC_FDC 53 config ARCH_MAY_HAVE_PC_FDC
54 bool 54 bool
55 default y 55 default y
56 56
57 config ARCH_HAS_ILOG2_U32 57 config ARCH_HAS_ILOG2_U32
58 bool 58 bool
59 default n 59 default n
60 60
61 config ARCH_HAS_ILOG2_U64 61 config ARCH_HAS_ILOG2_U64
62 bool 62 bool
63 default n 63 default n
64 64
65 config AUDIT_ARCH 65 config AUDIT_ARCH
66 bool 66 bool
67 default y 67 default y
68 68
69 config ARCH_NO_VIRT_TO_BUS 69 config ARCH_NO_VIRT_TO_BUS
70 def_bool y 70 def_bool y
71 71
72 config OF 72 config OF
73 def_bool y 73 def_bool y
74 74
75 config GENERIC_HARDIRQS_NO__DO_IRQ 75 config GENERIC_HARDIRQS_NO__DO_IRQ
76 bool 76 bool
77 def_bool y 77 def_bool y
78 78
79 choice 79 choice
80 prompt "Kernel page size" 80 prompt "Kernel page size"
81 default SPARC64_PAGE_SIZE_8KB 81 default SPARC64_PAGE_SIZE_8KB
82 82
83 config SPARC64_PAGE_SIZE_8KB 83 config SPARC64_PAGE_SIZE_8KB
84 bool "8KB" 84 bool "8KB"
85 help 85 help
86 This lets you select the page size of the kernel. 86 This lets you select the page size of the kernel.
87 87
88 8KB and 64KB work quite well, since Sparc ELF sections 88 8KB and 64KB work quite well, since Sparc ELF sections
89 provide for up to 64KB alignment. 89 provide for up to 64KB alignment.
90 90
91 Therefore, 512KB and 4MB are for expert hackers only. 91 Therefore, 512KB and 4MB are for expert hackers only.
92 92
93 If you don't know what to do, choose 8KB. 93 If you don't know what to do, choose 8KB.
94 94
95 config SPARC64_PAGE_SIZE_64KB 95 config SPARC64_PAGE_SIZE_64KB
96 bool "64KB" 96 bool "64KB"
97 97
98 config SPARC64_PAGE_SIZE_512KB 98 config SPARC64_PAGE_SIZE_512KB
99 bool "512KB" 99 bool "512KB"
100 100
101 config SPARC64_PAGE_SIZE_4MB 101 config SPARC64_PAGE_SIZE_4MB
102 bool "4MB" 102 bool "4MB"
103 103
104 endchoice 104 endchoice
105 105
106 config SECCOMP 106 config SECCOMP
107 bool "Enable seccomp to safely compute untrusted bytecode" 107 bool "Enable seccomp to safely compute untrusted bytecode"
108 depends on PROC_FS 108 depends on PROC_FS
109 default y 109 default y
110 help 110 help
111 This kernel feature is useful for number crunching applications 111 This kernel feature is useful for number crunching applications
112 that may need to compute untrusted bytecode during their 112 that may need to compute untrusted bytecode during their
113 execution. By using pipes or other transports made available to 113 execution. By using pipes or other transports made available to
114 the process as file descriptors supporting the read/write 114 the process as file descriptors supporting the read/write
115 syscalls, it's possible to isolate those applications in 115 syscalls, it's possible to isolate those applications in
116 their own address space using seccomp. Once seccomp is 116 their own address space using seccomp. Once seccomp is
117 enabled via /proc/<pid>/seccomp, it cannot be disabled 117 enabled via /proc/<pid>/seccomp, it cannot be disabled
118 and the task is only allowed to execute a few safe syscalls 118 and the task is only allowed to execute a few safe syscalls
119 defined by each seccomp mode. 119 defined by each seccomp mode.
120 120
121 If unsure, say Y. Only embedded systems should say N here. 121 If unsure, say Y. Only embedded systems should say N here.
122 122
123 source kernel/Kconfig.hz 123 source kernel/Kconfig.hz
124 124
125 config HOTPLUG_CPU 125 config HOTPLUG_CPU
126 bool "Support for hot-pluggable CPUs" 126 bool "Support for hot-pluggable CPUs"
127 depends on SMP 127 depends on SMP
128 select HOTPLUG 128 select HOTPLUG
129 ---help--- 129 ---help---
130 Say Y here to experiment with turning CPUs off and on. CPUs 130 Say Y here to experiment with turning CPUs off and on. CPUs
131 can be controlled through /sys/devices/system/cpu/cpu#. 131 can be controlled through /sys/devices/system/cpu/cpu#.
132 Say N if you want to disable CPU hotplug. 132 Say N if you want to disable CPU hotplug.
133 133
134 source "init/Kconfig" 134 source "init/Kconfig"
135 135
136 config SYSVIPC_COMPAT 136 config SYSVIPC_COMPAT
137 bool 137 bool
138 depends on COMPAT && SYSVIPC 138 depends on COMPAT && SYSVIPC
139 default y 139 default y
140 140
141 config GENERIC_HARDIRQS 141 config GENERIC_HARDIRQS
142 bool 142 bool
143 default y 143 default y
144 144
145 menu "General machine setup" 145 menu "General machine setup"
146 146
147 source "kernel/time/Kconfig" 147 source "kernel/time/Kconfig"
148 148
149 config SMP 149 config SMP
150 bool "Symmetric multi-processing support" 150 bool "Symmetric multi-processing support"
151 ---help--- 151 ---help---
152 This enables support for systems with more than one CPU. If you have 152 This enables support for systems with more than one CPU. If you have
153 a system with only one CPU, say N. If you have a system with more than 153 a system with only one CPU, say N. If you have a system with more than
154 one CPU, say Y. 154 one CPU, say Y.
155 155
156 If you say N here, the kernel will run on single and multiprocessor 156 If you say N here, the kernel will run on single and multiprocessor
157 machines, but will use only one CPU of a multiprocessor machine. If 157 machines, but will use only one CPU of a multiprocessor machine. If
158 you say Y here, the kernel will run on many, but not all, 158 you say Y here, the kernel will run on many, but not all,
159 singleprocessor machines. On a singleprocessor machine, the kernel 159 singleprocessor machines. On a singleprocessor machine, the kernel
160 will run faster if you say N here. 160 will run faster if you say N here.
161 161
162 People using multiprocessor machines who say Y here should also say 162 People using multiprocessor machines who say Y here should also say
163 Y to "Enhanced Real Time Clock Support", below. The "Advanced Power 163 Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
164 Management" code will be disabled if you say Y here. 164 Management" code will be disabled if you say Y here.
165 165
166 See also the <file:Documentation/smp.txt>, 166 See also the <file:Documentation/smp.txt>,
167 <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at 167 <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
168 <http://www.tldp.org/docs.html#howto>. 168 <http://www.tldp.org/docs.html#howto>.
169 169
170 If you don't know what to do here, say N. 170 If you don't know what to do here, say N.
171 171
172 config NR_CPUS 172 config NR_CPUS
173 int "Maximum number of CPUs (2-1024)" 173 int "Maximum number of CPUs (2-1024)"
174 range 2 1024 174 range 2 1024
175 depends on SMP 175 depends on SMP
176 default "64" 176 default "64"
177 177
178 source "drivers/cpufreq/Kconfig" 178 source "drivers/cpufreq/Kconfig"
179 179
180 config US3_FREQ 180 config US3_FREQ
181 tristate "UltraSPARC-III CPU Frequency driver" 181 tristate "UltraSPARC-III CPU Frequency driver"
182 depends on CPU_FREQ 182 depends on CPU_FREQ
183 select CPU_FREQ_TABLE 183 select CPU_FREQ_TABLE
184 help 184 help
185 This adds the CPUFreq driver for UltraSPARC-III processors. 185 This adds the CPUFreq driver for UltraSPARC-III processors.
186 186
187 For details, take a look at <file:Documentation/cpu-freq>. 187 For details, take a look at <file:Documentation/cpu-freq>.
188 188
189 If in doubt, say N. 189 If in doubt, say N.
190 190
191 config US2E_FREQ 191 config US2E_FREQ
192 tristate "UltraSPARC-IIe CPU Frequency driver" 192 tristate "UltraSPARC-IIe CPU Frequency driver"
193 depends on CPU_FREQ 193 depends on CPU_FREQ
194 select CPU_FREQ_TABLE 194 select CPU_FREQ_TABLE
195 help 195 help
196 This adds the CPUFreq driver for UltraSPARC-IIe processors. 196 This adds the CPUFreq driver for UltraSPARC-IIe processors.
197 197
198 For details, take a look at <file:Documentation/cpu-freq>. 198 For details, take a look at <file:Documentation/cpu-freq>.
199 199
200 If in doubt, say N. 200 If in doubt, say N.
201 201
202 # Global things across all Sun machines. 202 # Global things across all Sun machines.
203 config GENERIC_LOCKBREAK
204 bool
205 default y
206 depends on SMP && PREEMPT
207
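The hunk above is the per-architecture opt-in this commit adds for sparc64: GENERIC_LOCKBREAK is only y on SMP && PREEMPT builds, so only preemptible SMP kernels keep the extra break_lock state, while other builds read contention out of the lock word itself. The following is a minimal standalone sketch of how that dispatch is meant to look; every toy_* name and the specific contention test are invented for illustration and are not the kernel's identifiers or implementation.

/* Hedged sketch: how the two configurations are meant to diverge.
 * toy_* names are stand-ins; only the #ifdef symbols correspond to
 * the options shown in this diff. */
typedef struct {
	volatile int locked;		/* the lock word itself */
#ifdef CONFIG_GENERIC_LOCKBREAK
	int break_lock;			/* extra flag, set by spinning waiters */
#endif
} toy_spinlock_t;

static int toy_raw_spin_is_contended(toy_spinlock_t *l)
{
	/* Assumption: the architecture can tell from the lock word that
	 * other CPUs are waiting; the real test is arch-specific. */
	return l->locked > 1;
}

static int toy_spin_is_contended(toy_spinlock_t *l)
{
#ifdef CONFIG_GENERIC_LOCKBREAK
	return l->break_lock;			/* old scheme: side flag */
#else
	return toy_raw_spin_is_contended(l);	/* new scheme: lock data */
#endif
}

static int toy_spin_needbreak(toy_spinlock_t *l)
{
#ifdef CONFIG_PREEMPT
	return toy_spin_is_contended(l);
#else
	return 0;				/* never voluntarily break */
#endif
}

Read this way, the non-generic branch needs no extra storage in the lock, which is the size saving the new option is guarding.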
203 config RWSEM_GENERIC_SPINLOCK 208 config RWSEM_GENERIC_SPINLOCK
204 bool 209 bool
205 210
206 config RWSEM_XCHGADD_ALGORITHM 211 config RWSEM_XCHGADD_ALGORITHM
207 bool 212 bool
208 default y 213 default y
209 214
210 config GENERIC_FIND_NEXT_BIT 215 config GENERIC_FIND_NEXT_BIT
211 bool 216 bool
212 default y 217 default y
213 218
214 config GENERIC_HWEIGHT 219 config GENERIC_HWEIGHT
215 bool 220 bool
216 default y if !ULTRA_HAS_POPULATION_COUNT 221 default y if !ULTRA_HAS_POPULATION_COUNT
217 222
218 config GENERIC_CALIBRATE_DELAY 223 config GENERIC_CALIBRATE_DELAY
219 bool 224 bool
220 default y 225 default y
221 226
222 choice 227 choice
223 prompt "SPARC64 Huge TLB Page Size" 228 prompt "SPARC64 Huge TLB Page Size"
224 depends on HUGETLB_PAGE 229 depends on HUGETLB_PAGE
225 default HUGETLB_PAGE_SIZE_4MB 230 default HUGETLB_PAGE_SIZE_4MB
226 231
227 config HUGETLB_PAGE_SIZE_4MB 232 config HUGETLB_PAGE_SIZE_4MB
228 bool "4MB" 233 bool "4MB"
229 234
230 config HUGETLB_PAGE_SIZE_512K 235 config HUGETLB_PAGE_SIZE_512K
231 depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB 236 depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB
232 bool "512K" 237 bool "512K"
233 238
234 config HUGETLB_PAGE_SIZE_64K 239 config HUGETLB_PAGE_SIZE_64K
235 depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB && !SPARC64_PAGE_SIZE_64KB 240 depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB && !SPARC64_PAGE_SIZE_64KB
236 bool "64K" 241 bool "64K"
237 242
238 endchoice 243 endchoice
239 244
240 endmenu 245 endmenu
241 246
242 config ARCH_SELECT_MEMORY_MODEL 247 config ARCH_SELECT_MEMORY_MODEL
243 def_bool y 248 def_bool y
244 249
245 config ARCH_SPARSEMEM_ENABLE 250 config ARCH_SPARSEMEM_ENABLE
246 def_bool y 251 def_bool y
247 select SPARSEMEM_VMEMMAP_ENABLE 252 select SPARSEMEM_VMEMMAP_ENABLE
248 253
249 config ARCH_SPARSEMEM_DEFAULT 254 config ARCH_SPARSEMEM_DEFAULT
250 def_bool y 255 def_bool y
251 256
252 source "mm/Kconfig" 257 source "mm/Kconfig"
253 258
254 config ISA 259 config ISA
255 bool 260 bool
256 help 261 help
257 Find out whether you have ISA slots on your motherboard. ISA is the 262 Find out whether you have ISA slots on your motherboard. ISA is the
258 name of a bus system, i.e. the way the CPU talks to the other stuff 263 name of a bus system, i.e. the way the CPU talks to the other stuff
259 inside your box. Other bus systems are PCI, EISA, MicroChannel 264 inside your box. Other bus systems are PCI, EISA, MicroChannel
260 (MCA) or VESA. ISA is an older system, now being displaced by PCI; 265 (MCA) or VESA. ISA is an older system, now being displaced by PCI;
261 newer boards don't support it. If you have ISA, say Y, otherwise N. 266 newer boards don't support it. If you have ISA, say Y, otherwise N.
262 267
263 config ISAPNP 268 config ISAPNP
264 bool 269 bool
265 help 270 help
266 Say Y here if you would like support for ISA Plug and Play devices. 271 Say Y here if you would like support for ISA Plug and Play devices.
267 Some information is in <file:Documentation/isapnp.txt>. 272 Some information is in <file:Documentation/isapnp.txt>.
268 273
269 To compile this driver as a module, choose M here: the 274 To compile this driver as a module, choose M here: the
270 module will be called isapnp. 275 module will be called isapnp.
271 276
272 If unsure, say Y. 277 If unsure, say Y.
273 278
274 config EISA 279 config EISA
275 bool 280 bool
276 ---help--- 281 ---help---
277 The Extended Industry Standard Architecture (EISA) bus was 282 The Extended Industry Standard Architecture (EISA) bus was
278 developed as an open alternative to the IBM MicroChannel bus. 283 developed as an open alternative to the IBM MicroChannel bus.
279 284
280 The EISA bus provided some of the features of the IBM MicroChannel 285 The EISA bus provided some of the features of the IBM MicroChannel
281 bus while maintaining backward compatibility with cards made for 286 bus while maintaining backward compatibility with cards made for
282 the older ISA bus. The EISA bus saw limited use between 1988 and 287 the older ISA bus. The EISA bus saw limited use between 1988 and
283 1995 when it was made obsolete by the PCI bus. 288 1995 when it was made obsolete by the PCI bus.
284 289
285 Say Y here if you are building a kernel for an EISA-based machine. 290 Say Y here if you are building a kernel for an EISA-based machine.
286 291
287 Otherwise, say N. 292 Otherwise, say N.
288 293
289 config MCA 294 config MCA
290 bool 295 bool
291 help 296 help
292 MicroChannel Architecture is found in some IBM PS/2 machines and 297 MicroChannel Architecture is found in some IBM PS/2 machines and
293 laptops. It is a bus system similar to PCI or ISA. See 298 laptops. It is a bus system similar to PCI or ISA. See
294 <file:Documentation/mca.txt> (and especially the web page given 299 <file:Documentation/mca.txt> (and especially the web page given
295 there) before attempting to build an MCA bus kernel. 300 there) before attempting to build an MCA bus kernel.
296 301
297 config PCMCIA 302 config PCMCIA
298 tristate 303 tristate
299 ---help--- 304 ---help---
300 Say Y here if you want to attach PCMCIA- or PC-cards to your Linux 305 Say Y here if you want to attach PCMCIA- or PC-cards to your Linux
301 computer. These are credit-card size devices such as network cards, 306 computer. These are credit-card size devices such as network cards,
302 modems or hard drives often used with laptop computers. There are 307 modems or hard drives often used with laptop computers. There are
303 actually two varieties of these cards: the older 16 bit PCMCIA cards 308 actually two varieties of these cards: the older 16 bit PCMCIA cards
304 and the newer 32 bit CardBus cards. If you want to use CardBus 309 and the newer 32 bit CardBus cards. If you want to use CardBus
305 cards, you need to say Y here and also to "CardBus support" below. 310 cards, you need to say Y here and also to "CardBus support" below.
306 311
307 To use your PC-cards, you will need supporting software from David 312 To use your PC-cards, you will need supporting software from David
308 Hinds' pcmcia-cs package (see the file <file:Documentation/Changes> 313 Hinds' pcmcia-cs package (see the file <file:Documentation/Changes>
309 for location). Please also read the PCMCIA-HOWTO, available from 314 for location). Please also read the PCMCIA-HOWTO, available from
310 <http://www.tldp.org/docs.html#howto>. 315 <http://www.tldp.org/docs.html#howto>.
311 316
312 To compile this driver as modules, choose M here: the 317 To compile this driver as modules, choose M here: the
313 modules will be called pcmcia_core and ds. 318 modules will be called pcmcia_core and ds.
314 319
315 config SBUS 320 config SBUS
316 bool 321 bool
317 default y 322 default y
318 323
319 config SBUSCHAR 324 config SBUSCHAR
320 bool 325 bool
321 default y 326 default y
322 327
323 config SUN_AUXIO 328 config SUN_AUXIO
324 bool 329 bool
325 default y 330 default y
326 331
327 config SUN_IO 332 config SUN_IO
328 bool 333 bool
329 default y 334 default y
330 335
331 config SUN_LDOMS 336 config SUN_LDOMS
332 bool "Sun Logical Domains support" 337 bool "Sun Logical Domains support"
333 help 338 help
334 Say Y here if you want to support virtual devices via 339 Say Y here if you want to support virtual devices via
335 Logical Domains. 340 Logical Domains.
336 341
337 config PCI 342 config PCI
338 bool "PCI support" 343 bool "PCI support"
339 select ARCH_SUPPORTS_MSI 344 select ARCH_SUPPORTS_MSI
340 help 345 help
341 Find out whether you have a PCI motherboard. PCI is the name of a 346 Find out whether you have a PCI motherboard. PCI is the name of a
342 bus system, i.e. the way the CPU talks to the other stuff inside 347 bus system, i.e. the way the CPU talks to the other stuff inside
343 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or 348 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
344 VESA. If you have PCI, say Y, otherwise N. 349 VESA. If you have PCI, say Y, otherwise N.
345 350
346 The PCI-HOWTO, available from 351 The PCI-HOWTO, available from
347 <http://www.tldp.org/docs.html#howto>, contains valuable 352 <http://www.tldp.org/docs.html#howto>, contains valuable
348 information about which PCI hardware does work under Linux and which 353 information about which PCI hardware does work under Linux and which
349 doesn't. 354 doesn't.
350 355
351 config PCI_DOMAINS 356 config PCI_DOMAINS
352 def_bool PCI 357 def_bool PCI
353 358
354 config PCI_SYSCALL 359 config PCI_SYSCALL
355 def_bool PCI 360 def_bool PCI
356 361
357 source "drivers/pci/Kconfig" 362 source "drivers/pci/Kconfig"
358 363
359 config SUN_OPENPROMFS 364 config SUN_OPENPROMFS
360 tristate "Openprom tree appears in /proc/openprom" 365 tristate "Openprom tree appears in /proc/openprom"
361 help 366 help
362 If you say Y, the OpenPROM device tree will be available as a 367 If you say Y, the OpenPROM device tree will be available as a
363 virtual file system, which you can mount to /proc/openprom by "mount 368 virtual file system, which you can mount to /proc/openprom by "mount
364 -t openpromfs none /proc/openprom". 369 -t openpromfs none /proc/openprom".
365 370
366 To compile the /proc/openprom support as a module, choose M here: the 371 To compile the /proc/openprom support as a module, choose M here: the
367 module will be called openpromfs. If unsure, choose M. 372 module will be called openpromfs. If unsure, choose M.
368 373
369 config SPARC32_COMPAT 374 config SPARC32_COMPAT
370 bool "Kernel support for Linux/Sparc 32bit binary compatibility" 375 bool "Kernel support for Linux/Sparc 32bit binary compatibility"
371 help 376 help
372 This allows you to run 32-bit binaries on your Ultra. 377 This allows you to run 32-bit binaries on your Ultra.
373 Everybody wants this; say Y. 378 Everybody wants this; say Y.
374 379
375 config COMPAT 380 config COMPAT
376 bool 381 bool
377 depends on SPARC32_COMPAT 382 depends on SPARC32_COMPAT
378 default y 383 default y
379 384
380 config BINFMT_ELF32 385 config BINFMT_ELF32
381 bool "Kernel support for 32-bit ELF binaries" 386 bool "Kernel support for 32-bit ELF binaries"
382 depends on SPARC32_COMPAT 387 depends on SPARC32_COMPAT
383 help 388 help
384 This allows you to run 32-bit Linux/ELF binaries on your Ultra. 389 This allows you to run 32-bit Linux/ELF binaries on your Ultra.
385 Everybody wants this; say Y. 390 Everybody wants this; say Y.
386 391
387 config BINFMT_AOUT32 392 config BINFMT_AOUT32
388 bool "Kernel support for 32-bit (ie. SunOS) a.out binaries" 393 bool "Kernel support for 32-bit (ie. SunOS) a.out binaries"
389 depends on SPARC32_COMPAT 394 depends on SPARC32_COMPAT
390 help 395 help
391 This allows you to run 32-bit a.out format binaries on your Ultra. 396 This allows you to run 32-bit a.out format binaries on your Ultra.
392 If you want to run SunOS binaries (see SunOS binary emulation below) 397 If you want to run SunOS binaries (see SunOS binary emulation below)
393 or other a.out binaries, say Y. If unsure, say N. 398 or other a.out binaries, say Y. If unsure, say N.
394 399
395 menu "Executable file formats" 400 menu "Executable file formats"
396 401
397 source "fs/Kconfig.binfmt" 402 source "fs/Kconfig.binfmt"
398 403
399 config SUNOS_EMUL 404 config SUNOS_EMUL
400 bool "SunOS binary emulation" 405 bool "SunOS binary emulation"
401 depends on BINFMT_AOUT32 406 depends on BINFMT_AOUT32
402 help 407 help
403 This allows you to run most SunOS binaries. If you want to do this, 408 This allows you to run most SunOS binaries. If you want to do this,
404 say Y here and place appropriate files in /usr/gnemul/sunos. See 409 say Y here and place appropriate files in /usr/gnemul/sunos. See
405 <http://www.ultralinux.org/faq.html> for more information. If you 410 <http://www.ultralinux.org/faq.html> for more information. If you
406 want to run SunOS binaries on an Ultra you must also say Y to 411 want to run SunOS binaries on an Ultra you must also say Y to
407 "Kernel support for 32-bit a.out binaries" above. 412 "Kernel support for 32-bit a.out binaries" above.
408 413
409 config SOLARIS_EMUL 414 config SOLARIS_EMUL
410 tristate "Solaris binary emulation (EXPERIMENTAL)" 415 tristate "Solaris binary emulation (EXPERIMENTAL)"
411 depends on SPARC32_COMPAT && NET && EXPERIMENTAL 416 depends on SPARC32_COMPAT && NET && EXPERIMENTAL
412 help 417 help
413 This is experimental code which will enable you to run (many) 418 This is experimental code which will enable you to run (many)
414 Solaris binaries on your SPARC Linux machine. 419 Solaris binaries on your SPARC Linux machine.
415 420
416 To compile this code as a module, choose M here: the 421 To compile this code as a module, choose M here: the
417 module will be called solaris. 422 module will be called solaris.
418 423
419 endmenu 424 endmenu
420 425
421 config SCHED_SMT 426 config SCHED_SMT
422 bool "SMT (Hyperthreading) scheduler support" 427 bool "SMT (Hyperthreading) scheduler support"
423 depends on SMP 428 depends on SMP
424 default y 429 default y
425 help 430 help
426 SMT scheduler support improves the CPU scheduler's decision making 431 SMT scheduler support improves the CPU scheduler's decision making
427 when dealing with UltraSPARC cpus at a cost of slightly increased 432 when dealing with UltraSPARC cpus at a cost of slightly increased
428 overhead in some places. If unsure say N here. 433 overhead in some places. If unsure say N here.
429 434
430 config SCHED_MC 435 config SCHED_MC
431 bool "Multi-core scheduler support" 436 bool "Multi-core scheduler support"
432 depends on SMP 437 depends on SMP
433 default y 438 default y
434 help 439 help
435 Multi-core scheduler support improves the CPU scheduler's decision 440 Multi-core scheduler support improves the CPU scheduler's decision
436 making when dealing with multi-core CPU chips at a cost of slightly 441 making when dealing with multi-core CPU chips at a cost of slightly
437 increased overhead in some places. If unsure say N here. 442 increased overhead in some places. If unsure say N here.
438 443
439 source "kernel/Kconfig.preempt" 444 source "kernel/Kconfig.preempt"
440 445
441 config CMDLINE_BOOL 446 config CMDLINE_BOOL
442 bool "Default bootloader kernel arguments" 447 bool "Default bootloader kernel arguments"
443 448
444 config CMDLINE 449 config CMDLINE
445 string "Initial kernel command string" 450 string "Initial kernel command string"
446 depends on CMDLINE_BOOL 451 depends on CMDLINE_BOOL
447 default "console=ttyS0,9600 root=/dev/sda1" 452 default "console=ttyS0,9600 root=/dev/sda1"
448 help 453 help
449 Say Y here if you want to be able to pass default arguments to 454 Say Y here if you want to be able to pass default arguments to
450 the kernel. This will be overridden by the bootloader, if you 455 the kernel. This will be overridden by the bootloader, if you
451 use one (such as SILO). This is most useful if you want to boot 456 use one (such as SILO). This is most useful if you want to boot
452 a kernel from TFTP, and want default options to be available 457 a kernel from TFTP, and want default options to be available
453 without having them passed on the command line. 458 without having them passed on the command line.
454 459
455 NOTE: This option WILL override the PROM bootargs setting! 460 NOTE: This option WILL override the PROM bootargs setting!
456 461
457 source "net/Kconfig" 462 source "net/Kconfig"
458 463
459 source "drivers/Kconfig" 464 source "drivers/Kconfig"
460 465
461 source "drivers/sbus/char/Kconfig" 466 source "drivers/sbus/char/Kconfig"
462 467
463 source "fs/Kconfig" 468 source "fs/Kconfig"
464 469
465 source "kernel/Kconfig.instrumentation" 470 source "kernel/Kconfig.instrumentation"
466 471
467 source "arch/sparc64/Kconfig.debug" 472 source "arch/sparc64/Kconfig.debug"
468 473
469 source "security/Kconfig" 474 source "security/Kconfig"
470 475
471 source "crypto/Kconfig" 476 source "crypto/Kconfig"
472 477
473 source "lib/Kconfig" 478 source "lib/Kconfig"
474 479
arch/x86/Kconfig
1 # x86 configuration 1 # x86 configuration
2 mainmenu "Linux Kernel Configuration for x86" 2 mainmenu "Linux Kernel Configuration for x86"
3 3
4 # Select 32 or 64 bit 4 # Select 32 or 64 bit
5 config 64BIT 5 config 64BIT
6 bool "64-bit kernel" if ARCH = "x86" 6 bool "64-bit kernel" if ARCH = "x86"
7 default ARCH = "x86_64" 7 default ARCH = "x86_64"
8 help 8 help
9 Say yes to build a 64-bit kernel - formerly known as x86_64 9 Say yes to build a 64-bit kernel - formerly known as x86_64
10 Say no to build a 32-bit kernel - formerly known as i386 10 Say no to build a 32-bit kernel - formerly known as i386
11 11
12 config X86_32 12 config X86_32
13 def_bool !64BIT 13 def_bool !64BIT
14 14
15 config X86_64 15 config X86_64
16 def_bool 64BIT 16 def_bool 64BIT
17 17
18 ### Arch settings 18 ### Arch settings
19 config X86 19 config X86
20 def_bool y 20 def_bool y
21 21
22 config GENERIC_LOCKBREAK
23 def_bool y
24 depends on SMP && PREEMPT
25
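This is the same opt-in for x86, expressed with def_bool. For context, here is a hedged sketch of the caller-side pattern such a "needbreak" check feeds: briefly dropping and retaking a hot lock when a waiter is detected, instead of converting the whole section to trylock polling. All toy_* helpers below are invented for illustration and are not the kernel API.

/* Hedged sketch of the consumer pattern around a contention check. */
#include <stdatomic.h>

typedef struct { atomic_flag f; } toy_lock_t;

static void toy_lock(toy_lock_t *l)   { while (atomic_flag_test_and_set(&l->f)) ; }
static void toy_unlock(toy_lock_t *l) { atomic_flag_clear(&l->f); }

/* Stand-in for the contention test; a real build would consult the lock's
 * contention state (see the sketch after the sparc64 hunk above). */
static int toy_needbreak(toy_lock_t *l) { (void)l; return 0; }

static void process_all(toy_lock_t *l, int nitems)
{
	toy_lock(l);
	for (int i = 0; i < nitems; i++) {
		/* ... do one item's worth of work under the lock ... */
		if (toy_needbreak(l)) {
			toy_unlock(l);	/* give a waiter a chance to run */
			toy_lock(l);
		}
	}
	toy_unlock(l);
}

In a real build the break decision would come from the contention check rather than a constant 0; the stub only keeps the sketch self-contained.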
22 config GENERIC_TIME 26 config GENERIC_TIME
23 def_bool y 27 def_bool y
24 28
25 config GENERIC_CMOS_UPDATE 29 config GENERIC_CMOS_UPDATE
26 def_bool y 30 def_bool y
27 31
28 config CLOCKSOURCE_WATCHDOG 32 config CLOCKSOURCE_WATCHDOG
29 def_bool y 33 def_bool y
30 34
31 config GENERIC_CLOCKEVENTS 35 config GENERIC_CLOCKEVENTS
32 def_bool y 36 def_bool y
33 37
34 config GENERIC_CLOCKEVENTS_BROADCAST 38 config GENERIC_CLOCKEVENTS_BROADCAST
35 def_bool y 39 def_bool y
36 depends on X86_64 || (X86_32 && X86_LOCAL_APIC) 40 depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
37 41
38 config LOCKDEP_SUPPORT 42 config LOCKDEP_SUPPORT
39 def_bool y 43 def_bool y
40 44
41 config STACKTRACE_SUPPORT 45 config STACKTRACE_SUPPORT
42 def_bool y 46 def_bool y
43 47
44 config SEMAPHORE_SLEEPERS 48 config SEMAPHORE_SLEEPERS
45 def_bool y 49 def_bool y
46 50
47 config MMU 51 config MMU
48 def_bool y 52 def_bool y
49 53
50 config ZONE_DMA 54 config ZONE_DMA
51 def_bool y 55 def_bool y
52 56
53 config QUICKLIST 57 config QUICKLIST
54 def_bool X86_32 58 def_bool X86_32
55 59
56 config SBUS 60 config SBUS
57 bool 61 bool
58 62
59 config GENERIC_ISA_DMA 63 config GENERIC_ISA_DMA
60 def_bool y 64 def_bool y
61 65
62 config GENERIC_IOMAP 66 config GENERIC_IOMAP
63 def_bool y 67 def_bool y
64 68
65 config GENERIC_BUG 69 config GENERIC_BUG
66 def_bool y 70 def_bool y
67 depends on BUG 71 depends on BUG
68 72
69 config GENERIC_HWEIGHT 73 config GENERIC_HWEIGHT
70 def_bool y 74 def_bool y
71 75
72 config ARCH_MAY_HAVE_PC_FDC 76 config ARCH_MAY_HAVE_PC_FDC
73 def_bool y 77 def_bool y
74 78
75 config DMI 79 config DMI
76 def_bool y 80 def_bool y
77 81
78 config RWSEM_GENERIC_SPINLOCK 82 config RWSEM_GENERIC_SPINLOCK
79 def_bool !X86_XADD 83 def_bool !X86_XADD
80 84
81 config RWSEM_XCHGADD_ALGORITHM 85 config RWSEM_XCHGADD_ALGORITHM
82 def_bool X86_XADD 86 def_bool X86_XADD
83 87
84 config ARCH_HAS_ILOG2_U32 88 config ARCH_HAS_ILOG2_U32
85 def_bool n 89 def_bool n
86 90
87 config ARCH_HAS_ILOG2_U64 91 config ARCH_HAS_ILOG2_U64
88 def_bool n 92 def_bool n
89 93
90 config GENERIC_CALIBRATE_DELAY 94 config GENERIC_CALIBRATE_DELAY
91 def_bool y 95 def_bool y
92 96
93 config GENERIC_TIME_VSYSCALL 97 config GENERIC_TIME_VSYSCALL
94 bool 98 bool
95 default X86_64 99 default X86_64
96 100
97 config ARCH_SUPPORTS_OPROFILE 101 config ARCH_SUPPORTS_OPROFILE
98 bool 102 bool
99 default y 103 default y
100 104
101 105
102 config ZONE_DMA32 106 config ZONE_DMA32
103 bool 107 bool
104 default X86_64 108 default X86_64
105 109
106 config ARCH_POPULATES_NODE_MAP 110 config ARCH_POPULATES_NODE_MAP
107 def_bool y 111 def_bool y
108 112
109 config AUDIT_ARCH 113 config AUDIT_ARCH
110 bool 114 bool
111 default X86_64 115 default X86_64
112 116
113 # Use the generic interrupt handling code in kernel/irq/: 117 # Use the generic interrupt handling code in kernel/irq/:
114 config GENERIC_HARDIRQS 118 config GENERIC_HARDIRQS
115 bool 119 bool
116 default y 120 default y
117 121
118 config GENERIC_IRQ_PROBE 122 config GENERIC_IRQ_PROBE
119 bool 123 bool
120 default y 124 default y
121 125
122 config GENERIC_PENDING_IRQ 126 config GENERIC_PENDING_IRQ
123 bool 127 bool
124 depends on GENERIC_HARDIRQS && SMP 128 depends on GENERIC_HARDIRQS && SMP
125 default y 129 default y
126 130
127 config X86_SMP 131 config X86_SMP
128 bool 132 bool
129 depends on X86_32 && SMP && !X86_VOYAGER 133 depends on X86_32 && SMP && !X86_VOYAGER
130 default y 134 default y
131 135
132 config X86_HT 136 config X86_HT
133 bool 137 bool
134 depends on SMP 138 depends on SMP
135 depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || (X86_64 && !MK8) 139 depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || (X86_64 && !MK8)
136 default y 140 default y
137 141
138 config X86_BIOS_REBOOT 142 config X86_BIOS_REBOOT
139 bool 143 bool
140 depends on X86_32 && !(X86_VISWS || X86_VOYAGER) 144 depends on X86_32 && !(X86_VISWS || X86_VOYAGER)
141 default y 145 default y
142 146
143 config X86_TRAMPOLINE 147 config X86_TRAMPOLINE
144 bool 148 bool
145 depends on X86_SMP || (X86_VOYAGER && SMP) 149 depends on X86_SMP || (X86_VOYAGER && SMP)
146 default y 150 default y
147 151
148 config KTIME_SCALAR 152 config KTIME_SCALAR
149 def_bool X86_32 153 def_bool X86_32
150 source "init/Kconfig" 154 source "init/Kconfig"
151 155
152 menu "Processor type and features" 156 menu "Processor type and features"
153 157
154 source "kernel/time/Kconfig" 158 source "kernel/time/Kconfig"
155 159
156 config SMP 160 config SMP
157 bool "Symmetric multi-processing support" 161 bool "Symmetric multi-processing support"
158 ---help--- 162 ---help---
159 This enables support for systems with more than one CPU. If you have 163 This enables support for systems with more than one CPU. If you have
160 a system with only one CPU, like most personal computers, say N. If 164 a system with only one CPU, like most personal computers, say N. If
161 you have a system with more than one CPU, say Y. 165 you have a system with more than one CPU, say Y.
162 166
163 If you say N here, the kernel will run on single and multiprocessor 167 If you say N here, the kernel will run on single and multiprocessor
164 machines, but will use only one CPU of a multiprocessor machine. If 168 machines, but will use only one CPU of a multiprocessor machine. If
165 you say Y here, the kernel will run on many, but not all, 169 you say Y here, the kernel will run on many, but not all,
166 singleprocessor machines. On a singleprocessor machine, the kernel 170 singleprocessor machines. On a singleprocessor machine, the kernel
167 will run faster if you say N here. 171 will run faster if you say N here.
168 172
169 Note that if you say Y here and choose architecture "586" or 173 Note that if you say Y here and choose architecture "586" or
170 "Pentium" under "Processor family", the kernel will not work on 486 174 "Pentium" under "Processor family", the kernel will not work on 486
171 architectures. Similarly, multiprocessor kernels for the "PPro" 175 architectures. Similarly, multiprocessor kernels for the "PPro"
172 architecture may not work on all Pentium based boards. 176 architecture may not work on all Pentium based boards.
173 177
174 People using multiprocessor machines who say Y here should also say 178 People using multiprocessor machines who say Y here should also say
175 Y to "Enhanced Real Time Clock Support", below. The "Advanced Power 179 Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
176 Management" code will be disabled if you say Y here. 180 Management" code will be disabled if you say Y here.
177 181
178 See also the <file:Documentation/smp.txt>, 182 See also the <file:Documentation/smp.txt>,
179 <file:Documentation/i386/IO-APIC.txt>, 183 <file:Documentation/i386/IO-APIC.txt>,
180 <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at 184 <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
181 <http://www.tldp.org/docs.html#howto>. 185 <http://www.tldp.org/docs.html#howto>.
182 186
183 If you don't know what to do here, say N. 187 If you don't know what to do here, say N.
184 188
185 choice 189 choice
186 prompt "Subarchitecture Type" 190 prompt "Subarchitecture Type"
187 default X86_PC 191 default X86_PC
188 192
189 config X86_PC 193 config X86_PC
190 bool "PC-compatible" 194 bool "PC-compatible"
191 help 195 help
192 Choose this option if your computer is a standard PC or compatible. 196 Choose this option if your computer is a standard PC or compatible.
193 197
194 config X86_ELAN 198 config X86_ELAN
195 bool "AMD Elan" 199 bool "AMD Elan"
196 depends on X86_32 200 depends on X86_32
197 help 201 help
198 Select this for an AMD Elan processor. 202 Select this for an AMD Elan processor.
199 203
200 Do not use this option for K6/Athlon/Opteron processors! 204 Do not use this option for K6/Athlon/Opteron processors!
201 205
202 If unsure, choose "PC-compatible" instead. 206 If unsure, choose "PC-compatible" instead.
203 207
204 config X86_VOYAGER 208 config X86_VOYAGER
205 bool "Voyager (NCR)" 209 bool "Voyager (NCR)"
206 depends on X86_32 210 depends on X86_32
207 select SMP if !BROKEN 211 select SMP if !BROKEN
208 help 212 help
209 Voyager is an MCA-based 32-way capable SMP architecture proprietary 213 Voyager is an MCA-based 32-way capable SMP architecture proprietary
210 to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based. 214 to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based.
211 215
212 *** WARNING *** 216 *** WARNING ***
213 217
214 If you do not specifically know you have a Voyager based machine, 218 If you do not specifically know you have a Voyager based machine,
215 say N here, otherwise the kernel you build will not be bootable. 219 say N here, otherwise the kernel you build will not be bootable.
216 220
217 config X86_NUMAQ 221 config X86_NUMAQ
218 bool "NUMAQ (IBM/Sequent)" 222 bool "NUMAQ (IBM/Sequent)"
219 select SMP 223 select SMP
220 select NUMA 224 select NUMA
221 depends on X86_32 225 depends on X86_32
222 help 226 help
223 This option is used for getting Linux to run on a (IBM/Sequent) NUMA 227 This option is used for getting Linux to run on a (IBM/Sequent) NUMA
224 multiquad box. This changes the way that processors are bootstrapped, 228 multiquad box. This changes the way that processors are bootstrapped,
225 and uses Clustered Logical APIC addressing mode instead of Flat Logical. 229 and uses Clustered Logical APIC addressing mode instead of Flat Logical.
226 You will need a new lynxer.elf file to flash your firmware with - send 230 You will need a new lynxer.elf file to flash your firmware with - send
227 email to <Martin.Bligh@us.ibm.com>. 231 email to <Martin.Bligh@us.ibm.com>.
228 232
229 config X86_SUMMIT 233 config X86_SUMMIT
230 bool "Summit/EXA (IBM x440)" 234 bool "Summit/EXA (IBM x440)"
231 depends on X86_32 && SMP 235 depends on X86_32 && SMP
232 help 236 help
233 This option is needed for IBM systems that use the Summit/EXA chipset. 237 This option is needed for IBM systems that use the Summit/EXA chipset.
234 In particular, it is needed for the x440. 238 In particular, it is needed for the x440.
235 239
236 If you don't have one of these computers, you should say N here. 240 If you don't have one of these computers, you should say N here.
237 If you want to build a NUMA kernel, you must select ACPI. 241 If you want to build a NUMA kernel, you must select ACPI.
238 242
239 config X86_BIGSMP 243 config X86_BIGSMP
240 bool "Support for other sub-arch SMP systems with more than 8 CPUs" 244 bool "Support for other sub-arch SMP systems with more than 8 CPUs"
241 depends on X86_32 && SMP 245 depends on X86_32 && SMP
242 help 246 help
243 This option is needed for the systems that have more than 8 CPUs 247 This option is needed for the systems that have more than 8 CPUs
244 and if the system is not of any sub-arch type above. 248 and if the system is not of any sub-arch type above.
245 249
246 If you don't have such a system, you should say N here. 250 If you don't have such a system, you should say N here.
247 251
248 config X86_VISWS 252 config X86_VISWS
249 bool "SGI 320/540 (Visual Workstation)" 253 bool "SGI 320/540 (Visual Workstation)"
250 depends on X86_32 254 depends on X86_32
251 help 255 help
252 The SGI Visual Workstation series is an IA32-based workstation 256 The SGI Visual Workstation series is an IA32-based workstation
253 based on SGI systems chips with some legacy PC hardware attached. 257 based on SGI systems chips with some legacy PC hardware attached.
254 258
255 Say Y here to create a kernel to run on the SGI 320 or 540. 259 Say Y here to create a kernel to run on the SGI 320 or 540.
256 260
257 A kernel compiled for the Visual Workstation will not run on PCs 261 A kernel compiled for the Visual Workstation will not run on PCs
258 and vice versa. See <file:Documentation/sgi-visws.txt> for details. 262 and vice versa. See <file:Documentation/sgi-visws.txt> for details.
259 263
260 config X86_GENERICARCH 264 config X86_GENERICARCH
261 bool "Generic architecture (Summit, bigsmp, ES7000, default)" 265 bool "Generic architecture (Summit, bigsmp, ES7000, default)"
262 depends on X86_32 266 depends on X86_32
263 help 267 help
264 This option compiles in the Summit, bigsmp, ES7000, default subarchitectures. 268 This option compiles in the Summit, bigsmp, ES7000, default subarchitectures.
265 It is intended for a generic binary kernel. 269 It is intended for a generic binary kernel.
266 If you want a NUMA kernel, select ACPI. We need SRAT for NUMA. 270 If you want a NUMA kernel, select ACPI. We need SRAT for NUMA.
267 271
268 config X86_ES7000 272 config X86_ES7000
269 bool "Support for Unisys ES7000 IA32 series" 273 bool "Support for Unisys ES7000 IA32 series"
270 depends on X86_32 && SMP 274 depends on X86_32 && SMP
271 help 275 help
272 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is 276 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
273 supposed to run on an IA32-based Unisys ES7000 system. 277 supposed to run on an IA32-based Unisys ES7000 system.
274 Only choose this option if you have such a system, otherwise you 278 Only choose this option if you have such a system, otherwise you
275 should say N here. 279 should say N here.
276 280
277 config X86_VSMP 281 config X86_VSMP
278 bool "Support for ScaleMP vSMP" 282 bool "Support for ScaleMP vSMP"
279 depends on X86_64 && PCI 283 depends on X86_64 && PCI
280 help 284 help
281 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is 285 Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
282 supposed to run on these EM64T-based machines. Only choose this option 286 supposed to run on these EM64T-based machines. Only choose this option
283 if you have one of these machines. 287 if you have one of these machines.
284 288
285 endchoice 289 endchoice
286 290
287 config SCHED_NO_NO_OMIT_FRAME_POINTER 291 config SCHED_NO_NO_OMIT_FRAME_POINTER
288 def_bool y 292 def_bool y
289 prompt "Single-depth WCHAN output" 293 prompt "Single-depth WCHAN output"
290 depends on X86_32 294 depends on X86_32
291 help 295 help
292 Calculate simpler /proc/<PID>/wchan values. If this option 296 Calculate simpler /proc/<PID>/wchan values. If this option
293 is disabled then wchan values will recurse back to the 297 is disabled then wchan values will recurse back to the
294 caller function. This provides more accurate wchan values, 298 caller function. This provides more accurate wchan values,
295 at the expense of slightly more scheduling overhead. 299 at the expense of slightly more scheduling overhead.
296 300
297 If in doubt, say "Y". 301 If in doubt, say "Y".
298 302
299 config PARAVIRT 303 config PARAVIRT
300 bool 304 bool
301 depends on X86_32 && !(X86_VISWS || X86_VOYAGER) 305 depends on X86_32 && !(X86_VISWS || X86_VOYAGER)
302 help 306 help
303 This changes the kernel so it can modify itself when it is run 307 This changes the kernel so it can modify itself when it is run
304 under a hypervisor, potentially improving performance significantly 308 under a hypervisor, potentially improving performance significantly
305 over full virtualization. However, when run without a hypervisor 309 over full virtualization. However, when run without a hypervisor
306 the kernel is theoretically slower and slightly larger. 310 the kernel is theoretically slower and slightly larger.
307 311
308 menuconfig PARAVIRT_GUEST 312 menuconfig PARAVIRT_GUEST
309 bool "Paravirtualized guest support" 313 bool "Paravirtualized guest support"
310 depends on X86_32 314 depends on X86_32
311 help 315 help
312 Say Y here to get to see options related to running Linux under 316 Say Y here to get to see options related to running Linux under
313 various hypervisors. This option alone does not add any kernel code. 317 various hypervisors. This option alone does not add any kernel code.
314 318
315 If you say N, all options in this submenu will be skipped and disabled. 319 If you say N, all options in this submenu will be skipped and disabled.
316 320
317 if PARAVIRT_GUEST 321 if PARAVIRT_GUEST
318 322
319 source "arch/x86/xen/Kconfig" 323 source "arch/x86/xen/Kconfig"
320 324
321 config VMI 325 config VMI
322 bool "VMI Guest support" 326 bool "VMI Guest support"
323 select PARAVIRT 327 select PARAVIRT
324 depends on !(X86_VISWS || X86_VOYAGER) 328 depends on !(X86_VISWS || X86_VOYAGER)
325 help 329 help
326 VMI provides a paravirtualized interface to the VMware ESX server 330 VMI provides a paravirtualized interface to the VMware ESX server
327 (it could be used by other hypervisors in theory too, but is not 331 (it could be used by other hypervisors in theory too, but is not
328 at the moment), by linking the kernel to a GPL-ed ROM module 332 at the moment), by linking the kernel to a GPL-ed ROM module
329 provided by the hypervisor. 333 provided by the hypervisor.
330 334
331 source "arch/x86/lguest/Kconfig" 335 source "arch/x86/lguest/Kconfig"
332 336
333 endif 337 endif
334 338
335 config ACPI_SRAT 339 config ACPI_SRAT
336 def_bool y 340 def_bool y
337 depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) 341 depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH)
338 select ACPI_NUMA 342 select ACPI_NUMA
339 343
340 config HAVE_ARCH_PARSE_SRAT 344 config HAVE_ARCH_PARSE_SRAT
341 def_bool y 345 def_bool y
342 depends on ACPI_SRAT 346 depends on ACPI_SRAT
343 347
344 config X86_SUMMIT_NUMA 348 config X86_SUMMIT_NUMA
345 def_bool y 349 def_bool y
346 depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH) 350 depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH)
347 351
348 config X86_CYCLONE_TIMER 352 config X86_CYCLONE_TIMER
349 def_bool y 353 def_bool y
350 depends on X86_32 && X86_SUMMIT || X86_GENERICARCH 354 depends on X86_32 && X86_SUMMIT || X86_GENERICARCH
351 355
352 config ES7000_CLUSTERED_APIC 356 config ES7000_CLUSTERED_APIC
353 def_bool y 357 def_bool y
354 depends on SMP && X86_ES7000 && MPENTIUMIII 358 depends on SMP && X86_ES7000 && MPENTIUMIII
355 359
356 source "arch/x86/Kconfig.cpu" 360 source "arch/x86/Kconfig.cpu"
357 361
358 config HPET_TIMER 362 config HPET_TIMER
359 def_bool X86_64 363 def_bool X86_64
360 prompt "HPET Timer Support" if X86_32 364 prompt "HPET Timer Support" if X86_32
361 help 365 help
362 Use the IA-PC HPET (High Precision Event Timer) to manage 366 Use the IA-PC HPET (High Precision Event Timer) to manage
363 time in preference to the PIT and RTC, if a HPET is 367 time in preference to the PIT and RTC, if a HPET is
364 present. 368 present.
365 HPET is the next generation timer replacing legacy 8254s. 369 HPET is the next generation timer replacing legacy 8254s.
366 The HPET provides a stable time base on SMP 370 The HPET provides a stable time base on SMP
367 systems, unlike the TSC, but it is more expensive to access, 371 systems, unlike the TSC, but it is more expensive to access,
368 as it is off-chip. You can find the HPET spec at 372 as it is off-chip. You can find the HPET spec at
369 <http://www.intel.com/hardwaredesign/hpetspec.htm>. 373 <http://www.intel.com/hardwaredesign/hpetspec.htm>.
370 374
371 You can safely choose Y here. However, HPET will only be 375 You can safely choose Y here. However, HPET will only be
372 activated if the platform and the BIOS support this feature. 376 activated if the platform and the BIOS support this feature.
373 Otherwise the 8254 will be used for timing services. 377 Otherwise the 8254 will be used for timing services.
374 378
375 Choose N to continue using the legacy 8254 timer. 379 Choose N to continue using the legacy 8254 timer.
376 380
377 config HPET_EMULATE_RTC 381 config HPET_EMULATE_RTC
378 def_bool y 382 def_bool y
379 depends on HPET_TIMER && RTC=y 383 depends on HPET_TIMER && RTC=y
380 384
381 # Mark as embedded because too many people got it wrong. 385 # Mark as embedded because too many people got it wrong.
382 # The code disables itself when not needed. 386 # The code disables itself when not needed.
383 config GART_IOMMU 387 config GART_IOMMU
384 bool "GART IOMMU support" if EMBEDDED 388 bool "GART IOMMU support" if EMBEDDED
385 default y 389 default y
386 select SWIOTLB 390 select SWIOTLB
387 select AGP 391 select AGP
388 depends on X86_64 && PCI 392 depends on X86_64 && PCI
389 help 393 help
390 Support for full DMA access of devices with 32bit memory access only 394 Support for full DMA access of devices with 32bit memory access only
391 on systems with more than 3GB. This is usually needed for USB, 395 on systems with more than 3GB. This is usually needed for USB,
392 sound, many IDE/SATA chipsets and some other devices. 396 sound, many IDE/SATA chipsets and some other devices.
393 Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART 397 Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
394 based hardware IOMMU and a software bounce buffer based IOMMU used 398 based hardware IOMMU and a software bounce buffer based IOMMU used
395 on Intel systems and as fallback. 399 on Intel systems and as fallback.
396 The code is only active when needed (enough memory and limited 400 The code is only active when needed (enough memory and limited
397 device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified 401 device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
398 too. 402 too.
399 403
400 config CALGARY_IOMMU 404 config CALGARY_IOMMU
401 bool "IBM Calgary IOMMU support" 405 bool "IBM Calgary IOMMU support"
402 select SWIOTLB 406 select SWIOTLB
403 depends on X86_64 && PCI && EXPERIMENTAL 407 depends on X86_64 && PCI && EXPERIMENTAL
404 help 408 help
405 Support for hardware IOMMUs in IBM's xSeries x366 and x460 409 Support for hardware IOMMUs in IBM's xSeries x366 and x460
406 systems. Needed to run systems with more than 3GB of memory 410 systems. Needed to run systems with more than 3GB of memory
407 properly with 32-bit PCI devices that do not support DAC 411 properly with 32-bit PCI devices that do not support DAC
408 (Double Address Cycle). Calgary also supports bus level 412 (Double Address Cycle). Calgary also supports bus level
409 isolation, where all DMAs pass through the IOMMU. This 413 isolation, where all DMAs pass through the IOMMU. This
410 prevents them from going anywhere except their intended 414 prevents them from going anywhere except their intended
411 destination. This catches hard-to-find kernel bugs and 415 destination. This catches hard-to-find kernel bugs and
412 mis-behaving drivers and devices that do not use the DMA-API 416 mis-behaving drivers and devices that do not use the DMA-API
413 properly to set up their DMA buffers. The IOMMU can be 417 properly to set up their DMA buffers. The IOMMU can be
414 turned off at boot time with the iommu=off parameter. 418 turned off at boot time with the iommu=off parameter.
415 Normally the kernel will make the right choice by itself. 419 Normally the kernel will make the right choice by itself.
416 If unsure, say Y. 420 If unsure, say Y.
417 421
418 config CALGARY_IOMMU_ENABLED_BY_DEFAULT 422 config CALGARY_IOMMU_ENABLED_BY_DEFAULT
419 def_bool y 423 def_bool y
420 prompt "Should Calgary be enabled by default?" 424 prompt "Should Calgary be enabled by default?"
421 depends on CALGARY_IOMMU 425 depends on CALGARY_IOMMU
422 help 426 help
423 Should Calgary be enabled by default? If you choose 'y', Calgary 427 Should Calgary be enabled by default? If you choose 'y', Calgary
424 will be used (if it exists). If you choose 'n', Calgary will not be 428 will be used (if it exists). If you choose 'n', Calgary will not be
425 used even if it exists. If you choose 'n' and would like to use 429 used even if it exists. If you choose 'n' and would like to use
426 Calgary anyway, pass 'iommu=calgary' on the kernel command line. 430 Calgary anyway, pass 'iommu=calgary' on the kernel command line.
427 If unsure, say Y. 431 If unsure, say Y.
428 432
429 # need this always selected by IOMMU for the VIA workaround 433 # need this always selected by IOMMU for the VIA workaround
430 config SWIOTLB 434 config SWIOTLB
431 bool 435 bool
432 help 436 help
433 Support for software bounce buffers used on x86-64 systems 437 Support for software bounce buffers used on x86-64 systems
434 which don't have a hardware IOMMU (e.g. the current generation 438 which don't have a hardware IOMMU (e.g. the current generation
435 of Intel's x86-64 CPUs). Using this, PCI devices which can only 439 of Intel's x86-64 CPUs). Using this, PCI devices which can only
436 access 32-bits of memory can be used on systems with more than 440 access 32-bits of memory can be used on systems with more than
437 3 GB of memory. If unsure, say Y. 441 3 GB of memory. If unsure, say Y.
438 442
439 443
440 config NR_CPUS 444 config NR_CPUS
441 int "Maximum number of CPUs (2-255)" 445 int "Maximum number of CPUs (2-255)"
442 range 2 255 446 range 2 255
443 depends on SMP 447 depends on SMP
444 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 448 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
445 default "8" 449 default "8"
446 help 450 help
447 This allows you to specify the maximum number of CPUs which this 451 This allows you to specify the maximum number of CPUs which this
448 kernel will support. The maximum supported value is 255 and the 452 kernel will support. The maximum supported value is 255 and the
449 minimum value which makes sense is 2. 453 minimum value which makes sense is 2.
450 454
451 This is purely to save memory - each supported CPU adds 455 This is purely to save memory - each supported CPU adds
452 approximately eight kilobytes to the kernel image. 456 approximately eight kilobytes to the kernel image.
453 457
454 config SCHED_SMT 458 config SCHED_SMT
455 bool "SMT (Hyperthreading) scheduler support" 459 bool "SMT (Hyperthreading) scheduler support"
456 depends on (X86_64 && SMP) || (X86_32 && X86_HT) 460 depends on (X86_64 && SMP) || (X86_32 && X86_HT)
457 help 461 help
458 SMT scheduler support improves the CPU scheduler's decision making 462 SMT scheduler support improves the CPU scheduler's decision making
459 when dealing with Intel Pentium 4 chips with HyperThreading at a 463 when dealing with Intel Pentium 4 chips with HyperThreading at a
460 cost of slightly increased overhead in some places. If unsure say 464 cost of slightly increased overhead in some places. If unsure say
461 N here. 465 N here.
462 466
463 config SCHED_MC 467 config SCHED_MC
464 def_bool y 468 def_bool y
465 prompt "Multi-core scheduler support" 469 prompt "Multi-core scheduler support"
466 depends on (X86_64 && SMP) || (X86_32 && X86_HT) 470 depends on (X86_64 && SMP) || (X86_32 && X86_HT)
467 help 471 help
468 Multi-core scheduler support improves the CPU scheduler's decision 472 Multi-core scheduler support improves the CPU scheduler's decision
469 making when dealing with multi-core CPU chips at a cost of slightly 473 making when dealing with multi-core CPU chips at a cost of slightly
470 increased overhead in some places. If unsure say N here. 474 increased overhead in some places. If unsure say N here.
471 475
472 source "kernel/Kconfig.preempt" 476 source "kernel/Kconfig.preempt"
473 477
474 config X86_UP_APIC 478 config X86_UP_APIC
475 bool "Local APIC support on uniprocessors" 479 bool "Local APIC support on uniprocessors"
476 depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) 480 depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH)
477 help 481 help
478 A local APIC (Advanced Programmable Interrupt Controller) is an 482 A local APIC (Advanced Programmable Interrupt Controller) is an
479 integrated interrupt controller in the CPU. If you have a single-CPU 483 integrated interrupt controller in the CPU. If you have a single-CPU
480 system which has a processor with a local APIC, you can say Y here to 484 system which has a processor with a local APIC, you can say Y here to
481 enable and use it. If you say Y here even though your machine doesn't 485 enable and use it. If you say Y here even though your machine doesn't
482 have a local APIC, then the kernel will still run with no slowdown at 486 have a local APIC, then the kernel will still run with no slowdown at
483 all. The local APIC supports CPU-generated self-interrupts (timer, 487 all. The local APIC supports CPU-generated self-interrupts (timer,
484 performance counters), and the NMI watchdog which detects hard 488 performance counters), and the NMI watchdog which detects hard
485 lockups. 489 lockups.
486 490
487 config X86_UP_IOAPIC 491 config X86_UP_IOAPIC
488 bool "IO-APIC support on uniprocessors" 492 bool "IO-APIC support on uniprocessors"
489 depends on X86_UP_APIC 493 depends on X86_UP_APIC
490 help 494 help
491 An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an 495 An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
492 SMP-capable replacement for PC-style interrupt controllers. Most 496 SMP-capable replacement for PC-style interrupt controllers. Most
493 SMP systems and many recent uniprocessor systems have one. 497 SMP systems and many recent uniprocessor systems have one.
494 498
495 If you have a single-CPU system with an IO-APIC, you can say Y here 499 If you have a single-CPU system with an IO-APIC, you can say Y here
496 to use it. If you say Y here even though your machine doesn't have 500 to use it. If you say Y here even though your machine doesn't have
497 an IO-APIC, then the kernel will still run with no slowdown at all. 501 an IO-APIC, then the kernel will still run with no slowdown at all.
498 502
499 config X86_LOCAL_APIC 503 config X86_LOCAL_APIC
500 def_bool y 504 def_bool y
501 depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) 505 depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH))
502 506
503 config X86_IO_APIC 507 config X86_IO_APIC
504 def_bool y 508 def_bool y
505 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) 509 depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH))
506 510
507 config X86_VISWS_APIC 511 config X86_VISWS_APIC
508 def_bool y 512 def_bool y
509 depends on X86_32 && X86_VISWS 513 depends on X86_32 && X86_VISWS
510 514
511 config X86_MCE 515 config X86_MCE
512 bool "Machine Check Exception" 516 bool "Machine Check Exception"
513 depends on !X86_VOYAGER 517 depends on !X86_VOYAGER
514 ---help--- 518 ---help---
515 Machine Check Exception support allows the processor to notify the 519 Machine Check Exception support allows the processor to notify the
516 kernel if it detects a problem (e.g. overheating, component failure). 520 kernel if it detects a problem (e.g. overheating, component failure).
517 The action the kernel takes depends on the severity of the problem, 521 The action the kernel takes depends on the severity of the problem,
518 ranging from a warning message on the console, to halting the machine. 522 ranging from a warning message on the console, to halting the machine.
519 Your processor must be a Pentium or newer to support this - check the 523 Your processor must be a Pentium or newer to support this - check the
520 flags in /proc/cpuinfo for mce. Note that some older Pentium systems 524 flags in /proc/cpuinfo for mce. Note that some older Pentium systems
521 have a design flaw which leads to false MCE events - hence MCE is 525 have a design flaw which leads to false MCE events - hence MCE is
522 disabled on all P5 processors, unless explicitly enabled with "mce" 526 disabled on all P5 processors, unless explicitly enabled with "mce"
523 as a boot argument. Similarly, if MCE is built in and creates a 527 as a boot argument. Similarly, if MCE is built in and creates a
524 problem on some new non-standard machine, you can boot with "nomce" 528 problem on some new non-standard machine, you can boot with "nomce"
525 to disable it. MCE support simply ignores non-MCE processors like 529 to disable it. MCE support simply ignores non-MCE processors like
526 the 386 and 486, so nearly everyone can say Y here. 530 the 386 and 486, so nearly everyone can say Y here.
527 531
528 config X86_MCE_INTEL 532 config X86_MCE_INTEL
529 def_bool y 533 def_bool y
530 prompt "Intel MCE features" 534 prompt "Intel MCE features"
531 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 535 depends on X86_64 && X86_MCE && X86_LOCAL_APIC
532 help 536 help
533 Additional support for Intel-specific MCE features such as 537 Additional support for Intel-specific MCE features such as
534 the thermal monitor. 538 the thermal monitor.
535 539
536 config X86_MCE_AMD 540 config X86_MCE_AMD
537 def_bool y 541 def_bool y
538 prompt "AMD MCE features" 542 prompt "AMD MCE features"
539 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 543 depends on X86_64 && X86_MCE && X86_LOCAL_APIC
540 help 544 help
541 Additional support for AMD-specific MCE features such as 545 Additional support for AMD-specific MCE features such as
542 the DRAM Error Threshold. 546 the DRAM Error Threshold.
543 547
544 config X86_MCE_NONFATAL 548 config X86_MCE_NONFATAL
545 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" 549 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
546 depends on X86_32 && X86_MCE 550 depends on X86_32 && X86_MCE
547 help 551 help
548 Enabling this feature starts a timer that checks the machine check 552 Enabling this feature starts a timer that checks the machine check
549 registers every 5 seconds to see if anything happened. 553 registers every 5 seconds to see if anything happened.
550 Non-fatal problems automatically get corrected (but still logged). 554 Non-fatal problems automatically get corrected (but still logged).
551 Disable this if you don't want to see these messages. 555 Disable this if you don't want to see these messages.
552 Seeing the messages this option prints out may be indicative of dying 556 Seeing the messages this option prints out may be indicative of dying
553 or out-of-spec (i.e., overclocked) hardware. 557 or out-of-spec (i.e., overclocked) hardware.
554 This option only does something on certain CPUs. 558 This option only does something on certain CPUs.
555 (AMD Athlon/Duron and Intel Pentium 4) 559 (AMD Athlon/Duron and Intel Pentium 4)
556 560
557 config X86_MCE_P4THERMAL 561 config X86_MCE_P4THERMAL
558 bool "Check for P4 thermal throttling interrupt" 562 bool "Check for P4 thermal throttling interrupt"
559 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS 563 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS
560 help 564 help
561 Enabling this feature will cause a message to be printed when the P4 565 Enabling this feature will cause a message to be printed when the P4
562 enters thermal throttling. 566 enters thermal throttling.
563 567
564 config VM86 568 config VM86
565 bool "Enable VM86 support" if EMBEDDED 569 bool "Enable VM86 support" if EMBEDDED
566 default y 570 default y
567 depends on X86_32 571 depends on X86_32
568 help 572 help
569 This option is required by programs like DOSEMU to run 16-bit legacy 573 This option is required by programs like DOSEMU to run 16-bit legacy
570 code on X86 processors. It also may be needed by software like 574 code on X86 processors. It also may be needed by software like
571 XFree86 to initialize some video cards via BIOS. Disabling this 575 XFree86 to initialize some video cards via BIOS. Disabling this
572 option saves about 6k. 576 option saves about 6k.
573 577
574 config TOSHIBA 578 config TOSHIBA
575 tristate "Toshiba Laptop support" 579 tristate "Toshiba Laptop support"
576 depends on X86_32 580 depends on X86_32
577 ---help--- 581 ---help---
578 This adds a driver to safely access the System Management Mode of 582 This adds a driver to safely access the System Management Mode of
579 the CPU on Toshiba portables with a genuine Toshiba BIOS. It does 583 the CPU on Toshiba portables with a genuine Toshiba BIOS. It does
580 not work on models with a Phoenix BIOS. The System Management Mode 584 not work on models with a Phoenix BIOS. The System Management Mode
581 is used to set the BIOS and power saving options on Toshiba portables. 585 is used to set the BIOS and power saving options on Toshiba portables.
582 586
583 For information on utilities to make use of this driver see the 587 For information on utilities to make use of this driver see the
584 Toshiba Linux utilities web site at: 588 Toshiba Linux utilities web site at:
585 <http://www.buzzard.org.uk/toshiba/>. 589 <http://www.buzzard.org.uk/toshiba/>.
586 590
587 Say Y if you intend to run this kernel on a Toshiba portable. 591 Say Y if you intend to run this kernel on a Toshiba portable.
588 Say N otherwise. 592 Say N otherwise.
589 593
590 config I8K 594 config I8K
591 tristate "Dell laptop support" 595 tristate "Dell laptop support"
592 depends on X86_32 596 depends on X86_32
593 ---help--- 597 ---help---
594 This adds a driver to safely access the System Management Mode 598 This adds a driver to safely access the System Management Mode
595 of the CPU on the Dell Inspiron 8000. The System Management Mode 599 of the CPU on the Dell Inspiron 8000. The System Management Mode
596 is used to read the CPU temperature and cooling fan status and to 600 is used to read the CPU temperature and cooling fan status and to
597 control the fans on the I8K portables. 601 control the fans on the I8K portables.
598 602
599 This driver has been tested only on the Inspiron 8000 but it may 603 This driver has been tested only on the Inspiron 8000 but it may
600 also work with other Dell laptops. You can force loading on other 604 also work with other Dell laptops. You can force loading on other
601 models by passing the parameter `force=1' to the module. Use at 605 models by passing the parameter `force=1' to the module. Use at
602 your own risk. 606 your own risk.
603 607
604 For information on utilities to make use of this driver see the 608 For information on utilities to make use of this driver see the
605 I8K Linux utilities web site at: 609 I8K Linux utilities web site at:
606 <http://people.debian.org/~dz/i8k/> 610 <http://people.debian.org/~dz/i8k/>
607 611
608 Say Y if you intend to run this kernel on a Dell Inspiron 8000. 612 Say Y if you intend to run this kernel on a Dell Inspiron 8000.
609 Say N otherwise. 613 Say N otherwise.
610 614
611 config X86_REBOOTFIXUPS 615 config X86_REBOOTFIXUPS
612 def_bool n 616 def_bool n
613 prompt "Enable X86 board specific fixups for reboot" 617 prompt "Enable X86 board specific fixups for reboot"
614 depends on X86_32 && X86 618 depends on X86_32 && X86
615 ---help--- 619 ---help---
616 This enables chipset and/or board-specific fixups to be done 620 This enables chipset and/or board-specific fixups to be done
617 in order to get reboot to work correctly. This is only needed on 621 in order to get reboot to work correctly. This is only needed on
618 some combinations of hardware and BIOS. The symptom this config 622 some combinations of hardware and BIOS. The symptom this config
619 addresses is a reboot that ends with a stalled/hung 623 addresses is a reboot that ends with a stalled/hung
620 system. 624 system.
621 625
622 Currently, the only fixup is for the Geode machines using 626 Currently, the only fixup is for the Geode machines using
623 CS5530A and CS5536 chipsets. 627 CS5530A and CS5536 chipsets.
624 628
625 Say Y if you want to enable the fixup. Currently, it's safe to 629 Say Y if you want to enable the fixup. Currently, it's safe to
626 enable this option even if you don't need it. 630 enable this option even if you don't need it.
627 Say N otherwise. 631 Say N otherwise.
628 632
629 config MICROCODE 633 config MICROCODE
630 tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" 634 tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
631 select FW_LOADER 635 select FW_LOADER
632 ---help--- 636 ---help---
633 If you say Y here, you will be able to update the microcode on 637 If you say Y here, you will be able to update the microcode on
634 Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, 638 Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II,
635 Pentium III, Pentium 4, Xeon etc. You will obviously need the 639 Pentium III, Pentium 4, Xeon etc. You will obviously need the
636 actual microcode binary data itself which is not shipped with the 640 actual microcode binary data itself which is not shipped with the
637 Linux kernel. 641 Linux kernel.
638 642
639 For latest news and information on obtaining all the required 643 For latest news and information on obtaining all the required
640 ingredients for this driver, check: 644 ingredients for this driver, check:
641 <http://www.urbanmyth.org/microcode/>. 645 <http://www.urbanmyth.org/microcode/>.
642 646
643 To compile this driver as a module, choose M here: the 647 To compile this driver as a module, choose M here: the
644 module will be called microcode. 648 module will be called microcode.
645 649
646 config MICROCODE_OLD_INTERFACE 650 config MICROCODE_OLD_INTERFACE
647 def_bool y 651 def_bool y
648 depends on MICROCODE 652 depends on MICROCODE
649 653
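Editor's note, not part of this diff: with MICROCODE_OLD_INTERFACE, user space uploads an update by write()ing the data to /dev/cpu/microcode. The input path below is hypothetical and the file must already be in the binary record format the driver expects; normally a tool such as microcode_ctl performs this step. A minimal sketch under those assumptions:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
        struct stat st;
        char *buf;
        int in = open("/tmp/microcode.bin", O_RDONLY);  /* hypothetical path */
        int dev = open("/dev/cpu/microcode", O_WRONLY);

        if (in < 0 || dev < 0 || fstat(in, &st) < 0) {
            perror("open");
            return 1;
        }
        buf = malloc(st.st_size);
        if (!buf || read(in, buf, st.st_size) != st.st_size) {
            perror("read");
            return 1;
        }
        /* Hand the whole update image to the driver in one write(). */
        if (write(dev, buf, st.st_size) != st.st_size) {
            perror("write /dev/cpu/microcode");
            return 1;
        }
        free(buf);
        close(dev);
        close(in);
        return 0;
    }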
650 config X86_MSR 654 config X86_MSR
651 tristate "/dev/cpu/*/msr - Model-specific register support" 655 tristate "/dev/cpu/*/msr - Model-specific register support"
652 help 656 help
653 This device gives privileged processes access to the x86 657 This device gives privileged processes access to the x86
654 Model-Specific Registers (MSRs). It is a character device with 658 Model-Specific Registers (MSRs). It is a character device with
655 major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. 659 major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
656 MSR accesses are directed to a specific CPU on multi-processor 660 MSR accesses are directed to a specific CPU on multi-processor
657 systems. 661 systems.
658 662
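Editor's sketch, not part of this diff: reading a register through the /dev/cpu/*/msr device described above. The file offset selects the MSR and each read returns its 64-bit value; the MSR number used here is only an example, and the program needs root.

    #include <fcntl.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        uint64_t val;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0) {
            perror("open /dev/cpu/0/msr");
            return 1;
        }
        /* 0x10 is IA32_TIME_STAMP_COUNTER; any readable MSR would do. */
        if (pread(fd, &val, sizeof(val), 0x10) != sizeof(val)) {
            perror("pread");
            return 1;
        }
        printf("MSR 0x10 on CPU 0 = 0x%016" PRIx64 "\n", val);
        close(fd);
        return 0;
    }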
659 config X86_CPUID 663 config X86_CPUID
660 tristate "/dev/cpu/*/cpuid - CPU information support" 664 tristate "/dev/cpu/*/cpuid - CPU information support"
661 help 665 help
662 This device gives processes access to the x86 CPUID instruction to 666 This device gives processes access to the x86 CPUID instruction to
663 be executed on a specific processor. It is a character device 667 be executed on a specific processor. It is a character device
664 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to 668 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
665 /dev/cpu/31/cpuid. 669 /dev/cpu/31/cpuid.
666 670
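A companion sketch for /dev/cpu/*/cpuid (again, not part of this diff): the file offset selects the CPUID level and each 16-byte read returns EAX, EBX, ECX and EDX. Leaf 0 is used here only because its EBX/EDX/ECX contents form the vendor string.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        uint32_t regs[4];   /* EAX, EBX, ECX, EDX */
        char vendor[13];
        int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

        if (fd < 0) {
            perror("open /dev/cpu/0/cpuid");
            return 1;
        }
        if (pread(fd, regs, sizeof(regs), 0) != sizeof(regs)) {
            perror("pread");
            return 1;
        }
        /* The vendor string lives in EBX, EDX, ECX, in that order. */
        memcpy(vendor, &regs[1], 4);
        memcpy(vendor + 4, &regs[3], 4);
        memcpy(vendor + 8, &regs[2], 4);
        vendor[12] = '\0';
        printf("CPU 0 vendor: %s (max basic leaf %u)\n",
               vendor, (unsigned)regs[0]);
        close(fd);
        return 0;
    }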
667 choice 671 choice
668 prompt "High Memory Support" 672 prompt "High Memory Support"
669 default HIGHMEM4G if !X86_NUMAQ 673 default HIGHMEM4G if !X86_NUMAQ
670 default HIGHMEM64G if X86_NUMAQ 674 default HIGHMEM64G if X86_NUMAQ
671 depends on X86_32 675 depends on X86_32
672 676
673 config NOHIGHMEM 677 config NOHIGHMEM
674 bool "off" 678 bool "off"
675 depends on !X86_NUMAQ 679 depends on !X86_NUMAQ
676 ---help--- 680 ---help---
677 Linux can use up to 64 Gigabytes of physical memory on x86 systems. 681 Linux can use up to 64 Gigabytes of physical memory on x86 systems.
678 However, the address space of 32-bit x86 processors is only 4 682 However, the address space of 32-bit x86 processors is only 4
679 Gigabytes. That means that, if you have a large amount of 683 Gigabytes. That means that, if you have a large amount of
680 physical memory, not all of it can be "permanently mapped" by the 684 physical memory, not all of it can be "permanently mapped" by the
681 kernel. The physical memory that's not permanently mapped is called 685 kernel. The physical memory that's not permanently mapped is called
682 "high memory". 686 "high memory".
683 687
684 If you are compiling a kernel which will never run on a machine with 688 If you are compiling a kernel which will never run on a machine with
685 more than 1 Gigabyte total physical RAM, answer "off" here (default 689 more than 1 Gigabyte total physical RAM, answer "off" here (default
686 choice and suitable for most users). This will result in a "3GB/1GB" 690 choice and suitable for most users). This will result in a "3GB/1GB"
687 split: 3GB are mapped so that each process sees a 3GB virtual memory 691 split: 3GB are mapped so that each process sees a 3GB virtual memory
688 space and the remaining part of the 4GB virtual memory space is used 692 space and the remaining part of the 4GB virtual memory space is used
689 by the kernel to permanently map as much physical memory as 693 by the kernel to permanently map as much physical memory as
690 possible. 694 possible.
691 695
692 If the machine has between 1 and 4 Gigabytes of physical RAM, then 696 If the machine has between 1 and 4 Gigabytes of physical RAM, then
693 answer "4GB" here. 697 answer "4GB" here.
694 698
695 If more than 4 Gigabytes is used then answer "64GB" here. This 699 If more than 4 Gigabytes is used then answer "64GB" here. This
696 selection turns Intel PAE (Physical Address Extension) mode on. 700 selection turns Intel PAE (Physical Address Extension) mode on.
697 PAE implements 3-level paging on IA32 processors. PAE is fully 701 PAE implements 3-level paging on IA32 processors. PAE is fully
698 supported by Linux, PAE mode is implemented on all recent Intel 702 supported by Linux, PAE mode is implemented on all recent Intel
699 processors (Pentium Pro and better). NOTE: If you say "64GB" here, 703 processors (Pentium Pro and better). NOTE: If you say "64GB" here,
700 then the kernel will not boot on CPUs that don't support PAE! 704 then the kernel will not boot on CPUs that don't support PAE!
701 705
702 The actual amount of total physical memory will either be 706 The actual amount of total physical memory will either be
703 auto detected or can be forced by using a kernel command line option 707 auto detected or can be forced by using a kernel command line option
704 such as "mem=256M". (Try "man bootparam" or see the documentation of 708 such as "mem=256M". (Try "man bootparam" or see the documentation of
705 your boot loader (lilo or loadlin) about how to pass options to the 709 your boot loader (lilo or loadlin) about how to pass options to the
706 kernel at boot time.) 710 kernel at boot time.)
707 711
708 If unsure, say "off". 712 If unsure, say "off".
709 713
710 config HIGHMEM4G 714 config HIGHMEM4G
711 bool "4GB" 715 bool "4GB"
712 depends on !X86_NUMAQ 716 depends on !X86_NUMAQ
713 help 717 help
714 Select this if you have a 32-bit processor and between 1 and 4 718 Select this if you have a 32-bit processor and between 1 and 4
715 gigabytes of physical RAM. 719 gigabytes of physical RAM.
716 720
717 config HIGHMEM64G 721 config HIGHMEM64G
718 bool "64GB" 722 bool "64GB"
719 depends on !M386 && !M486 723 depends on !M386 && !M486
720 select X86_PAE 724 select X86_PAE
721 help 725 help
722 Select this if you have a 32-bit processor and more than 4 726 Select this if you have a 32-bit processor and more than 4
723 gigabytes of physical RAM. 727 gigabytes of physical RAM.
724 728
725 endchoice 729 endchoice
726 730
727 choice 731 choice
728 depends on EXPERIMENTAL 732 depends on EXPERIMENTAL
729 prompt "Memory split" if EMBEDDED 733 prompt "Memory split" if EMBEDDED
730 default VMSPLIT_3G 734 default VMSPLIT_3G
731 depends on X86_32 735 depends on X86_32
732 help 736 help
733 Select the desired split between kernel and user memory. 737 Select the desired split between kernel and user memory.
734 738
735 If the address range available to the kernel is less than the 739 If the address range available to the kernel is less than the
736 physical memory installed, the remaining memory will be available 740 physical memory installed, the remaining memory will be available
737 as "high memory". Accessing high memory is a little more costly 741 as "high memory". Accessing high memory is a little more costly
738 than low memory, as it needs to be mapped into the kernel first. 742 than low memory, as it needs to be mapped into the kernel first.
739 Note that increasing the kernel address space limits the range 743 Note that increasing the kernel address space limits the range
740 available to user programs, making the address space there 744 available to user programs, making the address space there
741 tighter. Selecting anything other than the default 3G/1G split 745 tighter. Selecting anything other than the default 3G/1G split
742 will also likely make your kernel incompatible with binary-only 746 will also likely make your kernel incompatible with binary-only
743 kernel modules. 747 kernel modules.
744 748
745 If you are not absolutely sure what you are doing, leave this 749 If you are not absolutely sure what you are doing, leave this
746 option alone! 750 option alone!
747 751
748 config VMSPLIT_3G 752 config VMSPLIT_3G
749 bool "3G/1G user/kernel split" 753 bool "3G/1G user/kernel split"
750 config VMSPLIT_3G_OPT 754 config VMSPLIT_3G_OPT
751 depends on !X86_PAE 755 depends on !X86_PAE
752 bool "3G/1G user/kernel split (for full 1G low memory)" 756 bool "3G/1G user/kernel split (for full 1G low memory)"
753 config VMSPLIT_2G 757 config VMSPLIT_2G
754 bool "2G/2G user/kernel split" 758 bool "2G/2G user/kernel split"
755 config VMSPLIT_2G_OPT 759 config VMSPLIT_2G_OPT
756 depends on !X86_PAE 760 depends on !X86_PAE
757 bool "2G/2G user/kernel split (for full 2G low memory)" 761 bool "2G/2G user/kernel split (for full 2G low memory)"
758 config VMSPLIT_1G 762 config VMSPLIT_1G
759 bool "1G/3G user/kernel split" 763 bool "1G/3G user/kernel split"
760 endchoice 764 endchoice
761 765
762 config PAGE_OFFSET 766 config PAGE_OFFSET
763 hex 767 hex
764 default 0xB0000000 if VMSPLIT_3G_OPT 768 default 0xB0000000 if VMSPLIT_3G_OPT
765 default 0x80000000 if VMSPLIT_2G 769 default 0x80000000 if VMSPLIT_2G
766 default 0x78000000 if VMSPLIT_2G_OPT 770 default 0x78000000 if VMSPLIT_2G_OPT
767 default 0x40000000 if VMSPLIT_1G 771 default 0x40000000 if VMSPLIT_1G
768 default 0xC0000000 772 default 0xC0000000
769 depends on X86_32 773 depends on X86_32
770 774
771 config HIGHMEM 775 config HIGHMEM
772 def_bool y 776 def_bool y
773 depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) 777 depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
774 778
775 config X86_PAE 779 config X86_PAE
776 def_bool n 780 def_bool n
777 prompt "PAE (Physical Address Extension) Support" 781 prompt "PAE (Physical Address Extension) Support"
778 depends on X86_32 && !HIGHMEM4G 782 depends on X86_32 && !HIGHMEM4G
779 select RESOURCES_64BIT 783 select RESOURCES_64BIT
780 help 784 help
781 PAE is required for NX support, and furthermore enables 785 PAE is required for NX support, and furthermore enables
782 larger swapspace support for non-overcommit purposes. It 786 larger swapspace support for non-overcommit purposes. It
783 has the cost of more pagetable lookup overhead, and also 787 has the cost of more pagetable lookup overhead, and also
784 consumes more pagetable space per process. 788 consumes more pagetable space per process.
785 789
786 # Common NUMA Features 790 # Common NUMA Features
787 config NUMA 791 config NUMA
788 bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" 792 bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
789 depends on SMP 793 depends on SMP
790 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL) 794 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL)
791 default n if X86_PC 795 default n if X86_PC
792 default y if (X86_NUMAQ || X86_SUMMIT) 796 default y if (X86_NUMAQ || X86_SUMMIT)
793 help 797 help
794 Enable NUMA (Non Uniform Memory Access) support. 798 Enable NUMA (Non Uniform Memory Access) support.
795 The kernel will try to allocate memory used by a CPU on the 799 The kernel will try to allocate memory used by a CPU on the
796 local memory controller of the CPU and add some more 800 local memory controller of the CPU and add some more
797 NUMA awareness to the kernel. 801 NUMA awareness to the kernel.
798 802
799 For i386 this is currently highly experimental and should only be 803 For i386 this is currently highly experimental and should only be
800 used for kernel development. It might also cause boot failures. 804 used for kernel development. It might also cause boot failures.
801 For x86_64 this is recommended on all multiprocessor Opteron systems. 805 For x86_64 this is recommended on all multiprocessor Opteron systems.
802 If the system is EM64T, you should say N unless your system is 806 If the system is EM64T, you should say N unless your system is
803 EM64T NUMA. 807 EM64T NUMA.
804 808
805 comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" 809 comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
806 depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) 810 depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
807 811
808 config K8_NUMA 812 config K8_NUMA
809 def_bool y 813 def_bool y
810 prompt "Old style AMD Opteron NUMA detection" 814 prompt "Old style AMD Opteron NUMA detection"
811 depends on X86_64 && NUMA && PCI 815 depends on X86_64 && NUMA && PCI
812 help 816 help
813 Enable K8 NUMA node topology detection. You should say Y here if 817 Enable K8 NUMA node topology detection. You should say Y here if
814 you have a multiprocessor AMD K8 system. This uses an old 818 you have a multiprocessor AMD K8 system. This uses an old
815 method to read the NUMA configuration directly from the built-in 819 method to read the NUMA configuration directly from the built-in
816 Northbridge of the Opteron. It is recommended to use X86_64_ACPI_NUMA 820 Northbridge of the Opteron. It is recommended to use X86_64_ACPI_NUMA
817 instead, which also takes priority if both are compiled in. 821 instead, which also takes priority if both are compiled in.
818 822
819 config X86_64_ACPI_NUMA 823 config X86_64_ACPI_NUMA
820 def_bool y 824 def_bool y
821 prompt "ACPI NUMA detection" 825 prompt "ACPI NUMA detection"
822 depends on X86_64 && NUMA && ACPI && PCI 826 depends on X86_64 && NUMA && ACPI && PCI
823 select ACPI_NUMA 827 select ACPI_NUMA
824 help 828 help
825 Enable ACPI SRAT based node topology detection. 829 Enable ACPI SRAT based node topology detection.
826 830
827 config NUMA_EMU 831 config NUMA_EMU
828 bool "NUMA emulation" 832 bool "NUMA emulation"
829 depends on X86_64 && NUMA 833 depends on X86_64 && NUMA
830 help 834 help
831 Enable NUMA emulation. A flat machine will be split 835 Enable NUMA emulation. A flat machine will be split
832 into virtual nodes when booted with "numa=fake=N", where N is the 836 into virtual nodes when booted with "numa=fake=N", where N is the
833 number of nodes. This is only useful for debugging. 837 number of nodes. This is only useful for debugging.
834 838
835 config NODES_SHIFT 839 config NODES_SHIFT
836 int 840 int
837 default "6" if X86_64 841 default "6" if X86_64
838 default "4" if X86_NUMAQ 842 default "4" if X86_NUMAQ
839 default "3" 843 default "3"
840 depends on NEED_MULTIPLE_NODES 844 depends on NEED_MULTIPLE_NODES
841 845
842 config HAVE_ARCH_BOOTMEM_NODE 846 config HAVE_ARCH_BOOTMEM_NODE
843 def_bool y 847 def_bool y
844 depends on X86_32 && NUMA 848 depends on X86_32 && NUMA
845 849
846 config ARCH_HAVE_MEMORY_PRESENT 850 config ARCH_HAVE_MEMORY_PRESENT
847 def_bool y 851 def_bool y
848 depends on X86_32 && DISCONTIGMEM 852 depends on X86_32 && DISCONTIGMEM
849 853
850 config NEED_NODE_MEMMAP_SIZE 854 config NEED_NODE_MEMMAP_SIZE
851 def_bool y 855 def_bool y
852 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) 856 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
853 857
854 config HAVE_ARCH_ALLOC_REMAP 858 config HAVE_ARCH_ALLOC_REMAP
855 def_bool y 859 def_bool y
856 depends on X86_32 && NUMA 860 depends on X86_32 && NUMA
857 861
858 config ARCH_FLATMEM_ENABLE 862 config ARCH_FLATMEM_ENABLE
859 def_bool y 863 def_bool y
860 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC 864 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC
861 865
862 config ARCH_DISCONTIGMEM_ENABLE 866 config ARCH_DISCONTIGMEM_ENABLE
863 def_bool y 867 def_bool y
864 depends on NUMA && X86_32 868 depends on NUMA && X86_32
865 869
866 config ARCH_DISCONTIGMEM_DEFAULT 870 config ARCH_DISCONTIGMEM_DEFAULT
867 def_bool y 871 def_bool y
868 depends on NUMA && X86_32 872 depends on NUMA && X86_32
869 873
870 config ARCH_SPARSEMEM_DEFAULT 874 config ARCH_SPARSEMEM_DEFAULT
871 def_bool y 875 def_bool y
872 depends on X86_64 876 depends on X86_64
873 877
874 config ARCH_SPARSEMEM_ENABLE 878 config ARCH_SPARSEMEM_ENABLE
875 def_bool y 879 def_bool y
876 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) 880 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
877 select SPARSEMEM_STATIC if X86_32 881 select SPARSEMEM_STATIC if X86_32
878 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 882 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
879 883
880 config ARCH_SELECT_MEMORY_MODEL 884 config ARCH_SELECT_MEMORY_MODEL
881 def_bool y 885 def_bool y
882 depends on ARCH_SPARSEMEM_ENABLE 886 depends on ARCH_SPARSEMEM_ENABLE
883 887
884 config ARCH_MEMORY_PROBE 888 config ARCH_MEMORY_PROBE
885 def_bool X86_64 889 def_bool X86_64
886 depends on MEMORY_HOTPLUG 890 depends on MEMORY_HOTPLUG
887 891
888 source "mm/Kconfig" 892 source "mm/Kconfig"
889 893
890 config HIGHPTE 894 config HIGHPTE
891 bool "Allocate 3rd-level pagetables from highmem" 895 bool "Allocate 3rd-level pagetables from highmem"
892 depends on X86_32 && (HIGHMEM4G || HIGHMEM64G) 896 depends on X86_32 && (HIGHMEM4G || HIGHMEM64G)
893 help 897 help
894 The VM uses one page table entry for each page of physical memory. 898 The VM uses one page table entry for each page of physical memory.
895 For systems with a lot of RAM, this can be wasteful of precious 899 For systems with a lot of RAM, this can be wasteful of precious
896 low memory. Setting this option will put user-space page table 900 low memory. Setting this option will put user-space page table
897 entries in high memory. 901 entries in high memory.
898 902
899 config MATH_EMULATION 903 config MATH_EMULATION
900 bool 904 bool
901 prompt "Math emulation" if X86_32 905 prompt "Math emulation" if X86_32
902 ---help--- 906 ---help---
903 Linux can emulate a math coprocessor (used for floating point 907 Linux can emulate a math coprocessor (used for floating point
904 operations) if you don't have one. 486DX and Pentium processors have 908 operations) if you don't have one. 486DX and Pentium processors have
905 a math coprocessor built in, 486SX and 386 do not, unless you added 909 a math coprocessor built in, 486SX and 386 do not, unless you added
906 a 487DX or 387, respectively. (The messages during boot time can 910 a 487DX or 387, respectively. (The messages during boot time can
907 give you some hints here ["man dmesg"].) Everyone needs either a 911 give you some hints here ["man dmesg"].) Everyone needs either a
908 coprocessor or this emulation. 912 coprocessor or this emulation.
909 913
910 If you don't have a math coprocessor, you need to say Y here; if you 914 If you don't have a math coprocessor, you need to say Y here; if you
911 say Y here even though you have a coprocessor, the coprocessor will 915 say Y here even though you have a coprocessor, the coprocessor will
912 be used nevertheless. (This behavior can be changed with the kernel 916 be used nevertheless. (This behavior can be changed with the kernel
913 command line option "no387", which comes in handy if your coprocessor 917 command line option "no387", which comes in handy if your coprocessor
914 is broken. Try "man bootparam" or see the documentation of your boot 918 is broken. Try "man bootparam" or see the documentation of your boot
915 loader (lilo or loadlin) about how to pass options to the kernel at 919 loader (lilo or loadlin) about how to pass options to the kernel at
916 boot time.) This means that it is a good idea to say Y here if you 920 boot time.) This means that it is a good idea to say Y here if you
917 intend to use this kernel on different machines. 921 intend to use this kernel on different machines.
918 922
919 More information about the internals of the Linux math coprocessor 923 More information about the internals of the Linux math coprocessor
920 emulation can be found in <file:arch/x86/math-emu/README>. 924 emulation can be found in <file:arch/x86/math-emu/README>.
921 925
922 If you are not sure, say Y; apart from resulting in a 66 KB bigger 926 If you are not sure, say Y; apart from resulting in a 66 KB bigger
923 kernel, it won't hurt. 927 kernel, it won't hurt.
924 928
925 config MTRR 929 config MTRR
926 bool "MTRR (Memory Type Range Register) support" 930 bool "MTRR (Memory Type Range Register) support"
927 ---help--- 931 ---help---
928 On Intel P6 family processors (Pentium Pro, Pentium II and later) 932 On Intel P6 family processors (Pentium Pro, Pentium II and later)
929 the Memory Type Range Registers (MTRRs) may be used to control 933 the Memory Type Range Registers (MTRRs) may be used to control
930 processor access to memory ranges. This is most useful if you have 934 processor access to memory ranges. This is most useful if you have
931 a video (VGA) card on a PCI or AGP bus. Enabling write-combining 935 a video (VGA) card on a PCI or AGP bus. Enabling write-combining
932 allows bus write transfers to be combined into a larger transfer 936 allows bus write transfers to be combined into a larger transfer
933 before bursting over the PCI/AGP bus. This can increase performance 937 before bursting over the PCI/AGP bus. This can increase performance
934 of image write operations 2.5 times or more. Saying Y here creates a 938 of image write operations 2.5 times or more. Saying Y here creates a
935 /proc/mtrr file which may be used to manipulate your processor's 939 /proc/mtrr file which may be used to manipulate your processor's
936 MTRRs. Typically the X server should use this. 940 MTRRs. Typically the X server should use this.
937 941
938 This code has a reasonably generic interface so that similar 942 This code has a reasonably generic interface so that similar
939 control registers on other processors can be easily supported 943 control registers on other processors can be easily supported
940 as well: 944 as well:
941 945
942 The Cyrix 6x86, 6x86MX and M II processors have Address Range 946 The Cyrix 6x86, 6x86MX and M II processors have Address Range
943 Registers (ARRs) which provide a similar functionality to MTRRs. For 947 Registers (ARRs) which provide a similar functionality to MTRRs. For
944 these, the ARRs are used to emulate the MTRRs. 948 these, the ARRs are used to emulate the MTRRs.
945 The AMD K6-2 (stepping 8 and above) and K6-3 processors have two 949 The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
946 MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing 950 MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing
947 write-combining. All of these processors are supported by this code 951 write-combining. All of these processors are supported by this code
948 and it makes sense to say Y here if you have one of them. 952 and it makes sense to say Y here if you have one of them.
949 953
950 Saying Y here also fixes a problem with buggy SMP BIOSes which only 954 Saying Y here also fixes a problem with buggy SMP BIOSes which only
951 set the MTRRs for the boot CPU and not for the secondary CPUs. This 955 set the MTRRs for the boot CPU and not for the secondary CPUs. This
952 can lead to all sorts of problems, so it's good to say Y here. 956 can lead to all sorts of problems, so it's good to say Y here.
953 957
954 You can safely say Y even if your machine doesn't have MTRRs, you'll 958 You can safely say Y even if your machine doesn't have MTRRs, you'll
955 just add about 9 KB to your kernel. 959 just add about 9 KB to your kernel.
956 960
957 See <file:Documentation/mtrr.txt> for more information. 961 See <file:Documentation/mtrr.txt> for more information.
958 962
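To make the /proc/mtrr write interface mentioned above concrete (not part of this diff), the sketch below adds a write-combining region for a hypothetical framebuffer at 0xf8000000; the base and size values are examples only, and Documentation/mtrr.txt has the authoritative syntax.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *req =
            "base=0xf8000000 size=0x400000 type=write-combining\n";
        int fd = open("/proc/mtrr", O_WRONLY);

        if (fd < 0) {
            perror("open /proc/mtrr");
            return 1;
        }
        if (write(fd, req, strlen(req)) < 0) {
            perror("write /proc/mtrr");
            return 1;
        }
        close(fd);
        return 0;
    }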
959 config EFI 963 config EFI
960 def_bool n 964 def_bool n
961 prompt "Boot from EFI support" 965 prompt "Boot from EFI support"
962 depends on ACPI 966 depends on ACPI
963 ---help--- 967 ---help---
964 This enables the kernel to boot on EFI platforms using 968 This enables the kernel to boot on EFI platforms using
965 system configuration information passed to it from the firmware. 969 system configuration information passed to it from the firmware.
966 This also enables the kernel to use any EFI runtime services that are 970 This also enables the kernel to use any EFI runtime services that are
967 available (such as the EFI variable services). 971 available (such as the EFI variable services).
968 972
969 This option is only useful on systems that have EFI firmware 973 This option is only useful on systems that have EFI firmware
970 and will result in a kernel image that is ~8k larger. In addition, 974 and will result in a kernel image that is ~8k larger. In addition,
971 you must use the latest ELILO loader available at 975 you must use the latest ELILO loader available at
972 <http://elilo.sourceforge.net> in order to take advantage of 976 <http://elilo.sourceforge.net> in order to take advantage of
973 kernel initialization using EFI information (neither GRUB nor LILO know 977 kernel initialization using EFI information (neither GRUB nor LILO know
974 anything about EFI). However, even with this option, the resultant 978 anything about EFI). However, even with this option, the resultant
975 kernel should continue to boot on existing non-EFI platforms. 979 kernel should continue to boot on existing non-EFI platforms.
976 980
977 config IRQBALANCE 981 config IRQBALANCE
978 def_bool y 982 def_bool y
979 prompt "Enable kernel irq balancing" 983 prompt "Enable kernel irq balancing"
980 depends on X86_32 && SMP && X86_IO_APIC 984 depends on X86_32 && SMP && X86_IO_APIC
981 help 985 help
982 The default yes will allow the kernel to do irq load balancing. 986 The default yes will allow the kernel to do irq load balancing.
983 Saying no will keep the kernel from doing irq load balancing. 987 Saying no will keep the kernel from doing irq load balancing.
984 988
985 # turning this on wastes a bunch of space. 989 # turning this on wastes a bunch of space.
986 # Summit needs it only when NUMA is on 990 # Summit needs it only when NUMA is on
987 config BOOT_IOREMAP 991 config BOOT_IOREMAP
988 def_bool y 992 def_bool y
989 depends on X86_32 && (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) 993 depends on X86_32 && (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
990 994
991 config SECCOMP 995 config SECCOMP
992 def_bool y 996 def_bool y
993 prompt "Enable seccomp to safely compute untrusted bytecode" 997 prompt "Enable seccomp to safely compute untrusted bytecode"
994 depends on PROC_FS 998 depends on PROC_FS
995 help 999 help
996 This kernel feature is useful for number crunching applications 1000 This kernel feature is useful for number crunching applications
997 that may need to compute untrusted bytecode during their 1001 that may need to compute untrusted bytecode during their
998 execution. By using pipes or other transports made available to 1002 execution. By using pipes or other transports made available to
999 the process as file descriptors supporting the read/write 1003 the process as file descriptors supporting the read/write
1000 syscalls, it's possible to isolate those applications in 1004 syscalls, it's possible to isolate those applications in
1001 their own address space using seccomp. Once seccomp is 1005 their own address space using seccomp. Once seccomp is
1002 enabled via /proc/<pid>/seccomp, it cannot be disabled 1006 enabled via /proc/<pid>/seccomp, it cannot be disabled
1003 and the task is only allowed to execute a few safe syscalls 1007 and the task is only allowed to execute a few safe syscalls
1004 defined by each seccomp mode. 1008 defined by each seccomp mode.
1005 1009
1006 If unsure, say Y. Only embedded systems should say N here. 1010 If unsure, say Y. Only embedded systems should say N here.
1007 1011
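A rough sketch of the /proc/<pid>/seccomp interface described above (not part of this diff, and specific to kernels of this vintage): writing "1" switches the task into the strict mode in which only read, write, exit and sigreturn remain usable.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
        static const char msg[] = "now running under seccomp mode 1\n";
        int fd = open("/proc/self/seccomp", O_WRONLY);

        if (fd < 0) {
            perror("open /proc/self/seccomp");
            return 1;
        }
        if (write(fd, "1\n", 2) < 0) {
            perror("enable seccomp");
            return 1;
        }
        /* From here on, even close() or glibc's exit_group() would kill us. */
        write(STDOUT_FILENO, msg, sizeof(msg) - 1);  /* write() is still allowed */
        syscall(SYS_exit, 0);                        /* plain exit, not exit_group */
        return 0;                                    /* not reached */
    }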
1008 config CC_STACKPROTECTOR 1012 config CC_STACKPROTECTOR
1009 bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" 1013 bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
1010 depends on X86_64 && EXPERIMENTAL 1014 depends on X86_64 && EXPERIMENTAL
1011 help 1015 help
1012 This option turns on the -fstack-protector GCC feature. This 1016 This option turns on the -fstack-protector GCC feature. This
1013 feature puts, at the beginning of critical functions, a canary 1017 feature puts, at the beginning of critical functions, a canary
1014 value on the stack just before the return address, and validates 1018 value on the stack just before the return address, and validates
1015 the value just before actually returning. Stack-based buffer 1019 the value just before actually returning. Stack-based buffer
1016 overflows (that need to overwrite this return address) now also 1020 overflows (that need to overwrite this return address) now also
1017 overwrite the canary, which gets detected and the attack is then 1021 overwrite the canary, which gets detected and the attack is then
1018 neutralized via a kernel panic. 1022 neutralized via a kernel panic.
1019 1023
1020 This feature requires gcc version 4.2 or above, or a distribution 1024 This feature requires gcc version 4.2 or above, or a distribution
1021 gcc with the feature backported. Older versions are automatically 1025 gcc with the feature backported. Older versions are automatically
1022 detected and for those versions, this configuration option is ignored. 1026 detected and for those versions, this configuration option is ignored.
1023 1027
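Purely as an illustration of the canary idea described above (not part of this diff, and not GCC's actual code generation, since the compiler controls the real stack layout), a user-space analogue might look like this; 'stack_guard' is a hypothetical value, the real guard is random, and the kernel variant panics instead of aborting.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static unsigned long stack_guard = 0x49a3f2c1UL;  /* hypothetical guard value */

    static void copy_name(const char *in)
    {
        unsigned long canary = stack_guard;  /* stands in for the slot next to the return address */
        char buf[16];

        /* Bounded copy; an unchecked strcpy() here is the kind of bug the canary catches. */
        strncpy(buf, in, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';

        if (canary != stack_guard) {         /* validated just before returning */
            fputs("stack smashing detected\n", stderr);
            abort();
        }
    }

    int main(int argc, char **argv)
    {
        copy_name(argc > 1 ? argv[1] : "ok");
        return 0;
    }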
1024 config CC_STACKPROTECTOR_ALL 1028 config CC_STACKPROTECTOR_ALL
1025 bool "Use stack-protector for all functions" 1029 bool "Use stack-protector for all functions"
1026 depends on CC_STACKPROTECTOR 1030 depends on CC_STACKPROTECTOR
1027 help 1031 help
1028 Normally, GCC only inserts the canary value protection for 1032 Normally, GCC only inserts the canary value protection for
1029 functions that use large-ish on-stack buffers. By enabling 1033 functions that use large-ish on-stack buffers. By enabling
1030 this option, GCC will be asked to do this for ALL functions. 1034 this option, GCC will be asked to do this for ALL functions.
1031 1035
1032 source kernel/Kconfig.hz 1036 source kernel/Kconfig.hz
1033 1037
1034 config KEXEC 1038 config KEXEC
1035 bool "kexec system call" 1039 bool "kexec system call"
1036 help 1040 help
1037 kexec is a system call that implements the ability to shut down your 1041 kexec is a system call that implements the ability to shut down your
1038 current kernel, and to start another kernel. It is like a reboot 1042 current kernel, and to start another kernel. It is like a reboot
1039 but it is independent of the system firmware. And like a reboot 1043 but it is independent of the system firmware. And like a reboot
1040 you can start any kernel with it, not just Linux. 1044 you can start any kernel with it, not just Linux.
1041 1045
1042 The name comes from the similarity to the exec system call. 1046 The name comes from the similarity to the exec system call.
1043 1047
1044 It is an ongoing process to be certain the hardware in a machine 1048 It is an ongoing process to be certain the hardware in a machine
1045 is properly shut down, so do not be surprised if this code does not 1049 is properly shut down, so do not be surprised if this code does not
1046 initially work for you. It may help to enable device hotplugging 1050 initially work for you. It may help to enable device hotplugging
1047 support. As of this writing the exact hardware interface is 1051 support. As of this writing the exact hardware interface is
1048 strongly in flux, so no good recommendation can be made. 1052 strongly in flux, so no good recommendation can be made.
1049 1053
1050 config CRASH_DUMP 1054 config CRASH_DUMP
1051 bool "kernel crash dumps (EXPERIMENTAL)" 1055 bool "kernel crash dumps (EXPERIMENTAL)"
1052 depends on EXPERIMENTAL 1056 depends on EXPERIMENTAL
1053 depends on X86_64 || (X86_32 && HIGHMEM) 1057 depends on X86_64 || (X86_32 && HIGHMEM)
1054 help 1058 help
1055 Generate crash dump after being started by kexec. 1059 Generate crash dump after being started by kexec.
1056 This should normally only be set in special crash dump kernels 1060 This should normally only be set in special crash dump kernels
1057 which are loaded in the main kernel with kexec-tools into 1061 which are loaded in the main kernel with kexec-tools into
1058 a specially reserved region and then later executed after 1062 a specially reserved region and then later executed after
1059 a crash by kdump/kexec. The crash dump kernel must be compiled 1063 a crash by kdump/kexec. The crash dump kernel must be compiled
1060 to a memory address not used by the main kernel or BIOS using 1064 to a memory address not used by the main kernel or BIOS using
1061 PHYSICAL_START, or it must be built as a relocatable image 1065 PHYSICAL_START, or it must be built as a relocatable image
1062 (CONFIG_RELOCATABLE=y). 1066 (CONFIG_RELOCATABLE=y).
1063 For more details see Documentation/kdump/kdump.txt 1067 For more details see Documentation/kdump/kdump.txt
1064 1068
1065 config PHYSICAL_START 1069 config PHYSICAL_START
1066 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1070 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1067 default "0x1000000" if X86_NUMAQ 1071 default "0x1000000" if X86_NUMAQ
1068 default "0x200000" if X86_64 1072 default "0x200000" if X86_64
1069 default "0x100000" 1073 default "0x100000"
1070 help 1074 help
1071 This gives the physical address where the kernel is loaded. 1075 This gives the physical address where the kernel is loaded.
1072 1076
1073 If the kernel is not relocatable (CONFIG_RELOCATABLE=n), then the 1077 If the kernel is not relocatable (CONFIG_RELOCATABLE=n), then the
1074 bzImage will decompress itself to the above physical address and 1078 bzImage will decompress itself to the above physical address and
1075 run from there. Otherwise, the bzImage will run from the address where 1079 run from there. Otherwise, the bzImage will run from the address where
1076 it has been loaded by the boot loader and will ignore the above physical 1080 it has been loaded by the boot loader and will ignore the above physical
1077 address. 1081 address.
1078 1082
1079 In normal kdump cases one does not have to set/change this option 1083 In normal kdump cases one does not have to set/change this option
1080 as now bzImage can be compiled as a completely relocatable image 1084 as now bzImage can be compiled as a completely relocatable image
1081 (CONFIG_RELOCATABLE=y) and be used to load and run from a different 1085 (CONFIG_RELOCATABLE=y) and be used to load and run from a different
1082 address. This option is mainly useful for folks who don't want 1086 address. This option is mainly useful for folks who don't want
1083 to use a bzImage for capturing the crash dump and want to use a 1087 to use a bzImage for capturing the crash dump and want to use a
1084 vmlinux instead. vmlinux is not relocatable, hence a kernel needs 1088 vmlinux instead. vmlinux is not relocatable, hence a kernel needs
1085 to be specifically compiled to run from a specific memory area 1089 to be specifically compiled to run from a specific memory area
1086 (normally a reserved region), and this option comes in handy. 1090 (normally a reserved region), and this option comes in handy.
1087 1091
1088 So if you are using bzImage for capturing the crash dump, leave 1092 So if you are using bzImage for capturing the crash dump, leave
1089 the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y. 1093 the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y.
1090 Otherwise if you plan to use vmlinux for capturing the crash dump 1094 Otherwise if you plan to use vmlinux for capturing the crash dump
1091 change this value to the start of the reserved region (typically 16MB, 1095 change this value to the start of the reserved region (typically 16MB,
1092 0x1000000). In other words, it can be set based on the "X" value as 1096 0x1000000). In other words, it can be set based on the "X" value as
1093 specified in the "crashkernel=YM@XM" command line boot parameter 1097 specified in the "crashkernel=YM@XM" command line boot parameter
1094 passed to the panicked kernel. Typically this parameter is set as 1098 passed to the panicked kernel. Typically this parameter is set as
1095 crashkernel=64M@16M. Please take a look at 1099 crashkernel=64M@16M. Please take a look at
1096 Documentation/kdump/kdump.txt for more details about crash dumps. 1100 Documentation/kdump/kdump.txt for more details about crash dumps.
1097 1101
1098 Usage of bzImage for capturing the crash dump is recommended as 1102 Usage of bzImage for capturing the crash dump is recommended as
1099 one does not have to build two kernels. The same kernel can be used 1103 one does not have to build two kernels. The same kernel can be used
1100 as both the production kernel and the capture kernel. This option should 1104 as both the production kernel and the capture kernel. This option should
1101 have gone away once relocatable bzImage support was introduced, but it 1105 have gone away once relocatable bzImage support was introduced, but it
1102 remains because there are users out there who continue to use 1106 remains because there are users out there who continue to use
1103 vmlinux for dump capture. This option should go away down the 1107 vmlinux for dump capture. This option should go away down the
1104 line. 1108 line.
1105 1109
1106 Don't change this unless you know what you are doing. 1110 Don't change this unless you know what you are doing.
1107 1111
1108 config RELOCATABLE 1112 config RELOCATABLE
1109 bool "Build a relocatable kernel (EXPERIMENTAL)" 1113 bool "Build a relocatable kernel (EXPERIMENTAL)"
1110 depends on EXPERIMENTAL 1114 depends on EXPERIMENTAL
1111 help 1115 help
1112 This builds a kernel image that retains relocation information 1116 This builds a kernel image that retains relocation information
1113 so it can be loaded someplace besides the default 1MB. 1117 so it can be loaded someplace besides the default 1MB.
1114 The relocations tend to make the kernel binary about 10% larger, 1118 The relocations tend to make the kernel binary about 10% larger,
1115 but are discarded at runtime. 1119 but are discarded at runtime.
1116 1120
1117 One use is for the kexec on panic case where the recovery kernel 1121 One use is for the kexec on panic case where the recovery kernel
1118 must live at a different physical address than the primary 1122 must live at a different physical address than the primary
1119 kernel. 1123 kernel.
1120 1124
1121 Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address 1125 Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address
1122 it has been loaded at and the compile time physical address 1126 it has been loaded at and the compile time physical address
1123 (CONFIG_PHYSICAL_START) is ignored. 1127 (CONFIG_PHYSICAL_START) is ignored.
1124 1128
1125 config PHYSICAL_ALIGN 1129 config PHYSICAL_ALIGN
1126 hex 1130 hex
1127 prompt "Alignment value to which kernel should be aligned" if X86_32 1131 prompt "Alignment value to which kernel should be aligned" if X86_32
1128 default "0x100000" if X86_32 1132 default "0x100000" if X86_32
1129 default "0x200000" if X86_64 1133 default "0x200000" if X86_64
1130 range 0x2000 0x400000 1134 range 0x2000 0x400000
1131 help 1135 help
1132 This value puts an alignment restriction on the physical address 1136 This value puts an alignment restriction on the physical address
1133 where the kernel is loaded and run from. The kernel is compiled for an 1137 where the kernel is loaded and run from. The kernel is compiled for an
1134 address which meets the above alignment restriction. 1138 address which meets the above alignment restriction.
1135 1139
1136 If the bootloader loads the kernel at a non-aligned address and 1140 If the bootloader loads the kernel at a non-aligned address and
1137 CONFIG_RELOCATABLE is set, the kernel will move itself to the nearest 1141 CONFIG_RELOCATABLE is set, the kernel will move itself to the nearest
1138 address aligned to the above value and run from there. 1142 address aligned to the above value and run from there.
1139 1143
1140 If the bootloader loads the kernel at a non-aligned address and 1144 If the bootloader loads the kernel at a non-aligned address and
1141 CONFIG_RELOCATABLE is not set, the kernel will ignore the run time 1145 CONFIG_RELOCATABLE is not set, the kernel will ignore the run time
1142 load address and decompress itself to the address it has been 1146 load address and decompress itself to the address it has been
1143 compiled for and run from there. The address for which the kernel is 1147 compiled for and run from there. The address for which the kernel is
1144 compiled already meets the above alignment restrictions. Hence the 1148 compiled already meets the above alignment restrictions. Hence the
1145 end result is that the kernel runs from a physical address meeting 1149 end result is that the kernel runs from a physical address meeting
1146 the above alignment restrictions. 1150 the above alignment restrictions.
1147 1151
1148 Don't change this unless you know what you are doing. 1152 Don't change this unless you know what you are doing.
1149 1153
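As a small worked example of the alignment step described above (not part of this diff; the constants are hypothetical), rounding a load address up to the configured alignment is typically done like this:

    #include <stdio.h>

    /* Round addr up to the next multiple of align (align is a power of two). */
    static unsigned long align_up(unsigned long addr, unsigned long align)
    {
        return (addr + align - 1) & ~(align - 1);
    }

    int main(void)
    {
        unsigned long load_addr = 0x00137fff;  /* hypothetical non-aligned load address */
        unsigned long align = 0x100000;        /* e.g. the X86_32 default above */

        printf("loaded at 0x%lx, will run from 0x%lx\n",
               load_addr, align_up(load_addr, align));
        return 0;
    }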
1150 config HOTPLUG_CPU 1154 config HOTPLUG_CPU
1151 bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)" 1155 bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)"
1152 depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER 1156 depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
1153 ---help--- 1157 ---help---
1154 Say Y here to experiment with turning CPUs off and on, and to 1158 Say Y here to experiment with turning CPUs off and on, and to
1155 enable suspend on SMP systems. CPUs can be controlled through 1159 enable suspend on SMP systems. CPUs can be controlled through
1156 /sys/devices/system/cpu. 1160 /sys/devices/system/cpu.
1157 Say N if you want to disable CPU hotplug and don't need to 1161 Say N if you want to disable CPU hotplug and don't need to
1158 suspend. 1162 suspend.
1159 1163
1160 config COMPAT_VDSO 1164 config COMPAT_VDSO
1161 def_bool y 1165 def_bool y
1162 prompt "Compat VDSO support" 1166 prompt "Compat VDSO support"
1163 depends on X86_32 || IA32_EMULATION 1167 depends on X86_32 || IA32_EMULATION
1164 help 1168 help
1165 Map the 32-bit VDSO to the predictable old-style address too. 1169 Map the 32-bit VDSO to the predictable old-style address too.
1166 1170
1167 Say N here if you are running a sufficiently recent glibc 1171 Say N here if you are running a sufficiently recent glibc
1168 version (2.3.3 or later), to remove the high-mapped 1172 version (2.3.3 or later), to remove the high-mapped
1169 VDSO mapping and to exclusively use the randomized VDSO. 1173 VDSO mapping and to exclusively use the randomized VDSO.
1170 1174
1171 If unsure, say Y. 1175 If unsure, say Y.
1172 1176
1173 endmenu 1177 endmenu
1174 1178
1175 config ARCH_ENABLE_MEMORY_HOTPLUG 1179 config ARCH_ENABLE_MEMORY_HOTPLUG
1176 def_bool y 1180 def_bool y
1177 depends on X86_64 || (X86_32 && HIGHMEM) 1181 depends on X86_64 || (X86_32 && HIGHMEM)
1178 1182
1179 config HAVE_ARCH_EARLY_PFN_TO_NID 1183 config HAVE_ARCH_EARLY_PFN_TO_NID
1180 def_bool X86_64 1184 def_bool X86_64
1181 depends on NUMA 1185 depends on NUMA
1182 1186
1183 menu "Power management options" 1187 menu "Power management options"
1184 depends on !X86_VOYAGER 1188 depends on !X86_VOYAGER
1185 1189
1186 config ARCH_HIBERNATION_HEADER 1190 config ARCH_HIBERNATION_HEADER
1187 def_bool y 1191 def_bool y
1188 depends on X86_64 && HIBERNATION 1192 depends on X86_64 && HIBERNATION
1189 1193
1190 source "kernel/power/Kconfig" 1194 source "kernel/power/Kconfig"
1191 1195
1192 source "drivers/acpi/Kconfig" 1196 source "drivers/acpi/Kconfig"
1193 1197
1194 menuconfig APM 1198 menuconfig APM
1195 tristate "APM (Advanced Power Management) BIOS support" 1199 tristate "APM (Advanced Power Management) BIOS support"
1196 depends on X86_32 && PM_SLEEP && !X86_VISWS 1200 depends on X86_32 && PM_SLEEP && !X86_VISWS
1197 ---help--- 1201 ---help---
1198 APM is a BIOS specification for saving power using several different 1202 APM is a BIOS specification for saving power using several different
1199 techniques. This is mostly useful for battery powered laptops with 1203 techniques. This is mostly useful for battery powered laptops with
1200 APM compliant BIOSes. If you say Y here, the system time will be 1204 APM compliant BIOSes. If you say Y here, the system time will be
1201 reset after a RESUME operation, the /proc/apm device will provide 1205 reset after a RESUME operation, the /proc/apm device will provide
1202 battery status information, and user-space programs will receive 1206 battery status information, and user-space programs will receive
1203 notification of APM "events" (e.g. battery status change). 1207 notification of APM "events" (e.g. battery status change).
1204 1208
1205 If you select "Y" here, you can disable actual use of the APM 1209 If you select "Y" here, you can disable actual use of the APM
1206 BIOS by passing the "apm=off" option to the kernel at boot time. 1210 BIOS by passing the "apm=off" option to the kernel at boot time.
1207 1211
1208 Note that the APM support is almost completely disabled for 1212 Note that the APM support is almost completely disabled for
1209 machines with more than one CPU. 1213 machines with more than one CPU.
1210 1214
1211 In order to use APM, you will need supporting software. For location 1215 In order to use APM, you will need supporting software. For location
1212 and more information, read <file:Documentation/pm.txt> and the 1216 and more information, read <file:Documentation/pm.txt> and the
1213 Battery Powered Linux mini-HOWTO, available from 1217 Battery Powered Linux mini-HOWTO, available from
1214 <http://www.tldp.org/docs.html#howto>. 1218 <http://www.tldp.org/docs.html#howto>.
1215 1219
1216 This driver does not spin down disk drives (see the hdparm(8) 1220 This driver does not spin down disk drives (see the hdparm(8)
1217 manpage ("man 8 hdparm") for that), and it doesn't turn off 1221 manpage ("man 8 hdparm") for that), and it doesn't turn off
1218 VESA-compliant "green" monitors. 1222 VESA-compliant "green" monitors.
1219 1223
1220 This driver does not support the TI 4000M TravelMate and the ACER 1224 This driver does not support the TI 4000M TravelMate and the ACER
1221 486/DX4/75 because they don't have compliant BIOSes. Many "green" 1225 486/DX4/75 because they don't have compliant BIOSes. Many "green"
1222 desktop machines also don't have compliant BIOSes, and this driver 1226 desktop machines also don't have compliant BIOSes, and this driver
1223 may cause those machines to panic during the boot phase. 1227 may cause those machines to panic during the boot phase.
1224 1228
1225 Generally, if you don't have a battery in your machine, there isn't 1229 Generally, if you don't have a battery in your machine, there isn't
1226 much point in using this driver and you should say N. If you get 1230 much point in using this driver and you should say N. If you get
1227 random kernel OOPSes or reboots that don't seem to be related to 1231 random kernel OOPSes or reboots that don't seem to be related to
1228 anything, try disabling/enabling this option (or disabling/enabling 1232 anything, try disabling/enabling this option (or disabling/enabling
1229 APM in your BIOS). 1233 APM in your BIOS).
1230 1234
1231 Some other things you should try when experiencing seemingly random, 1235 Some other things you should try when experiencing seemingly random,
1232 "weird" problems: 1236 "weird" problems:
1233 1237
1234 1) make sure that you have enough swap space and that it is 1238 1) make sure that you have enough swap space and that it is
1235 enabled. 1239 enabled.
1236 2) pass the "no-hlt" option to the kernel 1240 2) pass the "no-hlt" option to the kernel
1237 3) switch on floating point emulation in the kernel and pass 1241 3) switch on floating point emulation in the kernel and pass
1238 the "no387" option to the kernel 1242 the "no387" option to the kernel
1239 4) pass the "floppy=nodma" option to the kernel 1243 4) pass the "floppy=nodma" option to the kernel
1240 5) pass the "mem=4M" option to the kernel (thereby disabling 1244 5) pass the "mem=4M" option to the kernel (thereby disabling
1241 all but the first 4 MB of RAM) 1245 all but the first 4 MB of RAM)
1242 6) make sure that the CPU is not overclocked. 1246 6) make sure that the CPU is not overclocked.
1243 7) read the sig11 FAQ at <http://www.bitwizard.nl/sig11/> 1247 7) read the sig11 FAQ at <http://www.bitwizard.nl/sig11/>
1244 8) disable the cache from your BIOS settings 1248 8) disable the cache from your BIOS settings
1245 9) install a fan for the video card or exchange video RAM 1249 9) install a fan for the video card or exchange video RAM
1246 10) install a better fan for the CPU 1250 10) install a better fan for the CPU
1247 11) exchange RAM chips 1251 11) exchange RAM chips
1248 12) exchange the motherboard. 1252 12) exchange the motherboard.
1249 1253
1250 To compile this driver as a module, choose M here: the 1254 To compile this driver as a module, choose M here: the
1251 module will be called apm. 1255 module will be called apm.
1252 1256
1253 if APM 1257 if APM
1254 1258
1255 config APM_IGNORE_USER_SUSPEND 1259 config APM_IGNORE_USER_SUSPEND
1256 bool "Ignore USER SUSPEND" 1260 bool "Ignore USER SUSPEND"
1257 help 1261 help
1258 This option will ignore USER SUSPEND requests. On machines with a 1262 This option will ignore USER SUSPEND requests. On machines with a
1259 compliant APM BIOS, you want to say N. However, on the NEC Versa M 1263 compliant APM BIOS, you want to say N. However, on the NEC Versa M
1260 series notebooks, it is necessary to say Y because of a BIOS bug. 1264 series notebooks, it is necessary to say Y because of a BIOS bug.
1261 1265
1262 config APM_DO_ENABLE 1266 config APM_DO_ENABLE
1263 bool "Enable PM at boot time" 1267 bool "Enable PM at boot time"
1264 ---help--- 1268 ---help---
1265 Enable APM features at boot time. From page 36 of the APM BIOS 1269 Enable APM features at boot time. From page 36 of the APM BIOS
1266 specification: "When disabled, the APM BIOS does not automatically 1270 specification: "When disabled, the APM BIOS does not automatically
1267 power manage devices, enter the Standby State, enter the Suspend 1271 power manage devices, enter the Standby State, enter the Suspend
1268 State, or take power saving steps in response to CPU Idle calls." 1272 State, or take power saving steps in response to CPU Idle calls."
1269 This driver will make CPU Idle calls when Linux is idle (unless this 1273 This driver will make CPU Idle calls when Linux is idle (unless this
1270 feature is turned off -- see "Do CPU IDLE calls", below). This 1274 feature is turned off -- see "Do CPU IDLE calls", below). This
1271 should always save battery power, but more complicated APM features 1275 should always save battery power, but more complicated APM features
1272 will be dependent on your BIOS implementation. You may need to turn 1276 will be dependent on your BIOS implementation. You may need to turn
1273 this option off if your computer hangs at boot time when using APM 1277 this option off if your computer hangs at boot time when using APM
1274 support, or if it beeps continuously instead of suspending. Turn 1278 support, or if it beeps continuously instead of suspending. Turn
1275 this off if you have a NEC UltraLite Versa 33/C or a Toshiba 1279 this off if you have a NEC UltraLite Versa 33/C or a Toshiba
1276 T400CDT. This is off by default since most machines do fine without 1280 T400CDT. This is off by default since most machines do fine without
1277 this feature. 1281 this feature.
1278 1282
1279 config APM_CPU_IDLE 1283 config APM_CPU_IDLE
1280 bool "Make CPU Idle calls when idle" 1284 bool "Make CPU Idle calls when idle"
1281 help 1285 help
1282 Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. 1286 Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
1283 On some machines, this can activate improved power savings, such as 1287 On some machines, this can activate improved power savings, such as
1284 a slowed CPU clock rate, when the machine is idle. These idle calls 1288 a slowed CPU clock rate, when the machine is idle. These idle calls
1285 are made after the idle loop has run for some length of time (e.g., 1289 are made after the idle loop has run for some length of time (e.g.,
1286 333 ms). On some machines, this will cause a hang at boot time or 1290 333 ms). On some machines, this will cause a hang at boot time or
1287 whenever the CPU becomes idle. (On machines with more than one CPU, 1291 whenever the CPU becomes idle. (On machines with more than one CPU,
1288 this option does nothing.) 1292 this option does nothing.)
1289 1293
1290 config APM_DISPLAY_BLANK 1294 config APM_DISPLAY_BLANK
1291 bool "Enable console blanking using APM" 1295 bool "Enable console blanking using APM"
1292 help 1296 help
1293 Enable console blanking using the APM. Some laptops can use this to 1297 Enable console blanking using the APM. Some laptops can use this to
1294 turn off the LCD backlight when the screen blanker of the Linux 1298 turn off the LCD backlight when the screen blanker of the Linux
1295 virtual console blanks the screen. Note that this is only used by 1299 virtual console blanks the screen. Note that this is only used by
1296 the virtual console screen blanker, and won't turn off the backlight 1300 the virtual console screen blanker, and won't turn off the backlight
1297 when using the X Window system. This also doesn't have anything to 1301 when using the X Window system. This also doesn't have anything to
1298 do with your VESA-compliant power-saving monitor. Further, this 1302 do with your VESA-compliant power-saving monitor. Further, this
1299 option doesn't work for all laptops -- it might not turn off your 1303 option doesn't work for all laptops -- it might not turn off your
1300 backlight at all, or it might print a lot of errors to the console, 1304 backlight at all, or it might print a lot of errors to the console,
1301 especially if you are using gpm. 1305 especially if you are using gpm.
1302 1306
1303 config APM_ALLOW_INTS 1307 config APM_ALLOW_INTS
1304 bool "Allow interrupts during APM BIOS calls" 1308 bool "Allow interrupts during APM BIOS calls"
1305 help 1309 help
1306 Normally we disable external interrupts while we are making calls to 1310 Normally we disable external interrupts while we are making calls to
1307 the APM BIOS as a measure to lessen the effects of a badly behaving 1311 the APM BIOS as a measure to lessen the effects of a badly behaving
1308 BIOS implementation. The BIOS should reenable interrupts if it 1312 BIOS implementation. The BIOS should reenable interrupts if it
1309 needs to. Unfortunately, some BIOSes do not -- especially those in 1313 needs to. Unfortunately, some BIOSes do not -- especially those in
1310 many of the newer IBM Thinkpads. If you experience hangs when you 1314 many of the newer IBM Thinkpads. If you experience hangs when you
1311 suspend, try setting this to Y. Otherwise, say N. 1315 suspend, try setting this to Y. Otherwise, say N.
1312 1316
1313 config APM_REAL_MODE_POWER_OFF 1317 config APM_REAL_MODE_POWER_OFF
1314 bool "Use real mode APM BIOS call to power off" 1318 bool "Use real mode APM BIOS call to power off"
1315 help 1319 help
1316 Use real mode APM BIOS calls to switch off the computer. This is 1320 Use real mode APM BIOS calls to switch off the computer. This is
1317 a work-around for a number of buggy BIOSes. Switch this option on if 1321 a work-around for a number of buggy BIOSes. Switch this option on if
1318 your computer crashes instead of powering off properly. 1322 your computer crashes instead of powering off properly.
1319 1323
1320 endif # APM 1324 endif # APM
1321 1325
1322 source "arch/x86/kernel/cpu/cpufreq/Kconfig" 1326 source "arch/x86/kernel/cpu/cpufreq/Kconfig"
1323 1327
1324 source "drivers/cpuidle/Kconfig" 1328 source "drivers/cpuidle/Kconfig"
1325 1329
1326 endmenu 1330 endmenu
1327 1331
1328 1332
1329 menu "Bus options (PCI etc.)" 1333 menu "Bus options (PCI etc.)"
1330 1334
1331 config PCI 1335 config PCI
1332 bool "PCI support" if !X86_VISWS 1336 bool "PCI support" if !X86_VISWS
1333 depends on !X86_VOYAGER 1337 depends on !X86_VOYAGER
1334 default y if X86_VISWS 1338 default y if X86_VISWS
1335 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) 1339 select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
1336 help 1340 help
1337 Find out whether you have a PCI motherboard. PCI is the name of a 1341 Find out whether you have a PCI motherboard. PCI is the name of a
1338 bus system, i.e. the way the CPU talks to the other stuff inside 1342 bus system, i.e. the way the CPU talks to the other stuff inside
1339 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or 1343 your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
1340 VESA. If you have PCI, say Y, otherwise N. 1344 VESA. If you have PCI, say Y, otherwise N.
1341 1345
1342 The PCI-HOWTO, available from 1346 The PCI-HOWTO, available from
1343 <http://www.tldp.org/docs.html#howto>, contains valuable 1347 <http://www.tldp.org/docs.html#howto>, contains valuable
1344 information about which PCI hardware does work under Linux and which 1348 information about which PCI hardware does work under Linux and which
1345 doesn't. 1349 doesn't.
1346 1350
1347 choice 1351 choice
1348 prompt "PCI access mode" 1352 prompt "PCI access mode"
1349 depends on X86_32 && PCI && !X86_VISWS 1353 depends on X86_32 && PCI && !X86_VISWS
1350 default PCI_GOANY 1354 default PCI_GOANY
1351 ---help--- 1355 ---help---
1352 On PCI systems, the BIOS can be used to detect the PCI devices and 1356 On PCI systems, the BIOS can be used to detect the PCI devices and
1353 determine their configuration. However, some old PCI motherboards 1357 determine their configuration. However, some old PCI motherboards
1354 have BIOS bugs and may crash if this is done. Also, some embedded 1358 have BIOS bugs and may crash if this is done. Also, some embedded
1355 PCI-based systems don't have any BIOS at all. Linux can also try to 1359 PCI-based systems don't have any BIOS at all. Linux can also try to
1356 detect the PCI hardware directly without using the BIOS. 1360 detect the PCI hardware directly without using the BIOS.
1357 1361
1358 With this option, you can specify how Linux should detect the 1362 With this option, you can specify how Linux should detect the
1359 PCI devices. If you choose "BIOS", the BIOS will be used, 1363 PCI devices. If you choose "BIOS", the BIOS will be used,
1360 if you choose "Direct", the BIOS won't be used, and if you 1364 if you choose "Direct", the BIOS won't be used, and if you
1361 choose "MMConfig", then PCI Express MMCONFIG will be used. 1365 choose "MMConfig", then PCI Express MMCONFIG will be used.
1362 If you choose "Any", the kernel will try MMCONFIG, then the 1366 If you choose "Any", the kernel will try MMCONFIG, then the
1363 direct access method and fall back to the BIOS if that doesn't 1367 direct access method and fall back to the BIOS if that doesn't
1364 work. If unsure, go with the default, which is "Any". 1368 work. If unsure, go with the default, which is "Any".
1365 1369
1366 config PCI_GOBIOS 1370 config PCI_GOBIOS
1367 bool "BIOS" 1371 bool "BIOS"
1368 1372
1369 config PCI_GOMMCONFIG 1373 config PCI_GOMMCONFIG
1370 bool "MMConfig" 1374 bool "MMConfig"
1371 1375
1372 config PCI_GODIRECT 1376 config PCI_GODIRECT
1373 bool "Direct" 1377 bool "Direct"
1374 1378
1375 config PCI_GOANY 1379 config PCI_GOANY
1376 bool "Any" 1380 bool "Any"
1377 1381
1378 endchoice 1382 endchoice
1379 1383
1380 config PCI_BIOS 1384 config PCI_BIOS
1381 def_bool y 1385 def_bool y
1382 depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) 1386 depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
1383 1387
1384 # x86-64 doesn't support PCI BIOS access from long mode so always go direct. 1388 # x86-64 doesn't support PCI BIOS access from long mode so always go direct.
1385 config PCI_DIRECT 1389 config PCI_DIRECT
1386 def_bool y 1390 def_bool y
1387 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS) 1391 depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS)
1388 1392
1389 config PCI_MMCONFIG 1393 config PCI_MMCONFIG
1390 def_bool y 1394 def_bool y
1391 depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) 1395 depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
1392 1396
1393 config PCI_DOMAINS 1397 config PCI_DOMAINS
1394 def_bool y 1398 def_bool y
1395 depends on PCI 1399 depends on PCI
1396 1400
1397 config PCI_MMCONFIG 1401 config PCI_MMCONFIG
1398 bool "Support mmconfig PCI config space access" 1402 bool "Support mmconfig PCI config space access"
1399 depends on X86_64 && PCI && ACPI 1403 depends on X86_64 && PCI && ACPI
1400 1404
1401 config DMAR 1405 config DMAR
1402 bool "Support for DMA Remapping Devices (EXPERIMENTAL)" 1406 bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
1403 depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL 1407 depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
1404 help 1408 help
1405 DMA remapping (DMAR) device support enables independent address 1409 DMA remapping (DMAR) device support enables independent address
1406 translations for Direct Memory Access (DMA) from devices. 1410 translations for Direct Memory Access (DMA) from devices.
1407 These DMA remapping devices are reported via ACPI tables 1411 These DMA remapping devices are reported via ACPI tables
1408 and include PCI device scope covered by these DMA 1412 and include PCI device scope covered by these DMA
1409 remapping devices. 1413 remapping devices.
1410 1414
1411 config DMAR_GFX_WA 1415 config DMAR_GFX_WA
1412 def_bool y 1416 def_bool y
1413 prompt "Support for Graphics workaround" 1417 prompt "Support for Graphics workaround"
1414 depends on DMAR 1418 depends on DMAR
1415 help 1419 help
1416 Current graphics drivers tend to use physical addresses 1420 Current graphics drivers tend to use physical addresses
1417 for DMA and avoid using DMA APIs. Setting this config 1421 for DMA and avoid using DMA APIs. Setting this config
1418 option permits the IOMMU driver to set a unity map for 1422 option permits the IOMMU driver to set a unity map for
1419 all the OS-visible memory. Hence the driver can continue 1423 all the OS-visible memory. Hence the driver can continue
1420 to use physical addresses for DMA. 1424 to use physical addresses for DMA.
1421 1425
1422 config DMAR_FLOPPY_WA 1426 config DMAR_FLOPPY_WA
1423 def_bool y 1427 def_bool y
1424 depends on DMAR 1428 depends on DMAR
1425 help 1429 help
1426 Floppy disk drivers are known to bypass DMA API calls 1430 Floppy disk drivers are known to bypass DMA API calls
1427 thereby failing to work when the IOMMU is enabled. This 1431 thereby failing to work when the IOMMU is enabled. This
1428 workaround will set up a 1:1 mapping for the first 1432 workaround will set up a 1:1 mapping for the first
1429 16M to make floppy (an ISA device) work. 1433 16M to make floppy (an ISA device) work.
1430 1434
1431 source "drivers/pci/pcie/Kconfig" 1435 source "drivers/pci/pcie/Kconfig"
1432 1436
1433 source "drivers/pci/Kconfig" 1437 source "drivers/pci/Kconfig"
1434 1438
1435 # x86_64 has no ISA slots, but does have ISA-style DMA. 1439 # x86_64 has no ISA slots, but does have ISA-style DMA.
1436 config ISA_DMA_API 1440 config ISA_DMA_API
1437 def_bool y 1441 def_bool y
1438 1442
1439 if X86_32 1443 if X86_32
1440 1444
1441 config ISA 1445 config ISA
1442 bool "ISA support" 1446 bool "ISA support"
1443 depends on !(X86_VOYAGER || X86_VISWS) 1447 depends on !(X86_VOYAGER || X86_VISWS)
1444 help 1448 help
1445 Find out whether you have ISA slots on your motherboard. ISA is the 1449 Find out whether you have ISA slots on your motherboard. ISA is the
1446 name of a bus system, i.e. the way the CPU talks to the other stuff 1450 name of a bus system, i.e. the way the CPU talks to the other stuff
1447 inside your box. Other bus systems are PCI, EISA, MicroChannel 1451 inside your box. Other bus systems are PCI, EISA, MicroChannel
1448 (MCA) or VESA. ISA is an older system, now being displaced by PCI; 1452 (MCA) or VESA. ISA is an older system, now being displaced by PCI;
1449 newer boards don't support it. If you have ISA, say Y, otherwise N. 1453 newer boards don't support it. If you have ISA, say Y, otherwise N.
1450 1454
1451 config EISA 1455 config EISA
1452 bool "EISA support" 1456 bool "EISA support"
1453 depends on ISA 1457 depends on ISA
1454 ---help--- 1458 ---help---
1455 The Extended Industry Standard Architecture (EISA) bus was 1459 The Extended Industry Standard Architecture (EISA) bus was
1456 developed as an open alternative to the IBM MicroChannel bus. 1460 developed as an open alternative to the IBM MicroChannel bus.
1457 1461
1458 The EISA bus provided some of the features of the IBM MicroChannel 1462 The EISA bus provided some of the features of the IBM MicroChannel
1459 bus while maintaining backward compatibility with cards made for 1463 bus while maintaining backward compatibility with cards made for
1460 the older ISA bus. The EISA bus saw limited use between 1988 and 1464 the older ISA bus. The EISA bus saw limited use between 1988 and
1461 1995 when it was made obsolete by the PCI bus. 1465 1995 when it was made obsolete by the PCI bus.
1462 1466
1463 Say Y here if you are building a kernel for an EISA-based machine. 1467 Say Y here if you are building a kernel for an EISA-based machine.
1464 1468
1465 Otherwise, say N. 1469 Otherwise, say N.
1466 1470
1467 source "drivers/eisa/Kconfig" 1471 source "drivers/eisa/Kconfig"
1468 1472
1469 config MCA 1473 config MCA
1470 bool "MCA support" if !(X86_VISWS || X86_VOYAGER) 1474 bool "MCA support" if !(X86_VISWS || X86_VOYAGER)
1471 default y if X86_VOYAGER 1475 default y if X86_VOYAGER
1472 help 1476 help
1473 MicroChannel Architecture is found in some IBM PS/2 machines and 1477 MicroChannel Architecture is found in some IBM PS/2 machines and
1474 laptops. It is a bus system similar to PCI or ISA. See 1478 laptops. It is a bus system similar to PCI or ISA. See
1475 <file:Documentation/mca.txt> (and especially the web page given 1479 <file:Documentation/mca.txt> (and especially the web page given
1476 there) before attempting to build an MCA bus kernel. 1480 there) before attempting to build an MCA bus kernel.
1477 1481
1478 source "drivers/mca/Kconfig" 1482 source "drivers/mca/Kconfig"
1479 1483
1480 config SCx200 1484 config SCx200
1481 tristate "NatSemi SCx200 support" 1485 tristate "NatSemi SCx200 support"
1482 depends on !X86_VOYAGER 1486 depends on !X86_VOYAGER
1483 help 1487 help
1484 This provides basic support for National Semiconductor's 1488 This provides basic support for National Semiconductor's
1485 (now AMD's) Geode processors. The driver probes for the 1489 (now AMD's) Geode processors. The driver probes for the
1486 PCI-IDs of several on-chip devices, so it's a good dependency 1490 PCI-IDs of several on-chip devices, so it's a good dependency
1487 for other scx200_* drivers. 1491 for other scx200_* drivers.
1488 1492
1489 If compiled as a module, the driver is named scx200. 1493 If compiled as a module, the driver is named scx200.
1490 1494
1491 config SCx200HR_TIMER 1495 config SCx200HR_TIMER
1492 tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" 1496 tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
1493 depends on SCx200 && GENERIC_TIME 1497 depends on SCx200 && GENERIC_TIME
1494 default y 1498 default y
1495 help 1499 help
1496 This driver provides a clocksource built upon the on-chip 1500 This driver provides a clocksource built upon the on-chip
1497 27MHz high-resolution timer. It's also a workaround for the 1501 27MHz high-resolution timer. It's also a workaround for the
1498 NSC Geode SC-1100's buggy TSC, which loses time when the 1502 NSC Geode SC-1100's buggy TSC, which loses time when the
1499 processor goes idle (as is done by the scheduler). The 1503 processor goes idle (as is done by the scheduler). The
1500 other workaround is the idle=poll boot option. 1504 other workaround is the idle=poll boot option.
1501 1505
1502 config GEODE_MFGPT_TIMER 1506 config GEODE_MFGPT_TIMER
1503 def_bool y 1507 def_bool y
1504 prompt "Geode Multi-Function General Purpose Timer (MFGPT) events" 1508 prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
1505 depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS 1509 depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
1506 help 1510 help
1507 This driver provides a clock event source based on the MFGPT 1511 This driver provides a clock event source based on the MFGPT
1508 timer(s) in the CS5535 and CS5536 companion chips for the Geode. 1512 timer(s) in the CS5535 and CS5536 companion chips for the Geode.
1509 MFGPTs have a better resolution and max interval than the 1513 MFGPTs have a better resolution and max interval than the
1510 generic PIT, and are suitable for use as high-res timers. 1514 generic PIT, and are suitable for use as high-res timers.
1511 1515
1512 endif # X86_32 1516 endif # X86_32
1513 1517
1514 config K8_NB 1518 config K8_NB
1515 def_bool y 1519 def_bool y
1516 depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) 1520 depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA)))
1517 1521
1518 source "drivers/pcmcia/Kconfig" 1522 source "drivers/pcmcia/Kconfig"
1519 1523
1520 source "drivers/pci/hotplug/Kconfig" 1524 source "drivers/pci/hotplug/Kconfig"
1521 1525
1522 endmenu 1526 endmenu
1523 1527
1524 1528
1525 menu "Executable file formats / Emulations" 1529 menu "Executable file formats / Emulations"
1526 1530
1527 source "fs/Kconfig.binfmt" 1531 source "fs/Kconfig.binfmt"
1528 1532
1529 config IA32_EMULATION 1533 config IA32_EMULATION
1530 bool "IA32 Emulation" 1534 bool "IA32 Emulation"
1531 depends on X86_64 1535 depends on X86_64
1532 help 1536 help
1533 Include code to run 32-bit programs under a 64-bit kernel. You should 1537 Include code to run 32-bit programs under a 64-bit kernel. You should
1534 likely turn this on, unless you're 100% sure that you don't have any 1538 likely turn this on, unless you're 100% sure that you don't have any
1535 32-bit programs left. 1539 32-bit programs left.
1536 1540
1537 config IA32_AOUT 1541 config IA32_AOUT
1538 tristate "IA32 a.out support" 1542 tristate "IA32 a.out support"
1539 depends on IA32_EMULATION 1543 depends on IA32_EMULATION
1540 help 1544 help
1541 Support old a.out binaries in the 32bit emulation. 1545 Support old a.out binaries in the 32bit emulation.
1542 1546
1543 config COMPAT 1547 config COMPAT
1544 def_bool y 1548 def_bool y
1545 depends on IA32_EMULATION 1549 depends on IA32_EMULATION
1546 1550
1547 config COMPAT_FOR_U64_ALIGNMENT 1551 config COMPAT_FOR_U64_ALIGNMENT
1548 def_bool COMPAT 1552 def_bool COMPAT
1549 depends on X86_64 1553 depends on X86_64
1550 1554
1551 config SYSVIPC_COMPAT 1555 config SYSVIPC_COMPAT
1552 def_bool y 1556 def_bool y
1553 depends on X86_64 && COMPAT && SYSVIPC 1557 depends on X86_64 && COMPAT && SYSVIPC
1554 1558
1555 endmenu 1559 endmenu
1556 1560
1557 1561
1558 source "net/Kconfig" 1562 source "net/Kconfig"
1559 1563
1560 source "drivers/Kconfig" 1564 source "drivers/Kconfig"
1561 1565
1562 source "drivers/firmware/Kconfig" 1566 source "drivers/firmware/Kconfig"
1563 1567
1564 source "fs/Kconfig" 1568 source "fs/Kconfig"
1565 1569
1566 source "kernel/Kconfig.instrumentation" 1570 source "kernel/Kconfig.instrumentation"
1567 1571
1568 source "arch/x86/Kconfig.debug" 1572 source "arch/x86/Kconfig.debug"
1569 1573
1570 source "security/Kconfig" 1574 source "security/Kconfig"
1571 1575
1572 source "crypto/Kconfig" 1576 source "crypto/Kconfig"
1573 1577
1574 source "lib/Kconfig" 1578 source "lib/Kconfig"
1575 1579
1 /* 1 /*
2 * linux/fs/jbd/checkpoint.c 2 * linux/fs/jbd/checkpoint.c
3 * 3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 * 5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved 6 * Copyright 1999 Red Hat Software --- All Rights Reserved
7 * 7 *
8 * This file is part of the Linux kernel and is made available under 8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your 9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 10 * option, any later version, incorporated herein by reference.
11 * 11 *
12 * Checkpoint routines for the generic filesystem journaling code. 12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system. 13 * Part of the ext2fs journaling system.
14 * 14 *
15 * Checkpointing is the process of ensuring that a section of the log is 15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be 16 * committed fully to disk, so that that portion of the log can be
17 * reused. 17 * reused.
18 */ 18 */
19 19
20 #include <linux/time.h> 20 #include <linux/time.h>
21 #include <linux/fs.h> 21 #include <linux/fs.h>
22 #include <linux/jbd.h> 22 #include <linux/jbd.h>
23 #include <linux/errno.h> 23 #include <linux/errno.h>
24 #include <linux/slab.h> 24 #include <linux/slab.h>
25 25
26 /* 26 /*
27 * Unlink a buffer from a transaction checkpoint list. 27 * Unlink a buffer from a transaction checkpoint list.
28 * 28 *
29 * Called with j_list_lock held. 29 * Called with j_list_lock held.
30 */ 30 */
31 static inline void __buffer_unlink_first(struct journal_head *jh) 31 static inline void __buffer_unlink_first(struct journal_head *jh)
32 { 32 {
33 transaction_t *transaction = jh->b_cp_transaction; 33 transaction_t *transaction = jh->b_cp_transaction;
34 34
35 jh->b_cpnext->b_cpprev = jh->b_cpprev; 35 jh->b_cpnext->b_cpprev = jh->b_cpprev;
36 jh->b_cpprev->b_cpnext = jh->b_cpnext; 36 jh->b_cpprev->b_cpnext = jh->b_cpnext;
37 if (transaction->t_checkpoint_list == jh) { 37 if (transaction->t_checkpoint_list == jh) {
38 transaction->t_checkpoint_list = jh->b_cpnext; 38 transaction->t_checkpoint_list = jh->b_cpnext;
39 if (transaction->t_checkpoint_list == jh) 39 if (transaction->t_checkpoint_list == jh)
40 transaction->t_checkpoint_list = NULL; 40 transaction->t_checkpoint_list = NULL;
41 } 41 }
42 } 42 }
43 43
44 /* 44 /*
45 * Unlink a buffer from a transaction checkpoint(io) list. 45 * Unlink a buffer from a transaction checkpoint(io) list.
46 * 46 *
47 * Called with j_list_lock held. 47 * Called with j_list_lock held.
48 */ 48 */
49 static inline void __buffer_unlink(struct journal_head *jh) 49 static inline void __buffer_unlink(struct journal_head *jh)
50 { 50 {
51 transaction_t *transaction = jh->b_cp_transaction; 51 transaction_t *transaction = jh->b_cp_transaction;
52 52
53 __buffer_unlink_first(jh); 53 __buffer_unlink_first(jh);
54 if (transaction->t_checkpoint_io_list == jh) { 54 if (transaction->t_checkpoint_io_list == jh) {
55 transaction->t_checkpoint_io_list = jh->b_cpnext; 55 transaction->t_checkpoint_io_list = jh->b_cpnext;
56 if (transaction->t_checkpoint_io_list == jh) 56 if (transaction->t_checkpoint_io_list == jh)
57 transaction->t_checkpoint_io_list = NULL; 57 transaction->t_checkpoint_io_list = NULL;
58 } 58 }
59 } 59 }
60 60
61 /* 61 /*
62 * Move a buffer from the checkpoint list to the checkpoint io list 62 * Move a buffer from the checkpoint list to the checkpoint io list
63 * 63 *
64 * Called with j_list_lock held 64 * Called with j_list_lock held
65 */ 65 */
66 static inline void __buffer_relink_io(struct journal_head *jh) 66 static inline void __buffer_relink_io(struct journal_head *jh)
67 { 67 {
68 transaction_t *transaction = jh->b_cp_transaction; 68 transaction_t *transaction = jh->b_cp_transaction;
69 69
70 __buffer_unlink_first(jh); 70 __buffer_unlink_first(jh);
71 71
72 if (!transaction->t_checkpoint_io_list) { 72 if (!transaction->t_checkpoint_io_list) {
73 jh->b_cpnext = jh->b_cpprev = jh; 73 jh->b_cpnext = jh->b_cpprev = jh;
74 } else { 74 } else {
75 jh->b_cpnext = transaction->t_checkpoint_io_list; 75 jh->b_cpnext = transaction->t_checkpoint_io_list;
76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; 76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
77 jh->b_cpprev->b_cpnext = jh; 77 jh->b_cpprev->b_cpnext = jh;
78 jh->b_cpnext->b_cpprev = jh; 78 jh->b_cpnext->b_cpprev = jh;
79 } 79 }
80 transaction->t_checkpoint_io_list = jh; 80 transaction->t_checkpoint_io_list = jh;
81 } 81 }
82 82
83 /* 83 /*
84 * Try to release a checkpointed buffer from its transaction. 84 * Try to release a checkpointed buffer from its transaction.
85 * Returns 1 if we released it and 2 if we also released the 85 * Returns 1 if we released it and 2 if we also released the
86 * whole transaction. 86 * whole transaction.
87 * 87 *
88 * Requires j_list_lock 88 * Requires j_list_lock
89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
90 */ 90 */
91 static int __try_to_free_cp_buf(struct journal_head *jh) 91 static int __try_to_free_cp_buf(struct journal_head *jh)
92 { 92 {
93 int ret = 0; 93 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh); 94 struct buffer_head *bh = jh2bh(jh);
95 95
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 97 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __journal_remove_checkpoint(jh) + 1; 98 ret = __journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh); 99 jbd_unlock_bh_state(bh);
100 journal_remove_journal_head(bh); 100 journal_remove_journal_head(bh);
101 BUFFER_TRACE(bh, "release"); 101 BUFFER_TRACE(bh, "release");
102 __brelse(bh); 102 __brelse(bh);
103 } else { 103 } else {
104 jbd_unlock_bh_state(bh); 104 jbd_unlock_bh_state(bh);
105 } 105 }
106 return ret; 106 return ret;
107 } 107 }
108 108
109 /* 109 /*
110 * __log_wait_for_space: wait until there is space in the journal. 110 * __log_wait_for_space: wait until there is space in the journal.
111 * 111 *
112 * Called under j_state_lock *only*. It will be unlocked if we have to wait 112 * Called under j_state_lock *only*. It will be unlocked if we have to wait
113 * for a checkpoint to free up some space in the log. 113 * for a checkpoint to free up some space in the log.
114 */ 114 */
115 void __log_wait_for_space(journal_t *journal) 115 void __log_wait_for_space(journal_t *journal)
116 { 116 {
117 int nblocks; 117 int nblocks;
118 assert_spin_locked(&journal->j_state_lock); 118 assert_spin_locked(&journal->j_state_lock);
119 119
120 nblocks = jbd_space_needed(journal); 120 nblocks = jbd_space_needed(journal);
121 while (__log_space_left(journal) < nblocks) { 121 while (__log_space_left(journal) < nblocks) {
122 if (journal->j_flags & JFS_ABORT) 122 if (journal->j_flags & JFS_ABORT)
123 return; 123 return;
124 spin_unlock(&journal->j_state_lock); 124 spin_unlock(&journal->j_state_lock);
125 mutex_lock(&journal->j_checkpoint_mutex); 125 mutex_lock(&journal->j_checkpoint_mutex);
126 126
127 /* 127 /*
128 * Test again, another process may have checkpointed while we 128 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock 129 * were waiting for the checkpoint lock
130 */ 130 */
131 spin_lock(&journal->j_state_lock); 131 spin_lock(&journal->j_state_lock);
132 nblocks = jbd_space_needed(journal); 132 nblocks = jbd_space_needed(journal);
133 if (__log_space_left(journal) < nblocks) { 133 if (__log_space_left(journal) < nblocks) {
134 spin_unlock(&journal->j_state_lock); 134 spin_unlock(&journal->j_state_lock);
135 log_do_checkpoint(journal); 135 log_do_checkpoint(journal);
136 spin_lock(&journal->j_state_lock); 136 spin_lock(&journal->j_state_lock);
137 } 137 }
138 mutex_unlock(&journal->j_checkpoint_mutex); 138 mutex_unlock(&journal->j_checkpoint_mutex);
139 } 139 }
140 } 140 }
141 141
142 /* 142 /*
143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. 143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
144 * The caller must restart a list walk. Wait for someone else to run 144 * The caller must restart a list walk. Wait for someone else to run
145 * jbd_unlock_bh_state(). 145 * jbd_unlock_bh_state().
146 */ 146 */
147 static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) 147 static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
148 __releases(journal->j_list_lock) 148 __releases(journal->j_list_lock)
149 { 149 {
150 get_bh(bh); 150 get_bh(bh);
151 spin_unlock(&journal->j_list_lock); 151 spin_unlock(&journal->j_list_lock);
152 jbd_lock_bh_state(bh); 152 jbd_lock_bh_state(bh);
153 jbd_unlock_bh_state(bh); 153 jbd_unlock_bh_state(bh);
154 put_bh(bh); 154 put_bh(bh);
155 } 155 }
156 156
157 /* 157 /*
158 * Clean up transaction's list of buffers submitted for io. 158 * Clean up transaction's list of buffers submitted for io.
159 * We wait for any pending IO to complete and remove any clean 159 * We wait for any pending IO to complete and remove any clean
160 * buffers. Note that we take the buffers in the opposite ordering 160 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO. 161 * from the one in which they were submitted for IO.
162 * 162 *
163 * Called with j_list_lock held. 163 * Called with j_list_lock held.
164 */ 164 */
165 static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 165 static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
166 { 166 {
167 struct journal_head *jh; 167 struct journal_head *jh;
168 struct buffer_head *bh; 168 struct buffer_head *bh;
169 tid_t this_tid; 169 tid_t this_tid;
170 int released = 0; 170 int released = 0;
171 171
172 this_tid = transaction->t_tid; 172 this_tid = transaction->t_tid;
173 restart: 173 restart:
174 /* Did somebody clean up the transaction in the meanwhile? */ 174 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction || 175 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid) 176 transaction->t_tid != this_tid)
177 return; 177 return;
178 while (!released && transaction->t_checkpoint_io_list) { 178 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list; 179 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh); 180 bh = jh2bh(jh);
181 if (!jbd_trylock_bh_state(bh)) { 181 if (!jbd_trylock_bh_state(bh)) {
182 jbd_sync_bh(journal, bh); 182 jbd_sync_bh(journal, bh);
183 spin_lock(&journal->j_list_lock); 183 spin_lock(&journal->j_list_lock);
184 goto restart; 184 goto restart;
185 } 185 }
186 if (buffer_locked(bh)) { 186 if (buffer_locked(bh)) {
187 atomic_inc(&bh->b_count); 187 atomic_inc(&bh->b_count);
188 spin_unlock(&journal->j_list_lock); 188 spin_unlock(&journal->j_list_lock);
189 jbd_unlock_bh_state(bh); 189 jbd_unlock_bh_state(bh);
190 wait_on_buffer(bh); 190 wait_on_buffer(bh);
191 /* the journal_head may have gone by now */ 191 /* the journal_head may have gone by now */
192 BUFFER_TRACE(bh, "brelse"); 192 BUFFER_TRACE(bh, "brelse");
193 __brelse(bh); 193 __brelse(bh);
194 spin_lock(&journal->j_list_lock); 194 spin_lock(&journal->j_list_lock);
195 goto restart; 195 goto restart;
196 } 196 }
197 /* 197 /*
198 * Now in whatever state the buffer currently is, we know that 198 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list 199 * it has been written out and so we can drop it from the list
200 */ 200 */
201 released = __journal_remove_checkpoint(jh); 201 released = __journal_remove_checkpoint(jh);
202 jbd_unlock_bh_state(bh); 202 jbd_unlock_bh_state(bh);
203 journal_remove_journal_head(bh); 203 journal_remove_journal_head(bh);
204 __brelse(bh); 204 __brelse(bh);
205 } 205 }
206 } 206 }
207 207
208 #define NR_BATCH 64 208 #define NR_BATCH 64
209 209
210 static void 210 static void
211 __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 211 __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
212 { 212 {
213 int i; 213 int i;
214 214
215 ll_rw_block(SWRITE, *batch_count, bhs); 215 ll_rw_block(SWRITE, *batch_count, bhs);
216 for (i = 0; i < *batch_count; i++) { 216 for (i = 0; i < *batch_count; i++) {
217 struct buffer_head *bh = bhs[i]; 217 struct buffer_head *bh = bhs[i];
218 clear_buffer_jwrite(bh); 218 clear_buffer_jwrite(bh);
219 BUFFER_TRACE(bh, "brelse"); 219 BUFFER_TRACE(bh, "brelse");
220 __brelse(bh); 220 __brelse(bh);
221 } 221 }
222 *batch_count = 0; 222 *batch_count = 0;
223 } 223 }
224 224
225 /* 225 /*
226 * Try to flush one buffer from the checkpoint list to disk. 226 * Try to flush one buffer from the checkpoint list to disk.
227 * 227 *
228 * Return 1 if something happened which requires us to abort the current 228 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list. 229 * scan of the checkpoint list.
230 * 230 *
231 * Called with j_list_lock held and drops it if 1 is returned 231 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
233 */ 233 */
234 static int __process_buffer(journal_t *journal, struct journal_head *jh, 234 static int __process_buffer(journal_t *journal, struct journal_head *jh,
235 struct buffer_head **bhs, int *batch_count) 235 struct buffer_head **bhs, int *batch_count)
236 { 236 {
237 struct buffer_head *bh = jh2bh(jh); 237 struct buffer_head *bh = jh2bh(jh);
238 int ret = 0; 238 int ret = 0;
239 239
240 if (buffer_locked(bh)) { 240 if (buffer_locked(bh)) {
241 atomic_inc(&bh->b_count); 241 atomic_inc(&bh->b_count);
242 spin_unlock(&journal->j_list_lock); 242 spin_unlock(&journal->j_list_lock);
243 jbd_unlock_bh_state(bh); 243 jbd_unlock_bh_state(bh);
244 wait_on_buffer(bh); 244 wait_on_buffer(bh);
245 /* the journal_head may have gone by now */ 245 /* the journal_head may have gone by now */
246 BUFFER_TRACE(bh, "brelse"); 246 BUFFER_TRACE(bh, "brelse");
247 __brelse(bh); 247 __brelse(bh);
248 ret = 1; 248 ret = 1;
249 } else if (jh->b_transaction != NULL) { 249 } else if (jh->b_transaction != NULL) {
250 transaction_t *t = jh->b_transaction; 250 transaction_t *t = jh->b_transaction;
251 tid_t tid = t->t_tid; 251 tid_t tid = t->t_tid;
252 252
253 spin_unlock(&journal->j_list_lock); 253 spin_unlock(&journal->j_list_lock);
254 jbd_unlock_bh_state(bh); 254 jbd_unlock_bh_state(bh);
255 log_start_commit(journal, tid); 255 log_start_commit(journal, tid);
256 log_wait_commit(journal, tid); 256 log_wait_commit(journal, tid);
257 ret = 1; 257 ret = 1;
258 } else if (!buffer_dirty(bh)) { 258 } else if (!buffer_dirty(bh)) {
259 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 259 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
260 BUFFER_TRACE(bh, "remove from checkpoint"); 260 BUFFER_TRACE(bh, "remove from checkpoint");
261 __journal_remove_checkpoint(jh); 261 __journal_remove_checkpoint(jh);
262 spin_unlock(&journal->j_list_lock); 262 spin_unlock(&journal->j_list_lock);
263 jbd_unlock_bh_state(bh); 263 jbd_unlock_bh_state(bh);
264 journal_remove_journal_head(bh); 264 journal_remove_journal_head(bh);
265 __brelse(bh); 265 __brelse(bh);
266 ret = 1; 266 ret = 1;
267 } else { 267 } else {
268 /* 268 /*
269 * Important: we are about to write the buffer, and 269 * Important: we are about to write the buffer, and
270 * possibly block, while still holding the journal lock. 270 * possibly block, while still holding the journal lock.
271 * We cannot afford to let the transaction logic start 271 * We cannot afford to let the transaction logic start
272 * messing around with this buffer before we write it to 272 * messing around with this buffer before we write it to
273 * disk, as that would break recoverability. 273 * disk, as that would break recoverability.
274 */ 274 */
275 BUFFER_TRACE(bh, "queue"); 275 BUFFER_TRACE(bh, "queue");
276 get_bh(bh); 276 get_bh(bh);
277 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 277 J_ASSERT_BH(bh, !buffer_jwrite(bh));
278 set_buffer_jwrite(bh); 278 set_buffer_jwrite(bh);
279 bhs[*batch_count] = bh; 279 bhs[*batch_count] = bh;
280 __buffer_relink_io(jh); 280 __buffer_relink_io(jh);
281 jbd_unlock_bh_state(bh); 281 jbd_unlock_bh_state(bh);
282 (*batch_count)++; 282 (*batch_count)++;
283 if (*batch_count == NR_BATCH) { 283 if (*batch_count == NR_BATCH) {
284 spin_unlock(&journal->j_list_lock); 284 spin_unlock(&journal->j_list_lock);
285 __flush_batch(journal, bhs, batch_count); 285 __flush_batch(journal, bhs, batch_count);
286 ret = 1; 286 ret = 1;
287 } 287 }
288 } 288 }
289 return ret; 289 return ret;
290 } 290 }
291 291
292 /* 292 /*
293 * Perform an actual checkpoint. We take the first transaction on the 293 * Perform an actual checkpoint. We take the first transaction on the
294 * list of transactions to be checkpointed and send all its buffers 294 * list of transactions to be checkpointed and send all its buffers
295 * to disk. We submit larger chunks of data at once. 295 * to disk. We submit larger chunks of data at once.
296 * 296 *
297 * The journal should be locked before calling this function. 297 * The journal should be locked before calling this function.
298 */ 298 */
299 int log_do_checkpoint(journal_t *journal) 299 int log_do_checkpoint(journal_t *journal)
300 { 300 {
301 transaction_t *transaction; 301 transaction_t *transaction;
302 tid_t this_tid; 302 tid_t this_tid;
303 int result; 303 int result;
304 304
305 jbd_debug(1, "Start checkpoint\n"); 305 jbd_debug(1, "Start checkpoint\n");
306 306
307 /* 307 /*
308 * First thing: if there are any transactions in the log which 308 * First thing: if there are any transactions in the log which
309 * don't need checkpointing, just eliminate them from the 309 * don't need checkpointing, just eliminate them from the
310 * journal straight away. 310 * journal straight away.
311 */ 311 */
312 result = cleanup_journal_tail(journal); 312 result = cleanup_journal_tail(journal);
313 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 313 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
314 if (result <= 0) 314 if (result <= 0)
315 return result; 315 return result;
316 316
317 /* 317 /*
318 * OK, we need to start writing disk blocks. Take one transaction 318 * OK, we need to start writing disk blocks. Take one transaction
319 * and write it. 319 * and write it.
320 */ 320 */
321 spin_lock(&journal->j_list_lock); 321 spin_lock(&journal->j_list_lock);
322 if (!journal->j_checkpoint_transactions) 322 if (!journal->j_checkpoint_transactions)
323 goto out; 323 goto out;
324 transaction = journal->j_checkpoint_transactions; 324 transaction = journal->j_checkpoint_transactions;
325 this_tid = transaction->t_tid; 325 this_tid = transaction->t_tid;
326 restart: 326 restart:
327 /* 327 /*
328 * If someone cleaned up this transaction while we slept, we're 328 * If someone cleaned up this transaction while we slept, we're
329 * done (maybe it's a new transaction, but it fell at the same 329 * done (maybe it's a new transaction, but it fell at the same
330 * address). 330 * address).
331 */ 331 */
332 if (journal->j_checkpoint_transactions == transaction && 332 if (journal->j_checkpoint_transactions == transaction &&
333 transaction->t_tid == this_tid) { 333 transaction->t_tid == this_tid) {
334 int batch_count = 0; 334 int batch_count = 0;
335 struct buffer_head *bhs[NR_BATCH]; 335 struct buffer_head *bhs[NR_BATCH];
336 struct journal_head *jh; 336 struct journal_head *jh;
337 int retry = 0; 337 int retry = 0;
338 338
339 while (!retry && transaction->t_checkpoint_list) { 339 while (!retry && transaction->t_checkpoint_list) {
340 struct buffer_head *bh; 340 struct buffer_head *bh;
341 341
342 jh = transaction->t_checkpoint_list; 342 jh = transaction->t_checkpoint_list;
343 bh = jh2bh(jh); 343 bh = jh2bh(jh);
344 if (!jbd_trylock_bh_state(bh)) { 344 if (!jbd_trylock_bh_state(bh)) {
345 jbd_sync_bh(journal, bh); 345 jbd_sync_bh(journal, bh);
346 retry = 1; 346 retry = 1;
347 break; 347 break;
348 } 348 }
349 retry = __process_buffer(journal, jh, bhs,&batch_count); 349 retry = __process_buffer(journal, jh, bhs,&batch_count);
350 if (!retry && lock_need_resched(&journal->j_list_lock)){ 350 if (!retry && (need_resched() ||
351 spin_needbreak(&journal->j_list_lock))) {
351 spin_unlock(&journal->j_list_lock); 352 spin_unlock(&journal->j_list_lock);
352 retry = 1; 353 retry = 1;
353 break; 354 break;
354 } 355 }
355 } 356 }
356 357
357 if (batch_count) { 358 if (batch_count) {
358 if (!retry) { 359 if (!retry) {
359 spin_unlock(&journal->j_list_lock); 360 spin_unlock(&journal->j_list_lock);
360 retry = 1; 361 retry = 1;
361 } 362 }
362 __flush_batch(journal, bhs, &batch_count); 363 __flush_batch(journal, bhs, &batch_count);
363 } 364 }
364 365
365 if (retry) { 366 if (retry) {
366 spin_lock(&journal->j_list_lock); 367 spin_lock(&journal->j_list_lock);
367 goto restart; 368 goto restart;
368 } 369 }
369 /* 370 /*
370 * Now we have cleaned up the first transaction's checkpoint 371 * Now we have cleaned up the first transaction's checkpoint
371 * list. Let's clean up the second one 372 * list. Let's clean up the second one
372 */ 373 */
373 __wait_cp_io(journal, transaction); 374 __wait_cp_io(journal, transaction);
374 } 375 }
375 out: 376 out:
376 spin_unlock(&journal->j_list_lock); 377 spin_unlock(&journal->j_list_lock);
377 result = cleanup_journal_tail(journal); 378 result = cleanup_journal_tail(journal);
378 if (result < 0) 379 if (result < 0)
379 return result; 380 return result;
380 return 0; 381 return 0;
381 } 382 }
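For context, the hunk above is the part of this commit that touches jbd: the old lock_need_resched() test on j_list_lock becomes need_resched() || spin_needbreak(), i.e. the lock is dropped either when this task should reschedule or when another CPU is waiting for the lock. Below is a minimal sketch of that pattern in isolation; more_work() and do_one_item() are hypothetical stand-ins for the real checkpoint-list walk, not jbd functions.

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/sched.h>

static bool more_work(void);	/* hypothetical: is there anything left to process? */
static void do_one_item(void);	/* hypothetical: process one list entry */

static void scan_with_lockbreak(spinlock_t *lock)
{
	spin_lock(lock);
	while (more_work()) {
		do_one_item();
		/* Give up the lock if we should reschedule or someone is spinning on it. */
		if (need_resched() || spin_needbreak(lock)) {
			spin_unlock(lock);
			cond_resched();
			spin_lock(lock);
		}
	}
	spin_unlock(lock);
}

The loop in log_do_checkpoint() above does the same thing, except that instead of relocking in place it sets retry = 1, drops j_list_lock, and restarts the scan from the top.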
382 383
383 /* 384 /*
384 * Check the list of checkpoint transactions for the journal to see if 385 * Check the list of checkpoint transactions for the journal to see if
385 * we have already got rid of any since the last update of the log tail 386 * we have already got rid of any since the last update of the log tail
386 * in the journal superblock. If so, we can instantly roll the 387 * in the journal superblock. If so, we can instantly roll the
387 * superblock forward to remove those transactions from the log. 388 * superblock forward to remove those transactions from the log.
388 * 389 *
389 * Return <0 on error, 0 on success, 1 if there was nothing to clean up. 390 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
390 * 391 *
391 * Called with the journal lock held. 392 * Called with the journal lock held.
392 * 393 *
393 * This is the only part of the journaling code which really needs to be 394 * This is the only part of the journaling code which really needs to be
394 * aware of transaction aborts. Checkpointing involves writing to the 395 * aware of transaction aborts. Checkpointing involves writing to the
395 * main filesystem area rather than to the journal, so it can proceed 396 * main filesystem area rather than to the journal, so it can proceed
396 * even in abort state, but we must not update the journal superblock if 397 * even in abort state, but we must not update the journal superblock if
397 * we have an abort error outstanding. 398 * we have an abort error outstanding.
398 */ 399 */
399 400
400 int cleanup_journal_tail(journal_t *journal) 401 int cleanup_journal_tail(journal_t *journal)
401 { 402 {
402 transaction_t * transaction; 403 transaction_t * transaction;
403 tid_t first_tid; 404 tid_t first_tid;
404 unsigned long blocknr, freed; 405 unsigned long blocknr, freed;
405 406
406 /* OK, work out the oldest transaction remaining in the log, and 407 /* OK, work out the oldest transaction remaining in the log, and
407 * the log block it starts at. 408 * the log block it starts at.
408 * 409 *
409 * If the log is now empty, we need to work out which is the 410 * If the log is now empty, we need to work out which is the
410 * next transaction ID we will write, and where it will 411 * next transaction ID we will write, and where it will
411 * start. */ 412 * start. */
412 413
413 spin_lock(&journal->j_state_lock); 414 spin_lock(&journal->j_state_lock);
414 spin_lock(&journal->j_list_lock); 415 spin_lock(&journal->j_list_lock);
415 transaction = journal->j_checkpoint_transactions; 416 transaction = journal->j_checkpoint_transactions;
416 if (transaction) { 417 if (transaction) {
417 first_tid = transaction->t_tid; 418 first_tid = transaction->t_tid;
418 blocknr = transaction->t_log_start; 419 blocknr = transaction->t_log_start;
419 } else if ((transaction = journal->j_committing_transaction) != NULL) { 420 } else if ((transaction = journal->j_committing_transaction) != NULL) {
420 first_tid = transaction->t_tid; 421 first_tid = transaction->t_tid;
421 blocknr = transaction->t_log_start; 422 blocknr = transaction->t_log_start;
422 } else if ((transaction = journal->j_running_transaction) != NULL) { 423 } else if ((transaction = journal->j_running_transaction) != NULL) {
423 first_tid = transaction->t_tid; 424 first_tid = transaction->t_tid;
424 blocknr = journal->j_head; 425 blocknr = journal->j_head;
425 } else { 426 } else {
426 first_tid = journal->j_transaction_sequence; 427 first_tid = journal->j_transaction_sequence;
427 blocknr = journal->j_head; 428 blocknr = journal->j_head;
428 } 429 }
429 spin_unlock(&journal->j_list_lock); 430 spin_unlock(&journal->j_list_lock);
430 J_ASSERT(blocknr != 0); 431 J_ASSERT(blocknr != 0);
431 432
432 /* If the oldest pinned transaction is at the tail of the log 433 /* If the oldest pinned transaction is at the tail of the log
433 already then there's not much we can do right now. */ 434 already then there's not much we can do right now. */
434 if (journal->j_tail_sequence == first_tid) { 435 if (journal->j_tail_sequence == first_tid) {
435 spin_unlock(&journal->j_state_lock); 436 spin_unlock(&journal->j_state_lock);
436 return 1; 437 return 1;
437 } 438 }
438 439
439 /* OK, update the superblock to recover the freed space. 440 /* OK, update the superblock to recover the freed space.
440 * Physical blocks come first: have we wrapped beyond the end of 441 * Physical blocks come first: have we wrapped beyond the end of
441 * the log? */ 442 * the log? */
442 freed = blocknr - journal->j_tail; 443 freed = blocknr - journal->j_tail;
443 if (blocknr < journal->j_tail) 444 if (blocknr < journal->j_tail)
444 freed = freed + journal->j_last - journal->j_first; 445 freed = freed + journal->j_last - journal->j_first;
445 446
446 jbd_debug(1, 447 jbd_debug(1,
447 "Cleaning journal tail from %d to %d (offset %lu), " 448 "Cleaning journal tail from %d to %d (offset %lu), "
448 "freeing %lu\n", 449 "freeing %lu\n",
449 journal->j_tail_sequence, first_tid, blocknr, freed); 450 journal->j_tail_sequence, first_tid, blocknr, freed);
450 451
451 journal->j_free += freed; 452 journal->j_free += freed;
452 journal->j_tail_sequence = first_tid; 453 journal->j_tail_sequence = first_tid;
453 journal->j_tail = blocknr; 454 journal->j_tail = blocknr;
454 spin_unlock(&journal->j_state_lock); 455 spin_unlock(&journal->j_state_lock);
455 if (!(journal->j_flags & JFS_ABORT)) 456 if (!(journal->j_flags & JFS_ABORT))
456 journal_update_superblock(journal, 1); 457 journal_update_superblock(journal, 1);
457 return 0; 458 return 0;
458 } 459 }
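
For illustration only (hypothetical numbers, not from the commit): the freed-space computation above wraps around the circular log. Suppose j_first = 1 and j_last = 1001, so the log has j_last - j_first = 1000 usable blocks; with the old tail at j_tail = 900 and the new tail at blocknr = 100, blocknr < j_tail, so freed = (100 - 900) + (1001 - 1) = 200, i.e. the distance from the old tail to the new tail taken modulo the size of the log.
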
459 460
460 461
461 /* Checkpoint list management */ 462 /* Checkpoint list management */
462 463
463 /* 464 /*
464 * journal_clean_one_cp_list 465 * journal_clean_one_cp_list
465 * 466 *
466 * Find all the written-back checkpoint buffers in the given list and release them. 467 * Find all the written-back checkpoint buffers in the given list and release them.
467 * 468 *
468 * Called with the journal locked. 469 * Called with the journal locked.
469 * Called with j_list_lock held. 470 * Called with j_list_lock held.
470 * Returns number of buffers reaped (for debug) 471 * Returns number of buffers reaped (for debug)
471 */ 472 */
472 473
473 static int journal_clean_one_cp_list(struct journal_head *jh, int *released) 474 static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
474 { 475 {
475 struct journal_head *last_jh; 476 struct journal_head *last_jh;
476 struct journal_head *next_jh = jh; 477 struct journal_head *next_jh = jh;
477 int ret, freed = 0; 478 int ret, freed = 0;
478 479
479 *released = 0; 480 *released = 0;
480 if (!jh) 481 if (!jh)
481 return 0; 482 return 0;
482 483
483 last_jh = jh->b_cpprev; 484 last_jh = jh->b_cpprev;
484 do { 485 do {
485 jh = next_jh; 486 jh = next_jh;
486 next_jh = jh->b_cpnext; 487 next_jh = jh->b_cpnext;
487 /* Use trylock because of the ranking */ 488 /* Use trylock because of the ranking */
488 if (jbd_trylock_bh_state(jh2bh(jh))) { 489 if (jbd_trylock_bh_state(jh2bh(jh))) {
489 ret = __try_to_free_cp_buf(jh); 490 ret = __try_to_free_cp_buf(jh);
490 if (ret) { 491 if (ret) {
491 freed++; 492 freed++;
492 if (ret == 2) { 493 if (ret == 2) {
493 *released = 1; 494 *released = 1;
494 return freed; 495 return freed;
495 } 496 }
496 } 497 }
497 } 498 }
498 /* 499 /*
499 * This function only frees up some memory 500 * This function only frees up some memory
500 * if possible so we don't have an obligation 501 * if possible so we don't have an obligation
501 * to finish processing. Bail out if preemption 502 * to finish processing. Bail out if preemption
502 * requested: 503 * requested:
503 */ 504 */
504 if (need_resched()) 505 if (need_resched())
505 return freed; 506 return freed;
506 } while (jh != last_jh); 507 } while (jh != last_jh);
507 508
508 return freed; 509 return freed;
509 } 510 }
510 511
511 /* 512 /*
512 * journal_clean_checkpoint_list 513 * journal_clean_checkpoint_list
513 * 514 *
514 * Find all the written-back checkpoint buffers in the journal and release them. 515 * Find all the written-back checkpoint buffers in the journal and release them.
515 * 516 *
516 * Called with the journal locked. 517 * Called with the journal locked.
517 * Called with j_list_lock held. 518 * Called with j_list_lock held.
518 * Returns number of buffers reaped (for debug) 519 * Returns number of buffers reaped (for debug)
519 */ 520 */
520 521
521 int __journal_clean_checkpoint_list(journal_t *journal) 522 int __journal_clean_checkpoint_list(journal_t *journal)
522 { 523 {
523 transaction_t *transaction, *last_transaction, *next_transaction; 524 transaction_t *transaction, *last_transaction, *next_transaction;
524 int ret = 0; 525 int ret = 0;
525 int released; 526 int released;
526 527
527 transaction = journal->j_checkpoint_transactions; 528 transaction = journal->j_checkpoint_transactions;
528 if (!transaction) 529 if (!transaction)
529 goto out; 530 goto out;
530 531
531 last_transaction = transaction->t_cpprev; 532 last_transaction = transaction->t_cpprev;
532 next_transaction = transaction; 533 next_transaction = transaction;
533 do { 534 do {
534 transaction = next_transaction; 535 transaction = next_transaction;
535 next_transaction = transaction->t_cpnext; 536 next_transaction = transaction->t_cpnext;
536 ret += journal_clean_one_cp_list(transaction-> 537 ret += journal_clean_one_cp_list(transaction->
537 t_checkpoint_list, &released); 538 t_checkpoint_list, &released);
538 /* 539 /*
539 * This function only frees up some memory if possible so we 540 * This function only frees up some memory if possible so we
540 * don't have an obligation to finish processing. Bail out if 541 * don't have an obligation to finish processing. Bail out if
541 * preemption requested: 542 * preemption requested:
542 */ 543 */
543 if (need_resched()) 544 if (need_resched())
544 goto out; 545 goto out;
545 if (released) 546 if (released)
546 continue; 547 continue;
547 /* 548 /*
548 * It is essential that we are as careful as in the case of 549 * It is essential that we are as careful as in the case of
549 * t_checkpoint_list with removing the buffer from the list as 550 * t_checkpoint_list with removing the buffer from the list as
550 * we can possibly see not yet submitted buffers on io_list 551 * we can possibly see not yet submitted buffers on io_list
551 */ 552 */
552 ret += journal_clean_one_cp_list(transaction-> 553 ret += journal_clean_one_cp_list(transaction->
553 t_checkpoint_io_list, &released); 554 t_checkpoint_io_list, &released);
554 if (need_resched()) 555 if (need_resched())
555 goto out; 556 goto out;
556 } while (transaction != last_transaction); 557 } while (transaction != last_transaction);
557 out: 558 out:
558 return ret; 559 return ret;
559 } 560 }
560 561
561 /* 562 /*
562 * journal_remove_checkpoint: called after a buffer has been committed 563 * journal_remove_checkpoint: called after a buffer has been committed
563 * to disk (either by being write-back flushed to disk, or being 564 * to disk (either by being write-back flushed to disk, or being
564 * committed to the log). 565 * committed to the log).
565 * 566 *
566 * We cannot safely clean a transaction out of the log until all of the 567 * We cannot safely clean a transaction out of the log until all of the
567 * buffer updates committed in that transaction have safely been stored 568 * buffer updates committed in that transaction have safely been stored
568 * elsewhere on disk. To achieve this, all of the buffers in a 569 * elsewhere on disk. To achieve this, all of the buffers in a
569 * transaction need to be maintained on the transaction's checkpoint 570 * transaction need to be maintained on the transaction's checkpoint
570 * lists until they have been rewritten, at which point this function is 571 * lists until they have been rewritten, at which point this function is
571 * called to remove the buffer from the existing transaction's 572 * called to remove the buffer from the existing transaction's
572 * checkpoint lists. 573 * checkpoint lists.
573 * 574 *
574 * The function returns 1 if it frees the transaction, 0 otherwise. 575 * The function returns 1 if it frees the transaction, 0 otherwise.
575 * 576 *
576 * This function is called with the journal locked. 577 * This function is called with the journal locked.
577 * This function is called with j_list_lock held. 578 * This function is called with j_list_lock held.
578 * This function is called with jbd_lock_bh_state(jh2bh(jh)) 579 * This function is called with jbd_lock_bh_state(jh2bh(jh))
579 */ 580 */
580 581
581 int __journal_remove_checkpoint(struct journal_head *jh) 582 int __journal_remove_checkpoint(struct journal_head *jh)
582 { 583 {
583 transaction_t *transaction; 584 transaction_t *transaction;
584 journal_t *journal; 585 journal_t *journal;
585 int ret = 0; 586 int ret = 0;
586 587
587 JBUFFER_TRACE(jh, "entry"); 588 JBUFFER_TRACE(jh, "entry");
588 589
589 if ((transaction = jh->b_cp_transaction) == NULL) { 590 if ((transaction = jh->b_cp_transaction) == NULL) {
590 JBUFFER_TRACE(jh, "not on transaction"); 591 JBUFFER_TRACE(jh, "not on transaction");
591 goto out; 592 goto out;
592 } 593 }
593 journal = transaction->t_journal; 594 journal = transaction->t_journal;
594 595
595 __buffer_unlink(jh); 596 __buffer_unlink(jh);
596 jh->b_cp_transaction = NULL; 597 jh->b_cp_transaction = NULL;
597 598
598 if (transaction->t_checkpoint_list != NULL || 599 if (transaction->t_checkpoint_list != NULL ||
599 transaction->t_checkpoint_io_list != NULL) 600 transaction->t_checkpoint_io_list != NULL)
600 goto out; 601 goto out;
601 JBUFFER_TRACE(jh, "transaction has no more buffers"); 602 JBUFFER_TRACE(jh, "transaction has no more buffers");
602 603
603 /* 604 /*
604 * There is one special case to worry about: if we have just pulled the 605 * There is one special case to worry about: if we have just pulled the
605 * buffer off a running or committing transaction's checkpoint list, 606 * buffer off a running or committing transaction's checkpoint list,
606 * then even if the checkpoint list is empty, the transaction obviously 607 * then even if the checkpoint list is empty, the transaction obviously
607 * cannot be dropped! 608 * cannot be dropped!
608 * 609 *
609 * The locking here around t_state is a bit sleazy. 610 * The locking here around t_state is a bit sleazy.
610 * See the comment at the end of journal_commit_transaction(). 611 * See the comment at the end of journal_commit_transaction().
611 */ 612 */
612 if (transaction->t_state != T_FINISHED) { 613 if (transaction->t_state != T_FINISHED) {
613 JBUFFER_TRACE(jh, "belongs to running/committing transaction"); 614 JBUFFER_TRACE(jh, "belongs to running/committing transaction");
614 goto out; 615 goto out;
615 } 616 }
616 617
617 /* OK, that was the last buffer for the transaction: we can now 618 /* OK, that was the last buffer for the transaction: we can now
618 safely remove this transaction from the log */ 619 safely remove this transaction from the log */
619 620
620 __journal_drop_transaction(journal, transaction); 621 __journal_drop_transaction(journal, transaction);
621 622
622 /* Just in case anybody was waiting for more transactions to be 623 /* Just in case anybody was waiting for more transactions to be
623 checkpointed... */ 624 checkpointed... */
624 wake_up(&journal->j_wait_logspace); 625 wake_up(&journal->j_wait_logspace);
625 ret = 1; 626 ret = 1;
626 out: 627 out:
627 JBUFFER_TRACE(jh, "exit"); 628 JBUFFER_TRACE(jh, "exit");
628 return ret; 629 return ret;
629 } 630 }
630 631
631 /* 632 /*
632 * journal_insert_checkpoint: put a committed buffer onto a checkpoint 633 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
633 * list so that we know when it is safe to clean the transaction out of 634 * list so that we know when it is safe to clean the transaction out of
634 * the log. 635 * the log.
635 * 636 *
636 * Called with the journal locked. 637 * Called with the journal locked.
637 * Called with j_list_lock held. 638 * Called with j_list_lock held.
638 */ 639 */
639 void __journal_insert_checkpoint(struct journal_head *jh, 640 void __journal_insert_checkpoint(struct journal_head *jh,
640 transaction_t *transaction) 641 transaction_t *transaction)
641 { 642 {
642 JBUFFER_TRACE(jh, "entry"); 643 JBUFFER_TRACE(jh, "entry");
643 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); 644 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
644 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); 645 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
645 646
646 jh->b_cp_transaction = transaction; 647 jh->b_cp_transaction = transaction;
647 648
648 if (!transaction->t_checkpoint_list) { 649 if (!transaction->t_checkpoint_list) {
649 jh->b_cpnext = jh->b_cpprev = jh; 650 jh->b_cpnext = jh->b_cpprev = jh;
650 } else { 651 } else {
651 jh->b_cpnext = transaction->t_checkpoint_list; 652 jh->b_cpnext = transaction->t_checkpoint_list;
652 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; 653 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
653 jh->b_cpprev->b_cpnext = jh; 654 jh->b_cpprev->b_cpnext = jh;
654 jh->b_cpnext->b_cpprev = jh; 655 jh->b_cpnext->b_cpprev = jh;
655 } 656 }
656 transaction->t_checkpoint_list = jh; 657 transaction->t_checkpoint_list = jh;
657 } 658 }
658 659
659 /* 660 /*
660 * We've finished with this transaction structure: adios... 661 * We've finished with this transaction structure: adios...
661 * 662 *
662 * The transaction must have no links except for the checkpoint by this 663 * The transaction must have no links except for the checkpoint by this
663 * point. 664 * point.
664 * 665 *
665 * Called with the journal locked. 666 * Called with the journal locked.
666 * Called with j_list_lock held. 667 * Called with j_list_lock held.
667 */ 668 */
668 669
669 void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) 670 void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
670 { 671 {
671 assert_spin_locked(&journal->j_list_lock); 672 assert_spin_locked(&journal->j_list_lock);
672 if (transaction->t_cpnext) { 673 if (transaction->t_cpnext) {
673 transaction->t_cpnext->t_cpprev = transaction->t_cpprev; 674 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
674 transaction->t_cpprev->t_cpnext = transaction->t_cpnext; 675 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
675 if (journal->j_checkpoint_transactions == transaction) 676 if (journal->j_checkpoint_transactions == transaction)
676 journal->j_checkpoint_transactions = 677 journal->j_checkpoint_transactions =
677 transaction->t_cpnext; 678 transaction->t_cpnext;
678 if (journal->j_checkpoint_transactions == transaction) 679 if (journal->j_checkpoint_transactions == transaction)
679 journal->j_checkpoint_transactions = NULL; 680 journal->j_checkpoint_transactions = NULL;
680 } 681 }
681 682
682 J_ASSERT(transaction->t_state == T_FINISHED); 683 J_ASSERT(transaction->t_state == T_FINISHED);
683 J_ASSERT(transaction->t_buffers == NULL); 684 J_ASSERT(transaction->t_buffers == NULL);
684 J_ASSERT(transaction->t_sync_datalist == NULL); 685 J_ASSERT(transaction->t_sync_datalist == NULL);
685 J_ASSERT(transaction->t_forget == NULL); 686 J_ASSERT(transaction->t_forget == NULL);
686 J_ASSERT(transaction->t_iobuf_list == NULL); 687 J_ASSERT(transaction->t_iobuf_list == NULL);
687 J_ASSERT(transaction->t_shadow_list == NULL); 688 J_ASSERT(transaction->t_shadow_list == NULL);
688 J_ASSERT(transaction->t_log_list == NULL); 689 J_ASSERT(transaction->t_log_list == NULL);
689 J_ASSERT(transaction->t_checkpoint_list == NULL); 690 J_ASSERT(transaction->t_checkpoint_list == NULL);
690 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 691 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
691 J_ASSERT(transaction->t_updates == 0); 692 J_ASSERT(transaction->t_updates == 0);
692 J_ASSERT(journal->j_committing_transaction != transaction); 693 J_ASSERT(journal->j_committing_transaction != transaction);
693 J_ASSERT(journal->j_running_transaction != transaction); 694 J_ASSERT(journal->j_running_transaction != transaction);
694 695
695 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 696 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
696 kfree(transaction); 697 kfree(transaction);
697 } 698 }
698 699
1 /* 1 /*
2 * linux/fs/jbd/commit.c 2 * linux/fs/jbd/commit.c
3 * 3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 * 5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved 6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 * 7 *
8 * This file is part of the Linux kernel and is made available under 8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your 9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 10 * option, any later version, incorporated herein by reference.
11 * 11 *
12 * Journal commit routines for the generic filesystem journaling code; 12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system. 13 * part of the ext2fs journaling system.
14 */ 14 */
15 15
16 #include <linux/time.h> 16 #include <linux/time.h>
17 #include <linux/fs.h> 17 #include <linux/fs.h>
18 #include <linux/jbd.h> 18 #include <linux/jbd.h>
19 #include <linux/errno.h> 19 #include <linux/errno.h>
20 #include <linux/slab.h> 20 #include <linux/slab.h>
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 23
24 /* 24 /*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 25 * Default IO end handler for temporary BJ_IO buffer_heads.
26 */ 26 */
27 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 27 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
28 { 28 {
29 BUFFER_TRACE(bh, ""); 29 BUFFER_TRACE(bh, "");
30 if (uptodate) 30 if (uptodate)
31 set_buffer_uptodate(bh); 31 set_buffer_uptodate(bh);
32 else 32 else
33 clear_buffer_uptodate(bh); 33 clear_buffer_uptodate(bh);
34 unlock_buffer(bh); 34 unlock_buffer(bh);
35 } 35 }
36 36
37 /* 37 /*
38 * When an ext3-ordered file is truncated, it is possible that many pages are 38 * When an ext3-ordered file is truncated, it is possible that many pages are
39 * not successfully freed, because they are attached to a committing transaction. 39 * not successfully freed, because they are attached to a committing transaction.
40 * After the transaction commits, these pages are left on the LRU, with no 40 * After the transaction commits, these pages are left on the LRU, with no
41 * ->mapping, and with attached buffers. These pages are trivially reclaimable 41 * ->mapping, and with attached buffers. These pages are trivially reclaimable
42 * by the VM, but their apparent absence upsets the VM accounting, and it makes 42 * by the VM, but their apparent absence upsets the VM accounting, and it makes
43 * the numbers in /proc/meminfo look odd. 43 * the numbers in /proc/meminfo look odd.
44 * 44 *
45 * So here, we have a buffer which has just come off the forget list. Look to 45 * So here, we have a buffer which has just come off the forget list. Look to
46 * see if we can strip all buffers from the backing page. 46 * see if we can strip all buffers from the backing page.
47 * 47 *
48 * Called under lock_journal(), and possibly under journal_datalist_lock. The 48 * Called under lock_journal(), and possibly under journal_datalist_lock. The
49 * caller provided us with a ref against the buffer, and we drop that here. 49 * caller provided us with a ref against the buffer, and we drop that here.
50 */ 50 */
51 static void release_buffer_page(struct buffer_head *bh) 51 static void release_buffer_page(struct buffer_head *bh)
52 { 52 {
53 struct page *page; 53 struct page *page;
54 54
55 if (buffer_dirty(bh)) 55 if (buffer_dirty(bh))
56 goto nope; 56 goto nope;
57 if (atomic_read(&bh->b_count) != 1) 57 if (atomic_read(&bh->b_count) != 1)
58 goto nope; 58 goto nope;
59 page = bh->b_page; 59 page = bh->b_page;
60 if (!page) 60 if (!page)
61 goto nope; 61 goto nope;
62 if (page->mapping) 62 if (page->mapping)
63 goto nope; 63 goto nope;
64 64
65 /* OK, it's a truncated page */ 65 /* OK, it's a truncated page */
66 if (TestSetPageLocked(page)) 66 if (TestSetPageLocked(page))
67 goto nope; 67 goto nope;
68 68
69 page_cache_get(page); 69 page_cache_get(page);
70 __brelse(bh); 70 __brelse(bh);
71 try_to_free_buffers(page); 71 try_to_free_buffers(page);
72 unlock_page(page); 72 unlock_page(page);
73 page_cache_release(page); 73 page_cache_release(page);
74 return; 74 return;
75 75
76 nope: 76 nope:
77 __brelse(bh); 77 __brelse(bh);
78 } 78 }
79 79
80 /* 80 /*
81 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is 81 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
82 * held. For ranking reasons we must trylock. If we lose, schedule away and 82 * held. For ranking reasons we must trylock. If we lose, schedule away and
83 * return 0. j_list_lock is dropped in this case. 83 * return 0. j_list_lock is dropped in this case.
84 */ 84 */
85 static int inverted_lock(journal_t *journal, struct buffer_head *bh) 85 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
86 { 86 {
87 if (!jbd_trylock_bh_state(bh)) { 87 if (!jbd_trylock_bh_state(bh)) {
88 spin_unlock(&journal->j_list_lock); 88 spin_unlock(&journal->j_list_lock);
89 schedule(); 89 schedule();
90 return 0; 90 return 0;
91 } 91 }
92 return 1; 92 return 1;
93 } 93 }
94 94
95 /* Done it all: now write the commit record. We should have 95 /* Done it all: now write the commit record. We should have
96 * cleaned up our previous buffers by now, so if we are in abort 96 * cleaned up our previous buffers by now, so if we are in abort
97 * mode we can now just skip the rest of the journal write 97 * mode we can now just skip the rest of the journal write
98 * entirely. 98 * entirely.
99 * 99 *
100 * Returns 1 if the journal needs to be aborted or 0 on success 100 * Returns 1 if the journal needs to be aborted or 0 on success
101 */ 101 */
102 static int journal_write_commit_record(journal_t *journal, 102 static int journal_write_commit_record(journal_t *journal,
103 transaction_t *commit_transaction) 103 transaction_t *commit_transaction)
104 { 104 {
105 struct journal_head *descriptor; 105 struct journal_head *descriptor;
106 struct buffer_head *bh; 106 struct buffer_head *bh;
107 int i, ret; 107 int i, ret;
108 int barrier_done = 0; 108 int barrier_done = 0;
109 109
110 if (is_journal_aborted(journal)) 110 if (is_journal_aborted(journal))
111 return 0; 111 return 0;
112 112
113 descriptor = journal_get_descriptor_buffer(journal); 113 descriptor = journal_get_descriptor_buffer(journal);
114 if (!descriptor) 114 if (!descriptor)
115 return 1; 115 return 1;
116 116
117 bh = jh2bh(descriptor); 117 bh = jh2bh(descriptor);
118 118
119 /* AKPM: buglet - add `i' to tmp! */ 119 /* AKPM: buglet - add `i' to tmp! */
120 for (i = 0; i < bh->b_size; i += 512) { 120 for (i = 0; i < bh->b_size; i += 512) {
121 journal_header_t *tmp = (journal_header_t*)bh->b_data; 121 journal_header_t *tmp = (journal_header_t*)bh->b_data;
122 tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); 122 tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
123 tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); 123 tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
124 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); 124 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
125 } 125 }
126 126
127 JBUFFER_TRACE(descriptor, "write commit block"); 127 JBUFFER_TRACE(descriptor, "write commit block");
128 set_buffer_dirty(bh); 128 set_buffer_dirty(bh);
129 if (journal->j_flags & JFS_BARRIER) { 129 if (journal->j_flags & JFS_BARRIER) {
130 set_buffer_ordered(bh); 130 set_buffer_ordered(bh);
131 barrier_done = 1; 131 barrier_done = 1;
132 } 132 }
133 ret = sync_dirty_buffer(bh); 133 ret = sync_dirty_buffer(bh);
134 /* is it possible for another commit to fail at roughly 134 /* is it possible for another commit to fail at roughly
135 * the same time as this one? If so, we don't want to 135 * the same time as this one? If so, we don't want to
136 * trust the barrier flag in the super, but instead want 136 * trust the barrier flag in the super, but instead want
137 * to remember if we sent a barrier request 137 * to remember if we sent a barrier request
138 */ 138 */
139 if (ret == -EOPNOTSUPP && barrier_done) { 139 if (ret == -EOPNOTSUPP && barrier_done) {
140 char b[BDEVNAME_SIZE]; 140 char b[BDEVNAME_SIZE];
141 141
142 printk(KERN_WARNING 142 printk(KERN_WARNING
143 "JBD: barrier-based sync failed on %s - " 143 "JBD: barrier-based sync failed on %s - "
144 "disabling barriers\n", 144 "disabling barriers\n",
145 bdevname(journal->j_dev, b)); 145 bdevname(journal->j_dev, b));
146 spin_lock(&journal->j_state_lock); 146 spin_lock(&journal->j_state_lock);
147 journal->j_flags &= ~JFS_BARRIER; 147 journal->j_flags &= ~JFS_BARRIER;
148 spin_unlock(&journal->j_state_lock); 148 spin_unlock(&journal->j_state_lock);
149 149
150 /* And try again, without the barrier */ 150 /* And try again, without the barrier */
151 clear_buffer_ordered(bh); 151 clear_buffer_ordered(bh);
152 set_buffer_uptodate(bh); 152 set_buffer_uptodate(bh);
153 set_buffer_dirty(bh); 153 set_buffer_dirty(bh);
154 ret = sync_dirty_buffer(bh); 154 ret = sync_dirty_buffer(bh);
155 } 155 }
156 put_bh(bh); /* One for getblk() */ 156 put_bh(bh); /* One for getblk() */
157 journal_put_journal_head(descriptor); 157 journal_put_journal_head(descriptor);
158 158
159 return (ret == -EIO); 159 return (ret == -EIO);
160 } 160 }
161 161
162 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 162 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
163 { 163 {
164 int i; 164 int i;
165 165
166 for (i = 0; i < bufs; i++) { 166 for (i = 0; i < bufs; i++) {
167 wbuf[i]->b_end_io = end_buffer_write_sync; 167 wbuf[i]->b_end_io = end_buffer_write_sync;
168 /* We use up our safety reference in submit_bh() */ 168 /* We use up our safety reference in submit_bh() */
169 submit_bh(WRITE, wbuf[i]); 169 submit_bh(WRITE, wbuf[i]);
170 } 170 }
171 } 171 }
172 172
173 /* 173 /*
174 * Submit all the data buffers to disk 174 * Submit all the data buffers to disk
175 */ 175 */
176 static void journal_submit_data_buffers(journal_t *journal, 176 static void journal_submit_data_buffers(journal_t *journal,
177 transaction_t *commit_transaction) 177 transaction_t *commit_transaction)
178 { 178 {
179 struct journal_head *jh; 179 struct journal_head *jh;
180 struct buffer_head *bh; 180 struct buffer_head *bh;
181 int locked; 181 int locked;
182 int bufs = 0; 182 int bufs = 0;
183 struct buffer_head **wbuf = journal->j_wbuf; 183 struct buffer_head **wbuf = journal->j_wbuf;
184 184
185 /* 185 /*
186 * Whenever we unlock the journal and sleep, things can get added 186 * Whenever we unlock the journal and sleep, things can get added
187 * onto ->t_sync_datalist, so we have to keep looping back to 187 * onto ->t_sync_datalist, so we have to keep looping back to
188 * write_out_data until we *know* that the list is empty. 188 * write_out_data until we *know* that the list is empty.
189 * 189 *
190 * Cleanup any flushed data buffers from the data list. Even in 190 * Cleanup any flushed data buffers from the data list. Even in
191 * abort mode, we want to flush this out as soon as possible. 191 * abort mode, we want to flush this out as soon as possible.
192 */ 192 */
193 write_out_data: 193 write_out_data:
194 cond_resched(); 194 cond_resched();
195 spin_lock(&journal->j_list_lock); 195 spin_lock(&journal->j_list_lock);
196 196
197 while (commit_transaction->t_sync_datalist) { 197 while (commit_transaction->t_sync_datalist) {
198 jh = commit_transaction->t_sync_datalist; 198 jh = commit_transaction->t_sync_datalist;
199 bh = jh2bh(jh); 199 bh = jh2bh(jh);
200 locked = 0; 200 locked = 0;
201 201
202 /* Get reference just to make sure buffer does not disappear 202 /* Get reference just to make sure buffer does not disappear
203 * when we are forced to drop various locks */ 203 * when we are forced to drop various locks */
204 get_bh(bh); 204 get_bh(bh);
205 /* If the buffer is dirty, we need to submit IO and hence 205 /* If the buffer is dirty, we need to submit IO and hence
206 * we need the buffer lock. We try to lock the buffer without 206 * we need the buffer lock. We try to lock the buffer without
207 * blocking. If we fail, we need to drop j_list_lock and do 207 * blocking. If we fail, we need to drop j_list_lock and do
208 * blocking lock_buffer(). 208 * blocking lock_buffer().
209 */ 209 */
210 if (buffer_dirty(bh)) { 210 if (buffer_dirty(bh)) {
211 if (test_set_buffer_locked(bh)) { 211 if (test_set_buffer_locked(bh)) {
212 BUFFER_TRACE(bh, "needs blocking lock"); 212 BUFFER_TRACE(bh, "needs blocking lock");
213 spin_unlock(&journal->j_list_lock); 213 spin_unlock(&journal->j_list_lock);
214 /* Write out all data to prevent deadlocks */ 214 /* Write out all data to prevent deadlocks */
215 journal_do_submit_data(wbuf, bufs); 215 journal_do_submit_data(wbuf, bufs);
216 bufs = 0; 216 bufs = 0;
217 lock_buffer(bh); 217 lock_buffer(bh);
218 spin_lock(&journal->j_list_lock); 218 spin_lock(&journal->j_list_lock);
219 } 219 }
220 locked = 1; 220 locked = 1;
221 } 221 }
222 /* We have to get bh_state lock. Again out of order, sigh. */ 222 /* We have to get bh_state lock. Again out of order, sigh. */
223 if (!inverted_lock(journal, bh)) { 223 if (!inverted_lock(journal, bh)) {
224 jbd_lock_bh_state(bh); 224 jbd_lock_bh_state(bh);
225 spin_lock(&journal->j_list_lock); 225 spin_lock(&journal->j_list_lock);
226 } 226 }
227 /* Someone already cleaned up the buffer? */ 227 /* Someone already cleaned up the buffer? */
228 if (!buffer_jbd(bh) 228 if (!buffer_jbd(bh)
229 || jh->b_transaction != commit_transaction 229 || jh->b_transaction != commit_transaction
230 || jh->b_jlist != BJ_SyncData) { 230 || jh->b_jlist != BJ_SyncData) {
231 jbd_unlock_bh_state(bh); 231 jbd_unlock_bh_state(bh);
232 if (locked) 232 if (locked)
233 unlock_buffer(bh); 233 unlock_buffer(bh);
234 BUFFER_TRACE(bh, "already cleaned up"); 234 BUFFER_TRACE(bh, "already cleaned up");
235 put_bh(bh); 235 put_bh(bh);
236 continue; 236 continue;
237 } 237 }
238 if (locked && test_clear_buffer_dirty(bh)) { 238 if (locked && test_clear_buffer_dirty(bh)) {
239 BUFFER_TRACE(bh, "needs writeout, adding to array"); 239 BUFFER_TRACE(bh, "needs writeout, adding to array");
240 wbuf[bufs++] = bh; 240 wbuf[bufs++] = bh;
241 __journal_file_buffer(jh, commit_transaction, 241 __journal_file_buffer(jh, commit_transaction,
242 BJ_Locked); 242 BJ_Locked);
243 jbd_unlock_bh_state(bh); 243 jbd_unlock_bh_state(bh);
244 if (bufs == journal->j_wbufsize) { 244 if (bufs == journal->j_wbufsize) {
245 spin_unlock(&journal->j_list_lock); 245 spin_unlock(&journal->j_list_lock);
246 journal_do_submit_data(wbuf, bufs); 246 journal_do_submit_data(wbuf, bufs);
247 bufs = 0; 247 bufs = 0;
248 goto write_out_data; 248 goto write_out_data;
249 } 249 }
250 } else if (!locked && buffer_locked(bh)) { 250 } else if (!locked && buffer_locked(bh)) {
251 __journal_file_buffer(jh, commit_transaction, 251 __journal_file_buffer(jh, commit_transaction,
252 BJ_Locked); 252 BJ_Locked);
253 jbd_unlock_bh_state(bh); 253 jbd_unlock_bh_state(bh);
254 put_bh(bh); 254 put_bh(bh);
255 } else { 255 } else {
256 BUFFER_TRACE(bh, "writeout complete: unfile"); 256 BUFFER_TRACE(bh, "writeout complete: unfile");
257 __journal_unfile_buffer(jh); 257 __journal_unfile_buffer(jh);
258 jbd_unlock_bh_state(bh); 258 jbd_unlock_bh_state(bh);
259 if (locked) 259 if (locked)
260 unlock_buffer(bh); 260 unlock_buffer(bh);
261 journal_remove_journal_head(bh); 261 journal_remove_journal_head(bh);
262 /* Once for our safety reference, once for 262 /* Once for our safety reference, once for
263 * journal_remove_journal_head() */ 263 * journal_remove_journal_head() */
264 put_bh(bh); 264 put_bh(bh);
265 put_bh(bh); 265 put_bh(bh);
266 } 266 }
267 267
268 if (lock_need_resched(&journal->j_list_lock)) { 268 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
269 spin_unlock(&journal->j_list_lock); 269 spin_unlock(&journal->j_list_lock);
270 goto write_out_data; 270 goto write_out_data;
271 } 271 }
272 } 272 }
273 spin_unlock(&journal->j_list_lock); 273 spin_unlock(&journal->j_list_lock);
274 journal_do_submit_data(wbuf, bufs); 274 journal_do_submit_data(wbuf, bufs);
275 } 275 }
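
The change on line 268 above replaces lock_need_resched(&journal->j_list_lock) with need_resched() || spin_needbreak(&journal->j_list_lock). Below is a minimal sketch of that lockbreak pattern in a generic list walker; it is not part of the commit, and the my_items list and process_one_item() helper are hypothetical names, while spin_needbreak() is the predicate this patch introduces.

/*
 * Sketch only, not part of the commit.  my_items and process_one_item()
 * are hypothetical; spin_needbreak() is the predicate this patch adds.
 */
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static void process_one_item(struct list_head *items);	/* hypothetical */

static void walk_with_lockbreak(spinlock_t *lock, struct list_head *my_items)
{
restart:
	spin_lock(lock);
	while (!list_empty(my_items)) {
		process_one_item(my_items);

		/*
		 * Give the lock up if we should reschedule or if another
		 * CPU is waiting for it, then take it again and continue.
		 */
		if (need_resched() || spin_needbreak(lock)) {
			spin_unlock(lock);
			cond_resched();
			goto restart;
		}
	}
	spin_unlock(lock);
}

This is the same shape as the write_out_data loop above: drop j_list_lock at a safe point, let the scheduler or the waiting CPU in, and restart the scan from the list head.
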
276 276
277 /* 277 /*
278 * journal_commit_transaction 278 * journal_commit_transaction
279 * 279 *
280 * The primary function for committing a transaction to the log. This 280 * The primary function for committing a transaction to the log. This
281 * function is called by the journal thread to begin a complete commit. 281 * function is called by the journal thread to begin a complete commit.
282 */ 282 */
283 void journal_commit_transaction(journal_t *journal) 283 void journal_commit_transaction(journal_t *journal)
284 { 284 {
285 transaction_t *commit_transaction; 285 transaction_t *commit_transaction;
286 struct journal_head *jh, *new_jh, *descriptor; 286 struct journal_head *jh, *new_jh, *descriptor;
287 struct buffer_head **wbuf = journal->j_wbuf; 287 struct buffer_head **wbuf = journal->j_wbuf;
288 int bufs; 288 int bufs;
289 int flags; 289 int flags;
290 int err; 290 int err;
291 unsigned long blocknr; 291 unsigned long blocknr;
292 char *tagp = NULL; 292 char *tagp = NULL;
293 journal_header_t *header; 293 journal_header_t *header;
294 journal_block_tag_t *tag = NULL; 294 journal_block_tag_t *tag = NULL;
295 int space_left = 0; 295 int space_left = 0;
296 int first_tag = 0; 296 int first_tag = 0;
297 int tag_flag; 297 int tag_flag;
298 int i; 298 int i;
299 299
300 /* 300 /*
301 * First job: lock down the current transaction and wait for 301 * First job: lock down the current transaction and wait for
302 * all outstanding updates to complete. 302 * all outstanding updates to complete.
303 */ 303 */
304 304
305 #ifdef COMMIT_STATS 305 #ifdef COMMIT_STATS
306 spin_lock(&journal->j_list_lock); 306 spin_lock(&journal->j_list_lock);
307 summarise_journal_usage(journal); 307 summarise_journal_usage(journal);
308 spin_unlock(&journal->j_list_lock); 308 spin_unlock(&journal->j_list_lock);
309 #endif 309 #endif
310 310
311 /* Do we need to erase the effects of a prior journal_flush? */ 311 /* Do we need to erase the effects of a prior journal_flush? */
312 if (journal->j_flags & JFS_FLUSHED) { 312 if (journal->j_flags & JFS_FLUSHED) {
313 jbd_debug(3, "super block updated\n"); 313 jbd_debug(3, "super block updated\n");
314 journal_update_superblock(journal, 1); 314 journal_update_superblock(journal, 1);
315 } else { 315 } else {
316 jbd_debug(3, "superblock not updated\n"); 316 jbd_debug(3, "superblock not updated\n");
317 } 317 }
318 318
319 J_ASSERT(journal->j_running_transaction != NULL); 319 J_ASSERT(journal->j_running_transaction != NULL);
320 J_ASSERT(journal->j_committing_transaction == NULL); 320 J_ASSERT(journal->j_committing_transaction == NULL);
321 321
322 commit_transaction = journal->j_running_transaction; 322 commit_transaction = journal->j_running_transaction;
323 J_ASSERT(commit_transaction->t_state == T_RUNNING); 323 J_ASSERT(commit_transaction->t_state == T_RUNNING);
324 324
325 jbd_debug(1, "JBD: starting commit of transaction %d\n", 325 jbd_debug(1, "JBD: starting commit of transaction %d\n",
326 commit_transaction->t_tid); 326 commit_transaction->t_tid);
327 327
328 spin_lock(&journal->j_state_lock); 328 spin_lock(&journal->j_state_lock);
329 commit_transaction->t_state = T_LOCKED; 329 commit_transaction->t_state = T_LOCKED;
330 330
331 spin_lock(&commit_transaction->t_handle_lock); 331 spin_lock(&commit_transaction->t_handle_lock);
332 while (commit_transaction->t_updates) { 332 while (commit_transaction->t_updates) {
333 DEFINE_WAIT(wait); 333 DEFINE_WAIT(wait);
334 334
335 prepare_to_wait(&journal->j_wait_updates, &wait, 335 prepare_to_wait(&journal->j_wait_updates, &wait,
336 TASK_UNINTERRUPTIBLE); 336 TASK_UNINTERRUPTIBLE);
337 if (commit_transaction->t_updates) { 337 if (commit_transaction->t_updates) {
338 spin_unlock(&commit_transaction->t_handle_lock); 338 spin_unlock(&commit_transaction->t_handle_lock);
339 spin_unlock(&journal->j_state_lock); 339 spin_unlock(&journal->j_state_lock);
340 schedule(); 340 schedule();
341 spin_lock(&journal->j_state_lock); 341 spin_lock(&journal->j_state_lock);
342 spin_lock(&commit_transaction->t_handle_lock); 342 spin_lock(&commit_transaction->t_handle_lock);
343 } 343 }
344 finish_wait(&journal->j_wait_updates, &wait); 344 finish_wait(&journal->j_wait_updates, &wait);
345 } 345 }
346 spin_unlock(&commit_transaction->t_handle_lock); 346 spin_unlock(&commit_transaction->t_handle_lock);
347 347
348 J_ASSERT (commit_transaction->t_outstanding_credits <= 348 J_ASSERT (commit_transaction->t_outstanding_credits <=
349 journal->j_max_transaction_buffers); 349 journal->j_max_transaction_buffers);
350 350
351 /* 351 /*
352 * First thing we are allowed to do is to discard any remaining 352 * First thing we are allowed to do is to discard any remaining
353 * BJ_Reserved buffers. Note, it is _not_ permissible to assume 353 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
354 * that there are no such buffers: if a large filesystem 354 * that there are no such buffers: if a large filesystem
355 * operation like a truncate needs to split itself over multiple 355 * operation like a truncate needs to split itself over multiple
356 * transactions, then it may try to do a journal_restart() while 356 * transactions, then it may try to do a journal_restart() while
357 * there are still BJ_Reserved buffers outstanding. These must 357 * there are still BJ_Reserved buffers outstanding. These must
358 * be released cleanly from the current transaction. 358 * be released cleanly from the current transaction.
359 * 359 *
360 * In this case, the filesystem must still reserve write access 360 * In this case, the filesystem must still reserve write access
361 * again before modifying the buffer in the new transaction, but 361 * again before modifying the buffer in the new transaction, but
362 * we do not require it to remember exactly which old buffers it 362 * we do not require it to remember exactly which old buffers it
363 * has reserved. This is consistent with the existing behaviour 363 * has reserved. This is consistent with the existing behaviour
364 * that multiple journal_get_write_access() calls to the same 364 * that multiple journal_get_write_access() calls to the same
365 * buffer are perfectly permissible. 365 * buffer are perfectly permissible.
366 */ 366 */
367 while (commit_transaction->t_reserved_list) { 367 while (commit_transaction->t_reserved_list) {
368 jh = commit_transaction->t_reserved_list; 368 jh = commit_transaction->t_reserved_list;
369 JBUFFER_TRACE(jh, "reserved, unused: refile"); 369 JBUFFER_TRACE(jh, "reserved, unused: refile");
370 /* 370 /*
371 * A journal_get_undo_access()+journal_release_buffer() may 371 * A journal_get_undo_access()+journal_release_buffer() may
372 * leave undo-committed data. 372 * leave undo-committed data.
373 */ 373 */
374 if (jh->b_committed_data) { 374 if (jh->b_committed_data) {
375 struct buffer_head *bh = jh2bh(jh); 375 struct buffer_head *bh = jh2bh(jh);
376 376
377 jbd_lock_bh_state(bh); 377 jbd_lock_bh_state(bh);
378 jbd_free(jh->b_committed_data, bh->b_size); 378 jbd_free(jh->b_committed_data, bh->b_size);
379 jh->b_committed_data = NULL; 379 jh->b_committed_data = NULL;
380 jbd_unlock_bh_state(bh); 380 jbd_unlock_bh_state(bh);
381 } 381 }
382 journal_refile_buffer(journal, jh); 382 journal_refile_buffer(journal, jh);
383 } 383 }
384 384
385 /* 385 /*
386 * Now try to drop any written-back buffers from the journal's 386 * Now try to drop any written-back buffers from the journal's
387 * checkpoint lists. We do this *before* commit because it potentially 387 * checkpoint lists. We do this *before* commit because it potentially
388 * frees some memory 388 * frees some memory
389 */ 389 */
390 spin_lock(&journal->j_list_lock); 390 spin_lock(&journal->j_list_lock);
391 __journal_clean_checkpoint_list(journal); 391 __journal_clean_checkpoint_list(journal);
392 spin_unlock(&journal->j_list_lock); 392 spin_unlock(&journal->j_list_lock);
393 393
394 jbd_debug (3, "JBD: commit phase 1\n"); 394 jbd_debug (3, "JBD: commit phase 1\n");
395 395
396 /* 396 /*
397 * Switch to a new revoke table. 397 * Switch to a new revoke table.
398 */ 398 */
399 journal_switch_revoke_table(journal); 399 journal_switch_revoke_table(journal);
400 400
401 commit_transaction->t_state = T_FLUSH; 401 commit_transaction->t_state = T_FLUSH;
402 journal->j_committing_transaction = commit_transaction; 402 journal->j_committing_transaction = commit_transaction;
403 journal->j_running_transaction = NULL; 403 journal->j_running_transaction = NULL;
404 commit_transaction->t_log_start = journal->j_head; 404 commit_transaction->t_log_start = journal->j_head;
405 wake_up(&journal->j_wait_transaction_locked); 405 wake_up(&journal->j_wait_transaction_locked);
406 spin_unlock(&journal->j_state_lock); 406 spin_unlock(&journal->j_state_lock);
407 407
408 jbd_debug (3, "JBD: commit phase 2\n"); 408 jbd_debug (3, "JBD: commit phase 2\n");
409 409
410 /* 410 /*
411 * First, drop modified flag: all accesses to the buffers 411 * First, drop modified flag: all accesses to the buffers
412 * will be tracked for a new transaction only -bzzz 412 * will be tracked for a new transaction only -bzzz
413 */ 413 */
414 spin_lock(&journal->j_list_lock); 414 spin_lock(&journal->j_list_lock);
415 if (commit_transaction->t_buffers) { 415 if (commit_transaction->t_buffers) {
416 new_jh = jh = commit_transaction->t_buffers->b_tnext; 416 new_jh = jh = commit_transaction->t_buffers->b_tnext;
417 do { 417 do {
418 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || 418 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
419 new_jh->b_modified == 0); 419 new_jh->b_modified == 0);
420 new_jh->b_modified = 0; 420 new_jh->b_modified = 0;
421 new_jh = new_jh->b_tnext; 421 new_jh = new_jh->b_tnext;
422 } while (new_jh != jh); 422 } while (new_jh != jh);
423 } 423 }
424 spin_unlock(&journal->j_list_lock); 424 spin_unlock(&journal->j_list_lock);
425 425
426 /* 426 /*
427 * Now start flushing things to disk, in the order they appear 427 * Now start flushing things to disk, in the order they appear
428 * on the transaction lists. Data blocks go first. 428 * on the transaction lists. Data blocks go first.
429 */ 429 */
430 err = 0; 430 err = 0;
431 journal_submit_data_buffers(journal, commit_transaction); 431 journal_submit_data_buffers(journal, commit_transaction);
432 432
433 /* 433 /*
434 * Wait for all previously submitted IO to complete. 434 * Wait for all previously submitted IO to complete.
435 */ 435 */
436 spin_lock(&journal->j_list_lock); 436 spin_lock(&journal->j_list_lock);
437 while (commit_transaction->t_locked_list) { 437 while (commit_transaction->t_locked_list) {
438 struct buffer_head *bh; 438 struct buffer_head *bh;
439 439
440 jh = commit_transaction->t_locked_list->b_tprev; 440 jh = commit_transaction->t_locked_list->b_tprev;
441 bh = jh2bh(jh); 441 bh = jh2bh(jh);
442 get_bh(bh); 442 get_bh(bh);
443 if (buffer_locked(bh)) { 443 if (buffer_locked(bh)) {
444 spin_unlock(&journal->j_list_lock); 444 spin_unlock(&journal->j_list_lock);
445 wait_on_buffer(bh); 445 wait_on_buffer(bh);
446 if (unlikely(!buffer_uptodate(bh))) 446 if (unlikely(!buffer_uptodate(bh)))
447 err = -EIO; 447 err = -EIO;
448 spin_lock(&journal->j_list_lock); 448 spin_lock(&journal->j_list_lock);
449 } 449 }
450 if (!inverted_lock(journal, bh)) { 450 if (!inverted_lock(journal, bh)) {
451 put_bh(bh); 451 put_bh(bh);
452 spin_lock(&journal->j_list_lock); 452 spin_lock(&journal->j_list_lock);
453 continue; 453 continue;
454 } 454 }
455 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { 455 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
456 __journal_unfile_buffer(jh); 456 __journal_unfile_buffer(jh);
457 jbd_unlock_bh_state(bh); 457 jbd_unlock_bh_state(bh);
458 journal_remove_journal_head(bh); 458 journal_remove_journal_head(bh);
459 put_bh(bh); 459 put_bh(bh);
460 } else { 460 } else {
461 jbd_unlock_bh_state(bh); 461 jbd_unlock_bh_state(bh);
462 } 462 }
463 put_bh(bh); 463 put_bh(bh);
464 cond_resched_lock(&journal->j_list_lock); 464 cond_resched_lock(&journal->j_list_lock);
465 } 465 }
466 spin_unlock(&journal->j_list_lock); 466 spin_unlock(&journal->j_list_lock);
467 467
468 if (err) 468 if (err)
469 journal_abort(journal, err); 469 journal_abort(journal, err);
470 470
471 journal_write_revoke_records(journal, commit_transaction); 471 journal_write_revoke_records(journal, commit_transaction);
472 472
473 jbd_debug(3, "JBD: commit phase 2\n"); 473 jbd_debug(3, "JBD: commit phase 2\n");
474 474
475 /* 475 /*
476 * If we found any dirty or locked buffers, then we should have 476 * If we found any dirty or locked buffers, then we should have
477 * looped back up to the write_out_data label. If there weren't 477 * looped back up to the write_out_data label. If there weren't
478 * any then journal_clean_data_list should have wiped the list 478 * any then journal_clean_data_list should have wiped the list
479 * clean by now, so check that it is in fact empty. 479 * clean by now, so check that it is in fact empty.
480 */ 480 */
481 J_ASSERT (commit_transaction->t_sync_datalist == NULL); 481 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
482 482
483 jbd_debug (3, "JBD: commit phase 3\n"); 483 jbd_debug (3, "JBD: commit phase 3\n");
484 484
485 /* 485 /*
486 * Way to go: we have now written out all of the data for a 486 * Way to go: we have now written out all of the data for a
487 * transaction! Now comes the tricky part: we need to write out 487 * transaction! Now comes the tricky part: we need to write out
488 * metadata. Loop over the transaction's entire buffer list: 488 * metadata. Loop over the transaction's entire buffer list:
489 */ 489 */
490 commit_transaction->t_state = T_COMMIT; 490 commit_transaction->t_state = T_COMMIT;
491 491
492 descriptor = NULL; 492 descriptor = NULL;
493 bufs = 0; 493 bufs = 0;
494 while (commit_transaction->t_buffers) { 494 while (commit_transaction->t_buffers) {
495 495
496 /* Find the next buffer to be journaled... */ 496 /* Find the next buffer to be journaled... */
497 497
498 jh = commit_transaction->t_buffers; 498 jh = commit_transaction->t_buffers;
499 499
500 /* If we're in abort mode, we just un-journal the buffer and 500 /* If we're in abort mode, we just un-journal the buffer and
501 release it for background writing. */ 501 release it for background writing. */
502 502
503 if (is_journal_aborted(journal)) { 503 if (is_journal_aborted(journal)) {
504 JBUFFER_TRACE(jh, "journal is aborting: refile"); 504 JBUFFER_TRACE(jh, "journal is aborting: refile");
505 journal_refile_buffer(journal, jh); 505 journal_refile_buffer(journal, jh);
506 /* If that was the last one, we need to clean up 506 /* If that was the last one, we need to clean up
507 * any descriptor buffers which may have been 507 * any descriptor buffers which may have been
508 * already allocated, even if we are now 508 * already allocated, even if we are now
509 * aborting. */ 509 * aborting. */
510 if (!commit_transaction->t_buffers) 510 if (!commit_transaction->t_buffers)
511 goto start_journal_io; 511 goto start_journal_io;
512 continue; 512 continue;
513 } 513 }
514 514
515 /* Make sure we have a descriptor block in which to 515 /* Make sure we have a descriptor block in which to
516 record the metadata buffer. */ 516 record the metadata buffer. */
517 517
518 if (!descriptor) { 518 if (!descriptor) {
519 struct buffer_head *bh; 519 struct buffer_head *bh;
520 520
521 J_ASSERT (bufs == 0); 521 J_ASSERT (bufs == 0);
522 522
523 jbd_debug(4, "JBD: get descriptor\n"); 523 jbd_debug(4, "JBD: get descriptor\n");
524 524
525 descriptor = journal_get_descriptor_buffer(journal); 525 descriptor = journal_get_descriptor_buffer(journal);
526 if (!descriptor) { 526 if (!descriptor) {
527 journal_abort(journal, -EIO); 527 journal_abort(journal, -EIO);
528 continue; 528 continue;
529 } 529 }
530 530
531 bh = jh2bh(descriptor); 531 bh = jh2bh(descriptor);
532 jbd_debug(4, "JBD: got buffer %llu (%p)\n", 532 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
533 (unsigned long long)bh->b_blocknr, bh->b_data); 533 (unsigned long long)bh->b_blocknr, bh->b_data);
534 header = (journal_header_t *)&bh->b_data[0]; 534 header = (journal_header_t *)&bh->b_data[0];
535 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); 535 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
536 header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); 536 header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
537 header->h_sequence = cpu_to_be32(commit_transaction->t_tid); 537 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
538 538
539 tagp = &bh->b_data[sizeof(journal_header_t)]; 539 tagp = &bh->b_data[sizeof(journal_header_t)];
540 space_left = bh->b_size - sizeof(journal_header_t); 540 space_left = bh->b_size - sizeof(journal_header_t);
541 first_tag = 1; 541 first_tag = 1;
542 set_buffer_jwrite(bh); 542 set_buffer_jwrite(bh);
543 set_buffer_dirty(bh); 543 set_buffer_dirty(bh);
544 wbuf[bufs++] = bh; 544 wbuf[bufs++] = bh;
545 545
546 /* Record it so that we can wait for IO 546 /* Record it so that we can wait for IO
547 completion later */ 547 completion later */
548 BUFFER_TRACE(bh, "ph3: file as descriptor"); 548 BUFFER_TRACE(bh, "ph3: file as descriptor");
549 journal_file_buffer(descriptor, commit_transaction, 549 journal_file_buffer(descriptor, commit_transaction,
550 BJ_LogCtl); 550 BJ_LogCtl);
551 } 551 }
552 552
553 /* Where is the buffer to be written? */ 553 /* Where is the buffer to be written? */
554 554
555 err = journal_next_log_block(journal, &blocknr); 555 err = journal_next_log_block(journal, &blocknr);
556 /* If the block mapping failed, just abandon the buffer 556 /* If the block mapping failed, just abandon the buffer
557 and repeat this loop: we'll fall into the 557 and repeat this loop: we'll fall into the
558 refile-on-abort condition above. */ 558 refile-on-abort condition above. */
559 if (err) { 559 if (err) {
560 journal_abort(journal, err); 560 journal_abort(journal, err);
561 continue; 561 continue;
562 } 562 }
563 563
564 /* 564 /*
565 * start_this_handle() uses t_outstanding_credits to determine 565 * start_this_handle() uses t_outstanding_credits to determine
566 * the free space in the log, but this counter is changed 566 * the free space in the log, but this counter is changed
567 * by journal_next_log_block() also. 567 * by journal_next_log_block() also.
568 */ 568 */
569 commit_transaction->t_outstanding_credits--; 569 commit_transaction->t_outstanding_credits--;
570 570
571 /* Bump b_count to prevent truncate from stumbling over 571 /* Bump b_count to prevent truncate from stumbling over
572 the shadowed buffer! @@@ This can go if we ever get 572 the shadowed buffer! @@@ This can go if we ever get
573 rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 573 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
574 atomic_inc(&jh2bh(jh)->b_count); 574 atomic_inc(&jh2bh(jh)->b_count);
575 575
576 /* Make a temporary IO buffer with which to write it out 576 /* Make a temporary IO buffer with which to write it out
577 (this will requeue both the metadata buffer and the 577 (this will requeue both the metadata buffer and the
578 temporary IO buffer). new_bh goes on BJ_IO*/ 578 temporary IO buffer). new_bh goes on BJ_IO*/
579 579
580 set_bit(BH_JWrite, &jh2bh(jh)->b_state); 580 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
581 /* 581 /*
582 * akpm: journal_write_metadata_buffer() sets 582 * akpm: journal_write_metadata_buffer() sets
583 * new_bh->b_transaction to commit_transaction. 583 * new_bh->b_transaction to commit_transaction.
584 * We need to clean this up before we release new_bh 584 * We need to clean this up before we release new_bh
585 * (which is of type BJ_IO) 585 * (which is of type BJ_IO)
586 */ 586 */
587 JBUFFER_TRACE(jh, "ph3: write metadata"); 587 JBUFFER_TRACE(jh, "ph3: write metadata");
588 flags = journal_write_metadata_buffer(commit_transaction, 588 flags = journal_write_metadata_buffer(commit_transaction,
589 jh, &new_jh, blocknr); 589 jh, &new_jh, blocknr);
590 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 590 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
591 wbuf[bufs++] = jh2bh(new_jh); 591 wbuf[bufs++] = jh2bh(new_jh);
592 592
593 /* Record the new block's tag in the current descriptor 593 /* Record the new block's tag in the current descriptor
594 buffer */ 594 buffer */
595 595
596 tag_flag = 0; 596 tag_flag = 0;
597 if (flags & 1) 597 if (flags & 1)
598 tag_flag |= JFS_FLAG_ESCAPE; 598 tag_flag |= JFS_FLAG_ESCAPE;
599 if (!first_tag) 599 if (!first_tag)
600 tag_flag |= JFS_FLAG_SAME_UUID; 600 tag_flag |= JFS_FLAG_SAME_UUID;
601 601
602 tag = (journal_block_tag_t *) tagp; 602 tag = (journal_block_tag_t *) tagp;
603 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); 603 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
604 tag->t_flags = cpu_to_be32(tag_flag); 604 tag->t_flags = cpu_to_be32(tag_flag);
605 tagp += sizeof(journal_block_tag_t); 605 tagp += sizeof(journal_block_tag_t);
606 space_left -= sizeof(journal_block_tag_t); 606 space_left -= sizeof(journal_block_tag_t);
607 607
608 if (first_tag) { 608 if (first_tag) {
609 memcpy (tagp, journal->j_uuid, 16); 609 memcpy (tagp, journal->j_uuid, 16);
610 tagp += 16; 610 tagp += 16;
611 space_left -= 16; 611 space_left -= 16;
612 first_tag = 0; 612 first_tag = 0;
613 } 613 }
614 614
615 /* If there's no more to do, or if the descriptor is full, 615 /* If there's no more to do, or if the descriptor is full,
616 let the IO rip! */ 616 let the IO rip! */
617 617
618 if (bufs == journal->j_wbufsize || 618 if (bufs == journal->j_wbufsize ||
619 commit_transaction->t_buffers == NULL || 619 commit_transaction->t_buffers == NULL ||
620 space_left < sizeof(journal_block_tag_t) + 16) { 620 space_left < sizeof(journal_block_tag_t) + 16) {
621 621
622 jbd_debug(4, "JBD: Submit %d IOs\n", bufs); 622 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
623 623
624 /* Write an end-of-descriptor marker before 624 /* Write an end-of-descriptor marker before
625 submitting the IOs. "tag" still points to 625 submitting the IOs. "tag" still points to
626 the last tag we set up. */ 626 the last tag we set up. */
627 627
628 tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); 628 tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
629 629
630 start_journal_io: 630 start_journal_io:
631 for (i = 0; i < bufs; i++) { 631 for (i = 0; i < bufs; i++) {
632 struct buffer_head *bh = wbuf[i]; 632 struct buffer_head *bh = wbuf[i];
633 lock_buffer(bh); 633 lock_buffer(bh);
634 clear_buffer_dirty(bh); 634 clear_buffer_dirty(bh);
635 set_buffer_uptodate(bh); 635 set_buffer_uptodate(bh);
636 bh->b_end_io = journal_end_buffer_io_sync; 636 bh->b_end_io = journal_end_buffer_io_sync;
637 submit_bh(WRITE, bh); 637 submit_bh(WRITE, bh);
638 } 638 }
639 cond_resched(); 639 cond_resched();
640 640
641 /* Force a new descriptor to be generated next 641 /* Force a new descriptor to be generated next
642 time round the loop. */ 642 time round the loop. */
643 descriptor = NULL; 643 descriptor = NULL;
644 bufs = 0; 644 bufs = 0;
645 } 645 }
646 } 646 }
647 647
648 /* Lo and behold: we have just managed to send a transaction to 648 /* Lo and behold: we have just managed to send a transaction to
649 the log. Before we can commit it, wait for the IO so far to 649 the log. Before we can commit it, wait for the IO so far to
650 complete. Control buffers being written are on the 650 complete. Control buffers being written are on the
651 transaction's t_log_list queue, and metadata buffers are on 651 transaction's t_log_list queue, and metadata buffers are on
652 the t_iobuf_list queue. 652 the t_iobuf_list queue.
653 653
654 Wait for the buffers in reverse order. That way we are 654 Wait for the buffers in reverse order. That way we are
655 less likely to be woken up until all IOs have completed, and 655 less likely to be woken up until all IOs have completed, and
656 so we incur less scheduling load. 656 so we incur less scheduling load.
657 */ 657 */
658 658
659 jbd_debug(3, "JBD: commit phase 4\n"); 659 jbd_debug(3, "JBD: commit phase 4\n");
660 660
661 /* 661 /*
662 * akpm: these are BJ_IO, and j_list_lock is not needed. 662 * akpm: these are BJ_IO, and j_list_lock is not needed.
663 * See __journal_try_to_free_buffer. 663 * See __journal_try_to_free_buffer.
664 */ 664 */
665 wait_for_iobuf: 665 wait_for_iobuf:
666 while (commit_transaction->t_iobuf_list != NULL) { 666 while (commit_transaction->t_iobuf_list != NULL) {
667 struct buffer_head *bh; 667 struct buffer_head *bh;
668 668
669 jh = commit_transaction->t_iobuf_list->b_tprev; 669 jh = commit_transaction->t_iobuf_list->b_tprev;
670 bh = jh2bh(jh); 670 bh = jh2bh(jh);
671 if (buffer_locked(bh)) { 671 if (buffer_locked(bh)) {
672 wait_on_buffer(bh); 672 wait_on_buffer(bh);
673 goto wait_for_iobuf; 673 goto wait_for_iobuf;
674 } 674 }
675 if (cond_resched()) 675 if (cond_resched())
676 goto wait_for_iobuf; 676 goto wait_for_iobuf;
677 677
678 if (unlikely(!buffer_uptodate(bh))) 678 if (unlikely(!buffer_uptodate(bh)))
679 err = -EIO; 679 err = -EIO;
680 680
681 clear_buffer_jwrite(bh); 681 clear_buffer_jwrite(bh);
682 682
683 JBUFFER_TRACE(jh, "ph4: unfile after journal write"); 683 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
684 journal_unfile_buffer(journal, jh); 684 journal_unfile_buffer(journal, jh);
685 685
686 /* 686 /*
687 * ->t_iobuf_list should contain only dummy buffer_heads 687 * ->t_iobuf_list should contain only dummy buffer_heads
688 * which were created by journal_write_metadata_buffer(). 688 * which were created by journal_write_metadata_buffer().
689 */ 689 */
690 BUFFER_TRACE(bh, "dumping temporary bh"); 690 BUFFER_TRACE(bh, "dumping temporary bh");
691 journal_put_journal_head(jh); 691 journal_put_journal_head(jh);
692 __brelse(bh); 692 __brelse(bh);
693 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 693 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
694 free_buffer_head(bh); 694 free_buffer_head(bh);
695 695
696 /* We also have to unlock and free the corresponding 696 /* We also have to unlock and free the corresponding
697 shadowed buffer */ 697 shadowed buffer */
698 jh = commit_transaction->t_shadow_list->b_tprev; 698 jh = commit_transaction->t_shadow_list->b_tprev;
699 bh = jh2bh(jh); 699 bh = jh2bh(jh);
700 clear_bit(BH_JWrite, &bh->b_state); 700 clear_bit(BH_JWrite, &bh->b_state);
701 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 701 J_ASSERT_BH(bh, buffer_jbddirty(bh));
702 702
703 /* The metadata is now released for reuse, but we need 703 /* The metadata is now released for reuse, but we need
704 to remember it against this transaction so that when 704 to remember it against this transaction so that when
705 we finally commit, we can do any checkpointing 705 we finally commit, we can do any checkpointing
706 required. */ 706 required. */
707 JBUFFER_TRACE(jh, "file as BJ_Forget"); 707 JBUFFER_TRACE(jh, "file as BJ_Forget");
708 journal_file_buffer(jh, commit_transaction, BJ_Forget); 708 journal_file_buffer(jh, commit_transaction, BJ_Forget);
709 /* Wake up any transactions which were waiting for this 709 /* Wake up any transactions which were waiting for this
710 IO to complete */ 710 IO to complete */
711 wake_up_bit(&bh->b_state, BH_Unshadow); 711 wake_up_bit(&bh->b_state, BH_Unshadow);
712 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 712 JBUFFER_TRACE(jh, "brelse shadowed buffer");
713 __brelse(bh); 713 __brelse(bh);
714 } 714 }
715 715
716 J_ASSERT (commit_transaction->t_shadow_list == NULL); 716 J_ASSERT (commit_transaction->t_shadow_list == NULL);
717 717
718 jbd_debug(3, "JBD: commit phase 5\n"); 718 jbd_debug(3, "JBD: commit phase 5\n");
719 719
720 /* Here we wait for the revoke record and descriptor record buffers */ 720 /* Here we wait for the revoke record and descriptor record buffers */
721 wait_for_ctlbuf: 721 wait_for_ctlbuf:
722 while (commit_transaction->t_log_list != NULL) { 722 while (commit_transaction->t_log_list != NULL) {
723 struct buffer_head *bh; 723 struct buffer_head *bh;
724 724
725 jh = commit_transaction->t_log_list->b_tprev; 725 jh = commit_transaction->t_log_list->b_tprev;
726 bh = jh2bh(jh); 726 bh = jh2bh(jh);
727 if (buffer_locked(bh)) { 727 if (buffer_locked(bh)) {
728 wait_on_buffer(bh); 728 wait_on_buffer(bh);
729 goto wait_for_ctlbuf; 729 goto wait_for_ctlbuf;
730 } 730 }
731 if (cond_resched()) 731 if (cond_resched())
732 goto wait_for_ctlbuf; 732 goto wait_for_ctlbuf;
733 733
734 if (unlikely(!buffer_uptodate(bh))) 734 if (unlikely(!buffer_uptodate(bh)))
735 err = -EIO; 735 err = -EIO;
736 736
737 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 737 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
738 clear_buffer_jwrite(bh); 738 clear_buffer_jwrite(bh);
739 journal_unfile_buffer(journal, jh); 739 journal_unfile_buffer(journal, jh);
740 journal_put_journal_head(jh); 740 journal_put_journal_head(jh);
741 __brelse(bh); /* One for getblk */ 741 __brelse(bh); /* One for getblk */
742 /* AKPM: bforget here */ 742 /* AKPM: bforget here */
743 } 743 }
744 744
745 jbd_debug(3, "JBD: commit phase 6\n"); 745 jbd_debug(3, "JBD: commit phase 6\n");
746 746
747 if (journal_write_commit_record(journal, commit_transaction)) 747 if (journal_write_commit_record(journal, commit_transaction))
748 err = -EIO; 748 err = -EIO;
749 749
750 if (err) 750 if (err)
751 journal_abort(journal, err); 751 journal_abort(journal, err);
752 752
753 /* End of a transaction! Finally, we can do checkpoint 753 /* End of a transaction! Finally, we can do checkpoint
754 processing: any buffers committed as a result of this 754 processing: any buffers committed as a result of this
755 transaction can be removed from any checkpoint list it was on 755 transaction can be removed from any checkpoint list it was on
756 before. */ 756 before. */
757 757
758 jbd_debug(3, "JBD: commit phase 7\n"); 758 jbd_debug(3, "JBD: commit phase 7\n");
759 759
760 J_ASSERT(commit_transaction->t_sync_datalist == NULL); 760 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
761 J_ASSERT(commit_transaction->t_buffers == NULL); 761 J_ASSERT(commit_transaction->t_buffers == NULL);
762 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 762 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
763 J_ASSERT(commit_transaction->t_iobuf_list == NULL); 763 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
764 J_ASSERT(commit_transaction->t_shadow_list == NULL); 764 J_ASSERT(commit_transaction->t_shadow_list == NULL);
765 J_ASSERT(commit_transaction->t_log_list == NULL); 765 J_ASSERT(commit_transaction->t_log_list == NULL);
766 766
767 restart_loop: 767 restart_loop:
768 /* 768 /*
769 * As there are other places (journal_unmap_buffer()) adding buffers 769 * As there are other places (journal_unmap_buffer()) adding buffers
770 * to this list we have to be careful and hold the j_list_lock. 770 * to this list we have to be careful and hold the j_list_lock.
771 */ 771 */
772 spin_lock(&journal->j_list_lock); 772 spin_lock(&journal->j_list_lock);
773 while (commit_transaction->t_forget) { 773 while (commit_transaction->t_forget) {
774 transaction_t *cp_transaction; 774 transaction_t *cp_transaction;
775 struct buffer_head *bh; 775 struct buffer_head *bh;
776 776
777 jh = commit_transaction->t_forget; 777 jh = commit_transaction->t_forget;
778 spin_unlock(&journal->j_list_lock); 778 spin_unlock(&journal->j_list_lock);
779 bh = jh2bh(jh); 779 bh = jh2bh(jh);
780 jbd_lock_bh_state(bh); 780 jbd_lock_bh_state(bh);
781 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || 781 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
782 jh->b_transaction == journal->j_running_transaction); 782 jh->b_transaction == journal->j_running_transaction);
783 783
784 /* 784 /*
785 * If there is undo-protected committed data against 785 * If there is undo-protected committed data against
786 * this buffer, then we can remove it now. If it is a 786 * this buffer, then we can remove it now. If it is a
787 * buffer needing such protection, the old frozen_data 787 * buffer needing such protection, the old frozen_data
788 * field now points to a committed version of the 788 * field now points to a committed version of the
789 * buffer, so rotate that field to the new committed 789 * buffer, so rotate that field to the new committed
790 * data. 790 * data.
791 * 791 *
792 * Otherwise, we can just throw away the frozen data now. 792 * Otherwise, we can just throw away the frozen data now.
793 */ 793 */
794 if (jh->b_committed_data) { 794 if (jh->b_committed_data) {
795 jbd_free(jh->b_committed_data, bh->b_size); 795 jbd_free(jh->b_committed_data, bh->b_size);
796 jh->b_committed_data = NULL; 796 jh->b_committed_data = NULL;
797 if (jh->b_frozen_data) { 797 if (jh->b_frozen_data) {
798 jh->b_committed_data = jh->b_frozen_data; 798 jh->b_committed_data = jh->b_frozen_data;
799 jh->b_frozen_data = NULL; 799 jh->b_frozen_data = NULL;
800 } 800 }
801 } else if (jh->b_frozen_data) { 801 } else if (jh->b_frozen_data) {
802 jbd_free(jh->b_frozen_data, bh->b_size); 802 jbd_free(jh->b_frozen_data, bh->b_size);
803 jh->b_frozen_data = NULL; 803 jh->b_frozen_data = NULL;
804 } 804 }
805 805
806 spin_lock(&journal->j_list_lock); 806 spin_lock(&journal->j_list_lock);
807 cp_transaction = jh->b_cp_transaction; 807 cp_transaction = jh->b_cp_transaction;
808 if (cp_transaction) { 808 if (cp_transaction) {
809 JBUFFER_TRACE(jh, "remove from old cp transaction"); 809 JBUFFER_TRACE(jh, "remove from old cp transaction");
810 __journal_remove_checkpoint(jh); 810 __journal_remove_checkpoint(jh);
811 } 811 }
812 812
813 /* Only re-checkpoint the buffer_head if it is marked 813 /* Only re-checkpoint the buffer_head if it is marked
814 * dirty. If the buffer was added to the BJ_Forget list 814 * dirty. If the buffer was added to the BJ_Forget list
815 * by journal_forget, it may no longer be dirty and 815 * by journal_forget, it may no longer be dirty and
816 * there's no point in keeping a checkpoint record for 816 * there's no point in keeping a checkpoint record for
817 * it. */ 817 * it. */
818 818
819 /* A buffer which has been freed while still being 819 /* A buffer which has been freed while still being
820 * journaled by a previous transaction may end up still 820 * journaled by a previous transaction may end up still
821 * being dirty here, but we want to avoid writing back 821 * being dirty here, but we want to avoid writing back
822 * that buffer in the future now that the last use has 822 * that buffer in the future now that the last use has
823 * been committed. That's not only a performance gain, 823 * been committed. That's not only a performance gain,
824 * it also stops aliasing problems if the buffer is left 824 * it also stops aliasing problems if the buffer is left
825 * behind for writeback and gets reallocated for another 825 * behind for writeback and gets reallocated for another
826 * use in a different page. */ 826 * use in a different page. */
827 if (buffer_freed(bh)) { 827 if (buffer_freed(bh)) {
828 clear_buffer_freed(bh); 828 clear_buffer_freed(bh);
829 clear_buffer_jbddirty(bh); 829 clear_buffer_jbddirty(bh);
830 } 830 }
831 831
832 if (buffer_jbddirty(bh)) { 832 if (buffer_jbddirty(bh)) {
833 JBUFFER_TRACE(jh, "add to new checkpointing trans"); 833 JBUFFER_TRACE(jh, "add to new checkpointing trans");
834 __journal_insert_checkpoint(jh, commit_transaction); 834 __journal_insert_checkpoint(jh, commit_transaction);
835 JBUFFER_TRACE(jh, "refile for checkpoint writeback"); 835 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
836 __journal_refile_buffer(jh); 836 __journal_refile_buffer(jh);
837 jbd_unlock_bh_state(bh); 837 jbd_unlock_bh_state(bh);
838 } else { 838 } else {
839 J_ASSERT_BH(bh, !buffer_dirty(bh)); 839 J_ASSERT_BH(bh, !buffer_dirty(bh));
840 /* A buffer on the BJ_Forget list that is not jbddirty means 840 /* A buffer on the BJ_Forget list that is not jbddirty means
841 * it has been freed by this transaction and hence it 841 * it has been freed by this transaction and hence it
842 * could not have been reallocated until this 842 * could not have been reallocated until this
843 * transaction has committed. *BUT* it could be 843 * transaction has committed. *BUT* it could be
844 * reallocated once we have written all the data to 844 * reallocated once we have written all the data to
845 * disk and before we process the buffer on BJ_Forget 845 * disk and before we process the buffer on BJ_Forget
846 * list. */ 846 * list. */
847 JBUFFER_TRACE(jh, "refile or unfile freed buffer"); 847 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
848 __journal_refile_buffer(jh); 848 __journal_refile_buffer(jh);
849 if (!jh->b_transaction) { 849 if (!jh->b_transaction) {
850 jbd_unlock_bh_state(bh); 850 jbd_unlock_bh_state(bh);
851 /* needs a brelse */ 851 /* needs a brelse */
852 journal_remove_journal_head(bh); 852 journal_remove_journal_head(bh);
853 release_buffer_page(bh); 853 release_buffer_page(bh);
854 } else 854 } else
855 jbd_unlock_bh_state(bh); 855 jbd_unlock_bh_state(bh);
856 } 856 }
857 cond_resched_lock(&journal->j_list_lock); 857 cond_resched_lock(&journal->j_list_lock);
858 } 858 }
859 spin_unlock(&journal->j_list_lock); 859 spin_unlock(&journal->j_list_lock);
860 /* 860 /*
861 * This is a bit sleazy. We use j_list_lock to protect transition 861 * This is a bit sleazy. We use j_list_lock to protect transition
862 * of a transaction into T_FINISHED state and calling 862 * of a transaction into T_FINISHED state and calling
863 * __journal_drop_transaction(). Otherwise we could race with 863 * __journal_drop_transaction(). Otherwise we could race with
864 * other checkpointing code processing the transaction... 864 * other checkpointing code processing the transaction...
865 */ 865 */
866 spin_lock(&journal->j_state_lock); 866 spin_lock(&journal->j_state_lock);
867 spin_lock(&journal->j_list_lock); 867 spin_lock(&journal->j_list_lock);
868 /* 868 /*
869 * Now recheck if some buffers did not get attached to the transaction 869 * Now recheck if some buffers did not get attached to the transaction
870 * while the lock was dropped... 870 * while the lock was dropped...
871 */ 871 */
872 if (commit_transaction->t_forget) { 872 if (commit_transaction->t_forget) {
873 spin_unlock(&journal->j_list_lock); 873 spin_unlock(&journal->j_list_lock);
874 spin_unlock(&journal->j_state_lock); 874 spin_unlock(&journal->j_state_lock);
875 goto restart_loop; 875 goto restart_loop;
876 } 876 }
877 877
878 /* Done with this transaction! */ 878 /* Done with this transaction! */
879 879
880 jbd_debug(3, "JBD: commit phase 8\n"); 880 jbd_debug(3, "JBD: commit phase 8\n");
881 881
882 J_ASSERT(commit_transaction->t_state == T_COMMIT); 882 J_ASSERT(commit_transaction->t_state == T_COMMIT);
883 883
884 commit_transaction->t_state = T_FINISHED; 884 commit_transaction->t_state = T_FINISHED;
885 J_ASSERT(commit_transaction == journal->j_committing_transaction); 885 J_ASSERT(commit_transaction == journal->j_committing_transaction);
886 journal->j_commit_sequence = commit_transaction->t_tid; 886 journal->j_commit_sequence = commit_transaction->t_tid;
887 journal->j_committing_transaction = NULL; 887 journal->j_committing_transaction = NULL;
888 spin_unlock(&journal->j_state_lock); 888 spin_unlock(&journal->j_state_lock);
889 889
890 if (commit_transaction->t_checkpoint_list == NULL && 890 if (commit_transaction->t_checkpoint_list == NULL &&
891 commit_transaction->t_checkpoint_io_list == NULL) { 891 commit_transaction->t_checkpoint_io_list == NULL) {
892 __journal_drop_transaction(journal, commit_transaction); 892 __journal_drop_transaction(journal, commit_transaction);
893 } else { 893 } else {
894 if (journal->j_checkpoint_transactions == NULL) { 894 if (journal->j_checkpoint_transactions == NULL) {
895 journal->j_checkpoint_transactions = commit_transaction; 895 journal->j_checkpoint_transactions = commit_transaction;
896 commit_transaction->t_cpnext = commit_transaction; 896 commit_transaction->t_cpnext = commit_transaction;
897 commit_transaction->t_cpprev = commit_transaction; 897 commit_transaction->t_cpprev = commit_transaction;
898 } else { 898 } else {
899 commit_transaction->t_cpnext = 899 commit_transaction->t_cpnext =
900 journal->j_checkpoint_transactions; 900 journal->j_checkpoint_transactions;
901 commit_transaction->t_cpprev = 901 commit_transaction->t_cpprev =
902 commit_transaction->t_cpnext->t_cpprev; 902 commit_transaction->t_cpnext->t_cpprev;
903 commit_transaction->t_cpnext->t_cpprev = 903 commit_transaction->t_cpnext->t_cpprev =
904 commit_transaction; 904 commit_transaction;
905 commit_transaction->t_cpprev->t_cpnext = 905 commit_transaction->t_cpprev->t_cpnext =
906 commit_transaction; 906 commit_transaction;
907 } 907 }
908 } 908 }
909 spin_unlock(&journal->j_list_lock); 909 spin_unlock(&journal->j_list_lock);
910 910
911 jbd_debug(1, "JBD: commit %d complete, head %d\n", 911 jbd_debug(1, "JBD: commit %d complete, head %d\n",
912 journal->j_commit_sequence, journal->j_tail_sequence); 912 journal->j_commit_sequence, journal->j_tail_sequence);
913 913
914 wake_up(&journal->j_wait_done_commit); 914 wake_up(&journal->j_wait_done_commit);
915 } 915 }
916 916
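The BJ_Forget walk above drains commit_transaction->t_forget under j_list_lock, dropping the lock around the per-buffer work and calling cond_resched_lock() at the bottom of each pass so a pending reschedule can be honoured between buffers. A minimal sketch of that drain-under-a-spinlock pattern, with a hypothetical drain_list_under_lock() helper and placeholder per-entry work (none of it taken from this commit), might look like:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/*
 * Sketch only: drain a list while holding 'lock', briefly giving the
 * lock and the CPU away between entries via cond_resched_lock().
 */
static void drain_list_under_lock(spinlock_t *lock, struct list_head *head)
{
	spin_lock(lock);
	while (!list_empty(head)) {
		struct list_head *entry = head->next;

		list_del_init(entry);
		/* ... handle 'entry'; must not sleep while 'lock' is held ... */

		/* Drops 'lock', reschedules if needed, and re-takes it. */
		cond_resched_lock(lock);
	}
	spin_unlock(lock);
}

The real loop additionally re-checks t_forget after the final unlock and jumps back to restart_loop, because journal_unmap_buffer() can refill the list while j_list_lock is dropped; the sketch leaves that restart out.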
fs/jbd2/checkpoint.c
1 /* 1 /*
2 * linux/fs/jbd2/checkpoint.c 2 * linux/fs/jbd2/checkpoint.c
3 * 3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 * 5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved 6 * Copyright 1999 Red Hat Software --- All Rights Reserved
7 * 7 *
8 * This file is part of the Linux kernel and is made available under 8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your 9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 10 * option, any later version, incorporated herein by reference.
11 * 11 *
12 * Checkpoint routines for the generic filesystem journaling code. 12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system. 13 * Part of the ext2fs journaling system.
14 * 14 *
15 * Checkpointing is the process of ensuring that a section of the log is 15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be 16 * committed fully to disk, so that that portion of the log can be
17 * reused. 17 * reused.
18 */ 18 */
19 19
20 #include <linux/time.h> 20 #include <linux/time.h>
21 #include <linux/fs.h> 21 #include <linux/fs.h>
22 #include <linux/jbd2.h> 22 #include <linux/jbd2.h>
23 #include <linux/errno.h> 23 #include <linux/errno.h>
24 #include <linux/slab.h> 24 #include <linux/slab.h>
25 25
26 /* 26 /*
27 * Unlink a buffer from a transaction checkpoint list. 27 * Unlink a buffer from a transaction checkpoint list.
28 * 28 *
29 * Called with j_list_lock held. 29 * Called with j_list_lock held.
30 */ 30 */
31 static inline void __buffer_unlink_first(struct journal_head *jh) 31 static inline void __buffer_unlink_first(struct journal_head *jh)
32 { 32 {
33 transaction_t *transaction = jh->b_cp_transaction; 33 transaction_t *transaction = jh->b_cp_transaction;
34 34
35 jh->b_cpnext->b_cpprev = jh->b_cpprev; 35 jh->b_cpnext->b_cpprev = jh->b_cpprev;
36 jh->b_cpprev->b_cpnext = jh->b_cpnext; 36 jh->b_cpprev->b_cpnext = jh->b_cpnext;
37 if (transaction->t_checkpoint_list == jh) { 37 if (transaction->t_checkpoint_list == jh) {
38 transaction->t_checkpoint_list = jh->b_cpnext; 38 transaction->t_checkpoint_list = jh->b_cpnext;
39 if (transaction->t_checkpoint_list == jh) 39 if (transaction->t_checkpoint_list == jh)
40 transaction->t_checkpoint_list = NULL; 40 transaction->t_checkpoint_list = NULL;
41 } 41 }
42 } 42 }
43 43
44 /* 44 /*
45 * Unlink a buffer from a transaction checkpoint(io) list. 45 * Unlink a buffer from a transaction checkpoint(io) list.
46 * 46 *
47 * Called with j_list_lock held. 47 * Called with j_list_lock held.
48 */ 48 */
49 static inline void __buffer_unlink(struct journal_head *jh) 49 static inline void __buffer_unlink(struct journal_head *jh)
50 { 50 {
51 transaction_t *transaction = jh->b_cp_transaction; 51 transaction_t *transaction = jh->b_cp_transaction;
52 52
53 __buffer_unlink_first(jh); 53 __buffer_unlink_first(jh);
54 if (transaction->t_checkpoint_io_list == jh) { 54 if (transaction->t_checkpoint_io_list == jh) {
55 transaction->t_checkpoint_io_list = jh->b_cpnext; 55 transaction->t_checkpoint_io_list = jh->b_cpnext;
56 if (transaction->t_checkpoint_io_list == jh) 56 if (transaction->t_checkpoint_io_list == jh)
57 transaction->t_checkpoint_io_list = NULL; 57 transaction->t_checkpoint_io_list = NULL;
58 } 58 }
59 } 59 }
60 60
61 /* 61 /*
62 * Move a buffer from the checkpoint list to the checkpoint io list 62 * Move a buffer from the checkpoint list to the checkpoint io list
63 * 63 *
64 * Called with j_list_lock held 64 * Called with j_list_lock held
65 */ 65 */
66 static inline void __buffer_relink_io(struct journal_head *jh) 66 static inline void __buffer_relink_io(struct journal_head *jh)
67 { 67 {
68 transaction_t *transaction = jh->b_cp_transaction; 68 transaction_t *transaction = jh->b_cp_transaction;
69 69
70 __buffer_unlink_first(jh); 70 __buffer_unlink_first(jh);
71 71
72 if (!transaction->t_checkpoint_io_list) { 72 if (!transaction->t_checkpoint_io_list) {
73 jh->b_cpnext = jh->b_cpprev = jh; 73 jh->b_cpnext = jh->b_cpprev = jh;
74 } else { 74 } else {
75 jh->b_cpnext = transaction->t_checkpoint_io_list; 75 jh->b_cpnext = transaction->t_checkpoint_io_list;
76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; 76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
77 jh->b_cpprev->b_cpnext = jh; 77 jh->b_cpprev->b_cpnext = jh;
78 jh->b_cpnext->b_cpprev = jh; 78 jh->b_cpnext->b_cpprev = jh;
79 } 79 }
80 transaction->t_checkpoint_io_list = jh; 80 transaction->t_checkpoint_io_list = jh;
81 } 81 }
82 82
83 /* 83 /*
84 * Try to release a checkpointed buffer from its transaction. 84 * Try to release a checkpointed buffer from its transaction.
85 * Returns 1 if we released it and 2 if we also released the 85 * Returns 1 if we released it and 2 if we also released the
86 * whole transaction. 86 * whole transaction.
87 * 87 *
88 * Requires j_list_lock 88 * Requires j_list_lock
89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
90 */ 90 */
91 static int __try_to_free_cp_buf(struct journal_head *jh) 91 static int __try_to_free_cp_buf(struct journal_head *jh)
92 { 92 {
93 int ret = 0; 93 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh); 94 struct buffer_head *bh = jh2bh(jh);
95 95
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 97 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 98 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh); 99 jbd_unlock_bh_state(bh);
100 jbd2_journal_remove_journal_head(bh); 100 jbd2_journal_remove_journal_head(bh);
101 BUFFER_TRACE(bh, "release"); 101 BUFFER_TRACE(bh, "release");
102 __brelse(bh); 102 __brelse(bh);
103 } else { 103 } else {
104 jbd_unlock_bh_state(bh); 104 jbd_unlock_bh_state(bh);
105 } 105 }
106 return ret; 106 return ret;
107 } 107 }
108 108
109 /* 109 /*
110 * __jbd2_log_wait_for_space: wait until there is space in the journal. 110 * __jbd2_log_wait_for_space: wait until there is space in the journal.
111 * 111 *
112 * Called under j_state_lock *only*. It will be unlocked if we have to wait 112 * Called under j_state_lock *only*. It will be unlocked if we have to wait
113 * for a checkpoint to free up some space in the log. 113 * for a checkpoint to free up some space in the log.
114 */ 114 */
115 void __jbd2_log_wait_for_space(journal_t *journal) 115 void __jbd2_log_wait_for_space(journal_t *journal)
116 { 116 {
117 int nblocks; 117 int nblocks;
118 assert_spin_locked(&journal->j_state_lock); 118 assert_spin_locked(&journal->j_state_lock);
119 119
120 nblocks = jbd_space_needed(journal); 120 nblocks = jbd_space_needed(journal);
121 while (__jbd2_log_space_left(journal) < nblocks) { 121 while (__jbd2_log_space_left(journal) < nblocks) {
122 if (journal->j_flags & JBD2_ABORT) 122 if (journal->j_flags & JBD2_ABORT)
123 return; 123 return;
124 spin_unlock(&journal->j_state_lock); 124 spin_unlock(&journal->j_state_lock);
125 mutex_lock(&journal->j_checkpoint_mutex); 125 mutex_lock(&journal->j_checkpoint_mutex);
126 126
127 /* 127 /*
128 * Test again, another process may have checkpointed while we 128 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock 129 * were waiting for the checkpoint lock
130 */ 130 */
131 spin_lock(&journal->j_state_lock); 131 spin_lock(&journal->j_state_lock);
132 nblocks = jbd_space_needed(journal); 132 nblocks = jbd_space_needed(journal);
133 if (__jbd2_log_space_left(journal) < nblocks) { 133 if (__jbd2_log_space_left(journal) < nblocks) {
134 spin_unlock(&journal->j_state_lock); 134 spin_unlock(&journal->j_state_lock);
135 jbd2_log_do_checkpoint(journal); 135 jbd2_log_do_checkpoint(journal);
136 spin_lock(&journal->j_state_lock); 136 spin_lock(&journal->j_state_lock);
137 } 137 }
138 mutex_unlock(&journal->j_checkpoint_mutex); 138 mutex_unlock(&journal->j_checkpoint_mutex);
139 } 139 }
140 } 140 }
141 141
142 /* 142 /*
143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. 143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
144 * The caller must restart a list walk. Wait for someone else to run 144 * The caller must restart a list walk. Wait for someone else to run
145 * jbd_unlock_bh_state(). 145 * jbd_unlock_bh_state().
146 */ 146 */
147 static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) 147 static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
148 __releases(journal->j_list_lock) 148 __releases(journal->j_list_lock)
149 { 149 {
150 get_bh(bh); 150 get_bh(bh);
151 spin_unlock(&journal->j_list_lock); 151 spin_unlock(&journal->j_list_lock);
152 jbd_lock_bh_state(bh); 152 jbd_lock_bh_state(bh);
153 jbd_unlock_bh_state(bh); 153 jbd_unlock_bh_state(bh);
154 put_bh(bh); 154 put_bh(bh);
155 } 155 }
156 156
157 /* 157 /*
158 * Clean up transaction's list of buffers submitted for io. 158 * Clean up transaction's list of buffers submitted for io.
159 * We wait for any pending IO to complete and remove any clean 159 * We wait for any pending IO to complete and remove any clean
160 * buffers. Note that we take the buffers in the opposite ordering 160 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO. 161 * from the one in which they were submitted for IO.
162 * 162 *
163 * Called with j_list_lock held. 163 * Called with j_list_lock held.
164 */ 164 */
165 static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 165 static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
166 { 166 {
167 struct journal_head *jh; 167 struct journal_head *jh;
168 struct buffer_head *bh; 168 struct buffer_head *bh;
169 tid_t this_tid; 169 tid_t this_tid;
170 int released = 0; 170 int released = 0;
171 171
172 this_tid = transaction->t_tid; 172 this_tid = transaction->t_tid;
173 restart: 173 restart:
174 /* Did somebody clean up the transaction in the meanwhile? */ 174 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction || 175 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid) 176 transaction->t_tid != this_tid)
177 return; 177 return;
178 while (!released && transaction->t_checkpoint_io_list) { 178 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list; 179 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh); 180 bh = jh2bh(jh);
181 if (!jbd_trylock_bh_state(bh)) { 181 if (!jbd_trylock_bh_state(bh)) {
182 jbd_sync_bh(journal, bh); 182 jbd_sync_bh(journal, bh);
183 spin_lock(&journal->j_list_lock); 183 spin_lock(&journal->j_list_lock);
184 goto restart; 184 goto restart;
185 } 185 }
186 if (buffer_locked(bh)) { 186 if (buffer_locked(bh)) {
187 atomic_inc(&bh->b_count); 187 atomic_inc(&bh->b_count);
188 spin_unlock(&journal->j_list_lock); 188 spin_unlock(&journal->j_list_lock);
189 jbd_unlock_bh_state(bh); 189 jbd_unlock_bh_state(bh);
190 wait_on_buffer(bh); 190 wait_on_buffer(bh);
191 /* the journal_head may have gone by now */ 191 /* the journal_head may have gone by now */
192 BUFFER_TRACE(bh, "brelse"); 192 BUFFER_TRACE(bh, "brelse");
193 __brelse(bh); 193 __brelse(bh);
194 spin_lock(&journal->j_list_lock); 194 spin_lock(&journal->j_list_lock);
195 goto restart; 195 goto restart;
196 } 196 }
197 /* 197 /*
198 * Now in whatever state the buffer currently is, we know that 198 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list 199 * it has been written out and so we can drop it from the list
200 */ 200 */
201 released = __jbd2_journal_remove_checkpoint(jh); 201 released = __jbd2_journal_remove_checkpoint(jh);
202 jbd_unlock_bh_state(bh); 202 jbd_unlock_bh_state(bh);
203 jbd2_journal_remove_journal_head(bh); 203 jbd2_journal_remove_journal_head(bh);
204 __brelse(bh); 204 __brelse(bh);
205 } 205 }
206 } 206 }
207 207
208 #define NR_BATCH 64 208 #define NR_BATCH 64
209 209
210 static void 210 static void
211 __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 211 __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
212 { 212 {
213 int i; 213 int i;
214 214
215 ll_rw_block(SWRITE, *batch_count, bhs); 215 ll_rw_block(SWRITE, *batch_count, bhs);
216 for (i = 0; i < *batch_count; i++) { 216 for (i = 0; i < *batch_count; i++) {
217 struct buffer_head *bh = bhs[i]; 217 struct buffer_head *bh = bhs[i];
218 clear_buffer_jwrite(bh); 218 clear_buffer_jwrite(bh);
219 BUFFER_TRACE(bh, "brelse"); 219 BUFFER_TRACE(bh, "brelse");
220 __brelse(bh); 220 __brelse(bh);
221 } 221 }
222 *batch_count = 0; 222 *batch_count = 0;
223 } 223 }
224 224
225 /* 225 /*
226 * Try to flush one buffer from the checkpoint list to disk. 226 * Try to flush one buffer from the checkpoint list to disk.
227 * 227 *
228 * Return 1 if something happened which requires us to abort the current 228 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list. 229 * scan of the checkpoint list.
230 * 230 *
231 * Called with j_list_lock held and drops it if 1 is returned 231 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
233 */ 233 */
234 static int __process_buffer(journal_t *journal, struct journal_head *jh, 234 static int __process_buffer(journal_t *journal, struct journal_head *jh,
235 struct buffer_head **bhs, int *batch_count, 235 struct buffer_head **bhs, int *batch_count,
236 transaction_t *transaction) 236 transaction_t *transaction)
237 { 237 {
238 struct buffer_head *bh = jh2bh(jh); 238 struct buffer_head *bh = jh2bh(jh);
239 int ret = 0; 239 int ret = 0;
240 240
241 if (buffer_locked(bh)) { 241 if (buffer_locked(bh)) {
242 atomic_inc(&bh->b_count); 242 atomic_inc(&bh->b_count);
243 spin_unlock(&journal->j_list_lock); 243 spin_unlock(&journal->j_list_lock);
244 jbd_unlock_bh_state(bh); 244 jbd_unlock_bh_state(bh);
245 wait_on_buffer(bh); 245 wait_on_buffer(bh);
246 /* the journal_head may have gone by now */ 246 /* the journal_head may have gone by now */
247 BUFFER_TRACE(bh, "brelse"); 247 BUFFER_TRACE(bh, "brelse");
248 __brelse(bh); 248 __brelse(bh);
249 ret = 1; 249 ret = 1;
250 } else if (jh->b_transaction != NULL) { 250 } else if (jh->b_transaction != NULL) {
251 transaction_t *t = jh->b_transaction; 251 transaction_t *t = jh->b_transaction;
252 tid_t tid = t->t_tid; 252 tid_t tid = t->t_tid;
253 253
254 transaction->t_chp_stats.cs_forced_to_close++; 254 transaction->t_chp_stats.cs_forced_to_close++;
255 spin_unlock(&journal->j_list_lock); 255 spin_unlock(&journal->j_list_lock);
256 jbd_unlock_bh_state(bh); 256 jbd_unlock_bh_state(bh);
257 jbd2_log_start_commit(journal, tid); 257 jbd2_log_start_commit(journal, tid);
258 jbd2_log_wait_commit(journal, tid); 258 jbd2_log_wait_commit(journal, tid);
259 ret = 1; 259 ret = 1;
260 } else if (!buffer_dirty(bh)) { 260 } else if (!buffer_dirty(bh)) {
261 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 261 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
262 BUFFER_TRACE(bh, "remove from checkpoint"); 262 BUFFER_TRACE(bh, "remove from checkpoint");
263 __jbd2_journal_remove_checkpoint(jh); 263 __jbd2_journal_remove_checkpoint(jh);
264 spin_unlock(&journal->j_list_lock); 264 spin_unlock(&journal->j_list_lock);
265 jbd_unlock_bh_state(bh); 265 jbd_unlock_bh_state(bh);
266 jbd2_journal_remove_journal_head(bh); 266 jbd2_journal_remove_journal_head(bh);
267 __brelse(bh); 267 __brelse(bh);
268 ret = 1; 268 ret = 1;
269 } else { 269 } else {
270 /* 270 /*
271 * Important: we are about to write the buffer, and 271 * Important: we are about to write the buffer, and
272 * possibly block, while still holding the journal lock. 272 * possibly block, while still holding the journal lock.
273 * We cannot afford to let the transaction logic start 273 * We cannot afford to let the transaction logic start
274 * messing around with this buffer before we write it to 274 * messing around with this buffer before we write it to
275 * disk, as that would break recoverability. 275 * disk, as that would break recoverability.
276 */ 276 */
277 BUFFER_TRACE(bh, "queue"); 277 BUFFER_TRACE(bh, "queue");
278 get_bh(bh); 278 get_bh(bh);
279 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 279 J_ASSERT_BH(bh, !buffer_jwrite(bh));
280 set_buffer_jwrite(bh); 280 set_buffer_jwrite(bh);
281 bhs[*batch_count] = bh; 281 bhs[*batch_count] = bh;
282 __buffer_relink_io(jh); 282 __buffer_relink_io(jh);
283 jbd_unlock_bh_state(bh); 283 jbd_unlock_bh_state(bh);
284 transaction->t_chp_stats.cs_written++; 284 transaction->t_chp_stats.cs_written++;
285 (*batch_count)++; 285 (*batch_count)++;
286 if (*batch_count == NR_BATCH) { 286 if (*batch_count == NR_BATCH) {
287 spin_unlock(&journal->j_list_lock); 287 spin_unlock(&journal->j_list_lock);
288 __flush_batch(journal, bhs, batch_count); 288 __flush_batch(journal, bhs, batch_count);
289 ret = 1; 289 ret = 1;
290 } 290 }
291 } 291 }
292 return ret; 292 return ret;
293 } 293 }
294 294
295 /* 295 /*
296 * Perform an actual checkpoint. We take the first transaction on the 296 * Perform an actual checkpoint. We take the first transaction on the
297 * list of transactions to be checkpointed and send all its buffers 297 * list of transactions to be checkpointed and send all its buffers
298 * to disk. We submit larger chunks of data at once. 298 * to disk. We submit larger chunks of data at once.
299 * 299 *
300 * The journal should be locked before calling this function. 300 * The journal should be locked before calling this function.
301 */ 301 */
302 int jbd2_log_do_checkpoint(journal_t *journal) 302 int jbd2_log_do_checkpoint(journal_t *journal)
303 { 303 {
304 transaction_t *transaction; 304 transaction_t *transaction;
305 tid_t this_tid; 305 tid_t this_tid;
306 int result; 306 int result;
307 307
308 jbd_debug(1, "Start checkpoint\n"); 308 jbd_debug(1, "Start checkpoint\n");
309 309
310 /* 310 /*
311 * First thing: if there are any transactions in the log which 311 * First thing: if there are any transactions in the log which
312 * don't need checkpointing, just eliminate them from the 312 * don't need checkpointing, just eliminate them from the
313 * journal straight away. 313 * journal straight away.
314 */ 314 */
315 result = jbd2_cleanup_journal_tail(journal); 315 result = jbd2_cleanup_journal_tail(journal);
316 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 316 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
317 if (result <= 0) 317 if (result <= 0)
318 return result; 318 return result;
319 319
320 /* 320 /*
321 * OK, we need to start writing disk blocks. Take one transaction 321 * OK, we need to start writing disk blocks. Take one transaction
322 * and write it. 322 * and write it.
323 */ 323 */
324 spin_lock(&journal->j_list_lock); 324 spin_lock(&journal->j_list_lock);
325 if (!journal->j_checkpoint_transactions) 325 if (!journal->j_checkpoint_transactions)
326 goto out; 326 goto out;
327 transaction = journal->j_checkpoint_transactions; 327 transaction = journal->j_checkpoint_transactions;
328 if (transaction->t_chp_stats.cs_chp_time == 0) 328 if (transaction->t_chp_stats.cs_chp_time == 0)
329 transaction->t_chp_stats.cs_chp_time = jiffies; 329 transaction->t_chp_stats.cs_chp_time = jiffies;
330 this_tid = transaction->t_tid; 330 this_tid = transaction->t_tid;
331 restart: 331 restart:
332 /* 332 /*
333 * If someone cleaned up this transaction while we slept, we're 333 * If someone cleaned up this transaction while we slept, we're
334 * done (maybe it's a new transaction, but it fell at the same 334 * done (maybe it's a new transaction, but it fell at the same
335 * address). 335 * address).
336 */ 336 */
337 if (journal->j_checkpoint_transactions == transaction && 337 if (journal->j_checkpoint_transactions == transaction &&
338 transaction->t_tid == this_tid) { 338 transaction->t_tid == this_tid) {
339 int batch_count = 0; 339 int batch_count = 0;
340 struct buffer_head *bhs[NR_BATCH]; 340 struct buffer_head *bhs[NR_BATCH];
341 struct journal_head *jh; 341 struct journal_head *jh;
342 int retry = 0; 342 int retry = 0;
343 343
344 while (!retry && transaction->t_checkpoint_list) { 344 while (!retry && transaction->t_checkpoint_list) {
345 struct buffer_head *bh; 345 struct buffer_head *bh;
346 346
347 jh = transaction->t_checkpoint_list; 347 jh = transaction->t_checkpoint_list;
348 bh = jh2bh(jh); 348 bh = jh2bh(jh);
349 if (!jbd_trylock_bh_state(bh)) { 349 if (!jbd_trylock_bh_state(bh)) {
350 jbd_sync_bh(journal, bh); 350 jbd_sync_bh(journal, bh);
351 retry = 1; 351 retry = 1;
352 break; 352 break;
353 } 353 }
354 retry = __process_buffer(journal, jh, bhs, &batch_count, 354 retry = __process_buffer(journal, jh, bhs, &batch_count,
355 transaction); 355 transaction);
356 if (!retry && lock_need_resched(&journal->j_list_lock)){ 356 if (!retry && (need_resched() ||
357 spin_needbreak(&journal->j_list_lock))) {
357 spin_unlock(&journal->j_list_lock); 358 spin_unlock(&journal->j_list_lock);
358 retry = 1; 359 retry = 1;
359 break; 360 break;
360 } 361 }
361 } 362 }
362 363
363 if (batch_count) { 364 if (batch_count) {
364 if (!retry) { 365 if (!retry) {
365 spin_unlock(&journal->j_list_lock); 366 spin_unlock(&journal->j_list_lock);
366 retry = 1; 367 retry = 1;
367 } 368 }
368 __flush_batch(journal, bhs, &batch_count); 369 __flush_batch(journal, bhs, &batch_count);
369 } 370 }
370 371
371 if (retry) { 372 if (retry) {
372 spin_lock(&journal->j_list_lock); 373 spin_lock(&journal->j_list_lock);
373 goto restart; 374 goto restart;
374 } 375 }
375 /* 376 /*
376 * Now we have cleaned up the first transaction's checkpoint 377 * Now we have cleaned up the first transaction's checkpoint
377 * list. Let's clean up the second one 378 * list. Let's clean up the second one
378 */ 379 */
379 __wait_cp_io(journal, transaction); 380 __wait_cp_io(journal, transaction);
380 } 381 }
381 out: 382 out:
382 spin_unlock(&journal->j_list_lock); 383 spin_unlock(&journal->j_list_lock);
383 result = jbd2_cleanup_journal_tail(journal); 384 result = jbd2_cleanup_journal_tail(journal);
384 if (result < 0) 385 if (result < 0)
385 return result; 386 return result;
386 return 0; 387 return 0;
387 } 388 }
388 389
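The one change in this file's diff is the break-out test inside jbd2_log_do_checkpoint() above: in addition to need_resched(), the scan now drops j_list_lock when spin_needbreak() asks for the lock to be given up, sets retry and restarts from the top. A stripped-down sketch of that restartable scan, with hypothetical names and placeholder per-buffer work, could read:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/*
 * Sketch only: process a queue under 'lock', but bail out and restart
 * whenever a reschedule is pending or the lock is wanted elsewhere.
 */
static void checkpoint_scan(spinlock_t *lock, struct list_head *pending)
{
	int retry;

restart:
	retry = 0;
	spin_lock(lock);
	while (!list_empty(pending)) {
		struct list_head *entry = pending->next;

		list_del_init(entry);
		/* ... write out or release the buffer behind 'entry' ... */

		if (need_resched() || spin_needbreak(lock)) {
			retry = 1;
			break;
		}
	}
	spin_unlock(lock);

	if (retry) {
		cond_resched();		/* let the waiting task run */
		goto restart;
	}
}

Keeping need_resched() spelled out next to spin_needbreak() mirrors the new code above, where the pair replaces the old lock_need_resched() test.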
389 /* 390 /*
390 * Check the list of checkpoint transactions for the journal to see if 391 * Check the list of checkpoint transactions for the journal to see if
391 * we have already got rid of any since the last update of the log tail 392 * we have already got rid of any since the last update of the log tail
392 * in the journal superblock. If so, we can instantly roll the 393 * in the journal superblock. If so, we can instantly roll the
393 * superblock forward to remove those transactions from the log. 394 * superblock forward to remove those transactions from the log.
394 * 395 *
395 * Return <0 on error, 0 on success, 1 if there was nothing to clean up. 396 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
396 * 397 *
397 * Called with the journal lock held. 398 * Called with the journal lock held.
398 * 399 *
399 * This is the only part of the journaling code which really needs to be 400 * This is the only part of the journaling code which really needs to be
400 * aware of transaction aborts. Checkpointing involves writing to the 401 * aware of transaction aborts. Checkpointing involves writing to the
401 * main filesystem area rather than to the journal, so it can proceed 402 * main filesystem area rather than to the journal, so it can proceed
402 * even in abort state, but we must not update the journal superblock if 403 * even in abort state, but we must not update the journal superblock if
403 * we have an abort error outstanding. 404 * we have an abort error outstanding.
404 */ 405 */
405 406
406 int jbd2_cleanup_journal_tail(journal_t *journal) 407 int jbd2_cleanup_journal_tail(journal_t *journal)
407 { 408 {
408 transaction_t * transaction; 409 transaction_t * transaction;
409 tid_t first_tid; 410 tid_t first_tid;
410 unsigned long blocknr, freed; 411 unsigned long blocknr, freed;
411 412
412 /* OK, work out the oldest transaction remaining in the log, and 413 /* OK, work out the oldest transaction remaining in the log, and
413 * the log block it starts at. 414 * the log block it starts at.
414 * 415 *
415 * If the log is now empty, we need to work out which is the 416 * If the log is now empty, we need to work out which is the
416 * next transaction ID we will write, and where it will 417 * next transaction ID we will write, and where it will
417 * start. */ 418 * start. */
418 419
419 spin_lock(&journal->j_state_lock); 420 spin_lock(&journal->j_state_lock);
420 spin_lock(&journal->j_list_lock); 421 spin_lock(&journal->j_list_lock);
421 transaction = journal->j_checkpoint_transactions; 422 transaction = journal->j_checkpoint_transactions;
422 if (transaction) { 423 if (transaction) {
423 first_tid = transaction->t_tid; 424 first_tid = transaction->t_tid;
424 blocknr = transaction->t_log_start; 425 blocknr = transaction->t_log_start;
425 } else if ((transaction = journal->j_committing_transaction) != NULL) { 426 } else if ((transaction = journal->j_committing_transaction) != NULL) {
426 first_tid = transaction->t_tid; 427 first_tid = transaction->t_tid;
427 blocknr = transaction->t_log_start; 428 blocknr = transaction->t_log_start;
428 } else if ((transaction = journal->j_running_transaction) != NULL) { 429 } else if ((transaction = journal->j_running_transaction) != NULL) {
429 first_tid = transaction->t_tid; 430 first_tid = transaction->t_tid;
430 blocknr = journal->j_head; 431 blocknr = journal->j_head;
431 } else { 432 } else {
432 first_tid = journal->j_transaction_sequence; 433 first_tid = journal->j_transaction_sequence;
433 blocknr = journal->j_head; 434 blocknr = journal->j_head;
434 } 435 }
435 spin_unlock(&journal->j_list_lock); 436 spin_unlock(&journal->j_list_lock);
436 J_ASSERT(blocknr != 0); 437 J_ASSERT(blocknr != 0);
437 438
438 /* If the oldest pinned transaction is at the tail of the log 439 /* If the oldest pinned transaction is at the tail of the log
439 already then there's not much we can do right now. */ 440 already then there's not much we can do right now. */
440 if (journal->j_tail_sequence == first_tid) { 441 if (journal->j_tail_sequence == first_tid) {
441 spin_unlock(&journal->j_state_lock); 442 spin_unlock(&journal->j_state_lock);
442 return 1; 443 return 1;
443 } 444 }
444 445
445 /* OK, update the superblock to recover the freed space. 446 /* OK, update the superblock to recover the freed space.
446 * Physical blocks come first: have we wrapped beyond the end of 447 * Physical blocks come first: have we wrapped beyond the end of
447 * the log? */ 448 * the log? */
448 freed = blocknr - journal->j_tail; 449 freed = blocknr - journal->j_tail;
449 if (blocknr < journal->j_tail) 450 if (blocknr < journal->j_tail)
450 freed = freed + journal->j_last - journal->j_first; 451 freed = freed + journal->j_last - journal->j_first;
451 452
452 jbd_debug(1, 453 jbd_debug(1,
453 "Cleaning journal tail from %d to %d (offset %lu), " 454 "Cleaning journal tail from %d to %d (offset %lu), "
454 "freeing %lu\n", 455 "freeing %lu\n",
455 journal->j_tail_sequence, first_tid, blocknr, freed); 456 journal->j_tail_sequence, first_tid, blocknr, freed);
456 457
457 journal->j_free += freed; 458 journal->j_free += freed;
458 journal->j_tail_sequence = first_tid; 459 journal->j_tail_sequence = first_tid;
459 journal->j_tail = blocknr; 460 journal->j_tail = blocknr;
460 spin_unlock(&journal->j_state_lock); 461 spin_unlock(&journal->j_state_lock);
461 if (!(journal->j_flags & JBD2_ABORT)) 462 if (!(journal->j_flags & JBD2_ABORT))
462 jbd2_journal_update_superblock(journal, 1); 463 jbd2_journal_update_superblock(journal, 1);
463 return 0; 464 return 0;
464 } 465 }
465 466
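The freed-block arithmetic in jbd2_cleanup_journal_tail() above treats the log as circular: the space recovered is blocknr - j_tail, plus the journal size (j_last - j_first) when the new tail has wrapped past the end of the log area. A tiny standalone check of the same formula with made-up block numbers (nothing here reflects a real journal layout):

#include <assert.h>

/* Circular-log distance from old_tail to new_tail in a journal whose
 * usable blocks span [first, last). */
static unsigned long log_blocks_freed(unsigned long new_tail,
				      unsigned long old_tail,
				      unsigned long first,
				      unsigned long last)
{
	unsigned long freed = new_tail - old_tail;

	if (new_tail < old_tail)
		freed += last - first;
	return freed;
}

int main(void)
{
	/* 100 usable log blocks in [1, 101). */
	assert(log_blocks_freed(60, 40, 1, 101) == 20);	/* no wrap */
	assert(log_blocks_freed(10, 90, 1, 101) == 20);	/* wrapped */
	return 0;
}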
466 467
467 /* Checkpoint list management */ 468 /* Checkpoint list management */
468 469
469 /* 470 /*
470 * journal_clean_one_cp_list 471 * journal_clean_one_cp_list
471 * 472 *
472 * Find all the written-back checkpoint buffers in the given list and release them. 473 * Find all the written-back checkpoint buffers in the given list and release them.
473 * 474 *
474 * Called with the journal locked. 475 * Called with the journal locked.
475 * Called with j_list_lock held. 476 * Called with j_list_lock held.
476 * Returns number of buffers reaped (for debug) 477 * Returns number of buffers reaped (for debug)
477 */ 478 */
478 479
479 static int journal_clean_one_cp_list(struct journal_head *jh, int *released) 480 static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
480 { 481 {
481 struct journal_head *last_jh; 482 struct journal_head *last_jh;
482 struct journal_head *next_jh = jh; 483 struct journal_head *next_jh = jh;
483 int ret, freed = 0; 484 int ret, freed = 0;
484 485
485 *released = 0; 486 *released = 0;
486 if (!jh) 487 if (!jh)
487 return 0; 488 return 0;
488 489
489 last_jh = jh->b_cpprev; 490 last_jh = jh->b_cpprev;
490 do { 491 do {
491 jh = next_jh; 492 jh = next_jh;
492 next_jh = jh->b_cpnext; 493 next_jh = jh->b_cpnext;
493 /* Use trylock because of the ranking */ 494 /* Use trylock because of the ranking */
494 if (jbd_trylock_bh_state(jh2bh(jh))) { 495 if (jbd_trylock_bh_state(jh2bh(jh))) {
495 ret = __try_to_free_cp_buf(jh); 496 ret = __try_to_free_cp_buf(jh);
496 if (ret) { 497 if (ret) {
497 freed++; 498 freed++;
498 if (ret == 2) { 499 if (ret == 2) {
499 *released = 1; 500 *released = 1;
500 return freed; 501 return freed;
501 } 502 }
502 } 503 }
503 } 504 }
504 /* 505 /*
505 * This function only frees up some memory 506 * This function only frees up some memory
506 * if possible so we don't have an obligation 507 * if possible so we don't have an obligation
507 * to finish processing. Bail out if preemption 508 * to finish processing. Bail out if preemption
508 * requested: 509 * requested:
509 */ 510 */
510 if (need_resched()) 511 if (need_resched())
511 return freed; 512 return freed;
512 } while (jh != last_jh); 513 } while (jh != last_jh);
513 514
514 return freed; 515 return freed;
515 } 516 }
516 517
517 /* 518 /*
518 * journal_clean_checkpoint_list 519 * journal_clean_checkpoint_list
519 * 520 *
520 * Find all the written-back checkpoint buffers in the journal and release them. 521 * Find all the written-back checkpoint buffers in the journal and release them.
521 * 522 *
522 * Called with the journal locked. 523 * Called with the journal locked.
523 * Called with j_list_lock held. 524 * Called with j_list_lock held.
524 * Returns number of buffers reaped (for debug) 525 * Returns number of buffers reaped (for debug)
525 */ 526 */
526 527
527 int __jbd2_journal_clean_checkpoint_list(journal_t *journal) 528 int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
528 { 529 {
529 transaction_t *transaction, *last_transaction, *next_transaction; 530 transaction_t *transaction, *last_transaction, *next_transaction;
530 int ret = 0; 531 int ret = 0;
531 int released; 532 int released;
532 533
533 transaction = journal->j_checkpoint_transactions; 534 transaction = journal->j_checkpoint_transactions;
534 if (!transaction) 535 if (!transaction)
535 goto out; 536 goto out;
536 537
537 last_transaction = transaction->t_cpprev; 538 last_transaction = transaction->t_cpprev;
538 next_transaction = transaction; 539 next_transaction = transaction;
539 do { 540 do {
540 transaction = next_transaction; 541 transaction = next_transaction;
541 next_transaction = transaction->t_cpnext; 542 next_transaction = transaction->t_cpnext;
542 ret += journal_clean_one_cp_list(transaction-> 543 ret += journal_clean_one_cp_list(transaction->
543 t_checkpoint_list, &released); 544 t_checkpoint_list, &released);
544 /* 545 /*
545 * This function only frees up some memory if possible so we 546 * This function only frees up some memory if possible so we
546 * don't have an obligation to finish processing. Bail out if 547 * don't have an obligation to finish processing. Bail out if
547 * preemption requested: 548 * preemption requested:
548 */ 549 */
549 if (need_resched()) 550 if (need_resched())
550 goto out; 551 goto out;
551 if (released) 552 if (released)
552 continue; 553 continue;
553 /* 554 /*
554 * It is essential that we are as careful as in the case of 555 * It is essential that we are as careful as in the case of
555 * t_checkpoint_list when removing a buffer from the list, since 556 * t_checkpoint_list when removing a buffer from the list, since
556 * we may see buffers on io_list that have not yet been submitted 557 * we may see buffers on io_list that have not yet been submitted
557 */ 558 */
558 ret += journal_clean_one_cp_list(transaction-> 559 ret += journal_clean_one_cp_list(transaction->
559 t_checkpoint_io_list, &released); 560 t_checkpoint_io_list, &released);
560 if (need_resched()) 561 if (need_resched())
561 goto out; 562 goto out;
562 } while (transaction != last_transaction); 563 } while (transaction != last_transaction);
563 out: 564 out:
564 return ret; 565 return ret;
565 } 566 }
566 567
567 /* 568 /*
568 * journal_remove_checkpoint: called after a buffer has been committed 569 * journal_remove_checkpoint: called after a buffer has been committed
569 * to disk (either by being write-back flushed to disk, or being 570 * to disk (either by being write-back flushed to disk, or being
570 * committed to the log). 571 * committed to the log).
571 * 572 *
572 * We cannot safely clean a transaction out of the log until all of the 573 * We cannot safely clean a transaction out of the log until all of the
573 * buffer updates committed in that transaction have safely been stored 574 * buffer updates committed in that transaction have safely been stored
574 * elsewhere on disk. To achieve this, all of the buffers in a 575 * elsewhere on disk. To achieve this, all of the buffers in a
575 * transaction need to be maintained on the transaction's checkpoint 576 * transaction need to be maintained on the transaction's checkpoint
576 * lists until they have been rewritten, at which point this function is 577 * lists until they have been rewritten, at which point this function is
577 * called to remove the buffer from the existing transaction's 578 * called to remove the buffer from the existing transaction's
578 * checkpoint lists. 579 * checkpoint lists.
579 * 580 *
580 * The function returns 1 if it frees the transaction, 0 otherwise. 581 * The function returns 1 if it frees the transaction, 0 otherwise.
581 * 582 *
582 * This function is called with the journal locked. 583 * This function is called with the journal locked.
583 * This function is called with j_list_lock held. 584 * This function is called with j_list_lock held.
584 * This function is called with jbd_lock_bh_state(jh2bh(jh)) 585 * This function is called with jbd_lock_bh_state(jh2bh(jh))
585 */ 586 */
586 587
587 int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 588 int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
588 { 589 {
589 transaction_t *transaction; 590 transaction_t *transaction;
590 journal_t *journal; 591 journal_t *journal;
591 int ret = 0; 592 int ret = 0;
592 593
593 JBUFFER_TRACE(jh, "entry"); 594 JBUFFER_TRACE(jh, "entry");
594 595
595 if ((transaction = jh->b_cp_transaction) == NULL) { 596 if ((transaction = jh->b_cp_transaction) == NULL) {
596 JBUFFER_TRACE(jh, "not on transaction"); 597 JBUFFER_TRACE(jh, "not on transaction");
597 goto out; 598 goto out;
598 } 599 }
599 journal = transaction->t_journal; 600 journal = transaction->t_journal;
600 601
601 __buffer_unlink(jh); 602 __buffer_unlink(jh);
602 jh->b_cp_transaction = NULL; 603 jh->b_cp_transaction = NULL;
603 604
604 if (transaction->t_checkpoint_list != NULL || 605 if (transaction->t_checkpoint_list != NULL ||
605 transaction->t_checkpoint_io_list != NULL) 606 transaction->t_checkpoint_io_list != NULL)
606 goto out; 607 goto out;
607 JBUFFER_TRACE(jh, "transaction has no more buffers"); 608 JBUFFER_TRACE(jh, "transaction has no more buffers");
608 609
609 /* 610 /*
610 * There is one special case to worry about: if we have just pulled the 611 * There is one special case to worry about: if we have just pulled the
611 * buffer off a running or committing transaction's checkpoint list, 612 * buffer off a running or committing transaction's checkpoint list,
612 * then even if the checkpoint list is empty, the transaction obviously 613 * then even if the checkpoint list is empty, the transaction obviously
613 * cannot be dropped! 614 * cannot be dropped!
614 * 615 *
615 * The locking here around t_state is a bit sleazy. 616 * The locking here around t_state is a bit sleazy.
616 * See the comment at the end of jbd2_journal_commit_transaction(). 617 * See the comment at the end of jbd2_journal_commit_transaction().
617 */ 618 */
618 if (transaction->t_state != T_FINISHED) { 619 if (transaction->t_state != T_FINISHED) {
619 JBUFFER_TRACE(jh, "belongs to running/committing transaction"); 620 JBUFFER_TRACE(jh, "belongs to running/committing transaction");
620 goto out; 621 goto out;
621 } 622 }
622 623
623 /* OK, that was the last buffer for the transaction: we can now 624 /* OK, that was the last buffer for the transaction: we can now
624 safely remove this transaction from the log */ 625 safely remove this transaction from the log */
625 626
626 __jbd2_journal_drop_transaction(journal, transaction); 627 __jbd2_journal_drop_transaction(journal, transaction);
627 628
628 /* Just in case anybody was waiting for more transactions to be 629 /* Just in case anybody was waiting for more transactions to be
629 checkpointed... */ 630 checkpointed... */
630 wake_up(&journal->j_wait_logspace); 631 wake_up(&journal->j_wait_logspace);
631 ret = 1; 632 ret = 1;
632 out: 633 out:
633 JBUFFER_TRACE(jh, "exit"); 634 JBUFFER_TRACE(jh, "exit");
634 return ret; 635 return ret;
635 } 636 }
636 637
637 /* 638 /*
638 * journal_insert_checkpoint: put a committed buffer onto a checkpoint 639 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
639 * list so that we know when it is safe to clean the transaction out of 640 * list so that we know when it is safe to clean the transaction out of
640 * the log. 641 * the log.
641 * 642 *
642 * Called with the journal locked. 643 * Called with the journal locked.
643 * Called with j_list_lock held. 644 * Called with j_list_lock held.
644 */ 645 */
645 void __jbd2_journal_insert_checkpoint(struct journal_head *jh, 646 void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
646 transaction_t *transaction) 647 transaction_t *transaction)
647 { 648 {
648 JBUFFER_TRACE(jh, "entry"); 649 JBUFFER_TRACE(jh, "entry");
649 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); 650 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
650 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); 651 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
651 652
652 jh->b_cp_transaction = transaction; 653 jh->b_cp_transaction = transaction;
653 654
654 if (!transaction->t_checkpoint_list) { 655 if (!transaction->t_checkpoint_list) {
655 jh->b_cpnext = jh->b_cpprev = jh; 656 jh->b_cpnext = jh->b_cpprev = jh;
656 } else { 657 } else {
657 jh->b_cpnext = transaction->t_checkpoint_list; 658 jh->b_cpnext = transaction->t_checkpoint_list;
658 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; 659 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
659 jh->b_cpprev->b_cpnext = jh; 660 jh->b_cpprev->b_cpnext = jh;
660 jh->b_cpnext->b_cpprev = jh; 661 jh->b_cpnext->b_cpprev = jh;
661 } 662 }
662 transaction->t_checkpoint_list = jh; 663 transaction->t_checkpoint_list = jh;
663 } 664 }
664 665
665 /* 666 /*
666 * We've finished with this transaction structure: adios... 667 * We've finished with this transaction structure: adios...
667 * 668 *
668 * The transaction must have no links except for the checkpoint by this 669 * The transaction must have no links except for the checkpoint by this
669 * point. 670 * point.
670 * 671 *
671 * Called with the journal locked. 672 * Called with the journal locked.
672 * Called with j_list_lock held. 673 * Called with j_list_lock held.
673 */ 674 */
674 675
675 void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) 676 void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
676 { 677 {
677 assert_spin_locked(&journal->j_list_lock); 678 assert_spin_locked(&journal->j_list_lock);
678 if (transaction->t_cpnext) { 679 if (transaction->t_cpnext) {
679 transaction->t_cpnext->t_cpprev = transaction->t_cpprev; 680 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
680 transaction->t_cpprev->t_cpnext = transaction->t_cpnext; 681 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
681 if (journal->j_checkpoint_transactions == transaction) 682 if (journal->j_checkpoint_transactions == transaction)
682 journal->j_checkpoint_transactions = 683 journal->j_checkpoint_transactions =
683 transaction->t_cpnext; 684 transaction->t_cpnext;
684 if (journal->j_checkpoint_transactions == transaction) 685 if (journal->j_checkpoint_transactions == transaction)
685 journal->j_checkpoint_transactions = NULL; 686 journal->j_checkpoint_transactions = NULL;
686 } 687 }
687 688
688 J_ASSERT(transaction->t_state == T_FINISHED); 689 J_ASSERT(transaction->t_state == T_FINISHED);
689 J_ASSERT(transaction->t_buffers == NULL); 690 J_ASSERT(transaction->t_buffers == NULL);
690 J_ASSERT(transaction->t_sync_datalist == NULL); 691 J_ASSERT(transaction->t_sync_datalist == NULL);
691 J_ASSERT(transaction->t_forget == NULL); 692 J_ASSERT(transaction->t_forget == NULL);
692 J_ASSERT(transaction->t_iobuf_list == NULL); 693 J_ASSERT(transaction->t_iobuf_list == NULL);
693 J_ASSERT(transaction->t_shadow_list == NULL); 694 J_ASSERT(transaction->t_shadow_list == NULL);
694 J_ASSERT(transaction->t_log_list == NULL); 695 J_ASSERT(transaction->t_log_list == NULL);
695 J_ASSERT(transaction->t_checkpoint_list == NULL); 696 J_ASSERT(transaction->t_checkpoint_list == NULL);
696 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 697 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
697 J_ASSERT(transaction->t_updates == 0); 698 J_ASSERT(transaction->t_updates == 0);
698 J_ASSERT(journal->j_committing_transaction != transaction); 699 J_ASSERT(journal->j_committing_transaction != transaction);
699 J_ASSERT(journal->j_running_transaction != transaction); 700 J_ASSERT(journal->j_running_transaction != transaction);
700 701
701 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 702 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
702 kfree(transaction); 703 kfree(transaction);
703 } 704 }
704 705
1 /* 1 /*
2 * linux/fs/jbd2/commit.c 2 * linux/fs/jbd2/commit.c
3 * 3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 * 5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved 6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 * 7 *
8 * This file is part of the Linux kernel and is made available under 8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your 9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 10 * option, any later version, incorporated herein by reference.
11 * 11 *
12 * Journal commit routines for the generic filesystem journaling code; 12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system. 13 * part of the ext2fs journaling system.
14 */ 14 */
15 15
16 #include <linux/time.h> 16 #include <linux/time.h>
17 #include <linux/fs.h> 17 #include <linux/fs.h>
18 #include <linux/jbd2.h> 18 #include <linux/jbd2.h>
19 #include <linux/errno.h> 19 #include <linux/errno.h>
20 #include <linux/slab.h> 20 #include <linux/slab.h>
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h> 23 #include <linux/jiffies.h>
24 #include <linux/crc32.h> 24 #include <linux/crc32.h>
25 25
26 /* 26 /*
27 * Default IO end handler for temporary BJ_IO buffer_heads. 27 * Default IO end handler for temporary BJ_IO buffer_heads.
28 */ 28 */
29 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 29 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
30 { 30 {
31 BUFFER_TRACE(bh, ""); 31 BUFFER_TRACE(bh, "");
32 if (uptodate) 32 if (uptodate)
33 set_buffer_uptodate(bh); 33 set_buffer_uptodate(bh);
34 else 34 else
35 clear_buffer_uptodate(bh); 35 clear_buffer_uptodate(bh);
36 unlock_buffer(bh); 36 unlock_buffer(bh);
37 } 37 }
38 38
39 /* 39 /*
40 * When an ext3-ordered file is truncated, it is possible that many pages are 40 * When an ext3-ordered file is truncated, it is possible that many pages are
41 * not successfully freed, because they are attached to a committing transaction. 41 * not successfully freed, because they are attached to a committing transaction.
42 * After the transaction commits, these pages are left on the LRU, with no 42 * After the transaction commits, these pages are left on the LRU, with no
43 * ->mapping, and with attached buffers. These pages are trivially reclaimable 43 * ->mapping, and with attached buffers. These pages are trivially reclaimable
44 * by the VM, but their apparent absence upsets the VM accounting, and it makes 44 * by the VM, but their apparent absence upsets the VM accounting, and it makes
45 * the numbers in /proc/meminfo look odd. 45 * the numbers in /proc/meminfo look odd.
46 * 46 *
47 * So here, we have a buffer which has just come off the forget list. Look to 47 * So here, we have a buffer which has just come off the forget list. Look to
48 * see if we can strip all buffers from the backing page. 48 * see if we can strip all buffers from the backing page.
49 * 49 *
50 * Called under lock_journal(), and possibly under journal_datalist_lock. The 50 * Called under lock_journal(), and possibly under journal_datalist_lock. The
51 * caller provided us with a ref against the buffer, and we drop that here. 51 * caller provided us with a ref against the buffer, and we drop that here.
52 */ 52 */
53 static void release_buffer_page(struct buffer_head *bh) 53 static void release_buffer_page(struct buffer_head *bh)
54 { 54 {
55 struct page *page; 55 struct page *page;
56 56
57 if (buffer_dirty(bh)) 57 if (buffer_dirty(bh))
58 goto nope; 58 goto nope;
59 if (atomic_read(&bh->b_count) != 1) 59 if (atomic_read(&bh->b_count) != 1)
60 goto nope; 60 goto nope;
61 page = bh->b_page; 61 page = bh->b_page;
62 if (!page) 62 if (!page)
63 goto nope; 63 goto nope;
64 if (page->mapping) 64 if (page->mapping)
65 goto nope; 65 goto nope;
66 66
67 /* OK, it's a truncated page */ 67 /* OK, it's a truncated page */
68 if (TestSetPageLocked(page)) 68 if (TestSetPageLocked(page))
69 goto nope; 69 goto nope;
70 70
71 page_cache_get(page); 71 page_cache_get(page);
72 __brelse(bh); 72 __brelse(bh);
73 try_to_free_buffers(page); 73 try_to_free_buffers(page);
74 unlock_page(page); 74 unlock_page(page);
75 page_cache_release(page); 75 page_cache_release(page);
76 return; 76 return;
77 77
78 nope: 78 nope:
79 __brelse(bh); 79 __brelse(bh);
80 } 80 }
81 81
82 /* 82 /*
83 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is 83 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
84 * held. For ranking reasons we must trylock. If we lose, schedule away and 84 * held. For ranking reasons we must trylock. If we lose, schedule away and
85 * return 0. j_list_lock is dropped in this case. 85 * return 0. j_list_lock is dropped in this case.
86 */ 86 */
87 static int inverted_lock(journal_t *journal, struct buffer_head *bh) 87 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88 { 88 {
89 if (!jbd_trylock_bh_state(bh)) { 89 if (!jbd_trylock_bh_state(bh)) {
90 spin_unlock(&journal->j_list_lock); 90 spin_unlock(&journal->j_list_lock);
91 schedule(); 91 schedule();
92 return 0; 92 return 0;
93 } 93 }
94 return 1; 94 return 1;
95 } 95 }
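Callers are expected to notice the failure, re-take j_list_lock themselves and restart their walk. A condensed sketch of that caller-side retry shape, modelled on journal_wait_on_locked_list() below (loop body elided):

    spin_lock(&journal->j_list_lock);
    while (commit_transaction->t_locked_list) {
            struct buffer_head *bh = jh2bh(commit_transaction->t_locked_list->b_tprev);

            get_bh(bh);
            if (!inverted_lock(journal, bh)) {
                    /* lost the trylock: j_list_lock was dropped for us */
                    put_bh(bh);
                    spin_lock(&journal->j_list_lock);
                    continue;
            }
            /* ... both j_list_lock and the bh_state lock held here;
             *     unfile the buffer so the walk makes progress ... */
            jbd_unlock_bh_state(bh);
            put_bh(bh);
    }
    spin_unlock(&journal->j_list_lock);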
96 96
97 /* 97 /*
98 * Done it all: now submit the commit record. We should have 98 * Done it all: now submit the commit record. We should have
99 * cleaned up our previous buffers by now, so if we are in abort 99 * cleaned up our previous buffers by now, so if we are in abort
100 * mode we can now just skip the rest of the journal write 100 * mode we can now just skip the rest of the journal write
101 * entirely. 101 * entirely.
102 * 102 *
103 * Returns 1 if the journal needs to be aborted or 0 on success 103 * Returns 1 if the journal needs to be aborted or 0 on success
104 */ 104 */
105 static int journal_submit_commit_record(journal_t *journal, 105 static int journal_submit_commit_record(journal_t *journal,
106 transaction_t *commit_transaction, 106 transaction_t *commit_transaction,
107 struct buffer_head **cbh, 107 struct buffer_head **cbh,
108 __u32 crc32_sum) 108 __u32 crc32_sum)
109 { 109 {
110 struct journal_head *descriptor; 110 struct journal_head *descriptor;
111 struct commit_header *tmp; 111 struct commit_header *tmp;
112 struct buffer_head *bh; 112 struct buffer_head *bh;
113 int ret; 113 int ret;
114 int barrier_done = 0; 114 int barrier_done = 0;
115 115
116 if (is_journal_aborted(journal)) 116 if (is_journal_aborted(journal))
117 return 0; 117 return 0;
118 118
119 descriptor = jbd2_journal_get_descriptor_buffer(journal); 119 descriptor = jbd2_journal_get_descriptor_buffer(journal);
120 if (!descriptor) 120 if (!descriptor)
121 return 1; 121 return 1;
122 122
123 bh = jh2bh(descriptor); 123 bh = jh2bh(descriptor);
124 124
125 tmp = (struct commit_header *)bh->b_data; 125 tmp = (struct commit_header *)bh->b_data;
126 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 126 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
127 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 127 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
128 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); 128 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
129 129
130 if (JBD2_HAS_COMPAT_FEATURE(journal, 130 if (JBD2_HAS_COMPAT_FEATURE(journal,
131 JBD2_FEATURE_COMPAT_CHECKSUM)) { 131 JBD2_FEATURE_COMPAT_CHECKSUM)) {
132 tmp->h_chksum_type = JBD2_CRC32_CHKSUM; 132 tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
133 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 133 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
134 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 134 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
135 } 135 }
136 136
137 JBUFFER_TRACE(descriptor, "submit commit block"); 137 JBUFFER_TRACE(descriptor, "submit commit block");
138 lock_buffer(bh); 138 lock_buffer(bh);
139 139
140 set_buffer_dirty(bh); 140 set_buffer_dirty(bh);
141 set_buffer_uptodate(bh); 141 set_buffer_uptodate(bh);
142 bh->b_end_io = journal_end_buffer_io_sync; 142 bh->b_end_io = journal_end_buffer_io_sync;
143 143
144 if (journal->j_flags & JBD2_BARRIER && 144 if (journal->j_flags & JBD2_BARRIER &&
145 !JBD2_HAS_COMPAT_FEATURE(journal, 145 !JBD2_HAS_COMPAT_FEATURE(journal,
146 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 146 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
147 set_buffer_ordered(bh); 147 set_buffer_ordered(bh);
148 barrier_done = 1; 148 barrier_done = 1;
149 } 149 }
150 ret = submit_bh(WRITE, bh); 150 ret = submit_bh(WRITE, bh);
151 151
152 /* is it possible for another commit to fail at roughly 152 /* is it possible for another commit to fail at roughly
153 * the same time as this one? If so, we don't want to 153 * the same time as this one? If so, we don't want to
154 * trust the barrier flag in the super, but instead want 154 * trust the barrier flag in the super, but instead want
155 * to remember if we sent a barrier request 155 * to remember if we sent a barrier request
156 */ 156 */
157 if (ret == -EOPNOTSUPP && barrier_done) { 157 if (ret == -EOPNOTSUPP && barrier_done) {
158 char b[BDEVNAME_SIZE]; 158 char b[BDEVNAME_SIZE];
159 159
160 printk(KERN_WARNING 160 printk(KERN_WARNING
161 "JBD: barrier-based sync failed on %s - " 161 "JBD: barrier-based sync failed on %s - "
162 "disabling barriers\n", 162 "disabling barriers\n",
163 bdevname(journal->j_dev, b)); 163 bdevname(journal->j_dev, b));
164 spin_lock(&journal->j_state_lock); 164 spin_lock(&journal->j_state_lock);
165 journal->j_flags &= ~JBD2_BARRIER; 165 journal->j_flags &= ~JBD2_BARRIER;
166 spin_unlock(&journal->j_state_lock); 166 spin_unlock(&journal->j_state_lock);
167 167
168 /* And try again, without the barrier */ 168 /* And try again, without the barrier */
169 clear_buffer_ordered(bh); 169 clear_buffer_ordered(bh);
170 set_buffer_uptodate(bh); 170 set_buffer_uptodate(bh);
171 set_buffer_dirty(bh); 171 set_buffer_dirty(bh);
172 ret = submit_bh(WRITE, bh); 172 ret = submit_bh(WRITE, bh);
173 } 173 }
174 *cbh = bh; 174 *cbh = bh;
175 return ret; 175 return ret;
176 } 176 }
177 177
178 /* 178 /*
179 * This function along with journal_submit_commit_record 179 * This function along with journal_submit_commit_record
180 * allows to write the commit record asynchronously. 180 * allows to write the commit record asynchronously.
181 */ 181 */
182 static int journal_wait_on_commit_record(struct buffer_head *bh) 182 static int journal_wait_on_commit_record(struct buffer_head *bh)
183 { 183 {
184 int ret = 0; 184 int ret = 0;
185 185
186 clear_buffer_dirty(bh); 186 clear_buffer_dirty(bh);
187 wait_on_buffer(bh); 187 wait_on_buffer(bh);
188 188
189 if (unlikely(!buffer_uptodate(bh))) 189 if (unlikely(!buffer_uptodate(bh)))
190 ret = -EIO; 190 ret = -EIO;
191 put_bh(bh); /* One for getblk() */ 191 put_bh(bh); /* One for getblk() */
192 jbd2_journal_put_journal_head(bh2jh(bh)); 192 jbd2_journal_put_journal_head(bh2jh(bh));
193 193
194 return ret; 194 return ret;
195 } 195 }
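When JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT is set, the two helpers are used as a pair: the commit record is submitted early, the remaining IO is waited on in between, and only at the very end does the commit path block on the record. Condensed from jbd2_journal_commit_transaction() later in this file:

    if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
            err = journal_submit_commit_record(journal, commit_transaction,
                                               &cbh, crc32_sum);
            if (err)
                    __jbd2_journal_abort_hard(journal);
    }

    /* ... wait for the locked, iobuf and control buffer lists ... */

    if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
            err = journal_submit_commit_record(journal, commit_transaction,
                                               &cbh, crc32_sum);
            if (err)
                    __jbd2_journal_abort_hard(journal);
    }
    err = journal_wait_on_commit_record(cbh);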
196 196
197 /* 197 /*
198 * Wait for all submitted IO to complete. 198 * Wait for all submitted IO to complete.
199 */ 199 */
200 static int journal_wait_on_locked_list(journal_t *journal, 200 static int journal_wait_on_locked_list(journal_t *journal,
201 transaction_t *commit_transaction) 201 transaction_t *commit_transaction)
202 { 202 {
203 int ret = 0; 203 int ret = 0;
204 struct journal_head *jh; 204 struct journal_head *jh;
205 205
206 while (commit_transaction->t_locked_list) { 206 while (commit_transaction->t_locked_list) {
207 struct buffer_head *bh; 207 struct buffer_head *bh;
208 208
209 jh = commit_transaction->t_locked_list->b_tprev; 209 jh = commit_transaction->t_locked_list->b_tprev;
210 bh = jh2bh(jh); 210 bh = jh2bh(jh);
211 get_bh(bh); 211 get_bh(bh);
212 if (buffer_locked(bh)) { 212 if (buffer_locked(bh)) {
213 spin_unlock(&journal->j_list_lock); 213 spin_unlock(&journal->j_list_lock);
214 wait_on_buffer(bh); 214 wait_on_buffer(bh);
215 if (unlikely(!buffer_uptodate(bh))) 215 if (unlikely(!buffer_uptodate(bh)))
216 ret = -EIO; 216 ret = -EIO;
217 spin_lock(&journal->j_list_lock); 217 spin_lock(&journal->j_list_lock);
218 } 218 }
219 if (!inverted_lock(journal, bh)) { 219 if (!inverted_lock(journal, bh)) {
220 put_bh(bh); 220 put_bh(bh);
221 spin_lock(&journal->j_list_lock); 221 spin_lock(&journal->j_list_lock);
222 continue; 222 continue;
223 } 223 }
224 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { 224 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
225 __jbd2_journal_unfile_buffer(jh); 225 __jbd2_journal_unfile_buffer(jh);
226 jbd_unlock_bh_state(bh); 226 jbd_unlock_bh_state(bh);
227 jbd2_journal_remove_journal_head(bh); 227 jbd2_journal_remove_journal_head(bh);
228 put_bh(bh); 228 put_bh(bh);
229 } else { 229 } else {
230 jbd_unlock_bh_state(bh); 230 jbd_unlock_bh_state(bh);
231 } 231 }
232 put_bh(bh); 232 put_bh(bh);
233 cond_resched_lock(&journal->j_list_lock); 233 cond_resched_lock(&journal->j_list_lock);
234 } 234 }
235 return ret; 235 return ret;
236 } 236 }
237 237
238 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 238 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
239 { 239 {
240 int i; 240 int i;
241 241
242 for (i = 0; i < bufs; i++) { 242 for (i = 0; i < bufs; i++) {
243 wbuf[i]->b_end_io = end_buffer_write_sync; 243 wbuf[i]->b_end_io = end_buffer_write_sync;
244 /* We use-up our safety reference in submit_bh() */ 244 /* We use-up our safety reference in submit_bh() */
245 submit_bh(WRITE, wbuf[i]); 245 submit_bh(WRITE, wbuf[i]);
246 } 246 }
247 } 247 }
248 248
249 /* 249 /*
250 * Submit all the data buffers to disk 250 * Submit all the data buffers to disk
251 */ 251 */
252 static void journal_submit_data_buffers(journal_t *journal, 252 static void journal_submit_data_buffers(journal_t *journal,
253 transaction_t *commit_transaction) 253 transaction_t *commit_transaction)
254 { 254 {
255 struct journal_head *jh; 255 struct journal_head *jh;
256 struct buffer_head *bh; 256 struct buffer_head *bh;
257 int locked; 257 int locked;
258 int bufs = 0; 258 int bufs = 0;
259 struct buffer_head **wbuf = journal->j_wbuf; 259 struct buffer_head **wbuf = journal->j_wbuf;
260 260
261 /* 261 /*
262 * Whenever we unlock the journal and sleep, things can get added 262 * Whenever we unlock the journal and sleep, things can get added
263 * onto ->t_sync_datalist, so we have to keep looping back to 263 * onto ->t_sync_datalist, so we have to keep looping back to
264 * write_out_data until we *know* that the list is empty. 264 * write_out_data until we *know* that the list is empty.
265 * 265 *
266 * Cleanup any flushed data buffers from the data list. Even in 266 * Cleanup any flushed data buffers from the data list. Even in
267 * abort mode, we want to flush this out as soon as possible. 267 * abort mode, we want to flush this out as soon as possible.
268 */ 268 */
269 write_out_data: 269 write_out_data:
270 cond_resched(); 270 cond_resched();
271 spin_lock(&journal->j_list_lock); 271 spin_lock(&journal->j_list_lock);
272 272
273 while (commit_transaction->t_sync_datalist) { 273 while (commit_transaction->t_sync_datalist) {
274 jh = commit_transaction->t_sync_datalist; 274 jh = commit_transaction->t_sync_datalist;
275 bh = jh2bh(jh); 275 bh = jh2bh(jh);
276 locked = 0; 276 locked = 0;
277 277
278 /* Get reference just to make sure buffer does not disappear 278 /* Get reference just to make sure buffer does not disappear
279 * when we are forced to drop various locks */ 279 * when we are forced to drop various locks */
280 get_bh(bh); 280 get_bh(bh);
281 /* If the buffer is dirty, we need to submit IO and hence 281 /* If the buffer is dirty, we need to submit IO and hence
282 * we need the buffer lock. We try to lock the buffer without 282 * we need the buffer lock. We try to lock the buffer without
283 * blocking. If we fail, we need to drop j_list_lock and do 283 * blocking. If we fail, we need to drop j_list_lock and do
284 * blocking lock_buffer(). 284 * blocking lock_buffer().
285 */ 285 */
286 if (buffer_dirty(bh)) { 286 if (buffer_dirty(bh)) {
287 if (test_set_buffer_locked(bh)) { 287 if (test_set_buffer_locked(bh)) {
288 BUFFER_TRACE(bh, "needs blocking lock"); 288 BUFFER_TRACE(bh, "needs blocking lock");
289 spin_unlock(&journal->j_list_lock); 289 spin_unlock(&journal->j_list_lock);
290 /* Write out all data to prevent deadlocks */ 290 /* Write out all data to prevent deadlocks */
291 journal_do_submit_data(wbuf, bufs); 291 journal_do_submit_data(wbuf, bufs);
292 bufs = 0; 292 bufs = 0;
293 lock_buffer(bh); 293 lock_buffer(bh);
294 spin_lock(&journal->j_list_lock); 294 spin_lock(&journal->j_list_lock);
295 } 295 }
296 locked = 1; 296 locked = 1;
297 } 297 }
298 /* We have to get bh_state lock. Again out of order, sigh. */ 298 /* We have to get bh_state lock. Again out of order, sigh. */
299 if (!inverted_lock(journal, bh)) { 299 if (!inverted_lock(journal, bh)) {
300 jbd_lock_bh_state(bh); 300 jbd_lock_bh_state(bh);
301 spin_lock(&journal->j_list_lock); 301 spin_lock(&journal->j_list_lock);
302 } 302 }
303 /* Someone already cleaned up the buffer? */ 303 /* Someone already cleaned up the buffer? */
304 if (!buffer_jbd(bh) 304 if (!buffer_jbd(bh)
305 || jh->b_transaction != commit_transaction 305 || jh->b_transaction != commit_transaction
306 || jh->b_jlist != BJ_SyncData) { 306 || jh->b_jlist != BJ_SyncData) {
307 jbd_unlock_bh_state(bh); 307 jbd_unlock_bh_state(bh);
308 if (locked) 308 if (locked)
309 unlock_buffer(bh); 309 unlock_buffer(bh);
310 BUFFER_TRACE(bh, "already cleaned up"); 310 BUFFER_TRACE(bh, "already cleaned up");
311 put_bh(bh); 311 put_bh(bh);
312 continue; 312 continue;
313 } 313 }
314 if (locked && test_clear_buffer_dirty(bh)) { 314 if (locked && test_clear_buffer_dirty(bh)) {
315 BUFFER_TRACE(bh, "needs writeout, adding to array"); 315 BUFFER_TRACE(bh, "needs writeout, adding to array");
316 wbuf[bufs++] = bh; 316 wbuf[bufs++] = bh;
317 __jbd2_journal_file_buffer(jh, commit_transaction, 317 __jbd2_journal_file_buffer(jh, commit_transaction,
318 BJ_Locked); 318 BJ_Locked);
319 jbd_unlock_bh_state(bh); 319 jbd_unlock_bh_state(bh);
320 if (bufs == journal->j_wbufsize) { 320 if (bufs == journal->j_wbufsize) {
321 spin_unlock(&journal->j_list_lock); 321 spin_unlock(&journal->j_list_lock);
322 journal_do_submit_data(wbuf, bufs); 322 journal_do_submit_data(wbuf, bufs);
323 bufs = 0; 323 bufs = 0;
324 goto write_out_data; 324 goto write_out_data;
325 } 325 }
326 } else if (!locked && buffer_locked(bh)) { 326 } else if (!locked && buffer_locked(bh)) {
327 __jbd2_journal_file_buffer(jh, commit_transaction, 327 __jbd2_journal_file_buffer(jh, commit_transaction,
328 BJ_Locked); 328 BJ_Locked);
329 jbd_unlock_bh_state(bh); 329 jbd_unlock_bh_state(bh);
330 put_bh(bh); 330 put_bh(bh);
331 } else { 331 } else {
332 BUFFER_TRACE(bh, "writeout complete: unfile"); 332 BUFFER_TRACE(bh, "writeout complete: unfile");
333 __jbd2_journal_unfile_buffer(jh); 333 __jbd2_journal_unfile_buffer(jh);
334 jbd_unlock_bh_state(bh); 334 jbd_unlock_bh_state(bh);
335 if (locked) 335 if (locked)
336 unlock_buffer(bh); 336 unlock_buffer(bh);
337 jbd2_journal_remove_journal_head(bh); 337 jbd2_journal_remove_journal_head(bh);
338 /* Once for our safety reference, once for 338 /* Once for our safety reference, once for
339 * jbd2_journal_remove_journal_head() */ 339 * jbd2_journal_remove_journal_head() */
340 put_bh(bh); 340 put_bh(bh);
341 put_bh(bh); 341 put_bh(bh);
342 } 342 }
343 343
344 if (lock_need_resched(&journal->j_list_lock)) { 344 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
345 spin_unlock(&journal->j_list_lock); 345 spin_unlock(&journal->j_list_lock);
346 goto write_out_data; 346 goto write_out_data;
347 } 347 }
348 } 348 }
349 spin_unlock(&journal->j_list_lock); 349 spin_unlock(&journal->j_list_lock);
350 journal_do_submit_data(wbuf, bufs); 350 journal_do_submit_data(wbuf, bufs);
351 } 351 }
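The restart at write_out_data is driven by the check above: rather than holding j_list_lock across a long walk, the loop backs off whenever a reschedule is pending or, with this commit, whenever spin_needbreak() reports another CPU waiting on the lock. The general shape of the pattern, as a hedged sketch (hypothetical drain_list(), not jbd2 code):

    #include <linux/spinlock.h>
    #include <linux/sched.h>
    #include <linux/list.h>

    /* Drain a list without hogging either the CPU or the spinlock
     * protecting it. */
    static void drain_list(spinlock_t *lock, struct list_head *list)
    {
            spin_lock(lock);
            while (!list_empty(list)) {
                    /* ... detach and process one element under the lock ... */

                    if (need_resched() || spin_needbreak(lock)) {
                            /* give up the CPU and/or the lock, then resume */
                            spin_unlock(lock);
                            cond_resched();
                            spin_lock(lock);
                    }
            }
            spin_unlock(lock);
    }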
352 352
353 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) 353 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
354 { 354 {
355 struct page *page = bh->b_page; 355 struct page *page = bh->b_page;
356 char *addr; 356 char *addr;
357 __u32 checksum; 357 __u32 checksum;
358 358
359 addr = kmap_atomic(page, KM_USER0); 359 addr = kmap_atomic(page, KM_USER0);
360 checksum = crc32_be(crc32_sum, 360 checksum = crc32_be(crc32_sum,
361 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); 361 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
362 kunmap_atomic(addr, KM_USER0); 362 kunmap_atomic(addr, KM_USER0);
363 363
364 return checksum; 364 return checksum;
365 } 365 }
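jbd2_checksum_data() folds one buffer into a running CRC, so a whole set of journal blocks can be covered by seeding the sum with ~0 and feeding the previous result back in each time. Condensed from the IO submission loop later in this file (the final value is what journal_submit_commit_record() stores in the commit header):

    __u32 crc32_sum = ~0;

    for (i = 0; i < bufs; i++) {
            if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM))
                    crc32_sum = jbd2_checksum_data(crc32_sum, wbuf[i]);
            /* ... lock, mark uptodate and submit_bh(WRITE, wbuf[i]) ... */
    }
    /* crc32_sum now covers every block in submission order */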
366 366
367 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, 367 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
368 unsigned long long block) 368 unsigned long long block)
369 { 369 {
370 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 370 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
371 if (tag_bytes > JBD2_TAG_SIZE32) 371 if (tag_bytes > JBD2_TAG_SIZE32)
372 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 372 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
373 } 373 }
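write_tag_block() splits a 64-bit block number into two 32-bit on-disk fields: the low word always, the high word only for 64-bit tags (tag_bytes > JBD2_TAG_SIZE32). The two-step (block >> 31) >> 1 equals block >> 32 here; it is presumably written that way to keep the shift count below 32 in configurations where the block type is narrower. A standalone illustration of the arithmetic (plain C, byte-swapping omitted):

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    int main(void)
    {
            unsigned long long block = 0x123456789ULL;      /* arbitrary example */
            uint32_t lo = (uint32_t)(block & 0xffffffffULL);
            uint32_t hi = (uint32_t)((block >> 31) >> 1);   /* same as block >> 32 here */

            printf("block 0x%llx -> lo 0x%" PRIx32 ", hi 0x%" PRIx32 "\n",
                   block, lo, hi);
            /* prints: block 0x123456789 -> lo 0x23456789, hi 0x1 */
            return 0;
    }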
374 374
375 /* 375 /*
376 * jbd2_journal_commit_transaction 376 * jbd2_journal_commit_transaction
377 * 377 *
378 * The primary function for committing a transaction to the log. This 378 * The primary function for committing a transaction to the log. This
379 * function is called by the journal thread to begin a complete commit. 379 * function is called by the journal thread to begin a complete commit.
380 */ 380 */
381 void jbd2_journal_commit_transaction(journal_t *journal) 381 void jbd2_journal_commit_transaction(journal_t *journal)
382 { 382 {
383 struct transaction_stats_s stats; 383 struct transaction_stats_s stats;
384 transaction_t *commit_transaction; 384 transaction_t *commit_transaction;
385 struct journal_head *jh, *new_jh, *descriptor; 385 struct journal_head *jh, *new_jh, *descriptor;
386 struct buffer_head **wbuf = journal->j_wbuf; 386 struct buffer_head **wbuf = journal->j_wbuf;
387 int bufs; 387 int bufs;
388 int flags; 388 int flags;
389 int err; 389 int err;
390 unsigned long long blocknr; 390 unsigned long long blocknr;
391 char *tagp = NULL; 391 char *tagp = NULL;
392 journal_header_t *header; 392 journal_header_t *header;
393 journal_block_tag_t *tag = NULL; 393 journal_block_tag_t *tag = NULL;
394 int space_left = 0; 394 int space_left = 0;
395 int first_tag = 0; 395 int first_tag = 0;
396 int tag_flag; 396 int tag_flag;
397 int i; 397 int i;
398 int tag_bytes = journal_tag_bytes(journal); 398 int tag_bytes = journal_tag_bytes(journal);
399 struct buffer_head *cbh = NULL; /* For transactional checksums */ 399 struct buffer_head *cbh = NULL; /* For transactional checksums */
400 __u32 crc32_sum = ~0; 400 __u32 crc32_sum = ~0;
401 401
402 /* 402 /*
403 * First job: lock down the current transaction and wait for 403 * First job: lock down the current transaction and wait for
404 * all outstanding updates to complete. 404 * all outstanding updates to complete.
405 */ 405 */
406 406
407 #ifdef COMMIT_STATS 407 #ifdef COMMIT_STATS
408 spin_lock(&journal->j_list_lock); 408 spin_lock(&journal->j_list_lock);
409 summarise_journal_usage(journal); 409 summarise_journal_usage(journal);
410 spin_unlock(&journal->j_list_lock); 410 spin_unlock(&journal->j_list_lock);
411 #endif 411 #endif
412 412
413 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 413 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
414 if (journal->j_flags & JBD2_FLUSHED) { 414 if (journal->j_flags & JBD2_FLUSHED) {
415 jbd_debug(3, "super block updated\n"); 415 jbd_debug(3, "super block updated\n");
416 jbd2_journal_update_superblock(journal, 1); 416 jbd2_journal_update_superblock(journal, 1);
417 } else { 417 } else {
418 jbd_debug(3, "superblock not updated\n"); 418 jbd_debug(3, "superblock not updated\n");
419 } 419 }
420 420
421 J_ASSERT(journal->j_running_transaction != NULL); 421 J_ASSERT(journal->j_running_transaction != NULL);
422 J_ASSERT(journal->j_committing_transaction == NULL); 422 J_ASSERT(journal->j_committing_transaction == NULL);
423 423
424 commit_transaction = journal->j_running_transaction; 424 commit_transaction = journal->j_running_transaction;
425 J_ASSERT(commit_transaction->t_state == T_RUNNING); 425 J_ASSERT(commit_transaction->t_state == T_RUNNING);
426 426
427 jbd_debug(1, "JBD: starting commit of transaction %d\n", 427 jbd_debug(1, "JBD: starting commit of transaction %d\n",
428 commit_transaction->t_tid); 428 commit_transaction->t_tid);
429 429
430 spin_lock(&journal->j_state_lock); 430 spin_lock(&journal->j_state_lock);
431 commit_transaction->t_state = T_LOCKED; 431 commit_transaction->t_state = T_LOCKED;
432 432
433 stats.u.run.rs_wait = commit_transaction->t_max_wait; 433 stats.u.run.rs_wait = commit_transaction->t_max_wait;
434 stats.u.run.rs_locked = jiffies; 434 stats.u.run.rs_locked = jiffies;
435 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 435 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
436 stats.u.run.rs_locked); 436 stats.u.run.rs_locked);
437 437
438 spin_lock(&commit_transaction->t_handle_lock); 438 spin_lock(&commit_transaction->t_handle_lock);
439 while (commit_transaction->t_updates) { 439 while (commit_transaction->t_updates) {
440 DEFINE_WAIT(wait); 440 DEFINE_WAIT(wait);
441 441
442 prepare_to_wait(&journal->j_wait_updates, &wait, 442 prepare_to_wait(&journal->j_wait_updates, &wait,
443 TASK_UNINTERRUPTIBLE); 443 TASK_UNINTERRUPTIBLE);
444 if (commit_transaction->t_updates) { 444 if (commit_transaction->t_updates) {
445 spin_unlock(&commit_transaction->t_handle_lock); 445 spin_unlock(&commit_transaction->t_handle_lock);
446 spin_unlock(&journal->j_state_lock); 446 spin_unlock(&journal->j_state_lock);
447 schedule(); 447 schedule();
448 spin_lock(&journal->j_state_lock); 448 spin_lock(&journal->j_state_lock);
449 spin_lock(&commit_transaction->t_handle_lock); 449 spin_lock(&commit_transaction->t_handle_lock);
450 } 450 }
451 finish_wait(&journal->j_wait_updates, &wait); 451 finish_wait(&journal->j_wait_updates, &wait);
452 } 452 }
453 spin_unlock(&commit_transaction->t_handle_lock); 453 spin_unlock(&commit_transaction->t_handle_lock);
454 454
455 J_ASSERT (commit_transaction->t_outstanding_credits <= 455 J_ASSERT (commit_transaction->t_outstanding_credits <=
456 journal->j_max_transaction_buffers); 456 journal->j_max_transaction_buffers);
457 457
458 /* 458 /*
459 * First thing we are allowed to do is to discard any remaining 459 * First thing we are allowed to do is to discard any remaining
460 * BJ_Reserved buffers. Note, it is _not_ permissible to assume 460 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
461 * that there are no such buffers: if a large filesystem 461 * that there are no such buffers: if a large filesystem
462 * operation like a truncate needs to split itself over multiple 462 * operation like a truncate needs to split itself over multiple
463 * transactions, then it may try to do a jbd2_journal_restart() while 463 * transactions, then it may try to do a jbd2_journal_restart() while
464 * there are still BJ_Reserved buffers outstanding. These must 464 * there are still BJ_Reserved buffers outstanding. These must
465 * be released cleanly from the current transaction. 465 * be released cleanly from the current transaction.
466 * 466 *
467 * In this case, the filesystem must still reserve write access 467 * In this case, the filesystem must still reserve write access
468 * again before modifying the buffer in the new transaction, but 468 * again before modifying the buffer in the new transaction, but
469 * we do not require it to remember exactly which old buffers it 469 * we do not require it to remember exactly which old buffers it
470 * has reserved. This is consistent with the existing behaviour 470 * has reserved. This is consistent with the existing behaviour
471 * that multiple jbd2_journal_get_write_access() calls to the same 471 * that multiple jbd2_journal_get_write_access() calls to the same
472 * buffer are perfectly permissible. 472 * buffer are perfectly permissible.
473 */ 473 */
474 while (commit_transaction->t_reserved_list) { 474 while (commit_transaction->t_reserved_list) {
475 jh = commit_transaction->t_reserved_list; 475 jh = commit_transaction->t_reserved_list;
476 JBUFFER_TRACE(jh, "reserved, unused: refile"); 476 JBUFFER_TRACE(jh, "reserved, unused: refile");
477 /* 477 /*
478 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may 478 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
479 * leave undo-committed data. 479 * leave undo-committed data.
480 */ 480 */
481 if (jh->b_committed_data) { 481 if (jh->b_committed_data) {
482 struct buffer_head *bh = jh2bh(jh); 482 struct buffer_head *bh = jh2bh(jh);
483 483
484 jbd_lock_bh_state(bh); 484 jbd_lock_bh_state(bh);
485 jbd2_free(jh->b_committed_data, bh->b_size); 485 jbd2_free(jh->b_committed_data, bh->b_size);
486 jh->b_committed_data = NULL; 486 jh->b_committed_data = NULL;
487 jbd_unlock_bh_state(bh); 487 jbd_unlock_bh_state(bh);
488 } 488 }
489 jbd2_journal_refile_buffer(journal, jh); 489 jbd2_journal_refile_buffer(journal, jh);
490 } 490 }
491 491
492 /* 492 /*
493 * Now try to drop any written-back buffers from the journal's 493 * Now try to drop any written-back buffers from the journal's
494 * checkpoint lists. We do this *before* commit because it potentially 494 * checkpoint lists. We do this *before* commit because it potentially
495 * frees some memory 495 * frees some memory
496 */ 496 */
497 spin_lock(&journal->j_list_lock); 497 spin_lock(&journal->j_list_lock);
498 __jbd2_journal_clean_checkpoint_list(journal); 498 __jbd2_journal_clean_checkpoint_list(journal);
499 spin_unlock(&journal->j_list_lock); 499 spin_unlock(&journal->j_list_lock);
500 500
501 jbd_debug (3, "JBD: commit phase 1\n"); 501 jbd_debug (3, "JBD: commit phase 1\n");
502 502
503 /* 503 /*
504 * Switch to a new revoke table. 504 * Switch to a new revoke table.
505 */ 505 */
506 jbd2_journal_switch_revoke_table(journal); 506 jbd2_journal_switch_revoke_table(journal);
507 507
508 stats.u.run.rs_flushing = jiffies; 508 stats.u.run.rs_flushing = jiffies;
509 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, 509 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
510 stats.u.run.rs_flushing); 510 stats.u.run.rs_flushing);
511 511
512 commit_transaction->t_state = T_FLUSH; 512 commit_transaction->t_state = T_FLUSH;
513 journal->j_committing_transaction = commit_transaction; 513 journal->j_committing_transaction = commit_transaction;
514 journal->j_running_transaction = NULL; 514 journal->j_running_transaction = NULL;
515 commit_transaction->t_log_start = journal->j_head; 515 commit_transaction->t_log_start = journal->j_head;
516 wake_up(&journal->j_wait_transaction_locked); 516 wake_up(&journal->j_wait_transaction_locked);
517 spin_unlock(&journal->j_state_lock); 517 spin_unlock(&journal->j_state_lock);
518 518
519 jbd_debug (3, "JBD: commit phase 2\n"); 519 jbd_debug (3, "JBD: commit phase 2\n");
520 520
521 /* 521 /*
522 * First, drop modified flag: all accesses to the buffers 522 * First, drop modified flag: all accesses to the buffers
523 * will be tracked for a new transaction only -bzzz 523 * will be tracked for a new transaction only -bzzz
524 */ 524 */
525 spin_lock(&journal->j_list_lock); 525 spin_lock(&journal->j_list_lock);
526 if (commit_transaction->t_buffers) { 526 if (commit_transaction->t_buffers) {
527 new_jh = jh = commit_transaction->t_buffers->b_tnext; 527 new_jh = jh = commit_transaction->t_buffers->b_tnext;
528 do { 528 do {
529 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || 529 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
530 new_jh->b_modified == 0); 530 new_jh->b_modified == 0);
531 new_jh->b_modified = 0; 531 new_jh->b_modified = 0;
532 new_jh = new_jh->b_tnext; 532 new_jh = new_jh->b_tnext;
533 } while (new_jh != jh); 533 } while (new_jh != jh);
534 } 534 }
535 spin_unlock(&journal->j_list_lock); 535 spin_unlock(&journal->j_list_lock);
536 536
537 /* 537 /*
538 * Now start flushing things to disk, in the order they appear 538 * Now start flushing things to disk, in the order they appear
539 * on the transaction lists. Data blocks go first. 539 * on the transaction lists. Data blocks go first.
540 */ 540 */
541 err = 0; 541 err = 0;
542 journal_submit_data_buffers(journal, commit_transaction); 542 journal_submit_data_buffers(journal, commit_transaction);
543 543
544 /* 544 /*
545 * Wait for all previously submitted IO to complete if commit 545 * Wait for all previously submitted IO to complete if commit
546 * record is to be written synchronously. 546 * record is to be written synchronously.
547 */ 547 */
548 spin_lock(&journal->j_list_lock); 548 spin_lock(&journal->j_list_lock);
549 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 549 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
550 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) 550 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
551 err = journal_wait_on_locked_list(journal, 551 err = journal_wait_on_locked_list(journal,
552 commit_transaction); 552 commit_transaction);
553 553
554 spin_unlock(&journal->j_list_lock); 554 spin_unlock(&journal->j_list_lock);
555 555
556 if (err) 556 if (err)
557 jbd2_journal_abort(journal, err); 557 jbd2_journal_abort(journal, err);
558 558
559 jbd2_journal_write_revoke_records(journal, commit_transaction); 559 jbd2_journal_write_revoke_records(journal, commit_transaction);
560 560
561 jbd_debug(3, "JBD: commit phase 2\n"); 561 jbd_debug(3, "JBD: commit phase 2\n");
562 562
563 /* 563 /*
564 * If we found any dirty or locked buffers, then we should have 564 * If we found any dirty or locked buffers, then we should have
565 * looped back up to the write_out_data label. If there weren't 565 * looped back up to the write_out_data label. If there weren't
566 * any then journal_clean_data_list should have wiped the list 566 * any then journal_clean_data_list should have wiped the list
567 * clean by now, so check that it is in fact empty. 567 * clean by now, so check that it is in fact empty.
568 */ 568 */
569 J_ASSERT (commit_transaction->t_sync_datalist == NULL); 569 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
570 570
571 jbd_debug (3, "JBD: commit phase 3\n"); 571 jbd_debug (3, "JBD: commit phase 3\n");
572 572
573 /* 573 /*
574 * Way to go: we have now written out all of the data for a 574 * Way to go: we have now written out all of the data for a
575 * transaction! Now comes the tricky part: we need to write out 575 * transaction! Now comes the tricky part: we need to write out
576 * metadata. Loop over the transaction's entire buffer list: 576 * metadata. Loop over the transaction's entire buffer list:
577 */ 577 */
578 commit_transaction->t_state = T_COMMIT; 578 commit_transaction->t_state = T_COMMIT;
579 579
580 stats.u.run.rs_logging = jiffies; 580 stats.u.run.rs_logging = jiffies;
581 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, 581 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
582 stats.u.run.rs_logging); 582 stats.u.run.rs_logging);
583 stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; 583 stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
584 stats.u.run.rs_blocks_logged = 0; 584 stats.u.run.rs_blocks_logged = 0;
585 585
586 descriptor = NULL; 586 descriptor = NULL;
587 bufs = 0; 587 bufs = 0;
588 while (commit_transaction->t_buffers) { 588 while (commit_transaction->t_buffers) {
589 589
590 /* Find the next buffer to be journaled... */ 590 /* Find the next buffer to be journaled... */
591 591
592 jh = commit_transaction->t_buffers; 592 jh = commit_transaction->t_buffers;
593 593
594 /* If we're in abort mode, we just un-journal the buffer and 594 /* If we're in abort mode, we just un-journal the buffer and
595 release it for background writing. */ 595 release it for background writing. */
596 596
597 if (is_journal_aborted(journal)) { 597 if (is_journal_aborted(journal)) {
598 JBUFFER_TRACE(jh, "journal is aborting: refile"); 598 JBUFFER_TRACE(jh, "journal is aborting: refile");
599 jbd2_journal_refile_buffer(journal, jh); 599 jbd2_journal_refile_buffer(journal, jh);
600 /* If that was the last one, we need to clean up 600 /* If that was the last one, we need to clean up
601 * any descriptor buffers which may have been 601 * any descriptor buffers which may have been
602 * already allocated, even if we are now 602 * already allocated, even if we are now
603 * aborting. */ 603 * aborting. */
604 if (!commit_transaction->t_buffers) 604 if (!commit_transaction->t_buffers)
605 goto start_journal_io; 605 goto start_journal_io;
606 continue; 606 continue;
607 } 607 }
608 608
609 /* Make sure we have a descriptor block in which to 609 /* Make sure we have a descriptor block in which to
610 record the metadata buffer. */ 610 record the metadata buffer. */
611 611
612 if (!descriptor) { 612 if (!descriptor) {
613 struct buffer_head *bh; 613 struct buffer_head *bh;
614 614
615 J_ASSERT (bufs == 0); 615 J_ASSERT (bufs == 0);
616 616
617 jbd_debug(4, "JBD: get descriptor\n"); 617 jbd_debug(4, "JBD: get descriptor\n");
618 618
619 descriptor = jbd2_journal_get_descriptor_buffer(journal); 619 descriptor = jbd2_journal_get_descriptor_buffer(journal);
620 if (!descriptor) { 620 if (!descriptor) {
621 jbd2_journal_abort(journal, -EIO); 621 jbd2_journal_abort(journal, -EIO);
622 continue; 622 continue;
623 } 623 }
624 624
625 bh = jh2bh(descriptor); 625 bh = jh2bh(descriptor);
626 jbd_debug(4, "JBD: got buffer %llu (%p)\n", 626 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
627 (unsigned long long)bh->b_blocknr, bh->b_data); 627 (unsigned long long)bh->b_blocknr, bh->b_data);
628 header = (journal_header_t *)&bh->b_data[0]; 628 header = (journal_header_t *)&bh->b_data[0];
629 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 629 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
630 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); 630 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
631 header->h_sequence = cpu_to_be32(commit_transaction->t_tid); 631 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
632 632
633 tagp = &bh->b_data[sizeof(journal_header_t)]; 633 tagp = &bh->b_data[sizeof(journal_header_t)];
634 space_left = bh->b_size - sizeof(journal_header_t); 634 space_left = bh->b_size - sizeof(journal_header_t);
635 first_tag = 1; 635 first_tag = 1;
636 set_buffer_jwrite(bh); 636 set_buffer_jwrite(bh);
637 set_buffer_dirty(bh); 637 set_buffer_dirty(bh);
638 wbuf[bufs++] = bh; 638 wbuf[bufs++] = bh;
639 639
640 /* Record it so that we can wait for IO 640 /* Record it so that we can wait for IO
641 completion later */ 641 completion later */
642 BUFFER_TRACE(bh, "ph3: file as descriptor"); 642 BUFFER_TRACE(bh, "ph3: file as descriptor");
643 jbd2_journal_file_buffer(descriptor, commit_transaction, 643 jbd2_journal_file_buffer(descriptor, commit_transaction,
644 BJ_LogCtl); 644 BJ_LogCtl);
645 } 645 }
646 646
647 /* Where is the buffer to be written? */ 647 /* Where is the buffer to be written? */
648 648
649 err = jbd2_journal_next_log_block(journal, &blocknr); 649 err = jbd2_journal_next_log_block(journal, &blocknr);
650 /* If the block mapping failed, just abandon the buffer 650 /* If the block mapping failed, just abandon the buffer
651 and repeat this loop: we'll fall into the 651 and repeat this loop: we'll fall into the
652 refile-on-abort condition above. */ 652 refile-on-abort condition above. */
653 if (err) { 653 if (err) {
654 jbd2_journal_abort(journal, err); 654 jbd2_journal_abort(journal, err);
655 continue; 655 continue;
656 } 656 }
657 657
658 /* 658 /*
659 * start_this_handle() uses t_outstanding_credits to determine 659 * start_this_handle() uses t_outstanding_credits to determine
660 * the free space in the log, but this counter is changed 660 * the free space in the log, but this counter is changed
661 * by jbd2_journal_next_log_block() also. 661 * by jbd2_journal_next_log_block() also.
662 */ 662 */
663 commit_transaction->t_outstanding_credits--; 663 commit_transaction->t_outstanding_credits--;
664 664
665 /* Bump b_count to prevent truncate from stumbling over 665 /* Bump b_count to prevent truncate from stumbling over
666 the shadowed buffer! @@@ This can go if we ever get 666 the shadowed buffer! @@@ This can go if we ever get
667 rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 667 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
668 atomic_inc(&jh2bh(jh)->b_count); 668 atomic_inc(&jh2bh(jh)->b_count);
669 669
670 /* Make a temporary IO buffer with which to write it out 670 /* Make a temporary IO buffer with which to write it out
671 (this will requeue both the metadata buffer and the 671 (this will requeue both the metadata buffer and the
672 temporary IO buffer). new_bh goes on BJ_IO*/ 672 temporary IO buffer). new_bh goes on BJ_IO*/
673 673
674 set_bit(BH_JWrite, &jh2bh(jh)->b_state); 674 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
675 /* 675 /*
676 * akpm: jbd2_journal_write_metadata_buffer() sets 676 * akpm: jbd2_journal_write_metadata_buffer() sets
677 * new_bh->b_transaction to commit_transaction. 677 * new_bh->b_transaction to commit_transaction.
678 * We need to clean this up before we release new_bh 678 * We need to clean this up before we release new_bh
679 * (which is of type BJ_IO) 679 * (which is of type BJ_IO)
680 */ 680 */
681 JBUFFER_TRACE(jh, "ph3: write metadata"); 681 JBUFFER_TRACE(jh, "ph3: write metadata");
682 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 682 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
683 jh, &new_jh, blocknr); 683 jh, &new_jh, blocknr);
684 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 684 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
685 wbuf[bufs++] = jh2bh(new_jh); 685 wbuf[bufs++] = jh2bh(new_jh);
686 686
687 /* Record the new block's tag in the current descriptor 687 /* Record the new block's tag in the current descriptor
688 buffer */ 688 buffer */
689 689
690 tag_flag = 0; 690 tag_flag = 0;
691 if (flags & 1) 691 if (flags & 1)
692 tag_flag |= JBD2_FLAG_ESCAPE; 692 tag_flag |= JBD2_FLAG_ESCAPE;
693 if (!first_tag) 693 if (!first_tag)
694 tag_flag |= JBD2_FLAG_SAME_UUID; 694 tag_flag |= JBD2_FLAG_SAME_UUID;
695 695
696 tag = (journal_block_tag_t *) tagp; 696 tag = (journal_block_tag_t *) tagp;
697 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 697 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
698 tag->t_flags = cpu_to_be32(tag_flag); 698 tag->t_flags = cpu_to_be32(tag_flag);
699 tagp += tag_bytes; 699 tagp += tag_bytes;
700 space_left -= tag_bytes; 700 space_left -= tag_bytes;
701 701
702 if (first_tag) { 702 if (first_tag) {
703 memcpy (tagp, journal->j_uuid, 16); 703 memcpy (tagp, journal->j_uuid, 16);
704 tagp += 16; 704 tagp += 16;
705 space_left -= 16; 705 space_left -= 16;
706 first_tag = 0; 706 first_tag = 0;
707 } 707 }
708 708
709 /* If there's no more to do, or if the descriptor is full, 709 /* If there's no more to do, or if the descriptor is full,
710 let the IO rip! */ 710 let the IO rip! */
711 711
712 if (bufs == journal->j_wbufsize || 712 if (bufs == journal->j_wbufsize ||
713 commit_transaction->t_buffers == NULL || 713 commit_transaction->t_buffers == NULL ||
714 space_left < tag_bytes + 16) { 714 space_left < tag_bytes + 16) {
715 715
716 jbd_debug(4, "JBD: Submit %d IOs\n", bufs); 716 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
717 717
718 /* Write an end-of-descriptor marker before 718 /* Write an end-of-descriptor marker before
719 submitting the IOs. "tag" still points to 719 submitting the IOs. "tag" still points to
720 the last tag we set up. */ 720 the last tag we set up. */
721 721
722 tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); 722 tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
723 723
724 start_journal_io: 724 start_journal_io:
725 for (i = 0; i < bufs; i++) { 725 for (i = 0; i < bufs; i++) {
726 struct buffer_head *bh = wbuf[i]; 726 struct buffer_head *bh = wbuf[i];
727 /* 727 /*
728 * Compute checksum. 728 * Compute checksum.
729 */ 729 */
730 if (JBD2_HAS_COMPAT_FEATURE(journal, 730 if (JBD2_HAS_COMPAT_FEATURE(journal,
731 JBD2_FEATURE_COMPAT_CHECKSUM)) { 731 JBD2_FEATURE_COMPAT_CHECKSUM)) {
732 crc32_sum = 732 crc32_sum =
733 jbd2_checksum_data(crc32_sum, bh); 733 jbd2_checksum_data(crc32_sum, bh);
734 } 734 }
735 735
736 lock_buffer(bh); 736 lock_buffer(bh);
737 clear_buffer_dirty(bh); 737 clear_buffer_dirty(bh);
738 set_buffer_uptodate(bh); 738 set_buffer_uptodate(bh);
739 bh->b_end_io = journal_end_buffer_io_sync; 739 bh->b_end_io = journal_end_buffer_io_sync;
740 submit_bh(WRITE, bh); 740 submit_bh(WRITE, bh);
741 } 741 }
742 cond_resched(); 742 cond_resched();
743 stats.u.run.rs_blocks_logged += bufs; 743 stats.u.run.rs_blocks_logged += bufs;
744 744
745 /* Force a new descriptor to be generated next 745 /* Force a new descriptor to be generated next
746 time round the loop. */ 746 time round the loop. */
747 descriptor = NULL; 747 descriptor = NULL;
748 bufs = 0; 748 bufs = 0;
749 } 749 }
750 } 750 }
751 751
752 /* Done it all: now write the commit record asynchronously. */ 752 /* Done it all: now write the commit record asynchronously. */
753 753
754 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 754 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
755 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 755 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
756 err = journal_submit_commit_record(journal, commit_transaction, 756 err = journal_submit_commit_record(journal, commit_transaction,
757 &cbh, crc32_sum); 757 &cbh, crc32_sum);
758 if (err) 758 if (err)
759 __jbd2_journal_abort_hard(journal); 759 __jbd2_journal_abort_hard(journal);
760 760
761 spin_lock(&journal->j_list_lock); 761 spin_lock(&journal->j_list_lock);
762 err = journal_wait_on_locked_list(journal, 762 err = journal_wait_on_locked_list(journal,
763 commit_transaction); 763 commit_transaction);
764 spin_unlock(&journal->j_list_lock); 764 spin_unlock(&journal->j_list_lock);
765 if (err) 765 if (err)
766 __jbd2_journal_abort_hard(journal); 766 __jbd2_journal_abort_hard(journal);
767 } 767 }
768 768
769 /* Lo and behold: we have just managed to send a transaction to 769 /* Lo and behold: we have just managed to send a transaction to
770 the log. Before we can commit it, wait for the IO so far to 770 the log. Before we can commit it, wait for the IO so far to
771 complete. Control buffers being written are on the 771 complete. Control buffers being written are on the
772 transaction's t_log_list queue, and metadata buffers are on 772 transaction's t_log_list queue, and metadata buffers are on
773 the t_iobuf_list queue. 773 the t_iobuf_list queue.
774 774
775 Wait for the buffers in reverse order. That way we are 775 Wait for the buffers in reverse order. That way we are
776 less likely to be woken up until all IOs have completed, and 776 less likely to be woken up until all IOs have completed, and
777 so we incur less scheduling load. 777 so we incur less scheduling load.
778 */ 778 */
779 779
780 jbd_debug(3, "JBD: commit phase 4\n"); 780 jbd_debug(3, "JBD: commit phase 4\n");
781 781
782 /* 782 /*
783 * akpm: these are BJ_IO, and j_list_lock is not needed. 783 * akpm: these are BJ_IO, and j_list_lock is not needed.
784 * See __journal_try_to_free_buffer. 784 * See __journal_try_to_free_buffer.
785 */ 785 */
786 wait_for_iobuf: 786 wait_for_iobuf:
787 while (commit_transaction->t_iobuf_list != NULL) { 787 while (commit_transaction->t_iobuf_list != NULL) {
788 struct buffer_head *bh; 788 struct buffer_head *bh;
789 789
790 jh = commit_transaction->t_iobuf_list->b_tprev; 790 jh = commit_transaction->t_iobuf_list->b_tprev;
791 bh = jh2bh(jh); 791 bh = jh2bh(jh);
792 if (buffer_locked(bh)) { 792 if (buffer_locked(bh)) {
793 wait_on_buffer(bh); 793 wait_on_buffer(bh);
794 goto wait_for_iobuf; 794 goto wait_for_iobuf;
795 } 795 }
796 if (cond_resched()) 796 if (cond_resched())
797 goto wait_for_iobuf; 797 goto wait_for_iobuf;
798 798
799 if (unlikely(!buffer_uptodate(bh))) 799 if (unlikely(!buffer_uptodate(bh)))
800 err = -EIO; 800 err = -EIO;
801 801
802 clear_buffer_jwrite(bh); 802 clear_buffer_jwrite(bh);
803 803
804 JBUFFER_TRACE(jh, "ph4: unfile after journal write"); 804 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
805 jbd2_journal_unfile_buffer(journal, jh); 805 jbd2_journal_unfile_buffer(journal, jh);
806 806
807 /* 807 /*
808 * ->t_iobuf_list should contain only dummy buffer_heads 808 * ->t_iobuf_list should contain only dummy buffer_heads
809 * which were created by jbd2_journal_write_metadata_buffer(). 809 * which were created by jbd2_journal_write_metadata_buffer().
810 */ 810 */
811 BUFFER_TRACE(bh, "dumping temporary bh"); 811 BUFFER_TRACE(bh, "dumping temporary bh");
812 jbd2_journal_put_journal_head(jh); 812 jbd2_journal_put_journal_head(jh);
813 __brelse(bh); 813 __brelse(bh);
814 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 814 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
815 free_buffer_head(bh); 815 free_buffer_head(bh);
816 816
817 /* We also have to unlock and free the corresponding 817 /* We also have to unlock and free the corresponding
818 shadowed buffer */ 818 shadowed buffer */
819 jh = commit_transaction->t_shadow_list->b_tprev; 819 jh = commit_transaction->t_shadow_list->b_tprev;
820 bh = jh2bh(jh); 820 bh = jh2bh(jh);
821 clear_bit(BH_JWrite, &bh->b_state); 821 clear_bit(BH_JWrite, &bh->b_state);
822 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 822 J_ASSERT_BH(bh, buffer_jbddirty(bh));
823 823
824 /* The metadata is now released for reuse, but we need 824 /* The metadata is now released for reuse, but we need
825 to remember it against this transaction so that when 825 to remember it against this transaction so that when
826 we finally commit, we can do any checkpointing 826 we finally commit, we can do any checkpointing
827 required. */ 827 required. */
828 JBUFFER_TRACE(jh, "file as BJ_Forget"); 828 JBUFFER_TRACE(jh, "file as BJ_Forget");
829 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 829 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
830 /* Wake up any transactions which were waiting for this 830 /* Wake up any transactions which were waiting for this
831 IO to complete */ 831 IO to complete */
832 wake_up_bit(&bh->b_state, BH_Unshadow); 832 wake_up_bit(&bh->b_state, BH_Unshadow);
833 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 833 JBUFFER_TRACE(jh, "brelse shadowed buffer");
834 __brelse(bh); 834 __brelse(bh);
835 } 835 }
836 836
837 J_ASSERT (commit_transaction->t_shadow_list == NULL); 837 J_ASSERT (commit_transaction->t_shadow_list == NULL);
838 838
839 jbd_debug(3, "JBD: commit phase 5\n"); 839 jbd_debug(3, "JBD: commit phase 5\n");
840 840
841 /* Here we wait for the revoke record and descriptor record buffers */ 841 /* Here we wait for the revoke record and descriptor record buffers */
842 wait_for_ctlbuf: 842 wait_for_ctlbuf:
843 while (commit_transaction->t_log_list != NULL) { 843 while (commit_transaction->t_log_list != NULL) {
844 struct buffer_head *bh; 844 struct buffer_head *bh;
845 845
846 jh = commit_transaction->t_log_list->b_tprev; 846 jh = commit_transaction->t_log_list->b_tprev;
847 bh = jh2bh(jh); 847 bh = jh2bh(jh);
848 if (buffer_locked(bh)) { 848 if (buffer_locked(bh)) {
849 wait_on_buffer(bh); 849 wait_on_buffer(bh);
850 goto wait_for_ctlbuf; 850 goto wait_for_ctlbuf;
851 } 851 }
852 if (cond_resched()) 852 if (cond_resched())
853 goto wait_for_ctlbuf; 853 goto wait_for_ctlbuf;
854 854
855 if (unlikely(!buffer_uptodate(bh))) 855 if (unlikely(!buffer_uptodate(bh)))
856 err = -EIO; 856 err = -EIO;
857 857
858 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 858 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
859 clear_buffer_jwrite(bh); 859 clear_buffer_jwrite(bh);
860 jbd2_journal_unfile_buffer(journal, jh); 860 jbd2_journal_unfile_buffer(journal, jh);
861 jbd2_journal_put_journal_head(jh); 861 jbd2_journal_put_journal_head(jh);
862 __brelse(bh); /* One for getblk */ 862 __brelse(bh); /* One for getblk */
863 /* AKPM: bforget here */ 863 /* AKPM: bforget here */
864 } 864 }
865 865
866 jbd_debug(3, "JBD: commit phase 6\n"); 866 jbd_debug(3, "JBD: commit phase 6\n");
867 867
868 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 868 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
869 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 869 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
870 err = journal_submit_commit_record(journal, commit_transaction, 870 err = journal_submit_commit_record(journal, commit_transaction,
871 &cbh, crc32_sum); 871 &cbh, crc32_sum);
872 if (err) 872 if (err)
873 __jbd2_journal_abort_hard(journal); 873 __jbd2_journal_abort_hard(journal);
874 } 874 }
875 err = journal_wait_on_commit_record(cbh); 875 err = journal_wait_on_commit_record(cbh);
876 876
877 if (err) 877 if (err)
878 jbd2_journal_abort(journal, err); 878 jbd2_journal_abort(journal, err);
879 879
880 /* End of a transaction! Finally, we can do checkpoint 880 /* End of a transaction! Finally, we can do checkpoint
881 processing: any buffers committed as a result of this 881 processing: any buffers committed as a result of this
882 transaction can be removed from any checkpoint list it was on 882 transaction can be removed from any checkpoint list it was on
883 before. */ 883 before. */
884 884
885 jbd_debug(3, "JBD: commit phase 7\n"); 885 jbd_debug(3, "JBD: commit phase 7\n");
886 886
887 J_ASSERT(commit_transaction->t_sync_datalist == NULL); 887 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
888 J_ASSERT(commit_transaction->t_buffers == NULL); 888 J_ASSERT(commit_transaction->t_buffers == NULL);
889 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 889 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
890 J_ASSERT(commit_transaction->t_iobuf_list == NULL); 890 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
891 J_ASSERT(commit_transaction->t_shadow_list == NULL); 891 J_ASSERT(commit_transaction->t_shadow_list == NULL);
892 J_ASSERT(commit_transaction->t_log_list == NULL); 892 J_ASSERT(commit_transaction->t_log_list == NULL);
893 893
894 restart_loop: 894 restart_loop:
895 /* 895 /*
896 * As there are other places (journal_unmap_buffer()) adding buffers 896 * As there are other places (journal_unmap_buffer()) adding buffers
897 * to this list we have to be careful and hold the j_list_lock. 897 * to this list we have to be careful and hold the j_list_lock.
898 */ 898 */
899 spin_lock(&journal->j_list_lock); 899 spin_lock(&journal->j_list_lock);
900 while (commit_transaction->t_forget) { 900 while (commit_transaction->t_forget) {
901 transaction_t *cp_transaction; 901 transaction_t *cp_transaction;
902 struct buffer_head *bh; 902 struct buffer_head *bh;
903 903
904 jh = commit_transaction->t_forget; 904 jh = commit_transaction->t_forget;
905 spin_unlock(&journal->j_list_lock); 905 spin_unlock(&journal->j_list_lock);
906 bh = jh2bh(jh); 906 bh = jh2bh(jh);
907 jbd_lock_bh_state(bh); 907 jbd_lock_bh_state(bh);
908 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || 908 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
909 jh->b_transaction == journal->j_running_transaction); 909 jh->b_transaction == journal->j_running_transaction);
910 910
911 /* 911 /*
912 * If there is undo-protected committed data against 912 * If there is undo-protected committed data against
913 * this buffer, then we can remove it now. If it is a 913 * this buffer, then we can remove it now. If it is a
914 * buffer needing such protection, the old frozen_data 914 * buffer needing such protection, the old frozen_data
915 * field now points to a committed version of the 915 * field now points to a committed version of the
916 * buffer, so rotate that field to the new committed 916 * buffer, so rotate that field to the new committed
917 * data. 917 * data.
918 * 918 *
919 * Otherwise, we can just throw away the frozen data now. 919 * Otherwise, we can just throw away the frozen data now.
920 */ 920 */
921 if (jh->b_committed_data) { 921 if (jh->b_committed_data) {
922 jbd2_free(jh->b_committed_data, bh->b_size); 922 jbd2_free(jh->b_committed_data, bh->b_size);
923 jh->b_committed_data = NULL; 923 jh->b_committed_data = NULL;
924 if (jh->b_frozen_data) { 924 if (jh->b_frozen_data) {
925 jh->b_committed_data = jh->b_frozen_data; 925 jh->b_committed_data = jh->b_frozen_data;
926 jh->b_frozen_data = NULL; 926 jh->b_frozen_data = NULL;
927 } 927 }
928 } else if (jh->b_frozen_data) { 928 } else if (jh->b_frozen_data) {
929 jbd2_free(jh->b_frozen_data, bh->b_size); 929 jbd2_free(jh->b_frozen_data, bh->b_size);
930 jh->b_frozen_data = NULL; 930 jh->b_frozen_data = NULL;
931 } 931 }
932 932
933 spin_lock(&journal->j_list_lock); 933 spin_lock(&journal->j_list_lock);
934 cp_transaction = jh->b_cp_transaction; 934 cp_transaction = jh->b_cp_transaction;
935 if (cp_transaction) { 935 if (cp_transaction) {
936 JBUFFER_TRACE(jh, "remove from old cp transaction"); 936 JBUFFER_TRACE(jh, "remove from old cp transaction");
937 cp_transaction->t_chp_stats.cs_dropped++; 937 cp_transaction->t_chp_stats.cs_dropped++;
938 __jbd2_journal_remove_checkpoint(jh); 938 __jbd2_journal_remove_checkpoint(jh);
939 } 939 }
940 940
941 /* Only re-checkpoint the buffer_head if it is marked 941 /* Only re-checkpoint the buffer_head if it is marked
942 * dirty. If the buffer was added to the BJ_Forget list 942 * dirty. If the buffer was added to the BJ_Forget list
943 * by jbd2_journal_forget, it may no longer be dirty and 943 * by jbd2_journal_forget, it may no longer be dirty and
944 * there's no point in keeping a checkpoint record for 944 * there's no point in keeping a checkpoint record for
945 * it. */ 945 * it. */
946 946
947 /* A buffer which has been freed while still being 947 /* A buffer which has been freed while still being
948 * journaled by a previous transaction may end up still 948 * journaled by a previous transaction may end up still
949 * being dirty here, but we want to avoid writing back 949 * being dirty here, but we want to avoid writing back
950 * that buffer in the future now that the last use has 950 * that buffer in the future now that the last use has
951 * been committed. That's not only a performance gain, 951 * been committed. That's not only a performance gain,
952 * it also stops aliasing problems if the buffer is left 952 * it also stops aliasing problems if the buffer is left
953 * behind for writeback and gets reallocated for another 953 * behind for writeback and gets reallocated for another
954 * use in a different page. */ 954 * use in a different page. */
955 if (buffer_freed(bh)) { 955 if (buffer_freed(bh)) {
956 clear_buffer_freed(bh); 956 clear_buffer_freed(bh);
957 clear_buffer_jbddirty(bh); 957 clear_buffer_jbddirty(bh);
958 } 958 }
959 959
960 if (buffer_jbddirty(bh)) { 960 if (buffer_jbddirty(bh)) {
961 JBUFFER_TRACE(jh, "add to new checkpointing trans"); 961 JBUFFER_TRACE(jh, "add to new checkpointing trans");
962 __jbd2_journal_insert_checkpoint(jh, commit_transaction); 962 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
963 JBUFFER_TRACE(jh, "refile for checkpoint writeback"); 963 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
964 __jbd2_journal_refile_buffer(jh); 964 __jbd2_journal_refile_buffer(jh);
965 jbd_unlock_bh_state(bh); 965 jbd_unlock_bh_state(bh);
966 } else { 966 } else {
967 J_ASSERT_BH(bh, !buffer_dirty(bh)); 967 J_ASSERT_BH(bh, !buffer_dirty(bh));
968 /* The buffer on BJ_Forget list and not jbddirty means 968 /* The buffer on BJ_Forget list and not jbddirty means
969 * it has been freed by this transaction and hence it 969 * it has been freed by this transaction and hence it
970 * could not have been reallocated until this 970 * could not have been reallocated until this
971 * transaction has committed. *BUT* it could be 971 * transaction has committed. *BUT* it could be
972 * reallocated once we have written all the data to 972 * reallocated once we have written all the data to
973 * disk and before we process the buffer on BJ_Forget 973 * disk and before we process the buffer on BJ_Forget
974 * list. */ 974 * list. */
975 JBUFFER_TRACE(jh, "refile or unfile freed buffer"); 975 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
976 __jbd2_journal_refile_buffer(jh); 976 __jbd2_journal_refile_buffer(jh);
977 if (!jh->b_transaction) { 977 if (!jh->b_transaction) {
978 jbd_unlock_bh_state(bh); 978 jbd_unlock_bh_state(bh);
979 /* needs a brelse */ 979 /* needs a brelse */
980 jbd2_journal_remove_journal_head(bh); 980 jbd2_journal_remove_journal_head(bh);
981 release_buffer_page(bh); 981 release_buffer_page(bh);
982 } else 982 } else
983 jbd_unlock_bh_state(bh); 983 jbd_unlock_bh_state(bh);
984 } 984 }
985 cond_resched_lock(&journal->j_list_lock); 985 cond_resched_lock(&journal->j_list_lock);
986 } 986 }
987 spin_unlock(&journal->j_list_lock); 987 spin_unlock(&journal->j_list_lock);
988 /* 988 /*
989 * This is a bit sleazy. We use j_list_lock to protect transition 989 * This is a bit sleazy. We use j_list_lock to protect transition
990 * of a transaction into T_FINISHED state and calling 990 * of a transaction into T_FINISHED state and calling
991 * __jbd2_journal_drop_transaction(). Otherwise we could race with 991 * __jbd2_journal_drop_transaction(). Otherwise we could race with
992 * other checkpointing code processing the transaction... 992 * other checkpointing code processing the transaction...
993 */ 993 */
994 spin_lock(&journal->j_state_lock); 994 spin_lock(&journal->j_state_lock);
995 spin_lock(&journal->j_list_lock); 995 spin_lock(&journal->j_list_lock);
996 /* 996 /*
997 * Now recheck if some buffers did not get attached to the transaction 997 * Now recheck if some buffers did not get attached to the transaction
998 * while the lock was dropped... 998 * while the lock was dropped...
999 */ 999 */
1000 if (commit_transaction->t_forget) { 1000 if (commit_transaction->t_forget) {
1001 spin_unlock(&journal->j_list_lock); 1001 spin_unlock(&journal->j_list_lock);
1002 spin_unlock(&journal->j_state_lock); 1002 spin_unlock(&journal->j_state_lock);
1003 goto restart_loop; 1003 goto restart_loop;
1004 } 1004 }
1005 1005
1006 /* Done with this transaction! */ 1006 /* Done with this transaction! */
1007 1007
1008 jbd_debug(3, "JBD: commit phase 8\n"); 1008 jbd_debug(3, "JBD: commit phase 8\n");
1009 1009
1010 J_ASSERT(commit_transaction->t_state == T_COMMIT); 1010 J_ASSERT(commit_transaction->t_state == T_COMMIT);
1011 1011
1012 commit_transaction->t_start = jiffies; 1012 commit_transaction->t_start = jiffies;
1013 stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, 1013 stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
1014 commit_transaction->t_start); 1014 commit_transaction->t_start);
1015 1015
1016 /* 1016 /*
1017 * File the transaction for history 1017 * File the transaction for history
1018 */ 1018 */
1019 stats.ts_type = JBD2_STATS_RUN; 1019 stats.ts_type = JBD2_STATS_RUN;
1020 stats.ts_tid = commit_transaction->t_tid; 1020 stats.ts_tid = commit_transaction->t_tid;
1021 stats.u.run.rs_handle_count = commit_transaction->t_handle_count; 1021 stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
1022 spin_lock(&journal->j_history_lock); 1022 spin_lock(&journal->j_history_lock);
1023 memcpy(journal->j_history + journal->j_history_cur, &stats, 1023 memcpy(journal->j_history + journal->j_history_cur, &stats,
1024 sizeof(stats)); 1024 sizeof(stats));
1025 if (++journal->j_history_cur == journal->j_history_max) 1025 if (++journal->j_history_cur == journal->j_history_max)
1026 journal->j_history_cur = 0; 1026 journal->j_history_cur = 0;
1027 1027
1028 /* 1028 /*
1029 * Calculate overall stats 1029 * Calculate overall stats
1030 */ 1030 */
1031 journal->j_stats.ts_tid++; 1031 journal->j_stats.ts_tid++;
1032 journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; 1032 journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1033 journal->j_stats.u.run.rs_running += stats.u.run.rs_running; 1033 journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1034 journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; 1034 journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1035 journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; 1035 journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1036 journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; 1036 journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1037 journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; 1037 journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1038 journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; 1038 journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1039 journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; 1039 journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1040 spin_unlock(&journal->j_history_lock); 1040 spin_unlock(&journal->j_history_lock);
1041 1041
1042 commit_transaction->t_state = T_FINISHED; 1042 commit_transaction->t_state = T_FINISHED;
1043 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1043 J_ASSERT(commit_transaction == journal->j_committing_transaction);
1044 journal->j_commit_sequence = commit_transaction->t_tid; 1044 journal->j_commit_sequence = commit_transaction->t_tid;
1045 journal->j_committing_transaction = NULL; 1045 journal->j_committing_transaction = NULL;
1046 spin_unlock(&journal->j_state_lock); 1046 spin_unlock(&journal->j_state_lock);
1047 1047
1048 if (commit_transaction->t_checkpoint_list == NULL && 1048 if (commit_transaction->t_checkpoint_list == NULL &&
1049 commit_transaction->t_checkpoint_io_list == NULL) { 1049 commit_transaction->t_checkpoint_io_list == NULL) {
1050 __jbd2_journal_drop_transaction(journal, commit_transaction); 1050 __jbd2_journal_drop_transaction(journal, commit_transaction);
1051 } else { 1051 } else {
1052 if (journal->j_checkpoint_transactions == NULL) { 1052 if (journal->j_checkpoint_transactions == NULL) {
1053 journal->j_checkpoint_transactions = commit_transaction; 1053 journal->j_checkpoint_transactions = commit_transaction;
1054 commit_transaction->t_cpnext = commit_transaction; 1054 commit_transaction->t_cpnext = commit_transaction;
1055 commit_transaction->t_cpprev = commit_transaction; 1055 commit_transaction->t_cpprev = commit_transaction;
1056 } else { 1056 } else {
1057 commit_transaction->t_cpnext = 1057 commit_transaction->t_cpnext =
1058 journal->j_checkpoint_transactions; 1058 journal->j_checkpoint_transactions;
1059 commit_transaction->t_cpprev = 1059 commit_transaction->t_cpprev =
1060 commit_transaction->t_cpnext->t_cpprev; 1060 commit_transaction->t_cpnext->t_cpprev;
1061 commit_transaction->t_cpnext->t_cpprev = 1061 commit_transaction->t_cpnext->t_cpprev =
1062 commit_transaction; 1062 commit_transaction;
1063 commit_transaction->t_cpprev->t_cpnext = 1063 commit_transaction->t_cpprev->t_cpnext =
1064 commit_transaction; 1064 commit_transaction;
1065 } 1065 }
1066 } 1066 }
1067 spin_unlock(&journal->j_list_lock); 1067 spin_unlock(&journal->j_list_lock);
1068 1068
1069 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1069 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1070 journal->j_commit_sequence, journal->j_tail_sequence); 1070 journal->j_commit_sequence, journal->j_tail_sequence);
1071 1071
1072 wake_up(&journal->j_wait_done_commit); 1072 wake_up(&journal->j_wait_done_commit);
1073 } 1073 }
1074 1074
include/linux/sched.h
1 #ifndef _LINUX_SCHED_H 1 #ifndef _LINUX_SCHED_H
2 #define _LINUX_SCHED_H 2 #define _LINUX_SCHED_H
3 3
4 /* 4 /*
5 * cloning flags: 5 * cloning flags:
6 */ 6 */
7 #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ 7 #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
8 #define CLONE_VM 0x00000100 /* set if VM shared between processes */ 8 #define CLONE_VM 0x00000100 /* set if VM shared between processes */
9 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ 9 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */
10 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ 10 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */
11 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ 11 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
12 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ 12 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
13 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ 13 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
14 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ 14 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
15 #define CLONE_THREAD 0x00010000 /* Same thread group? */ 15 #define CLONE_THREAD 0x00010000 /* Same thread group? */
16 #define CLONE_NEWNS 0x00020000 /* New namespace group? */ 16 #define CLONE_NEWNS 0x00020000 /* New namespace group? */
17 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ 17 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
18 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ 18 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
19 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ 19 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
20 #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ 20 #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
21 #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ 21 #define CLONE_DETACHED 0x00400000 /* Unused, ignored */
22 #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ 22 #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
23 #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ 23 #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
24 #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ 24 #define CLONE_STOPPED 0x02000000 /* Start in stopped state */
25 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ 25 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */
26 #define CLONE_NEWIPC 0x08000000 /* New ipcs */ 26 #define CLONE_NEWIPC 0x08000000 /* New ipcs */
27 #define CLONE_NEWUSER 0x10000000 /* New user namespace */ 27 #define CLONE_NEWUSER 0x10000000 /* New user namespace */
28 #define CLONE_NEWPID 0x20000000 /* New pid namespace */ 28 #define CLONE_NEWPID 0x20000000 /* New pid namespace */
29 #define CLONE_NEWNET 0x40000000 /* New network namespace */ 29 #define CLONE_NEWNET 0x40000000 /* New network namespace */
30 #define CLONE_IO 0x80000000 /* Clone io context */ 30 #define CLONE_IO 0x80000000 /* Clone io context */
31 31
32 /* 32 /*
33 * Scheduling policies 33 * Scheduling policies
34 */ 34 */
35 #define SCHED_NORMAL 0 35 #define SCHED_NORMAL 0
36 #define SCHED_FIFO 1 36 #define SCHED_FIFO 1
37 #define SCHED_RR 2 37 #define SCHED_RR 2
38 #define SCHED_BATCH 3 38 #define SCHED_BATCH 3
39 /* SCHED_ISO: reserved but not implemented yet */ 39 /* SCHED_ISO: reserved but not implemented yet */
40 #define SCHED_IDLE 5 40 #define SCHED_IDLE 5
41 41
42 #ifdef __KERNEL__ 42 #ifdef __KERNEL__
43 43
44 struct sched_param { 44 struct sched_param {
45 int sched_priority; 45 int sched_priority;
46 }; 46 };
47 47
48 #include <asm/param.h> /* for HZ */ 48 #include <asm/param.h> /* for HZ */
49 49
50 #include <linux/capability.h> 50 #include <linux/capability.h>
51 #include <linux/threads.h> 51 #include <linux/threads.h>
52 #include <linux/kernel.h> 52 #include <linux/kernel.h>
53 #include <linux/types.h> 53 #include <linux/types.h>
54 #include <linux/timex.h> 54 #include <linux/timex.h>
55 #include <linux/jiffies.h> 55 #include <linux/jiffies.h>
56 #include <linux/rbtree.h> 56 #include <linux/rbtree.h>
57 #include <linux/thread_info.h> 57 #include <linux/thread_info.h>
58 #include <linux/cpumask.h> 58 #include <linux/cpumask.h>
59 #include <linux/errno.h> 59 #include <linux/errno.h>
60 #include <linux/nodemask.h> 60 #include <linux/nodemask.h>
61 #include <linux/mm_types.h> 61 #include <linux/mm_types.h>
62 62
63 #include <asm/system.h> 63 #include <asm/system.h>
64 #include <asm/semaphore.h> 64 #include <asm/semaphore.h>
65 #include <asm/page.h> 65 #include <asm/page.h>
66 #include <asm/ptrace.h> 66 #include <asm/ptrace.h>
67 #include <asm/cputime.h> 67 #include <asm/cputime.h>
68 68
69 #include <linux/smp.h> 69 #include <linux/smp.h>
70 #include <linux/sem.h> 70 #include <linux/sem.h>
71 #include <linux/signal.h> 71 #include <linux/signal.h>
72 #include <linux/securebits.h> 72 #include <linux/securebits.h>
73 #include <linux/fs_struct.h> 73 #include <linux/fs_struct.h>
74 #include <linux/compiler.h> 74 #include <linux/compiler.h>
75 #include <linux/completion.h> 75 #include <linux/completion.h>
76 #include <linux/pid.h> 76 #include <linux/pid.h>
77 #include <linux/percpu.h> 77 #include <linux/percpu.h>
78 #include <linux/topology.h> 78 #include <linux/topology.h>
79 #include <linux/proportions.h> 79 #include <linux/proportions.h>
80 #include <linux/seccomp.h> 80 #include <linux/seccomp.h>
81 #include <linux/rcupdate.h> 81 #include <linux/rcupdate.h>
82 #include <linux/rtmutex.h> 82 #include <linux/rtmutex.h>
83 83
84 #include <linux/time.h> 84 #include <linux/time.h>
85 #include <linux/param.h> 85 #include <linux/param.h>
86 #include <linux/resource.h> 86 #include <linux/resource.h>
87 #include <linux/timer.h> 87 #include <linux/timer.h>
88 #include <linux/hrtimer.h> 88 #include <linux/hrtimer.h>
89 #include <linux/task_io_accounting.h> 89 #include <linux/task_io_accounting.h>
90 #include <linux/kobject.h> 90 #include <linux/kobject.h>
91 #include <linux/latencytop.h> 91 #include <linux/latencytop.h>
92 92
93 #include <asm/processor.h> 93 #include <asm/processor.h>
94 94
95 struct exec_domain; 95 struct exec_domain;
96 struct futex_pi_state; 96 struct futex_pi_state;
97 struct robust_list_head; 97 struct robust_list_head;
98 struct bio; 98 struct bio;
99 99
100 /* 100 /*
101 * List of flags we want to share for kernel threads, 101 * List of flags we want to share for kernel threads,
102 * if only because they are not used by them anyway. 102 * if only because they are not used by them anyway.
103 */ 103 */
104 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) 104 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
105 105
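[Editor's sketch, not part of the patch] As a quick illustration of how the CLONE_KERNEL bundle above is typically consumed, the snippet below starts a kernel thread sharing fs info, open files and signal handlers with its parent. kthread_fn() and spawn_example() are made-up names; kernel_thread() is the era's low-level helper, and real callers sit in kernel code rather than this header.

	/* Hypothetical worker function for the kernel thread. */
	static int kthread_fn(void *data)
	{
		/* ... do work ... */
		return 0;
	}

	static void spawn_example(void)
	{
		pid_t pid = kernel_thread(kthread_fn, NULL, CLONE_KERNEL | SIGCHLD);
		if (pid < 0)
			printk(KERN_ERR "kernel_thread failed: %d\n", pid);
	}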
106 /* 106 /*
107 * These are the constants used to fake the fixed-point load-average 107 * These are the constants used to fake the fixed-point load-average
108 * counting. Some notes: 108 * counting. Some notes:
109 * - 11 bit fractions expand to 22 bits by the multiplies: this gives 109 * - 11 bit fractions expand to 22 bits by the multiplies: this gives
110 * a load-average precision of 10 bits integer + 11 bits fractional 110 * a load-average precision of 10 bits integer + 11 bits fractional
111 * - if you want to count load-averages more often, you need more 111 * - if you want to count load-averages more often, you need more
112 * precision, or rounding will get you. With 2-second counting freq, 112 * precision, or rounding will get you. With 2-second counting freq,
113 * the EXP_n values would be 1981, 2034 and 2043 if still using only 113 * the EXP_n values would be 1981, 2034 and 2043 if still using only
114 * 11 bit fractions. 114 * 11 bit fractions.
115 */ 115 */
116 extern unsigned long avenrun[]; /* Load averages */ 116 extern unsigned long avenrun[]; /* Load averages */
117 117
118 #define FSHIFT 11 /* nr of bits of precision */ 118 #define FSHIFT 11 /* nr of bits of precision */
119 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ 119 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
120 #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ 120 #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
121 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ 121 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
122 #define EXP_5 2014 /* 1/exp(5sec/5min) */ 122 #define EXP_5 2014 /* 1/exp(5sec/5min) */
123 #define EXP_15 2037 /* 1/exp(5sec/15min) */ 123 #define EXP_15 2037 /* 1/exp(5sec/15min) */
124 124
125 #define CALC_LOAD(load,exp,n) \ 125 #define CALC_LOAD(load,exp,n) \
126 load *= exp; \ 126 load *= exp; \
127 load += n*(FIXED_1-exp); \ 127 load += n*(FIXED_1-exp); \
128 load >>= FSHIFT; 128 load >>= FSHIFT;
129 129
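[Editor's sketch, not part of the patch] To make the fixed-point arithmetic above concrete, here is one worked CALC_LOAD() step as a stand-alone user-space program. It simply copies the constants defined above; in the kernel the caller feeds in nr_active() scaled by FIXED_1 once every LOAD_FREQ ticks.

	#include <stdio.h>

	#define FSHIFT   11
	#define FIXED_1  (1 << FSHIFT)		/* 2048 == 1.0 in fixed point */
	#define EXP_1    1884			/* 1/exp(5sec/1min) */

	#define CALC_LOAD(load, exp, n) \
		load *= exp; \
		load += n * (FIXED_1 - exp); \
		load >>= FSHIFT;

	int main(void)
	{
		unsigned long avenrun0 = 0;		/* 1-minute average, fixed point */
		unsigned long active = 3 * FIXED_1;	/* pretend 3 runnable tasks */

		CALC_LOAD(avenrun0, EXP_1, active);	/* one 5-second tick */

		printf("load = %lu.%02lu\n",
		       avenrun0 >> FSHIFT,
		       (avenrun0 & (FIXED_1 - 1)) * 100 / FIXED_1);
		return 0;
	}

One tick moves the average roughly (1 - 1884/2048) ~= 8% of the way toward the instantaneous value, so this prints "load = 0.24" when starting from zero with three runnable tasks.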
130 extern unsigned long total_forks; 130 extern unsigned long total_forks;
131 extern int nr_threads; 131 extern int nr_threads;
132 DECLARE_PER_CPU(unsigned long, process_counts); 132 DECLARE_PER_CPU(unsigned long, process_counts);
133 extern int nr_processes(void); 133 extern int nr_processes(void);
134 extern unsigned long nr_running(void); 134 extern unsigned long nr_running(void);
135 extern unsigned long nr_uninterruptible(void); 135 extern unsigned long nr_uninterruptible(void);
136 extern unsigned long nr_active(void); 136 extern unsigned long nr_active(void);
137 extern unsigned long nr_iowait(void); 137 extern unsigned long nr_iowait(void);
138 extern unsigned long weighted_cpuload(const int cpu); 138 extern unsigned long weighted_cpuload(const int cpu);
139 139
140 struct seq_file; 140 struct seq_file;
141 struct cfs_rq; 141 struct cfs_rq;
142 struct task_group; 142 struct task_group;
143 #ifdef CONFIG_SCHED_DEBUG 143 #ifdef CONFIG_SCHED_DEBUG
144 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); 144 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
145 extern void proc_sched_set_task(struct task_struct *p); 145 extern void proc_sched_set_task(struct task_struct *p);
146 extern void 146 extern void
147 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); 147 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
148 #else 148 #else
149 static inline void 149 static inline void
150 proc_sched_show_task(struct task_struct *p, struct seq_file *m) 150 proc_sched_show_task(struct task_struct *p, struct seq_file *m)
151 { 151 {
152 } 152 }
153 static inline void proc_sched_set_task(struct task_struct *p) 153 static inline void proc_sched_set_task(struct task_struct *p)
154 { 154 {
155 } 155 }
156 static inline void 156 static inline void
157 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 157 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
158 { 158 {
159 } 159 }
160 #endif 160 #endif
161 161
162 /* 162 /*
163 * Task state bitmask. NOTE! These bits are also 163 * Task state bitmask. NOTE! These bits are also
164 * encoded in fs/proc/array.c: get_task_state(). 164 * encoded in fs/proc/array.c: get_task_state().
165 * 165 *
166 * We have two separate sets of flags: task->state 166 * We have two separate sets of flags: task->state
167 * is about runnability, while task->exit_state is 167 * is about runnability, while task->exit_state is
168 * about the task exiting. Confusing, but this way 168 * about the task exiting. Confusing, but this way
169 * modifying one set can't modify the other one by 169 * modifying one set can't modify the other one by
170 * mistake. 170 * mistake.
171 */ 171 */
172 #define TASK_RUNNING 0 172 #define TASK_RUNNING 0
173 #define TASK_INTERRUPTIBLE 1 173 #define TASK_INTERRUPTIBLE 1
174 #define TASK_UNINTERRUPTIBLE 2 174 #define TASK_UNINTERRUPTIBLE 2
175 #define TASK_STOPPED 4 175 #define TASK_STOPPED 4
176 #define TASK_TRACED 8 176 #define TASK_TRACED 8
177 /* in tsk->exit_state */ 177 /* in tsk->exit_state */
178 #define EXIT_ZOMBIE 16 178 #define EXIT_ZOMBIE 16
179 #define EXIT_DEAD 32 179 #define EXIT_DEAD 32
180 /* in tsk->state again */ 180 /* in tsk->state again */
181 #define TASK_DEAD 64 181 #define TASK_DEAD 64
182 182
183 #define __set_task_state(tsk, state_value) \ 183 #define __set_task_state(tsk, state_value) \
184 do { (tsk)->state = (state_value); } while (0) 184 do { (tsk)->state = (state_value); } while (0)
185 #define set_task_state(tsk, state_value) \ 185 #define set_task_state(tsk, state_value) \
186 set_mb((tsk)->state, (state_value)) 186 set_mb((tsk)->state, (state_value))
187 187
188 /* 188 /*
189 * set_current_state() includes a barrier so that the write of current->state 189 * set_current_state() includes a barrier so that the write of current->state
190 * is correctly serialised wrt the caller's subsequent test of whether to 190 * is correctly serialised wrt the caller's subsequent test of whether to
191 * actually sleep: 191 * actually sleep:
192 * 192 *
193 * set_current_state(TASK_UNINTERRUPTIBLE); 193 * set_current_state(TASK_UNINTERRUPTIBLE);
194 * if (do_i_need_to_sleep()) 194 * if (do_i_need_to_sleep())
195 * schedule(); 195 * schedule();
196 * 196 *
197 * If the caller does not need such serialisation then use __set_current_state() 197 * If the caller does not need such serialisation then use __set_current_state()
198 */ 198 */
199 #define __set_current_state(state_value) \ 199 #define __set_current_state(state_value) \
200 do { current->state = (state_value); } while (0) 200 do { current->state = (state_value); } while (0)
201 #define set_current_state(state_value) \ 201 #define set_current_state(state_value) \
202 set_mb(current->state, (state_value)) 202 set_mb(current->state, (state_value))
203 203
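[Editor's sketch, not part of the patch] A minimal sleep/wake pattern showing why the barrier in set_current_state() matters: the state write is ordered before the condition test, so the waker's wake_up_process() cannot be missed. sleeper(), waker(), event_ready and sleeping_task are hypothetical names.

	static int event_ready;			/* hypothetical condition */
	static struct task_struct *sleeping_task;

	static void sleeper(void)
	{
		sleeping_task = current;
		set_current_state(TASK_UNINTERRUPTIBLE);	/* store + barrier */
		while (!event_ready) {
			schedule();				/* really sleep */
			set_current_state(TASK_UNINTERRUPTIBLE);
		}
		__set_current_state(TASK_RUNNING);		/* no barrier needed */
	}

	static void waker(void)
	{
		event_ready = 1;
		wake_up_process(sleeping_task);	/* observes the new ->state */
	}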
204 /* Task command name length */ 204 /* Task command name length */
205 #define TASK_COMM_LEN 16 205 #define TASK_COMM_LEN 16
206 206
207 #include <linux/spinlock.h> 207 #include <linux/spinlock.h>
208 208
209 /* 209 /*
210 * This serializes "schedule()" and also protects 210 * This serializes "schedule()" and also protects
211 * the run-queue from deletions/modifications (but 211 * the run-queue from deletions/modifications (but
212 * _adding_ to the beginning of the run-queue has 212 * _adding_ to the beginning of the run-queue has
213 * a separate lock). 213 * a separate lock).
214 */ 214 */
215 extern rwlock_t tasklist_lock; 215 extern rwlock_t tasklist_lock;
216 extern spinlock_t mmlist_lock; 216 extern spinlock_t mmlist_lock;
217 217
218 struct task_struct; 218 struct task_struct;
219 219
220 extern void sched_init(void); 220 extern void sched_init(void);
221 extern void sched_init_smp(void); 221 extern void sched_init_smp(void);
222 extern void init_idle(struct task_struct *idle, int cpu); 222 extern void init_idle(struct task_struct *idle, int cpu);
223 extern void init_idle_bootup_task(struct task_struct *idle); 223 extern void init_idle_bootup_task(struct task_struct *idle);
224 224
225 extern cpumask_t nohz_cpu_mask; 225 extern cpumask_t nohz_cpu_mask;
226 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 226 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
227 extern int select_nohz_load_balancer(int cpu); 227 extern int select_nohz_load_balancer(int cpu);
228 #else 228 #else
229 static inline int select_nohz_load_balancer(int cpu) 229 static inline int select_nohz_load_balancer(int cpu)
230 { 230 {
231 return 0; 231 return 0;
232 } 232 }
233 #endif 233 #endif
234 234
235 extern unsigned long rt_needs_cpu(int cpu); 235 extern unsigned long rt_needs_cpu(int cpu);
236 236
237 /* 237 /*
238 * Only dump TASK_* tasks. (0 for all tasks) 238 * Only dump TASK_* tasks. (0 for all tasks)
239 */ 239 */
240 extern void show_state_filter(unsigned long state_filter); 240 extern void show_state_filter(unsigned long state_filter);
241 241
242 static inline void show_state(void) 242 static inline void show_state(void)
243 { 243 {
244 show_state_filter(0); 244 show_state_filter(0);
245 } 245 }
246 246
247 extern void show_regs(struct pt_regs *); 247 extern void show_regs(struct pt_regs *);
248 248
249 /* 249 /*
250 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current 250 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
251 * task), SP is the stack pointer of the first frame that should be shown in the back 251 * task), SP is the stack pointer of the first frame that should be shown in the back
252 * trace (or NULL if the entire call-chain of the task should be shown). 252 * trace (or NULL if the entire call-chain of the task should be shown).
253 */ 253 */
254 extern void show_stack(struct task_struct *task, unsigned long *sp); 254 extern void show_stack(struct task_struct *task, unsigned long *sp);
255 255
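[Editor's sketch, not part of the patch] The two common ways show_stack() is invoked, following the TASK/SP convention described above:

	show_stack(NULL, NULL);	/* full backtrace of the current task */
	show_stack(tsk, NULL);	/* full backtrace of some other task 'tsk' */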
256 void io_schedule(void); 256 void io_schedule(void);
257 long io_schedule_timeout(long timeout); 257 long io_schedule_timeout(long timeout);
258 258
259 extern void cpu_init (void); 259 extern void cpu_init (void);
260 extern void trap_init(void); 260 extern void trap_init(void);
261 extern void account_process_tick(struct task_struct *task, int user); 261 extern void account_process_tick(struct task_struct *task, int user);
262 extern void update_process_times(int user); 262 extern void update_process_times(int user);
263 extern void scheduler_tick(void); 263 extern void scheduler_tick(void);
264 extern void hrtick_resched(void); 264 extern void hrtick_resched(void);
265 265
266 extern void sched_show_task(struct task_struct *p); 266 extern void sched_show_task(struct task_struct *p);
267 267
268 #ifdef CONFIG_DETECT_SOFTLOCKUP 268 #ifdef CONFIG_DETECT_SOFTLOCKUP
269 extern void softlockup_tick(void); 269 extern void softlockup_tick(void);
270 extern void spawn_softlockup_task(void); 270 extern void spawn_softlockup_task(void);
271 extern void touch_softlockup_watchdog(void); 271 extern void touch_softlockup_watchdog(void);
272 extern void touch_all_softlockup_watchdogs(void); 272 extern void touch_all_softlockup_watchdogs(void);
273 extern unsigned long softlockup_thresh; 273 extern unsigned long softlockup_thresh;
274 extern unsigned long sysctl_hung_task_check_count; 274 extern unsigned long sysctl_hung_task_check_count;
275 extern unsigned long sysctl_hung_task_timeout_secs; 275 extern unsigned long sysctl_hung_task_timeout_secs;
276 extern unsigned long sysctl_hung_task_warnings; 276 extern unsigned long sysctl_hung_task_warnings;
277 #else 277 #else
278 static inline void softlockup_tick(void) 278 static inline void softlockup_tick(void)
279 { 279 {
280 } 280 }
281 static inline void spawn_softlockup_task(void) 281 static inline void spawn_softlockup_task(void)
282 { 282 {
283 } 283 }
284 static inline void touch_softlockup_watchdog(void) 284 static inline void touch_softlockup_watchdog(void)
285 { 285 {
286 } 286 }
287 static inline void touch_all_softlockup_watchdogs(void) 287 static inline void touch_all_softlockup_watchdogs(void)
288 { 288 {
289 } 289 }
290 #endif 290 #endif
291 291
292 292
293 /* Attach to any functions which should be ignored in wchan output. */ 293 /* Attach to any functions which should be ignored in wchan output. */
294 #define __sched __attribute__((__section__(".sched.text"))) 294 #define __sched __attribute__((__section__(".sched.text")))
295 295
296 /* Linker adds these: start and end of __sched functions */ 296 /* Linker adds these: start and end of __sched functions */
297 extern char __sched_text_start[], __sched_text_end[]; 297 extern char __sched_text_start[], __sched_text_end[];
298 298
299 /* Is this address in the __sched functions? */ 299 /* Is this address in the __sched functions? */
300 extern int in_sched_functions(unsigned long addr); 300 extern int in_sched_functions(unsigned long addr);
301 301
302 #define MAX_SCHEDULE_TIMEOUT LONG_MAX 302 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
303 extern signed long FASTCALL(schedule_timeout(signed long timeout)); 303 extern signed long FASTCALL(schedule_timeout(signed long timeout));
304 extern signed long schedule_timeout_interruptible(signed long timeout); 304 extern signed long schedule_timeout_interruptible(signed long timeout);
305 extern signed long schedule_timeout_uninterruptible(signed long timeout); 305 extern signed long schedule_timeout_uninterruptible(signed long timeout);
306 asmlinkage void schedule(void); 306 asmlinkage void schedule(void);
307 307
308 struct nsproxy; 308 struct nsproxy;
309 struct user_namespace; 309 struct user_namespace;
310 310
311 /* Maximum number of active map areas.. This is a random (large) number */ 311 /* Maximum number of active map areas.. This is a random (large) number */
312 #define DEFAULT_MAX_MAP_COUNT 65536 312 #define DEFAULT_MAX_MAP_COUNT 65536
313 313
314 extern int sysctl_max_map_count; 314 extern int sysctl_max_map_count;
315 315
316 #include <linux/aio.h> 316 #include <linux/aio.h>
317 317
318 extern unsigned long 318 extern unsigned long
319 arch_get_unmapped_area(struct file *, unsigned long, unsigned long, 319 arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
320 unsigned long, unsigned long); 320 unsigned long, unsigned long);
321 extern unsigned long 321 extern unsigned long
322 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, 322 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
323 unsigned long len, unsigned long pgoff, 323 unsigned long len, unsigned long pgoff,
324 unsigned long flags); 324 unsigned long flags);
325 extern void arch_unmap_area(struct mm_struct *, unsigned long); 325 extern void arch_unmap_area(struct mm_struct *, unsigned long);
326 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); 326 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
327 327
328 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 328 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
329 /* 329 /*
330 * The mm counters are not protected by its page_table_lock, 330 * The mm counters are not protected by its page_table_lock,
331 * so must be incremented atomically. 331 * so must be incremented atomically.
332 */ 332 */
333 #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) 333 #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
334 #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) 334 #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
335 #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) 335 #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
336 #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) 336 #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
337 #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) 337 #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
338 338
339 #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 339 #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
340 /* 340 /*
341 * The mm counters are protected by its page_table_lock, 341 * The mm counters are protected by its page_table_lock,
342 * so can be incremented directly. 342 * so can be incremented directly.
343 */ 343 */
344 #define set_mm_counter(mm, member, value) (mm)->_##member = (value) 344 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
345 #define get_mm_counter(mm, member) ((mm)->_##member) 345 #define get_mm_counter(mm, member) ((mm)->_##member)
346 #define add_mm_counter(mm, member, value) (mm)->_##member += (value) 346 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
347 #define inc_mm_counter(mm, member) (mm)->_##member++ 347 #define inc_mm_counter(mm, member) (mm)->_##member++
348 #define dec_mm_counter(mm, member) (mm)->_##member-- 348 #define dec_mm_counter(mm, member) (mm)->_##member--
349 349
350 #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 350 #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
351 351
352 #define get_mm_rss(mm) \ 352 #define get_mm_rss(mm) \
353 (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) 353 (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
354 #define update_hiwater_rss(mm) do { \ 354 #define update_hiwater_rss(mm) do { \
355 unsigned long _rss = get_mm_rss(mm); \ 355 unsigned long _rss = get_mm_rss(mm); \
356 if ((mm)->hiwater_rss < _rss) \ 356 if ((mm)->hiwater_rss < _rss) \
357 (mm)->hiwater_rss = _rss; \ 357 (mm)->hiwater_rss = _rss; \
358 } while (0) 358 } while (0)
359 #define update_hiwater_vm(mm) do { \ 359 #define update_hiwater_vm(mm) do { \
360 if ((mm)->hiwater_vm < (mm)->total_vm) \ 360 if ((mm)->hiwater_vm < (mm)->total_vm) \
361 (mm)->hiwater_vm = (mm)->total_vm; \ 361 (mm)->hiwater_vm = (mm)->total_vm; \
362 } while (0) 362 } while (0)
363 363
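[Editor's sketch, not part of the patch] A brief sketch of how these accessors are meant to be used. bump_anon_rss() is a made-up helper, but the macro calls and the file_rss/anon_rss member names come from the block above, and the same call compiles to either the atomic or the plain form depending on CONFIG_SPLIT_PTLOCK_CPUS.

	/* Hypothetical helper: account one newly mapped anonymous page. */
	static void bump_anon_rss(struct mm_struct *mm)
	{
		inc_mm_counter(mm, anon_rss);	/* atomic or plain, per config above */
	}

	/* Reading back goes through the same macros, e.g.:
	 *   unsigned long rss = get_mm_rss(mm);   -- file_rss + anon_rss
	 *   update_hiwater_rss(mm);               -- record a new RSS peak
	 */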
364 extern void set_dumpable(struct mm_struct *mm, int value); 364 extern void set_dumpable(struct mm_struct *mm, int value);
365 extern int get_dumpable(struct mm_struct *mm); 365 extern int get_dumpable(struct mm_struct *mm);
366 366
367 /* mm flags */ 367 /* mm flags */
368 /* dumpable bits */ 368 /* dumpable bits */
369 #define MMF_DUMPABLE 0 /* core dump is permitted */ 369 #define MMF_DUMPABLE 0 /* core dump is permitted */
370 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ 370 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */
371 #define MMF_DUMPABLE_BITS 2 371 #define MMF_DUMPABLE_BITS 2
372 372
373 /* coredump filter bits */ 373 /* coredump filter bits */
374 #define MMF_DUMP_ANON_PRIVATE 2 374 #define MMF_DUMP_ANON_PRIVATE 2
375 #define MMF_DUMP_ANON_SHARED 3 375 #define MMF_DUMP_ANON_SHARED 3
376 #define MMF_DUMP_MAPPED_PRIVATE 4 376 #define MMF_DUMP_MAPPED_PRIVATE 4
377 #define MMF_DUMP_MAPPED_SHARED 5 377 #define MMF_DUMP_MAPPED_SHARED 5
378 #define MMF_DUMP_ELF_HEADERS 6 378 #define MMF_DUMP_ELF_HEADERS 6
379 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS 379 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
380 #define MMF_DUMP_FILTER_BITS 5 380 #define MMF_DUMP_FILTER_BITS 5
381 #define MMF_DUMP_FILTER_MASK \ 381 #define MMF_DUMP_FILTER_MASK \
382 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) 382 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
383 #define MMF_DUMP_FILTER_DEFAULT \ 383 #define MMF_DUMP_FILTER_DEFAULT \
384 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED)) 384 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
385 385
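[Editor's note, not part of the patch] Expanding the two derived masks above to make the bit layout explicit (shift = MMF_DUMPABLE_BITS = 2, five filter bits):

	MMF_DUMP_FILTER_MASK    = ((1 << 5) - 1) << 2                  = 0x7c  (bits 2..6)
	MMF_DUMP_FILTER_DEFAULT = (1 << MMF_DUMP_ANON_PRIVATE)
	                        | (1 << MMF_DUMP_ANON_SHARED)          = 0x0c  (bits 2 and 3)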
386 struct sighand_struct { 386 struct sighand_struct {
387 atomic_t count; 387 atomic_t count;
388 struct k_sigaction action[_NSIG]; 388 struct k_sigaction action[_NSIG];
389 spinlock_t siglock; 389 spinlock_t siglock;
390 wait_queue_head_t signalfd_wqh; 390 wait_queue_head_t signalfd_wqh;
391 }; 391 };
392 392
393 struct pacct_struct { 393 struct pacct_struct {
394 int ac_flag; 394 int ac_flag;
395 long ac_exitcode; 395 long ac_exitcode;
396 unsigned long ac_mem; 396 unsigned long ac_mem;
397 cputime_t ac_utime, ac_stime; 397 cputime_t ac_utime, ac_stime;
398 unsigned long ac_minflt, ac_majflt; 398 unsigned long ac_minflt, ac_majflt;
399 }; 399 };
400 400
401 /* 401 /*
402 * NOTE! "signal_struct" does not have its own 402 * NOTE! "signal_struct" does not have its own
403 * locking, because a shared signal_struct always 403 * locking, because a shared signal_struct always
404 * implies a shared sighand_struct, so locking 404 * implies a shared sighand_struct, so locking
405 * sighand_struct is always a proper superset of 405 * sighand_struct is always a proper superset of
406 * the locking of signal_struct. 406 * the locking of signal_struct.
407 */ 407 */
408 struct signal_struct { 408 struct signal_struct {
409 atomic_t count; 409 atomic_t count;
410 atomic_t live; 410 atomic_t live;
411 411
412 wait_queue_head_t wait_chldexit; /* for wait4() */ 412 wait_queue_head_t wait_chldexit; /* for wait4() */
413 413
414 /* current thread group signal load-balancing target: */ 414 /* current thread group signal load-balancing target: */
415 struct task_struct *curr_target; 415 struct task_struct *curr_target;
416 416
417 /* shared signal handling: */ 417 /* shared signal handling: */
418 struct sigpending shared_pending; 418 struct sigpending shared_pending;
419 419
420 /* thread group exit support */ 420 /* thread group exit support */
421 int group_exit_code; 421 int group_exit_code;
422 /* overloaded: 422 /* overloaded:
423 * - notify group_exit_task when ->count is equal to notify_count 423 * - notify group_exit_task when ->count is equal to notify_count
424 * - everyone except group_exit_task is stopped during signal delivery 424 * - everyone except group_exit_task is stopped during signal delivery
425 * of fatal signals, group_exit_task processes the signal. 425 * of fatal signals, group_exit_task processes the signal.
426 */ 426 */
427 struct task_struct *group_exit_task; 427 struct task_struct *group_exit_task;
428 int notify_count; 428 int notify_count;
429 429
430 /* thread group stop support, overloads group_exit_code too */ 430 /* thread group stop support, overloads group_exit_code too */
431 int group_stop_count; 431 int group_stop_count;
432 unsigned int flags; /* see SIGNAL_* flags below */ 432 unsigned int flags; /* see SIGNAL_* flags below */
433 433
434 /* POSIX.1b Interval Timers */ 434 /* POSIX.1b Interval Timers */
435 struct list_head posix_timers; 435 struct list_head posix_timers;
436 436
437 /* ITIMER_REAL timer for the process */ 437 /* ITIMER_REAL timer for the process */
438 struct hrtimer real_timer; 438 struct hrtimer real_timer;
439 struct task_struct *tsk; 439 struct task_struct *tsk;
440 ktime_t it_real_incr; 440 ktime_t it_real_incr;
441 441
442 /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ 442 /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
443 cputime_t it_prof_expires, it_virt_expires; 443 cputime_t it_prof_expires, it_virt_expires;
444 cputime_t it_prof_incr, it_virt_incr; 444 cputime_t it_prof_incr, it_virt_incr;
445 445
446 /* job control IDs */ 446 /* job control IDs */
447 447
448 /* 448 /*
449 * pgrp and session fields are deprecated. 449 * pgrp and session fields are deprecated.
450 * use the task_session_Xnr and task_pgrp_Xnr routines below 450 * use the task_session_Xnr and task_pgrp_Xnr routines below
451 */ 451 */
452 452
453 union { 453 union {
454 pid_t pgrp __deprecated; 454 pid_t pgrp __deprecated;
455 pid_t __pgrp; 455 pid_t __pgrp;
456 }; 456 };
457 457
458 struct pid *tty_old_pgrp; 458 struct pid *tty_old_pgrp;
459 459
460 union { 460 union {
461 pid_t session __deprecated; 461 pid_t session __deprecated;
462 pid_t __session; 462 pid_t __session;
463 }; 463 };
464 464
465 /* boolean value for session group leader */ 465 /* boolean value for session group leader */
466 int leader; 466 int leader;
467 467
468 struct tty_struct *tty; /* NULL if no tty */ 468 struct tty_struct *tty; /* NULL if no tty */
469 469
470 /* 470 /*
471 * Cumulative resource counters for dead threads in the group, 471 * Cumulative resource counters for dead threads in the group,
472 * and for reaped dead child processes forked by this group. 472 * and for reaped dead child processes forked by this group.
473 * Live threads maintain their own counters and add to these 473 * Live threads maintain their own counters and add to these
474 * in __exit_signal, except for the group leader. 474 * in __exit_signal, except for the group leader.
475 */ 475 */
476 cputime_t utime, stime, cutime, cstime; 476 cputime_t utime, stime, cutime, cstime;
477 cputime_t gtime; 477 cputime_t gtime;
478 cputime_t cgtime; 478 cputime_t cgtime;
479 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; 479 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
480 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; 480 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
481 unsigned long inblock, oublock, cinblock, coublock; 481 unsigned long inblock, oublock, cinblock, coublock;
482 482
483 /* 483 /*
484 * Cumulative ns of scheduled CPU time for dead threads in the 484 * Cumulative ns of scheduled CPU time for dead threads in the
485 * group, not including a zombie group leader. (This only differs 485 * group, not including a zombie group leader. (This only differs
486 * from jiffies_to_ns(utime + stime) if sched_clock uses something 486 * from jiffies_to_ns(utime + stime) if sched_clock uses something
487 * other than jiffies.) 487 * other than jiffies.)
488 */ 488 */
489 unsigned long long sum_sched_runtime; 489 unsigned long long sum_sched_runtime;
490 490
491 /* 491 /*
492 * We don't bother to synchronize most readers of this at all, 492 * We don't bother to synchronize most readers of this at all,
493 * because there is no reader checking a limit that actually needs 493 * because there is no reader checking a limit that actually needs
494 * to get both rlim_cur and rlim_max atomically, and either one 494 * to get both rlim_cur and rlim_max atomically, and either one
495 * alone is a single word that can safely be read normally. 495 * alone is a single word that can safely be read normally.
496 * getrlimit/setrlimit use task_lock(current->group_leader) to 496 * getrlimit/setrlimit use task_lock(current->group_leader) to
497 * protect this instead of the siglock, because they really 497 * protect this instead of the siglock, because they really
498 * have no need to disable irqs. 498 * have no need to disable irqs.
499 */ 499 */
500 struct rlimit rlim[RLIM_NLIMITS]; 500 struct rlimit rlim[RLIM_NLIMITS];
501 501
502 struct list_head cpu_timers[3]; 502 struct list_head cpu_timers[3];
503 503
504 /* keep the process-shared keyrings here so that they do the right 504 /* keep the process-shared keyrings here so that they do the right
505 * thing in threads created with CLONE_THREAD */ 505 * thing in threads created with CLONE_THREAD */
506 #ifdef CONFIG_KEYS 506 #ifdef CONFIG_KEYS
507 struct key *session_keyring; /* keyring inherited over fork */ 507 struct key *session_keyring; /* keyring inherited over fork */
508 struct key *process_keyring; /* keyring private to this process */ 508 struct key *process_keyring; /* keyring private to this process */
509 #endif 509 #endif
510 #ifdef CONFIG_BSD_PROCESS_ACCT 510 #ifdef CONFIG_BSD_PROCESS_ACCT
511 struct pacct_struct pacct; /* per-process accounting information */ 511 struct pacct_struct pacct; /* per-process accounting information */
512 #endif 512 #endif
513 #ifdef CONFIG_TASKSTATS 513 #ifdef CONFIG_TASKSTATS
514 struct taskstats *stats; 514 struct taskstats *stats;
515 #endif 515 #endif
516 #ifdef CONFIG_AUDIT 516 #ifdef CONFIG_AUDIT
517 unsigned audit_tty; 517 unsigned audit_tty;
518 struct tty_audit_buf *tty_audit_buf; 518 struct tty_audit_buf *tty_audit_buf;
519 #endif 519 #endif
520 }; 520 };
521 521
522 /* Context switch must be unlocked if interrupts are to be enabled */ 522 /* Context switch must be unlocked if interrupts are to be enabled */
523 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 523 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
524 # define __ARCH_WANT_UNLOCKED_CTXSW 524 # define __ARCH_WANT_UNLOCKED_CTXSW
525 #endif 525 #endif
526 526
527 /* 527 /*
528 * Bits in flags field of signal_struct. 528 * Bits in flags field of signal_struct.
529 */ 529 */
530 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ 530 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
531 #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ 531 #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */
532 #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ 532 #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */
533 #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ 533 #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */
534 534
535 /* 535 /*
536 * Some day this will be a full-fledged user tracking system.. 536 * Some day this will be a full-fledged user tracking system..
537 */ 537 */
538 struct user_struct { 538 struct user_struct {
539 atomic_t __count; /* reference count */ 539 atomic_t __count; /* reference count */
540 atomic_t processes; /* How many processes does this user have? */ 540 atomic_t processes; /* How many processes does this user have? */
541 atomic_t files; /* How many open files does this user have? */ 541 atomic_t files; /* How many open files does this user have? */
542 atomic_t sigpending; /* How many pending signals does this user have? */ 542 atomic_t sigpending; /* How many pending signals does this user have? */
543 #ifdef CONFIG_INOTIFY_USER 543 #ifdef CONFIG_INOTIFY_USER
544 atomic_t inotify_watches; /* How many inotify watches does this user have? */ 544 atomic_t inotify_watches; /* How many inotify watches does this user have? */
545 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ 545 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
546 #endif 546 #endif
547 #ifdef CONFIG_POSIX_MQUEUE 547 #ifdef CONFIG_POSIX_MQUEUE
548 /* protected by mq_lock */ 548 /* protected by mq_lock */
549 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ 549 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
550 #endif 550 #endif
551 unsigned long locked_shm; /* How many pages of mlocked shm ? */ 551 unsigned long locked_shm; /* How many pages of mlocked shm ? */
552 552
553 #ifdef CONFIG_KEYS 553 #ifdef CONFIG_KEYS
554 struct key *uid_keyring; /* UID specific keyring */ 554 struct key *uid_keyring; /* UID specific keyring */
555 struct key *session_keyring; /* UID's default session keyring */ 555 struct key *session_keyring; /* UID's default session keyring */
556 #endif 556 #endif
557 557
558 /* Hash table maintenance information */ 558 /* Hash table maintenance information */
559 struct hlist_node uidhash_node; 559 struct hlist_node uidhash_node;
560 uid_t uid; 560 uid_t uid;
561 561
562 #ifdef CONFIG_FAIR_USER_SCHED 562 #ifdef CONFIG_FAIR_USER_SCHED
563 struct task_group *tg; 563 struct task_group *tg;
564 #ifdef CONFIG_SYSFS 564 #ifdef CONFIG_SYSFS
565 struct kobject kobj; 565 struct kobject kobj;
566 struct work_struct work; 566 struct work_struct work;
567 #endif 567 #endif
568 #endif 568 #endif
569 }; 569 };
570 570
571 extern int uids_sysfs_init(void); 571 extern int uids_sysfs_init(void);
572 572
573 extern struct user_struct *find_user(uid_t); 573 extern struct user_struct *find_user(uid_t);
574 574
575 extern struct user_struct root_user; 575 extern struct user_struct root_user;
576 #define INIT_USER (&root_user) 576 #define INIT_USER (&root_user)
577 577
578 struct backing_dev_info; 578 struct backing_dev_info;
579 struct reclaim_state; 579 struct reclaim_state;
580 580
581 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 581 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
582 struct sched_info { 582 struct sched_info {
583 /* cumulative counters */ 583 /* cumulative counters */
584 unsigned long pcount; /* # of times run on this cpu */ 584 unsigned long pcount; /* # of times run on this cpu */
585 unsigned long long cpu_time, /* time spent on the cpu */ 585 unsigned long long cpu_time, /* time spent on the cpu */
586 run_delay; /* time spent waiting on a runqueue */ 586 run_delay; /* time spent waiting on a runqueue */
587 587
588 /* timestamps */ 588 /* timestamps */
589 unsigned long long last_arrival,/* when we last ran on a cpu */ 589 unsigned long long last_arrival,/* when we last ran on a cpu */
590 last_queued; /* when we were last queued to run */ 590 last_queued; /* when we were last queued to run */
591 #ifdef CONFIG_SCHEDSTATS 591 #ifdef CONFIG_SCHEDSTATS
592 /* BKL stats */ 592 /* BKL stats */
593 unsigned int bkl_count; 593 unsigned int bkl_count;
594 #endif 594 #endif
595 }; 595 };
596 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 596 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
597 597
598 #ifdef CONFIG_SCHEDSTATS 598 #ifdef CONFIG_SCHEDSTATS
599 extern const struct file_operations proc_schedstat_operations; 599 extern const struct file_operations proc_schedstat_operations;
600 #endif /* CONFIG_SCHEDSTATS */ 600 #endif /* CONFIG_SCHEDSTATS */
601 601
602 #ifdef CONFIG_TASK_DELAY_ACCT 602 #ifdef CONFIG_TASK_DELAY_ACCT
603 struct task_delay_info { 603 struct task_delay_info {
604 spinlock_t lock; 604 spinlock_t lock;
605 unsigned int flags; /* Private per-task flags */ 605 unsigned int flags; /* Private per-task flags */
606 606
607 /* For each stat XXX, add following, aligned appropriately 607 /* For each stat XXX, add following, aligned appropriately
608 * 608 *
609 * struct timespec XXX_start, XXX_end; 609 * struct timespec XXX_start, XXX_end;
610 * u64 XXX_delay; 610 * u64 XXX_delay;
611 * u32 XXX_count; 611 * u32 XXX_count;
612 * 612 *
613 * Atomicity of updates to XXX_delay, XXX_count protected by 613 * Atomicity of updates to XXX_delay, XXX_count protected by
614 * single lock above (split into XXX_lock if contention is an issue). 614 * single lock above (split into XXX_lock if contention is an issue).
615 */ 615 */
616 616
617 /* 617 /*
618 * XXX_count is incremented on every XXX operation, the delay 618 * XXX_count is incremented on every XXX operation, the delay
619 * associated with the operation is added to XXX_delay. 619 * associated with the operation is added to XXX_delay.
620 * XXX_delay contains the accumulated delay time in nanoseconds. 620 * XXX_delay contains the accumulated delay time in nanoseconds.
621 */ 621 */
622 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ 622 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
623 u64 blkio_delay; /* wait for sync block io completion */ 623 u64 blkio_delay; /* wait for sync block io completion */
624 u64 swapin_delay; /* wait for swapin block io completion */ 624 u64 swapin_delay; /* wait for swapin block io completion */
625 u32 blkio_count; /* total count of the number of sync block */ 625 u32 blkio_count; /* total count of the number of sync block */
626 /* io operations performed */ 626 /* io operations performed */
627 u32 swapin_count; /* total count of the number of swapin block */ 627 u32 swapin_count; /* total count of the number of swapin block */
628 /* io operations performed */ 628 /* io operations performed */
629 }; 629 };
630 #endif /* CONFIG_TASK_DELAY_ACCT */ 630 #endif /* CONFIG_TASK_DELAY_ACCT */
631 631
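[Editor's sketch, not part of the patch] The XXX_start / XXX_delay / XXX_count convention described in the struct comment, using the blkio fields as the example. account_blkio_delay() is a made-up name and the real accounting lives in kernel/delayacct.c, but the locking and field updates follow the rule stated above.

	/* Hypothetical: fold one completed sync-block-I/O wait into the totals. */
	static void account_blkio_delay(struct task_delay_info *d)
	{
		unsigned long flags;
		s64 ns = timespec_to_ns(&d->blkio_end) -
			 timespec_to_ns(&d->blkio_start);

		spin_lock_irqsave(&d->lock, flags);
		d->blkio_delay += ns;	/* accumulated delay in nanoseconds */
		d->blkio_count++;	/* one more sync block io operation */
		spin_unlock_irqrestore(&d->lock, flags);
	}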
632 static inline int sched_info_on(void) 632 static inline int sched_info_on(void)
633 { 633 {
634 #ifdef CONFIG_SCHEDSTATS 634 #ifdef CONFIG_SCHEDSTATS
635 return 1; 635 return 1;
636 #elif defined(CONFIG_TASK_DELAY_ACCT) 636 #elif defined(CONFIG_TASK_DELAY_ACCT)
637 extern int delayacct_on; 637 extern int delayacct_on;
638 return delayacct_on; 638 return delayacct_on;
639 #else 639 #else
640 return 0; 640 return 0;
641 #endif 641 #endif
642 } 642 }
643 643
644 enum cpu_idle_type { 644 enum cpu_idle_type {
645 CPU_IDLE, 645 CPU_IDLE,
646 CPU_NOT_IDLE, 646 CPU_NOT_IDLE,
647 CPU_NEWLY_IDLE, 647 CPU_NEWLY_IDLE,
648 CPU_MAX_IDLE_TYPES 648 CPU_MAX_IDLE_TYPES
649 }; 649 };
650 650
651 /* 651 /*
652 * sched-domains (multiprocessor balancing) declarations: 652 * sched-domains (multiprocessor balancing) declarations:
653 */ 653 */
654 654
655 /* 655 /*
656 * Increase resolution of nice-level calculations: 656 * Increase resolution of nice-level calculations:
657 */ 657 */
658 #define SCHED_LOAD_SHIFT 10 658 #define SCHED_LOAD_SHIFT 10
659 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) 659 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
660 660
661 #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE 661 #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
662 662
663 #ifdef CONFIG_SMP 663 #ifdef CONFIG_SMP
664 #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ 664 #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
665 #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ 665 #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */
666 #define SD_BALANCE_EXEC 4 /* Balance on exec */ 666 #define SD_BALANCE_EXEC 4 /* Balance on exec */
667 #define SD_BALANCE_FORK 8 /* Balance on fork, clone */ 667 #define SD_BALANCE_FORK 8 /* Balance on fork, clone */
668 #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ 668 #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */
669 #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ 669 #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */
670 #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ 670 #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */
671 #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ 671 #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */
672 #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ 672 #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
673 #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ 673 #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
674 #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ 674 #define SD_SERIALIZE 1024 /* Only a single load balancing instance */
675 675
676 #define BALANCE_FOR_MC_POWER \ 676 #define BALANCE_FOR_MC_POWER \
677 (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) 677 (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
678 678
679 #define BALANCE_FOR_PKG_POWER \ 679 #define BALANCE_FOR_PKG_POWER \
680 ((sched_mc_power_savings || sched_smt_power_savings) ? \ 680 ((sched_mc_power_savings || sched_smt_power_savings) ? \
681 SD_POWERSAVINGS_BALANCE : 0) 681 SD_POWERSAVINGS_BALANCE : 0)
682 682
683 #define test_sd_parent(sd, flag) ((sd->parent && \ 683 #define test_sd_parent(sd, flag) ((sd->parent && \
684 (sd->parent->flags & flag)) ? 1 : 0) 684 (sd->parent->flags & flag)) ? 1 : 0)
685 685
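As a hedged illustration of how the SD_* flag bits and test_sd_parent() above combine (example_serialized() is hypothetical; the real consumers live in the scheduler core):

#include <linux/sched.h>

/* Hypothetical sketch: is load balancing serialized at this level or above it? */
static int example_serialized(struct sched_domain *sd)
{
	return (sd->flags & SD_SERIALIZE) || test_sd_parent(sd, SD_SERIALIZE);
}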
686 686
687 struct sched_group { 687 struct sched_group {
688 struct sched_group *next; /* Must be a circular list */ 688 struct sched_group *next; /* Must be a circular list */
689 cpumask_t cpumask; 689 cpumask_t cpumask;
690 690
691 /* 691 /*
692 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 692 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
693 * single CPU. This is read only (except for setup, hotplug CPU). 693 * single CPU. This is read only (except for setup, hotplug CPU).
694 * Note: Never change cpu_power without recomputing its reciprocal 694 * Note: Never change cpu_power without recomputing its reciprocal
695 */ 695 */
696 unsigned int __cpu_power; 696 unsigned int __cpu_power;
697 /* 697 /*
698 * reciprocal value of cpu_power to avoid expensive divides 698 * reciprocal value of cpu_power to avoid expensive divides
699 * (see include/linux/reciprocal_div.h) 699 * (see include/linux/reciprocal_div.h)
700 */ 700 */
701 u32 reciprocal_cpu_power; 701 u32 reciprocal_cpu_power;
702 }; 702 };
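The comment above insists that __cpu_power and reciprocal_cpu_power stay in sync. A minimal sketch of that rule, assuming the reciprocal_value()/reciprocal_divide() helpers from include/linux/reciprocal_div.h (the example_* functions are hypothetical; kernel/sched.c keeps its own equivalents):

#include <linux/sched.h>
#include <linux/reciprocal_div.h>

/* Hypothetical: update the power and its cached reciprocal together. */
static void example_set_cpu_power(struct sched_group *sg, unsigned int power)
{
	sg->__cpu_power = power;
	sg->reciprocal_cpu_power = reciprocal_value(power);
}

/* Hypothetical: "divide" a load by the group power using a multiply. */
static unsigned long example_div_by_cpu_power(struct sched_group *sg, u32 load)
{
	return reciprocal_divide(load, sg->reciprocal_cpu_power);
}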
703 703
704 struct sched_domain { 704 struct sched_domain {
705 /* These fields must be setup */ 705 /* These fields must be setup */
706 struct sched_domain *parent; /* top domain must be null terminated */ 706 struct sched_domain *parent; /* top domain must be null terminated */
707 struct sched_domain *child; /* bottom domain must be null terminated */ 707 struct sched_domain *child; /* bottom domain must be null terminated */
708 struct sched_group *groups; /* the balancing groups of the domain */ 708 struct sched_group *groups; /* the balancing groups of the domain */
709 cpumask_t span; /* span of all CPUs in this domain */ 709 cpumask_t span; /* span of all CPUs in this domain */
710 unsigned long min_interval; /* Minimum balance interval ms */ 710 unsigned long min_interval; /* Minimum balance interval ms */
711 unsigned long max_interval; /* Maximum balance interval ms */ 711 unsigned long max_interval; /* Maximum balance interval ms */
712 unsigned int busy_factor; /* less balancing by factor if busy */ 712 unsigned int busy_factor; /* less balancing by factor if busy */
713 unsigned int imbalance_pct; /* No balance until over watermark */ 713 unsigned int imbalance_pct; /* No balance until over watermark */
714 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ 714 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
715 unsigned int busy_idx; 715 unsigned int busy_idx;
716 unsigned int idle_idx; 716 unsigned int idle_idx;
717 unsigned int newidle_idx; 717 unsigned int newidle_idx;
718 unsigned int wake_idx; 718 unsigned int wake_idx;
719 unsigned int forkexec_idx; 719 unsigned int forkexec_idx;
720 int flags; /* See SD_* */ 720 int flags; /* See SD_* */
721 721
722 /* Runtime fields. */ 722 /* Runtime fields. */
723 unsigned long last_balance; /* init to jiffies. units in jiffies */ 723 unsigned long last_balance; /* init to jiffies. units in jiffies */
724 unsigned int balance_interval; /* initialise to 1. units in ms. */ 724 unsigned int balance_interval; /* initialise to 1. units in ms. */
725 unsigned int nr_balance_failed; /* initialise to 0 */ 725 unsigned int nr_balance_failed; /* initialise to 0 */
726 726
727 #ifdef CONFIG_SCHEDSTATS 727 #ifdef CONFIG_SCHEDSTATS
728 /* load_balance() stats */ 728 /* load_balance() stats */
729 unsigned int lb_count[CPU_MAX_IDLE_TYPES]; 729 unsigned int lb_count[CPU_MAX_IDLE_TYPES];
730 unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; 730 unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
731 unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; 731 unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
732 unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; 732 unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
733 unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; 733 unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
734 unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; 734 unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
735 unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; 735 unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
736 unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; 736 unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
737 737
738 /* Active load balancing */ 738 /* Active load balancing */
739 unsigned int alb_count; 739 unsigned int alb_count;
740 unsigned int alb_failed; 740 unsigned int alb_failed;
741 unsigned int alb_pushed; 741 unsigned int alb_pushed;
742 742
743 /* SD_BALANCE_EXEC stats */ 743 /* SD_BALANCE_EXEC stats */
744 unsigned int sbe_count; 744 unsigned int sbe_count;
745 unsigned int sbe_balanced; 745 unsigned int sbe_balanced;
746 unsigned int sbe_pushed; 746 unsigned int sbe_pushed;
747 747
748 /* SD_BALANCE_FORK stats */ 748 /* SD_BALANCE_FORK stats */
749 unsigned int sbf_count; 749 unsigned int sbf_count;
750 unsigned int sbf_balanced; 750 unsigned int sbf_balanced;
751 unsigned int sbf_pushed; 751 unsigned int sbf_pushed;
752 752
753 /* try_to_wake_up() stats */ 753 /* try_to_wake_up() stats */
754 unsigned int ttwu_wake_remote; 754 unsigned int ttwu_wake_remote;
755 unsigned int ttwu_move_affine; 755 unsigned int ttwu_move_affine;
756 unsigned int ttwu_move_balance; 756 unsigned int ttwu_move_balance;
757 #endif 757 #endif
758 }; 758 };
759 759
760 extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); 760 extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
761 761
762 #endif /* CONFIG_SMP */ 762 #endif /* CONFIG_SMP */
763 763
764 /* 764 /*
765 * A runqueue laden with a single nice 0 task scores a weighted_cpuload of 765 * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
766 * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a 766 * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
767 * task of nice 0 or enough lower priority tasks to bring up the 767 * task of nice 0 or enough lower priority tasks to bring up the
768 * weighted_cpuload 768 * weighted_cpuload
769 */ 769 */
770 static inline int above_background_load(void) 770 static inline int above_background_load(void)
771 { 771 {
772 unsigned long cpu; 772 unsigned long cpu;
773 773
774 for_each_online_cpu(cpu) { 774 for_each_online_cpu(cpu) {
775 if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) 775 if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
776 return 1; 776 return 1;
777 } 777 }
778 return 0; 778 return 0;
779 } 779 }
780 780
781 struct io_context; /* See blkdev.h */ 781 struct io_context; /* See blkdev.h */
782 #define NGROUPS_SMALL 32 782 #define NGROUPS_SMALL 32
783 #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) 783 #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t)))
784 struct group_info { 784 struct group_info {
785 int ngroups; 785 int ngroups;
786 atomic_t usage; 786 atomic_t usage;
787 gid_t small_block[NGROUPS_SMALL]; 787 gid_t small_block[NGROUPS_SMALL];
788 int nblocks; 788 int nblocks;
789 gid_t *blocks[0]; 789 gid_t *blocks[0];
790 }; 790 };
791 791
792 /* 792 /*
793 * get_group_info() must be called with the owning task locked (via task_lock()) 793 * get_group_info() must be called with the owning task locked (via task_lock())
794 * when task != current. The reason being that the vast majority of callers are 794 * when task != current. The reason being that the vast majority of callers are
795 * looking at current->group_info, which cannot be changed except by the 795 * looking at current->group_info, which cannot be changed except by the
796 * current task. Changing current->group_info requires the task lock, too. 796 * current task. Changing current->group_info requires the task lock, too.
797 */ 797 */
798 #define get_group_info(group_info) do { \ 798 #define get_group_info(group_info) do { \
799 atomic_inc(&(group_info)->usage); \ 799 atomic_inc(&(group_info)->usage); \
800 } while (0) 800 } while (0)
801 801
802 #define put_group_info(group_info) do { \ 802 #define put_group_info(group_info) do { \
803 if (atomic_dec_and_test(&(group_info)->usage)) \ 803 if (atomic_dec_and_test(&(group_info)->usage)) \
804 groups_free(group_info); \ 804 groups_free(group_info); \
805 } while (0) 805 } while (0)
806 806
807 extern struct group_info *groups_alloc(int gidsetsize); 807 extern struct group_info *groups_alloc(int gidsetsize);
808 extern void groups_free(struct group_info *group_info); 808 extern void groups_free(struct group_info *group_info);
809 extern int set_current_groups(struct group_info *group_info); 809 extern int set_current_groups(struct group_info *group_info);
810 extern int groups_search(struct group_info *group_info, gid_t grp); 810 extern int groups_search(struct group_info *group_info, gid_t grp);
811 /* access the groups "array" with this macro */ 811 /* access the groups "array" with this macro */
812 #define GROUP_AT(gi, i) \ 812 #define GROUP_AT(gi, i) \
813 ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) 813 ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
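A hedged sketch of the locking rule documented above together with the GROUP_AT() accessor (example_task_in_group() is hypothetical; groups_search() above is the real search helper):

#include <linux/sched.h>

static int example_task_in_group(struct task_struct *tsk, gid_t grp)
{
	struct group_info *gi;
	int i, found = 0;

	task_lock(tsk);			/* required because tsk may not be current */
	gi = tsk->group_info;
	get_group_info(gi);		/* keep it alive after dropping the lock */
	task_unlock(tsk);

	for (i = 0; i < gi->ngroups; i++) {
		if (GROUP_AT(gi, i) == grp) {
			found = 1;
			break;
		}
	}

	put_group_info(gi);
	return found;
}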
814 814
815 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK 815 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
816 extern void prefetch_stack(struct task_struct *t); 816 extern void prefetch_stack(struct task_struct *t);
817 #else 817 #else
818 static inline void prefetch_stack(struct task_struct *t) { } 818 static inline void prefetch_stack(struct task_struct *t) { }
819 #endif 819 #endif
820 820
821 struct audit_context; /* See audit.c */ 821 struct audit_context; /* See audit.c */
822 struct mempolicy; 822 struct mempolicy;
823 struct pipe_inode_info; 823 struct pipe_inode_info;
824 struct uts_namespace; 824 struct uts_namespace;
825 825
826 struct rq; 826 struct rq;
827 struct sched_domain; 827 struct sched_domain;
828 828
829 struct sched_class { 829 struct sched_class {
830 const struct sched_class *next; 830 const struct sched_class *next;
831 831
832 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); 832 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
833 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); 833 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
834 void (*yield_task) (struct rq *rq); 834 void (*yield_task) (struct rq *rq);
835 int (*select_task_rq)(struct task_struct *p, int sync); 835 int (*select_task_rq)(struct task_struct *p, int sync);
836 836
837 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); 837 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
838 838
839 struct task_struct * (*pick_next_task) (struct rq *rq); 839 struct task_struct * (*pick_next_task) (struct rq *rq);
840 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 840 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
841 841
842 #ifdef CONFIG_SMP 842 #ifdef CONFIG_SMP
843 unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, 843 unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
844 struct rq *busiest, unsigned long max_load_move, 844 struct rq *busiest, unsigned long max_load_move,
845 struct sched_domain *sd, enum cpu_idle_type idle, 845 struct sched_domain *sd, enum cpu_idle_type idle,
846 int *all_pinned, int *this_best_prio); 846 int *all_pinned, int *this_best_prio);
847 847
848 int (*move_one_task) (struct rq *this_rq, int this_cpu, 848 int (*move_one_task) (struct rq *this_rq, int this_cpu,
849 struct rq *busiest, struct sched_domain *sd, 849 struct rq *busiest, struct sched_domain *sd,
850 enum cpu_idle_type idle); 850 enum cpu_idle_type idle);
851 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 851 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
852 void (*post_schedule) (struct rq *this_rq); 852 void (*post_schedule) (struct rq *this_rq);
853 void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); 853 void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
854 #endif 854 #endif
855 855
856 void (*set_curr_task) (struct rq *rq); 856 void (*set_curr_task) (struct rq *rq);
857 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 857 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
858 void (*task_new) (struct rq *rq, struct task_struct *p); 858 void (*task_new) (struct rq *rq, struct task_struct *p);
859 void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); 859 void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask);
860 860
861 void (*join_domain)(struct rq *rq); 861 void (*join_domain)(struct rq *rq);
862 void (*leave_domain)(struct rq *rq); 862 void (*leave_domain)(struct rq *rq);
863 863
864 void (*switched_from) (struct rq *this_rq, struct task_struct *task, 864 void (*switched_from) (struct rq *this_rq, struct task_struct *task,
865 int running); 865 int running);
866 void (*switched_to) (struct rq *this_rq, struct task_struct *task, 866 void (*switched_to) (struct rq *this_rq, struct task_struct *task,
867 int running); 867 int running);
868 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 868 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
869 int oldprio, int running); 869 int oldprio, int running);
870 }; 870 };
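A hedged sketch of how the core scheduler is expected to use the ->next link and the pick_next_task() hook: walk the classes from highest to lowest priority until one has a runnable task. The real loop lives in kernel/sched.c; example_pick_next() and its "highest" parameter are stand-ins:

#include <linux/sched.h>

static struct task_struct *example_pick_next(struct rq *rq,
					     const struct sched_class *highest)
{
	const struct sched_class *class;
	struct task_struct *p;

	for (class = highest; class; class = class->next) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}
	return NULL;	/* unreachable in practice: the idle class always has a task */
}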
871 871
872 struct load_weight { 872 struct load_weight {
873 unsigned long weight, inv_weight; 873 unsigned long weight, inv_weight;
874 }; 874 };
875 875
876 /* 876 /*
877 * CFS stats for a schedulable entity (task, task-group etc) 877 * CFS stats for a schedulable entity (task, task-group etc)
878 * 878 *
879 * Current field usage histogram: 879 * Current field usage histogram:
880 * 880 *
881 * 4 se->block_start 881 * 4 se->block_start
882 * 4 se->run_node 882 * 4 se->run_node
883 * 4 se->sleep_start 883 * 4 se->sleep_start
884 * 6 se->load.weight 884 * 6 se->load.weight
885 */ 885 */
886 struct sched_entity { 886 struct sched_entity {
887 struct load_weight load; /* for load-balancing */ 887 struct load_weight load; /* for load-balancing */
888 struct rb_node run_node; 888 struct rb_node run_node;
889 unsigned int on_rq; 889 unsigned int on_rq;
890 890
891 u64 exec_start; 891 u64 exec_start;
892 u64 sum_exec_runtime; 892 u64 sum_exec_runtime;
893 u64 vruntime; 893 u64 vruntime;
894 u64 prev_sum_exec_runtime; 894 u64 prev_sum_exec_runtime;
895 895
896 #ifdef CONFIG_SCHEDSTATS 896 #ifdef CONFIG_SCHEDSTATS
897 u64 wait_start; 897 u64 wait_start;
898 u64 wait_max; 898 u64 wait_max;
899 u64 wait_count; 899 u64 wait_count;
900 u64 wait_sum; 900 u64 wait_sum;
901 901
902 u64 sleep_start; 902 u64 sleep_start;
903 u64 sleep_max; 903 u64 sleep_max;
904 s64 sum_sleep_runtime; 904 s64 sum_sleep_runtime;
905 905
906 u64 block_start; 906 u64 block_start;
907 u64 block_max; 907 u64 block_max;
908 u64 exec_max; 908 u64 exec_max;
909 u64 slice_max; 909 u64 slice_max;
910 910
911 u64 nr_migrations; 911 u64 nr_migrations;
912 u64 nr_migrations_cold; 912 u64 nr_migrations_cold;
913 u64 nr_failed_migrations_affine; 913 u64 nr_failed_migrations_affine;
914 u64 nr_failed_migrations_running; 914 u64 nr_failed_migrations_running;
915 u64 nr_failed_migrations_hot; 915 u64 nr_failed_migrations_hot;
916 u64 nr_forced_migrations; 916 u64 nr_forced_migrations;
917 u64 nr_forced2_migrations; 917 u64 nr_forced2_migrations;
918 918
919 u64 nr_wakeups; 919 u64 nr_wakeups;
920 u64 nr_wakeups_sync; 920 u64 nr_wakeups_sync;
921 u64 nr_wakeups_migrate; 921 u64 nr_wakeups_migrate;
922 u64 nr_wakeups_local; 922 u64 nr_wakeups_local;
923 u64 nr_wakeups_remote; 923 u64 nr_wakeups_remote;
924 u64 nr_wakeups_affine; 924 u64 nr_wakeups_affine;
925 u64 nr_wakeups_affine_attempts; 925 u64 nr_wakeups_affine_attempts;
926 u64 nr_wakeups_passive; 926 u64 nr_wakeups_passive;
927 u64 nr_wakeups_idle; 927 u64 nr_wakeups_idle;
928 #endif 928 #endif
929 929
930 #ifdef CONFIG_FAIR_GROUP_SCHED 930 #ifdef CONFIG_FAIR_GROUP_SCHED
931 struct sched_entity *parent; 931 struct sched_entity *parent;
932 /* rq on which this entity is (to be) queued: */ 932 /* rq on which this entity is (to be) queued: */
933 struct cfs_rq *cfs_rq; 933 struct cfs_rq *cfs_rq;
934 /* rq "owned" by this entity/group: */ 934 /* rq "owned" by this entity/group: */
935 struct cfs_rq *my_q; 935 struct cfs_rq *my_q;
936 #endif 936 #endif
937 }; 937 };
938 938
939 struct sched_rt_entity { 939 struct sched_rt_entity {
940 struct list_head run_list; 940 struct list_head run_list;
941 unsigned int time_slice; 941 unsigned int time_slice;
942 unsigned long timeout; 942 unsigned long timeout;
943 int nr_cpus_allowed; 943 int nr_cpus_allowed;
944 944
945 #ifdef CONFIG_FAIR_GROUP_SCHED 945 #ifdef CONFIG_FAIR_GROUP_SCHED
946 struct sched_rt_entity *parent; 946 struct sched_rt_entity *parent;
947 /* rq on which this entity is (to be) queued: */ 947 /* rq on which this entity is (to be) queued: */
948 struct rt_rq *rt_rq; 948 struct rt_rq *rt_rq;
949 /* rq "owned" by this entity/group: */ 949 /* rq "owned" by this entity/group: */
950 struct rt_rq *my_q; 950 struct rt_rq *my_q;
951 #endif 951 #endif
952 }; 952 };
953 953
954 struct task_struct { 954 struct task_struct {
955 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 955 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
956 void *stack; 956 void *stack;
957 atomic_t usage; 957 atomic_t usage;
958 unsigned int flags; /* per process flags, defined below */ 958 unsigned int flags; /* per process flags, defined below */
959 unsigned int ptrace; 959 unsigned int ptrace;
960 960
961 int lock_depth; /* BKL lock depth */ 961 int lock_depth; /* BKL lock depth */
962 962
963 #ifdef CONFIG_SMP 963 #ifdef CONFIG_SMP
964 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 964 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
965 int oncpu; 965 int oncpu;
966 #endif 966 #endif
967 #endif 967 #endif
968 968
969 int prio, static_prio, normal_prio; 969 int prio, static_prio, normal_prio;
970 const struct sched_class *sched_class; 970 const struct sched_class *sched_class;
971 struct sched_entity se; 971 struct sched_entity se;
972 struct sched_rt_entity rt; 972 struct sched_rt_entity rt;
973 973
974 #ifdef CONFIG_PREEMPT_NOTIFIERS 974 #ifdef CONFIG_PREEMPT_NOTIFIERS
975 /* list of struct preempt_notifier: */ 975 /* list of struct preempt_notifier: */
976 struct hlist_head preempt_notifiers; 976 struct hlist_head preempt_notifiers;
977 #endif 977 #endif
978 978
979 /* 979 /*
980 * fpu_counter contains the number of consecutive context switches 980 * fpu_counter contains the number of consecutive context switches
981 * in which the FPU is used. If this is over a threshold, the lazy fpu 981 * in which the FPU is used. If this is over a threshold, the lazy fpu
982 * saving becomes unlazy to save the trap. This is an unsigned char 982 * saving becomes unlazy to save the trap. This is an unsigned char
983 * so that after 256 times the counter wraps and the behavior turns 983 * so that after 256 times the counter wraps and the behavior turns
984 * lazy again; this is to deal with bursty apps that only use the FPU for 984 * lazy again; this is to deal with bursty apps that only use the FPU for
985 * a short time 985 * a short time
986 */ 986 */
987 unsigned char fpu_counter; 987 unsigned char fpu_counter;
988 s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ 988 s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
989 #ifdef CONFIG_BLK_DEV_IO_TRACE 989 #ifdef CONFIG_BLK_DEV_IO_TRACE
990 unsigned int btrace_seq; 990 unsigned int btrace_seq;
991 #endif 991 #endif
992 992
993 unsigned int policy; 993 unsigned int policy;
994 cpumask_t cpus_allowed; 994 cpumask_t cpus_allowed;
995 995
996 #ifdef CONFIG_PREEMPT_RCU 996 #ifdef CONFIG_PREEMPT_RCU
997 int rcu_read_lock_nesting; 997 int rcu_read_lock_nesting;
998 int rcu_flipctr_idx; 998 int rcu_flipctr_idx;
999 #endif /* #ifdef CONFIG_PREEMPT_RCU */ 999 #endif /* #ifdef CONFIG_PREEMPT_RCU */
1000 1000
1001 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1001 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1002 struct sched_info sched_info; 1002 struct sched_info sched_info;
1003 #endif 1003 #endif
1004 1004
1005 struct list_head tasks; 1005 struct list_head tasks;
1006 /* 1006 /*
1007 * ptrace_list/ptrace_children forms the list of my children 1007 * ptrace_list/ptrace_children forms the list of my children
1008 * that were stolen by a ptracer. 1008 * that were stolen by a ptracer.
1009 */ 1009 */
1010 struct list_head ptrace_children; 1010 struct list_head ptrace_children;
1011 struct list_head ptrace_list; 1011 struct list_head ptrace_list;
1012 1012
1013 struct mm_struct *mm, *active_mm; 1013 struct mm_struct *mm, *active_mm;
1014 1014
1015 /* task state */ 1015 /* task state */
1016 struct linux_binfmt *binfmt; 1016 struct linux_binfmt *binfmt;
1017 int exit_state; 1017 int exit_state;
1018 int exit_code, exit_signal; 1018 int exit_code, exit_signal;
1019 int pdeath_signal; /* The signal sent when the parent dies */ 1019 int pdeath_signal; /* The signal sent when the parent dies */
1020 /* ??? */ 1020 /* ??? */
1021 unsigned int personality; 1021 unsigned int personality;
1022 unsigned did_exec:1; 1022 unsigned did_exec:1;
1023 pid_t pid; 1023 pid_t pid;
1024 pid_t tgid; 1024 pid_t tgid;
1025 1025
1026 #ifdef CONFIG_CC_STACKPROTECTOR 1026 #ifdef CONFIG_CC_STACKPROTECTOR
1027 /* Canary value for the -fstack-protector gcc feature */ 1027 /* Canary value for the -fstack-protector gcc feature */
1028 unsigned long stack_canary; 1028 unsigned long stack_canary;
1029 #endif 1029 #endif
1030 /* 1030 /*
1031 * pointers to (original) parent process, youngest child, younger sibling, 1031 * pointers to (original) parent process, youngest child, younger sibling,
1032 * older sibling, respectively. (p->father can be replaced with 1032 * older sibling, respectively. (p->father can be replaced with
1033 * p->parent->pid) 1033 * p->parent->pid)
1034 */ 1034 */
1035 struct task_struct *real_parent; /* real parent process (when being debugged) */ 1035 struct task_struct *real_parent; /* real parent process (when being debugged) */
1036 struct task_struct *parent; /* parent process */ 1036 struct task_struct *parent; /* parent process */
1037 /* 1037 /*
1038 * children/sibling forms the list of my children plus the 1038 * children/sibling forms the list of my children plus the
1039 * tasks I'm ptracing. 1039 * tasks I'm ptracing.
1040 */ 1040 */
1041 struct list_head children; /* list of my children */ 1041 struct list_head children; /* list of my children */
1042 struct list_head sibling; /* linkage in my parent's children list */ 1042 struct list_head sibling; /* linkage in my parent's children list */
1043 struct task_struct *group_leader; /* threadgroup leader */ 1043 struct task_struct *group_leader; /* threadgroup leader */
1044 1044
1045 /* PID/PID hash table linkage. */ 1045 /* PID/PID hash table linkage. */
1046 struct pid_link pids[PIDTYPE_MAX]; 1046 struct pid_link pids[PIDTYPE_MAX];
1047 struct list_head thread_group; 1047 struct list_head thread_group;
1048 1048
1049 struct completion *vfork_done; /* for vfork() */ 1049 struct completion *vfork_done; /* for vfork() */
1050 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1050 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
1051 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1051 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
1052 1052
1053 unsigned int rt_priority; 1053 unsigned int rt_priority;
1054 cputime_t utime, stime, utimescaled, stimescaled; 1054 cputime_t utime, stime, utimescaled, stimescaled;
1055 cputime_t gtime; 1055 cputime_t gtime;
1056 cputime_t prev_utime, prev_stime; 1056 cputime_t prev_utime, prev_stime;
1057 unsigned long nvcsw, nivcsw; /* context switch counts */ 1057 unsigned long nvcsw, nivcsw; /* context switch counts */
1058 struct timespec start_time; /* monotonic time */ 1058 struct timespec start_time; /* monotonic time */
1059 struct timespec real_start_time; /* boot based time */ 1059 struct timespec real_start_time; /* boot based time */
1060 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ 1060 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
1061 unsigned long min_flt, maj_flt; 1061 unsigned long min_flt, maj_flt;
1062 1062
1063 cputime_t it_prof_expires, it_virt_expires; 1063 cputime_t it_prof_expires, it_virt_expires;
1064 unsigned long long it_sched_expires; 1064 unsigned long long it_sched_expires;
1065 struct list_head cpu_timers[3]; 1065 struct list_head cpu_timers[3];
1066 1066
1067 /* process credentials */ 1067 /* process credentials */
1068 uid_t uid,euid,suid,fsuid; 1068 uid_t uid,euid,suid,fsuid;
1069 gid_t gid,egid,sgid,fsgid; 1069 gid_t gid,egid,sgid,fsgid;
1070 struct group_info *group_info; 1070 struct group_info *group_info;
1071 kernel_cap_t cap_effective, cap_inheritable, cap_permitted; 1071 kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
1072 unsigned keep_capabilities:1; 1072 unsigned keep_capabilities:1;
1073 struct user_struct *user; 1073 struct user_struct *user;
1074 #ifdef CONFIG_KEYS 1074 #ifdef CONFIG_KEYS
1075 struct key *request_key_auth; /* assumed request_key authority */ 1075 struct key *request_key_auth; /* assumed request_key authority */
1076 struct key *thread_keyring; /* keyring private to this thread */ 1076 struct key *thread_keyring; /* keyring private to this thread */
1077 unsigned char jit_keyring; /* default keyring to attach requested keys to */ 1077 unsigned char jit_keyring; /* default keyring to attach requested keys to */
1078 #endif 1078 #endif
1079 char comm[TASK_COMM_LEN]; /* executable name excluding path 1079 char comm[TASK_COMM_LEN]; /* executable name excluding path
1080 - access with [gs]et_task_comm (which lock 1080 - access with [gs]et_task_comm (which lock
1081 it with task_lock()) 1081 it with task_lock())
1082 - initialized normally by flush_old_exec */ 1082 - initialized normally by flush_old_exec */
1083 /* file system info */ 1083 /* file system info */
1084 int link_count, total_link_count; 1084 int link_count, total_link_count;
1085 #ifdef CONFIG_SYSVIPC 1085 #ifdef CONFIG_SYSVIPC
1086 /* ipc stuff */ 1086 /* ipc stuff */
1087 struct sysv_sem sysvsem; 1087 struct sysv_sem sysvsem;
1088 #endif 1088 #endif
1089 #ifdef CONFIG_DETECT_SOFTLOCKUP 1089 #ifdef CONFIG_DETECT_SOFTLOCKUP
1090 /* hung task detection */ 1090 /* hung task detection */
1091 unsigned long last_switch_timestamp; 1091 unsigned long last_switch_timestamp;
1092 unsigned long last_switch_count; 1092 unsigned long last_switch_count;
1093 #endif 1093 #endif
1094 /* CPU-specific state of this task */ 1094 /* CPU-specific state of this task */
1095 struct thread_struct thread; 1095 struct thread_struct thread;
1096 /* filesystem information */ 1096 /* filesystem information */
1097 struct fs_struct *fs; 1097 struct fs_struct *fs;
1098 /* open file information */ 1098 /* open file information */
1099 struct files_struct *files; 1099 struct files_struct *files;
1100 /* namespaces */ 1100 /* namespaces */
1101 struct nsproxy *nsproxy; 1101 struct nsproxy *nsproxy;
1102 /* signal handlers */ 1102 /* signal handlers */
1103 struct signal_struct *signal; 1103 struct signal_struct *signal;
1104 struct sighand_struct *sighand; 1104 struct sighand_struct *sighand;
1105 1105
1106 sigset_t blocked, real_blocked; 1106 sigset_t blocked, real_blocked;
1107 sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ 1107 sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */
1108 struct sigpending pending; 1108 struct sigpending pending;
1109 1109
1110 unsigned long sas_ss_sp; 1110 unsigned long sas_ss_sp;
1111 size_t sas_ss_size; 1111 size_t sas_ss_size;
1112 int (*notifier)(void *priv); 1112 int (*notifier)(void *priv);
1113 void *notifier_data; 1113 void *notifier_data;
1114 sigset_t *notifier_mask; 1114 sigset_t *notifier_mask;
1115 #ifdef CONFIG_SECURITY 1115 #ifdef CONFIG_SECURITY
1116 void *security; 1116 void *security;
1117 #endif 1117 #endif
1118 struct audit_context *audit_context; 1118 struct audit_context *audit_context;
1119 seccomp_t seccomp; 1119 seccomp_t seccomp;
1120 1120
1121 /* Thread group tracking */ 1121 /* Thread group tracking */
1122 u32 parent_exec_id; 1122 u32 parent_exec_id;
1123 u32 self_exec_id; 1123 u32 self_exec_id;
1124 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ 1124 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
1125 spinlock_t alloc_lock; 1125 spinlock_t alloc_lock;
1126 1126
1127 /* Protection of the PI data structures: */ 1127 /* Protection of the PI data structures: */
1128 spinlock_t pi_lock; 1128 spinlock_t pi_lock;
1129 1129
1130 #ifdef CONFIG_RT_MUTEXES 1130 #ifdef CONFIG_RT_MUTEXES
1131 /* PI waiters blocked on a rt_mutex held by this task */ 1131 /* PI waiters blocked on a rt_mutex held by this task */
1132 struct plist_head pi_waiters; 1132 struct plist_head pi_waiters;
1133 /* Deadlock detection and priority inheritance handling */ 1133 /* Deadlock detection and priority inheritance handling */
1134 struct rt_mutex_waiter *pi_blocked_on; 1134 struct rt_mutex_waiter *pi_blocked_on;
1135 #endif 1135 #endif
1136 1136
1137 #ifdef CONFIG_DEBUG_MUTEXES 1137 #ifdef CONFIG_DEBUG_MUTEXES
1138 /* mutex deadlock detection */ 1138 /* mutex deadlock detection */
1139 struct mutex_waiter *blocked_on; 1139 struct mutex_waiter *blocked_on;
1140 #endif 1140 #endif
1141 #ifdef CONFIG_TRACE_IRQFLAGS 1141 #ifdef CONFIG_TRACE_IRQFLAGS
1142 unsigned int irq_events; 1142 unsigned int irq_events;
1143 int hardirqs_enabled; 1143 int hardirqs_enabled;
1144 unsigned long hardirq_enable_ip; 1144 unsigned long hardirq_enable_ip;
1145 unsigned int hardirq_enable_event; 1145 unsigned int hardirq_enable_event;
1146 unsigned long hardirq_disable_ip; 1146 unsigned long hardirq_disable_ip;
1147 unsigned int hardirq_disable_event; 1147 unsigned int hardirq_disable_event;
1148 int softirqs_enabled; 1148 int softirqs_enabled;
1149 unsigned long softirq_disable_ip; 1149 unsigned long softirq_disable_ip;
1150 unsigned int softirq_disable_event; 1150 unsigned int softirq_disable_event;
1151 unsigned long softirq_enable_ip; 1151 unsigned long softirq_enable_ip;
1152 unsigned int softirq_enable_event; 1152 unsigned int softirq_enable_event;
1153 int hardirq_context; 1153 int hardirq_context;
1154 int softirq_context; 1154 int softirq_context;
1155 #endif 1155 #endif
1156 #ifdef CONFIG_LOCKDEP 1156 #ifdef CONFIG_LOCKDEP
1157 # define MAX_LOCK_DEPTH 30UL 1157 # define MAX_LOCK_DEPTH 30UL
1158 u64 curr_chain_key; 1158 u64 curr_chain_key;
1159 int lockdep_depth; 1159 int lockdep_depth;
1160 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1160 struct held_lock held_locks[MAX_LOCK_DEPTH];
1161 unsigned int lockdep_recursion; 1161 unsigned int lockdep_recursion;
1162 #endif 1162 #endif
1163 1163
1164 /* journalling filesystem info */ 1164 /* journalling filesystem info */
1165 void *journal_info; 1165 void *journal_info;
1166 1166
1167 /* stacked block device info */ 1167 /* stacked block device info */
1168 struct bio *bio_list, **bio_tail; 1168 struct bio *bio_list, **bio_tail;
1169 1169
1170 /* VM state */ 1170 /* VM state */
1171 struct reclaim_state *reclaim_state; 1171 struct reclaim_state *reclaim_state;
1172 1172
1173 struct backing_dev_info *backing_dev_info; 1173 struct backing_dev_info *backing_dev_info;
1174 1174
1175 struct io_context *io_context; 1175 struct io_context *io_context;
1176 1176
1177 unsigned long ptrace_message; 1177 unsigned long ptrace_message;
1178 siginfo_t *last_siginfo; /* For ptrace use. */ 1178 siginfo_t *last_siginfo; /* For ptrace use. */
1179 #ifdef CONFIG_TASK_XACCT 1179 #ifdef CONFIG_TASK_XACCT
1180 /* i/o counters (bytes read/written, #syscalls) */ 1180 /* i/o counters (bytes read/written, #syscalls) */
1181 u64 rchar, wchar, syscr, syscw; 1181 u64 rchar, wchar, syscr, syscw;
1182 #endif 1182 #endif
1183 struct task_io_accounting ioac; 1183 struct task_io_accounting ioac;
1184 #if defined(CONFIG_TASK_XACCT) 1184 #if defined(CONFIG_TASK_XACCT)
1185 u64 acct_rss_mem1; /* accumulated rss usage */ 1185 u64 acct_rss_mem1; /* accumulated rss usage */
1186 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1186 u64 acct_vm_mem1; /* accumulated virtual memory usage */
1187 cputime_t acct_stimexpd;/* stime since last update */ 1187 cputime_t acct_stimexpd;/* stime since last update */
1188 #endif 1188 #endif
1189 #ifdef CONFIG_NUMA 1189 #ifdef CONFIG_NUMA
1190 struct mempolicy *mempolicy; 1190 struct mempolicy *mempolicy;
1191 short il_next; 1191 short il_next;
1192 #endif 1192 #endif
1193 #ifdef CONFIG_CPUSETS 1193 #ifdef CONFIG_CPUSETS
1194 nodemask_t mems_allowed; 1194 nodemask_t mems_allowed;
1195 int cpuset_mems_generation; 1195 int cpuset_mems_generation;
1196 int cpuset_mem_spread_rotor; 1196 int cpuset_mem_spread_rotor;
1197 #endif 1197 #endif
1198 #ifdef CONFIG_CGROUPS 1198 #ifdef CONFIG_CGROUPS
1199 /* Control Group info protected by css_set_lock */ 1199 /* Control Group info protected by css_set_lock */
1200 struct css_set *cgroups; 1200 struct css_set *cgroups;
1201 /* cg_list protected by css_set_lock and tsk->alloc_lock */ 1201 /* cg_list protected by css_set_lock and tsk->alloc_lock */
1202 struct list_head cg_list; 1202 struct list_head cg_list;
1203 #endif 1203 #endif
1204 #ifdef CONFIG_FUTEX 1204 #ifdef CONFIG_FUTEX
1205 struct robust_list_head __user *robust_list; 1205 struct robust_list_head __user *robust_list;
1206 #ifdef CONFIG_COMPAT 1206 #ifdef CONFIG_COMPAT
1207 struct compat_robust_list_head __user *compat_robust_list; 1207 struct compat_robust_list_head __user *compat_robust_list;
1208 #endif 1208 #endif
1209 struct list_head pi_state_list; 1209 struct list_head pi_state_list;
1210 struct futex_pi_state *pi_state_cache; 1210 struct futex_pi_state *pi_state_cache;
1211 #endif 1211 #endif
1212 atomic_t fs_excl; /* holding fs exclusive resources */ 1212 atomic_t fs_excl; /* holding fs exclusive resources */
1213 struct rcu_head rcu; 1213 struct rcu_head rcu;
1214 1214
1215 /* 1215 /*
1216 * cache last used pipe for splice 1216 * cache last used pipe for splice
1217 */ 1217 */
1218 struct pipe_inode_info *splice_pipe; 1218 struct pipe_inode_info *splice_pipe;
1219 #ifdef CONFIG_TASK_DELAY_ACCT 1219 #ifdef CONFIG_TASK_DELAY_ACCT
1220 struct task_delay_info *delays; 1220 struct task_delay_info *delays;
1221 #endif 1221 #endif
1222 #ifdef CONFIG_FAULT_INJECTION 1222 #ifdef CONFIG_FAULT_INJECTION
1223 int make_it_fail; 1223 int make_it_fail;
1224 #endif 1224 #endif
1225 struct prop_local_single dirties; 1225 struct prop_local_single dirties;
1226 #ifdef CONFIG_LATENCYTOP 1226 #ifdef CONFIG_LATENCYTOP
1227 int latency_record_count; 1227 int latency_record_count;
1228 struct latency_record latency_record[LT_SAVECOUNT]; 1228 struct latency_record latency_record[LT_SAVECOUNT];
1229 #endif 1229 #endif
1230 }; 1230 };
1231 1231
1232 /* 1232 /*
1233 * Priority of a process goes from 0..MAX_PRIO-1, valid RT 1233 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
1234 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH 1234 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
1235 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority 1235 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
1236 * values are inverted: lower p->prio value means higher priority. 1236 * values are inverted: lower p->prio value means higher priority.
1237 * 1237 *
1238 * The MAX_USER_RT_PRIO value allows the actual maximum 1238 * The MAX_USER_RT_PRIO value allows the actual maximum
1239 * RT priority to be separate from the value exported to 1239 * RT priority to be separate from the value exported to
1240 * user-space. This allows kernel threads to set their 1240 * user-space. This allows kernel threads to set their
1241 * priority to a value higher than any user task. Note: 1241 * priority to a value higher than any user task. Note:
1242 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 1242 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
1243 */ 1243 */
1244 1244
1245 #define MAX_USER_RT_PRIO 100 1245 #define MAX_USER_RT_PRIO 100
1246 #define MAX_RT_PRIO MAX_USER_RT_PRIO 1246 #define MAX_RT_PRIO MAX_USER_RT_PRIO
1247 1247
1248 #define MAX_PRIO (MAX_RT_PRIO + 40) 1248 #define MAX_PRIO (MAX_RT_PRIO + 40)
1249 #define DEFAULT_PRIO (MAX_RT_PRIO + 20) 1249 #define DEFAULT_PRIO (MAX_RT_PRIO + 20)
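Worked numbers for the defaults above, restated as a hypothetical macro (EXAMPLE_NICE_TO_PRIO is not defined in this header; the scheduler proper keeps an equivalent mapping):

#include <linux/sched.h>

/* Hypothetical restatement of the nice -> static priority mapping. */
#define EXAMPLE_NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)

/*
 * With MAX_USER_RT_PRIO == 100:
 *   RT priorities occupy 0..99,
 *   EXAMPLE_NICE_TO_PRIO(-20) == 100,
 *   EXAMPLE_NICE_TO_PRIO(0)   == 120 == DEFAULT_PRIO,
 *   EXAMPLE_NICE_TO_PRIO(19)  == 139 == MAX_PRIO - 1.
 */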
1250 1250
1251 static inline int rt_prio(int prio) 1251 static inline int rt_prio(int prio)
1252 { 1252 {
1253 if (unlikely(prio < MAX_RT_PRIO)) 1253 if (unlikely(prio < MAX_RT_PRIO))
1254 return 1; 1254 return 1;
1255 return 0; 1255 return 0;
1256 } 1256 }
1257 1257
1258 static inline int rt_task(struct task_struct *p) 1258 static inline int rt_task(struct task_struct *p)
1259 { 1259 {
1260 return rt_prio(p->prio); 1260 return rt_prio(p->prio);
1261 } 1261 }
1262 1262
1263 static inline void set_task_session(struct task_struct *tsk, pid_t session) 1263 static inline void set_task_session(struct task_struct *tsk, pid_t session)
1264 { 1264 {
1265 tsk->signal->__session = session; 1265 tsk->signal->__session = session;
1266 } 1266 }
1267 1267
1268 static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) 1268 static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp)
1269 { 1269 {
1270 tsk->signal->__pgrp = pgrp; 1270 tsk->signal->__pgrp = pgrp;
1271 } 1271 }
1272 1272
1273 static inline struct pid *task_pid(struct task_struct *task) 1273 static inline struct pid *task_pid(struct task_struct *task)
1274 { 1274 {
1275 return task->pids[PIDTYPE_PID].pid; 1275 return task->pids[PIDTYPE_PID].pid;
1276 } 1276 }
1277 1277
1278 static inline struct pid *task_tgid(struct task_struct *task) 1278 static inline struct pid *task_tgid(struct task_struct *task)
1279 { 1279 {
1280 return task->group_leader->pids[PIDTYPE_PID].pid; 1280 return task->group_leader->pids[PIDTYPE_PID].pid;
1281 } 1281 }
1282 1282
1283 static inline struct pid *task_pgrp(struct task_struct *task) 1283 static inline struct pid *task_pgrp(struct task_struct *task)
1284 { 1284 {
1285 return task->group_leader->pids[PIDTYPE_PGID].pid; 1285 return task->group_leader->pids[PIDTYPE_PGID].pid;
1286 } 1286 }
1287 1287
1288 static inline struct pid *task_session(struct task_struct *task) 1288 static inline struct pid *task_session(struct task_struct *task)
1289 { 1289 {
1290 return task->group_leader->pids[PIDTYPE_SID].pid; 1290 return task->group_leader->pids[PIDTYPE_SID].pid;
1291 } 1291 }
1292 1292
1293 struct pid_namespace; 1293 struct pid_namespace;
1294 1294
1295 /* 1295 /*
1296 * the helpers to get the task's different pids as they are seen 1296 * the helpers to get the task's different pids as they are seen
1297 * from various namespaces 1297 * from various namespaces
1298 * 1298 *
1299 * task_xid_nr() : global id, i.e. the id seen from the init namespace; 1299 * task_xid_nr() : global id, i.e. the id seen from the init namespace;
1300 * task_xid_vnr() : virtual id, i.e. the id seen from the namespace the task 1300 * task_xid_vnr() : virtual id, i.e. the id seen from the namespace the task
1301 * belongs to. This only makes sense when called in the 1301 * belongs to. This only makes sense when called in the
1302 * context of the task that belongs to the same namespace; 1302 * context of the task that belongs to the same namespace;
1303 * task_xid_nr_ns() : id seen from the ns specified; 1303 * task_xid_nr_ns() : id seen from the ns specified;
1304 * 1304 *
1305 * set_task_vxid() : assigns a virtual id to a task; 1305 * set_task_vxid() : assigns a virtual id to a task;
1306 * 1306 *
1307 * see also pid_nr() etc in include/linux/pid.h 1307 * see also pid_nr() etc in include/linux/pid.h
1308 */ 1308 */
1309 1309
1310 static inline pid_t task_pid_nr(struct task_struct *tsk) 1310 static inline pid_t task_pid_nr(struct task_struct *tsk)
1311 { 1311 {
1312 return tsk->pid; 1312 return tsk->pid;
1313 } 1313 }
1314 1314
1315 pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); 1315 pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
1316 1316
1317 static inline pid_t task_pid_vnr(struct task_struct *tsk) 1317 static inline pid_t task_pid_vnr(struct task_struct *tsk)
1318 { 1318 {
1319 return pid_vnr(task_pid(tsk)); 1319 return pid_vnr(task_pid(tsk));
1320 } 1320 }
1321 1321
1322 1322
1323 static inline pid_t task_tgid_nr(struct task_struct *tsk) 1323 static inline pid_t task_tgid_nr(struct task_struct *tsk)
1324 { 1324 {
1325 return tsk->tgid; 1325 return tsk->tgid;
1326 } 1326 }
1327 1327
1328 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); 1328 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
1329 1329
1330 static inline pid_t task_tgid_vnr(struct task_struct *tsk) 1330 static inline pid_t task_tgid_vnr(struct task_struct *tsk)
1331 { 1331 {
1332 return pid_vnr(task_tgid(tsk)); 1332 return pid_vnr(task_tgid(tsk));
1333 } 1333 }
1334 1334
1335 1335
1336 static inline pid_t task_pgrp_nr(struct task_struct *tsk) 1336 static inline pid_t task_pgrp_nr(struct task_struct *tsk)
1337 { 1337 {
1338 return tsk->signal->__pgrp; 1338 return tsk->signal->__pgrp;
1339 } 1339 }
1340 1340
1341 pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); 1341 pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
1342 1342
1343 static inline pid_t task_pgrp_vnr(struct task_struct *tsk) 1343 static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
1344 { 1344 {
1345 return pid_vnr(task_pgrp(tsk)); 1345 return pid_vnr(task_pgrp(tsk));
1346 } 1346 }
1347 1347
1348 1348
1349 static inline pid_t task_session_nr(struct task_struct *tsk) 1349 static inline pid_t task_session_nr(struct task_struct *tsk)
1350 { 1350 {
1351 return tsk->signal->__session; 1351 return tsk->signal->__session;
1352 } 1352 }
1353 1353
1354 pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); 1354 pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
1355 1355
1356 static inline pid_t task_session_vnr(struct task_struct *tsk) 1356 static inline pid_t task_session_vnr(struct task_struct *tsk)
1357 { 1357 {
1358 return pid_vnr(task_session(tsk)); 1358 return pid_vnr(task_session(tsk));
1359 } 1359 }
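A hedged sketch using the helpers above. Per the comment, the _vnr() form is only meaningful from the task's own namespace, so this reports ids for current (example_report_current_pids() is hypothetical):

#include <linux/kernel.h>
#include <linux/sched.h>

static void example_report_current_pids(void)
{
	printk(KERN_DEBUG "%s: global pid %d, pid in this namespace %d\n",
	       current->comm, task_pid_nr(current), task_pid_vnr(current));
}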
1360 1360
1361 1361
1362 /** 1362 /**
1363 * pid_alive - check that a task structure is not stale 1363 * pid_alive - check that a task structure is not stale
1364 * @p: Task structure to be checked. 1364 * @p: Task structure to be checked.
1365 * 1365 *
1366 * Test if a process is not yet dead (at most zombie state) 1366 * Test if a process is not yet dead (at most zombie state)
1367 * If pid_alive fails, then pointers within the task structure 1367 * If pid_alive fails, then pointers within the task structure
1368 * can be stale and must not be dereferenced. 1368 * can be stale and must not be dereferenced.
1369 */ 1369 */
1370 static inline int pid_alive(struct task_struct *p) 1370 static inline int pid_alive(struct task_struct *p)
1371 { 1371 {
1372 return p->pids[PIDTYPE_PID].pid != NULL; 1372 return p->pids[PIDTYPE_PID].pid != NULL;
1373 } 1373 }
1374 1374
1375 /** 1375 /**
1376 * is_global_init - check if a task structure is init 1376 * is_global_init - check if a task structure is init
1377 * @tsk: Task structure to be checked. 1377 * @tsk: Task structure to be checked.
1378 * 1378 *
1379 * Check if a task structure is the first user space task the kernel created. 1379 * Check if a task structure is the first user space task the kernel created.
1380 */ 1380 */
1381 static inline int is_global_init(struct task_struct *tsk) 1381 static inline int is_global_init(struct task_struct *tsk)
1382 { 1382 {
1383 return tsk->pid == 1; 1383 return tsk->pid == 1;
1384 } 1384 }
1385 1385
1386 /* 1386 /*
1387 * is_container_init: 1387 * is_container_init:
1388 * check whether the task is init in its own pid namespace. 1388 * check whether the task is init in its own pid namespace.
1389 */ 1389 */
1390 extern int is_container_init(struct task_struct *tsk); 1390 extern int is_container_init(struct task_struct *tsk);
1391 1391
1392 extern struct pid *cad_pid; 1392 extern struct pid *cad_pid;
1393 1393
1394 extern void free_task(struct task_struct *tsk); 1394 extern void free_task(struct task_struct *tsk);
1395 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) 1395 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
1396 1396
1397 extern void __put_task_struct(struct task_struct *t); 1397 extern void __put_task_struct(struct task_struct *t);
1398 1398
1399 static inline void put_task_struct(struct task_struct *t) 1399 static inline void put_task_struct(struct task_struct *t)
1400 { 1400 {
1401 if (atomic_dec_and_test(&t->usage)) 1401 if (atomic_dec_and_test(&t->usage))
1402 __put_task_struct(t); 1402 __put_task_struct(t);
1403 } 1403 }
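A hedged sketch of the reference pattern implied by get_task_struct()/put_task_struct(): pin the task while a pointer to it is kept outside any lock, drop the reference when done (both example_* helpers are hypothetical):

#include <linux/sched.h>

static struct task_struct *example_take_ref(struct task_struct *tsk)
{
	get_task_struct(tsk);		/* bump tsk->usage */
	return tsk;
}

static void example_drop_ref(struct task_struct *tsk)
{
	put_task_struct(tsk);		/* may call __put_task_struct() and free it */
}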
1404 1404
1405 /* 1405 /*
1406 * Per process flags 1406 * Per process flags
1407 */ 1407 */
1408 #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ 1408 #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */
1409 /* Not implemented yet, only for 486*/ 1409 /* Not implemented yet, only for 486*/
1410 #define PF_STARTING 0x00000002 /* being created */ 1410 #define PF_STARTING 0x00000002 /* being created */
1411 #define PF_EXITING 0x00000004 /* getting shut down */ 1411 #define PF_EXITING 0x00000004 /* getting shut down */
1412 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1412 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
1413 #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ 1413 #define PF_VCPU 0x00000010 /* I'm a virtual CPU */
1414 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ 1414 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
1415 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ 1415 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
1416 #define PF_DUMPCORE 0x00000200 /* dumped core */ 1416 #define PF_DUMPCORE 0x00000200 /* dumped core */
1417 #define PF_SIGNALED 0x00000400 /* killed by a signal */ 1417 #define PF_SIGNALED 0x00000400 /* killed by a signal */
1418 #define PF_MEMALLOC 0x00000800 /* Allocating memory */ 1418 #define PF_MEMALLOC 0x00000800 /* Allocating memory */
1419 #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ 1419 #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */
1420 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ 1420 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
1421 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ 1421 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
1422 #define PF_FROZEN 0x00010000 /* frozen for system suspend */ 1422 #define PF_FROZEN 0x00010000 /* frozen for system suspend */
1423 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ 1423 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
1424 #define PF_KSWAPD 0x00040000 /* I am kswapd */ 1424 #define PF_KSWAPD 0x00040000 /* I am kswapd */
1425 #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ 1425 #define PF_SWAPOFF 0x00080000 /* I am in swapoff */
1426 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ 1426 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
1427 #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */ 1427 #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */
1428 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ 1428 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
1429 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1429 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1430 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1430 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1431 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1431 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1432 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1432 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1433 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1433 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1434 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ 1434 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
1435 1435
1436 /* 1436 /*
1437 * Only the _current_ task can read/write to tsk->flags, but other 1437 * Only the _current_ task can read/write to tsk->flags, but other
1438 * tasks can access tsk->flags in readonly mode for example 1438 * tasks can access tsk->flags in readonly mode for example
1439 * with tsk_used_math (like during threaded core dumping). 1439 * with tsk_used_math (like during threaded core dumping).
1440 * There is however an exception to this rule during ptrace 1440 * There is however an exception to this rule during ptrace
1441 * or during fork: the ptracer task is allowed to write to the 1441 * or during fork: the ptracer task is allowed to write to the
1442 * child->flags of its traced child (same goes for fork, the parent 1442 * child->flags of its traced child (same goes for fork, the parent
1443 * can write to the child->flags), because we're guaranteed the 1443 * can write to the child->flags), because we're guaranteed the
1444 * child is not running and in turn not changing child->flags 1444 * child is not running and in turn not changing child->flags
1445 * at the same time the parent does it. 1445 * at the same time the parent does it.
1446 */ 1446 */
1447 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) 1447 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
1448 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) 1448 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
1449 #define clear_used_math() clear_stopped_child_used_math(current) 1449 #define clear_used_math() clear_stopped_child_used_math(current)
1450 #define set_used_math() set_stopped_child_used_math(current) 1450 #define set_used_math() set_stopped_child_used_math(current)
1451 #define conditional_stopped_child_used_math(condition, child) \ 1451 #define conditional_stopped_child_used_math(condition, child) \
1452 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) 1452 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
1453 #define conditional_used_math(condition) \ 1453 #define conditional_used_math(condition) \
1454 conditional_stopped_child_used_math(condition, current) 1454 conditional_stopped_child_used_math(condition, current)
1455 #define copy_to_stopped_child_used_math(child) \ 1455 #define copy_to_stopped_child_used_math(child) \
1456 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) 1456 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
1457 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ 1457 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
1458 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1458 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1459 #define used_math() tsk_used_math(current) 1459 #define used_math() tsk_used_math(current)
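A hedged sketch in the spirit of the arch lazy-FPU code, following the rule above that only current writes its own flags (example_first_fpu_use() is hypothetical):

#include <linux/sched.h>

static void example_first_fpu_use(void)
{
	if (used_math())
		return;			/* FPU state already valid for current */

	/* ... arch-specific initialisation of the FPU state would go here ... */
	set_used_math();
}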
1460 1460
1461 #ifdef CONFIG_SMP 1461 #ifdef CONFIG_SMP
1462 extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); 1462 extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask);
1463 #else 1463 #else
1464 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 1464 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1465 { 1465 {
1466 if (!cpu_isset(0, new_mask)) 1466 if (!cpu_isset(0, new_mask))
1467 return -EINVAL; 1467 return -EINVAL;
1468 return 0; 1468 return 0;
1469 } 1469 }
1470 #endif 1470 #endif
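A hedged sketch of a caller of set_cpus_allowed(): restrict a task to CPU 0. On !SMP the inline stub above turns this into a cheap success as long as CPU 0 is in the mask (example_pin_to_cpu0() is hypothetical):

#include <linux/cpumask.h>
#include <linux/sched.h>

static int example_pin_to_cpu0(struct task_struct *p)
{
	cpumask_t mask = CPU_MASK_NONE;

	cpu_set(0, mask);
	return set_cpus_allowed(p, mask);	/* 0 on success, negative errno otherwise */
}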
1471 1471
1472 extern unsigned long long sched_clock(void); 1472 extern unsigned long long sched_clock(void);
1473 1473
1474 /* 1474 /*
1475 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 1475 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
1476 * clock constructed from sched_clock(): 1476 * clock constructed from sched_clock():
1477 */ 1477 */
1478 extern unsigned long long cpu_clock(int cpu); 1478 extern unsigned long long cpu_clock(int cpu);
1479 1479
1480 extern unsigned long long 1480 extern unsigned long long
1481 task_sched_runtime(struct task_struct *task); 1481 task_sched_runtime(struct task_struct *task);
1482 1482
1483 /* sched_exec is called by processes performing an exec */ 1483 /* sched_exec is called by processes performing an exec */
1484 #ifdef CONFIG_SMP 1484 #ifdef CONFIG_SMP
1485 extern void sched_exec(void); 1485 extern void sched_exec(void);
1486 #else 1486 #else
1487 #define sched_exec() {} 1487 #define sched_exec() {}
1488 #endif 1488 #endif
1489 1489
1490 extern void sched_clock_idle_sleep_event(void); 1490 extern void sched_clock_idle_sleep_event(void);
1491 extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1491 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1492 1492
1493 #ifdef CONFIG_HOTPLUG_CPU 1493 #ifdef CONFIG_HOTPLUG_CPU
1494 extern void idle_task_exit(void); 1494 extern void idle_task_exit(void);
1495 #else 1495 #else
1496 static inline void idle_task_exit(void) {} 1496 static inline void idle_task_exit(void) {}
1497 #endif 1497 #endif
1498 1498
1499 extern void sched_idle_next(void); 1499 extern void sched_idle_next(void);
1500 1500
1501 #ifdef CONFIG_SCHED_DEBUG 1501 #ifdef CONFIG_SCHED_DEBUG
1502 extern unsigned int sysctl_sched_latency; 1502 extern unsigned int sysctl_sched_latency;
1503 extern unsigned int sysctl_sched_min_granularity; 1503 extern unsigned int sysctl_sched_min_granularity;
1504 extern unsigned int sysctl_sched_wakeup_granularity; 1504 extern unsigned int sysctl_sched_wakeup_granularity;
1505 extern unsigned int sysctl_sched_batch_wakeup_granularity; 1505 extern unsigned int sysctl_sched_batch_wakeup_granularity;
1506 extern unsigned int sysctl_sched_child_runs_first; 1506 extern unsigned int sysctl_sched_child_runs_first;
1507 extern unsigned int sysctl_sched_features; 1507 extern unsigned int sysctl_sched_features;
1508 extern unsigned int sysctl_sched_migration_cost; 1508 extern unsigned int sysctl_sched_migration_cost;
1509 extern unsigned int sysctl_sched_nr_migrate; 1509 extern unsigned int sysctl_sched_nr_migrate;
1510 extern unsigned int sysctl_sched_rt_period; 1510 extern unsigned int sysctl_sched_rt_period;
1511 extern unsigned int sysctl_sched_rt_ratio; 1511 extern unsigned int sysctl_sched_rt_ratio;
1512 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 1512 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
1513 extern unsigned int sysctl_sched_min_bal_int_shares; 1513 extern unsigned int sysctl_sched_min_bal_int_shares;
1514 extern unsigned int sysctl_sched_max_bal_int_shares; 1514 extern unsigned int sysctl_sched_max_bal_int_shares;
1515 #endif 1515 #endif
1516 1516
1517 int sched_nr_latency_handler(struct ctl_table *table, int write, 1517 int sched_nr_latency_handler(struct ctl_table *table, int write,
1518 struct file *file, void __user *buffer, size_t *length, 1518 struct file *file, void __user *buffer, size_t *length,
1519 loff_t *ppos); 1519 loff_t *ppos);
1520 #endif 1520 #endif
1521 1521
1522 extern unsigned int sysctl_sched_compat_yield; 1522 extern unsigned int sysctl_sched_compat_yield;
1523 1523
1524 #ifdef CONFIG_RT_MUTEXES 1524 #ifdef CONFIG_RT_MUTEXES
1525 extern int rt_mutex_getprio(struct task_struct *p); 1525 extern int rt_mutex_getprio(struct task_struct *p);
1526 extern void rt_mutex_setprio(struct task_struct *p, int prio); 1526 extern void rt_mutex_setprio(struct task_struct *p, int prio);
1527 extern void rt_mutex_adjust_pi(struct task_struct *p); 1527 extern void rt_mutex_adjust_pi(struct task_struct *p);
1528 #else 1528 #else
1529 static inline int rt_mutex_getprio(struct task_struct *p) 1529 static inline int rt_mutex_getprio(struct task_struct *p)
1530 { 1530 {
1531 return p->normal_prio; 1531 return p->normal_prio;
1532 } 1532 }
1533 # define rt_mutex_adjust_pi(p) do { } while (0) 1533 # define rt_mutex_adjust_pi(p) do { } while (0)
1534 #endif 1534 #endif
1535 1535
1536 extern void set_user_nice(struct task_struct *p, long nice); 1536 extern void set_user_nice(struct task_struct *p, long nice);
1537 extern int task_prio(const struct task_struct *p); 1537 extern int task_prio(const struct task_struct *p);
1538 extern int task_nice(const struct task_struct *p); 1538 extern int task_nice(const struct task_struct *p);
1539 extern int can_nice(const struct task_struct *p, const int nice); 1539 extern int can_nice(const struct task_struct *p, const int nice);
1540 extern int task_curr(const struct task_struct *p); 1540 extern int task_curr(const struct task_struct *p);
1541 extern int idle_cpu(int cpu); 1541 extern int idle_cpu(int cpu);
1542 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); 1542 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
1543 extern struct task_struct *idle_task(int cpu); 1543 extern struct task_struct *idle_task(int cpu);
1544 extern struct task_struct *curr_task(int cpu); 1544 extern struct task_struct *curr_task(int cpu);
1545 extern void set_curr_task(int cpu, struct task_struct *p); 1545 extern void set_curr_task(int cpu, struct task_struct *p);
1546 1546
1547 void yield(void); 1547 void yield(void);
1548 1548
1549 /* 1549 /*
1550 * The default (Linux) execution domain. 1550 * The default (Linux) execution domain.
1551 */ 1551 */
1552 extern struct exec_domain default_exec_domain; 1552 extern struct exec_domain default_exec_domain;
1553 1553
1554 union thread_union { 1554 union thread_union {
1555 struct thread_info thread_info; 1555 struct thread_info thread_info;
1556 unsigned long stack[THREAD_SIZE/sizeof(long)]; 1556 unsigned long stack[THREAD_SIZE/sizeof(long)];
1557 }; 1557 };
1558 1558
1559 #ifndef __HAVE_ARCH_KSTACK_END 1559 #ifndef __HAVE_ARCH_KSTACK_END
1560 static inline int kstack_end(void *addr) 1560 static inline int kstack_end(void *addr)
1561 { 1561 {
1562 /* Reliable end of stack detection: 1562 /* Reliable end of stack detection:
1563 * Some APM bios versions misalign the stack 1563 * Some APM bios versions misalign the stack
1564 */ 1564 */
1565 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); 1565 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
1566 } 1566 }
1567 #endif 1567 #endif
1568 1568
1569 extern union thread_union init_thread_union; 1569 extern union thread_union init_thread_union;
1570 extern struct task_struct init_task; 1570 extern struct task_struct init_task;
1571 1571
1572 extern struct mm_struct init_mm; 1572 extern struct mm_struct init_mm;
1573 1573
1574 extern struct pid_namespace init_pid_ns; 1574 extern struct pid_namespace init_pid_ns;
1575 1575
1576 /* 1576 /*
1577 * find a task by one of its numerical ids 1577 * find a task by one of its numerical ids
1578 * 1578 *
1579 * find_task_by_pid_type_ns(): 1579 * find_task_by_pid_type_ns():
1580 * it is the most generic call - it finds a task by all id, 1580 * it is the most generic call - it finds a task by all id,
1581 * type and namespace specified 1581 * type and namespace specified
1582 * find_task_by_pid_ns(): 1582 * find_task_by_pid_ns():
1583 * finds a task by its pid in the specified namespace 1583 * finds a task by its pid in the specified namespace
1584 * find_task_by_vpid(): 1584 * find_task_by_vpid():
1585 * finds a task by its virtual pid 1585 * finds a task by its virtual pid
1586 * find_task_by_pid(): 1586 * find_task_by_pid():
1587 * finds a task by its global pid 1587 * finds a task by its global pid
1588 * 1588 *
1589 * see also find_pid() etc in include/linux/pid.h 1589 * see also find_pid() etc in include/linux/pid.h
1590 */ 1590 */
1591 1591
1592 extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, 1592 extern struct task_struct *find_task_by_pid_type_ns(int type, int pid,
1593 struct pid_namespace *ns); 1593 struct pid_namespace *ns);
1594 1594
1595 extern struct task_struct *find_task_by_pid(pid_t nr); 1595 extern struct task_struct *find_task_by_pid(pid_t nr);
1596 extern struct task_struct *find_task_by_vpid(pid_t nr); 1596 extern struct task_struct *find_task_by_vpid(pid_t nr);
1597 extern struct task_struct *find_task_by_pid_ns(pid_t nr, 1597 extern struct task_struct *find_task_by_pid_ns(pid_t nr,
1598 struct pid_namespace *ns); 1598 struct pid_namespace *ns);
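As a usage sketch (the helper name lookup_and_get_task() is invented, and the caller is assumed to be in a context where RCU read-side protection suffices for task lookup):

/* Illustrative only: resolve a pid in the caller's pid namespace and pin
 * the task so it cannot be freed after the RCU read section ends. */
static struct task_struct *lookup_and_get_task(pid_t nr)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(nr);
	if (p)
		get_task_struct(p);
	rcu_read_unlock();

	return p;	/* caller releases with put_task_struct() */
}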
1599 1599
1600 extern void __set_special_pids(pid_t session, pid_t pgrp); 1600 extern void __set_special_pids(pid_t session, pid_t pgrp);
1601 1601
1602 /* per-UID process charging. */ 1602 /* per-UID process charging. */
1603 extern struct user_struct * alloc_uid(struct user_namespace *, uid_t); 1603 extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
1604 static inline struct user_struct *get_uid(struct user_struct *u) 1604 static inline struct user_struct *get_uid(struct user_struct *u)
1605 { 1605 {
1606 atomic_inc(&u->__count); 1606 atomic_inc(&u->__count);
1607 return u; 1607 return u;
1608 } 1608 }
1609 extern void free_uid(struct user_struct *); 1609 extern void free_uid(struct user_struct *);
1610 extern void switch_uid(struct user_struct *); 1610 extern void switch_uid(struct user_struct *);
1611 extern void release_uids(struct user_namespace *ns); 1611 extern void release_uids(struct user_namespace *ns);
1612 1612
1613 #include <asm/current.h> 1613 #include <asm/current.h>
1614 1614
1615 extern void do_timer(unsigned long ticks); 1615 extern void do_timer(unsigned long ticks);
1616 1616
1617 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); 1617 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
1618 extern int FASTCALL(wake_up_process(struct task_struct * tsk)); 1618 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
1619 extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, 1619 extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
1620 unsigned long clone_flags)); 1620 unsigned long clone_flags));
1621 #ifdef CONFIG_SMP 1621 #ifdef CONFIG_SMP
1622 extern void kick_process(struct task_struct *tsk); 1622 extern void kick_process(struct task_struct *tsk);
1623 #else 1623 #else
1624 static inline void kick_process(struct task_struct *tsk) { } 1624 static inline void kick_process(struct task_struct *tsk) { }
1625 #endif 1625 #endif
1626 extern void sched_fork(struct task_struct *p, int clone_flags); 1626 extern void sched_fork(struct task_struct *p, int clone_flags);
1627 extern void sched_dead(struct task_struct *p); 1627 extern void sched_dead(struct task_struct *p);
1628 1628
1629 extern int in_group_p(gid_t); 1629 extern int in_group_p(gid_t);
1630 extern int in_egroup_p(gid_t); 1630 extern int in_egroup_p(gid_t);
1631 1631
1632 extern void proc_caches_init(void); 1632 extern void proc_caches_init(void);
1633 extern void flush_signals(struct task_struct *); 1633 extern void flush_signals(struct task_struct *);
1634 extern void ignore_signals(struct task_struct *); 1634 extern void ignore_signals(struct task_struct *);
1635 extern void flush_signal_handlers(struct task_struct *, int force_default); 1635 extern void flush_signal_handlers(struct task_struct *, int force_default);
1636 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); 1636 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
1637 1637
1638 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 1638 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
1639 { 1639 {
1640 unsigned long flags; 1640 unsigned long flags;
1641 int ret; 1641 int ret;
1642 1642
1643 spin_lock_irqsave(&tsk->sighand->siglock, flags); 1643 spin_lock_irqsave(&tsk->sighand->siglock, flags);
1644 ret = dequeue_signal(tsk, mask, info); 1644 ret = dequeue_signal(tsk, mask, info);
1645 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 1645 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
1646 1646
1647 return ret; 1647 return ret;
1648 } 1648 }
1649 1649
1650 extern void block_all_signals(int (*notifier)(void *priv), void *priv, 1650 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
1651 sigset_t *mask); 1651 sigset_t *mask);
1652 extern void unblock_all_signals(void); 1652 extern void unblock_all_signals(void);
1653 extern void release_task(struct task_struct * p); 1653 extern void release_task(struct task_struct * p);
1654 extern int send_sig_info(int, struct siginfo *, struct task_struct *); 1654 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
1655 extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); 1655 extern int send_group_sig_info(int, struct siginfo *, struct task_struct *);
1656 extern int force_sigsegv(int, struct task_struct *); 1656 extern int force_sigsegv(int, struct task_struct *);
1657 extern int force_sig_info(int, struct siginfo *, struct task_struct *); 1657 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
1658 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); 1658 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
1659 extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); 1659 extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
1660 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); 1660 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
1661 extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32); 1661 extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32);
1662 extern int kill_pgrp(struct pid *pid, int sig, int priv); 1662 extern int kill_pgrp(struct pid *pid, int sig, int priv);
1663 extern int kill_pid(struct pid *pid, int sig, int priv); 1663 extern int kill_pid(struct pid *pid, int sig, int priv);
1664 extern int kill_proc_info(int, struct siginfo *, pid_t); 1664 extern int kill_proc_info(int, struct siginfo *, pid_t);
1665 extern void do_notify_parent(struct task_struct *, int); 1665 extern void do_notify_parent(struct task_struct *, int);
1666 extern void force_sig(int, struct task_struct *); 1666 extern void force_sig(int, struct task_struct *);
1667 extern void force_sig_specific(int, struct task_struct *); 1667 extern void force_sig_specific(int, struct task_struct *);
1668 extern int send_sig(int, struct task_struct *, int); 1668 extern int send_sig(int, struct task_struct *, int);
1669 extern void zap_other_threads(struct task_struct *p); 1669 extern void zap_other_threads(struct task_struct *p);
1670 extern int kill_proc(pid_t, int, int); 1670 extern int kill_proc(pid_t, int, int);
1671 extern struct sigqueue *sigqueue_alloc(void); 1671 extern struct sigqueue *sigqueue_alloc(void);
1672 extern void sigqueue_free(struct sigqueue *); 1672 extern void sigqueue_free(struct sigqueue *);
1673 extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); 1673 extern int send_sigqueue(int, struct sigqueue *, struct task_struct *);
1674 extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); 1674 extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *);
1675 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); 1675 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
1676 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); 1676 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
1677 1677
1678 static inline int kill_cad_pid(int sig, int priv) 1678 static inline int kill_cad_pid(int sig, int priv)
1679 { 1679 {
1680 return kill_pid(cad_pid, sig, priv); 1680 return kill_pid(cad_pid, sig, priv);
1681 } 1681 }
1682 1682
1683 /* These can be the second arg to send_sig_info/send_group_sig_info. */ 1683 /* These can be the second arg to send_sig_info/send_group_sig_info. */
1684 #define SEND_SIG_NOINFO ((struct siginfo *) 0) 1684 #define SEND_SIG_NOINFO ((struct siginfo *) 0)
1685 #define SEND_SIG_PRIV ((struct siginfo *) 1) 1685 #define SEND_SIG_PRIV ((struct siginfo *) 1)
1686 #define SEND_SIG_FORCED ((struct siginfo *) 2) 1686 #define SEND_SIG_FORCED ((struct siginfo *) 2)
1687 1687
1688 static inline int is_si_special(const struct siginfo *info) 1688 static inline int is_si_special(const struct siginfo *info)
1689 { 1689 {
1690 return info <= SEND_SIG_FORCED; 1690 return info <= SEND_SIG_FORCED;
1691 } 1691 }
1692 1692
1693 /* True if we are on the alternate signal stack. */ 1693 /* True if we are on the alternate signal stack. */
1694 1694
1695 static inline int on_sig_stack(unsigned long sp) 1695 static inline int on_sig_stack(unsigned long sp)
1696 { 1696 {
1697 return (sp - current->sas_ss_sp < current->sas_ss_size); 1697 return (sp - current->sas_ss_sp < current->sas_ss_size);
1698 } 1698 }
1699 1699
1700 static inline int sas_ss_flags(unsigned long sp) 1700 static inline int sas_ss_flags(unsigned long sp)
1701 { 1701 {
1702 return (current->sas_ss_size == 0 ? SS_DISABLE 1702 return (current->sas_ss_size == 0 ? SS_DISABLE
1703 : on_sig_stack(sp) ? SS_ONSTACK : 0); 1703 : on_sig_stack(sp) ? SS_ONSTACK : 0);
1704 } 1704 }
1705 1705
1706 /* 1706 /*
1707 * Routines for handling mm_structs 1707 * Routines for handling mm_structs
1708 */ 1708 */
1709 extern struct mm_struct * mm_alloc(void); 1709 extern struct mm_struct * mm_alloc(void);
1710 1710
1711 /* mmdrop drops the mm and the page tables */ 1711 /* mmdrop drops the mm and the page tables */
1712 extern void FASTCALL(__mmdrop(struct mm_struct *)); 1712 extern void FASTCALL(__mmdrop(struct mm_struct *));
1713 static inline void mmdrop(struct mm_struct * mm) 1713 static inline void mmdrop(struct mm_struct * mm)
1714 { 1714 {
1715 if (unlikely(atomic_dec_and_test(&mm->mm_count))) 1715 if (unlikely(atomic_dec_and_test(&mm->mm_count)))
1716 __mmdrop(mm); 1716 __mmdrop(mm);
1717 } 1717 }
1718 1718
1719 /* mmput gets rid of the mappings and all user-space */ 1719 /* mmput gets rid of the mappings and all user-space */
1720 extern void mmput(struct mm_struct *); 1720 extern void mmput(struct mm_struct *);
1721 /* Grab a reference to a task's mm, if it is not already going away */ 1721 /* Grab a reference to a task's mm, if it is not already going away */
1722 extern struct mm_struct *get_task_mm(struct task_struct *task); 1722 extern struct mm_struct *get_task_mm(struct task_struct *task);
1723 /* Remove the current task's stale references to the old mm_struct */ 1723 /* Remove the current task's stale references to the old mm_struct */

1724 extern void mm_release(struct task_struct *, struct mm_struct *); 1724 extern void mm_release(struct task_struct *, struct mm_struct *);
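A minimal sketch of the get_task_mm()/mmput() pairing; peek_task_mm() is a made-up name used only for illustration:

/* Illustrative only: get_task_mm() returns NULL if the task has no mm or
 * is already exiting, and takes an mm_users reference that mmput() drops. */
static int peek_task_mm(struct task_struct *task)
{
	struct mm_struct *mm = get_task_mm(task);

	if (!mm)
		return -EINVAL;
	/* ... inspect mm here, e.g. mm->total_vm ... */
	mmput(mm);
	return 0;
}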
1725 1725
1726 extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); 1726 extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
1727 extern void flush_thread(void); 1727 extern void flush_thread(void);
1728 extern void exit_thread(void); 1728 extern void exit_thread(void);
1729 1729
1730 extern void exit_files(struct task_struct *); 1730 extern void exit_files(struct task_struct *);
1731 extern void __cleanup_signal(struct signal_struct *); 1731 extern void __cleanup_signal(struct signal_struct *);
1732 extern void __cleanup_sighand(struct sighand_struct *); 1732 extern void __cleanup_sighand(struct sighand_struct *);
1733 extern void exit_itimers(struct signal_struct *); 1733 extern void exit_itimers(struct signal_struct *);
1734 1734
1735 extern NORET_TYPE void do_group_exit(int); 1735 extern NORET_TYPE void do_group_exit(int);
1736 1736
1737 extern void daemonize(const char *, ...); 1737 extern void daemonize(const char *, ...);
1738 extern int allow_signal(int); 1738 extern int allow_signal(int);
1739 extern int disallow_signal(int); 1739 extern int disallow_signal(int);
1740 1740
1741 extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); 1741 extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
1742 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); 1742 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
1743 struct task_struct *fork_idle(int); 1743 struct task_struct *fork_idle(int);
1744 1744
1745 extern void set_task_comm(struct task_struct *tsk, char *from); 1745 extern void set_task_comm(struct task_struct *tsk, char *from);
1746 extern void get_task_comm(char *to, struct task_struct *tsk); 1746 extern void get_task_comm(char *to, struct task_struct *tsk);
1747 1747
1748 #ifdef CONFIG_SMP 1748 #ifdef CONFIG_SMP
1749 extern void wait_task_inactive(struct task_struct * p); 1749 extern void wait_task_inactive(struct task_struct * p);
1750 #else 1750 #else
1751 #define wait_task_inactive(p) do { } while (0) 1751 #define wait_task_inactive(p) do { } while (0)
1752 #endif 1752 #endif
1753 1753
1754 #define remove_parent(p) list_del_init(&(p)->sibling) 1754 #define remove_parent(p) list_del_init(&(p)->sibling)
1755 #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) 1755 #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children)
1756 1756
1757 #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) 1757 #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
1758 1758
1759 #define for_each_process(p) \ 1759 #define for_each_process(p) \
1760 for (p = &init_task ; (p = next_task(p)) != &init_task ; ) 1760 for (p = &init_task ; (p = next_task(p)) != &init_task ; )
1761 1761
1762 /* 1762 /*
1763 * Careful: do_each_thread/while_each_thread is a double loop so 1763 * Careful: do_each_thread/while_each_thread is a double loop so
1764 * 'break' will not work as expected - use goto instead. 1764 * 'break' will not work as expected - use goto instead.
1765 */ 1765 */
1766 #define do_each_thread(g, t) \ 1766 #define do_each_thread(g, t) \
1767 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do 1767 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
1768 1768
1769 #define while_each_thread(g, t) \ 1769 #define while_each_thread(g, t) \
1770 while ((t = next_thread(t)) != g) 1770 while ((t = next_thread(t)) != g)
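A short sketch of the intended pattern, leaving the double loop with goto as the comment above requires (some_pid is a placeholder value):

struct task_struct *g, *t, *found = NULL;

read_lock(&tasklist_lock);
do_each_thread(g, t) {
	if (t->pid == some_pid) {	/* some_pid: placeholder */
		found = t;
		goto out;		/* 'break' would only leave the inner loop */
	}
} while_each_thread(g, t);
out:
	read_unlock(&tasklist_lock);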
1771 1771
1772 /* de_thread depends on thread_group_leader not being a pid based check */ 1772 /* de_thread depends on thread_group_leader not being a pid based check */
1773 #define thread_group_leader(p) (p == p->group_leader) 1773 #define thread_group_leader(p) (p == p->group_leader)
1774 1774
1775 /* Due to the insanities of de_thread it is possible for a process 1775 /* Due to the insanities of de_thread it is possible for a process
1776 * to have the pid of the thread group leader without actually being 1776 * to have the pid of the thread group leader without actually being
1777 * the thread group leader. For iteration through the pids in proc 1777 * the thread group leader. For iteration through the pids in proc
1778 * all we care about is that we have a task with the appropriate 1778 * all we care about is that we have a task with the appropriate
1779 * pid; we don't actually care if we have the right task. 1779 * pid; we don't actually care if we have the right task.
1780 */ 1780 */
1781 static inline int has_group_leader_pid(struct task_struct *p) 1781 static inline int has_group_leader_pid(struct task_struct *p)
1782 { 1782 {
1783 return p->pid == p->tgid; 1783 return p->pid == p->tgid;
1784 } 1784 }
1785 1785
1786 static inline 1786 static inline
1787 int same_thread_group(struct task_struct *p1, struct task_struct *p2) 1787 int same_thread_group(struct task_struct *p1, struct task_struct *p2)
1788 { 1788 {
1789 return p1->tgid == p2->tgid; 1789 return p1->tgid == p2->tgid;
1790 } 1790 }
1791 1791
1792 static inline struct task_struct *next_thread(const struct task_struct *p) 1792 static inline struct task_struct *next_thread(const struct task_struct *p)
1793 { 1793 {
1794 return list_entry(rcu_dereference(p->thread_group.next), 1794 return list_entry(rcu_dereference(p->thread_group.next),
1795 struct task_struct, thread_group); 1795 struct task_struct, thread_group);
1796 } 1796 }
1797 1797
1798 static inline int thread_group_empty(struct task_struct *p) 1798 static inline int thread_group_empty(struct task_struct *p)
1799 { 1799 {
1800 return list_empty(&p->thread_group); 1800 return list_empty(&p->thread_group);
1801 } 1801 }
1802 1802
1803 #define delay_group_leader(p) \ 1803 #define delay_group_leader(p) \
1804 (thread_group_leader(p) && !thread_group_empty(p)) 1804 (thread_group_leader(p) && !thread_group_empty(p))
1805 1805
1806 /* 1806 /*
1807 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring 1807 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
1808 * subscriptions and synchronises with wait4(). Also used in procfs. Also 1808 * subscriptions and synchronises with wait4(). Also used in procfs. Also
1809 * pins the final release of task.io_context. Also protects ->cpuset and 1809 * pins the final release of task.io_context. Also protects ->cpuset and
1810 * ->cgroup.subsys[]. 1810 * ->cgroup.subsys[].
1811 * 1811 *
1812 * Nests both inside and outside of read_lock(&tasklist_lock). 1812 * Nests both inside and outside of read_lock(&tasklist_lock).
1813 * It must not be nested with write_lock_irq(&tasklist_lock), 1813 * It must not be nested with write_lock_irq(&tasklist_lock),
1814 * neither inside nor outside. 1814 * neither inside nor outside.
1815 */ 1815 */
1816 static inline void task_lock(struct task_struct *p) 1816 static inline void task_lock(struct task_struct *p)
1817 { 1817 {
1818 spin_lock(&p->alloc_lock); 1818 spin_lock(&p->alloc_lock);
1819 } 1819 }
1820 1820
1821 static inline void task_unlock(struct task_struct *p) 1821 static inline void task_unlock(struct task_struct *p)
1822 { 1822 {
1823 spin_unlock(&p->alloc_lock); 1823 spin_unlock(&p->alloc_lock);
1824 } 1824 }
1825 1825
1826 extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk, 1826 extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
1827 unsigned long *flags); 1827 unsigned long *flags);
1828 1828
1829 static inline void unlock_task_sighand(struct task_struct *tsk, 1829 static inline void unlock_task_sighand(struct task_struct *tsk,
1830 unsigned long *flags) 1830 unsigned long *flags)
1831 { 1831 {
1832 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); 1832 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
1833 } 1833 }
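A minimal sketch of the lock_task_sighand()/unlock_task_sighand() pairing; the wrapper name below is invented:

/* Illustrative only: lock_task_sighand() returns NULL when the task has
 * already released its sighand, so the result must be checked. */
static void poke_sigpending(struct task_struct *tsk)
{
	unsigned long flags;
	struct sighand_struct *sighand;

	sighand = lock_task_sighand(tsk, &flags);
	if (sighand) {
		recalc_sigpending_and_wake(tsk);	/* requires siglock held */
		unlock_task_sighand(tsk, &flags);
	}
}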
1834 1834
1835 #ifndef __HAVE_THREAD_FUNCTIONS 1835 #ifndef __HAVE_THREAD_FUNCTIONS
1836 1836
1837 #define task_thread_info(task) ((struct thread_info *)(task)->stack) 1837 #define task_thread_info(task) ((struct thread_info *)(task)->stack)
1838 #define task_stack_page(task) ((task)->stack) 1838 #define task_stack_page(task) ((task)->stack)
1839 1839
1840 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) 1840 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
1841 { 1841 {
1842 *task_thread_info(p) = *task_thread_info(org); 1842 *task_thread_info(p) = *task_thread_info(org);
1843 task_thread_info(p)->task = p; 1843 task_thread_info(p)->task = p;
1844 } 1844 }
1845 1845
1846 static inline unsigned long *end_of_stack(struct task_struct *p) 1846 static inline unsigned long *end_of_stack(struct task_struct *p)
1847 { 1847 {
1848 return (unsigned long *)(task_thread_info(p) + 1); 1848 return (unsigned long *)(task_thread_info(p) + 1);
1849 } 1849 }
1850 1850
1851 #endif 1851 #endif
1852 1852
1853 /* set thread flags in other task's structures 1853 /* set thread flags in other task's structures
1854 * - see asm/thread_info.h for TIF_xxxx flags available 1854 * - see asm/thread_info.h for TIF_xxxx flags available
1855 */ 1855 */
1856 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) 1856 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
1857 { 1857 {
1858 set_ti_thread_flag(task_thread_info(tsk), flag); 1858 set_ti_thread_flag(task_thread_info(tsk), flag);
1859 } 1859 }
1860 1860
1861 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) 1861 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
1862 { 1862 {
1863 clear_ti_thread_flag(task_thread_info(tsk), flag); 1863 clear_ti_thread_flag(task_thread_info(tsk), flag);
1864 } 1864 }
1865 1865
1866 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) 1866 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
1867 { 1867 {
1868 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); 1868 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
1869 } 1869 }
1870 1870
1871 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) 1871 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
1872 { 1872 {
1873 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); 1873 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
1874 } 1874 }
1875 1875
1876 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) 1876 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
1877 { 1877 {
1878 return test_ti_thread_flag(task_thread_info(tsk), flag); 1878 return test_ti_thread_flag(task_thread_info(tsk), flag);
1879 } 1879 }
1880 1880
1881 static inline void set_tsk_need_resched(struct task_struct *tsk) 1881 static inline void set_tsk_need_resched(struct task_struct *tsk)
1882 { 1882 {
1883 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 1883 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
1884 } 1884 }
1885 1885
1886 static inline void clear_tsk_need_resched(struct task_struct *tsk) 1886 static inline void clear_tsk_need_resched(struct task_struct *tsk)
1887 { 1887 {
1888 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 1888 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
1889 } 1889 }
1890 1890
1891 static inline int signal_pending(struct task_struct *p) 1891 static inline int signal_pending(struct task_struct *p)
1892 { 1892 {
1893 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); 1893 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
1894 } 1894 }
1895 1895
1896 static inline int need_resched(void) 1896 static inline int need_resched(void)
1897 { 1897 {
1898 return unlikely(test_thread_flag(TIF_NEED_RESCHED)); 1898 return unlikely(test_thread_flag(TIF_NEED_RESCHED));
1899 } 1899 }
1900 1900
1901 /* 1901 /*
1902 * cond_resched() and cond_resched_lock(): latency reduction via 1902 * cond_resched() and cond_resched_lock(): latency reduction via
1903 * explicit rescheduling in places that are safe. The return 1903 * explicit rescheduling in places that are safe. The return
1904 * value indicates whether a reschedule was in fact done. 1904 * value indicates whether a reschedule was in fact done.
1905 * cond_resched_lock() will drop the spinlock before scheduling, 1905 * cond_resched_lock() will drop the spinlock before scheduling,
1906 * cond_resched_softirq() will enable bhs before scheduling. 1906 * cond_resched_softirq() will enable bhs before scheduling.
1907 */ 1907 */
1908 #ifdef CONFIG_PREEMPT 1908 #ifdef CONFIG_PREEMPT
1909 static inline int cond_resched(void) 1909 static inline int cond_resched(void)
1910 { 1910 {
1911 return 0; 1911 return 0;
1912 } 1912 }
1913 #else 1913 #else
1914 extern int _cond_resched(void); 1914 extern int _cond_resched(void);
1915 static inline int cond_resched(void) 1915 static inline int cond_resched(void)
1916 { 1916 {
1917 return _cond_resched(); 1917 return _cond_resched();
1918 } 1918 }
1919 #endif 1919 #endif
1920 extern int cond_resched_lock(spinlock_t * lock); 1920 extern int cond_resched_lock(spinlock_t * lock);
1921 extern int cond_resched_softirq(void); 1921 extern int cond_resched_softirq(void);
1922 1922
1923 /* 1923 /*
1924 * Does a critical section need to be broken due to another 1924 * Does a critical section need to be broken due to another
1925 * task waiting?: 1925 * task waiting?: (technically does not depend on CONFIG_PREEMPT,
1926 * but a general need for low latency)
1926 */ 1927 */
1927 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) 1928 static inline int spin_needbreak(spinlock_t *lock)
1928 # define need_lockbreak(lock) ((lock)->break_lock)
1929 #else
1930 # define need_lockbreak(lock) 0
1931 #endif
1932
1933 /*
1934 * Does a critical section need to be broken due to another
1935 * task waiting or preemption being signalled:
1936 */
1937 static inline int lock_need_resched(spinlock_t *lock)
1938 { 1929 {
1939 if (need_lockbreak(lock) || need_resched()) 1930 #ifdef CONFIG_PREEMPT
1940 return 1; 1931 return spin_is_contended(lock);
1932 #else
1941 return 0; 1933 return 0;
1934 #endif
1942 } 1935 }
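As a hedged sketch of the pattern this helper enables (the function below and its arguments are illustrative, not a call site from this commit): a long scan that holds a spinlock can poll spin_needbreak()/need_resched() and relinquish the lock through cond_resched_lock() whenever someone else needs it.

/* Illustrative only: a long scan under a spinlock that gives the lock up
 * whenever another CPU is spinning on it or a reschedule is pending.
 * cond_resched_lock() drops the lock, schedules if needed, and retakes it. */
static void clear_table(spinlock_t *lock, unsigned long *table, int nents)
{
	int i;

	spin_lock(lock);
	for (i = 0; i < nents; i++) {
		table[i] = 0;
		if (spin_needbreak(lock) || need_resched())
			cond_resched_lock(lock);
	}
	spin_unlock(lock);
}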
1943 1936
1944 /* 1937 /*
1945 * Reevaluate whether the task has signals pending delivery. 1938 * Reevaluate whether the task has signals pending delivery.
1946 * Wake the task if so. 1939 * Wake the task if so.
1947 * This is required every time the blocked sigset_t changes. 1940 * This is required every time the blocked sigset_t changes.
1948 * callers must hold sighand->siglock. 1941 * callers must hold sighand->siglock.
1949 */ 1942 */
1950 extern void recalc_sigpending_and_wake(struct task_struct *t); 1943 extern void recalc_sigpending_and_wake(struct task_struct *t);
1951 extern void recalc_sigpending(void); 1944 extern void recalc_sigpending(void);
1952 1945
1953 extern void signal_wake_up(struct task_struct *t, int resume_stopped); 1946 extern void signal_wake_up(struct task_struct *t, int resume_stopped);
1954 1947
1955 /* 1948 /*
1956 * Wrappers for p->thread_info->cpu access. No-op on UP. 1949 * Wrappers for p->thread_info->cpu access. No-op on UP.
1957 */ 1950 */
1958 #ifdef CONFIG_SMP 1951 #ifdef CONFIG_SMP
1959 1952
1960 static inline unsigned int task_cpu(const struct task_struct *p) 1953 static inline unsigned int task_cpu(const struct task_struct *p)
1961 { 1954 {
1962 return task_thread_info(p)->cpu; 1955 return task_thread_info(p)->cpu;
1963 } 1956 }
1964 1957
1965 extern void set_task_cpu(struct task_struct *p, unsigned int cpu); 1958 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
1966 1959
1967 #else 1960 #else
1968 1961
1969 static inline unsigned int task_cpu(const struct task_struct *p) 1962 static inline unsigned int task_cpu(const struct task_struct *p)
1970 { 1963 {
1971 return 0; 1964 return 0;
1972 } 1965 }
1973 1966
1974 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) 1967 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
1975 { 1968 {
1976 } 1969 }
1977 1970
1978 #endif /* CONFIG_SMP */ 1971 #endif /* CONFIG_SMP */
1979 1972
1980 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 1973 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
1981 extern void arch_pick_mmap_layout(struct mm_struct *mm); 1974 extern void arch_pick_mmap_layout(struct mm_struct *mm);
1982 #else 1975 #else
1983 static inline void arch_pick_mmap_layout(struct mm_struct *mm) 1976 static inline void arch_pick_mmap_layout(struct mm_struct *mm)
1984 { 1977 {
1985 mm->mmap_base = TASK_UNMAPPED_BASE; 1978 mm->mmap_base = TASK_UNMAPPED_BASE;
1986 mm->get_unmapped_area = arch_get_unmapped_area; 1979 mm->get_unmapped_area = arch_get_unmapped_area;
1987 mm->unmap_area = arch_unmap_area; 1980 mm->unmap_area = arch_unmap_area;
1988 } 1981 }
1989 #endif 1982 #endif
1990 1983
1991 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); 1984 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
1992 extern long sched_getaffinity(pid_t pid, cpumask_t *mask); 1985 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
1993 1986
1994 extern int sched_mc_power_savings, sched_smt_power_savings; 1987 extern int sched_mc_power_savings, sched_smt_power_savings;
1995 1988
1996 extern void normalize_rt_tasks(void); 1989 extern void normalize_rt_tasks(void);
1997 1990
1998 #ifdef CONFIG_FAIR_GROUP_SCHED 1991 #ifdef CONFIG_FAIR_GROUP_SCHED
1999 1992
2000 extern struct task_group init_task_group; 1993 extern struct task_group init_task_group;
2001 1994
2002 extern struct task_group *sched_create_group(void); 1995 extern struct task_group *sched_create_group(void);
2003 extern void sched_destroy_group(struct task_group *tg); 1996 extern void sched_destroy_group(struct task_group *tg);
2004 extern void sched_move_task(struct task_struct *tsk); 1997 extern void sched_move_task(struct task_struct *tsk);
2005 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 1998 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
2006 extern unsigned long sched_group_shares(struct task_group *tg); 1999 extern unsigned long sched_group_shares(struct task_group *tg);
2007 2000
2008 #endif 2001 #endif
2009 2002
2010 #ifdef CONFIG_TASK_XACCT 2003 #ifdef CONFIG_TASK_XACCT
2011 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 2004 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
2012 { 2005 {
2013 tsk->rchar += amt; 2006 tsk->rchar += amt;
2014 } 2007 }
2015 2008
2016 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 2009 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
2017 { 2010 {
2018 tsk->wchar += amt; 2011 tsk->wchar += amt;
2019 } 2012 }
2020 2013
2021 static inline void inc_syscr(struct task_struct *tsk) 2014 static inline void inc_syscr(struct task_struct *tsk)
2022 { 2015 {
2023 tsk->syscr++; 2016 tsk->syscr++;
2024 } 2017 }
2025 2018
2026 static inline void inc_syscw(struct task_struct *tsk) 2019 static inline void inc_syscw(struct task_struct *tsk)
2027 { 2020 {
2028 tsk->syscw++; 2021 tsk->syscw++;
2029 } 2022 }
2030 #else 2023 #else
2031 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 2024 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
2032 { 2025 {
2033 } 2026 }
2034 2027
2035 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 2028 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
2036 { 2029 {
2037 } 2030 }
2038 2031
2039 static inline void inc_syscr(struct task_struct *tsk) 2032 static inline void inc_syscr(struct task_struct *tsk)
2040 { 2033 {
2041 } 2034 }
2042 2035
2043 static inline void inc_syscw(struct task_struct *tsk) 2036 static inline void inc_syscw(struct task_struct *tsk)
2044 { 2037 {
2045 } 2038 }
2046 #endif 2039 #endif
2047 2040
2048 #ifdef CONFIG_SMP 2041 #ifdef CONFIG_SMP
2049 void migration_init(void); 2042 void migration_init(void);
2050 #else 2043 #else
2051 static inline void migration_init(void) 2044 static inline void migration_init(void)
2052 { 2045 {
2053 } 2046 }
2054 #endif 2047 #endif
2055 2048
2056 #endif /* __KERNEL__ */ 2049 #endif /* __KERNEL__ */
include/linux/spinlock.h
1 #ifndef __LINUX_SPINLOCK_H 1 #ifndef __LINUX_SPINLOCK_H
2 #define __LINUX_SPINLOCK_H 2 #define __LINUX_SPINLOCK_H
3 3
4 /* 4 /*
5 * include/linux/spinlock.h - generic spinlock/rwlock declarations 5 * include/linux/spinlock.h - generic spinlock/rwlock declarations
6 * 6 *
7 * here's the role of the various spinlock/rwlock related include files: 7 * here's the role of the various spinlock/rwlock related include files:
8 * 8 *
9 * on SMP builds: 9 * on SMP builds:
10 * 10 *
11 * asm/spinlock_types.h: contains the raw_spinlock_t/raw_rwlock_t and the 11 * asm/spinlock_types.h: contains the raw_spinlock_t/raw_rwlock_t and the
12 * initializers 12 * initializers
13 * 13 *
14 * linux/spinlock_types.h: 14 * linux/spinlock_types.h:
15 * defines the generic type and initializers 15 * defines the generic type and initializers
16 * 16 *
17 * asm/spinlock.h: contains the __raw_spin_*()/etc. lowlevel 17 * asm/spinlock.h: contains the __raw_spin_*()/etc. lowlevel
18 * implementations, mostly inline assembly code 18 * implementations, mostly inline assembly code
19 * 19 *
20 * (also included on UP-debug builds:) 20 * (also included on UP-debug builds:)
21 * 21 *
22 * linux/spinlock_api_smp.h: 22 * linux/spinlock_api_smp.h:
23 * contains the prototypes for the _spin_*() APIs. 23 * contains the prototypes for the _spin_*() APIs.
24 * 24 *
25 * linux/spinlock.h: builds the final spin_*() APIs. 25 * linux/spinlock.h: builds the final spin_*() APIs.
26 * 26 *
27 * on UP builds: 27 * on UP builds:
28 * 28 *
29 * linux/spinlock_type_up.h: 29 * linux/spinlock_type_up.h:
30 * contains the generic, simplified UP spinlock type. 30 * contains the generic, simplified UP spinlock type.
31 * (which is an empty structure on non-debug builds) 31 * (which is an empty structure on non-debug builds)
32 * 32 *
33 * linux/spinlock_types.h: 33 * linux/spinlock_types.h:
34 * defines the generic type and initializers 34 * defines the generic type and initializers
35 * 35 *
36 * linux/spinlock_up.h: 36 * linux/spinlock_up.h:
37 * contains the __raw_spin_*()/etc. version of UP 37 * contains the __raw_spin_*()/etc. version of UP
38 * builds. (which are NOPs on non-debug, non-preempt 38 * builds. (which are NOPs on non-debug, non-preempt
39 * builds) 39 * builds)
40 * 40 *
41 * (included on UP-non-debug builds:) 41 * (included on UP-non-debug builds:)
42 * 42 *
43 * linux/spinlock_api_up.h: 43 * linux/spinlock_api_up.h:
44 * builds the _spin_*() APIs. 44 * builds the _spin_*() APIs.
45 * 45 *
46 * linux/spinlock.h: builds the final spin_*() APIs. 46 * linux/spinlock.h: builds the final spin_*() APIs.
47 */ 47 */
48 48
49 #include <linux/preempt.h> 49 #include <linux/preempt.h>
50 #include <linux/linkage.h> 50 #include <linux/linkage.h>
51 #include <linux/compiler.h> 51 #include <linux/compiler.h>
52 #include <linux/thread_info.h> 52 #include <linux/thread_info.h>
53 #include <linux/kernel.h> 53 #include <linux/kernel.h>
54 #include <linux/stringify.h> 54 #include <linux/stringify.h>
55 #include <linux/bottom_half.h> 55 #include <linux/bottom_half.h>
56 56
57 #include <asm/system.h> 57 #include <asm/system.h>
58 58
59 /* 59 /*
60 * Must define these before including other files, inline functions need them 60 * Must define these before including other files, inline functions need them
61 */ 61 */
62 #define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME 62 #define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME
63 63
64 #define LOCK_SECTION_START(extra) \ 64 #define LOCK_SECTION_START(extra) \
65 ".subsection 1\n\t" \ 65 ".subsection 1\n\t" \
66 extra \ 66 extra \
67 ".ifndef " LOCK_SECTION_NAME "\n\t" \ 67 ".ifndef " LOCK_SECTION_NAME "\n\t" \
68 LOCK_SECTION_NAME ":\n\t" \ 68 LOCK_SECTION_NAME ":\n\t" \
69 ".endif\n" 69 ".endif\n"
70 70
71 #define LOCK_SECTION_END \ 71 #define LOCK_SECTION_END \
72 ".previous\n\t" 72 ".previous\n\t"
73 73
74 #define __lockfunc fastcall __attribute__((section(".spinlock.text"))) 74 #define __lockfunc fastcall __attribute__((section(".spinlock.text")))
75 75
76 /* 76 /*
77 * Pull the raw_spinlock_t and raw_rwlock_t definitions: 77 * Pull the raw_spinlock_t and raw_rwlock_t definitions:
78 */ 78 */
79 #include <linux/spinlock_types.h> 79 #include <linux/spinlock_types.h>
80 80
81 extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock); 81 extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock);
82 82
83 /* 83 /*
84 * Pull the __raw*() functions/declarations (UP-nondebug doesn't need them): 84 * Pull the __raw*() functions/declarations (UP-nondebug doesn't need them):
85 */ 85 */
86 #ifdef CONFIG_SMP 86 #ifdef CONFIG_SMP
87 # include <asm/spinlock.h> 87 # include <asm/spinlock.h>
88 #else 88 #else
89 # include <linux/spinlock_up.h> 89 # include <linux/spinlock_up.h>
90 #endif 90 #endif
91 91
92 #ifdef CONFIG_DEBUG_SPINLOCK 92 #ifdef CONFIG_DEBUG_SPINLOCK
93 extern void __spin_lock_init(spinlock_t *lock, const char *name, 93 extern void __spin_lock_init(spinlock_t *lock, const char *name,
94 struct lock_class_key *key); 94 struct lock_class_key *key);
95 # define spin_lock_init(lock) \ 95 # define spin_lock_init(lock) \
96 do { \ 96 do { \
97 static struct lock_class_key __key; \ 97 static struct lock_class_key __key; \
98 \ 98 \
99 __spin_lock_init((lock), #lock, &__key); \ 99 __spin_lock_init((lock), #lock, &__key); \
100 } while (0) 100 } while (0)
101 101
102 #else 102 #else
103 # define spin_lock_init(lock) \ 103 # define spin_lock_init(lock) \
104 do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) 104 do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0)
105 #endif 105 #endif
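A small sketch of the usual way this is consumed: embed a spinlock_t in a structure and call spin_lock_init() on it at runtime (the structure here is hypothetical):

struct hypothetical_counter {
	spinlock_t	lock;
	unsigned long	count;
};

static void counter_init(struct hypothetical_counter *c)
{
	spin_lock_init(&c->lock);	/* registers a lock class in the debug case */
	c->count = 0;
}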
106 106
107 #ifdef CONFIG_DEBUG_SPINLOCK 107 #ifdef CONFIG_DEBUG_SPINLOCK
108 extern void __rwlock_init(rwlock_t *lock, const char *name, 108 extern void __rwlock_init(rwlock_t *lock, const char *name,
109 struct lock_class_key *key); 109 struct lock_class_key *key);
110 # define rwlock_init(lock) \ 110 # define rwlock_init(lock) \
111 do { \ 111 do { \
112 static struct lock_class_key __key; \ 112 static struct lock_class_key __key; \
113 \ 113 \
114 __rwlock_init((lock), #lock, &__key); \ 114 __rwlock_init((lock), #lock, &__key); \
115 } while (0) 115 } while (0)
116 #else 116 #else
117 # define rwlock_init(lock) \ 117 # define rwlock_init(lock) \
118 do { *(lock) = RW_LOCK_UNLOCKED; } while (0) 118 do { *(lock) = RW_LOCK_UNLOCKED; } while (0)
119 #endif 119 #endif
120 120
121 #define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) 121 #define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock)
122 122
123 #ifdef CONFIG_GENERIC_LOCKBREAK
124 #define spin_is_contended(lock) ((lock)->break_lock)
125 #else
126 #define spin_is_contended(lock) __raw_spin_is_contended(&(lock)->raw_lock)
127 #endif
128
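For flavour only, a hypothetical __raw_spin_is_contended() for a ticket-style lock; the type and field names below are invented and this is not any architecture's actual implementation:

/* Illustrative only: pack a "next ticket" byte above an "owner" byte; the
 * lock counts as contended when more than one ticket is outstanding. */
typedef struct {
	unsigned int slock;	/* bits 15..8: next ticket, bits 7..0: owner */
} example_raw_spinlock_t;

static inline int example_raw_spin_is_contended(example_raw_spinlock_t *lock)
{
	int tmp = lock->slock;

	return (((tmp >> 8) & 0xff) - (tmp & 0xff)) > 1;
}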
123 /** 129 /**
124 * spin_unlock_wait - wait until the spinlock gets unlocked 130 * spin_unlock_wait - wait until the spinlock gets unlocked
125 * @lock: the spinlock in question. 131 * @lock: the spinlock in question.
126 */ 132 */
127 #define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) 133 #define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock)
128 134
129 /* 135 /*
130 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: 136 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
131 */ 137 */
132 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) 138 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
133 # include <linux/spinlock_api_smp.h> 139 # include <linux/spinlock_api_smp.h>
134 #else 140 #else
135 # include <linux/spinlock_api_up.h> 141 # include <linux/spinlock_api_up.h>
136 #endif 142 #endif
137 143
138 #ifdef CONFIG_DEBUG_SPINLOCK 144 #ifdef CONFIG_DEBUG_SPINLOCK
139 extern void _raw_spin_lock(spinlock_t *lock); 145 extern void _raw_spin_lock(spinlock_t *lock);
140 #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) 146 #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
141 extern int _raw_spin_trylock(spinlock_t *lock); 147 extern int _raw_spin_trylock(spinlock_t *lock);
142 extern void _raw_spin_unlock(spinlock_t *lock); 148 extern void _raw_spin_unlock(spinlock_t *lock);
143 extern void _raw_read_lock(rwlock_t *lock); 149 extern void _raw_read_lock(rwlock_t *lock);
144 extern int _raw_read_trylock(rwlock_t *lock); 150 extern int _raw_read_trylock(rwlock_t *lock);
145 extern void _raw_read_unlock(rwlock_t *lock); 151 extern void _raw_read_unlock(rwlock_t *lock);
146 extern void _raw_write_lock(rwlock_t *lock); 152 extern void _raw_write_lock(rwlock_t *lock);
147 extern int _raw_write_trylock(rwlock_t *lock); 153 extern int _raw_write_trylock(rwlock_t *lock);
148 extern void _raw_write_unlock(rwlock_t *lock); 154 extern void _raw_write_unlock(rwlock_t *lock);
149 #else 155 #else
150 # define _raw_spin_lock(lock) __raw_spin_lock(&(lock)->raw_lock) 156 # define _raw_spin_lock(lock) __raw_spin_lock(&(lock)->raw_lock)
151 # define _raw_spin_lock_flags(lock, flags) \ 157 # define _raw_spin_lock_flags(lock, flags) \
152 __raw_spin_lock_flags(&(lock)->raw_lock, *(flags)) 158 __raw_spin_lock_flags(&(lock)->raw_lock, *(flags))
153 # define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock) 159 # define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock)
154 # define _raw_spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) 160 # define _raw_spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock)
155 # define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock) 161 # define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock)
156 # define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock) 162 # define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock)
157 # define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) 163 # define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock)
158 # define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) 164 # define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock)
159 # define _raw_write_trylock(rwlock) __raw_write_trylock(&(rwlock)->raw_lock) 165 # define _raw_write_trylock(rwlock) __raw_write_trylock(&(rwlock)->raw_lock)
160 # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) 166 # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock)
161 #endif 167 #endif
162 168
163 #define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock) 169 #define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock)
164 #define write_can_lock(rwlock) __raw_write_can_lock(&(rwlock)->raw_lock) 170 #define write_can_lock(rwlock) __raw_write_can_lock(&(rwlock)->raw_lock)
165 171
166 /* 172 /*
167 * Define the various spin_lock and rw_lock methods. Note we define these 173 * Define the various spin_lock and rw_lock methods. Note we define these
168 * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various 174 * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various
169 * methods are defined as nops in the case they are not required. 175 * methods are defined as nops in the case they are not required.
170 */ 176 */
171 #define spin_trylock(lock) __cond_lock(lock, _spin_trylock(lock)) 177 #define spin_trylock(lock) __cond_lock(lock, _spin_trylock(lock))
172 #define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) 178 #define read_trylock(lock) __cond_lock(lock, _read_trylock(lock))
173 #define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) 179 #define write_trylock(lock) __cond_lock(lock, _write_trylock(lock))
174 180
175 #define spin_lock(lock) _spin_lock(lock) 181 #define spin_lock(lock) _spin_lock(lock)
176 182
177 #ifdef CONFIG_DEBUG_LOCK_ALLOC 183 #ifdef CONFIG_DEBUG_LOCK_ALLOC
178 # define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) 184 # define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass)
179 #else 185 #else
180 # define spin_lock_nested(lock, subclass) _spin_lock(lock) 186 # define spin_lock_nested(lock, subclass) _spin_lock(lock)
181 #endif 187 #endif
182 188
183 #define write_lock(lock) _write_lock(lock) 189 #define write_lock(lock) _write_lock(lock)
184 #define read_lock(lock) _read_lock(lock) 190 #define read_lock(lock) _read_lock(lock)
185 191
186 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) 192 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
187 193
188 #define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock) 194 #define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock)
189 #define read_lock_irqsave(lock, flags) flags = _read_lock_irqsave(lock) 195 #define read_lock_irqsave(lock, flags) flags = _read_lock_irqsave(lock)
190 #define write_lock_irqsave(lock, flags) flags = _write_lock_irqsave(lock) 196 #define write_lock_irqsave(lock, flags) flags = _write_lock_irqsave(lock)
191 197
192 #ifdef CONFIG_DEBUG_LOCK_ALLOC 198 #ifdef CONFIG_DEBUG_LOCK_ALLOC
193 #define spin_lock_irqsave_nested(lock, flags, subclass) \ 199 #define spin_lock_irqsave_nested(lock, flags, subclass) \
194 flags = _spin_lock_irqsave_nested(lock, subclass) 200 flags = _spin_lock_irqsave_nested(lock, subclass)
195 #else 201 #else
196 #define spin_lock_irqsave_nested(lock, flags, subclass) \ 202 #define spin_lock_irqsave_nested(lock, flags, subclass) \
197 flags = _spin_lock_irqsave(lock) 203 flags = _spin_lock_irqsave(lock)
198 #endif 204 #endif
199 205
200 #else 206 #else
201 207
202 #define spin_lock_irqsave(lock, flags) _spin_lock_irqsave(lock, flags) 208 #define spin_lock_irqsave(lock, flags) _spin_lock_irqsave(lock, flags)
203 #define read_lock_irqsave(lock, flags) _read_lock_irqsave(lock, flags) 209 #define read_lock_irqsave(lock, flags) _read_lock_irqsave(lock, flags)
204 #define write_lock_irqsave(lock, flags) _write_lock_irqsave(lock, flags) 210 #define write_lock_irqsave(lock, flags) _write_lock_irqsave(lock, flags)
205 #define spin_lock_irqsave_nested(lock, flags, subclass) \ 211 #define spin_lock_irqsave_nested(lock, flags, subclass) \
206 spin_lock_irqsave(lock, flags) 212 spin_lock_irqsave(lock, flags)
207 213
208 #endif 214 #endif
209 215
210 #define spin_lock_irq(lock) _spin_lock_irq(lock) 216 #define spin_lock_irq(lock) _spin_lock_irq(lock)
211 #define spin_lock_bh(lock) _spin_lock_bh(lock) 217 #define spin_lock_bh(lock) _spin_lock_bh(lock)
212 218
213 #define read_lock_irq(lock) _read_lock_irq(lock) 219 #define read_lock_irq(lock) _read_lock_irq(lock)
214 #define read_lock_bh(lock) _read_lock_bh(lock) 220 #define read_lock_bh(lock) _read_lock_bh(lock)
215 221
216 #define write_lock_irq(lock) _write_lock_irq(lock) 222 #define write_lock_irq(lock) _write_lock_irq(lock)
217 #define write_lock_bh(lock) _write_lock_bh(lock) 223 #define write_lock_bh(lock) _write_lock_bh(lock)
218 224
219 /* 225 /*
220 * We inline the unlock functions in the nondebug case: 226 * We inline the unlock functions in the nondebug case:
221 */ 227 */
222 #if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ 228 #if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \
223 !defined(CONFIG_SMP) 229 !defined(CONFIG_SMP)
224 # define spin_unlock(lock) _spin_unlock(lock) 230 # define spin_unlock(lock) _spin_unlock(lock)
225 # define read_unlock(lock) _read_unlock(lock) 231 # define read_unlock(lock) _read_unlock(lock)
226 # define write_unlock(lock) _write_unlock(lock) 232 # define write_unlock(lock) _write_unlock(lock)
227 # define spin_unlock_irq(lock) _spin_unlock_irq(lock) 233 # define spin_unlock_irq(lock) _spin_unlock_irq(lock)
228 # define read_unlock_irq(lock) _read_unlock_irq(lock) 234 # define read_unlock_irq(lock) _read_unlock_irq(lock)
229 # define write_unlock_irq(lock) _write_unlock_irq(lock) 235 # define write_unlock_irq(lock) _write_unlock_irq(lock)
230 #else 236 #else
231 # define spin_unlock(lock) \ 237 # define spin_unlock(lock) \
232 do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0) 238 do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0)
233 # define read_unlock(lock) \ 239 # define read_unlock(lock) \
234 do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0) 240 do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0)
235 # define write_unlock(lock) \ 241 # define write_unlock(lock) \
236 do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0) 242 do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0)
237 # define spin_unlock_irq(lock) \ 243 # define spin_unlock_irq(lock) \
238 do { \ 244 do { \
239 __raw_spin_unlock(&(lock)->raw_lock); \ 245 __raw_spin_unlock(&(lock)->raw_lock); \
240 __release(lock); \ 246 __release(lock); \
241 local_irq_enable(); \ 247 local_irq_enable(); \
242 } while (0) 248 } while (0)
243 # define read_unlock_irq(lock) \ 249 # define read_unlock_irq(lock) \
244 do { \ 250 do { \
245 __raw_read_unlock(&(lock)->raw_lock); \ 251 __raw_read_unlock(&(lock)->raw_lock); \
246 __release(lock); \ 252 __release(lock); \
247 local_irq_enable(); \ 253 local_irq_enable(); \
248 } while (0) 254 } while (0)
249 # define write_unlock_irq(lock) \ 255 # define write_unlock_irq(lock) \
250 do { \ 256 do { \
251 __raw_write_unlock(&(lock)->raw_lock); \ 257 __raw_write_unlock(&(lock)->raw_lock); \
252 __release(lock); \ 258 __release(lock); \
253 local_irq_enable(); \ 259 local_irq_enable(); \
254 } while (0) 260 } while (0)
255 #endif 261 #endif
256 262
257 #define spin_unlock_irqrestore(lock, flags) \ 263 #define spin_unlock_irqrestore(lock, flags) \
258 _spin_unlock_irqrestore(lock, flags) 264 _spin_unlock_irqrestore(lock, flags)
259 #define spin_unlock_bh(lock) _spin_unlock_bh(lock) 265 #define spin_unlock_bh(lock) _spin_unlock_bh(lock)
260 266
261 #define read_unlock_irqrestore(lock, flags) \ 267 #define read_unlock_irqrestore(lock, flags) \
262 _read_unlock_irqrestore(lock, flags) 268 _read_unlock_irqrestore(lock, flags)
263 #define read_unlock_bh(lock) _read_unlock_bh(lock) 269 #define read_unlock_bh(lock) _read_unlock_bh(lock)
264 270
265 #define write_unlock_irqrestore(lock, flags) \ 271 #define write_unlock_irqrestore(lock, flags) \
266 _write_unlock_irqrestore(lock, flags) 272 _write_unlock_irqrestore(lock, flags)
267 #define write_unlock_bh(lock) _write_unlock_bh(lock) 273 #define write_unlock_bh(lock) _write_unlock_bh(lock)
268 274
269 #define spin_trylock_bh(lock) __cond_lock(lock, _spin_trylock_bh(lock)) 275 #define spin_trylock_bh(lock) __cond_lock(lock, _spin_trylock_bh(lock))
270 276
271 #define spin_trylock_irq(lock) \ 277 #define spin_trylock_irq(lock) \
272 ({ \ 278 ({ \
273 local_irq_disable(); \ 279 local_irq_disable(); \
274 spin_trylock(lock) ? \ 280 spin_trylock(lock) ? \
275 1 : ({ local_irq_enable(); 0; }); \ 281 1 : ({ local_irq_enable(); 0; }); \
276 }) 282 })
277 283
278 #define spin_trylock_irqsave(lock, flags) \ 284 #define spin_trylock_irqsave(lock, flags) \
279 ({ \ 285 ({ \
280 local_irq_save(flags); \ 286 local_irq_save(flags); \
281 spin_trylock(lock) ? \ 287 spin_trylock(lock) ? \
282 1 : ({ local_irq_restore(flags); 0; }); \ 288 1 : ({ local_irq_restore(flags); 0; }); \
283 }) 289 })
284 290
285 #define write_trylock_irqsave(lock, flags) \ 291 #define write_trylock_irqsave(lock, flags) \
286 ({ \ 292 ({ \
287 local_irq_save(flags); \ 293 local_irq_save(flags); \
288 write_trylock(lock) ? \ 294 write_trylock(lock) ? \
289 1 : ({ local_irq_restore(flags); 0; }); \ 295 1 : ({ local_irq_restore(flags); 0; }); \
290 }) 296 })
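The three conditional-lock wrappers above evaluate to 1 when the lock was taken and 0 otherwise, undoing the interrupt-state change on the failure path. A minimal usage sketch of spin_trylock_irqsave() follows; the device structure and the -EBUSY fallback are hypothetical, only the locking calls come from the API above.

#include <linux/spinlock.h>
#include <linux/errno.h>

/* Hypothetical per-device structure, for illustration only. */
struct mydev {
	spinlock_t lock;
	unsigned int pending;
};

static int mydev_try_flush(struct mydev *dev)
{
	unsigned long flags;

	/* Take the lock with interrupts saved only if it is uncontended. */
	if (!spin_trylock_irqsave(&dev->lock, flags))
		return -EBUSY;	/* caller retries later */

	dev->pending = 0;
	spin_unlock_irqrestore(&dev->lock, flags);
	return 0;
}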
291 297
292 /* 298 /*
293 * Locks two spinlocks l1 and l2. 299 * Locks two spinlocks l1 and l2.
294 * l1_first indicates if spinlock l1 should be taken first. 300 * l1_first indicates if spinlock l1 should be taken first.
295 */ 301 */
296 static inline void double_spin_lock(spinlock_t *l1, spinlock_t *l2, 302 static inline void double_spin_lock(spinlock_t *l1, spinlock_t *l2,
297 bool l1_first) 303 bool l1_first)
298 __acquires(l1) 304 __acquires(l1)
299 __acquires(l2) 305 __acquires(l2)
300 { 306 {
301 if (l1_first) { 307 if (l1_first) {
302 spin_lock(l1); 308 spin_lock(l1);
303 spin_lock(l2); 309 spin_lock(l2);
304 } else { 310 } else {
305 spin_lock(l2); 311 spin_lock(l2);
306 spin_lock(l1); 312 spin_lock(l1);
307 } 313 }
308 } 314 }
309 315
310 /* 316 /*
311 * Unlocks two spinlocks l1 and l2. 317 * Unlocks two spinlocks l1 and l2.
312 * l1_taken_first indicates if spinlock l1 was taken first and therefore 318 * l1_taken_first indicates if spinlock l1 was taken first and therefore
313 * should be released after spinlock l2. 319 * should be released after spinlock l2.
314 */ 320 */
315 static inline void double_spin_unlock(spinlock_t *l1, spinlock_t *l2, 321 static inline void double_spin_unlock(spinlock_t *l1, spinlock_t *l2,
316 bool l1_taken_first) 322 bool l1_taken_first)
317 __releases(l1) 323 __releases(l1)
318 __releases(l2) 324 __releases(l2)
319 { 325 {
320 if (l1_taken_first) { 326 if (l1_taken_first) {
321 spin_unlock(l2); 327 spin_unlock(l2);
322 spin_unlock(l1); 328 spin_unlock(l1);
323 } else { 329 } else {
324 spin_unlock(l1); 330 spin_unlock(l1);
325 spin_unlock(l2); 331 spin_unlock(l2);
326 } 332 }
327 } 333 }
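double_spin_lock() and double_spin_unlock() are meant to be used as a pair, with the same ordering flag passed to both so that the unlock order mirrors the lock order. A sketch under the usual convention of ordering by address (the counter structure and transfer operation are made up for illustration):

#include <linux/spinlock.h>
#include <linux/types.h>

/* Hypothetical counters, each protected by its own lock. */
struct counter {
	spinlock_t lock;
	unsigned long value;
};

static void counter_transfer(struct counter *a, struct counter *b,
			     unsigned long amount)
{
	/* Pick a consistent order (by address) to avoid ABBA deadlocks. */
	bool a_first = a < b;

	double_spin_lock(&a->lock, &b->lock, a_first);
	a->value -= amount;
	b->value += amount;
	double_spin_unlock(&a->lock, &b->lock, a_first);
}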
328 334
329 /* 335 /*
330 * Pull the atomic_t declaration: 336 * Pull the atomic_t declaration:
331 * (asm-mips/atomic.h needs above definitions) 337 * (asm-mips/atomic.h needs above definitions)
332 */ 338 */
333 #include <asm/atomic.h> 339 #include <asm/atomic.h>
334 /** 340 /**
335 * atomic_dec_and_lock - lock on reaching reference count zero 341 * atomic_dec_and_lock - lock on reaching reference count zero
336 * @atomic: the atomic counter 342 * @atomic: the atomic counter
337 * @lock: the spinlock in question 343 * @lock: the spinlock in question
338 */ 344 */
339 extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); 345 extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
340 #define atomic_dec_and_lock(atomic, lock) \ 346 #define atomic_dec_and_lock(atomic, lock) \
341 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) 347 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
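atomic_dec_and_lock() returns nonzero, with the lock held, only when the decrement brings the counter to zero; otherwise the lock is not taken. The classic pattern is dropping the last reference to an object that also sits on a locked list. A sketch, assuming a hypothetical refcounted object and list lock:

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/slab.h>

/* Hypothetical refcounted object kept on a global list. */
struct item {
	atomic_t refcount;
	struct list_head node;
};

static DEFINE_SPINLOCK(item_list_lock);

static void item_put(struct item *it)
{
	/* The list lock is acquired only if this was the last reference. */
	if (atomic_dec_and_lock(&it->refcount, &item_list_lock)) {
		list_del(&it->node);
		spin_unlock(&item_list_lock);
		kfree(it);
	}
}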
342 348
343 /** 349 /**
344 * spin_can_lock - would spin_trylock() succeed? 350 * spin_can_lock - would spin_trylock() succeed?
345 * @lock: the spinlock in question. 351 * @lock: the spinlock in question.
346 */ 352 */
347 #define spin_can_lock(lock) (!spin_is_locked(lock)) 353 #define spin_can_lock(lock) (!spin_is_locked(lock))
348 354
349 #endif /* __LINUX_SPINLOCK_H */ 355 #endif /* __LINUX_SPINLOCK_H */
350 356
include/linux/spinlock_types.h
1 #ifndef __LINUX_SPINLOCK_TYPES_H 1 #ifndef __LINUX_SPINLOCK_TYPES_H
2 #define __LINUX_SPINLOCK_TYPES_H 2 #define __LINUX_SPINLOCK_TYPES_H
3 3
4 /* 4 /*
5 * include/linux/spinlock_types.h - generic spinlock type definitions 5 * include/linux/spinlock_types.h - generic spinlock type definitions
6 * and initializers 6 * and initializers
7 * 7 *
8 * portions Copyright 2005, Red Hat, Inc., Ingo Molnar 8 * portions Copyright 2005, Red Hat, Inc., Ingo Molnar
9 * Released under the General Public License (GPL). 9 * Released under the General Public License (GPL).
10 */ 10 */
11 11
12 #if defined(CONFIG_SMP) 12 #if defined(CONFIG_SMP)
13 # include <asm/spinlock_types.h> 13 # include <asm/spinlock_types.h>
14 #else 14 #else
15 # include <linux/spinlock_types_up.h> 15 # include <linux/spinlock_types_up.h>
16 #endif 16 #endif
17 17
18 #include <linux/lockdep.h> 18 #include <linux/lockdep.h>
19 19
20 typedef struct { 20 typedef struct {
21 raw_spinlock_t raw_lock; 21 raw_spinlock_t raw_lock;
22 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) 22 #ifdef CONFIG_GENERIC_LOCKBREAK
23 unsigned int break_lock; 23 unsigned int break_lock;
24 #endif 24 #endif
25 #ifdef CONFIG_DEBUG_SPINLOCK 25 #ifdef CONFIG_DEBUG_SPINLOCK
26 unsigned int magic, owner_cpu; 26 unsigned int magic, owner_cpu;
27 void *owner; 27 void *owner;
28 #endif 28 #endif
29 #ifdef CONFIG_DEBUG_LOCK_ALLOC 29 #ifdef CONFIG_DEBUG_LOCK_ALLOC
30 struct lockdep_map dep_map; 30 struct lockdep_map dep_map;
31 #endif 31 #endif
32 } spinlock_t; 32 } spinlock_t;
33 33
34 #define SPINLOCK_MAGIC 0xdead4ead 34 #define SPINLOCK_MAGIC 0xdead4ead
35 35
36 typedef struct { 36 typedef struct {
37 raw_rwlock_t raw_lock; 37 raw_rwlock_t raw_lock;
38 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) 38 #ifdef CONFIG_GENERIC_LOCKBREAK
39 unsigned int break_lock; 39 unsigned int break_lock;
40 #endif 40 #endif
41 #ifdef CONFIG_DEBUG_SPINLOCK 41 #ifdef CONFIG_DEBUG_SPINLOCK
42 unsigned int magic, owner_cpu; 42 unsigned int magic, owner_cpu;
43 void *owner; 43 void *owner;
44 #endif 44 #endif
45 #ifdef CONFIG_DEBUG_LOCK_ALLOC 45 #ifdef CONFIG_DEBUG_LOCK_ALLOC
46 struct lockdep_map dep_map; 46 struct lockdep_map dep_map;
47 #endif 47 #endif
48 } rwlock_t; 48 } rwlock_t;
49 49
50 #define RWLOCK_MAGIC 0xdeaf1eed 50 #define RWLOCK_MAGIC 0xdeaf1eed
51 51
52 #define SPINLOCK_OWNER_INIT ((void *)-1L) 52 #define SPINLOCK_OWNER_INIT ((void *)-1L)
53 53
54 #ifdef CONFIG_DEBUG_LOCK_ALLOC 54 #ifdef CONFIG_DEBUG_LOCK_ALLOC
55 # define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } 55 # define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
56 #else 56 #else
57 # define SPIN_DEP_MAP_INIT(lockname) 57 # define SPIN_DEP_MAP_INIT(lockname)
58 #endif 58 #endif
59 59
60 #ifdef CONFIG_DEBUG_LOCK_ALLOC 60 #ifdef CONFIG_DEBUG_LOCK_ALLOC
61 # define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } 61 # define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
62 #else 62 #else
63 # define RW_DEP_MAP_INIT(lockname) 63 # define RW_DEP_MAP_INIT(lockname)
64 #endif 64 #endif
65 65
66 #ifdef CONFIG_DEBUG_SPINLOCK 66 #ifdef CONFIG_DEBUG_SPINLOCK
67 # define __SPIN_LOCK_UNLOCKED(lockname) \ 67 # define __SPIN_LOCK_UNLOCKED(lockname) \
68 (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ 68 (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \
69 .magic = SPINLOCK_MAGIC, \ 69 .magic = SPINLOCK_MAGIC, \
70 .owner = SPINLOCK_OWNER_INIT, \ 70 .owner = SPINLOCK_OWNER_INIT, \
71 .owner_cpu = -1, \ 71 .owner_cpu = -1, \
72 SPIN_DEP_MAP_INIT(lockname) } 72 SPIN_DEP_MAP_INIT(lockname) }
73 #define __RW_LOCK_UNLOCKED(lockname) \ 73 #define __RW_LOCK_UNLOCKED(lockname) \
74 (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ 74 (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \
75 .magic = RWLOCK_MAGIC, \ 75 .magic = RWLOCK_MAGIC, \
76 .owner = SPINLOCK_OWNER_INIT, \ 76 .owner = SPINLOCK_OWNER_INIT, \
77 .owner_cpu = -1, \ 77 .owner_cpu = -1, \
78 RW_DEP_MAP_INIT(lockname) } 78 RW_DEP_MAP_INIT(lockname) }
79 #else 79 #else
80 # define __SPIN_LOCK_UNLOCKED(lockname) \ 80 # define __SPIN_LOCK_UNLOCKED(lockname) \
81 (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ 81 (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \
82 SPIN_DEP_MAP_INIT(lockname) } 82 SPIN_DEP_MAP_INIT(lockname) }
83 #define __RW_LOCK_UNLOCKED(lockname) \ 83 #define __RW_LOCK_UNLOCKED(lockname) \
84 (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ 84 (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \
85 RW_DEP_MAP_INIT(lockname) } 85 RW_DEP_MAP_INIT(lockname) }
86 #endif 86 #endif
87 87
88 /* 88 /*
89 * SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED defeat lockdep state tracking and 89 * SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED defeat lockdep state tracking and
90 * are hence deprecated. 90 * are hence deprecated.
91 * Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or 91 * Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or
92 * __SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate. 92 * __SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate.
93 */ 93 */
94 #define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) 94 #define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init)
95 #define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) 95 #define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init)
96 96
97 #define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) 97 #define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
98 #define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) 98 #define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x)
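For reference, locks built from these definitions are initialized in one of two ways: statically via DEFINE_SPINLOCK()/DEFINE_RWLOCK(), or at runtime via spin_lock_init()/rwlock_init() for locks embedded in dynamically allocated objects. A brief sketch (the struct name is made up):

#include <linux/spinlock.h>

/* Statically defined locks pick up the debug/lockdep fields above. */
static DEFINE_SPINLOCK(stats_lock);
static DEFINE_RWLOCK(config_lock);

/* Hypothetical dynamically allocated object embedding a lock. */
struct session {
	spinlock_t lock;
	int users;
};

static void session_setup(struct session *s)
{
	spin_lock_init(&s->lock);	/* runtime equivalent of DEFINE_SPINLOCK */
	s->users = 0;
}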
99 99
100 #endif /* __LINUX_SPINLOCK_TYPES_H */ 100 #endif /* __LINUX_SPINLOCK_TYPES_H */
101 101
include/linux/spinlock_up.h
1 #ifndef __LINUX_SPINLOCK_UP_H 1 #ifndef __LINUX_SPINLOCK_UP_H
2 #define __LINUX_SPINLOCK_UP_H 2 #define __LINUX_SPINLOCK_UP_H
3 3
4 #ifndef __LINUX_SPINLOCK_H 4 #ifndef __LINUX_SPINLOCK_H
5 # error "please don't include this file directly" 5 # error "please don't include this file directly"
6 #endif 6 #endif
7 7
8 /* 8 /*
9 * include/linux/spinlock_up.h - UP-debug version of spinlocks. 9 * include/linux/spinlock_up.h - UP-debug version of spinlocks.
10 * 10 *
11 * portions Copyright 2005, Red Hat, Inc., Ingo Molnar 11 * portions Copyright 2005, Red Hat, Inc., Ingo Molnar
12 * Released under the General Public License (GPL). 12 * Released under the General Public License (GPL).
13 * 13 *
14 * In the debug case, 1 means unlocked, 0 means locked. (the values 14 * In the debug case, 1 means unlocked, 0 means locked. (the values
15 * are inverted, to catch initialization bugs) 15 * are inverted, to catch initialization bugs)
16 * 16 *
17 * No atomicity anywhere, we are on UP. 17 * No atomicity anywhere, we are on UP.
18 */ 18 */
19 19
20 #ifdef CONFIG_DEBUG_SPINLOCK 20 #ifdef CONFIG_DEBUG_SPINLOCK
21 #define __raw_spin_is_locked(x) ((x)->slock == 0) 21 #define __raw_spin_is_locked(x) ((x)->slock == 0)
22 22
23 static inline void __raw_spin_lock(raw_spinlock_t *lock) 23 static inline void __raw_spin_lock(raw_spinlock_t *lock)
24 { 24 {
25 lock->slock = 0; 25 lock->slock = 0;
26 } 26 }
27 27
28 static inline void 28 static inline void
29 __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) 29 __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
30 { 30 {
31 local_irq_save(flags); 31 local_irq_save(flags);
32 lock->slock = 0; 32 lock->slock = 0;
33 } 33 }
34 34
35 static inline int __raw_spin_trylock(raw_spinlock_t *lock) 35 static inline int __raw_spin_trylock(raw_spinlock_t *lock)
36 { 36 {
37 char oldval = lock->slock; 37 char oldval = lock->slock;
38 38
39 lock->slock = 0; 39 lock->slock = 0;
40 40
41 return oldval > 0; 41 return oldval > 0;
42 } 42 }
43 43
44 static inline void __raw_spin_unlock(raw_spinlock_t *lock) 44 static inline void __raw_spin_unlock(raw_spinlock_t *lock)
45 { 45 {
46 lock->slock = 1; 46 lock->slock = 1;
47 } 47 }
48 48
49 /* 49 /*
50 * Read-write spinlocks. No debug version. 50 * Read-write spinlocks. No debug version.
51 */ 51 */
52 #define __raw_read_lock(lock) do { (void)(lock); } while (0) 52 #define __raw_read_lock(lock) do { (void)(lock); } while (0)
53 #define __raw_write_lock(lock) do { (void)(lock); } while (0) 53 #define __raw_write_lock(lock) do { (void)(lock); } while (0)
54 #define __raw_read_trylock(lock) ({ (void)(lock); 1; }) 54 #define __raw_read_trylock(lock) ({ (void)(lock); 1; })
55 #define __raw_write_trylock(lock) ({ (void)(lock); 1; }) 55 #define __raw_write_trylock(lock) ({ (void)(lock); 1; })
56 #define __raw_read_unlock(lock) do { (void)(lock); } while (0) 56 #define __raw_read_unlock(lock) do { (void)(lock); } while (0)
57 #define __raw_write_unlock(lock) do { (void)(lock); } while (0) 57 #define __raw_write_unlock(lock) do { (void)(lock); } while (0)
58 58
59 #else /* DEBUG_SPINLOCK */ 59 #else /* DEBUG_SPINLOCK */
60 #define __raw_spin_is_locked(lock) ((void)(lock), 0) 60 #define __raw_spin_is_locked(lock) ((void)(lock), 0)
61 /* for sched.c and kernel_lock.c: */ 61 /* for sched.c and kernel_lock.c: */
62 # define __raw_spin_lock(lock) do { (void)(lock); } while (0) 62 # define __raw_spin_lock(lock) do { (void)(lock); } while (0)
63 # define __raw_spin_unlock(lock) do { (void)(lock); } while (0) 63 # define __raw_spin_unlock(lock) do { (void)(lock); } while (0)
64 # define __raw_spin_trylock(lock) ({ (void)(lock); 1; }) 64 # define __raw_spin_trylock(lock) ({ (void)(lock); 1; })
65 #endif /* DEBUG_SPINLOCK */ 65 #endif /* DEBUG_SPINLOCK */
66 66
67 #define __raw_spin_is_contended(lock) (((void)(lock), 0))
68
67 #define __raw_read_can_lock(lock) (((void)(lock), 1)) 69 #define __raw_read_can_lock(lock) (((void)(lock), 1))
68 #define __raw_write_can_lock(lock) (((void)(lock), 1)) 70 #define __raw_write_can_lock(lock) (((void)(lock), 1))
69 71
70 #define __raw_spin_unlock_wait(lock) \ 72 #define __raw_spin_unlock_wait(lock) \
71 do { cpu_relax(); } while (__raw_spin_is_locked(lock)) 73 do { cpu_relax(); } while (__raw_spin_is_locked(lock))
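On UP a spinlock can never be observed as contended from the owning CPU, so the new __raw_spin_is_contended() stub above always evaluates to 0 while still referencing its argument to avoid unused-variable warnings. The generic wrapper in spinlock.h (outside this hunk) can be expected to simply forward to the raw operation, roughly along these lines (a sketch, not the exact hunk):

/* Sketch of the generic wrapper built on the raw op above. */
#define spin_is_contended(lock)	__raw_spin_is_contended(&(lock)->raw_lock)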
72 74
73 #endif /* __LINUX_SPINLOCK_UP_H */ 75 #endif /* __LINUX_SPINLOCK_UP_H */
74 76
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz 26 * Thomas Gleixner, Mike Kravetz
27 */ 27 */
28 28
29 #include <linux/mm.h> 29 #include <linux/mm.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/nmi.h> 31 #include <linux/nmi.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/uaccess.h> 33 #include <linux/uaccess.h>
34 #include <linux/highmem.h> 34 #include <linux/highmem.h>
35 #include <linux/smp_lock.h> 35 #include <linux/smp_lock.h>
36 #include <asm/mmu_context.h> 36 #include <asm/mmu_context.h>
37 #include <linux/interrupt.h> 37 #include <linux/interrupt.h>
38 #include <linux/capability.h> 38 #include <linux/capability.h>
39 #include <linux/completion.h> 39 #include <linux/completion.h>
40 #include <linux/kernel_stat.h> 40 #include <linux/kernel_stat.h>
41 #include <linux/debug_locks.h> 41 #include <linux/debug_locks.h>
42 #include <linux/security.h> 42 #include <linux/security.h>
43 #include <linux/notifier.h> 43 #include <linux/notifier.h>
44 #include <linux/profile.h> 44 #include <linux/profile.h>
45 #include <linux/freezer.h> 45 #include <linux/freezer.h>
46 #include <linux/vmalloc.h> 46 #include <linux/vmalloc.h>
47 #include <linux/blkdev.h> 47 #include <linux/blkdev.h>
48 #include <linux/delay.h> 48 #include <linux/delay.h>
49 #include <linux/pid_namespace.h> 49 #include <linux/pid_namespace.h>
50 #include <linux/smp.h> 50 #include <linux/smp.h>
51 #include <linux/threads.h> 51 #include <linux/threads.h>
52 #include <linux/timer.h> 52 #include <linux/timer.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/cpu.h> 54 #include <linux/cpu.h>
55 #include <linux/cpuset.h> 55 #include <linux/cpuset.h>
56 #include <linux/percpu.h> 56 #include <linux/percpu.h>
57 #include <linux/kthread.h> 57 #include <linux/kthread.h>
58 #include <linux/seq_file.h> 58 #include <linux/seq_file.h>
59 #include <linux/sysctl.h> 59 #include <linux/sysctl.h>
60 #include <linux/syscalls.h> 60 #include <linux/syscalls.h>
61 #include <linux/times.h> 61 #include <linux/times.h>
62 #include <linux/tsacct_kern.h> 62 #include <linux/tsacct_kern.h>
63 #include <linux/kprobes.h> 63 #include <linux/kprobes.h>
64 #include <linux/delayacct.h> 64 #include <linux/delayacct.h>
65 #include <linux/reciprocal_div.h> 65 #include <linux/reciprocal_div.h>
66 #include <linux/unistd.h> 66 #include <linux/unistd.h>
67 #include <linux/pagemap.h> 67 #include <linux/pagemap.h>
68 #include <linux/hrtimer.h> 68 #include <linux/hrtimer.h>
69 69
70 #include <asm/tlb.h> 70 #include <asm/tlb.h>
71 #include <asm/irq_regs.h> 71 #include <asm/irq_regs.h>
72 72
73 /* 73 /*
74 * Scheduler clock - returns current time in nanosec units. 74 * Scheduler clock - returns current time in nanosec units.
75 * This is default implementation. 75 * This is default implementation.
76 * Architectures and sub-architectures can override this. 76 * Architectures and sub-architectures can override this.
77 */ 77 */
78 unsigned long long __attribute__((weak)) sched_clock(void) 78 unsigned long long __attribute__((weak)) sched_clock(void)
79 { 79 {
80 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 80 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
81 } 81 }
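The fallback only has jiffy resolution. As a worked example, assuming HZ = 250 (a common configuration, used here purely for illustration):

/*
 * Example (HZ = 250 is an assumption for illustration):
 *	NSEC_PER_SEC / HZ = 1,000,000,000 / 250 = 4,000,000 ns
 * so the weak sched_clock() advances in 4 ms steps, which is why
 * architectures with a finer clocksource override it.
 */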
82 82
83 /* 83 /*
84 * Convert user-nice values [ -20 ... 0 ... 19 ] 84 * Convert user-nice values [ -20 ... 0 ... 19 ]
85 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 85 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
86 * and back. 86 * and back.
87 */ 87 */
88 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 88 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
89 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 89 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
90 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 90 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
91 91
92 /* 92 /*
93 * 'User priority' is the nice value converted to something we 93 * 'User priority' is the nice value converted to something we
94 * can work with better when scaling various scheduler parameters, 94 * can work with better when scaling various scheduler parameters,
95 * it's a [ 0 ... 39 ] range. 95 * it's a [ 0 ... 39 ] range.
96 */ 96 */
97 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 97 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
98 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 98 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
99 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 99 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
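With the usual values MAX_RT_PRIO = 100 and MAX_PRIO = 140 (stated here as assumptions about the surrounding headers), the macros above map as follows:

/*
 * Worked examples, assuming MAX_RT_PRIO == 100 and MAX_PRIO == 140:
 *	NICE_TO_PRIO(0)   = 100 + 0  + 20 = 120    (default task)
 *	NICE_TO_PRIO(-20) = 100 - 20 + 20 = 100    (highest nice priority)
 *	NICE_TO_PRIO(19)  = 100 + 19 + 20 = 139    (lowest nice priority)
 *	USER_PRIO(120)    = 120 - 100     = 20
 *	MAX_USER_PRIO     = USER_PRIO(140) = 40
 */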
100 100
101 /* 101 /*
102 * Helpers for converting nanosecond timing to jiffy resolution 102 * Helpers for converting nanosecond timing to jiffy resolution
103 */ 103 */
104 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 104 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
105 105
106 #define NICE_0_LOAD SCHED_LOAD_SCALE 106 #define NICE_0_LOAD SCHED_LOAD_SCALE
107 #define NICE_0_SHIFT SCHED_LOAD_SHIFT 107 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
108 108
109 /* 109 /*
110 * These are the 'tuning knobs' of the scheduler: 110 * These are the 'tuning knobs' of the scheduler:
111 * 111 *
112 * default timeslice is 100 msecs (used only for SCHED_RR tasks). 112 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
113 * Timeslices get refilled after they expire. 113 * Timeslices get refilled after they expire.
114 */ 114 */
115 #define DEF_TIMESLICE (100 * HZ / 1000) 115 #define DEF_TIMESLICE (100 * HZ / 1000)
116 116
117 #ifdef CONFIG_SMP 117 #ifdef CONFIG_SMP
118 /* 118 /*
119 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 119 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
120 * Since cpu_power is a 'constant', we can use a reciprocal divide. 120 * Since cpu_power is a 'constant', we can use a reciprocal divide.
121 */ 121 */
122 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) 122 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
123 { 123 {
124 return reciprocal_divide(load, sg->reciprocal_cpu_power); 124 return reciprocal_divide(load, sg->reciprocal_cpu_power);
125 } 125 }
126 126
127 /* 127 /*
128 * Each time a sched group cpu_power is changed, 128 * Each time a sched group cpu_power is changed,
129 * we must compute its reciprocal value 129 * we must compute its reciprocal value
130 */ 130 */
131 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) 131 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
132 { 132 {
133 sg->__cpu_power += val; 133 sg->__cpu_power += val;
134 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); 134 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
135 } 135 }
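The point of this pair is that sg->__cpu_power changes rarely while divisions by it are frequent, so the reciprocal is cached once per update and sg_div_cpu_power() turns each later division into a multiply-and-shift. Callers are expected to change __cpu_power only through sg_inc_cpu_power() so the cached reciprocal never goes stale. A sketch of the intended call pattern (the function names and the reset-then-add sequence are illustrative assumptions):

/* Sketch: keep __cpu_power and its cached reciprocal in sync. */
static void example_update_group_power(struct sched_group *sg)
{
	sg->__cpu_power = 0;
	sg_inc_cpu_power(sg, SCHED_LOAD_SCALE);	/* also refreshes reciprocal */
}

static u32 example_scale_load(struct sched_group *sg, u32 load)
{
	/* Same result as load / sg->__cpu_power, without a division. */
	return sg_div_cpu_power(sg, load);
}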
136 #endif 136 #endif
137 137
138 static inline int rt_policy(int policy) 138 static inline int rt_policy(int policy)
139 { 139 {
140 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) 140 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
141 return 1; 141 return 1;
142 return 0; 142 return 0;
143 } 143 }
144 144
145 static inline int task_has_rt_policy(struct task_struct *p) 145 static inline int task_has_rt_policy(struct task_struct *p)
146 { 146 {
147 return rt_policy(p->policy); 147 return rt_policy(p->policy);
148 } 148 }
149 149
150 /* 150 /*
151 * This is the priority-queue data structure of the RT scheduling class: 151 * This is the priority-queue data structure of the RT scheduling class:
152 */ 152 */
153 struct rt_prio_array { 153 struct rt_prio_array {
154 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 154 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
155 struct list_head queue[MAX_RT_PRIO]; 155 struct list_head queue[MAX_RT_PRIO];
156 }; 156 };
157 157
158 #ifdef CONFIG_FAIR_GROUP_SCHED 158 #ifdef CONFIG_FAIR_GROUP_SCHED
159 159
160 #include <linux/cgroup.h> 160 #include <linux/cgroup.h>
161 161
162 struct cfs_rq; 162 struct cfs_rq;
163 163
164 static LIST_HEAD(task_groups); 164 static LIST_HEAD(task_groups);
165 165
166 /* task group related information */ 166 /* task group related information */
167 struct task_group { 167 struct task_group {
168 #ifdef CONFIG_FAIR_CGROUP_SCHED 168 #ifdef CONFIG_FAIR_CGROUP_SCHED
169 struct cgroup_subsys_state css; 169 struct cgroup_subsys_state css;
170 #endif 170 #endif
171 /* schedulable entities of this group on each cpu */ 171 /* schedulable entities of this group on each cpu */
172 struct sched_entity **se; 172 struct sched_entity **se;
173 /* runqueue "owned" by this group on each cpu */ 173 /* runqueue "owned" by this group on each cpu */
174 struct cfs_rq **cfs_rq; 174 struct cfs_rq **cfs_rq;
175 175
176 struct sched_rt_entity **rt_se; 176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq; 177 struct rt_rq **rt_rq;
178 178
179 unsigned int rt_ratio; 179 unsigned int rt_ratio;
180 180
181 /* 181 /*
182 * The shares assigned to a task group govern how much cpu bandwidth 182 * The shares assigned to a task group govern how much cpu bandwidth
183 * is allocated to the group. The more shares a group has, the more 183 * is allocated to the group. The more shares a group has, the more
184 * cpu bandwidth is allocated to it. 184 * cpu bandwidth is allocated to it.
185 * 185 *
186 * For example, let's say that there are three task groups, A, B and C, which 186 * For example, let's say that there are three task groups, A, B and C, which
187 * have been assigned shares 1000, 2000 and 3000 respectively. Then, 187 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
188 * cpu bandwidth allocated by the scheduler to task groups A, B and C 188 * cpu bandwidth allocated by the scheduler to task groups A, B and C
189 * should be: 189 * should be:
190 * 190 *
191 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% 191 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
192 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% 192 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
193 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% 193 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
194 * 194 *
195 * The weight assigned to a task group's schedulable entities on every 195 * The weight assigned to a task group's schedulable entities on every
196 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task 196 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
197 * group's shares. For example, let's say that task group A has been 197 * group's shares. For example, let's say that task group A has been
198 * assigned shares of 1000 and there are two CPUs in a system. Then, 198 * assigned shares of 1000 and there are two CPUs in a system. Then,
199 * 199 *
200 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; 200 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
201 * 201 *
202 * Note: It's not necessary that each of a task group's schedulable 202 * Note: It's not necessary that each of a task group's schedulable
203 * entities has the same weight on all CPUs. If the group 203 * entities has the same weight on all CPUs. If the group
204 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a 204 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
205 * better distribution of weight could be: 205 * better distribution of weight could be:
206 * 206 *
207 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 207 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
208 * tg_A->se[1]->load.weight = 1/3 * 2000 = 667 208 * tg_A->se[1]->load.weight = 1/3 * 2000 = 667
209 * 209 *
210 * rebalance_shares() is responsible for distributing the shares of a 210 * rebalance_shares() is responsible for distributing the shares of a
211 * task group like this among the group's schedulable entities across 211 * task group like this among the group's schedulable entities across
212 * cpus. 212 * cpus.
213 * 213 *
214 */ 214 */

215 unsigned long shares; 215 unsigned long shares;
216 216
217 struct rcu_head rcu; 217 struct rcu_head rcu;
218 struct list_head list; 218 struct list_head list;
219 }; 219 };
220 220
221 /* Default task group's sched entity on each cpu */ 221 /* Default task group's sched entity on each cpu */
222 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 222 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
223 /* Default task group's cfs_rq on each cpu */ 223 /* Default task group's cfs_rq on each cpu */
224 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 224 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
225 225
226 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 226 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 227 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228 228
229 static struct sched_entity *init_sched_entity_p[NR_CPUS]; 229 static struct sched_entity *init_sched_entity_p[NR_CPUS];
230 static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 230 static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
231 231
232 static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; 232 static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233 static struct rt_rq *init_rt_rq_p[NR_CPUS]; 233 static struct rt_rq *init_rt_rq_p[NR_CPUS];
234 234
235 /* task_group_mutex serializes add/remove of task groups and also changes to 235 /* task_group_mutex serializes add/remove of task groups and also changes to
236 * a task group's cpu shares. 236 * a task group's cpu shares.
237 */ 237 */
238 static DEFINE_MUTEX(task_group_mutex); 238 static DEFINE_MUTEX(task_group_mutex);
239 239
240 /* doms_cur_mutex serializes access to doms_cur[] array */ 240 /* doms_cur_mutex serializes access to doms_cur[] array */
241 static DEFINE_MUTEX(doms_cur_mutex); 241 static DEFINE_MUTEX(doms_cur_mutex);
242 242
243 #ifdef CONFIG_SMP 243 #ifdef CONFIG_SMP
244 /* kernel thread that runs rebalance_shares() periodically */ 244 /* kernel thread that runs rebalance_shares() periodically */
245 static struct task_struct *lb_monitor_task; 245 static struct task_struct *lb_monitor_task;
246 static int load_balance_monitor(void *unused); 246 static int load_balance_monitor(void *unused);
247 #endif 247 #endif
248 248
249 static void set_se_shares(struct sched_entity *se, unsigned long shares); 249 static void set_se_shares(struct sched_entity *se, unsigned long shares);
250 250
251 /* Default task group. 251 /* Default task group.
252 * Every task in the system belongs to this group at bootup. 252 * Every task in the system belongs to this group at bootup.
253 */ 253 */
254 struct task_group init_task_group = { 254 struct task_group init_task_group = {
255 .se = init_sched_entity_p, 255 .se = init_sched_entity_p,
256 .cfs_rq = init_cfs_rq_p, 256 .cfs_rq = init_cfs_rq_p,
257 257
258 .rt_se = init_sched_rt_entity_p, 258 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p, 259 .rt_rq = init_rt_rq_p,
260 }; 260 };
261 261
262 #ifdef CONFIG_FAIR_USER_SCHED 262 #ifdef CONFIG_FAIR_USER_SCHED
263 # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 263 # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
264 #else 264 #else
265 # define INIT_TASK_GROUP_LOAD NICE_0_LOAD 265 # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
266 #endif 266 #endif
267 267
268 #define MIN_GROUP_SHARES 2 268 #define MIN_GROUP_SHARES 2
269 269
270 static int init_task_group_load = INIT_TASK_GROUP_LOAD; 270 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
271 271
272 /* return group to which a task belongs */ 272 /* return group to which a task belongs */
273 static inline struct task_group *task_group(struct task_struct *p) 273 static inline struct task_group *task_group(struct task_struct *p)
274 { 274 {
275 struct task_group *tg; 275 struct task_group *tg;
276 276
277 #ifdef CONFIG_FAIR_USER_SCHED 277 #ifdef CONFIG_FAIR_USER_SCHED
278 tg = p->user->tg; 278 tg = p->user->tg;
279 #elif defined(CONFIG_FAIR_CGROUP_SCHED) 279 #elif defined(CONFIG_FAIR_CGROUP_SCHED)
280 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 280 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
281 struct task_group, css); 281 struct task_group, css);
282 #else 282 #else
283 tg = &init_task_group; 283 tg = &init_task_group;
284 #endif 284 #endif
285 return tg; 285 return tg;
286 } 286 }
287 287
288 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 288 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
289 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 289 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
290 { 290 {
291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
292 p->se.parent = task_group(p)->se[cpu]; 292 p->se.parent = task_group(p)->se[cpu];
293 293
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 294 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu]; 295 p->rt.parent = task_group(p)->rt_se[cpu];
296 } 296 }
297 297
298 static inline void lock_task_group_list(void) 298 static inline void lock_task_group_list(void)
299 { 299 {
300 mutex_lock(&task_group_mutex); 300 mutex_lock(&task_group_mutex);
301 } 301 }
302 302
303 static inline void unlock_task_group_list(void) 303 static inline void unlock_task_group_list(void)
304 { 304 {
305 mutex_unlock(&task_group_mutex); 305 mutex_unlock(&task_group_mutex);
306 } 306 }
307 307
308 static inline void lock_doms_cur(void) 308 static inline void lock_doms_cur(void)
309 { 309 {
310 mutex_lock(&doms_cur_mutex); 310 mutex_lock(&doms_cur_mutex);
311 } 311 }
312 312
313 static inline void unlock_doms_cur(void) 313 static inline void unlock_doms_cur(void)
314 { 314 {
315 mutex_unlock(&doms_cur_mutex); 315 mutex_unlock(&doms_cur_mutex);
316 } 316 }
317 317
318 #else 318 #else
319 319
320 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 320 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321 static inline void lock_task_group_list(void) { } 321 static inline void lock_task_group_list(void) { }
322 static inline void unlock_task_group_list(void) { } 322 static inline void unlock_task_group_list(void) { }
323 static inline void lock_doms_cur(void) { } 323 static inline void lock_doms_cur(void) { }
324 static inline void unlock_doms_cur(void) { } 324 static inline void unlock_doms_cur(void) { }
325 325
326 #endif /* CONFIG_FAIR_GROUP_SCHED */ 326 #endif /* CONFIG_FAIR_GROUP_SCHED */
327 327
328 /* CFS-related fields in a runqueue */ 328 /* CFS-related fields in a runqueue */
329 struct cfs_rq { 329 struct cfs_rq {
330 struct load_weight load; 330 struct load_weight load;
331 unsigned long nr_running; 331 unsigned long nr_running;
332 332
333 u64 exec_clock; 333 u64 exec_clock;
334 u64 min_vruntime; 334 u64 min_vruntime;
335 335
336 struct rb_root tasks_timeline; 336 struct rb_root tasks_timeline;
337 struct rb_node *rb_leftmost; 337 struct rb_node *rb_leftmost;
338 struct rb_node *rb_load_balance_curr; 338 struct rb_node *rb_load_balance_curr;
339 /* 'curr' points to the currently running entity on this cfs_rq. 339 /* 'curr' points to the currently running entity on this cfs_rq.
340 * It is set to NULL otherwise (i.e. when none are currently running). 340 * It is set to NULL otherwise (i.e. when none are currently running).
341 */ 341 */
342 struct sched_entity *curr; 342 struct sched_entity *curr;
343 343
344 unsigned long nr_spread_over; 344 unsigned long nr_spread_over;
345 345
346 #ifdef CONFIG_FAIR_GROUP_SCHED 346 #ifdef CONFIG_FAIR_GROUP_SCHED
347 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 347 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
348 348
349 /* 349 /*
350 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 350 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
351 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 351 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
352 * (like users, containers etc.) 352 * (like users, containers etc.)
353 * 353 *
354 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 354 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
355 * list is used during load balance. 355 * list is used during load balance.
356 */ 356 */
357 struct list_head leaf_cfs_rq_list; 357 struct list_head leaf_cfs_rq_list;
358 struct task_group *tg; /* group that "owns" this runqueue */ 358 struct task_group *tg; /* group that "owns" this runqueue */
359 #endif 359 #endif
360 }; 360 };
361 361
362 /* Real-Time classes' related field in a runqueue: */ 362 /* Real-Time classes' related field in a runqueue: */
363 struct rt_rq { 363 struct rt_rq {
364 struct rt_prio_array active; 364 struct rt_prio_array active;
365 unsigned long rt_nr_running; 365 unsigned long rt_nr_running;
366 #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 366 #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */ 367 int highest_prio; /* highest queued rt task prio */
368 #endif 368 #endif
369 #ifdef CONFIG_SMP 369 #ifdef CONFIG_SMP
370 unsigned long rt_nr_migratory; 370 unsigned long rt_nr_migratory;
371 int overloaded; 371 int overloaded;
372 #endif 372 #endif
373 int rt_throttled; 373 int rt_throttled;
374 u64 rt_time; 374 u64 rt_time;
375 375
376 #ifdef CONFIG_FAIR_GROUP_SCHED 376 #ifdef CONFIG_FAIR_GROUP_SCHED
377 struct rq *rq; 377 struct rq *rq;
378 struct list_head leaf_rt_rq_list; 378 struct list_head leaf_rt_rq_list;
379 struct task_group *tg; 379 struct task_group *tg;
380 struct sched_rt_entity *rt_se; 380 struct sched_rt_entity *rt_se;
381 #endif 381 #endif
382 }; 382 };
383 383
384 #ifdef CONFIG_SMP 384 #ifdef CONFIG_SMP
385 385
386 /* 386 /*
387 * We add the notion of a root-domain which will be used to define per-domain 387 * We add the notion of a root-domain which will be used to define per-domain
388 * variables. Each exclusive cpuset essentially defines an island domain by 388 * variables. Each exclusive cpuset essentially defines an island domain by
389 * fully partitioning the member cpus from any other cpuset. Whenever a new 389 * fully partitioning the member cpus from any other cpuset. Whenever a new
390 * exclusive cpuset is created, we also create and attach a new root-domain 390 * exclusive cpuset is created, we also create and attach a new root-domain
391 * object. 391 * object.
392 * 392 *
393 */ 393 */
394 struct root_domain { 394 struct root_domain {
395 atomic_t refcount; 395 atomic_t refcount;
396 cpumask_t span; 396 cpumask_t span;
397 cpumask_t online; 397 cpumask_t online;
398 398
399 /* 399 /*
400 * The "RT overload" flag: it gets set if a CPU has more than 400 * The "RT overload" flag: it gets set if a CPU has more than
401 * one runnable RT task. 401 * one runnable RT task.
402 */ 402 */
403 cpumask_t rto_mask; 403 cpumask_t rto_mask;
404 atomic_t rto_count; 404 atomic_t rto_count;
405 }; 405 };
406 406
407 /* 407 /*
408 * By default the system creates a single root-domain with all cpus as 408 * By default the system creates a single root-domain with all cpus as
409 * members (mimicking the global state we have today). 409 * members (mimicking the global state we have today).
410 */ 410 */
411 static struct root_domain def_root_domain; 411 static struct root_domain def_root_domain;
412 412
413 #endif 413 #endif
414 414
415 /* 415 /*
416 * This is the main, per-CPU runqueue data structure. 416 * This is the main, per-CPU runqueue data structure.
417 * 417 *
418 * Locking rule: code paths that need to lock multiple runqueues 418 * Locking rule: code paths that need to lock multiple runqueues
419 * (such as the load balancing or the thread migration code) must 419 * (such as the load balancing or the thread migration code) must
420 * order their lock acquisitions by ascending runqueue address. 420 * order their lock acquisitions by ascending runqueue address.
421 */ 421 */
422 struct rq { 422 struct rq {
423 /* runqueue lock: */ 423 /* runqueue lock: */
424 spinlock_t lock; 424 spinlock_t lock;
425 425
426 /* 426 /*
427 * nr_running and cpu_load should be in the same cacheline because 427 * nr_running and cpu_load should be in the same cacheline because
428 * remote CPUs use both these fields when doing load calculation. 428 * remote CPUs use both these fields when doing load calculation.
429 */ 429 */
430 unsigned long nr_running; 430 unsigned long nr_running;
431 #define CPU_LOAD_IDX_MAX 5 431 #define CPU_LOAD_IDX_MAX 5
432 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 432 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
433 unsigned char idle_at_tick; 433 unsigned char idle_at_tick;
434 #ifdef CONFIG_NO_HZ 434 #ifdef CONFIG_NO_HZ
435 unsigned char in_nohz_recently; 435 unsigned char in_nohz_recently;
436 #endif 436 #endif
437 /* capture load from *all* tasks on this cpu: */ 437 /* capture load from *all* tasks on this cpu: */
438 struct load_weight load; 438 struct load_weight load;
439 unsigned long nr_load_updates; 439 unsigned long nr_load_updates;
440 u64 nr_switches; 440 u64 nr_switches;
441 441
442 struct cfs_rq cfs; 442 struct cfs_rq cfs;
443 struct rt_rq rt; 443 struct rt_rq rt;
444 u64 rt_period_expire; 444 u64 rt_period_expire;
445 int rt_throttled; 445 int rt_throttled;
446 446
447 #ifdef CONFIG_FAIR_GROUP_SCHED 447 #ifdef CONFIG_FAIR_GROUP_SCHED
448 /* list of leaf cfs_rq on this cpu: */ 448 /* list of leaf cfs_rq on this cpu: */
449 struct list_head leaf_cfs_rq_list; 449 struct list_head leaf_cfs_rq_list;
450 struct list_head leaf_rt_rq_list; 450 struct list_head leaf_rt_rq_list;
451 #endif 451 #endif
452 452
453 /* 453 /*
454 * This is part of a global counter where only the total sum 454 * This is part of a global counter where only the total sum
455 * over all CPUs matters. A task can increase this counter on 455 * over all CPUs matters. A task can increase this counter on
456 * one CPU and if it got migrated afterwards it may decrease 456 * one CPU and if it got migrated afterwards it may decrease
457 * it on another CPU. Always updated under the runqueue lock: 457 * it on another CPU. Always updated under the runqueue lock:
458 */ 458 */
459 unsigned long nr_uninterruptible; 459 unsigned long nr_uninterruptible;
460 460
461 struct task_struct *curr, *idle; 461 struct task_struct *curr, *idle;
462 unsigned long next_balance; 462 unsigned long next_balance;
463 struct mm_struct *prev_mm; 463 struct mm_struct *prev_mm;
464 464
465 u64 clock, prev_clock_raw; 465 u64 clock, prev_clock_raw;
466 s64 clock_max_delta; 466 s64 clock_max_delta;
467 467
468 unsigned int clock_warps, clock_overflows, clock_underflows; 468 unsigned int clock_warps, clock_overflows, clock_underflows;
469 u64 idle_clock; 469 u64 idle_clock;
470 unsigned int clock_deep_idle_events; 470 unsigned int clock_deep_idle_events;
471 u64 tick_timestamp; 471 u64 tick_timestamp;
472 472
473 atomic_t nr_iowait; 473 atomic_t nr_iowait;
474 474
475 #ifdef CONFIG_SMP 475 #ifdef CONFIG_SMP
476 struct root_domain *rd; 476 struct root_domain *rd;
477 struct sched_domain *sd; 477 struct sched_domain *sd;
478 478
479 /* For active balancing */ 479 /* For active balancing */
480 int active_balance; 480 int active_balance;
481 int push_cpu; 481 int push_cpu;
482 /* cpu of this runqueue: */ 482 /* cpu of this runqueue: */
483 int cpu; 483 int cpu;
484 484
485 struct task_struct *migration_thread; 485 struct task_struct *migration_thread;
486 struct list_head migration_queue; 486 struct list_head migration_queue;
487 #endif 487 #endif
488 488
489 #ifdef CONFIG_SCHED_HRTICK 489 #ifdef CONFIG_SCHED_HRTICK
490 unsigned long hrtick_flags; 490 unsigned long hrtick_flags;
491 ktime_t hrtick_expire; 491 ktime_t hrtick_expire;
492 struct hrtimer hrtick_timer; 492 struct hrtimer hrtick_timer;
493 #endif 493 #endif
494 494
495 #ifdef CONFIG_SCHEDSTATS 495 #ifdef CONFIG_SCHEDSTATS
496 /* latency stats */ 496 /* latency stats */
497 struct sched_info rq_sched_info; 497 struct sched_info rq_sched_info;
498 498
499 /* sys_sched_yield() stats */ 499 /* sys_sched_yield() stats */
500 unsigned int yld_exp_empty; 500 unsigned int yld_exp_empty;
501 unsigned int yld_act_empty; 501 unsigned int yld_act_empty;
502 unsigned int yld_both_empty; 502 unsigned int yld_both_empty;
503 unsigned int yld_count; 503 unsigned int yld_count;
504 504
505 /* schedule() stats */ 505 /* schedule() stats */
506 unsigned int sched_switch; 506 unsigned int sched_switch;
507 unsigned int sched_count; 507 unsigned int sched_count;
508 unsigned int sched_goidle; 508 unsigned int sched_goidle;
509 509
510 /* try_to_wake_up() stats */ 510 /* try_to_wake_up() stats */
511 unsigned int ttwu_count; 511 unsigned int ttwu_count;
512 unsigned int ttwu_local; 512 unsigned int ttwu_local;
513 513
514 /* BKL stats */ 514 /* BKL stats */
515 unsigned int bkl_count; 515 unsigned int bkl_count;
516 #endif 516 #endif
517 struct lock_class_key rq_lock_key; 517 struct lock_class_key rq_lock_key;
518 }; 518 };
519 519
520 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 520 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
521 521
522 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 522 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
523 { 523 {
524 rq->curr->sched_class->check_preempt_curr(rq, p); 524 rq->curr->sched_class->check_preempt_curr(rq, p);
525 } 525 }
526 526
527 static inline int cpu_of(struct rq *rq) 527 static inline int cpu_of(struct rq *rq)
528 { 528 {
529 #ifdef CONFIG_SMP 529 #ifdef CONFIG_SMP
530 return rq->cpu; 530 return rq->cpu;
531 #else 531 #else
532 return 0; 532 return 0;
533 #endif 533 #endif
534 } 534 }
535 535
536 /* 536 /*
537 * Update the per-runqueue clock, as finegrained as the platform can give 537 * Update the per-runqueue clock, as finegrained as the platform can give
538 * us, but without assuming monotonicity, etc.: 538 * us, but without assuming monotonicity, etc.:
539 */ 539 */
540 static void __update_rq_clock(struct rq *rq) 540 static void __update_rq_clock(struct rq *rq)
541 { 541 {
542 u64 prev_raw = rq->prev_clock_raw; 542 u64 prev_raw = rq->prev_clock_raw;
543 u64 now = sched_clock(); 543 u64 now = sched_clock();
544 s64 delta = now - prev_raw; 544 s64 delta = now - prev_raw;
545 u64 clock = rq->clock; 545 u64 clock = rq->clock;
546 546
547 #ifdef CONFIG_SCHED_DEBUG 547 #ifdef CONFIG_SCHED_DEBUG
548 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 548 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
549 #endif 549 #endif
550 /* 550 /*
551 * Protect against sched_clock() occasionally going backwards: 551 * Protect against sched_clock() occasionally going backwards:
552 */ 552 */
553 if (unlikely(delta < 0)) { 553 if (unlikely(delta < 0)) {
554 clock++; 554 clock++;
555 rq->clock_warps++; 555 rq->clock_warps++;
556 } else { 556 } else {
557 /* 557 /*
558 * Catch too large forward jumps too: 558 * Catch too large forward jumps too:
559 */ 559 */
560 if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { 560 if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) {
561 if (clock < rq->tick_timestamp + TICK_NSEC) 561 if (clock < rq->tick_timestamp + TICK_NSEC)
562 clock = rq->tick_timestamp + TICK_NSEC; 562 clock = rq->tick_timestamp + TICK_NSEC;
563 else 563 else
564 clock++; 564 clock++;
565 rq->clock_overflows++; 565 rq->clock_overflows++;
566 } else { 566 } else {
567 if (unlikely(delta > rq->clock_max_delta)) 567 if (unlikely(delta > rq->clock_max_delta))
568 rq->clock_max_delta = delta; 568 rq->clock_max_delta = delta;
569 clock += delta; 569 clock += delta;
570 } 570 }
571 } 571 }
572 572
573 rq->prev_clock_raw = now; 573 rq->prev_clock_raw = now;
574 rq->clock = clock; 574 rq->clock = clock;
575 } 575 }
576 576
577 static void update_rq_clock(struct rq *rq) 577 static void update_rq_clock(struct rq *rq)
578 { 578 {
579 if (likely(smp_processor_id() == cpu_of(rq))) 579 if (likely(smp_processor_id() == cpu_of(rq)))
580 __update_rq_clock(rq); 580 __update_rq_clock(rq);
581 } 581 }
582 582
583 /* 583 /*
584 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 584 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
585 * See detach_destroy_domains: synchronize_sched for details. 585 * See detach_destroy_domains: synchronize_sched for details.
586 * 586 *
587 * The domain tree of any CPU may only be accessed from within 587 * The domain tree of any CPU may only be accessed from within
588 * preempt-disabled sections. 588 * preempt-disabled sections.
589 */ 589 */
590 #define for_each_domain(cpu, __sd) \ 590 #define for_each_domain(cpu, __sd) \
591 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 591 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
592 592
593 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 593 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
594 #define this_rq() (&__get_cpu_var(runqueues)) 594 #define this_rq() (&__get_cpu_var(runqueues))
595 #define task_rq(p) cpu_rq(task_cpu(p)) 595 #define task_rq(p) cpu_rq(task_cpu(p))
596 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 596 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
597 597
598 unsigned long rt_needs_cpu(int cpu) 598 unsigned long rt_needs_cpu(int cpu)
599 { 599 {
600 struct rq *rq = cpu_rq(cpu); 600 struct rq *rq = cpu_rq(cpu);
601 u64 delta; 601 u64 delta;
602 602
603 if (!rq->rt_throttled) 603 if (!rq->rt_throttled)
604 return 0; 604 return 0;
605 605
606 if (rq->clock > rq->rt_period_expire) 606 if (rq->clock > rq->rt_period_expire)
607 return 1; 607 return 1;
608 608
609 delta = rq->rt_period_expire - rq->clock; 609 delta = rq->rt_period_expire - rq->clock;
610 do_div(delta, NSEC_PER_SEC / HZ); 610 do_div(delta, NSEC_PER_SEC / HZ);
611 611
612 return (unsigned long)delta; 612 return (unsigned long)delta;
613 } 613 }
614 614
615 /* 615 /*
616 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 616 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
617 */ 617 */
618 #ifdef CONFIG_SCHED_DEBUG 618 #ifdef CONFIG_SCHED_DEBUG
619 # define const_debug __read_mostly 619 # define const_debug __read_mostly
620 #else 620 #else
621 # define const_debug static const 621 # define const_debug static const
622 #endif 622 #endif
623 623
624 /* 624 /*
625 * Debugging: various feature bits 625 * Debugging: various feature bits
626 */ 626 */
627 enum { 627 enum {
628 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, 628 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
629 SCHED_FEAT_WAKEUP_PREEMPT = 2, 629 SCHED_FEAT_WAKEUP_PREEMPT = 2,
630 SCHED_FEAT_START_DEBIT = 4, 630 SCHED_FEAT_START_DEBIT = 4,
631 SCHED_FEAT_TREE_AVG = 8, 631 SCHED_FEAT_TREE_AVG = 8,
632 SCHED_FEAT_APPROX_AVG = 16, 632 SCHED_FEAT_APPROX_AVG = 16,
633 SCHED_FEAT_HRTICK = 32, 633 SCHED_FEAT_HRTICK = 32,
634 SCHED_FEAT_DOUBLE_TICK = 64, 634 SCHED_FEAT_DOUBLE_TICK = 64,
635 }; 635 };
636 636
637 const_debug unsigned int sysctl_sched_features = 637 const_debug unsigned int sysctl_sched_features =
638 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | 638 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
639 SCHED_FEAT_WAKEUP_PREEMPT * 1 | 639 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
640 SCHED_FEAT_START_DEBIT * 1 | 640 SCHED_FEAT_START_DEBIT * 1 |
641 SCHED_FEAT_TREE_AVG * 0 | 641 SCHED_FEAT_TREE_AVG * 0 |
642 SCHED_FEAT_APPROX_AVG * 0 | 642 SCHED_FEAT_APPROX_AVG * 0 |
643 SCHED_FEAT_HRTICK * 1 | 643 SCHED_FEAT_HRTICK * 1 |
644 SCHED_FEAT_DOUBLE_TICK * 0; 644 SCHED_FEAT_DOUBLE_TICK * 0;
645 645
646 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 646 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
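sched_feat() is just a bitmask test against the default word built above. For instance, with the defaults shown:

/*
 * With the default sysctl_sched_features above:
 *	sched_feat(HRTICK)   expands to (sysctl_sched_features & SCHED_FEAT_HRTICK)
 *	                     and is nonzero, since that bit is enabled (* 1);
 *	sched_feat(TREE_AVG) evaluates to 0, since that bit is disabled (* 0).
 */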
647 647
648 /* 648 /*
649 * Number of tasks to iterate in a single balance run. 649 * Number of tasks to iterate in a single balance run.
650 * Limited because this is done with IRQs disabled. 650 * Limited because this is done with IRQs disabled.
651 */ 651 */
652 const_debug unsigned int sysctl_sched_nr_migrate = 32; 652 const_debug unsigned int sysctl_sched_nr_migrate = 32;
653 653
654 /* 654 /*
655 * period over which we measure -rt task cpu usage in ms. 655 * period over which we measure -rt task cpu usage in ms.
656 * default: 1s 656 * default: 1s
657 */ 657 */
658 const_debug unsigned int sysctl_sched_rt_period = 1000; 658 const_debug unsigned int sysctl_sched_rt_period = 1000;
659 659
660 #define SCHED_RT_FRAC_SHIFT 16 660 #define SCHED_RT_FRAC_SHIFT 16
661 #define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) 661 #define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
662 662
663 /* 663 /*
664 * ratio of time -rt tasks may consume. 664 * ratio of time -rt tasks may consume.
665 * default: 95% 665 * default: 95%
666 */ 666 */
667 const_debug unsigned int sysctl_sched_rt_ratio = 62259; 667 const_debug unsigned int sysctl_sched_rt_ratio = 62259;
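The ratio is a fixed-point fraction of SCHED_RT_FRAC (1 << 16); 62259 out of 65536 works out to roughly 95%, matching the comment above:

/*
 * sysctl_sched_rt_ratio as a fraction of SCHED_RT_FRAC:
 *	SCHED_RT_FRAC = 1 << 16 = 65536
 *	62259 / 65536 ~= 0.95, i.e. -rt tasks may consume about 95%
 *	of each rt period.
 */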
668 668
669 /* 669 /*
670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
671 * clock constructed from sched_clock(): 671 * clock constructed from sched_clock():
672 */ 672 */
673 unsigned long long cpu_clock(int cpu) 673 unsigned long long cpu_clock(int cpu)
674 { 674 {
675 unsigned long long now; 675 unsigned long long now;
676 unsigned long flags; 676 unsigned long flags;
677 struct rq *rq; 677 struct rq *rq;
678 678
679 local_irq_save(flags); 679 local_irq_save(flags);
680 rq = cpu_rq(cpu); 680 rq = cpu_rq(cpu);
681 /* 681 /*
682 * Only call sched_clock() if the scheduler has already been 682 * Only call sched_clock() if the scheduler has already been
683 * initialized (some code might call cpu_clock() very early): 683 * initialized (some code might call cpu_clock() very early):
684 */ 684 */
685 if (rq->idle) 685 if (rq->idle)
686 update_rq_clock(rq); 686 update_rq_clock(rq);
687 now = rq->clock; 687 now = rq->clock;
688 local_irq_restore(flags); 688 local_irq_restore(flags);
689 689
690 return now; 690 return now;
691 } 691 }
692 EXPORT_SYMBOL_GPL(cpu_clock); 692 EXPORT_SYMBOL_GPL(cpu_clock);
693 693
694 #ifndef prepare_arch_switch 694 #ifndef prepare_arch_switch
695 # define prepare_arch_switch(next) do { } while (0) 695 # define prepare_arch_switch(next) do { } while (0)
696 #endif 696 #endif
697 #ifndef finish_arch_switch 697 #ifndef finish_arch_switch
698 # define finish_arch_switch(prev) do { } while (0) 698 # define finish_arch_switch(prev) do { } while (0)
699 #endif 699 #endif
700 700
701 static inline int task_current(struct rq *rq, struct task_struct *p) 701 static inline int task_current(struct rq *rq, struct task_struct *p)
702 { 702 {
703 return rq->curr == p; 703 return rq->curr == p;
704 } 704 }
705 705
706 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 706 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
707 static inline int task_running(struct rq *rq, struct task_struct *p) 707 static inline int task_running(struct rq *rq, struct task_struct *p)
708 { 708 {
709 return task_current(rq, p); 709 return task_current(rq, p);
710 } 710 }
711 711
712 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 712 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
713 { 713 {
714 } 714 }
715 715
716 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 716 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
717 { 717 {
718 #ifdef CONFIG_DEBUG_SPINLOCK 718 #ifdef CONFIG_DEBUG_SPINLOCK
719 /* this is a valid case when another task releases the spinlock */ 719 /* this is a valid case when another task releases the spinlock */
720 rq->lock.owner = current; 720 rq->lock.owner = current;
721 #endif 721 #endif
722 /* 722 /*
723 * If we are tracking spinlock dependencies then we have to 723 * If we are tracking spinlock dependencies then we have to
724 * fix up the runqueue lock - which gets 'carried over' from 724 * fix up the runqueue lock - which gets 'carried over' from
725 * prev into current: 725 * prev into current:
726 */ 726 */
727 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 727 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
728 728
729 spin_unlock_irq(&rq->lock); 729 spin_unlock_irq(&rq->lock);
730 } 730 }
731 731
732 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 732 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
733 static inline int task_running(struct rq *rq, struct task_struct *p) 733 static inline int task_running(struct rq *rq, struct task_struct *p)
734 { 734 {
735 #ifdef CONFIG_SMP 735 #ifdef CONFIG_SMP
736 return p->oncpu; 736 return p->oncpu;
737 #else 737 #else
738 return task_current(rq, p); 738 return task_current(rq, p);
739 #endif 739 #endif
740 } 740 }
741 741
742 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 742 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
743 { 743 {
744 #ifdef CONFIG_SMP 744 #ifdef CONFIG_SMP
745 /* 745 /*
746 * We can optimise this out completely for !SMP, because the 746 * We can optimise this out completely for !SMP, because the
747 * SMP rebalancing from interrupt is the only thing that cares 747 * SMP rebalancing from interrupt is the only thing that cares
748 * here. 748 * here.
749 */ 749 */
750 next->oncpu = 1; 750 next->oncpu = 1;
751 #endif 751 #endif
752 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 752 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
753 spin_unlock_irq(&rq->lock); 753 spin_unlock_irq(&rq->lock);
754 #else 754 #else
755 spin_unlock(&rq->lock); 755 spin_unlock(&rq->lock);
756 #endif 756 #endif
757 } 757 }
758 758
759 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 759 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
760 { 760 {
761 #ifdef CONFIG_SMP 761 #ifdef CONFIG_SMP
762 /* 762 /*
763 * After ->oncpu is cleared, the task can be moved to a different CPU. 763 * After ->oncpu is cleared, the task can be moved to a different CPU.
764 * We must ensure this doesn't happen until the switch is completely 764 * We must ensure this doesn't happen until the switch is completely
765 * finished. 765 * finished.
766 */ 766 */
767 smp_wmb(); 767 smp_wmb();
768 prev->oncpu = 0; 768 prev->oncpu = 0;
769 #endif 769 #endif
770 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 770 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
771 local_irq_enable(); 771 local_irq_enable();
772 #endif 772 #endif
773 } 773 }
774 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 774 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
775 775
776 /* 776 /*
777 * __task_rq_lock - lock the runqueue a given task resides on. 777 * __task_rq_lock - lock the runqueue a given task resides on.
778 * Must be called with interrupts disabled. 778 * Must be called with interrupts disabled.
779 */ 779 */
780 static inline struct rq *__task_rq_lock(struct task_struct *p) 780 static inline struct rq *__task_rq_lock(struct task_struct *p)
781 __acquires(rq->lock) 781 __acquires(rq->lock)
782 { 782 {
783 for (;;) { 783 for (;;) {
784 struct rq *rq = task_rq(p); 784 struct rq *rq = task_rq(p);
785 spin_lock(&rq->lock); 785 spin_lock(&rq->lock);
786 if (likely(rq == task_rq(p))) 786 if (likely(rq == task_rq(p)))
787 return rq; 787 return rq;
788 spin_unlock(&rq->lock); 788 spin_unlock(&rq->lock);
789 } 789 }
790 } 790 }
791 791
792 /* 792 /*
793 * task_rq_lock - lock the runqueue a given task resides on and disable 793 * task_rq_lock - lock the runqueue a given task resides on and disable
794 * interrupts. Note the ordering: we can safely look up the task_rq without 794 * interrupts. Note the ordering: we can safely look up the task_rq without
795 * explicitly disabling preemption. 795 * explicitly disabling preemption.
796 */ 796 */
797 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 797 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
798 __acquires(rq->lock) 798 __acquires(rq->lock)
799 { 799 {
800 struct rq *rq; 800 struct rq *rq;
801 801
802 for (;;) { 802 for (;;) {
803 local_irq_save(*flags); 803 local_irq_save(*flags);
804 rq = task_rq(p); 804 rq = task_rq(p);
805 spin_lock(&rq->lock); 805 spin_lock(&rq->lock);
806 if (likely(rq == task_rq(p))) 806 if (likely(rq == task_rq(p)))
807 return rq; 807 return rq;
808 spin_unlock_irqrestore(&rq->lock, *flags); 808 spin_unlock_irqrestore(&rq->lock, *flags);
809 } 809 }
810 } 810 }
811 811
812 static void __task_rq_unlock(struct rq *rq) 812 static void __task_rq_unlock(struct rq *rq)
813 __releases(rq->lock) 813 __releases(rq->lock)
814 { 814 {
815 spin_unlock(&rq->lock); 815 spin_unlock(&rq->lock);
816 } 816 }
817 817
818 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 818 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
819 __releases(rq->lock) 819 __releases(rq->lock)
820 { 820 {
821 spin_unlock_irqrestore(&rq->lock, *flags); 821 spin_unlock_irqrestore(&rq->lock, *flags);
822 } 822 }
823 823
824 /* 824 /*
825 * this_rq_lock - lock this runqueue and disable interrupts. 825 * this_rq_lock - lock this runqueue and disable interrupts.
826 */ 826 */
827 static struct rq *this_rq_lock(void) 827 static struct rq *this_rq_lock(void)
828 __acquires(rq->lock) 828 __acquires(rq->lock)
829 { 829 {
830 struct rq *rq; 830 struct rq *rq;
831 831
832 local_irq_disable(); 832 local_irq_disable();
833 rq = this_rq(); 833 rq = this_rq();
834 spin_lock(&rq->lock); 834 spin_lock(&rq->lock);
835 835
836 return rq; 836 return rq;
837 } 837 }
838 838
839 /* 839 /*
840 * We are going deep-idle (irqs are disabled): 840 * We are going deep-idle (irqs are disabled):
841 */ 841 */
842 void sched_clock_idle_sleep_event(void) 842 void sched_clock_idle_sleep_event(void)
843 { 843 {
844 struct rq *rq = cpu_rq(smp_processor_id()); 844 struct rq *rq = cpu_rq(smp_processor_id());
845 845
846 spin_lock(&rq->lock); 846 spin_lock(&rq->lock);
847 __update_rq_clock(rq); 847 __update_rq_clock(rq);
848 spin_unlock(&rq->lock); 848 spin_unlock(&rq->lock);
849 rq->clock_deep_idle_events++; 849 rq->clock_deep_idle_events++;
850 } 850 }
851 EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); 851 EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
852 852
853 /* 853 /*
854 * We just idled delta nanoseconds (called with irqs disabled): 854 * We just idled delta nanoseconds (called with irqs disabled):
855 */ 855 */
856 void sched_clock_idle_wakeup_event(u64 delta_ns) 856 void sched_clock_idle_wakeup_event(u64 delta_ns)
857 { 857 {
858 struct rq *rq = cpu_rq(smp_processor_id()); 858 struct rq *rq = cpu_rq(smp_processor_id());
859 u64 now = sched_clock(); 859 u64 now = sched_clock();
860 860
861 rq->idle_clock += delta_ns; 861 rq->idle_clock += delta_ns;
862 /* 862 /*
863 * Override the previous timestamp and ignore all 863 * Override the previous timestamp and ignore all
864 * sched_clock() deltas that occurred while we idled, 864 * sched_clock() deltas that occurred while we idled,
865 * and use the PM-provided delta_ns to advance the 865 * and use the PM-provided delta_ns to advance the
866 * rq clock: 866 * rq clock:
867 */ 867 */
868 spin_lock(&rq->lock); 868 spin_lock(&rq->lock);
869 rq->prev_clock_raw = now; 869 rq->prev_clock_raw = now;
870 rq->clock += delta_ns; 870 rq->clock += delta_ns;
871 spin_unlock(&rq->lock); 871 spin_unlock(&rq->lock);
872 touch_softlockup_watchdog(); 872 touch_softlockup_watchdog();
873 } 873 }
874 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 874 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
875 875
876 static void __resched_task(struct task_struct *p, int tif_bit); 876 static void __resched_task(struct task_struct *p, int tif_bit);
877 877
878 static inline void resched_task(struct task_struct *p) 878 static inline void resched_task(struct task_struct *p)
879 { 879 {
880 __resched_task(p, TIF_NEED_RESCHED); 880 __resched_task(p, TIF_NEED_RESCHED);
881 } 881 }
882 882
883 #ifdef CONFIG_SCHED_HRTICK 883 #ifdef CONFIG_SCHED_HRTICK
884 /* 884 /*
885 * Use HR-timers to deliver accurate preemption points. 885 * Use HR-timers to deliver accurate preemption points.
886 * 886 *
887 * It's all a bit involved since we cannot program an hrtimer while holding the 887 * It's all a bit involved since we cannot program an hrtimer while holding the
888 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a 888 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
889 * reschedule event. 889 * reschedule event.
890 * 890 *
891 * When we get rescheduled we reprogram the hrtick_timer outside of the 891 * When we get rescheduled we reprogram the hrtick_timer outside of the
892 * rq->lock. 892 * rq->lock.
893 */ 893 */
894 static inline void resched_hrt(struct task_struct *p) 894 static inline void resched_hrt(struct task_struct *p)
895 { 895 {
896 __resched_task(p, TIF_HRTICK_RESCHED); 896 __resched_task(p, TIF_HRTICK_RESCHED);
897 } 897 }
898 898
899 static inline void resched_rq(struct rq *rq) 899 static inline void resched_rq(struct rq *rq)
900 { 900 {
901 unsigned long flags; 901 unsigned long flags;
902 902
903 spin_lock_irqsave(&rq->lock, flags); 903 spin_lock_irqsave(&rq->lock, flags);
904 resched_task(rq->curr); 904 resched_task(rq->curr);
905 spin_unlock_irqrestore(&rq->lock, flags); 905 spin_unlock_irqrestore(&rq->lock, flags);
906 } 906 }
907 907
908 enum { 908 enum {
909 HRTICK_SET, /* re-program hrtick_timer */ 909 HRTICK_SET, /* re-program hrtick_timer */
910 HRTICK_RESET, /* not a new slice */ 910 HRTICK_RESET, /* not a new slice */
911 }; 911 };
912 912
913 /* 913 /*
914 * Use hrtick when: 914 * Use hrtick when:
915 * - enabled by features 915 * - enabled by features
916 * - hrtimer is actually high res 916 * - hrtimer is actually high res
917 */ 917 */
918 static inline int hrtick_enabled(struct rq *rq) 918 static inline int hrtick_enabled(struct rq *rq)
919 { 919 {
920 if (!sched_feat(HRTICK)) 920 if (!sched_feat(HRTICK))
921 return 0; 921 return 0;
922 return hrtimer_is_hres_active(&rq->hrtick_timer); 922 return hrtimer_is_hres_active(&rq->hrtick_timer);
923 } 923 }
924 924
925 /* 925 /*
926 * Called to set the hrtick timer state. 926 * Called to set the hrtick timer state.
927 * 927 *
928 * called with rq->lock held and irqs disabled 928 * called with rq->lock held and irqs disabled
929 */ 929 */
930 static void hrtick_start(struct rq *rq, u64 delay, int reset) 930 static void hrtick_start(struct rq *rq, u64 delay, int reset)
931 { 931 {
932 assert_spin_locked(&rq->lock); 932 assert_spin_locked(&rq->lock);
933 933
934 /* 934 /*
935 * preempt at: now + delay 935 * preempt at: now + delay
936 */ 936 */
937 rq->hrtick_expire = 937 rq->hrtick_expire =
938 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); 938 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
939 /* 939 /*
940 * indicate we need to program the timer 940 * indicate we need to program the timer
941 */ 941 */
942 __set_bit(HRTICK_SET, &rq->hrtick_flags); 942 __set_bit(HRTICK_SET, &rq->hrtick_flags);
943 if (reset) 943 if (reset)
944 __set_bit(HRTICK_RESET, &rq->hrtick_flags); 944 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
945 945
946 /* 946 /*
947 * New slices are called from the schedule path and don't need a 947 * New slices are called from the schedule path and don't need a
948 * forced reschedule. 948 * forced reschedule.
949 */ 949 */
950 if (reset) 950 if (reset)
951 resched_hrt(rq->curr); 951 resched_hrt(rq->curr);
952 } 952 }
953 953
954 static void hrtick_clear(struct rq *rq) 954 static void hrtick_clear(struct rq *rq)
955 { 955 {
956 if (hrtimer_active(&rq->hrtick_timer)) 956 if (hrtimer_active(&rq->hrtick_timer))
957 hrtimer_cancel(&rq->hrtick_timer); 957 hrtimer_cancel(&rq->hrtick_timer);
958 } 958 }
959 959
960 /* 960 /*
961 * Update the timer from the possible pending state. 961 * Update the timer from the possible pending state.
962 */ 962 */
963 static void hrtick_set(struct rq *rq) 963 static void hrtick_set(struct rq *rq)
964 { 964 {
965 ktime_t time; 965 ktime_t time;
966 int set, reset; 966 int set, reset;
967 unsigned long flags; 967 unsigned long flags;
968 968
969 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 969 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
970 970
971 spin_lock_irqsave(&rq->lock, flags); 971 spin_lock_irqsave(&rq->lock, flags);
972 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); 972 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
973 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); 973 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
974 time = rq->hrtick_expire; 974 time = rq->hrtick_expire;
975 clear_thread_flag(TIF_HRTICK_RESCHED); 975 clear_thread_flag(TIF_HRTICK_RESCHED);
976 spin_unlock_irqrestore(&rq->lock, flags); 976 spin_unlock_irqrestore(&rq->lock, flags);
977 977
978 if (set) { 978 if (set) {
979 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); 979 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
980 if (reset && !hrtimer_active(&rq->hrtick_timer)) 980 if (reset && !hrtimer_active(&rq->hrtick_timer))
981 resched_rq(rq); 981 resched_rq(rq);
982 } else 982 } else
983 hrtick_clear(rq); 983 hrtick_clear(rq);
984 } 984 }
985 985
986 /* 986 /*
987 * High-resolution timer tick. 987 * High-resolution timer tick.
988 * Runs from hardirq context with interrupts disabled. 988 * Runs from hardirq context with interrupts disabled.
989 */ 989 */
990 static enum hrtimer_restart hrtick(struct hrtimer *timer) 990 static enum hrtimer_restart hrtick(struct hrtimer *timer)
991 { 991 {
992 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 992 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
993 993
994 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 994 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
995 995
996 spin_lock(&rq->lock); 996 spin_lock(&rq->lock);
997 __update_rq_clock(rq); 997 __update_rq_clock(rq);
998 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 998 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
999 spin_unlock(&rq->lock); 999 spin_unlock(&rq->lock);
1000 1000
1001 return HRTIMER_NORESTART; 1001 return HRTIMER_NORESTART;
1002 } 1002 }
1003 1003
1004 static inline void init_rq_hrtick(struct rq *rq) 1004 static inline void init_rq_hrtick(struct rq *rq)
1005 { 1005 {
1006 rq->hrtick_flags = 0; 1006 rq->hrtick_flags = 0;
1007 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1007 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1008 rq->hrtick_timer.function = hrtick; 1008 rq->hrtick_timer.function = hrtick;
1009 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1009 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1010 } 1010 }
1011 1011
1012 void hrtick_resched(void) 1012 void hrtick_resched(void)
1013 { 1013 {
1014 struct rq *rq; 1014 struct rq *rq;
1015 unsigned long flags; 1015 unsigned long flags;
1016 1016
1017 if (!test_thread_flag(TIF_HRTICK_RESCHED)) 1017 if (!test_thread_flag(TIF_HRTICK_RESCHED))
1018 return; 1018 return;
1019 1019
1020 local_irq_save(flags); 1020 local_irq_save(flags);
1021 rq = cpu_rq(smp_processor_id()); 1021 rq = cpu_rq(smp_processor_id());
1022 hrtick_set(rq); 1022 hrtick_set(rq);
1023 local_irq_restore(flags); 1023 local_irq_restore(flags);
1024 } 1024 }
1025 #else 1025 #else
1026 static inline void hrtick_clear(struct rq *rq) 1026 static inline void hrtick_clear(struct rq *rq)
1027 { 1027 {
1028 } 1028 }
1029 1029
1030 static inline void hrtick_set(struct rq *rq) 1030 static inline void hrtick_set(struct rq *rq)
1031 { 1031 {
1032 } 1032 }
1033 1033
1034 static inline void init_rq_hrtick(struct rq *rq) 1034 static inline void init_rq_hrtick(struct rq *rq)
1035 { 1035 {
1036 } 1036 }
1037 1037
1038 void hrtick_resched(void) 1038 void hrtick_resched(void)
1039 { 1039 {
1040 } 1040 }
1041 #endif 1041 #endif
1042 1042
1043 /* 1043 /*
1044 * resched_task - mark a task 'to be rescheduled now'. 1044 * resched_task - mark a task 'to be rescheduled now'.
1045 * 1045 *
1046 * On UP this means the setting of the need_resched flag, on SMP it 1046 * On UP this means the setting of the need_resched flag, on SMP it
1047 * might also involve a cross-CPU call to trigger the scheduler on 1047 * might also involve a cross-CPU call to trigger the scheduler on
1048 * the target CPU. 1048 * the target CPU.
1049 */ 1049 */
1050 #ifdef CONFIG_SMP 1050 #ifdef CONFIG_SMP
1051 1051
1052 #ifndef tsk_is_polling 1052 #ifndef tsk_is_polling
1053 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1053 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1054 #endif 1054 #endif
1055 1055
1056 static void __resched_task(struct task_struct *p, int tif_bit) 1056 static void __resched_task(struct task_struct *p, int tif_bit)
1057 { 1057 {
1058 int cpu; 1058 int cpu;
1059 1059
1060 assert_spin_locked(&task_rq(p)->lock); 1060 assert_spin_locked(&task_rq(p)->lock);
1061 1061
1062 if (unlikely(test_tsk_thread_flag(p, tif_bit))) 1062 if (unlikely(test_tsk_thread_flag(p, tif_bit)))
1063 return; 1063 return;
1064 1064
1065 set_tsk_thread_flag(p, tif_bit); 1065 set_tsk_thread_flag(p, tif_bit);
1066 1066
1067 cpu = task_cpu(p); 1067 cpu = task_cpu(p);
1068 if (cpu == smp_processor_id()) 1068 if (cpu == smp_processor_id())
1069 return; 1069 return;
1070 1070
1071 /* NEED_RESCHED must be visible before we test polling */ 1071 /* NEED_RESCHED must be visible before we test polling */
1072 smp_mb(); 1072 smp_mb();
1073 if (!tsk_is_polling(p)) 1073 if (!tsk_is_polling(p))
1074 smp_send_reschedule(cpu); 1074 smp_send_reschedule(cpu);
1075 } 1075 }
1076 1076
1077 static void resched_cpu(int cpu) 1077 static void resched_cpu(int cpu)
1078 { 1078 {
1079 struct rq *rq = cpu_rq(cpu); 1079 struct rq *rq = cpu_rq(cpu);
1080 unsigned long flags; 1080 unsigned long flags;
1081 1081
1082 if (!spin_trylock_irqsave(&rq->lock, flags)) 1082 if (!spin_trylock_irqsave(&rq->lock, flags))
1083 return; 1083 return;
1084 resched_task(cpu_curr(cpu)); 1084 resched_task(cpu_curr(cpu));
1085 spin_unlock_irqrestore(&rq->lock, flags); 1085 spin_unlock_irqrestore(&rq->lock, flags);
1086 } 1086 }
1087 #else 1087 #else
1088 static void __resched_task(struct task_struct *p, int tif_bit) 1088 static void __resched_task(struct task_struct *p, int tif_bit)
1089 { 1089 {
1090 assert_spin_locked(&task_rq(p)->lock); 1090 assert_spin_locked(&task_rq(p)->lock);
1091 set_tsk_thread_flag(p, tif_bit); 1091 set_tsk_thread_flag(p, tif_bit);
1092 } 1092 }
1093 #endif 1093 #endif
1094 1094
1095 #if BITS_PER_LONG == 32 1095 #if BITS_PER_LONG == 32
1096 # define WMULT_CONST (~0UL) 1096 # define WMULT_CONST (~0UL)
1097 #else 1097 #else
1098 # define WMULT_CONST (1UL << 32) 1098 # define WMULT_CONST (1UL << 32)
1099 #endif 1099 #endif
1100 1100
1101 #define WMULT_SHIFT 32 1101 #define WMULT_SHIFT 32
1102 1102
1103 /* 1103 /*
1104 * Shift right and round: 1104 * Shift right and round:
1105 */ 1105 */
1106 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1106 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1107 1107
1108 static unsigned long 1108 static unsigned long
1109 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1109 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1110 struct load_weight *lw) 1110 struct load_weight *lw)
1111 { 1111 {
1112 u64 tmp; 1112 u64 tmp;
1113 1113
1114 if (unlikely(!lw->inv_weight)) 1114 if (unlikely(!lw->inv_weight))
1115 lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; 1115 lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
1116 1116
1117 tmp = (u64)delta_exec * weight; 1117 tmp = (u64)delta_exec * weight;
1118 /* 1118 /*
1119 * Check whether we'd overflow the 64-bit multiplication: 1119 * Check whether we'd overflow the 64-bit multiplication:
1120 */ 1120 */
1121 if (unlikely(tmp > WMULT_CONST)) 1121 if (unlikely(tmp > WMULT_CONST))
1122 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1122 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1123 WMULT_SHIFT/2); 1123 WMULT_SHIFT/2);
1124 else 1124 else
1125 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1125 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1126 1126
1127 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1127 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1128 } 1128 }
1129 1129
1130 static inline unsigned long 1130 static inline unsigned long
1131 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) 1131 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1132 { 1132 {
1133 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); 1133 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1134 } 1134 }
1135 1135
1136 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1136 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1137 { 1137 {
1138 lw->weight += inc; 1138 lw->weight += inc;
1139 } 1139 }
1140 1140
1141 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1141 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1142 { 1142 {
1143 lw->weight -= dec; 1143 lw->weight -= dec;
1144 } 1144 }
1145 1145
1146 /* 1146 /*
1147 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1147 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1148 * of tasks with abnormal "nice" values across CPUs, the contribution that 1148 * of tasks with abnormal "nice" values across CPUs, the contribution that
1149 * each task makes to its run queue's load is weighted according to its 1149 * each task makes to its run queue's load is weighted according to its
1150 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1150 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1151 * scaled version of the new time slice allocation that they receive on time 1151 * scaled version of the new time slice allocation that they receive on time
1152 * slice expiry etc. 1152 * slice expiry etc.
1153 */ 1153 */
1154 1154
1155 #define WEIGHT_IDLEPRIO 2 1155 #define WEIGHT_IDLEPRIO 2
1156 #define WMULT_IDLEPRIO (1 << 31) 1156 #define WMULT_IDLEPRIO (1 << 31)
1157 1157
1158 /* 1158 /*
1159 * Nice levels are multiplicative, with a gentle 10% change for every 1159 * Nice levels are multiplicative, with a gentle 10% change for every
1160 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1160 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1161 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1161 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1162 * that remained on nice 0. 1162 * that remained on nice 0.
1163 * 1163 *
1164 * The "10% effect" is relative and cumulative: from _any_ nice level, 1164 * The "10% effect" is relative and cumulative: from _any_ nice level,
1165 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1165 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1166 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1166 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1167 * If a task goes up by ~10% and another task goes down by ~10% then 1167 * If a task goes up by ~10% and another task goes down by ~10% then
1168 * the relative distance between them is ~25%.) 1168 * the relative distance between them is ~25%.)
1169 */ 1169 */
1170 static const int prio_to_weight[40] = { 1170 static const int prio_to_weight[40] = {
1171 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1171 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1172 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1172 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1173 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1173 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1174 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1174 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1175 /* 0 */ 1024, 820, 655, 526, 423, 1175 /* 0 */ 1024, 820, 655, 526, 423,
1176 /* 5 */ 335, 272, 215, 172, 137, 1176 /* 5 */ 335, 272, 215, 172, 137,
1177 /* 10 */ 110, 87, 70, 56, 45, 1177 /* 10 */ 110, 87, 70, 56, 45,
1178 /* 15 */ 36, 29, 23, 18, 15, 1178 /* 15 */ 36, 29, 23, 18, 15,
1179 }; 1179 };
1180 1180
1181 /* 1181 /*
1182 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1182 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1183 * 1183 *
1184 * In cases where the weight does not change often, we can use the 1184 * In cases where the weight does not change often, we can use the
1185 * precalculated inverse to speed up arithmetic by turning divisions 1185 * precalculated inverse to speed up arithmetic by turning divisions
1186 * into multiplications: 1186 * into multiplications:
1187 */ 1187 */
1188 static const u32 prio_to_wmult[40] = { 1188 static const u32 prio_to_wmult[40] = {
1189 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1189 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1190 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1190 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1191 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1191 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1192 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1192 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1193 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1193 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1194 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1194 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1195 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1195 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1196 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1196 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1197 }; 1197 };
1198 1198
1199 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); 1199 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1200 1200
1201 /* 1201 /*
1202 * runqueue iterator, to support SMP load-balancing between different 1202 * runqueue iterator, to support SMP load-balancing between different
1203 * scheduling classes, without having to expose their internal data 1203 * scheduling classes, without having to expose their internal data
1204 * structures to the load-balancing proper: 1204 * structures to the load-balancing proper:
1205 */ 1205 */
1206 struct rq_iterator { 1206 struct rq_iterator {
1207 void *arg; 1207 void *arg;
1208 struct task_struct *(*start)(void *); 1208 struct task_struct *(*start)(void *);
1209 struct task_struct *(*next)(void *); 1209 struct task_struct *(*next)(void *);
1210 }; 1210 };
1211 1211
1212 #ifdef CONFIG_SMP 1212 #ifdef CONFIG_SMP
1213 static unsigned long 1213 static unsigned long
1214 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 1214 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1215 unsigned long max_load_move, struct sched_domain *sd, 1215 unsigned long max_load_move, struct sched_domain *sd,
1216 enum cpu_idle_type idle, int *all_pinned, 1216 enum cpu_idle_type idle, int *all_pinned,
1217 int *this_best_prio, struct rq_iterator *iterator); 1217 int *this_best_prio, struct rq_iterator *iterator);
1218 1218
1219 static int 1219 static int
1220 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 1220 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1221 struct sched_domain *sd, enum cpu_idle_type idle, 1221 struct sched_domain *sd, enum cpu_idle_type idle,
1222 struct rq_iterator *iterator); 1222 struct rq_iterator *iterator);
1223 #endif 1223 #endif
1224 1224
1225 #ifdef CONFIG_CGROUP_CPUACCT 1225 #ifdef CONFIG_CGROUP_CPUACCT
1226 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1226 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1227 #else 1227 #else
1228 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1228 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1229 #endif 1229 #endif
1230 1230
1231 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1231 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1232 { 1232 {
1233 update_load_add(&rq->load, load); 1233 update_load_add(&rq->load, load);
1234 } 1234 }
1235 1235
1236 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1236 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1237 { 1237 {
1238 update_load_sub(&rq->load, load); 1238 update_load_sub(&rq->load, load);
1239 } 1239 }
1240 1240
1241 #ifdef CONFIG_SMP 1241 #ifdef CONFIG_SMP
1242 static unsigned long source_load(int cpu, int type); 1242 static unsigned long source_load(int cpu, int type);
1243 static unsigned long target_load(int cpu, int type); 1243 static unsigned long target_load(int cpu, int type);
1244 static unsigned long cpu_avg_load_per_task(int cpu); 1244 static unsigned long cpu_avg_load_per_task(int cpu);
1245 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1245 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1246 #endif /* CONFIG_SMP */ 1246 #endif /* CONFIG_SMP */
1247 1247
1248 #include "sched_stats.h" 1248 #include "sched_stats.h"
1249 #include "sched_idletask.c" 1249 #include "sched_idletask.c"
1250 #include "sched_fair.c" 1250 #include "sched_fair.c"
1251 #include "sched_rt.c" 1251 #include "sched_rt.c"
1252 #ifdef CONFIG_SCHED_DEBUG 1252 #ifdef CONFIG_SCHED_DEBUG
1253 # include "sched_debug.c" 1253 # include "sched_debug.c"
1254 #endif 1254 #endif
1255 1255
1256 #define sched_class_highest (&rt_sched_class) 1256 #define sched_class_highest (&rt_sched_class)
1257 1257
1258 static void inc_nr_running(struct task_struct *p, struct rq *rq) 1258 static void inc_nr_running(struct task_struct *p, struct rq *rq)
1259 { 1259 {
1260 rq->nr_running++; 1260 rq->nr_running++;
1261 } 1261 }
1262 1262
1263 static void dec_nr_running(struct task_struct *p, struct rq *rq) 1263 static void dec_nr_running(struct task_struct *p, struct rq *rq)
1264 { 1264 {
1265 rq->nr_running--; 1265 rq->nr_running--;
1266 } 1266 }
1267 1267
1268 static void set_load_weight(struct task_struct *p) 1268 static void set_load_weight(struct task_struct *p)
1269 { 1269 {
1270 if (task_has_rt_policy(p)) { 1270 if (task_has_rt_policy(p)) {
1271 p->se.load.weight = prio_to_weight[0] * 2; 1271 p->se.load.weight = prio_to_weight[0] * 2;
1272 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1272 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1273 return; 1273 return;
1274 } 1274 }
1275 1275
1276 /* 1276 /*
1277 * SCHED_IDLE tasks get minimal weight: 1277 * SCHED_IDLE tasks get minimal weight:
1278 */ 1278 */
1279 if (p->policy == SCHED_IDLE) { 1279 if (p->policy == SCHED_IDLE) {
1280 p->se.load.weight = WEIGHT_IDLEPRIO; 1280 p->se.load.weight = WEIGHT_IDLEPRIO;
1281 p->se.load.inv_weight = WMULT_IDLEPRIO; 1281 p->se.load.inv_weight = WMULT_IDLEPRIO;
1282 return; 1282 return;
1283 } 1283 }
1284 1284
1285 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1285 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1286 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1286 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1287 } 1287 }
1288 1288
1289 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1289 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1290 { 1290 {
1291 sched_info_queued(p); 1291 sched_info_queued(p);
1292 p->sched_class->enqueue_task(rq, p, wakeup); 1292 p->sched_class->enqueue_task(rq, p, wakeup);
1293 p->se.on_rq = 1; 1293 p->se.on_rq = 1;
1294 } 1294 }
1295 1295
1296 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1296 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1297 { 1297 {
1298 p->sched_class->dequeue_task(rq, p, sleep); 1298 p->sched_class->dequeue_task(rq, p, sleep);
1299 p->se.on_rq = 0; 1299 p->se.on_rq = 0;
1300 } 1300 }
1301 1301
1302 /* 1302 /*
1303 * __normal_prio - return the priority that is based on the static prio 1303 * __normal_prio - return the priority that is based on the static prio
1304 */ 1304 */
1305 static inline int __normal_prio(struct task_struct *p) 1305 static inline int __normal_prio(struct task_struct *p)
1306 { 1306 {
1307 return p->static_prio; 1307 return p->static_prio;
1308 } 1308 }
1309 1309
1310 /* 1310 /*
1311 * Calculate the expected normal priority: i.e. priority 1311 * Calculate the expected normal priority: i.e. priority
1312 * without taking RT-inheritance into account. Might be 1312 * without taking RT-inheritance into account. Might be
1313 * boosted by interactivity modifiers. Changes upon fork, 1313 * boosted by interactivity modifiers. Changes upon fork,
1314 * setprio syscalls, and whenever the interactivity 1314 * setprio syscalls, and whenever the interactivity
1315 * estimator recalculates. 1315 * estimator recalculates.
1316 */ 1316 */
1317 static inline int normal_prio(struct task_struct *p) 1317 static inline int normal_prio(struct task_struct *p)
1318 { 1318 {
1319 int prio; 1319 int prio;
1320 1320
1321 if (task_has_rt_policy(p)) 1321 if (task_has_rt_policy(p))
1322 prio = MAX_RT_PRIO-1 - p->rt_priority; 1322 prio = MAX_RT_PRIO-1 - p->rt_priority;
1323 else 1323 else
1324 prio = __normal_prio(p); 1324 prio = __normal_prio(p);
1325 return prio; 1325 return prio;
1326 } 1326 }
1327 1327
1328 /* 1328 /*
1329 * Calculate the current priority, i.e. the priority 1329 * Calculate the current priority, i.e. the priority
1330 * taken into account by the scheduler. This value might 1330 * taken into account by the scheduler. This value might
1331 * be boosted by RT tasks, or might be boosted by 1331 * be boosted by RT tasks, or might be boosted by
1332 * interactivity modifiers. Will be RT if the task got 1332 * interactivity modifiers. Will be RT if the task got
1333 * RT-boosted. If not then it returns p->normal_prio. 1333 * RT-boosted. If not then it returns p->normal_prio.
1334 */ 1334 */
1335 static int effective_prio(struct task_struct *p) 1335 static int effective_prio(struct task_struct *p)
1336 { 1336 {
1337 p->normal_prio = normal_prio(p); 1337 p->normal_prio = normal_prio(p);
1338 /* 1338 /*
1339 * If we are RT tasks or we were boosted to RT priority, 1339 * If we are RT tasks or we were boosted to RT priority,
1340 * keep the priority unchanged. Otherwise, update priority 1340 * keep the priority unchanged. Otherwise, update priority
1341 * to the normal priority: 1341 * to the normal priority:
1342 */ 1342 */
1343 if (!rt_prio(p->prio)) 1343 if (!rt_prio(p->prio))
1344 return p->normal_prio; 1344 return p->normal_prio;
1345 return p->prio; 1345 return p->prio;
1346 } 1346 }
1347 1347
1348 /* 1348 /*
1349 * activate_task - move a task to the runqueue. 1349 * activate_task - move a task to the runqueue.
1350 */ 1350 */
1351 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1351 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1352 { 1352 {
1353 if (p->state == TASK_UNINTERRUPTIBLE) 1353 if (p->state == TASK_UNINTERRUPTIBLE)
1354 rq->nr_uninterruptible--; 1354 rq->nr_uninterruptible--;
1355 1355
1356 enqueue_task(rq, p, wakeup); 1356 enqueue_task(rq, p, wakeup);
1357 inc_nr_running(p, rq); 1357 inc_nr_running(p, rq);
1358 } 1358 }
1359 1359
1360 /* 1360 /*
1361 * deactivate_task - remove a task from the runqueue. 1361 * deactivate_task - remove a task from the runqueue.
1362 */ 1362 */
1363 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1363 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1364 { 1364 {
1365 if (p->state == TASK_UNINTERRUPTIBLE) 1365 if (p->state == TASK_UNINTERRUPTIBLE)
1366 rq->nr_uninterruptible++; 1366 rq->nr_uninterruptible++;
1367 1367
1368 dequeue_task(rq, p, sleep); 1368 dequeue_task(rq, p, sleep);
1369 dec_nr_running(p, rq); 1369 dec_nr_running(p, rq);
1370 } 1370 }
1371 1371
1372 /** 1372 /**
1373 * task_curr - is this task currently executing on a CPU? 1373 * task_curr - is this task currently executing on a CPU?
1374 * @p: the task in question. 1374 * @p: the task in question.
1375 */ 1375 */
1376 inline int task_curr(const struct task_struct *p) 1376 inline int task_curr(const struct task_struct *p)
1377 { 1377 {
1378 return cpu_curr(task_cpu(p)) == p; 1378 return cpu_curr(task_cpu(p)) == p;
1379 } 1379 }
1380 1380
1381 /* Used instead of source_load when we know the type == 0 */ 1381 /* Used instead of source_load when we know the type == 0 */
1382 unsigned long weighted_cpuload(const int cpu) 1382 unsigned long weighted_cpuload(const int cpu)
1383 { 1383 {
1384 return cpu_rq(cpu)->load.weight; 1384 return cpu_rq(cpu)->load.weight;
1385 } 1385 }
1386 1386
1387 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1387 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1388 { 1388 {
1389 set_task_rq(p, cpu); 1389 set_task_rq(p, cpu);
1390 #ifdef CONFIG_SMP 1390 #ifdef CONFIG_SMP
1391 /* 1391 /*
1392 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1392 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1393 * successfully executed on another CPU. We must ensure that updates of 1393 * successfully executed on another CPU. We must ensure that updates of
1394 * per-task data have been completed by this moment. 1394 * per-task data have been completed by this moment.
1395 */ 1395 */
1396 smp_wmb(); 1396 smp_wmb();
1397 task_thread_info(p)->cpu = cpu; 1397 task_thread_info(p)->cpu = cpu;
1398 #endif 1398 #endif
1399 } 1399 }
1400 1400
1401 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1401 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1402 const struct sched_class *prev_class, 1402 const struct sched_class *prev_class,
1403 int oldprio, int running) 1403 int oldprio, int running)
1404 { 1404 {
1405 if (prev_class != p->sched_class) { 1405 if (prev_class != p->sched_class) {
1406 if (prev_class->switched_from) 1406 if (prev_class->switched_from)
1407 prev_class->switched_from(rq, p, running); 1407 prev_class->switched_from(rq, p, running);
1408 p->sched_class->switched_to(rq, p, running); 1408 p->sched_class->switched_to(rq, p, running);
1409 } else 1409 } else
1410 p->sched_class->prio_changed(rq, p, oldprio, running); 1410 p->sched_class->prio_changed(rq, p, oldprio, running);
1411 } 1411 }
1412 1412
1413 #ifdef CONFIG_SMP 1413 #ifdef CONFIG_SMP
1414 1414
1415 /* 1415 /*
1416 * Is this task likely cache-hot: 1416 * Is this task likely cache-hot:
1417 */ 1417 */
1418 static int 1418 static int
1419 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1419 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1420 { 1420 {
1421 s64 delta; 1421 s64 delta;
1422 1422
1423 if (p->sched_class != &fair_sched_class) 1423 if (p->sched_class != &fair_sched_class)
1424 return 0; 1424 return 0;
1425 1425
1426 if (sysctl_sched_migration_cost == -1) 1426 if (sysctl_sched_migration_cost == -1)
1427 return 1; 1427 return 1;
1428 if (sysctl_sched_migration_cost == 0) 1428 if (sysctl_sched_migration_cost == 0)
1429 return 0; 1429 return 0;
1430 1430
1431 delta = now - p->se.exec_start; 1431 delta = now - p->se.exec_start;
1432 1432
1433 return delta < (s64)sysctl_sched_migration_cost; 1433 return delta < (s64)sysctl_sched_migration_cost;
1434 } 1434 }
1435 1435
1436 1436
1437 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1437 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1438 { 1438 {
1439 int old_cpu = task_cpu(p); 1439 int old_cpu = task_cpu(p);
1440 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 1440 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1441 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 1441 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1442 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 1442 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1443 u64 clock_offset; 1443 u64 clock_offset;
1444 1444
1445 clock_offset = old_rq->clock - new_rq->clock; 1445 clock_offset = old_rq->clock - new_rq->clock;
1446 1446
1447 #ifdef CONFIG_SCHEDSTATS 1447 #ifdef CONFIG_SCHEDSTATS
1448 if (p->se.wait_start) 1448 if (p->se.wait_start)
1449 p->se.wait_start -= clock_offset; 1449 p->se.wait_start -= clock_offset;
1450 if (p->se.sleep_start) 1450 if (p->se.sleep_start)
1451 p->se.sleep_start -= clock_offset; 1451 p->se.sleep_start -= clock_offset;
1452 if (p->se.block_start) 1452 if (p->se.block_start)
1453 p->se.block_start -= clock_offset; 1453 p->se.block_start -= clock_offset;
1454 if (old_cpu != new_cpu) { 1454 if (old_cpu != new_cpu) {
1455 schedstat_inc(p, se.nr_migrations); 1455 schedstat_inc(p, se.nr_migrations);
1456 if (task_hot(p, old_rq->clock, NULL)) 1456 if (task_hot(p, old_rq->clock, NULL))
1457 schedstat_inc(p, se.nr_forced2_migrations); 1457 schedstat_inc(p, se.nr_forced2_migrations);
1458 } 1458 }
1459 #endif 1459 #endif
1460 p->se.vruntime -= old_cfsrq->min_vruntime - 1460 p->se.vruntime -= old_cfsrq->min_vruntime -
1461 new_cfsrq->min_vruntime; 1461 new_cfsrq->min_vruntime;
1462 1462
1463 __set_task_cpu(p, new_cpu); 1463 __set_task_cpu(p, new_cpu);
1464 } 1464 }
1465 1465
1466 struct migration_req { 1466 struct migration_req {
1467 struct list_head list; 1467 struct list_head list;
1468 1468
1469 struct task_struct *task; 1469 struct task_struct *task;
1470 int dest_cpu; 1470 int dest_cpu;
1471 1471
1472 struct completion done; 1472 struct completion done;
1473 }; 1473 };
1474 1474
1475 /* 1475 /*
1476 * The task's runqueue lock must be held. 1476 * The task's runqueue lock must be held.
1477 * Returns true if you have to wait for migration thread. 1477 * Returns true if you have to wait for migration thread.
1478 */ 1478 */
1479 static int 1479 static int
1480 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 1480 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1481 { 1481 {
1482 struct rq *rq = task_rq(p); 1482 struct rq *rq = task_rq(p);
1483 1483
1484 /* 1484 /*
1485 * If the task is not on a runqueue (and not running), then 1485 * If the task is not on a runqueue (and not running), then
1486 * it is sufficient to simply update the task's cpu field. 1486 * it is sufficient to simply update the task's cpu field.
1487 */ 1487 */
1488 if (!p->se.on_rq && !task_running(rq, p)) { 1488 if (!p->se.on_rq && !task_running(rq, p)) {
1489 set_task_cpu(p, dest_cpu); 1489 set_task_cpu(p, dest_cpu);
1490 return 0; 1490 return 0;
1491 } 1491 }
1492 1492
1493 init_completion(&req->done); 1493 init_completion(&req->done);
1494 req->task = p; 1494 req->task = p;
1495 req->dest_cpu = dest_cpu; 1495 req->dest_cpu = dest_cpu;
1496 list_add(&req->list, &rq->migration_queue); 1496 list_add(&req->list, &rq->migration_queue);
1497 1497
1498 return 1; 1498 return 1;
1499 } 1499 }
1500 1500
1501 /* 1501 /*
1502 * wait_task_inactive - wait for a thread to unschedule. 1502 * wait_task_inactive - wait for a thread to unschedule.
1503 * 1503 *
1504 * The caller must ensure that the task *will* unschedule sometime soon, 1504 * The caller must ensure that the task *will* unschedule sometime soon,
1505 * else this function might spin for a *long* time. This function can't 1505 * else this function might spin for a *long* time. This function can't
1506 * be called with interrupts off, or it may introduce deadlock with 1506 * be called with interrupts off, or it may introduce deadlock with
1507 * smp_call_function() if an IPI is sent by the same process we are 1507 * smp_call_function() if an IPI is sent by the same process we are
1508 * waiting to become inactive. 1508 * waiting to become inactive.
1509 */ 1509 */
1510 void wait_task_inactive(struct task_struct *p) 1510 void wait_task_inactive(struct task_struct *p)
1511 { 1511 {
1512 unsigned long flags; 1512 unsigned long flags;
1513 int running, on_rq; 1513 int running, on_rq;
1514 struct rq *rq; 1514 struct rq *rq;
1515 1515
1516 for (;;) { 1516 for (;;) {
1517 /* 1517 /*
1518 * We do the initial early heuristics without holding 1518 * We do the initial early heuristics without holding
1519 * any task-queue locks at all. We'll only try to get 1519 * any task-queue locks at all. We'll only try to get
1520 * the runqueue lock when things look like they will 1520 * the runqueue lock when things look like they will
1521 * work out! 1521 * work out!
1522 */ 1522 */
1523 rq = task_rq(p); 1523 rq = task_rq(p);
1524 1524
1525 /* 1525 /*
1526 * If the task is actively running on another CPU 1526 * If the task is actively running on another CPU
1527 * still, just relax and busy-wait without holding 1527 * still, just relax and busy-wait without holding
1528 * any locks. 1528 * any locks.
1529 * 1529 *
1530 * NOTE! Since we don't hold any locks, it's not 1530 * NOTE! Since we don't hold any locks, it's not
1531 * even certain that "rq" stays as the right runqueue! 1531 * even certain that "rq" stays as the right runqueue!
1532 * But we don't care, since "task_running()" will 1532 * But we don't care, since "task_running()" will
1533 * return false if the runqueue has changed and p 1533 * return false if the runqueue has changed and p
1534 * is actually now running somewhere else! 1534 * is actually now running somewhere else!
1535 */ 1535 */
1536 while (task_running(rq, p)) 1536 while (task_running(rq, p))
1537 cpu_relax(); 1537 cpu_relax();
1538 1538
1539 /* 1539 /*
1540 * Ok, time to look more closely! We need the rq 1540 * Ok, time to look more closely! We need the rq
1541 * lock now, to be *sure*. If we're wrong, we'll 1541 * lock now, to be *sure*. If we're wrong, we'll
1542 * just go back and repeat. 1542 * just go back and repeat.
1543 */ 1543 */
1544 rq = task_rq_lock(p, &flags); 1544 rq = task_rq_lock(p, &flags);
1545 running = task_running(rq, p); 1545 running = task_running(rq, p);
1546 on_rq = p->se.on_rq; 1546 on_rq = p->se.on_rq;
1547 task_rq_unlock(rq, &flags); 1547 task_rq_unlock(rq, &flags);
1548 1548
1549 /* 1549 /*
1550 * Was it really running after all now that we 1550 * Was it really running after all now that we
1551 * checked with the proper locks actually held? 1551 * checked with the proper locks actually held?
1552 * 1552 *
1553 * Oops. Go back and try again. 1553 * Oops. Go back and try again.
1554 */ 1554 */
1555 if (unlikely(running)) { 1555 if (unlikely(running)) {
1556 cpu_relax(); 1556 cpu_relax();
1557 continue; 1557 continue;
1558 } 1558 }
1559 1559
1560 /* 1560 /*
1561 * It's not enough that it's not actively running, 1561 * It's not enough that it's not actively running,
1562 * it must be off the runqueue _entirely_, and not 1562 * it must be off the runqueue _entirely_, and not
1563 * preempted! 1563 * preempted!
1564 * 1564 *
1565 * So if it was still runnable (but just not actively 1565 * So if it was still runnable (but just not actively
1566 * running right now), it's preempted, and we should 1566 * running right now), it's preempted, and we should
1567 * yield - it could be a while. 1567 * yield - it could be a while.
1568 */ 1568 */
1569 if (unlikely(on_rq)) { 1569 if (unlikely(on_rq)) {
1570 schedule_timeout_uninterruptible(1); 1570 schedule_timeout_uninterruptible(1);
1571 continue; 1571 continue;
1572 } 1572 }
1573 1573
1574 /* 1574 /*
1575 * Ahh, all good. It wasn't running, and it wasn't 1575 * Ahh, all good. It wasn't running, and it wasn't
1576 * runnable, which means that it will never become 1576 * runnable, which means that it will never become
1577 * running in the future either. We're all done! 1577 * running in the future either. We're all done!
1578 */ 1578 */
1579 break; 1579 break;
1580 } 1580 }
1581 } 1581 }
1582 1582
1583 /*** 1583 /***
1584 * kick_process - kick a running thread to enter/exit the kernel 1584 * kick_process - kick a running thread to enter/exit the kernel
1585 * @p: the to-be-kicked thread 1585 * @p: the to-be-kicked thread
1586 * 1586 *
1587 * Cause a process which is running on another CPU to enter 1587 * Cause a process which is running on another CPU to enter
1588 * kernel-mode, without any delay. (to get signals handled.) 1588 * kernel-mode, without any delay. (to get signals handled.)
1589 * 1589 *
1590 * NOTE: this function doesn't have to take the runqueue lock, 1590 * NOTE: this function doesn't have to take the runqueue lock,
1591 * because all it wants to ensure is that the remote task enters 1591 * because all it wants to ensure is that the remote task enters
1592 * the kernel. If the IPI races and the task has been migrated 1592 * the kernel. If the IPI races and the task has been migrated
1593 * to another CPU then no harm is done and the purpose has been 1593 * to another CPU then no harm is done and the purpose has been
1594 * achieved as well. 1594 * achieved as well.
1595 */ 1595 */
1596 void kick_process(struct task_struct *p) 1596 void kick_process(struct task_struct *p)
1597 { 1597 {
1598 int cpu; 1598 int cpu;
1599 1599
1600 preempt_disable(); 1600 preempt_disable();
1601 cpu = task_cpu(p); 1601 cpu = task_cpu(p);
1602 if ((cpu != smp_processor_id()) && task_curr(p)) 1602 if ((cpu != smp_processor_id()) && task_curr(p))
1603 smp_send_reschedule(cpu); 1603 smp_send_reschedule(cpu);
1604 preempt_enable(); 1604 preempt_enable();
1605 } 1605 }
1606 1606
1607 /* 1607 /*
1608 * Return a low guess at the load of a migration-source cpu weighted 1608 * Return a low guess at the load of a migration-source cpu weighted
1609 * according to the scheduling class and "nice" value. 1609 * according to the scheduling class and "nice" value.
1610 * 1610 *
1611 * We want to under-estimate the load of migration sources, to 1611 * We want to under-estimate the load of migration sources, to
1612 * balance conservatively. 1612 * balance conservatively.
1613 */ 1613 */
1614 static unsigned long source_load(int cpu, int type) 1614 static unsigned long source_load(int cpu, int type)
1615 { 1615 {
1616 struct rq *rq = cpu_rq(cpu); 1616 struct rq *rq = cpu_rq(cpu);
1617 unsigned long total = weighted_cpuload(cpu); 1617 unsigned long total = weighted_cpuload(cpu);
1618 1618
1619 if (type == 0) 1619 if (type == 0)
1620 return total; 1620 return total;
1621 1621
1622 return min(rq->cpu_load[type-1], total); 1622 return min(rq->cpu_load[type-1], total);
1623 } 1623 }
1624 1624
1625 /* 1625 /*
1626 * Return a high guess at the load of a migration-target cpu weighted 1626 * Return a high guess at the load of a migration-target cpu weighted
1627 * according to the scheduling class and "nice" value. 1627 * according to the scheduling class and "nice" value.
1628 */ 1628 */
1629 static unsigned long target_load(int cpu, int type) 1629 static unsigned long target_load(int cpu, int type)
1630 { 1630 {
1631 struct rq *rq = cpu_rq(cpu); 1631 struct rq *rq = cpu_rq(cpu);
1632 unsigned long total = weighted_cpuload(cpu); 1632 unsigned long total = weighted_cpuload(cpu);
1633 1633
1634 if (type == 0) 1634 if (type == 0)
1635 return total; 1635 return total;
1636 1636
1637 return max(rq->cpu_load[type-1], total); 1637 return max(rq->cpu_load[type-1], total);
1638 } 1638 }
1639 1639
1640 /* 1640 /*
1641 * Return the average load per task on the cpu's run queue 1641 * Return the average load per task on the cpu's run queue
1642 */ 1642 */
1643 static unsigned long cpu_avg_load_per_task(int cpu) 1643 static unsigned long cpu_avg_load_per_task(int cpu)
1644 { 1644 {
1645 struct rq *rq = cpu_rq(cpu); 1645 struct rq *rq = cpu_rq(cpu);
1646 unsigned long total = weighted_cpuload(cpu); 1646 unsigned long total = weighted_cpuload(cpu);
1647 unsigned long n = rq->nr_running; 1647 unsigned long n = rq->nr_running;
1648 1648
1649 return n ? total / n : SCHED_LOAD_SCALE; 1649 return n ? total / n : SCHED_LOAD_SCALE;
1650 } 1650 }
1651 1651
1652 /* 1652 /*
1653 * find_idlest_group finds and returns the least busy CPU group within the 1653 * find_idlest_group finds and returns the least busy CPU group within the
1654 * domain. 1654 * domain.
1655 */ 1655 */
1656 static struct sched_group * 1656 static struct sched_group *
1657 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) 1657 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1658 { 1658 {
1659 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1659 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1660 unsigned long min_load = ULONG_MAX, this_load = 0; 1660 unsigned long min_load = ULONG_MAX, this_load = 0;
1661 int load_idx = sd->forkexec_idx; 1661 int load_idx = sd->forkexec_idx;
1662 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1662 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1663 1663
1664 do { 1664 do {
1665 unsigned long load, avg_load; 1665 unsigned long load, avg_load;
1666 int local_group; 1666 int local_group;
1667 int i; 1667 int i;
1668 1668
1669 /* Skip over this group if it has no CPUs allowed */ 1669 /* Skip over this group if it has no CPUs allowed */
1670 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 1670 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1671 continue; 1671 continue;
1672 1672
1673 local_group = cpu_isset(this_cpu, group->cpumask); 1673 local_group = cpu_isset(this_cpu, group->cpumask);
1674 1674
1675 /* Tally up the load of all CPUs in the group */ 1675 /* Tally up the load of all CPUs in the group */
1676 avg_load = 0; 1676 avg_load = 0;
1677 1677
1678 for_each_cpu_mask(i, group->cpumask) { 1678 for_each_cpu_mask(i, group->cpumask) {
1679 /* Bias balancing toward cpus of our domain */ 1679 /* Bias balancing toward cpus of our domain */
1680 if (local_group) 1680 if (local_group)
1681 load = source_load(i, load_idx); 1681 load = source_load(i, load_idx);
1682 else 1682 else
1683 load = target_load(i, load_idx); 1683 load = target_load(i, load_idx);
1684 1684
1685 avg_load += load; 1685 avg_load += load;
1686 } 1686 }
1687 1687
1688 /* Adjust by relative CPU power of the group */ 1688 /* Adjust by relative CPU power of the group */
1689 avg_load = sg_div_cpu_power(group, 1689 avg_load = sg_div_cpu_power(group,
1690 avg_load * SCHED_LOAD_SCALE); 1690 avg_load * SCHED_LOAD_SCALE);
1691 1691
1692 if (local_group) { 1692 if (local_group) {
1693 this_load = avg_load; 1693 this_load = avg_load;
1694 this = group; 1694 this = group;
1695 } else if (avg_load < min_load) { 1695 } else if (avg_load < min_load) {
1696 min_load = avg_load; 1696 min_load = avg_load;
1697 idlest = group; 1697 idlest = group;
1698 } 1698 }
1699 } while (group = group->next, group != sd->groups); 1699 } while (group = group->next, group != sd->groups);
1700 1700
1701 if (!idlest || 100*this_load < imbalance*min_load) 1701 if (!idlest || 100*this_load < imbalance*min_load)
1702 return NULL; 1702 return NULL;
1703 return idlest; 1703 return idlest;
1704 } 1704 }
1705 1705
1706 /* 1706 /*
1707 * find_idlest_cpu - find the idlest cpu among the cpus in group. 1707 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1708 */ 1708 */
1709 static int 1709 static int
1710 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 1710 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1711 { 1711 {
1712 cpumask_t tmp; 1712 cpumask_t tmp;
1713 unsigned long load, min_load = ULONG_MAX; 1713 unsigned long load, min_load = ULONG_MAX;
1714 int idlest = -1; 1714 int idlest = -1;
1715 int i; 1715 int i;
1716 1716
1717 /* Traverse only the allowed CPUs */ 1717 /* Traverse only the allowed CPUs */
1718 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1718 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1719 1719
1720 for_each_cpu_mask(i, tmp) { 1720 for_each_cpu_mask(i, tmp) {
1721 load = weighted_cpuload(i); 1721 load = weighted_cpuload(i);
1722 1722
1723 if (load < min_load || (load == min_load && i == this_cpu)) { 1723 if (load < min_load || (load == min_load && i == this_cpu)) {
1724 min_load = load; 1724 min_load = load;
1725 idlest = i; 1725 idlest = i;
1726 } 1726 }
1727 } 1727 }
1728 1728
1729 return idlest; 1729 return idlest;
1730 } 1730 }
1731 1731
1732 /* 1732 /*
1733 * sched_balance_self: balance the current task (running on cpu) in domains 1733 * sched_balance_self: balance the current task (running on cpu) in domains
1734 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1734 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1735 * SD_BALANCE_EXEC. 1735 * SD_BALANCE_EXEC.
1736 * 1736 *
1737 * Balance, i.e. select the least loaded group. 1737 * Balance, i.e. select the least loaded group.
1738 * 1738 *
1739 * Returns the target CPU number, or the same CPU if no balancing is needed. 1739 * Returns the target CPU number, or the same CPU if no balancing is needed.
1740 * 1740 *
1741 * preempt must be disabled. 1741 * preempt must be disabled.
1742 */ 1742 */
1743 static int sched_balance_self(int cpu, int flag) 1743 static int sched_balance_self(int cpu, int flag)
1744 { 1744 {
1745 struct task_struct *t = current; 1745 struct task_struct *t = current;
1746 struct sched_domain *tmp, *sd = NULL; 1746 struct sched_domain *tmp, *sd = NULL;
1747 1747
1748 for_each_domain(cpu, tmp) { 1748 for_each_domain(cpu, tmp) {
1749 /* 1749 /*
1750 * If power savings logic is enabled for a domain, stop there. 1750 * If power savings logic is enabled for a domain, stop there.
1751 */ 1751 */
1752 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1752 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1753 break; 1753 break;
1754 if (tmp->flags & flag) 1754 if (tmp->flags & flag)
1755 sd = tmp; 1755 sd = tmp;
1756 } 1756 }
1757 1757
1758 while (sd) { 1758 while (sd) {
1759 cpumask_t span; 1759 cpumask_t span;
1760 struct sched_group *group; 1760 struct sched_group *group;
1761 int new_cpu, weight; 1761 int new_cpu, weight;
1762 1762
1763 if (!(sd->flags & flag)) { 1763 if (!(sd->flags & flag)) {
1764 sd = sd->child; 1764 sd = sd->child;
1765 continue; 1765 continue;
1766 } 1766 }
1767 1767
1768 span = sd->span; 1768 span = sd->span;
1769 group = find_idlest_group(sd, t, cpu); 1769 group = find_idlest_group(sd, t, cpu);
1770 if (!group) { 1770 if (!group) {
1771 sd = sd->child; 1771 sd = sd->child;
1772 continue; 1772 continue;
1773 } 1773 }
1774 1774
1775 new_cpu = find_idlest_cpu(group, t, cpu); 1775 new_cpu = find_idlest_cpu(group, t, cpu);
1776 if (new_cpu == -1 || new_cpu == cpu) { 1776 if (new_cpu == -1 || new_cpu == cpu) {
1777 /* Now try balancing at a lower domain level of cpu */ 1777 /* Now try balancing at a lower domain level of cpu */
1778 sd = sd->child; 1778 sd = sd->child;
1779 continue; 1779 continue;
1780 } 1780 }
1781 1781
1782 /* Now try balancing at a lower domain level of new_cpu */ 1782 /* Now try balancing at a lower domain level of new_cpu */
1783 cpu = new_cpu; 1783 cpu = new_cpu;
1784 sd = NULL; 1784 sd = NULL;
1785 weight = cpus_weight(span); 1785 weight = cpus_weight(span);
1786 for_each_domain(cpu, tmp) { 1786 for_each_domain(cpu, tmp) {
1787 if (weight <= cpus_weight(tmp->span)) 1787 if (weight <= cpus_weight(tmp->span))
1788 break; 1788 break;
1789 if (tmp->flags & flag) 1789 if (tmp->flags & flag)
1790 sd = tmp; 1790 sd = tmp;
1791 } 1791 }
1792 /* while loop will break here if sd == NULL */ 1792 /* while loop will break here if sd == NULL */
1793 } 1793 }
1794 1794
1795 return cpu; 1795 return cpu;
1796 } 1796 }
1797 1797
1798 #endif /* CONFIG_SMP */ 1798 #endif /* CONFIG_SMP */
1799 1799
1800 /*** 1800 /***
1801 * try_to_wake_up - wake up a thread 1801 * try_to_wake_up - wake up a thread
1802 * @p: the to-be-woken-up thread 1802 * @p: the to-be-woken-up thread
1803 * @state: the mask of task states that can be woken 1803 * @state: the mask of task states that can be woken
1804 * @sync: do a synchronous wakeup? 1804 * @sync: do a synchronous wakeup?
1805 * 1805 *
1806 * Put it on the run-queue if it's not already there. The "current" 1806 * Put it on the run-queue if it's not already there. The "current"
1807 * thread is always on the run-queue (except when the actual 1807 * thread is always on the run-queue (except when the actual
1808 * re-schedule is in progress), and as such you're allowed to do 1808 * re-schedule is in progress), and as such you're allowed to do
1809 * the simpler "current->state = TASK_RUNNING" to mark yourself 1809 * the simpler "current->state = TASK_RUNNING" to mark yourself
1810 * runnable without the overhead of this. 1810 * runnable without the overhead of this.
1811 * 1811 *
1812 * returns failure only if the task is already active. 1812 * returns failure only if the task is already active.
1813 */ 1813 */
1814 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 1814 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1815 { 1815 {
1816 int cpu, orig_cpu, this_cpu, success = 0; 1816 int cpu, orig_cpu, this_cpu, success = 0;
1817 unsigned long flags; 1817 unsigned long flags;
1818 long old_state; 1818 long old_state;
1819 struct rq *rq; 1819 struct rq *rq;
1820 1820
1821 rq = task_rq_lock(p, &flags); 1821 rq = task_rq_lock(p, &flags);
1822 old_state = p->state; 1822 old_state = p->state;
1823 if (!(old_state & state)) 1823 if (!(old_state & state))
1824 goto out; 1824 goto out;
1825 1825
1826 if (p->se.on_rq) 1826 if (p->se.on_rq)
1827 goto out_running; 1827 goto out_running;
1828 1828
1829 cpu = task_cpu(p); 1829 cpu = task_cpu(p);
1830 orig_cpu = cpu; 1830 orig_cpu = cpu;
1831 this_cpu = smp_processor_id(); 1831 this_cpu = smp_processor_id();
1832 1832
1833 #ifdef CONFIG_SMP 1833 #ifdef CONFIG_SMP
1834 if (unlikely(task_running(rq, p))) 1834 if (unlikely(task_running(rq, p)))
1835 goto out_activate; 1835 goto out_activate;
1836 1836
1837 cpu = p->sched_class->select_task_rq(p, sync); 1837 cpu = p->sched_class->select_task_rq(p, sync);
1838 if (cpu != orig_cpu) { 1838 if (cpu != orig_cpu) {
1839 set_task_cpu(p, cpu); 1839 set_task_cpu(p, cpu);
1840 task_rq_unlock(rq, &flags); 1840 task_rq_unlock(rq, &flags);
1841 /* might preempt at this point */ 1841 /* might preempt at this point */
1842 rq = task_rq_lock(p, &flags); 1842 rq = task_rq_lock(p, &flags);
1843 old_state = p->state; 1843 old_state = p->state;
1844 if (!(old_state & state)) 1844 if (!(old_state & state))
1845 goto out; 1845 goto out;
1846 if (p->se.on_rq) 1846 if (p->se.on_rq)
1847 goto out_running; 1847 goto out_running;
1848 1848
1849 this_cpu = smp_processor_id(); 1849 this_cpu = smp_processor_id();
1850 cpu = task_cpu(p); 1850 cpu = task_cpu(p);
1851 } 1851 }
1852 1852
1853 #ifdef CONFIG_SCHEDSTATS 1853 #ifdef CONFIG_SCHEDSTATS
1854 schedstat_inc(rq, ttwu_count); 1854 schedstat_inc(rq, ttwu_count);
1855 if (cpu == this_cpu) 1855 if (cpu == this_cpu)
1856 schedstat_inc(rq, ttwu_local); 1856 schedstat_inc(rq, ttwu_local);
1857 else { 1857 else {
1858 struct sched_domain *sd; 1858 struct sched_domain *sd;
1859 for_each_domain(this_cpu, sd) { 1859 for_each_domain(this_cpu, sd) {
1860 if (cpu_isset(cpu, sd->span)) { 1860 if (cpu_isset(cpu, sd->span)) {
1861 schedstat_inc(sd, ttwu_wake_remote); 1861 schedstat_inc(sd, ttwu_wake_remote);
1862 break; 1862 break;
1863 } 1863 }
1864 } 1864 }
1865 } 1865 }
1866 #endif 1866 #endif
1867 1867
1868 out_activate: 1868 out_activate:
1869 #endif /* CONFIG_SMP */ 1869 #endif /* CONFIG_SMP */
1870 schedstat_inc(p, se.nr_wakeups); 1870 schedstat_inc(p, se.nr_wakeups);
1871 if (sync) 1871 if (sync)
1872 schedstat_inc(p, se.nr_wakeups_sync); 1872 schedstat_inc(p, se.nr_wakeups_sync);
1873 if (orig_cpu != cpu) 1873 if (orig_cpu != cpu)
1874 schedstat_inc(p, se.nr_wakeups_migrate); 1874 schedstat_inc(p, se.nr_wakeups_migrate);
1875 if (cpu == this_cpu) 1875 if (cpu == this_cpu)
1876 schedstat_inc(p, se.nr_wakeups_local); 1876 schedstat_inc(p, se.nr_wakeups_local);
1877 else 1877 else
1878 schedstat_inc(p, se.nr_wakeups_remote); 1878 schedstat_inc(p, se.nr_wakeups_remote);
1879 update_rq_clock(rq); 1879 update_rq_clock(rq);
1880 activate_task(rq, p, 1); 1880 activate_task(rq, p, 1);
1881 check_preempt_curr(rq, p); 1881 check_preempt_curr(rq, p);
1882 success = 1; 1882 success = 1;
1883 1883
1884 out_running: 1884 out_running:
1885 p->state = TASK_RUNNING; 1885 p->state = TASK_RUNNING;
1886 #ifdef CONFIG_SMP 1886 #ifdef CONFIG_SMP
1887 if (p->sched_class->task_wake_up) 1887 if (p->sched_class->task_wake_up)
1888 p->sched_class->task_wake_up(rq, p); 1888 p->sched_class->task_wake_up(rq, p);
1889 #endif 1889 #endif
1890 out: 1890 out:
1891 task_rq_unlock(rq, &flags); 1891 task_rq_unlock(rq, &flags);
1892 1892
1893 return success; 1893 return success;
1894 } 1894 }
1895 1895
1896 int fastcall wake_up_process(struct task_struct *p) 1896 int fastcall wake_up_process(struct task_struct *p)
1897 { 1897 {
1898 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1898 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1899 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1899 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1900 } 1900 }
1901 EXPORT_SYMBOL(wake_up_process); 1901 EXPORT_SYMBOL(wake_up_process);
1902 1902
1903 int fastcall wake_up_state(struct task_struct *p, unsigned int state) 1903 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1904 { 1904 {
1905 return try_to_wake_up(p, state, 0); 1905 return try_to_wake_up(p, state, 0);
1906 } 1906 }
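For context, a minimal sketch of the sleep/wake pairing these wakeup helpers are typically used with; the 'condition' flag and 'sleeper' task pointer are hypothetical and not part of this diff:

    /* Sleeper side: wait until 'condition' becomes true. */
    for (;;) {
            set_current_state(TASK_INTERRUPTIBLE);
            if (condition)
                    break;
            schedule();
    }
    __set_current_state(TASK_RUNNING);

    /* Waker side: make the condition true, then wake the sleeper. */
    condition = 1;
    wake_up_process(sleeper);

The state is set before the condition is re-checked, so a wakeup that lands between the check and schedule() is not lost: try_to_wake_up() will already have put the task back into TASK_RUNNING.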
1907 1907
1908 /* 1908 /*
1909 * Perform scheduler related setup for a newly forked process p. 1909 * Perform scheduler related setup for a newly forked process p.
1910 * p is forked by current. 1910 * p is forked by current.
1911 * 1911 *
1912 * __sched_fork() is basic setup used by init_idle() too: 1912 * __sched_fork() is basic setup used by init_idle() too:
1913 */ 1913 */
1914 static void __sched_fork(struct task_struct *p) 1914 static void __sched_fork(struct task_struct *p)
1915 { 1915 {
1916 p->se.exec_start = 0; 1916 p->se.exec_start = 0;
1917 p->se.sum_exec_runtime = 0; 1917 p->se.sum_exec_runtime = 0;
1918 p->se.prev_sum_exec_runtime = 0; 1918 p->se.prev_sum_exec_runtime = 0;
1919 1919
1920 #ifdef CONFIG_SCHEDSTATS 1920 #ifdef CONFIG_SCHEDSTATS
1921 p->se.wait_start = 0; 1921 p->se.wait_start = 0;
1922 p->se.sum_sleep_runtime = 0; 1922 p->se.sum_sleep_runtime = 0;
1923 p->se.sleep_start = 0; 1923 p->se.sleep_start = 0;
1924 p->se.block_start = 0; 1924 p->se.block_start = 0;
1925 p->se.sleep_max = 0; 1925 p->se.sleep_max = 0;
1926 p->se.block_max = 0; 1926 p->se.block_max = 0;
1927 p->se.exec_max = 0; 1927 p->se.exec_max = 0;
1928 p->se.slice_max = 0; 1928 p->se.slice_max = 0;
1929 p->se.wait_max = 0; 1929 p->se.wait_max = 0;
1930 #endif 1930 #endif
1931 1931
1932 INIT_LIST_HEAD(&p->rt.run_list); 1932 INIT_LIST_HEAD(&p->rt.run_list);
1933 p->se.on_rq = 0; 1933 p->se.on_rq = 0;
1934 1934
1935 #ifdef CONFIG_PREEMPT_NOTIFIERS 1935 #ifdef CONFIG_PREEMPT_NOTIFIERS
1936 INIT_HLIST_HEAD(&p->preempt_notifiers); 1936 INIT_HLIST_HEAD(&p->preempt_notifiers);
1937 #endif 1937 #endif
1938 1938
1939 /* 1939 /*
1940 * We mark the process as running here, but have not actually 1940 * We mark the process as running here, but have not actually
1941 * inserted it onto the runqueue yet. This guarantees that 1941 * inserted it onto the runqueue yet. This guarantees that
1942 * nobody will actually run it, and a signal or other external 1942 * nobody will actually run it, and a signal or other external
1943 * event cannot wake it up and insert it on the runqueue either. 1943 * event cannot wake it up and insert it on the runqueue either.
1944 */ 1944 */
1945 p->state = TASK_RUNNING; 1945 p->state = TASK_RUNNING;
1946 } 1946 }
1947 1947
1948 /* 1948 /*
1949 * fork()/clone()-time setup: 1949 * fork()/clone()-time setup:
1950 */ 1950 */
1951 void sched_fork(struct task_struct *p, int clone_flags) 1951 void sched_fork(struct task_struct *p, int clone_flags)
1952 { 1952 {
1953 int cpu = get_cpu(); 1953 int cpu = get_cpu();
1954 1954
1955 __sched_fork(p); 1955 __sched_fork(p);
1956 1956
1957 #ifdef CONFIG_SMP 1957 #ifdef CONFIG_SMP
1958 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1958 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1959 #endif 1959 #endif
1960 set_task_cpu(p, cpu); 1960 set_task_cpu(p, cpu);
1961 1961
1962 /* 1962 /*
1963 * Make sure we do not leak PI boosting priority to the child: 1963 * Make sure we do not leak PI boosting priority to the child:
1964 */ 1964 */
1965 p->prio = current->normal_prio; 1965 p->prio = current->normal_prio;
1966 if (!rt_prio(p->prio)) 1966 if (!rt_prio(p->prio))
1967 p->sched_class = &fair_sched_class; 1967 p->sched_class = &fair_sched_class;
1968 1968
1969 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1969 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1970 if (likely(sched_info_on())) 1970 if (likely(sched_info_on()))
1971 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1971 memset(&p->sched_info, 0, sizeof(p->sched_info));
1972 #endif 1972 #endif
1973 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1973 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1974 p->oncpu = 0; 1974 p->oncpu = 0;
1975 #endif 1975 #endif
1976 #ifdef CONFIG_PREEMPT 1976 #ifdef CONFIG_PREEMPT
1977 /* Want to start with kernel preemption disabled. */ 1977 /* Want to start with kernel preemption disabled. */
1978 task_thread_info(p)->preempt_count = 1; 1978 task_thread_info(p)->preempt_count = 1;
1979 #endif 1979 #endif
1980 put_cpu(); 1980 put_cpu();
1981 } 1981 }
1982 1982
1983 /* 1983 /*
1984 * wake_up_new_task - wake up a newly created task for the first time. 1984 * wake_up_new_task - wake up a newly created task for the first time.
1985 * 1985 *
1986 * This function will do some initial scheduler statistics housekeeping 1986 * This function will do some initial scheduler statistics housekeeping
1987 * that must be done for every newly created context, then puts the task 1987 * that must be done for every newly created context, then puts the task
1988 * on the runqueue and wakes it. 1988 * on the runqueue and wakes it.
1989 */ 1989 */
1990 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1990 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1991 { 1991 {
1992 unsigned long flags; 1992 unsigned long flags;
1993 struct rq *rq; 1993 struct rq *rq;
1994 1994
1995 rq = task_rq_lock(p, &flags); 1995 rq = task_rq_lock(p, &flags);
1996 BUG_ON(p->state != TASK_RUNNING); 1996 BUG_ON(p->state != TASK_RUNNING);
1997 update_rq_clock(rq); 1997 update_rq_clock(rq);
1998 1998
1999 p->prio = effective_prio(p); 1999 p->prio = effective_prio(p);
2000 2000
2001 if (!p->sched_class->task_new || !current->se.on_rq) { 2001 if (!p->sched_class->task_new || !current->se.on_rq) {
2002 activate_task(rq, p, 0); 2002 activate_task(rq, p, 0);
2003 } else { 2003 } else {
2004 /* 2004 /*
2005 * Let the scheduling class do new task startup 2005 * Let the scheduling class do new task startup
2006 * management (if any): 2006 * management (if any):
2007 */ 2007 */
2008 p->sched_class->task_new(rq, p); 2008 p->sched_class->task_new(rq, p);
2009 inc_nr_running(p, rq); 2009 inc_nr_running(p, rq);
2010 } 2010 }
2011 check_preempt_curr(rq, p); 2011 check_preempt_curr(rq, p);
2012 #ifdef CONFIG_SMP 2012 #ifdef CONFIG_SMP
2013 if (p->sched_class->task_wake_up) 2013 if (p->sched_class->task_wake_up)
2014 p->sched_class->task_wake_up(rq, p); 2014 p->sched_class->task_wake_up(rq, p);
2015 #endif 2015 #endif
2016 task_rq_unlock(rq, &flags); 2016 task_rq_unlock(rq, &flags);
2017 } 2017 }
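A hedged sketch of how the two fork-time hooks above are ordered by the fork path; the caller names are assumptions about surrounding kernel code that is outside this hunk:

    /* In copy_process(): set up the child's scheduler state. */
    sched_fork(p, clone_flags);

    /* Later, in do_fork(), once the child is fully constructed: */
    wake_up_new_task(p, clone_flags);

sched_fork() leaves p in TASK_RUNNING but not on any runqueue, which is why wake_up_new_task() can BUG_ON(p->state != TASK_RUNNING) before enqueueing it.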
2018 2018
2019 #ifdef CONFIG_PREEMPT_NOTIFIERS 2019 #ifdef CONFIG_PREEMPT_NOTIFIERS
2020 2020
2021 /** 2021 /**
2022 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2022 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2023 * @notifier: notifier struct to register 2023 * @notifier: notifier struct to register
2024 */ 2024 */
2025 void preempt_notifier_register(struct preempt_notifier *notifier) 2025 void preempt_notifier_register(struct preempt_notifier *notifier)
2026 { 2026 {
2027 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2027 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2028 } 2028 }
2029 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2029 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2030 2030
2031 /** 2031 /**
2032 * preempt_notifier_unregister - no longer interested in preemption notifications 2032 * preempt_notifier_unregister - no longer interested in preemption notifications
2033 * @notifier: notifier struct to unregister 2033 * @notifier: notifier struct to unregister
2034 * 2034 *
2035 * This is safe to call from within a preemption notifier. 2035 * This is safe to call from within a preemption notifier.
2036 */ 2036 */
2037 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2037 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2038 { 2038 {
2039 hlist_del(&notifier->link); 2039 hlist_del(&notifier->link);
2040 } 2040 }
2041 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2041 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2042 2042
2043 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2043 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2044 { 2044 {
2045 struct preempt_notifier *notifier; 2045 struct preempt_notifier *notifier;
2046 struct hlist_node *node; 2046 struct hlist_node *node;
2047 2047
2048 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2048 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2049 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2049 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2050 } 2050 }
2051 2051
2052 static void 2052 static void
2053 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2053 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2054 struct task_struct *next) 2054 struct task_struct *next)
2055 { 2055 {
2056 struct preempt_notifier *notifier; 2056 struct preempt_notifier *notifier;
2057 struct hlist_node *node; 2057 struct hlist_node *node;
2058 2058
2059 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2059 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2060 notifier->ops->sched_out(notifier, next); 2060 notifier->ops->sched_out(notifier, next);
2061 } 2061 }
2062 2062
2063 #else 2063 #else
2064 2064
2065 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2065 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2066 { 2066 {
2067 } 2067 }
2068 2068
2069 static void 2069 static void
2070 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2070 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2071 struct task_struct *next) 2071 struct task_struct *next)
2072 { 2072 {
2073 } 2073 }
2074 2074
2075 #endif 2075 #endif
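The API above only chains the caller-supplied notifier onto current->preempt_notifiers; the caller provides the ops. A minimal usage sketch with the sched_in/sched_out signatures implied by the fire_* helpers (all my_* names are hypothetical):

    static void my_sched_in(struct preempt_notifier *pn, int cpu)
    {
            /* current was just scheduled back in on 'cpu' */
    }

    static void my_sched_out(struct preempt_notifier *pn,
                             struct task_struct *next)
    {
            /* current is about to be switched out in favour of 'next' */
    }

    static struct preempt_ops my_ops = {
            .sched_in  = my_sched_in,
            .sched_out = my_sched_out,
    };

    static struct preempt_notifier my_notifier = { .ops = &my_ops };

    /* Called from the task that wants notifications about itself: */
    preempt_notifier_register(&my_notifier);
    /* ... and later, from the same task: */
    preempt_notifier_unregister(&my_notifier);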
2076 2076
2077 /** 2077 /**
2078 * prepare_task_switch - prepare to switch tasks 2078 * prepare_task_switch - prepare to switch tasks
2079 * @rq: the runqueue preparing to switch 2079 * @rq: the runqueue preparing to switch
2080 * @prev: the current task that is being switched out 2080 * @prev: the current task that is being switched out
2081 * @next: the task we are going to switch to. 2081 * @next: the task we are going to switch to.
2082 * 2082 *
2083 * This is called with the rq lock held and interrupts off. It must 2083 * This is called with the rq lock held and interrupts off. It must
2084 * be paired with a subsequent finish_task_switch after the context 2084 * be paired with a subsequent finish_task_switch after the context
2085 * switch. 2085 * switch.
2086 * 2086 *
2087 * prepare_task_switch sets up locking and calls architecture specific 2087 * prepare_task_switch sets up locking and calls architecture specific
2088 * hooks. 2088 * hooks.
2089 */ 2089 */
2090 static inline void 2090 static inline void
2091 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2091 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2092 struct task_struct *next) 2092 struct task_struct *next)
2093 { 2093 {
2094 fire_sched_out_preempt_notifiers(prev, next); 2094 fire_sched_out_preempt_notifiers(prev, next);
2095 prepare_lock_switch(rq, next); 2095 prepare_lock_switch(rq, next);
2096 prepare_arch_switch(next); 2096 prepare_arch_switch(next);
2097 } 2097 }
2098 2098
2099 /** 2099 /**
2100 * finish_task_switch - clean up after a task-switch 2100 * finish_task_switch - clean up after a task-switch
2101 * @rq: runqueue associated with task-switch 2101 * @rq: runqueue associated with task-switch
2102 * @prev: the thread we just switched away from. 2102 * @prev: the thread we just switched away from.
2103 * 2103 *
2104 * finish_task_switch must be called after the context switch, paired 2104 * finish_task_switch must be called after the context switch, paired
2105 * with a prepare_task_switch call before the context switch. 2105 * with a prepare_task_switch call before the context switch.
2106 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2106 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2107 * and do any other architecture-specific cleanup actions. 2107 * and do any other architecture-specific cleanup actions.
2108 * 2108 *
2109 * Note that we may have delayed dropping an mm in context_switch(). If 2109 * Note that we may have delayed dropping an mm in context_switch(). If
2110 * so, we finish that here outside of the runqueue lock. (Doing it 2110 * so, we finish that here outside of the runqueue lock. (Doing it
2111 * with the lock held can cause deadlocks; see schedule() for 2111 * with the lock held can cause deadlocks; see schedule() for
2112 * details.) 2112 * details.)
2113 */ 2113 */
2114 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2114 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2115 __releases(rq->lock) 2115 __releases(rq->lock)
2116 { 2116 {
2117 struct mm_struct *mm = rq->prev_mm; 2117 struct mm_struct *mm = rq->prev_mm;
2118 long prev_state; 2118 long prev_state;
2119 2119
2120 rq->prev_mm = NULL; 2120 rq->prev_mm = NULL;
2121 2121
2122 /* 2122 /*
2123 * A task struct has one reference for its use as "current". 2123 * A task struct has one reference for its use as "current".
2124 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2124 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2125 * schedule one last time. The schedule call will never return, and 2125 * schedule one last time. The schedule call will never return, and
2126 * the scheduled task must drop that reference. 2126 * the scheduled task must drop that reference.
2127 * The test for TASK_DEAD must occur while the runqueue locks are 2127 * The test for TASK_DEAD must occur while the runqueue locks are
2128 * still held, otherwise prev could be scheduled on another cpu, die 2128 * still held, otherwise prev could be scheduled on another cpu, die
2129 * there before we look at prev->state, and then the reference would 2129 * there before we look at prev->state, and then the reference would
2130 * be dropped twice. 2130 * be dropped twice.
2131 * Manfred Spraul <manfred@colorfullife.com> 2131 * Manfred Spraul <manfred@colorfullife.com>
2132 */ 2132 */
2133 prev_state = prev->state; 2133 prev_state = prev->state;
2134 finish_arch_switch(prev); 2134 finish_arch_switch(prev);
2135 finish_lock_switch(rq, prev); 2135 finish_lock_switch(rq, prev);
2136 #ifdef CONFIG_SMP 2136 #ifdef CONFIG_SMP
2137 if (current->sched_class->post_schedule) 2137 if (current->sched_class->post_schedule)
2138 current->sched_class->post_schedule(rq); 2138 current->sched_class->post_schedule(rq);
2139 #endif 2139 #endif
2140 2140
2141 fire_sched_in_preempt_notifiers(current); 2141 fire_sched_in_preempt_notifiers(current);
2142 if (mm) 2142 if (mm)
2143 mmdrop(mm); 2143 mmdrop(mm);
2144 if (unlikely(prev_state == TASK_DEAD)) { 2144 if (unlikely(prev_state == TASK_DEAD)) {
2145 /* 2145 /*
2146 * Remove function-return probe instances associated with this 2146 * Remove function-return probe instances associated with this
2147 * task and put them back on the free list. 2147 * task and put them back on the free list.
2148 */ 2148 */
2149 kprobe_flush_task(prev); 2149 kprobe_flush_task(prev);
2150 put_task_struct(prev); 2150 put_task_struct(prev);
2151 } 2151 }
2152 } 2152 }
2153 2153
2154 /** 2154 /**
2155 * schedule_tail - first thing a freshly forked thread must call. 2155 * schedule_tail - first thing a freshly forked thread must call.
2156 * @prev: the thread we just switched away from. 2156 * @prev: the thread we just switched away from.
2157 */ 2157 */
2158 asmlinkage void schedule_tail(struct task_struct *prev) 2158 asmlinkage void schedule_tail(struct task_struct *prev)
2159 __releases(rq->lock) 2159 __releases(rq->lock)
2160 { 2160 {
2161 struct rq *rq = this_rq(); 2161 struct rq *rq = this_rq();
2162 2162
2163 finish_task_switch(rq, prev); 2163 finish_task_switch(rq, prev);
2164 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2164 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2165 /* In this case, finish_task_switch does not reenable preemption */ 2165 /* In this case, finish_task_switch does not reenable preemption */
2166 preempt_enable(); 2166 preempt_enable();
2167 #endif 2167 #endif
2168 if (current->set_child_tid) 2168 if (current->set_child_tid)
2169 put_user(task_pid_vnr(current), current->set_child_tid); 2169 put_user(task_pid_vnr(current), current->set_child_tid);
2170 } 2170 }
2171 2171
2172 /* 2172 /*
2173 * context_switch - switch to the new MM and the new 2173 * context_switch - switch to the new MM and the new
2174 * thread's register state. 2174 * thread's register state.
2175 */ 2175 */
2176 static inline void 2176 static inline void
2177 context_switch(struct rq *rq, struct task_struct *prev, 2177 context_switch(struct rq *rq, struct task_struct *prev,
2178 struct task_struct *next) 2178 struct task_struct *next)
2179 { 2179 {
2180 struct mm_struct *mm, *oldmm; 2180 struct mm_struct *mm, *oldmm;
2181 2181
2182 prepare_task_switch(rq, prev, next); 2182 prepare_task_switch(rq, prev, next);
2183 mm = next->mm; 2183 mm = next->mm;
2184 oldmm = prev->active_mm; 2184 oldmm = prev->active_mm;
2185 /* 2185 /*
2186 * For paravirt, this is coupled with an exit in switch_to to 2186 * For paravirt, this is coupled with an exit in switch_to to
2187 * combine the page table reload and the switch backend into 2187 * combine the page table reload and the switch backend into
2188 * one hypercall. 2188 * one hypercall.
2189 */ 2189 */
2190 arch_enter_lazy_cpu_mode(); 2190 arch_enter_lazy_cpu_mode();
2191 2191
2192 if (unlikely(!mm)) { 2192 if (unlikely(!mm)) {
2193 next->active_mm = oldmm; 2193 next->active_mm = oldmm;
2194 atomic_inc(&oldmm->mm_count); 2194 atomic_inc(&oldmm->mm_count);
2195 enter_lazy_tlb(oldmm, next); 2195 enter_lazy_tlb(oldmm, next);
2196 } else 2196 } else
2197 switch_mm(oldmm, mm, next); 2197 switch_mm(oldmm, mm, next);
2198 2198
2199 if (unlikely(!prev->mm)) { 2199 if (unlikely(!prev->mm)) {
2200 prev->active_mm = NULL; 2200 prev->active_mm = NULL;
2201 rq->prev_mm = oldmm; 2201 rq->prev_mm = oldmm;
2202 } 2202 }
2203 /* 2203 /*
2204 * The runqueue lock will be released by the next task 2204 * The runqueue lock will be released by the next task
2205 * (which is an invalid locking op, but in the case of the 2205 * (which is an invalid locking op, but in the case of the
2206 * scheduler it's an obvious special case), so we do an 2206 * scheduler it's an obvious special case), so we do an
2207 * early lockdep release here: 2207 * early lockdep release here:
2208 */ 2208 */
2209 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2209 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2210 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2210 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2211 #endif 2211 #endif
2212 2212
2213 /* Here we just switch the register state and the stack. */ 2213 /* Here we just switch the register state and the stack. */
2214 switch_to(prev, next, prev); 2214 switch_to(prev, next, prev);
2215 2215
2216 barrier(); 2216 barrier();
2217 /* 2217 /*
2218 * this_rq must be evaluated again because prev may have moved 2218 * this_rq must be evaluated again because prev may have moved
2219 * CPUs since it called schedule(), thus the 'rq' on its stack 2219 * CPUs since it called schedule(), thus the 'rq' on its stack
2220 * frame will be invalid. 2220 * frame will be invalid.
2221 */ 2221 */
2222 finish_task_switch(this_rq(), prev); 2222 finish_task_switch(this_rq(), prev);
2223 } 2223 }
2224 2224
2225 /* 2225 /*
2226 * nr_running, nr_uninterruptible and nr_context_switches: 2226 * nr_running, nr_uninterruptible and nr_context_switches:
2227 * 2227 *
2228 * externally visible scheduler statistics: current number of runnable 2228 * externally visible scheduler statistics: current number of runnable
2229 * threads, current number of uninterruptible-sleeping threads, total 2229 * threads, current number of uninterruptible-sleeping threads, total
2230 * number of context switches performed since bootup. 2230 * number of context switches performed since bootup.
2231 */ 2231 */
2232 unsigned long nr_running(void) 2232 unsigned long nr_running(void)
2233 { 2233 {
2234 unsigned long i, sum = 0; 2234 unsigned long i, sum = 0;
2235 2235
2236 for_each_online_cpu(i) 2236 for_each_online_cpu(i)
2237 sum += cpu_rq(i)->nr_running; 2237 sum += cpu_rq(i)->nr_running;
2238 2238
2239 return sum; 2239 return sum;
2240 } 2240 }
2241 2241
2242 unsigned long nr_uninterruptible(void) 2242 unsigned long nr_uninterruptible(void)
2243 { 2243 {
2244 unsigned long i, sum = 0; 2244 unsigned long i, sum = 0;
2245 2245
2246 for_each_possible_cpu(i) 2246 for_each_possible_cpu(i)
2247 sum += cpu_rq(i)->nr_uninterruptible; 2247 sum += cpu_rq(i)->nr_uninterruptible;
2248 2248
2249 /* 2249 /*
2250 * Since we read the counters locklessly, the sum might be slightly 2250 * Since we read the counters locklessly, the sum might be slightly
2251 * inaccurate. Do not allow it to go below zero though: 2251 * inaccurate. Do not allow it to go below zero though:
2252 */ 2252 */
2253 if (unlikely((long)sum < 0)) 2253 if (unlikely((long)sum < 0))
2254 sum = 0; 2254 sum = 0;
2255 2255
2256 return sum; 2256 return sum;
2257 } 2257 }
2258 2258
2259 unsigned long long nr_context_switches(void) 2259 unsigned long long nr_context_switches(void)
2260 { 2260 {
2261 int i; 2261 int i;
2262 unsigned long long sum = 0; 2262 unsigned long long sum = 0;
2263 2263
2264 for_each_possible_cpu(i) 2264 for_each_possible_cpu(i)
2265 sum += cpu_rq(i)->nr_switches; 2265 sum += cpu_rq(i)->nr_switches;
2266 2266
2267 return sum; 2267 return sum;
2268 } 2268 }
2269 2269
2270 unsigned long nr_iowait(void) 2270 unsigned long nr_iowait(void)
2271 { 2271 {
2272 unsigned long i, sum = 0; 2272 unsigned long i, sum = 0;
2273 2273
2274 for_each_possible_cpu(i) 2274 for_each_possible_cpu(i)
2275 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2275 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2276 2276
2277 return sum; 2277 return sum;
2278 } 2278 }
2279 2279
2280 unsigned long nr_active(void) 2280 unsigned long nr_active(void)
2281 { 2281 {
2282 unsigned long i, running = 0, uninterruptible = 0; 2282 unsigned long i, running = 0, uninterruptible = 0;
2283 2283
2284 for_each_online_cpu(i) { 2284 for_each_online_cpu(i) {
2285 running += cpu_rq(i)->nr_running; 2285 running += cpu_rq(i)->nr_running;
2286 uninterruptible += cpu_rq(i)->nr_uninterruptible; 2286 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2287 } 2287 }
2288 2288
2289 if (unlikely((long)uninterruptible < 0)) 2289 if (unlikely((long)uninterruptible < 0))
2290 uninterruptible = 0; 2290 uninterruptible = 0;
2291 2291
2292 return running + uninterruptible; 2292 return running + uninterruptible;
2293 } 2293 }
2294 2294
2295 /* 2295 /*
2296 * Update rq->cpu_load[] statistics. This function is usually called every 2296 * Update rq->cpu_load[] statistics. This function is usually called every
2297 * scheduler tick (TICK_NSEC). 2297 * scheduler tick (TICK_NSEC).
2298 */ 2298 */
2299 static void update_cpu_load(struct rq *this_rq) 2299 static void update_cpu_load(struct rq *this_rq)
2300 { 2300 {
2301 unsigned long this_load = this_rq->load.weight; 2301 unsigned long this_load = this_rq->load.weight;
2302 int i, scale; 2302 int i, scale;
2303 2303
2304 this_rq->nr_load_updates++; 2304 this_rq->nr_load_updates++;
2305 2305
2306 /* Update our load: */ 2306 /* Update our load: */
2307 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2307 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2308 unsigned long old_load, new_load; 2308 unsigned long old_load, new_load;
2309 2309
2310 /* scale is effectively 1 << i now, and >> i divides by scale */ 2310 /* scale is effectively 1 << i now, and >> i divides by scale */
2311 2311
2312 old_load = this_rq->cpu_load[i]; 2312 old_load = this_rq->cpu_load[i];
2313 new_load = this_load; 2313 new_load = this_load;
2314 /* 2314 /*
2315 * Round up the averaging division if load is increasing. This 2315 * Round up the averaging division if load is increasing. This
2316 * prevents us from getting stuck on 9 if the load is 10, for 2316 * prevents us from getting stuck on 9 if the load is 10, for
2317 * example. 2317 * example.
2318 */ 2318 */
2319 if (new_load > old_load) 2319 if (new_load > old_load)
2320 new_load += scale-1; 2320 new_load += scale-1;
2321 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 2321 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2322 } 2322 }
2323 } 2323 }
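A worked example of the decayed average above, assuming SCHED_LOAD_SCALE-style fixed point where 1024 represents one nice-0 task (a userspace sketch, not kernel code):

    #include <stdio.h>

    int main(void)
    {
            unsigned long old_load = 0;     /* previously idle CPU */
            unsigned long this_load = 1024; /* one nice-0 task now runnable */
            int i = 2;                      /* cpu_load index */
            unsigned long scale = 1UL << i;
            unsigned long new_load = this_load;

            if (new_load > old_load)
                    new_load += scale - 1;  /* round up while load is rising */

            /* prints 256: each tick closes roughly 1/2^i of the gap */
            printf("cpu_load[%d] = %lu\n", i,
                   (old_load * (scale - 1) + new_load) >> i);
            return 0;
    }

Higher indices therefore track load with a longer time constant, which is what the balancing code relies on when it selects busy_idx, newidle_idx or idle_idx below.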
2324 2324
2325 #ifdef CONFIG_SMP 2325 #ifdef CONFIG_SMP
2326 2326
2327 /* 2327 /*
2328 * double_rq_lock - safely lock two runqueues 2328 * double_rq_lock - safely lock two runqueues
2329 * 2329 *
2330 * Note this does not disable interrupts like task_rq_lock; 2330 * Note this does not disable interrupts like task_rq_lock;
2331 * you need to do so manually before calling. 2331 * you need to do so manually before calling.
2332 */ 2332 */
2333 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 2333 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2334 __acquires(rq1->lock) 2334 __acquires(rq1->lock)
2335 __acquires(rq2->lock) 2335 __acquires(rq2->lock)
2336 { 2336 {
2337 BUG_ON(!irqs_disabled()); 2337 BUG_ON(!irqs_disabled());
2338 if (rq1 == rq2) { 2338 if (rq1 == rq2) {
2339 spin_lock(&rq1->lock); 2339 spin_lock(&rq1->lock);
2340 __acquire(rq2->lock); /* Fake it out ;) */ 2340 __acquire(rq2->lock); /* Fake it out ;) */
2341 } else { 2341 } else {
2342 if (rq1 < rq2) { 2342 if (rq1 < rq2) {
2343 spin_lock(&rq1->lock); 2343 spin_lock(&rq1->lock);
2344 spin_lock(&rq2->lock); 2344 spin_lock(&rq2->lock);
2345 } else { 2345 } else {
2346 spin_lock(&rq2->lock); 2346 spin_lock(&rq2->lock);
2347 spin_lock(&rq1->lock); 2347 spin_lock(&rq1->lock);
2348 } 2348 }
2349 } 2349 }
2350 update_rq_clock(rq1); 2350 update_rq_clock(rq1);
2351 update_rq_clock(rq2); 2351 update_rq_clock(rq2);
2352 } 2352 }
2353 2353
2354 /* 2354 /*
2355 * double_rq_unlock - safely unlock two runqueues 2355 * double_rq_unlock - safely unlock two runqueues
2356 * 2356 *
2357 * Note this does not restore interrupts like task_rq_unlock; 2357 * Note this does not restore interrupts like task_rq_unlock;
2358 * you need to do so manually after calling. 2358 * you need to do so manually after calling.
2359 */ 2359 */
2360 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 2360 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2361 __releases(rq1->lock) 2361 __releases(rq1->lock)
2362 __releases(rq2->lock) 2362 __releases(rq2->lock)
2363 { 2363 {
2364 spin_unlock(&rq1->lock); 2364 spin_unlock(&rq1->lock);
2365 if (rq1 != rq2) 2365 if (rq1 != rq2)
2366 spin_unlock(&rq2->lock); 2366 spin_unlock(&rq2->lock);
2367 else 2367 else
2368 __release(rq2->lock); 2368 __release(rq2->lock);
2369 } 2369 }
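A hedged usage sketch of the pair above; per the comments, interrupts must be disabled by the caller (rq_a and rq_b are hypothetical runqueue pointers):

    unsigned long flags;

    local_irq_save(flags);
    double_rq_lock(rq_a, rq_b);
    /* ... move tasks between the two runqueues ... */
    double_rq_unlock(rq_a, rq_b);
    local_irq_restore(flags);

Because double_rq_lock() always takes the lower-addressed runqueue first, two CPUs locking the same pair cannot deadlock against each other.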
2370 2370
2371 /* 2371 /*
2372 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2372 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2373 */ 2373 */
2374 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 2374 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2375 __releases(this_rq->lock) 2375 __releases(this_rq->lock)
2376 __acquires(busiest->lock) 2376 __acquires(busiest->lock)
2377 __acquires(this_rq->lock) 2377 __acquires(this_rq->lock)
2378 { 2378 {
2379 int ret = 0; 2379 int ret = 0;
2380 2380
2381 if (unlikely(!irqs_disabled())) { 2381 if (unlikely(!irqs_disabled())) {
2382 /* printk() doesn't work well under rq->lock */ 2382 /* printk() doesn't work well under rq->lock */
2383 spin_unlock(&this_rq->lock); 2383 spin_unlock(&this_rq->lock);
2384 BUG_ON(1); 2384 BUG_ON(1);
2385 } 2385 }
2386 if (unlikely(!spin_trylock(&busiest->lock))) { 2386 if (unlikely(!spin_trylock(&busiest->lock))) {
2387 if (busiest < this_rq) { 2387 if (busiest < this_rq) {
2388 spin_unlock(&this_rq->lock); 2388 spin_unlock(&this_rq->lock);
2389 spin_lock(&busiest->lock); 2389 spin_lock(&busiest->lock);
2390 spin_lock(&this_rq->lock); 2390 spin_lock(&this_rq->lock);
2391 ret = 1; 2391 ret = 1;
2392 } else 2392 } else
2393 spin_lock(&busiest->lock); 2393 spin_lock(&busiest->lock);
2394 } 2394 }
2395 return ret; 2395 return ret;
2396 } 2396 }
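double_lock_balance() returns 1 only on the path where it had to drop this_rq->lock to respect that same ordering; a caller sketch (hypothetical variable names) that accounts for this:

    if (double_lock_balance(this_rq, busiest)) {
            /*
             * this_rq->lock was released and re-acquired: anything read
             * from this_rq before the call may be stale and should be
             * revalidated before it is used.
             */
    }
    /* both this_rq->lock and busiest->lock are held from here on */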
2397 2397
2398 /* 2398 /*
2399 * If dest_cpu is allowed for this process, migrate the task to it. 2399 * If dest_cpu is allowed for this process, migrate the task to it.
2400 * This is accomplished by queueing a migration request for 2400 * This is accomplished by queueing a migration request for
2401 * dest_cpu, waking the per-cpu migration thread and waiting 2401 * dest_cpu, waking the per-cpu migration thread and waiting
2402 * for the migration to complete. 2402 * for the migration to complete.
2403 */ 2403 */
2404 static void sched_migrate_task(struct task_struct *p, int dest_cpu) 2404 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2405 { 2405 {
2406 struct migration_req req; 2406 struct migration_req req;
2407 unsigned long flags; 2407 unsigned long flags;
2408 struct rq *rq; 2408 struct rq *rq;
2409 2409
2410 rq = task_rq_lock(p, &flags); 2410 rq = task_rq_lock(p, &flags);
2411 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2411 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2412 || unlikely(cpu_is_offline(dest_cpu))) 2412 || unlikely(cpu_is_offline(dest_cpu)))
2413 goto out; 2413 goto out;
2414 2414
2415 /* force the process onto the specified CPU */ 2415 /* force the process onto the specified CPU */
2416 if (migrate_task(p, dest_cpu, &req)) { 2416 if (migrate_task(p, dest_cpu, &req)) {
2417 /* Need to wait for migration thread (might exit: take ref). */ 2417 /* Need to wait for migration thread (might exit: take ref). */
2418 struct task_struct *mt = rq->migration_thread; 2418 struct task_struct *mt = rq->migration_thread;
2419 2419
2420 get_task_struct(mt); 2420 get_task_struct(mt);
2421 task_rq_unlock(rq, &flags); 2421 task_rq_unlock(rq, &flags);
2422 wake_up_process(mt); 2422 wake_up_process(mt);
2423 put_task_struct(mt); 2423 put_task_struct(mt);
2424 wait_for_completion(&req.done); 2424 wait_for_completion(&req.done);
2425 2425
2426 return; 2426 return;
2427 } 2427 }
2428 out: 2428 out:
2429 task_rq_unlock(rq, &flags); 2429 task_rq_unlock(rq, &flags);
2430 } 2430 }
2431 2431
2432 /* 2432 /*
2433 * sched_exec - execve() is a valuable balancing opportunity, because at 2433 * sched_exec - execve() is a valuable balancing opportunity, because at
2434 * this point the task has the smallest effective memory and cache footprint. 2434 * this point the task has the smallest effective memory and cache footprint.
2435 */ 2435 */
2436 void sched_exec(void) 2436 void sched_exec(void)
2437 { 2437 {
2438 int new_cpu, this_cpu = get_cpu(); 2438 int new_cpu, this_cpu = get_cpu();
2439 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 2439 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2440 put_cpu(); 2440 put_cpu();
2441 if (new_cpu != this_cpu) 2441 if (new_cpu != this_cpu)
2442 sched_migrate_task(current, new_cpu); 2442 sched_migrate_task(current, new_cpu);
2443 } 2443 }
2444 2444
2445 /* 2445 /*
2446 * pull_task - move a task from a remote runqueue to the local runqueue. 2446 * pull_task - move a task from a remote runqueue to the local runqueue.
2447 * Both runqueues must be locked. 2447 * Both runqueues must be locked.
2448 */ 2448 */
2449 static void pull_task(struct rq *src_rq, struct task_struct *p, 2449 static void pull_task(struct rq *src_rq, struct task_struct *p,
2450 struct rq *this_rq, int this_cpu) 2450 struct rq *this_rq, int this_cpu)
2451 { 2451 {
2452 deactivate_task(src_rq, p, 0); 2452 deactivate_task(src_rq, p, 0);
2453 set_task_cpu(p, this_cpu); 2453 set_task_cpu(p, this_cpu);
2454 activate_task(this_rq, p, 0); 2454 activate_task(this_rq, p, 0);
2455 /* 2455 /*
2456 * Note that idle threads have a prio of MAX_PRIO, so this test 2456 * Note that idle threads have a prio of MAX_PRIO, so this test
2457 * is always true for them. 2457 * is always true for them.
2458 */ 2458 */
2459 check_preempt_curr(this_rq, p); 2459 check_preempt_curr(this_rq, p);
2460 } 2460 }
2461 2461
2462 /* 2462 /*
2463 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 2463 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2464 */ 2464 */
2465 static 2465 static
2466 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2466 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2467 struct sched_domain *sd, enum cpu_idle_type idle, 2467 struct sched_domain *sd, enum cpu_idle_type idle,
2468 int *all_pinned) 2468 int *all_pinned)
2469 { 2469 {
2470 /* 2470 /*
2471 * We do not migrate tasks that are: 2471 * We do not migrate tasks that are:
2472 * 1) running (obviously), or 2472 * 1) running (obviously), or
2473 * 2) not allowed onto this CPU due to cpus_allowed, or 2473 * 2) not allowed onto this CPU due to cpus_allowed, or
2474 * 3) cache-hot on their current CPU. 2474 * 3) cache-hot on their current CPU.
2475 */ 2475 */
2476 if (!cpu_isset(this_cpu, p->cpus_allowed)) { 2476 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2477 schedstat_inc(p, se.nr_failed_migrations_affine); 2477 schedstat_inc(p, se.nr_failed_migrations_affine);
2478 return 0; 2478 return 0;
2479 } 2479 }
2480 *all_pinned = 0; 2480 *all_pinned = 0;
2481 2481
2482 if (task_running(rq, p)) { 2482 if (task_running(rq, p)) {
2483 schedstat_inc(p, se.nr_failed_migrations_running); 2483 schedstat_inc(p, se.nr_failed_migrations_running);
2484 return 0; 2484 return 0;
2485 } 2485 }
2486 2486
2487 /* 2487 /*
2488 * Aggressive migration if: 2488 * Aggressive migration if:
2489 * 1) task is cache cold, or 2489 * 1) task is cache cold, or
2490 * 2) too many balance attempts have failed. 2490 * 2) too many balance attempts have failed.
2491 */ 2491 */
2492 2492
2493 if (!task_hot(p, rq->clock, sd) || 2493 if (!task_hot(p, rq->clock, sd) ||
2494 sd->nr_balance_failed > sd->cache_nice_tries) { 2494 sd->nr_balance_failed > sd->cache_nice_tries) {
2495 #ifdef CONFIG_SCHEDSTATS 2495 #ifdef CONFIG_SCHEDSTATS
2496 if (task_hot(p, rq->clock, sd)) { 2496 if (task_hot(p, rq->clock, sd)) {
2497 schedstat_inc(sd, lb_hot_gained[idle]); 2497 schedstat_inc(sd, lb_hot_gained[idle]);
2498 schedstat_inc(p, se.nr_forced_migrations); 2498 schedstat_inc(p, se.nr_forced_migrations);
2499 } 2499 }
2500 #endif 2500 #endif
2501 return 1; 2501 return 1;
2502 } 2502 }
2503 2503
2504 if (task_hot(p, rq->clock, sd)) { 2504 if (task_hot(p, rq->clock, sd)) {
2505 schedstat_inc(p, se.nr_failed_migrations_hot); 2505 schedstat_inc(p, se.nr_failed_migrations_hot);
2506 return 0; 2506 return 0;
2507 } 2507 }
2508 return 1; 2508 return 1;
2509 } 2509 }
2510 2510
2511 static unsigned long 2511 static unsigned long
2512 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2512 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2513 unsigned long max_load_move, struct sched_domain *sd, 2513 unsigned long max_load_move, struct sched_domain *sd,
2514 enum cpu_idle_type idle, int *all_pinned, 2514 enum cpu_idle_type idle, int *all_pinned,
2515 int *this_best_prio, struct rq_iterator *iterator) 2515 int *this_best_prio, struct rq_iterator *iterator)
2516 { 2516 {
2517 int loops = 0, pulled = 0, pinned = 0, skip_for_load; 2517 int loops = 0, pulled = 0, pinned = 0, skip_for_load;
2518 struct task_struct *p; 2518 struct task_struct *p;
2519 long rem_load_move = max_load_move; 2519 long rem_load_move = max_load_move;
2520 2520
2521 if (max_load_move == 0) 2521 if (max_load_move == 0)
2522 goto out; 2522 goto out;
2523 2523
2524 pinned = 1; 2524 pinned = 1;
2525 2525
2526 /* 2526 /*
2527 * Start the load-balancing iterator: 2527 * Start the load-balancing iterator:
2528 */ 2528 */
2529 p = iterator->start(iterator->arg); 2529 p = iterator->start(iterator->arg);
2530 next: 2530 next:
2531 if (!p || loops++ > sysctl_sched_nr_migrate) 2531 if (!p || loops++ > sysctl_sched_nr_migrate)
2532 goto out; 2532 goto out;
2533 /* 2533 /*
2534 * To help distribute high-priority tasks across CPUs, we don't 2534 * To help distribute high-priority tasks across CPUs, we don't
2535 * skip a task if it will be the highest-priority task (i.e. smallest 2535 * skip a task if it will be the highest-priority task (i.e. smallest
2536 * prio value) on its new queue, regardless of its load weight. 2536 * prio value) on its new queue, regardless of its load weight.
2537 */ 2537 */
2538 skip_for_load = (p->se.load.weight >> 1) > rem_load_move + 2538 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2539 SCHED_LOAD_SCALE_FUZZ; 2539 SCHED_LOAD_SCALE_FUZZ;
2540 if ((skip_for_load && p->prio >= *this_best_prio) || 2540 if ((skip_for_load && p->prio >= *this_best_prio) ||
2541 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 2541 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2542 p = iterator->next(iterator->arg); 2542 p = iterator->next(iterator->arg);
2543 goto next; 2543 goto next;
2544 } 2544 }
2545 2545
2546 pull_task(busiest, p, this_rq, this_cpu); 2546 pull_task(busiest, p, this_rq, this_cpu);
2547 pulled++; 2547 pulled++;
2548 rem_load_move -= p->se.load.weight; 2548 rem_load_move -= p->se.load.weight;
2549 2549
2550 /* 2550 /*
2551 * We only want to steal up to the prescribed amount of weighted load. 2551 * We only want to steal up to the prescribed amount of weighted load.
2552 */ 2552 */
2553 if (rem_load_move > 0) { 2553 if (rem_load_move > 0) {
2554 if (p->prio < *this_best_prio) 2554 if (p->prio < *this_best_prio)
2555 *this_best_prio = p->prio; 2555 *this_best_prio = p->prio;
2556 p = iterator->next(iterator->arg); 2556 p = iterator->next(iterator->arg);
2557 goto next; 2557 goto next;
2558 } 2558 }
2559 out: 2559 out:
2560 /* 2560 /*
2561 * Right now, this is one of only two places pull_task() is called, 2561 * Right now, this is one of only two places pull_task() is called,
2562 * so we can safely collect pull_task() stats here rather than 2562 * so we can safely collect pull_task() stats here rather than
2563 * inside pull_task(). 2563 * inside pull_task().
2564 */ 2564 */
2565 schedstat_add(sd, lb_gained[idle], pulled); 2565 schedstat_add(sd, lb_gained[idle], pulled);
2566 2566
2567 if (all_pinned) 2567 if (all_pinned)
2568 *all_pinned = pinned; 2568 *all_pinned = pinned;
2569 2569
2570 return max_load_move - rem_load_move; 2570 return max_load_move - rem_load_move;
2571 } 2571 }
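To make the skip heuristic above concrete (ignoring SCHED_LOAD_SCALE_FUZZ, whose exact value is not shown here): with rem_load_move = 512, a nice-0 task of weight 1024 gives weight >> 1 = 512, which is not greater than 512, so it may still be pulled; a task of weight 3072 gives 1536 > 512 and is skipped, unless it would become the highest-priority task on this_rq (p->prio < *this_best_prio).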
2572 2572
2573 /* 2573 /*
2574 * move_tasks tries to move up to max_load_move weighted load from busiest to 2574 * move_tasks tries to move up to max_load_move weighted load from busiest to
2575 * this_rq, as part of a balancing operation within domain "sd". 2575 * this_rq, as part of a balancing operation within domain "sd".
2576 * Returns 1 if successful and 0 otherwise. 2576 * Returns 1 if successful and 0 otherwise.
2577 * 2577 *
2578 * Called with both runqueues locked. 2578 * Called with both runqueues locked.
2579 */ 2579 */
2580 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2580 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2581 unsigned long max_load_move, 2581 unsigned long max_load_move,
2582 struct sched_domain *sd, enum cpu_idle_type idle, 2582 struct sched_domain *sd, enum cpu_idle_type idle,
2583 int *all_pinned) 2583 int *all_pinned)
2584 { 2584 {
2585 const struct sched_class *class = sched_class_highest; 2585 const struct sched_class *class = sched_class_highest;
2586 unsigned long total_load_moved = 0; 2586 unsigned long total_load_moved = 0;
2587 int this_best_prio = this_rq->curr->prio; 2587 int this_best_prio = this_rq->curr->prio;
2588 2588
2589 do { 2589 do {
2590 total_load_moved += 2590 total_load_moved +=
2591 class->load_balance(this_rq, this_cpu, busiest, 2591 class->load_balance(this_rq, this_cpu, busiest,
2592 max_load_move - total_load_moved, 2592 max_load_move - total_load_moved,
2593 sd, idle, all_pinned, &this_best_prio); 2593 sd, idle, all_pinned, &this_best_prio);
2594 class = class->next; 2594 class = class->next;
2595 } while (class && max_load_move > total_load_moved); 2595 } while (class && max_load_move > total_load_moved);
2596 2596
2597 return total_load_moved > 0; 2597 return total_load_moved > 0;
2598 } 2598 }
2599 2599
2600 static int 2600 static int
2601 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 2601 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2602 struct sched_domain *sd, enum cpu_idle_type idle, 2602 struct sched_domain *sd, enum cpu_idle_type idle,
2603 struct rq_iterator *iterator) 2603 struct rq_iterator *iterator)
2604 { 2604 {
2605 struct task_struct *p = iterator->start(iterator->arg); 2605 struct task_struct *p = iterator->start(iterator->arg);
2606 int pinned = 0; 2606 int pinned = 0;
2607 2607
2608 while (p) { 2608 while (p) {
2609 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 2609 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2610 pull_task(busiest, p, this_rq, this_cpu); 2610 pull_task(busiest, p, this_rq, this_cpu);
2611 /* 2611 /*
2612 * Right now, this is only the second place pull_task() 2612 * Right now, this is only the second place pull_task()
2613 * is called, so we can safely collect pull_task() 2613 * is called, so we can safely collect pull_task()
2614 * stats here rather than inside pull_task(). 2614 * stats here rather than inside pull_task().
2615 */ 2615 */
2616 schedstat_inc(sd, lb_gained[idle]); 2616 schedstat_inc(sd, lb_gained[idle]);
2617 2617
2618 return 1; 2618 return 1;
2619 } 2619 }
2620 p = iterator->next(iterator->arg); 2620 p = iterator->next(iterator->arg);
2621 } 2621 }
2622 2622
2623 return 0; 2623 return 0;
2624 } 2624 }
2625 2625
2626 /* 2626 /*
2627 * move_one_task tries to move exactly one task from busiest to this_rq, as 2627 * move_one_task tries to move exactly one task from busiest to this_rq, as
2628 * part of active balancing operations within "domain". 2628 * part of active balancing operations within "domain".
2629 * Returns 1 if successful and 0 otherwise. 2629 * Returns 1 if successful and 0 otherwise.
2630 * 2630 *
2631 * Called with both runqueues locked. 2631 * Called with both runqueues locked.
2632 */ 2632 */
2633 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 2633 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2634 struct sched_domain *sd, enum cpu_idle_type idle) 2634 struct sched_domain *sd, enum cpu_idle_type idle)
2635 { 2635 {
2636 const struct sched_class *class; 2636 const struct sched_class *class;
2637 2637
2638 for (class = sched_class_highest; class; class = class->next) 2638 for (class = sched_class_highest; class; class = class->next)
2639 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 2639 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
2640 return 1; 2640 return 1;
2641 2641
2642 return 0; 2642 return 0;
2643 } 2643 }
2644 2644
2645 /* 2645 /*
2646 * find_busiest_group finds and returns the busiest CPU group within the 2646 * find_busiest_group finds and returns the busiest CPU group within the
2647 * domain. It calculates and returns the amount of weighted load which 2647 * domain. It calculates and returns the amount of weighted load which
2648 * should be moved to restore balance via the imbalance parameter. 2648 * should be moved to restore balance via the imbalance parameter.
2649 */ 2649 */
2650 static struct sched_group * 2650 static struct sched_group *
2651 find_busiest_group(struct sched_domain *sd, int this_cpu, 2651 find_busiest_group(struct sched_domain *sd, int this_cpu,
2652 unsigned long *imbalance, enum cpu_idle_type idle, 2652 unsigned long *imbalance, enum cpu_idle_type idle,
2653 int *sd_idle, cpumask_t *cpus, int *balance) 2653 int *sd_idle, cpumask_t *cpus, int *balance)
2654 { 2654 {
2655 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2655 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2656 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2656 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2657 unsigned long max_pull; 2657 unsigned long max_pull;
2658 unsigned long busiest_load_per_task, busiest_nr_running; 2658 unsigned long busiest_load_per_task, busiest_nr_running;
2659 unsigned long this_load_per_task, this_nr_running; 2659 unsigned long this_load_per_task, this_nr_running;
2660 int load_idx, group_imb = 0; 2660 int load_idx, group_imb = 0;
2661 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2661 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2662 int power_savings_balance = 1; 2662 int power_savings_balance = 1;
2663 unsigned long leader_nr_running = 0, min_load_per_task = 0; 2663 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2664 unsigned long min_nr_running = ULONG_MAX; 2664 unsigned long min_nr_running = ULONG_MAX;
2665 struct sched_group *group_min = NULL, *group_leader = NULL; 2665 struct sched_group *group_min = NULL, *group_leader = NULL;
2666 #endif 2666 #endif
2667 2667
2668 max_load = this_load = total_load = total_pwr = 0; 2668 max_load = this_load = total_load = total_pwr = 0;
2669 busiest_load_per_task = busiest_nr_running = 0; 2669 busiest_load_per_task = busiest_nr_running = 0;
2670 this_load_per_task = this_nr_running = 0; 2670 this_load_per_task = this_nr_running = 0;
2671 if (idle == CPU_NOT_IDLE) 2671 if (idle == CPU_NOT_IDLE)
2672 load_idx = sd->busy_idx; 2672 load_idx = sd->busy_idx;
2673 else if (idle == CPU_NEWLY_IDLE) 2673 else if (idle == CPU_NEWLY_IDLE)
2674 load_idx = sd->newidle_idx; 2674 load_idx = sd->newidle_idx;
2675 else 2675 else
2676 load_idx = sd->idle_idx; 2676 load_idx = sd->idle_idx;
2677 2677
2678 do { 2678 do {
2679 unsigned long load, group_capacity, max_cpu_load, min_cpu_load; 2679 unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
2680 int local_group; 2680 int local_group;
2681 int i; 2681 int i;
2682 int __group_imb = 0; 2682 int __group_imb = 0;
2683 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2683 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2684 unsigned long sum_nr_running, sum_weighted_load; 2684 unsigned long sum_nr_running, sum_weighted_load;
2685 2685
2686 local_group = cpu_isset(this_cpu, group->cpumask); 2686 local_group = cpu_isset(this_cpu, group->cpumask);
2687 2687
2688 if (local_group) 2688 if (local_group)
2689 balance_cpu = first_cpu(group->cpumask); 2689 balance_cpu = first_cpu(group->cpumask);
2690 2690
2691 /* Tally up the load of all CPUs in the group */ 2691 /* Tally up the load of all CPUs in the group */
2692 sum_weighted_load = sum_nr_running = avg_load = 0; 2692 sum_weighted_load = sum_nr_running = avg_load = 0;
2693 max_cpu_load = 0; 2693 max_cpu_load = 0;
2694 min_cpu_load = ~0UL; 2694 min_cpu_load = ~0UL;
2695 2695
2696 for_each_cpu_mask(i, group->cpumask) { 2696 for_each_cpu_mask(i, group->cpumask) {
2697 struct rq *rq; 2697 struct rq *rq;
2698 2698
2699 if (!cpu_isset(i, *cpus)) 2699 if (!cpu_isset(i, *cpus))
2700 continue; 2700 continue;
2701 2701
2702 rq = cpu_rq(i); 2702 rq = cpu_rq(i);
2703 2703
2704 if (*sd_idle && rq->nr_running) 2704 if (*sd_idle && rq->nr_running)
2705 *sd_idle = 0; 2705 *sd_idle = 0;
2706 2706
2707 /* Bias balancing toward cpus of our domain */ 2707 /* Bias balancing toward cpus of our domain */
2708 if (local_group) { 2708 if (local_group) {
2709 if (idle_cpu(i) && !first_idle_cpu) { 2709 if (idle_cpu(i) && !first_idle_cpu) {
2710 first_idle_cpu = 1; 2710 first_idle_cpu = 1;
2711 balance_cpu = i; 2711 balance_cpu = i;
2712 } 2712 }
2713 2713
2714 load = target_load(i, load_idx); 2714 load = target_load(i, load_idx);
2715 } else { 2715 } else {
2716 load = source_load(i, load_idx); 2716 load = source_load(i, load_idx);
2717 if (load > max_cpu_load) 2717 if (load > max_cpu_load)
2718 max_cpu_load = load; 2718 max_cpu_load = load;
2719 if (min_cpu_load > load) 2719 if (min_cpu_load > load)
2720 min_cpu_load = load; 2720 min_cpu_load = load;
2721 } 2721 }
2722 2722
2723 avg_load += load; 2723 avg_load += load;
2724 sum_nr_running += rq->nr_running; 2724 sum_nr_running += rq->nr_running;
2725 sum_weighted_load += weighted_cpuload(i); 2725 sum_weighted_load += weighted_cpuload(i);
2726 } 2726 }
2727 2727
2728 /* 2728 /*
2729 * The first idle cpu, or the first cpu (busiest) in this sched 2729 * The first idle cpu, or the first cpu (busiest) in this sched
2730 * group, is eligible for doing load balancing at this and higher 2730 * group, is eligible for doing load balancing at this and higher
2731 * domains. In the newly idle case, we allow all the cpus 2731 * domains. In the newly idle case, we allow all the cpus
2732 * to do the newly idle load balance. 2732 * to do the newly idle load balance.
2733 */ 2733 */
2734 if (idle != CPU_NEWLY_IDLE && local_group && 2734 if (idle != CPU_NEWLY_IDLE && local_group &&
2735 balance_cpu != this_cpu && balance) { 2735 balance_cpu != this_cpu && balance) {
2736 *balance = 0; 2736 *balance = 0;
2737 goto ret; 2737 goto ret;
2738 } 2738 }
2739 2739
2740 total_load += avg_load; 2740 total_load += avg_load;
2741 total_pwr += group->__cpu_power; 2741 total_pwr += group->__cpu_power;
2742 2742
2743 /* Adjust by relative CPU power of the group */ 2743 /* Adjust by relative CPU power of the group */
2744 avg_load = sg_div_cpu_power(group, 2744 avg_load = sg_div_cpu_power(group,
2745 avg_load * SCHED_LOAD_SCALE); 2745 avg_load * SCHED_LOAD_SCALE);
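/*
 * For illustration (assuming SCHED_LOAD_SCALE == 1024): a two-cpu group at
 * nominal power has __cpu_power == 2048, so a raw load sum of 3072 scales
 * to 3072 * 1024 / 2048 == 1536, i.e. load per unit of cpu power.
 */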
2746 2746
2747 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) 2747 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
2748 __group_imb = 1; 2748 __group_imb = 1;
2749 2749
2750 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 2750 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
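/*
 * e.g. a group of two cpus at nominal power (__cpu_power == 2 * SCHED_LOAD_SCALE)
 * has a capacity of 2, i.e. it is considered full with two runnable tasks.
 */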
2751 2751
2752 if (local_group) { 2752 if (local_group) {
2753 this_load = avg_load; 2753 this_load = avg_load;
2754 this = group; 2754 this = group;
2755 this_nr_running = sum_nr_running; 2755 this_nr_running = sum_nr_running;
2756 this_load_per_task = sum_weighted_load; 2756 this_load_per_task = sum_weighted_load;
2757 } else if (avg_load > max_load && 2757 } else if (avg_load > max_load &&
2758 (sum_nr_running > group_capacity || __group_imb)) { 2758 (sum_nr_running > group_capacity || __group_imb)) {
2759 max_load = avg_load; 2759 max_load = avg_load;
2760 busiest = group; 2760 busiest = group;
2761 busiest_nr_running = sum_nr_running; 2761 busiest_nr_running = sum_nr_running;
2762 busiest_load_per_task = sum_weighted_load; 2762 busiest_load_per_task = sum_weighted_load;
2763 group_imb = __group_imb; 2763 group_imb = __group_imb;
2764 } 2764 }
2765 2765
2766 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2766 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2767 /* 2767 /*
2768 * Busy processors will not participate in power savings 2768 * Busy processors will not participate in power savings
2769 * balance. 2769 * balance.
2770 */ 2770 */
2771 if (idle == CPU_NOT_IDLE || 2771 if (idle == CPU_NOT_IDLE ||
2772 !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2772 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2773 goto group_next; 2773 goto group_next;
2774 2774
2775 /* 2775 /*
2776 * If the local group is idle or completely loaded, there is 2776 * If the local group is idle or completely loaded, there is
2777 * no need to do power savings balance at this domain 2777 * no need to do power savings balance at this domain
2778 */ 2778 */
2779 if (local_group && (this_nr_running >= group_capacity || 2779 if (local_group && (this_nr_running >= group_capacity ||
2780 !this_nr_running)) 2780 !this_nr_running))
2781 power_savings_balance = 0; 2781 power_savings_balance = 0;
2782 2782
2783 /* 2783 /*
2784 * If a group is already running at full capacity or idle, 2784 * If a group is already running at full capacity or idle,
2785 * don't include that group in power savings calculations 2785 * don't include that group in power savings calculations
2786 */ 2786 */
2787 if (!power_savings_balance || sum_nr_running >= group_capacity 2787 if (!power_savings_balance || sum_nr_running >= group_capacity
2788 || !sum_nr_running) 2788 || !sum_nr_running)
2789 goto group_next; 2789 goto group_next;
2790 2790
2791 /* 2791 /*
2792 * Calculate the group which has the least non-idle load. 2792 * Calculate the group which has the least non-idle load.
2793 * This is the group from which we need to pick up the load 2793 * This is the group from which we need to pick up the load
2794 * to save power. 2794 * to save power.
2795 */ 2795 */
2796 if ((sum_nr_running < min_nr_running) || 2796 if ((sum_nr_running < min_nr_running) ||
2797 (sum_nr_running == min_nr_running && 2797 (sum_nr_running == min_nr_running &&
2798 first_cpu(group->cpumask) < 2798 first_cpu(group->cpumask) <
2799 first_cpu(group_min->cpumask))) { 2799 first_cpu(group_min->cpumask))) {
2800 group_min = group; 2800 group_min = group;
2801 min_nr_running = sum_nr_running; 2801 min_nr_running = sum_nr_running;
2802 min_load_per_task = sum_weighted_load / 2802 min_load_per_task = sum_weighted_load /
2803 sum_nr_running; 2803 sum_nr_running;
2804 } 2804 }
2805 2805
2806 /* 2806 /*
2807 * Calculate the group which is running near its 2807 * Calculate the group which is running near its
2808 * capacity but still has some room to pick up load 2808 * capacity but still has some room to pick up load
2809 * from another group and save more power. 2809 * from another group and save more power.
2810 */ 2810 */
2811 if (sum_nr_running <= group_capacity - 1) { 2811 if (sum_nr_running <= group_capacity - 1) {
2812 if (sum_nr_running > leader_nr_running || 2812 if (sum_nr_running > leader_nr_running ||
2813 (sum_nr_running == leader_nr_running && 2813 (sum_nr_running == leader_nr_running &&
2814 first_cpu(group->cpumask) > 2814 first_cpu(group->cpumask) >
2815 first_cpu(group_leader->cpumask))) { 2815 first_cpu(group_leader->cpumask))) {
2816 group_leader = group; 2816 group_leader = group;
2817 leader_nr_running = sum_nr_running; 2817 leader_nr_running = sum_nr_running;
2818 } 2818 }
2819 } 2819 }
2820 group_next: 2820 group_next:
2821 #endif 2821 #endif
2822 group = group->next; 2822 group = group->next;
2823 } while (group != sd->groups); 2823 } while (group != sd->groups);
2824 2824
2825 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 2825 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2826 goto out_balanced; 2826 goto out_balanced;
2827 2827
2828 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2828 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2829 2829
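/*
 * Consider the domain balanced if this group is already at or above the
 * domain average, or if the busiest group is not sufficiently more loaded:
 * e.g. with a typical imbalance_pct of 125, max_load has to exceed
 * this_load by more than 25% before we bother.
 */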
2830 if (this_load >= avg_load || 2830 if (this_load >= avg_load ||
2831 100*max_load <= sd->imbalance_pct*this_load) 2831 100*max_load <= sd->imbalance_pct*this_load)
2832 goto out_balanced; 2832 goto out_balanced;
2833 2833
2834 busiest_load_per_task /= busiest_nr_running; 2834 busiest_load_per_task /= busiest_nr_running;
2835 if (group_imb) 2835 if (group_imb)
2836 busiest_load_per_task = min(busiest_load_per_task, avg_load); 2836 busiest_load_per_task = min(busiest_load_per_task, avg_load);
2837 2837
2838 /* 2838 /*
2839 * We're trying to get all the cpus to the average_load, so we don't 2839 * We're trying to get all the cpus to the average_load, so we don't
2840 * want to push ourselves above the average load, nor do we wish to 2840 * want to push ourselves above the average load, nor do we wish to
2841 * reduce the max loaded cpu below the average load, as either of these 2841 * reduce the max loaded cpu below the average load, as either of these
2842 * actions would just result in more rebalancing later, and ping-pong 2842 * actions would just result in more rebalancing later, and ping-pong
2843 * tasks around. Thus we look for the minimum possible imbalance. 2843 * tasks around. Thus we look for the minimum possible imbalance.
2844 * Negative imbalances (*we* are more loaded than anyone else) will 2844 * Negative imbalances (*we* are more loaded than anyone else) will
2845 * be counted as no imbalance for these purposes -- we can't fix that 2845 * be counted as no imbalance for these purposes -- we can't fix that
2846 * by pulling tasks to us. Be careful of negative numbers as they'll 2846 * by pulling tasks to us. Be careful of negative numbers as they'll
2847 * appear as very large values with unsigned longs. 2847 * appear as very large values with unsigned longs.
2848 */ 2848 */
2849 if (max_load <= busiest_load_per_task) 2849 if (max_load <= busiest_load_per_task)
2850 goto out_balanced; 2850 goto out_balanced;
2851 2851
2852 /* 2852 /*
2853 * In the presence of smp nice balancing, certain scenarios can have 2853 * In the presence of smp nice balancing, certain scenarios can have
2854 * max load less than avg load (as we skip the groups at or below 2854 * max load less than avg load (as we skip the groups at or below
2855 * their cpu_power while calculating max_load...) 2855 * their cpu_power while calculating max_load...)
2856 */ 2856 */
2857 if (max_load < avg_load) { 2857 if (max_load < avg_load) {
2858 *imbalance = 0; 2858 *imbalance = 0;
2859 goto small_imbalance; 2859 goto small_imbalance;
2860 } 2860 }
2861 2861
2862 /* Don't want to pull so many tasks that a group would go idle */ 2862 /* Don't want to pull so many tasks that a group would go idle */
2863 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 2863 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2864 2864
2865 /* How much load to actually move to equalise the imbalance */ 2865 /* How much load to actually move to equalise the imbalance */
2866 *imbalance = min(max_pull * busiest->__cpu_power, 2866 *imbalance = min(max_pull * busiest->__cpu_power,
2867 (avg_load - this_load) * this->__cpu_power) 2867 (avg_load - this_load) * this->__cpu_power)
2868 / SCHED_LOAD_SCALE; 2868 / SCHED_LOAD_SCALE;
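/*
 * e.g. with SCHED_LOAD_SCALE == 1024 and both groups at nominal power
 * (__cpu_power == 1024), max_pull == 512 and (avg_load - this_load) == 300
 * give *imbalance == min(512 * 1024, 300 * 1024) / 1024 == 300, i.e. we
 * only pull enough to bring this group up to the domain average.
 */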
2869 2869
2870 /* 2870 /*
2871 * if *imbalance is less than the average load per runnable task 2871 * if *imbalance is less than the average load per runnable task
2872 * there is no guarantee that any tasks will be moved, so we'll have 2872 * there is no guarantee that any tasks will be moved, so we'll have
2873 * a think about bumping its value to force at least one task to be 2873 * a think about bumping its value to force at least one task to be
2874 * moved 2874 * moved
2875 */ 2875 */
2876 if (*imbalance < busiest_load_per_task) { 2876 if (*imbalance < busiest_load_per_task) {
2877 unsigned long tmp, pwr_now, pwr_move; 2877 unsigned long tmp, pwr_now, pwr_move;
2878 unsigned int imbn; 2878 unsigned int imbn;
2879 2879
2880 small_imbalance: 2880 small_imbalance:
2881 pwr_move = pwr_now = 0; 2881 pwr_move = pwr_now = 0;
2882 imbn = 2; 2882 imbn = 2;
2883 if (this_nr_running) { 2883 if (this_nr_running) {
2884 this_load_per_task /= this_nr_running; 2884 this_load_per_task /= this_nr_running;
2885 if (busiest_load_per_task > this_load_per_task) 2885 if (busiest_load_per_task > this_load_per_task)
2886 imbn = 1; 2886 imbn = 1;
2887 } else 2887 } else
2888 this_load_per_task = SCHED_LOAD_SCALE; 2888 this_load_per_task = SCHED_LOAD_SCALE;
2889 2889
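/*
 * Roughly: if the gap between the busiest group and this one (plus
 * SCHED_LOAD_SCALE_FUZZ) is at least imbn tasks of average weight, moving
 * one such task is clearly worthwhile, so request exactly one task's load.
 */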
2890 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 2890 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2891 busiest_load_per_task * imbn) { 2891 busiest_load_per_task * imbn) {
2892 *imbalance = busiest_load_per_task; 2892 *imbalance = busiest_load_per_task;
2893 return busiest; 2893 return busiest;
2894 } 2894 }
2895 2895
2896 /* 2896 /*
2897 * OK, we don't have enough imbalance to justify moving tasks, 2897 * OK, we don't have enough imbalance to justify moving tasks,
2898 * however we may be able to increase total CPU power used by 2898 * however we may be able to increase total CPU power used by
2899 * moving them. 2899 * moving them.
2900 */ 2900 */
2901 2901
2902 pwr_now += busiest->__cpu_power * 2902 pwr_now += busiest->__cpu_power *
2903 min(busiest_load_per_task, max_load); 2903 min(busiest_load_per_task, max_load);
2904 pwr_now += this->__cpu_power * 2904 pwr_now += this->__cpu_power *
2905 min(this_load_per_task, this_load); 2905 min(this_load_per_task, this_load);
2906 pwr_now /= SCHED_LOAD_SCALE; 2906 pwr_now /= SCHED_LOAD_SCALE;
2907 2907
2908 /* Amount of load we'd subtract */ 2908 /* Amount of load we'd subtract */
2909 tmp = sg_div_cpu_power(busiest, 2909 tmp = sg_div_cpu_power(busiest,
2910 busiest_load_per_task * SCHED_LOAD_SCALE); 2910 busiest_load_per_task * SCHED_LOAD_SCALE);
2911 if (max_load > tmp) 2911 if (max_load > tmp)
2912 pwr_move += busiest->__cpu_power * 2912 pwr_move += busiest->__cpu_power *
2913 min(busiest_load_per_task, max_load - tmp); 2913 min(busiest_load_per_task, max_load - tmp);
2914 2914
2915 /* Amount of load we'd add */ 2915 /* Amount of load we'd add */
2916 if (max_load * busiest->__cpu_power < 2916 if (max_load * busiest->__cpu_power <
2917 busiest_load_per_task * SCHED_LOAD_SCALE) 2917 busiest_load_per_task * SCHED_LOAD_SCALE)
2918 tmp = sg_div_cpu_power(this, 2918 tmp = sg_div_cpu_power(this,
2919 max_load * busiest->__cpu_power); 2919 max_load * busiest->__cpu_power);
2920 else 2920 else
2921 tmp = sg_div_cpu_power(this, 2921 tmp = sg_div_cpu_power(this,
2922 busiest_load_per_task * SCHED_LOAD_SCALE); 2922 busiest_load_per_task * SCHED_LOAD_SCALE);
2923 pwr_move += this->__cpu_power * 2923 pwr_move += this->__cpu_power *
2924 min(this_load_per_task, this_load + tmp); 2924 min(this_load_per_task, this_load + tmp);
2925 pwr_move /= SCHED_LOAD_SCALE; 2925 pwr_move /= SCHED_LOAD_SCALE;
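/*
 * Roughly: pwr_now approximates the capacity usefully busy with the current
 * placement, pwr_move approximates it after moving one task of weight
 * busiest_load_per_task from the busiest group to this one.
 */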
2926 2926
2927 /* Move if we gain throughput */ 2927 /* Move if we gain throughput */
2928 if (pwr_move > pwr_now) 2928 if (pwr_move > pwr_now)
2929 *imbalance = busiest_load_per_task; 2929 *imbalance = busiest_load_per_task;
2930 } 2930 }
2931 2931
2932 return busiest; 2932 return busiest;
2933 2933
2934 out_balanced: 2934 out_balanced:
2935 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2935 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2936 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2936 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2937 goto ret; 2937 goto ret;
2938 2938
2939 if (this == group_leader && group_leader != group_min) { 2939 if (this == group_leader && group_leader != group_min) {
2940 *imbalance = min_load_per_task; 2940 *imbalance = min_load_per_task;
2941 return group_min; 2941 return group_min;
2942 } 2942 }
2943 #endif 2943 #endif
2944 ret: 2944 ret:
2945 *imbalance = 0; 2945 *imbalance = 0;
2946 return NULL; 2946 return NULL;
2947 } 2947 }
2948 2948
2949 /* 2949 /*
2950 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2950 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2951 */ 2951 */
2952 static struct rq * 2952 static struct rq *
2953 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2953 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2954 unsigned long imbalance, cpumask_t *cpus) 2954 unsigned long imbalance, cpumask_t *cpus)
2955 { 2955 {
2956 struct rq *busiest = NULL, *rq; 2956 struct rq *busiest = NULL, *rq;
2957 unsigned long max_load = 0; 2957 unsigned long max_load = 0;
2958 int i; 2958 int i;
2959 2959
2960 for_each_cpu_mask(i, group->cpumask) { 2960 for_each_cpu_mask(i, group->cpumask) {
2961 unsigned long wl; 2961 unsigned long wl;
2962 2962
2963 if (!cpu_isset(i, *cpus)) 2963 if (!cpu_isset(i, *cpus))
2964 continue; 2964 continue;
2965 2965
2966 rq = cpu_rq(i); 2966 rq = cpu_rq(i);
2967 wl = weighted_cpuload(i); 2967 wl = weighted_cpuload(i);
2968 2968
2969 if (rq->nr_running == 1 && wl > imbalance) 2969 if (rq->nr_running == 1 && wl > imbalance)
2970 continue; 2970 continue;
2971 2971
2972 if (wl > max_load) { 2972 if (wl > max_load) {
2973 max_load = wl; 2973 max_load = wl;
2974 busiest = rq; 2974 busiest = rq;
2975 } 2975 }
2976 } 2976 }
2977 2977
2978 return busiest; 2978 return busiest;
2979 } 2979 }
2980 2980
2981 /* 2981 /*
2982 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but 2982 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2983 * any value works so long as it is large enough. 2983 * any value works so long as it is large enough.
2984 */ 2984 */
2985 #define MAX_PINNED_INTERVAL 512 2985 #define MAX_PINNED_INTERVAL 512
2986 2986
2987 /* 2987 /*
2988 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2988 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2989 * tasks if there is an imbalance. 2989 * tasks if there is an imbalance.
2990 */ 2990 */
2991 static int load_balance(int this_cpu, struct rq *this_rq, 2991 static int load_balance(int this_cpu, struct rq *this_rq,
2992 struct sched_domain *sd, enum cpu_idle_type idle, 2992 struct sched_domain *sd, enum cpu_idle_type idle,
2993 int *balance) 2993 int *balance)
2994 { 2994 {
2995 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2995 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2996 struct sched_group *group; 2996 struct sched_group *group;
2997 unsigned long imbalance; 2997 unsigned long imbalance;
2998 struct rq *busiest; 2998 struct rq *busiest;
2999 cpumask_t cpus = CPU_MASK_ALL; 2999 cpumask_t cpus = CPU_MASK_ALL;
3000 unsigned long flags; 3000 unsigned long flags;
3001 3001
3002 /* 3002 /*
3003 * When power savings policy is enabled for the parent domain, an idle 3003 * When power savings policy is enabled for the parent domain, an idle
3004 * sibling can pick up load irrespective of busy siblings. In this case, 3004 * sibling can pick up load irrespective of busy siblings. In this case,
3005 * let the state of the idle sibling percolate up as CPU_IDLE, instead of 3005 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
3006 * portraying it as CPU_NOT_IDLE. 3006 * portraying it as CPU_NOT_IDLE.
3007 */ 3007 */
3008 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 3008 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3009 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3009 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3010 sd_idle = 1; 3010 sd_idle = 1;
3011 3011
3012 schedstat_inc(sd, lb_count[idle]); 3012 schedstat_inc(sd, lb_count[idle]);
3013 3013
3014 redo: 3014 redo:
3015 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3015 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3016 &cpus, balance); 3016 &cpus, balance);
3017 3017
3018 if (*balance == 0) 3018 if (*balance == 0)
3019 goto out_balanced; 3019 goto out_balanced;
3020 3020
3021 if (!group) { 3021 if (!group) {
3022 schedstat_inc(sd, lb_nobusyg[idle]); 3022 schedstat_inc(sd, lb_nobusyg[idle]);
3023 goto out_balanced; 3023 goto out_balanced;
3024 } 3024 }
3025 3025
3026 busiest = find_busiest_queue(group, idle, imbalance, &cpus); 3026 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
3027 if (!busiest) { 3027 if (!busiest) {
3028 schedstat_inc(sd, lb_nobusyq[idle]); 3028 schedstat_inc(sd, lb_nobusyq[idle]);
3029 goto out_balanced; 3029 goto out_balanced;
3030 } 3030 }
3031 3031
3032 BUG_ON(busiest == this_rq); 3032 BUG_ON(busiest == this_rq);
3033 3033
3034 schedstat_add(sd, lb_imbalance[idle], imbalance); 3034 schedstat_add(sd, lb_imbalance[idle], imbalance);
3035 3035
3036 ld_moved = 0; 3036 ld_moved = 0;
3037 if (busiest->nr_running > 1) { 3037 if (busiest->nr_running > 1) {
3038 /* 3038 /*
3039 * Attempt to move tasks. If find_busiest_group has found 3039 * Attempt to move tasks. If find_busiest_group has found
3040 * an imbalance but busiest->nr_running <= 1, the group is 3040 * an imbalance but busiest->nr_running <= 1, the group is
3041 * still unbalanced. ld_moved simply stays zero, so it is 3041 * still unbalanced. ld_moved simply stays zero, so it is
3042 * correctly treated as an imbalance. 3042 * correctly treated as an imbalance.
3043 */ 3043 */
3044 local_irq_save(flags); 3044 local_irq_save(flags);
3045 double_rq_lock(this_rq, busiest); 3045 double_rq_lock(this_rq, busiest);
3046 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3046 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3047 imbalance, sd, idle, &all_pinned); 3047 imbalance, sd, idle, &all_pinned);
3048 double_rq_unlock(this_rq, busiest); 3048 double_rq_unlock(this_rq, busiest);
3049 local_irq_restore(flags); 3049 local_irq_restore(flags);
3050 3050
3051 /* 3051 /*
3052 * some other cpu did the load balance for us. 3052 * some other cpu did the load balance for us.
3053 */ 3053 */
3054 if (ld_moved && this_cpu != smp_processor_id()) 3054 if (ld_moved && this_cpu != smp_processor_id())
3055 resched_cpu(this_cpu); 3055 resched_cpu(this_cpu);
3056 3056
3057 /* All tasks on this runqueue were pinned by CPU affinity */ 3057 /* All tasks on this runqueue were pinned by CPU affinity */
3058 if (unlikely(all_pinned)) { 3058 if (unlikely(all_pinned)) {
3059 cpu_clear(cpu_of(busiest), cpus); 3059 cpu_clear(cpu_of(busiest), cpus);
3060 if (!cpus_empty(cpus)) 3060 if (!cpus_empty(cpus))
3061 goto redo; 3061 goto redo;
3062 goto out_balanced; 3062 goto out_balanced;
3063 } 3063 }
3064 } 3064 }
3065 3065
3066 if (!ld_moved) { 3066 if (!ld_moved) {
3067 schedstat_inc(sd, lb_failed[idle]); 3067 schedstat_inc(sd, lb_failed[idle]);
3068 sd->nr_balance_failed++; 3068 sd->nr_balance_failed++;
3069 3069
3070 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 3070 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
3071 3071
3072 spin_lock_irqsave(&busiest->lock, flags); 3072 spin_lock_irqsave(&busiest->lock, flags);
3073 3073
3074 /* don't kick the migration_thread if the curr 3074 /* don't kick the migration_thread if the curr
3075 * task on the busiest cpu can't be moved to this_cpu 3075 * task on the busiest cpu can't be moved to this_cpu
3076 */ 3076 */
3077 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 3077 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
3078 spin_unlock_irqrestore(&busiest->lock, flags); 3078 spin_unlock_irqrestore(&busiest->lock, flags);
3079 all_pinned = 1; 3079 all_pinned = 1;
3080 goto out_one_pinned; 3080 goto out_one_pinned;
3081 } 3081 }
3082 3082
3083 if (!busiest->active_balance) { 3083 if (!busiest->active_balance) {
3084 busiest->active_balance = 1; 3084 busiest->active_balance = 1;
3085 busiest->push_cpu = this_cpu; 3085 busiest->push_cpu = this_cpu;
3086 active_balance = 1; 3086 active_balance = 1;
3087 } 3087 }
3088 spin_unlock_irqrestore(&busiest->lock, flags); 3088 spin_unlock_irqrestore(&busiest->lock, flags);
3089 if (active_balance) 3089 if (active_balance)
3090 wake_up_process(busiest->migration_thread); 3090 wake_up_process(busiest->migration_thread);
3091 3091
3092 /* 3092 /*
3093 * We've kicked active balancing, reset the failure 3093 * We've kicked active balancing, reset the failure
3094 * counter. 3094 * counter.
3095 */ 3095 */
3096 sd->nr_balance_failed = sd->cache_nice_tries+1; 3096 sd->nr_balance_failed = sd->cache_nice_tries+1;
3097 } 3097 }
3098 } else 3098 } else
3099 sd->nr_balance_failed = 0; 3099 sd->nr_balance_failed = 0;
3100 3100
3101 if (likely(!active_balance)) { 3101 if (likely(!active_balance)) {
3102 /* We were unbalanced, so reset the balancing interval */ 3102 /* We were unbalanced, so reset the balancing interval */
3103 sd->balance_interval = sd->min_interval; 3103 sd->balance_interval = sd->min_interval;
3104 } else { 3104 } else {
3105 /* 3105 /*
3106 * If we've begun active balancing, start to back off. This 3106 * If we've begun active balancing, start to back off. This
3107 * case may not be covered by the all_pinned logic if there 3107 * case may not be covered by the all_pinned logic if there
3108 * is only 1 task on the busy runqueue (because we don't call 3108 * is only 1 task on the busy runqueue (because we don't call
3109 * move_tasks). 3109 * move_tasks).
3110 */ 3110 */
3111 if (sd->balance_interval < sd->max_interval) 3111 if (sd->balance_interval < sd->max_interval)
3112 sd->balance_interval *= 2; 3112 sd->balance_interval *= 2;
3113 } 3113 }
3114 3114
3115 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3115 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3116 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3116 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3117 return -1; 3117 return -1;
3118 return ld_moved; 3118 return ld_moved;
3119 3119
3120 out_balanced: 3120 out_balanced:
3121 schedstat_inc(sd, lb_balanced[idle]); 3121 schedstat_inc(sd, lb_balanced[idle]);
3122 3122
3123 sd->nr_balance_failed = 0; 3123 sd->nr_balance_failed = 0;
3124 3124
3125 out_one_pinned: 3125 out_one_pinned:
3126 /* tune up the balancing interval */ 3126 /* tune up the balancing interval */
3127 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 3127 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3128 (sd->balance_interval < sd->max_interval)) 3128 (sd->balance_interval < sd->max_interval))
3129 sd->balance_interval *= 2; 3129 sd->balance_interval *= 2;
3130 3130
3131 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3131 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3132 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3132 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3133 return -1; 3133 return -1;
3134 return 0; 3134 return 0;
3135 } 3135 }
3136 3136
3137 /* 3137 /*
3138 * Check this_cpu to ensure it is balanced within domain. Attempt to move 3138 * Check this_cpu to ensure it is balanced within domain. Attempt to move
3139 * tasks if there is an imbalance. 3139 * tasks if there is an imbalance.
3140 * 3140 *
3141 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). 3141 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
3142 * this_rq is locked. 3142 * this_rq is locked.
3143 */ 3143 */
3144 static int 3144 static int
3145 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 3145 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3146 { 3146 {
3147 struct sched_group *group; 3147 struct sched_group *group;
3148 struct rq *busiest = NULL; 3148 struct rq *busiest = NULL;
3149 unsigned long imbalance; 3149 unsigned long imbalance;
3150 int ld_moved = 0; 3150 int ld_moved = 0;
3151 int sd_idle = 0; 3151 int sd_idle = 0;
3152 int all_pinned = 0; 3152 int all_pinned = 0;
3153 cpumask_t cpus = CPU_MASK_ALL; 3153 cpumask_t cpus = CPU_MASK_ALL;
3154 3154
3155 /* 3155 /*
3156 * When power savings policy is enabled for the parent domain, an idle 3156 * When power savings policy is enabled for the parent domain, an idle
3157 * sibling can pick up load irrespective of busy siblings. In this case, 3157 * sibling can pick up load irrespective of busy siblings. In this case,
3158 * let the state of the idle sibling percolate up as IDLE, instead of 3158 * let the state of the idle sibling percolate up as IDLE, instead of
3159 * portraying it as CPU_NOT_IDLE. 3159 * portraying it as CPU_NOT_IDLE.
3160 */ 3160 */
3161 if (sd->flags & SD_SHARE_CPUPOWER && 3161 if (sd->flags & SD_SHARE_CPUPOWER &&
3162 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3162 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3163 sd_idle = 1; 3163 sd_idle = 1;
3164 3164
3165 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3165 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3166 redo: 3166 redo:
3167 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3167 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3168 &sd_idle, &cpus, NULL); 3168 &sd_idle, &cpus, NULL);
3169 if (!group) { 3169 if (!group) {
3170 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); 3170 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
3171 goto out_balanced; 3171 goto out_balanced;
3172 } 3172 }
3173 3173
3174 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, 3174 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
3175 &cpus); 3175 &cpus);
3176 if (!busiest) { 3176 if (!busiest) {
3177 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); 3177 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
3178 goto out_balanced; 3178 goto out_balanced;
3179 } 3179 }
3180 3180
3181 BUG_ON(busiest == this_rq); 3181 BUG_ON(busiest == this_rq);
3182 3182
3183 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); 3183 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
3184 3184
3185 ld_moved = 0; 3185 ld_moved = 0;
3186 if (busiest->nr_running > 1) { 3186 if (busiest->nr_running > 1) {
3187 /* Attempt to move tasks */ 3187 /* Attempt to move tasks */
3188 double_lock_balance(this_rq, busiest); 3188 double_lock_balance(this_rq, busiest);
3189 /* this_rq->clock is already updated */ 3189 /* this_rq->clock is already updated */
3190 update_rq_clock(busiest); 3190 update_rq_clock(busiest);
3191 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3191 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3192 imbalance, sd, CPU_NEWLY_IDLE, 3192 imbalance, sd, CPU_NEWLY_IDLE,
3193 &all_pinned); 3193 &all_pinned);
3194 spin_unlock(&busiest->lock); 3194 spin_unlock(&busiest->lock);
3195 3195
3196 if (unlikely(all_pinned)) { 3196 if (unlikely(all_pinned)) {
3197 cpu_clear(cpu_of(busiest), cpus); 3197 cpu_clear(cpu_of(busiest), cpus);
3198 if (!cpus_empty(cpus)) 3198 if (!cpus_empty(cpus))
3199 goto redo; 3199 goto redo;
3200 } 3200 }
3201 } 3201 }
3202 3202
3203 if (!ld_moved) { 3203 if (!ld_moved) {
3204 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 3204 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3205 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3205 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3206 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3206 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3207 return -1; 3207 return -1;
3208 } else 3208 } else
3209 sd->nr_balance_failed = 0; 3209 sd->nr_balance_failed = 0;
3210 3210
3211 return ld_moved; 3211 return ld_moved;
3212 3212
3213 out_balanced: 3213 out_balanced:
3214 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); 3214 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
3215 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3215 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3216 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3216 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3217 return -1; 3217 return -1;
3218 sd->nr_balance_failed = 0; 3218 sd->nr_balance_failed = 0;
3219 3219
3220 return 0; 3220 return 0;
3221 } 3221 }
3222 3222
3223 /* 3223 /*
3224 * idle_balance is called by schedule() if this_cpu is about to become 3224 * idle_balance is called by schedule() if this_cpu is about to become
3225 * idle. Attempts to pull tasks from other CPUs. 3225 * idle. Attempts to pull tasks from other CPUs.
3226 */ 3226 */
3227 static void idle_balance(int this_cpu, struct rq *this_rq) 3227 static void idle_balance(int this_cpu, struct rq *this_rq)
3228 { 3228 {
3229 struct sched_domain *sd; 3229 struct sched_domain *sd;
3230 int pulled_task = -1; 3230 int pulled_task = -1;
3231 unsigned long next_balance = jiffies + HZ; 3231 unsigned long next_balance = jiffies + HZ;
3232 3232
3233 for_each_domain(this_cpu, sd) { 3233 for_each_domain(this_cpu, sd) {
3234 unsigned long interval; 3234 unsigned long interval;
3235 3235
3236 if (!(sd->flags & SD_LOAD_BALANCE)) 3236 if (!(sd->flags & SD_LOAD_BALANCE))
3237 continue; 3237 continue;
3238 3238
3239 if (sd->flags & SD_BALANCE_NEWIDLE) 3239 if (sd->flags & SD_BALANCE_NEWIDLE)
3240 /* If we've pulled tasks over, stop searching: */ 3240 /* If we've pulled tasks over, stop searching: */
3241 pulled_task = load_balance_newidle(this_cpu, 3241 pulled_task = load_balance_newidle(this_cpu,
3242 this_rq, sd); 3242 this_rq, sd);
3243 3243
3244 interval = msecs_to_jiffies(sd->balance_interval); 3244 interval = msecs_to_jiffies(sd->balance_interval);
3245 if (time_after(next_balance, sd->last_balance + interval)) 3245 if (time_after(next_balance, sd->last_balance + interval))
3246 next_balance = sd->last_balance + interval; 3246 next_balance = sd->last_balance + interval;
3247 if (pulled_task) 3247 if (pulled_task)
3248 break; 3248 break;
3249 } 3249 }
3250 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 3250 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3251 /* 3251 /*
3252 * We are going idle. next_balance may be set based on 3252 * We are going idle. next_balance may be set based on
3253 * a busy processor. So reset next_balance. 3253 * a busy processor. So reset next_balance.
3254 */ 3254 */
3255 this_rq->next_balance = next_balance; 3255 this_rq->next_balance = next_balance;
3256 } 3256 }
3257 } 3257 }
3258 3258
3259 /* 3259 /*
3260 * active_load_balance is run by migration threads. It pushes running tasks 3260 * active_load_balance is run by migration threads. It pushes running tasks
3261 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3261 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3262 * running on each physical CPU where possible, and avoids physical / 3262 * running on each physical CPU where possible, and avoids physical /
3263 * logical imbalances. 3263 * logical imbalances.
3264 * 3264 *
3265 * Called with busiest_rq locked. 3265 * Called with busiest_rq locked.
3266 */ 3266 */
3267 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3267 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3268 { 3268 {
3269 int target_cpu = busiest_rq->push_cpu; 3269 int target_cpu = busiest_rq->push_cpu;
3270 struct sched_domain *sd; 3270 struct sched_domain *sd;
3271 struct rq *target_rq; 3271 struct rq *target_rq;
3272 3272
3273 /* Is there any task to move? */ 3273 /* Is there any task to move? */
3274 if (busiest_rq->nr_running <= 1) 3274 if (busiest_rq->nr_running <= 1)
3275 return; 3275 return;
3276 3276
3277 target_rq = cpu_rq(target_cpu); 3277 target_rq = cpu_rq(target_cpu);
3278 3278
3279 /* 3279 /*
3280 * This condition is "impossible"; if it occurs 3280 * This condition is "impossible"; if it occurs
3281 * we need to fix it. Originally reported by 3281 * we need to fix it. Originally reported by
3282 * Bjorn Helgaas on a 128-cpu setup. 3282 * Bjorn Helgaas on a 128-cpu setup.
3283 */ 3283 */
3284 BUG_ON(busiest_rq == target_rq); 3284 BUG_ON(busiest_rq == target_rq);
3285 3285
3286 /* move a task from busiest_rq to target_rq */ 3286 /* move a task from busiest_rq to target_rq */
3287 double_lock_balance(busiest_rq, target_rq); 3287 double_lock_balance(busiest_rq, target_rq);
3288 update_rq_clock(busiest_rq); 3288 update_rq_clock(busiest_rq);
3289 update_rq_clock(target_rq); 3289 update_rq_clock(target_rq);
3290 3290
3291 /* Search for an sd spanning us and the target CPU. */ 3291 /* Search for an sd spanning us and the target CPU. */
3292 for_each_domain(target_cpu, sd) { 3292 for_each_domain(target_cpu, sd) {
3293 if ((sd->flags & SD_LOAD_BALANCE) && 3293 if ((sd->flags & SD_LOAD_BALANCE) &&
3294 cpu_isset(busiest_cpu, sd->span)) 3294 cpu_isset(busiest_cpu, sd->span))
3295 break; 3295 break;
3296 } 3296 }
3297 3297
3298 if (likely(sd)) { 3298 if (likely(sd)) {
3299 schedstat_inc(sd, alb_count); 3299 schedstat_inc(sd, alb_count);
3300 3300
3301 if (move_one_task(target_rq, target_cpu, busiest_rq, 3301 if (move_one_task(target_rq, target_cpu, busiest_rq,
3302 sd, CPU_IDLE)) 3302 sd, CPU_IDLE))
3303 schedstat_inc(sd, alb_pushed); 3303 schedstat_inc(sd, alb_pushed);
3304 else 3304 else
3305 schedstat_inc(sd, alb_failed); 3305 schedstat_inc(sd, alb_failed);
3306 } 3306 }
3307 spin_unlock(&target_rq->lock); 3307 spin_unlock(&target_rq->lock);
3308 } 3308 }
3309 3309
3310 #ifdef CONFIG_NO_HZ 3310 #ifdef CONFIG_NO_HZ
3311 static struct { 3311 static struct {
3312 atomic_t load_balancer; 3312 atomic_t load_balancer;
3313 cpumask_t cpu_mask; 3313 cpumask_t cpu_mask;
3314 } nohz ____cacheline_aligned = { 3314 } nohz ____cacheline_aligned = {
3315 .load_balancer = ATOMIC_INIT(-1), 3315 .load_balancer = ATOMIC_INIT(-1),
3316 .cpu_mask = CPU_MASK_NONE, 3316 .cpu_mask = CPU_MASK_NONE,
3317 }; 3317 };
3318 3318
3319 /* 3319 /*
3320 * This routine will try to nominate the ilb (idle load balancing) 3320 * This routine will try to nominate the ilb (idle load balancing)
3321 * owner among the cpus whose ticks are stopped. The ilb owner will do the idle 3321 * owner among the cpus whose ticks are stopped. The ilb owner will do the idle
3322 * load balancing on behalf of all those cpus. If all the cpus in the system 3322 * load balancing on behalf of all those cpus. If all the cpus in the system
3323 * go into this tickless mode, then there will be no ilb owner (as there is 3323 * go into this tickless mode, then there will be no ilb owner (as there is
3324 * no need for one) and all the cpus will sleep till the next wakeup event 3324 * no need for one) and all the cpus will sleep till the next wakeup event
3325 * arrives... 3325 * arrives...
3326 * 3326 *
3327 * For the ilb owner, the tick is not stopped, and this tick will be used 3327 * For the ilb owner, the tick is not stopped, and this tick will be used
3328 * for idle load balancing. The ilb owner will still be part of 3328 * for idle load balancing. The ilb owner will still be part of
3329 * nohz.cpu_mask. 3329 * nohz.cpu_mask.
3330 * 3330 *
3331 * While stopping the tick, this cpu will become the ilb owner if there 3331 * While stopping the tick, this cpu will become the ilb owner if there
3332 * is no other owner, and it will remain the owner until this cpu becomes busy 3332 * is no other owner, and it will remain the owner until this cpu becomes busy
3333 * or until all cpus in the system stop their ticks, at which point 3333 * or until all cpus in the system stop their ticks, at which point
3334 * there is no need for an ilb owner. 3334 * there is no need for an ilb owner.
3335 * 3335 *
3336 * When the ilb owner becomes busy, it nominates another owner during the 3336 * When the ilb owner becomes busy, it nominates another owner during the
3337 * next busy scheduler_tick(). 3337 * next busy scheduler_tick().
3338 */ 3338 */
3339 int select_nohz_load_balancer(int stop_tick) 3339 int select_nohz_load_balancer(int stop_tick)
3340 { 3340 {
3341 int cpu = smp_processor_id(); 3341 int cpu = smp_processor_id();
3342 3342
3343 if (stop_tick) { 3343 if (stop_tick) {
3344 cpu_set(cpu, nohz.cpu_mask); 3344 cpu_set(cpu, nohz.cpu_mask);
3345 cpu_rq(cpu)->in_nohz_recently = 1; 3345 cpu_rq(cpu)->in_nohz_recently = 1;
3346 3346
3347 /* 3347 /*
3348 * If we are going offline and still the leader, give up! 3348 * If we are going offline and still the leader, give up!
3349 */ 3349 */
3350 if (cpu_is_offline(cpu) && 3350 if (cpu_is_offline(cpu) &&
3351 atomic_read(&nohz.load_balancer) == cpu) { 3351 atomic_read(&nohz.load_balancer) == cpu) {
3352 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3352 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3353 BUG(); 3353 BUG();
3354 return 0; 3354 return 0;
3355 } 3355 }
3356 3356
3357 /* time for ilb owner also to sleep */ 3357 /* time for ilb owner also to sleep */
3358 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3358 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3359 if (atomic_read(&nohz.load_balancer) == cpu) 3359 if (atomic_read(&nohz.load_balancer) == cpu)
3360 atomic_set(&nohz.load_balancer, -1); 3360 atomic_set(&nohz.load_balancer, -1);
3361 return 0; 3361 return 0;
3362 } 3362 }
3363 3363
3364 if (atomic_read(&nohz.load_balancer) == -1) { 3364 if (atomic_read(&nohz.load_balancer) == -1) {
3365 /* make me the ilb owner */ 3365 /* make me the ilb owner */
3366 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 3366 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3367 return 1; 3367 return 1;
3368 } else if (atomic_read(&nohz.load_balancer) == cpu) 3368 } else if (atomic_read(&nohz.load_balancer) == cpu)
3369 return 1; 3369 return 1;
3370 } else { 3370 } else {
3371 if (!cpu_isset(cpu, nohz.cpu_mask)) 3371 if (!cpu_isset(cpu, nohz.cpu_mask))
3372 return 0; 3372 return 0;
3373 3373
3374 cpu_clear(cpu, nohz.cpu_mask); 3374 cpu_clear(cpu, nohz.cpu_mask);
3375 3375
3376 if (atomic_read(&nohz.load_balancer) == cpu) 3376 if (atomic_read(&nohz.load_balancer) == cpu)
3377 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3377 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3378 BUG(); 3378 BUG();
3379 } 3379 }
3380 return 0; 3380 return 0;
3381 } 3381 }
3382 #endif 3382 #endif
3383 3383
3384 static DEFINE_SPINLOCK(balancing); 3384 static DEFINE_SPINLOCK(balancing);
3385 3385
3386 /* 3386 /*
3387 * It checks each scheduling domain to see if it is due to be balanced, 3387 * It checks each scheduling domain to see if it is due to be balanced,
3388 * and initiates a balancing operation if so. 3388 * and initiates a balancing operation if so.
3389 * 3389 *
3390 * Balancing parameters are set up in arch_init_sched_domains. 3390 * Balancing parameters are set up in arch_init_sched_domains.
3391 */ 3391 */
3392 static void rebalance_domains(int cpu, enum cpu_idle_type idle) 3392 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3393 { 3393 {
3394 int balance = 1; 3394 int balance = 1;
3395 struct rq *rq = cpu_rq(cpu); 3395 struct rq *rq = cpu_rq(cpu);
3396 unsigned long interval; 3396 unsigned long interval;
3397 struct sched_domain *sd; 3397 struct sched_domain *sd;
3398 /* Earliest time when we have to rebalance again */ 3398 /* Earliest time when we have to rebalance again */
3399 unsigned long next_balance = jiffies + 60*HZ; 3399 unsigned long next_balance = jiffies + 60*HZ;
3400 int update_next_balance = 0; 3400 int update_next_balance = 0;
3401 3401
3402 for_each_domain(cpu, sd) { 3402 for_each_domain(cpu, sd) {
3403 if (!(sd->flags & SD_LOAD_BALANCE)) 3403 if (!(sd->flags & SD_LOAD_BALANCE))
3404 continue; 3404 continue;
3405 3405
3406 interval = sd->balance_interval; 3406 interval = sd->balance_interval;
3407 if (idle != CPU_IDLE) 3407 if (idle != CPU_IDLE)
3408 interval *= sd->busy_factor; 3408 interval *= sd->busy_factor;
3409 3409
3410 /* scale ms to jiffies */ 3410 /* scale ms to jiffies */
3411 interval = msecs_to_jiffies(interval); 3411 interval = msecs_to_jiffies(interval);
3412 if (unlikely(!interval)) 3412 if (unlikely(!interval))
3413 interval = 1; 3413 interval = 1;
3414 if (interval > HZ*NR_CPUS/10) 3414 if (interval > HZ*NR_CPUS/10)
3415 interval = HZ*NR_CPUS/10; 3415 interval = HZ*NR_CPUS/10;
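/*
 * i.e. cap the busy-rebalance interval at NR_CPUS * 100ms; e.g. with
 * HZ == 1000 and NR_CPUS == 64 the cap is 6400 jiffies (6.4 seconds).
 */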
3416 3416
3417 3417
3418 if (sd->flags & SD_SERIALIZE) { 3418 if (sd->flags & SD_SERIALIZE) {
3419 if (!spin_trylock(&balancing)) 3419 if (!spin_trylock(&balancing))
3420 goto out; 3420 goto out;
3421 } 3421 }
3422 3422
3423 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3423 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3424 if (load_balance(cpu, rq, sd, idle, &balance)) { 3424 if (load_balance(cpu, rq, sd, idle, &balance)) {
3425 /* 3425 /*
3426 * We've pulled tasks over so either we're no 3426 * We've pulled tasks over so either we're no
3427 * longer idle, or one of our SMT siblings is 3427 * longer idle, or one of our SMT siblings is
3428 * not idle. 3428 * not idle.
3429 */ 3429 */
3430 idle = CPU_NOT_IDLE; 3430 idle = CPU_NOT_IDLE;
3431 } 3431 }
3432 sd->last_balance = jiffies; 3432 sd->last_balance = jiffies;
3433 } 3433 }
3434 if (sd->flags & SD_SERIALIZE) 3434 if (sd->flags & SD_SERIALIZE)
3435 spin_unlock(&balancing); 3435 spin_unlock(&balancing);
3436 out: 3436 out:
3437 if (time_after(next_balance, sd->last_balance + interval)) { 3437 if (time_after(next_balance, sd->last_balance + interval)) {
3438 next_balance = sd->last_balance + interval; 3438 next_balance = sd->last_balance + interval;
3439 update_next_balance = 1; 3439 update_next_balance = 1;
3440 } 3440 }
3441 3441
3442 /* 3442 /*
3443 * Stop the load balance at this level. There is another 3443 * Stop the load balance at this level. There is another
3444 * CPU in our sched group which is doing load balancing more 3444 * CPU in our sched group which is doing load balancing more
3445 * actively. 3445 * actively.
3446 */ 3446 */
3447 if (!balance) 3447 if (!balance)
3448 break; 3448 break;
3449 } 3449 }
3450 3450
3451 /* 3451 /*
3452 * next_balance will be updated only when there is a need. 3452 * next_balance will be updated only when there is a need.
3453 * When the cpu is attached to a null domain, for example, it will not be 3453 * When the cpu is attached to a null domain, for example, it will not be
3454 * updated. 3454 * updated.
3455 */ 3455 */
3456 if (likely(update_next_balance)) 3456 if (likely(update_next_balance))
3457 rq->next_balance = next_balance; 3457 rq->next_balance = next_balance;
3458 } 3458 }
3459 3459
3460 /* 3460 /*
3461 * run_rebalance_domains is triggered when needed from the scheduler tick. 3461 * run_rebalance_domains is triggered when needed from the scheduler tick.
3462 * In the CONFIG_NO_HZ case, the idle load balance owner will do the 3462 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
3463 * rebalancing for all the cpus for which scheduler ticks are stopped. 3463 * rebalancing for all the cpus for which scheduler ticks are stopped.
3464 */ 3464 */
3465 static void run_rebalance_domains(struct softirq_action *h) 3465 static void run_rebalance_domains(struct softirq_action *h)
3466 { 3466 {
3467 int this_cpu = smp_processor_id(); 3467 int this_cpu = smp_processor_id();
3468 struct rq *this_rq = cpu_rq(this_cpu); 3468 struct rq *this_rq = cpu_rq(this_cpu);
3469 enum cpu_idle_type idle = this_rq->idle_at_tick ? 3469 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3470 CPU_IDLE : CPU_NOT_IDLE; 3470 CPU_IDLE : CPU_NOT_IDLE;
3471 3471
3472 rebalance_domains(this_cpu, idle); 3472 rebalance_domains(this_cpu, idle);
3473 3473
3474 #ifdef CONFIG_NO_HZ 3474 #ifdef CONFIG_NO_HZ
3475 /* 3475 /*
3476 * If this cpu is the owner for idle load balancing, then do the 3476 * If this cpu is the owner for idle load balancing, then do the
3477 * balancing on behalf of the other idle cpus whose ticks are 3477 * balancing on behalf of the other idle cpus whose ticks are
3478 * stopped. 3478 * stopped.
3479 */ 3479 */
3480 if (this_rq->idle_at_tick && 3480 if (this_rq->idle_at_tick &&
3481 atomic_read(&nohz.load_balancer) == this_cpu) { 3481 atomic_read(&nohz.load_balancer) == this_cpu) {
3482 cpumask_t cpus = nohz.cpu_mask; 3482 cpumask_t cpus = nohz.cpu_mask;
3483 struct rq *rq; 3483 struct rq *rq;
3484 int balance_cpu; 3484 int balance_cpu;
3485 3485
3486 cpu_clear(this_cpu, cpus); 3486 cpu_clear(this_cpu, cpus);
3487 for_each_cpu_mask(balance_cpu, cpus) { 3487 for_each_cpu_mask(balance_cpu, cpus) {
3488 /* 3488 /*
3489 * If this cpu gets work to do, stop the load balancing 3489 * If this cpu gets work to do, stop the load balancing
3490 * work being done for other cpus. The next load 3490 * work being done for other cpus. The next load
3491 * balancing owner will pick it up. 3491 * balancing owner will pick it up.
3492 */ 3492 */
3493 if (need_resched()) 3493 if (need_resched())
3494 break; 3494 break;
3495 3495
3496 rebalance_domains(balance_cpu, CPU_IDLE); 3496 rebalance_domains(balance_cpu, CPU_IDLE);
3497 3497
3498 rq = cpu_rq(balance_cpu); 3498 rq = cpu_rq(balance_cpu);
3499 if (time_after(this_rq->next_balance, rq->next_balance)) 3499 if (time_after(this_rq->next_balance, rq->next_balance))
3500 this_rq->next_balance = rq->next_balance; 3500 this_rq->next_balance = rq->next_balance;
3501 } 3501 }
3502 } 3502 }
3503 #endif 3503 #endif
3504 } 3504 }
3505 3505
3506 /* 3506 /*
3507 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3507 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3508 * 3508 *
3509 * In the CONFIG_NO_HZ case, this is the place where we nominate a new 3509 * In the CONFIG_NO_HZ case, this is the place where we nominate a new
3510 * idle load balancing owner or decide to stop the periodic load balancing, 3510 * idle load balancing owner or decide to stop the periodic load balancing,
3511 * if the whole system is idle. 3511 * if the whole system is idle.
3512 */ 3512 */
3513 static inline void trigger_load_balance(struct rq *rq, int cpu) 3513 static inline void trigger_load_balance(struct rq *rq, int cpu)
3514 { 3514 {
3515 #ifdef CONFIG_NO_HZ 3515 #ifdef CONFIG_NO_HZ
3516 /* 3516 /*
3517 * If we were in the nohz mode recently and busy at the current 3517 * If we were in the nohz mode recently and busy at the current
3518 * scheduler tick, then check if we need to nominate a new idle 3518 * scheduler tick, then check if we need to nominate a new idle
3519 * load balancer. 3519 * load balancer.
3520 */ 3520 */
3521 if (rq->in_nohz_recently && !rq->idle_at_tick) { 3521 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3522 rq->in_nohz_recently = 0; 3522 rq->in_nohz_recently = 0;
3523 3523
3524 if (atomic_read(&nohz.load_balancer) == cpu) { 3524 if (atomic_read(&nohz.load_balancer) == cpu) {
3525 cpu_clear(cpu, nohz.cpu_mask); 3525 cpu_clear(cpu, nohz.cpu_mask);
3526 atomic_set(&nohz.load_balancer, -1); 3526 atomic_set(&nohz.load_balancer, -1);
3527 } 3527 }
3528 3528
3529 if (atomic_read(&nohz.load_balancer) == -1) { 3529 if (atomic_read(&nohz.load_balancer) == -1) {
3530 /* 3530 /*
3531 * simple selection for now: Nominate the 3531 * simple selection for now: Nominate the
3532 * first cpu in the nohz list to be the next 3532 * first cpu in the nohz list to be the next
3533 * ilb owner. 3533 * ilb owner.
3534 * 3534 *
3535 * TBD: Traverse the sched domains and nominate 3535 * TBD: Traverse the sched domains and nominate
3536 * the nearest cpu in the nohz.cpu_mask. 3536 * the nearest cpu in the nohz.cpu_mask.
3537 */ 3537 */
3538 int ilb = first_cpu(nohz.cpu_mask); 3538 int ilb = first_cpu(nohz.cpu_mask);
3539 3539
3540 if (ilb != NR_CPUS) 3540 if (ilb != NR_CPUS)
3541 resched_cpu(ilb); 3541 resched_cpu(ilb);
3542 } 3542 }
3543 } 3543 }
3544 3544
3545 /* 3545 /*
3546 * If this cpu is idle and doing idle load balancing for all the 3546 * If this cpu is idle and doing idle load balancing for all the
3547 * cpus with ticks stopped, is it time for that to stop? 3547 * cpus with ticks stopped, is it time for that to stop?
3548 */ 3548 */
3549 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 3549 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3550 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3550 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3551 resched_cpu(cpu); 3551 resched_cpu(cpu);
3552 return; 3552 return;
3553 } 3553 }
3554 3554
3555 /* 3555 /*
3556 * If this cpu is idle and the idle load balancing is done by 3556 * If this cpu is idle and the idle load balancing is done by
3557 * someone else, then there is no need to raise the SCHED_SOFTIRQ 3557 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3558 */ 3558 */
3559 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 3559 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3560 cpu_isset(cpu, nohz.cpu_mask)) 3560 cpu_isset(cpu, nohz.cpu_mask))
3561 return; 3561 return;
3562 #endif 3562 #endif
3563 if (time_after_eq(jiffies, rq->next_balance)) 3563 if (time_after_eq(jiffies, rq->next_balance))
3564 raise_softirq(SCHED_SOFTIRQ); 3564 raise_softirq(SCHED_SOFTIRQ);
3565 } 3565 }
3566 3566
3567 #else /* CONFIG_SMP */ 3567 #else /* CONFIG_SMP */
3568 3568
3569 /* 3569 /*
3570 * on UP we do not need to balance between CPUs: 3570 * on UP we do not need to balance between CPUs:
3571 */ 3571 */
3572 static inline void idle_balance(int cpu, struct rq *rq) 3572 static inline void idle_balance(int cpu, struct rq *rq)
3573 { 3573 {
3574 } 3574 }
3575 3575
3576 #endif 3576 #endif
3577 3577
3578 DEFINE_PER_CPU(struct kernel_stat, kstat); 3578 DEFINE_PER_CPU(struct kernel_stat, kstat);
3579 3579
3580 EXPORT_PER_CPU_SYMBOL(kstat); 3580 EXPORT_PER_CPU_SYMBOL(kstat);
3581 3581
3582 /* 3582 /*
3583 * Return p->sum_exec_runtime plus any more ns on the sched_clock 3583 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3584 * that have not yet been banked in case the task is currently running. 3584 * that have not yet been banked in case the task is currently running.
3585 */ 3585 */
3586 unsigned long long task_sched_runtime(struct task_struct *p) 3586 unsigned long long task_sched_runtime(struct task_struct *p)
3587 { 3587 {
3588 unsigned long flags; 3588 unsigned long flags;
3589 u64 ns, delta_exec; 3589 u64 ns, delta_exec;
3590 struct rq *rq; 3590 struct rq *rq;
3591 3591
3592 rq = task_rq_lock(p, &flags); 3592 rq = task_rq_lock(p, &flags);
3593 ns = p->se.sum_exec_runtime; 3593 ns = p->se.sum_exec_runtime;
3594 if (task_current(rq, p)) { 3594 if (task_current(rq, p)) {
3595 update_rq_clock(rq); 3595 update_rq_clock(rq);
3596 delta_exec = rq->clock - p->se.exec_start; 3596 delta_exec = rq->clock - p->se.exec_start;
3597 if ((s64)delta_exec > 0) 3597 if ((s64)delta_exec > 0)
3598 ns += delta_exec; 3598 ns += delta_exec;
3599 } 3599 }
3600 task_rq_unlock(rq, &flags); 3600 task_rq_unlock(rq, &flags);
3601 3601
3602 return ns; 3602 return ns;
3603 } 3603 }
3604 3604
3605 /* 3605 /*
3606 * Account user cpu time to a process. 3606 * Account user cpu time to a process.
3607 * @p: the process that the cpu time gets accounted to 3607 * @p: the process that the cpu time gets accounted to
3608 * @cputime: the cpu time spent in user space since the last update 3608 * @cputime: the cpu time spent in user space since the last update
3609 */ 3609 */
3610 void account_user_time(struct task_struct *p, cputime_t cputime) 3610 void account_user_time(struct task_struct *p, cputime_t cputime)
3611 { 3611 {
3612 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3612 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3613 cputime64_t tmp; 3613 cputime64_t tmp;
3614 3614
3615 p->utime = cputime_add(p->utime, cputime); 3615 p->utime = cputime_add(p->utime, cputime);
3616 3616
3617 /* Add user time to cpustat. */ 3617 /* Add user time to cpustat. */
3618 tmp = cputime_to_cputime64(cputime); 3618 tmp = cputime_to_cputime64(cputime);
3619 if (TASK_NICE(p) > 0) 3619 if (TASK_NICE(p) > 0)
3620 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3620 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3621 else 3621 else
3622 cpustat->user = cputime64_add(cpustat->user, tmp); 3622 cpustat->user = cputime64_add(cpustat->user, tmp);
3623 } 3623 }
3624 3624
3625 /* 3625 /*
3626 * Account guest cpu time to a process. 3626 * Account guest cpu time to a process.
3627 * @p: the process that the cpu time gets accounted to 3627 * @p: the process that the cpu time gets accounted to
3628 * @cputime: the cpu time spent in virtual machine since the last update 3628 * @cputime: the cpu time spent in virtual machine since the last update
3629 */ 3629 */
3630 static void account_guest_time(struct task_struct *p, cputime_t cputime) 3630 static void account_guest_time(struct task_struct *p, cputime_t cputime)
3631 { 3631 {
3632 cputime64_t tmp; 3632 cputime64_t tmp;
3633 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3633 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3634 3634
3635 tmp = cputime_to_cputime64(cputime); 3635 tmp = cputime_to_cputime64(cputime);
3636 3636
3637 p->utime = cputime_add(p->utime, cputime); 3637 p->utime = cputime_add(p->utime, cputime);
3638 p->gtime = cputime_add(p->gtime, cputime); 3638 p->gtime = cputime_add(p->gtime, cputime);
3639 3639
3640 cpustat->user = cputime64_add(cpustat->user, tmp); 3640 cpustat->user = cputime64_add(cpustat->user, tmp);
3641 cpustat->guest = cputime64_add(cpustat->guest, tmp); 3641 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3642 } 3642 }
3643 3643
3644 /* 3644 /*
3645 * Account scaled user cpu time to a process. 3645 * Account scaled user cpu time to a process.
3646 * @p: the process that the cpu time gets accounted to 3646 * @p: the process that the cpu time gets accounted to
3647 * @cputime: the cpu time spent in user space since the last update 3647 * @cputime: the cpu time spent in user space since the last update
3648 */ 3648 */
3649 void account_user_time_scaled(struct task_struct *p, cputime_t cputime) 3649 void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
3650 { 3650 {
3651 p->utimescaled = cputime_add(p->utimescaled, cputime); 3651 p->utimescaled = cputime_add(p->utimescaled, cputime);
3652 } 3652 }
3653 3653
3654 /* 3654 /*
3655 * Account system cpu time to a process. 3655 * Account system cpu time to a process.
3656 * @p: the process that the cpu time gets accounted to 3656 * @p: the process that the cpu time gets accounted to
3657 * @hardirq_offset: the offset to subtract from hardirq_count() 3657 * @hardirq_offset: the offset to subtract from hardirq_count()
3658 * @cputime: the cpu time spent in kernel space since the last update 3658 * @cputime: the cpu time spent in kernel space since the last update
3659 */ 3659 */
3660 void account_system_time(struct task_struct *p, int hardirq_offset, 3660 void account_system_time(struct task_struct *p, int hardirq_offset,
3661 cputime_t cputime) 3661 cputime_t cputime)
3662 { 3662 {
3663 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3663 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3664 struct rq *rq = this_rq(); 3664 struct rq *rq = this_rq();
3665 cputime64_t tmp; 3665 cputime64_t tmp;
3666 3666
3667 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) 3667 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
3668 return account_guest_time(p, cputime); 3668 return account_guest_time(p, cputime);
3669 3669
3670 p->stime = cputime_add(p->stime, cputime); 3670 p->stime = cputime_add(p->stime, cputime);
3671 3671
3672 /* Add system time to cpustat. */ 3672 /* Add system time to cpustat. */
3673 tmp = cputime_to_cputime64(cputime); 3673 tmp = cputime_to_cputime64(cputime);
3674 if (hardirq_count() - hardirq_offset) 3674 if (hardirq_count() - hardirq_offset)
3675 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3675 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3676 else if (softirq_count()) 3676 else if (softirq_count())
3677 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3677 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3678 else if (p != rq->idle) 3678 else if (p != rq->idle)
3679 cpustat->system = cputime64_add(cpustat->system, tmp); 3679 cpustat->system = cputime64_add(cpustat->system, tmp);
3680 else if (atomic_read(&rq->nr_iowait) > 0) 3680 else if (atomic_read(&rq->nr_iowait) > 0)
3681 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3681 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3682 else 3682 else
3683 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3683 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3684 /* Account for system time used */ 3684 /* Account for system time used */
3685 acct_update_integrals(p); 3685 acct_update_integrals(p);
3686 } 3686 }
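
Editor's note: account_system_time() classifies a slice of kernel time by a strict precedence: hardirq context wins over softirq, which wins over plain system time, and only the idle task's time is split into iowait versus idle. The standalone sketch below restates that decision chain outside the kernel purely to make the ordering explicit; the enum and helper names are illustrative, not kernel symbols.

#include <stdio.h>
#include <stdbool.h>

enum bucket { BUCKET_IRQ, BUCKET_SOFTIRQ, BUCKET_SYSTEM, BUCKET_IOWAIT, BUCKET_IDLE };

/* Mirrors the if/else ladder in account_system_time(): the first matching
 * context wins, and only the idle task can land in iowait or idle. */
static enum bucket classify(bool in_hardirq, bool in_softirq,
			    bool is_idle_task, bool cpu_has_iowaiters)
{
	if (in_hardirq)
		return BUCKET_IRQ;
	if (in_softirq)
		return BUCKET_SOFTIRQ;
	if (!is_idle_task)
		return BUCKET_SYSTEM;
	if (cpu_has_iowaiters)
		return BUCKET_IOWAIT;
	return BUCKET_IDLE;
}

int main(void)
{
	/* Idle task, no pending irq work, but tasks are sleeping in I/O:
	 * the tick is charged to iowait rather than idle. */
	printf("%d\n", classify(false, false, true, true) == BUCKET_IOWAIT);
	return 0;
}
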
3687 3687
3688 /* 3688 /*
3689 * Account scaled system cpu time to a process. 3689 * Account scaled system cpu time to a process.
3690 * @p: the process that the cpu time gets accounted to 3690 * @p: the process that the cpu time gets accounted to
3691 * @hardirq_offset: the offset to subtract from hardirq_count() 3691 * @hardirq_offset: the offset to subtract from hardirq_count()
3692 * @cputime: the cpu time spent in kernel space since the last update 3692 * @cputime: the cpu time spent in kernel space since the last update
3693 */ 3693 */
3694 void account_system_time_scaled(struct task_struct *p, cputime_t cputime) 3694 void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
3695 { 3695 {
3696 p->stimescaled = cputime_add(p->stimescaled, cputime); 3696 p->stimescaled = cputime_add(p->stimescaled, cputime);
3697 } 3697 }
3698 3698
3699 /* 3699 /*
3700 * Account for involuntary wait time. 3700 * Account for involuntary wait time.
3701 * @p: the process from which the cpu time has been stolen 3701 * @p: the process from which the cpu time has been stolen
3702 * @steal: the cpu time spent in involuntary wait 3702 * @steal: the cpu time spent in involuntary wait
3703 */ 3703 */
3704 void account_steal_time(struct task_struct *p, cputime_t steal) 3704 void account_steal_time(struct task_struct *p, cputime_t steal)
3705 { 3705 {
3706 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3706 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3707 cputime64_t tmp = cputime_to_cputime64(steal); 3707 cputime64_t tmp = cputime_to_cputime64(steal);
3708 struct rq *rq = this_rq(); 3708 struct rq *rq = this_rq();
3709 3709
3710 if (p == rq->idle) { 3710 if (p == rq->idle) {
3711 p->stime = cputime_add(p->stime, steal); 3711 p->stime = cputime_add(p->stime, steal);
3712 if (atomic_read(&rq->nr_iowait) > 0) 3712 if (atomic_read(&rq->nr_iowait) > 0)
3713 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3713 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3714 else 3714 else
3715 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3715 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3716 } else 3716 } else
3717 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3717 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3718 } 3718 }
3719 3719
3720 /* 3720 /*
3721 * This function gets called by the timer code, with HZ frequency. 3721 * This function gets called by the timer code, with HZ frequency.
3722 * We call it with interrupts disabled. 3722 * We call it with interrupts disabled.
3723 * 3723 *
3724 * It also gets called by the fork code, when changing the parent's 3724 * It also gets called by the fork code, when changing the parent's
3725 * timeslices. 3725 * timeslices.
3726 */ 3726 */
3727 void scheduler_tick(void) 3727 void scheduler_tick(void)
3728 { 3728 {
3729 int cpu = smp_processor_id(); 3729 int cpu = smp_processor_id();
3730 struct rq *rq = cpu_rq(cpu); 3730 struct rq *rq = cpu_rq(cpu);
3731 struct task_struct *curr = rq->curr; 3731 struct task_struct *curr = rq->curr;
3732 u64 next_tick = rq->tick_timestamp + TICK_NSEC; 3732 u64 next_tick = rq->tick_timestamp + TICK_NSEC;
3733 3733
3734 spin_lock(&rq->lock); 3734 spin_lock(&rq->lock);
3735 __update_rq_clock(rq); 3735 __update_rq_clock(rq);
3736 /* 3736 /*
3737 * Let rq->clock advance by at least TICK_NSEC: 3737 * Let rq->clock advance by at least TICK_NSEC:
3738 */ 3738 */
3739 if (unlikely(rq->clock < next_tick)) { 3739 if (unlikely(rq->clock < next_tick)) {
3740 rq->clock = next_tick; 3740 rq->clock = next_tick;
3741 rq->clock_underflows++; 3741 rq->clock_underflows++;
3742 } 3742 }
3743 rq->tick_timestamp = rq->clock; 3743 rq->tick_timestamp = rq->clock;
3744 update_cpu_load(rq); 3744 update_cpu_load(rq);
3745 curr->sched_class->task_tick(rq, curr, 0); 3745 curr->sched_class->task_tick(rq, curr, 0);
3746 update_sched_rt_period(rq); 3746 update_sched_rt_period(rq);
3747 spin_unlock(&rq->lock); 3747 spin_unlock(&rq->lock);
3748 3748
3749 #ifdef CONFIG_SMP 3749 #ifdef CONFIG_SMP
3750 rq->idle_at_tick = idle_cpu(cpu); 3750 rq->idle_at_tick = idle_cpu(cpu);
3751 trigger_load_balance(rq, cpu); 3751 trigger_load_balance(rq, cpu);
3752 #endif 3752 #endif
3753 } 3753 }
3754 3754
3755 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 3755 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3756 3756
3757 void fastcall add_preempt_count(int val) 3757 void fastcall add_preempt_count(int val)
3758 { 3758 {
3759 /* 3759 /*
3760 * Underflow? 3760 * Underflow?
3761 */ 3761 */
3762 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3762 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3763 return; 3763 return;
3764 preempt_count() += val; 3764 preempt_count() += val;
3765 /* 3765 /*
3766 * Spinlock count overflowing soon? 3766 * Spinlock count overflowing soon?
3767 */ 3767 */
3768 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3768 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3769 PREEMPT_MASK - 10); 3769 PREEMPT_MASK - 10);
3770 } 3770 }
3771 EXPORT_SYMBOL(add_preempt_count); 3771 EXPORT_SYMBOL(add_preempt_count);
3772 3772
3773 void fastcall sub_preempt_count(int val) 3773 void fastcall sub_preempt_count(int val)
3774 { 3774 {
3775 /* 3775 /*
3776 * Underflow? 3776 * Underflow?
3777 */ 3777 */
3778 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3778 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3779 return; 3779 return;
3780 /* 3780 /*
3781 * Is the spinlock portion underflowing? 3781 * Is the spinlock portion underflowing?
3782 */ 3782 */
3783 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3783 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3784 !(preempt_count() & PREEMPT_MASK))) 3784 !(preempt_count() & PREEMPT_MASK)))
3785 return; 3785 return;
3786 3786
3787 preempt_count() -= val; 3787 preempt_count() -= val;
3788 } 3788 }
3789 EXPORT_SYMBOL(sub_preempt_count); 3789 EXPORT_SYMBOL(sub_preempt_count);
3790 3790
3791 #endif 3791 #endif
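
Editor's note: the debug checks above treat preempt_count() as a packed bitfield; the low byte counts nested preempt_disable()/spin_lock() sections (PREEMPT_MASK is assumed here to be 0x000000ff), and the "overflowing soon" warning fires a few counts before that byte saturates. A userspace-style sketch of the same arithmetic:

#include <stdio.h>

#define PREEMPT_MASK 0x000000ffU   /* assumption: low 8 bits hold the preempt depth */

/* Nonzero when further nesting would soon overflow the preempt byte --
 * the same "PREEMPT_MASK - 10" headroom used by add_preempt_count(). */
static int preempt_overflow_soon(unsigned int count)
{
	return (count & PREEMPT_MASK) >= PREEMPT_MASK - 10;
}

int main(void)
{
	printf("%d %d\n",
	       preempt_overflow_soon(0x000000f8),   /* 248 nested sections: warn */
	       preempt_overflow_soon(0x00000002));  /* normal nesting: fine */
	return 0;
}
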
3792 3792
3793 /* 3793 /*
3794 * Print scheduling while atomic bug: 3794 * Print scheduling while atomic bug:
3795 */ 3795 */
3796 static noinline void __schedule_bug(struct task_struct *prev) 3796 static noinline void __schedule_bug(struct task_struct *prev)
3797 { 3797 {
3798 struct pt_regs *regs = get_irq_regs(); 3798 struct pt_regs *regs = get_irq_regs();
3799 3799
3800 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3800 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3801 prev->comm, prev->pid, preempt_count()); 3801 prev->comm, prev->pid, preempt_count());
3802 3802
3803 debug_show_held_locks(prev); 3803 debug_show_held_locks(prev);
3804 if (irqs_disabled()) 3804 if (irqs_disabled())
3805 print_irqtrace_events(prev); 3805 print_irqtrace_events(prev);
3806 3806
3807 if (regs) 3807 if (regs)
3808 show_regs(regs); 3808 show_regs(regs);
3809 else 3809 else
3810 dump_stack(); 3810 dump_stack();
3811 } 3811 }
3812 3812
3813 /* 3813 /*
3814 * Various schedule()-time debugging checks and statistics: 3814 * Various schedule()-time debugging checks and statistics:
3815 */ 3815 */
3816 static inline void schedule_debug(struct task_struct *prev) 3816 static inline void schedule_debug(struct task_struct *prev)
3817 { 3817 {
3818 /* 3818 /*
3819 * Test if we are atomic. Since do_exit() needs to call into 3819 * Test if we are atomic. Since do_exit() needs to call into
3820 * schedule() atomically, we ignore that path for now. 3820 * schedule() atomically, we ignore that path for now.
3821 * Otherwise, whine if we are scheduling when we should not be. 3821 * Otherwise, whine if we are scheduling when we should not be.
3822 */ 3822 */
3823 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) 3823 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3824 __schedule_bug(prev); 3824 __schedule_bug(prev);
3825 3825
3826 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3826 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3827 3827
3828 schedstat_inc(this_rq(), sched_count); 3828 schedstat_inc(this_rq(), sched_count);
3829 #ifdef CONFIG_SCHEDSTATS 3829 #ifdef CONFIG_SCHEDSTATS
3830 if (unlikely(prev->lock_depth >= 0)) { 3830 if (unlikely(prev->lock_depth >= 0)) {
3831 schedstat_inc(this_rq(), bkl_count); 3831 schedstat_inc(this_rq(), bkl_count);
3832 schedstat_inc(prev, sched_info.bkl_count); 3832 schedstat_inc(prev, sched_info.bkl_count);
3833 } 3833 }
3834 #endif 3834 #endif
3835 } 3835 }
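
Editor's note: schedule_debug() is what produces the familiar "BUG: scheduling while atomic" splat when schedule() is reached with a non-zero preempt count (do_exit() excepted). Purely as a cautionary sketch of module code, not something to ship, the simplest way to provoke it on a kernel where spin_lock() raises the preempt count (e.g. CONFIG_PREEMPT) is to sleep while holding a spinlock:

#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/delay.h>

static DEFINE_SPINLOCK(demo_lock);

/* DO NOT DO THIS: msleep() ends up in schedule() while the spinlock keeps
 * preemption disabled, so schedule_debug() -> __schedule_bug() complains. */
static int __init atomic_sleep_demo_init(void)
{
	spin_lock(&demo_lock);
	msleep(10);		/* sleeps in atomic context: triggers the BUG */
	spin_unlock(&demo_lock);
	return 0;
}

static void __exit atomic_sleep_demo_exit(void) { }

module_init(atomic_sleep_demo_init);
module_exit(atomic_sleep_demo_exit);
MODULE_LICENSE("GPL");
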
3836 3836
3837 /* 3837 /*
3838 * Pick up the highest-prio task: 3838 * Pick up the highest-prio task:
3839 */ 3839 */
3840 static inline struct task_struct * 3840 static inline struct task_struct *
3841 pick_next_task(struct rq *rq, struct task_struct *prev) 3841 pick_next_task(struct rq *rq, struct task_struct *prev)
3842 { 3842 {
3843 const struct sched_class *class; 3843 const struct sched_class *class;
3844 struct task_struct *p; 3844 struct task_struct *p;
3845 3845
3846 /* 3846 /*
3847 * Optimization: we know that if all tasks are in 3847 * Optimization: we know that if all tasks are in
3848 * the fair class we can call that function directly: 3848 * the fair class we can call that function directly:
3849 */ 3849 */
3850 if (likely(rq->nr_running == rq->cfs.nr_running)) { 3850 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3851 p = fair_sched_class.pick_next_task(rq); 3851 p = fair_sched_class.pick_next_task(rq);
3852 if (likely(p)) 3852 if (likely(p))
3853 return p; 3853 return p;
3854 } 3854 }
3855 3855
3856 class = sched_class_highest; 3856 class = sched_class_highest;
3857 for ( ; ; ) { 3857 for ( ; ; ) {
3858 p = class->pick_next_task(rq); 3858 p = class->pick_next_task(rq);
3859 if (p) 3859 if (p)
3860 return p; 3860 return p;
3861 /* 3861 /*
3862 * Will never be NULL as the idle class always 3862 * Will never be NULL as the idle class always
3863 * returns a non-NULL p: 3863 * returns a non-NULL p:
3864 */ 3864 */
3865 class = class->next; 3865 class = class->next;
3866 } 3866 }
3867 } 3867 }
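
Editor's note: pick_next_task() walks the scheduling classes from highest priority downwards (the real-time class first in this tree, then fair, then idle) and takes the first runnable task offered; the fast path merely short-circuits the walk when only fair tasks are runnable. The toy sketch below reproduces the "first class returning non-NULL wins" pattern with made-up class and task types:

#include <stdio.h>
#include <stddef.h>

struct toy_task { const char *name; };

struct toy_class {
	const struct toy_class *next;		/* next lower-priority class */
	struct toy_task *(*pick_next)(void);	/* NULL if nothing runnable */
};

static struct toy_task idle_task_obj = { "idle" };

static struct toy_task *rt_pick(void)   { return NULL; }		/* no RT work */
static struct toy_task *fair_pick(void) { return NULL; }		/* no fair work */
static struct toy_task *idle_pick(void) { return &idle_task_obj; }	/* always succeeds */

static const struct toy_class idle_class = { NULL,        idle_pick };
static const struct toy_class fair_class = { &idle_class, fair_pick };
static const struct toy_class rt_class   = { &fair_class, rt_pick };

/* Same shape as pick_next_task(): highest class first; the idle class at the
 * end of the chain guarantees the loop terminates with a task. */
static struct toy_task *toy_pick_next(void)
{
	const struct toy_class *class;

	for (class = &rt_class; ; class = class->next) {
		struct toy_task *p = class->pick_next();
		if (p)
			return p;
	}
}

int main(void)
{
	printf("picked: %s\n", toy_pick_next()->name);
	return 0;
}
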
3868 3868
3869 /* 3869 /*
3870 * schedule() is the main scheduler function. 3870 * schedule() is the main scheduler function.
3871 */ 3871 */
3872 asmlinkage void __sched schedule(void) 3872 asmlinkage void __sched schedule(void)
3873 { 3873 {
3874 struct task_struct *prev, *next; 3874 struct task_struct *prev, *next;
3875 long *switch_count; 3875 long *switch_count;
3876 struct rq *rq; 3876 struct rq *rq;
3877 int cpu; 3877 int cpu;
3878 3878
3879 need_resched: 3879 need_resched:
3880 preempt_disable(); 3880 preempt_disable();
3881 cpu = smp_processor_id(); 3881 cpu = smp_processor_id();
3882 rq = cpu_rq(cpu); 3882 rq = cpu_rq(cpu);
3883 rcu_qsctr_inc(cpu); 3883 rcu_qsctr_inc(cpu);
3884 prev = rq->curr; 3884 prev = rq->curr;
3885 switch_count = &prev->nivcsw; 3885 switch_count = &prev->nivcsw;
3886 3886
3887 release_kernel_lock(prev); 3887 release_kernel_lock(prev);
3888 need_resched_nonpreemptible: 3888 need_resched_nonpreemptible:
3889 3889
3890 schedule_debug(prev); 3890 schedule_debug(prev);
3891 3891
3892 hrtick_clear(rq); 3892 hrtick_clear(rq);
3893 3893
3894 /* 3894 /*
3895 * Do the rq-clock update outside the rq lock: 3895 * Do the rq-clock update outside the rq lock:
3896 */ 3896 */
3897 local_irq_disable(); 3897 local_irq_disable();
3898 __update_rq_clock(rq); 3898 __update_rq_clock(rq);
3899 spin_lock(&rq->lock); 3899 spin_lock(&rq->lock);
3900 clear_tsk_need_resched(prev); 3900 clear_tsk_need_resched(prev);
3901 3901
3902 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3902 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3903 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 3903 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3904 unlikely(signal_pending(prev)))) { 3904 unlikely(signal_pending(prev)))) {
3905 prev->state = TASK_RUNNING; 3905 prev->state = TASK_RUNNING;
3906 } else { 3906 } else {
3907 deactivate_task(rq, prev, 1); 3907 deactivate_task(rq, prev, 1);
3908 } 3908 }
3909 switch_count = &prev->nvcsw; 3909 switch_count = &prev->nvcsw;
3910 } 3910 }
3911 3911
3912 #ifdef CONFIG_SMP 3912 #ifdef CONFIG_SMP
3913 if (prev->sched_class->pre_schedule) 3913 if (prev->sched_class->pre_schedule)
3914 prev->sched_class->pre_schedule(rq, prev); 3914 prev->sched_class->pre_schedule(rq, prev);
3915 #endif 3915 #endif
3916 3916
3917 if (unlikely(!rq->nr_running)) 3917 if (unlikely(!rq->nr_running))
3918 idle_balance(cpu, rq); 3918 idle_balance(cpu, rq);
3919 3919
3920 prev->sched_class->put_prev_task(rq, prev); 3920 prev->sched_class->put_prev_task(rq, prev);
3921 next = pick_next_task(rq, prev); 3921 next = pick_next_task(rq, prev);
3922 3922
3923 sched_info_switch(prev, next); 3923 sched_info_switch(prev, next);
3924 3924
3925 if (likely(prev != next)) { 3925 if (likely(prev != next)) {
3926 rq->nr_switches++; 3926 rq->nr_switches++;
3927 rq->curr = next; 3927 rq->curr = next;
3928 ++*switch_count; 3928 ++*switch_count;
3929 3929
3930 context_switch(rq, prev, next); /* unlocks the rq */ 3930 context_switch(rq, prev, next); /* unlocks the rq */
3931 /* 3931 /*
3932 * the context switch might have flipped the stack from under 3932 * the context switch might have flipped the stack from under
3933 * us, hence refresh the local variables. 3933 * us, hence refresh the local variables.
3934 */ 3934 */
3935 cpu = smp_processor_id(); 3935 cpu = smp_processor_id();
3936 rq = cpu_rq(cpu); 3936 rq = cpu_rq(cpu);
3937 } else 3937 } else
3938 spin_unlock_irq(&rq->lock); 3938 spin_unlock_irq(&rq->lock);
3939 3939
3940 hrtick_set(rq); 3940 hrtick_set(rq);
3941 3941
3942 if (unlikely(reacquire_kernel_lock(current) < 0)) 3942 if (unlikely(reacquire_kernel_lock(current) < 0))
3943 goto need_resched_nonpreemptible; 3943 goto need_resched_nonpreemptible;
3944 3944
3945 preempt_enable_no_resched(); 3945 preempt_enable_no_resched();
3946 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3946 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3947 goto need_resched; 3947 goto need_resched;
3948 } 3948 }
3949 EXPORT_SYMBOL(schedule); 3949 EXPORT_SYMBOL(schedule);
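
Editor's note: schedule() only dequeues prev when prev->state is non-running and the call is not a preemption (PREEMPT_ACTIVE clear), and it puts a TASK_INTERRUPTIBLE sleeper back to TASK_RUNNING if a signal is pending. That is what makes the classic kernel wait idiom below safe against lost wakeups and against being preempted between setting the state and calling schedule(); this is a generic sketch of the idiom, not code from this file, and the 'cond'/'wq' names are assumed to come from the surrounding driver.

#include <linux/sched.h>
#include <linux/wait.h>

/* Sleep until *cond becomes true.  The task state is set before the
 * condition is tested, so a wakeup landing in between simply makes
 * schedule() return immediately instead of being lost.  Real code would
 * normally also check signal_pending(current) inside the loop. */
static void wait_for_cond(wait_queue_head_t *wq, int *cond)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
		if (*cond)
			break;
		schedule();
	}
	finish_wait(wq, &wait);
}
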
3950 3950
3951 #ifdef CONFIG_PREEMPT 3951 #ifdef CONFIG_PREEMPT
3952 /* 3952 /*
3953 * this is the entry point to schedule() from in-kernel preemption 3953 * this is the entry point to schedule() from in-kernel preemption
3954 * off of preempt_enable. Kernel preemptions off return from interrupt 3954 * off of preempt_enable. Kernel preemptions off return from interrupt
3955 * occur there and call schedule directly. 3955 * occur there and call schedule directly.
3956 */ 3956 */
3957 asmlinkage void __sched preempt_schedule(void) 3957 asmlinkage void __sched preempt_schedule(void)
3958 { 3958 {
3959 struct thread_info *ti = current_thread_info(); 3959 struct thread_info *ti = current_thread_info();
3960 struct task_struct *task = current; 3960 struct task_struct *task = current;
3961 int saved_lock_depth; 3961 int saved_lock_depth;
3962 3962
3963 /* 3963 /*
3964 * If there is a non-zero preempt_count or interrupts are disabled, 3964 * If there is a non-zero preempt_count or interrupts are disabled,
3965 * we do not want to preempt the current task. Just return.. 3965 * we do not want to preempt the current task. Just return..
3966 */ 3966 */
3967 if (likely(ti->preempt_count || irqs_disabled())) 3967 if (likely(ti->preempt_count || irqs_disabled()))
3968 return; 3968 return;
3969 3969
3970 do { 3970 do {
3971 add_preempt_count(PREEMPT_ACTIVE); 3971 add_preempt_count(PREEMPT_ACTIVE);
3972 3972
3973 /* 3973 /*
3974 * We keep the big kernel semaphore locked, but we 3974 * We keep the big kernel semaphore locked, but we
3975 * clear ->lock_depth so that schedule() doesn't 3975 * clear ->lock_depth so that schedule() doesn't
3976 * auto-release the semaphore: 3976 * auto-release the semaphore:
3977 */ 3977 */
3978 saved_lock_depth = task->lock_depth; 3978 saved_lock_depth = task->lock_depth;
3979 task->lock_depth = -1; 3979 task->lock_depth = -1;
3980 schedule(); 3980 schedule();
3981 task->lock_depth = saved_lock_depth; 3981 task->lock_depth = saved_lock_depth;
3982 sub_preempt_count(PREEMPT_ACTIVE); 3982 sub_preempt_count(PREEMPT_ACTIVE);
3983 3983
3984 /* 3984 /*
3985 * Check again in case we missed a preemption opportunity 3985 * Check again in case we missed a preemption opportunity
3986 * between schedule and now. 3986 * between schedule and now.
3987 */ 3987 */
3988 barrier(); 3988 barrier();
3989 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 3989 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
3990 } 3990 }
3991 EXPORT_SYMBOL(preempt_schedule); 3991 EXPORT_SYMBOL(preempt_schedule);
3992 3992
3993 /* 3993 /*
3994 * this is the entry point to schedule() from kernel preemption 3994 * this is the entry point to schedule() from kernel preemption
3995 * off of irq context. 3995 * off of irq context.
3996 * Note that this is called and returns with irqs disabled. This will 3996 * Note that this is called and returns with irqs disabled. This will
3997 * protect us against recursive calling from irq. 3997 * protect us against recursive calling from irq.
3998 */ 3998 */
3999 asmlinkage void __sched preempt_schedule_irq(void) 3999 asmlinkage void __sched preempt_schedule_irq(void)
4000 { 4000 {
4001 struct thread_info *ti = current_thread_info(); 4001 struct thread_info *ti = current_thread_info();
4002 struct task_struct *task = current; 4002 struct task_struct *task = current;
4003 int saved_lock_depth; 4003 int saved_lock_depth;
4004 4004
4005 /* Catch callers which need to be fixed */ 4005 /* Catch callers which need to be fixed */
4006 BUG_ON(ti->preempt_count || !irqs_disabled()); 4006 BUG_ON(ti->preempt_count || !irqs_disabled());
4007 4007
4008 do { 4008 do {
4009 add_preempt_count(PREEMPT_ACTIVE); 4009 add_preempt_count(PREEMPT_ACTIVE);
4010 4010
4011 /* 4011 /*
4012 * We keep the big kernel semaphore locked, but we 4012 * We keep the big kernel semaphore locked, but we
4013 * clear ->lock_depth so that schedule() doesn't 4013 * clear ->lock_depth so that schedule() doesn't
4014 * auto-release the semaphore: 4014 * auto-release the semaphore:
4015 */ 4015 */
4016 saved_lock_depth = task->lock_depth; 4016 saved_lock_depth = task->lock_depth;
4017 task->lock_depth = -1; 4017 task->lock_depth = -1;
4018 local_irq_enable(); 4018 local_irq_enable();
4019 schedule(); 4019 schedule();
4020 local_irq_disable(); 4020 local_irq_disable();
4021 task->lock_depth = saved_lock_depth; 4021 task->lock_depth = saved_lock_depth;
4022 sub_preempt_count(PREEMPT_ACTIVE); 4022 sub_preempt_count(PREEMPT_ACTIVE);
4023 4023
4024 /* 4024 /*
4025 * Check again in case we missed a preemption opportunity 4025 * Check again in case we missed a preemption opportunity
4026 * between schedule and now. 4026 * between schedule and now.
4027 */ 4027 */
4028 barrier(); 4028 barrier();
4029 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 4029 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4030 } 4030 }
4031 4031
4032 #endif /* CONFIG_PREEMPT */ 4032 #endif /* CONFIG_PREEMPT */
4033 4033
4034 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 4034 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
4035 void *key) 4035 void *key)
4036 { 4036 {
4037 return try_to_wake_up(curr->private, mode, sync); 4037 return try_to_wake_up(curr->private, mode, sync);
4038 } 4038 }
4039 EXPORT_SYMBOL(default_wake_function); 4039 EXPORT_SYMBOL(default_wake_function);
4040 4040
4041 /* 4041 /*
4042 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 4042 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4043 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 4043 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
4044 * number) then we wake all the non-exclusive tasks and one exclusive task. 4044 * number) then we wake all the non-exclusive tasks and one exclusive task.
4045 * 4045 *
4046 * There are circumstances in which we can try to wake a task which has already 4046 * There are circumstances in which we can try to wake a task which has already
4047 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 4047 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4048 * zero in this (rare) case, and we handle it by continuing to scan the queue. 4048 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4049 */ 4049 */
4050 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 4050 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4051 int nr_exclusive, int sync, void *key) 4051 int nr_exclusive, int sync, void *key)
4052 { 4052 {
4053 wait_queue_t *curr, *next; 4053 wait_queue_t *curr, *next;
4054 4054
4055 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 4055 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4056 unsigned flags = curr->flags; 4056 unsigned flags = curr->flags;
4057 4057
4058 if (curr->func(curr, mode, sync, key) && 4058 if (curr->func(curr, mode, sync, key) &&
4059 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 4059 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4060 break; 4060 break;
4061 } 4061 }
4062 } 4062 }
4063 4063
4064 /** 4064 /**
4065 * __wake_up - wake up threads blocked on a waitqueue. 4065 * __wake_up - wake up threads blocked on a waitqueue.
4066 * @q: the waitqueue 4066 * @q: the waitqueue
4067 * @mode: which threads 4067 * @mode: which threads
4068 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4068 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4069 * @key: is directly passed to the wakeup function 4069 * @key: is directly passed to the wakeup function
4070 */ 4070 */
4071 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 4071 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
4072 int nr_exclusive, void *key) 4072 int nr_exclusive, void *key)
4073 { 4073 {
4074 unsigned long flags; 4074 unsigned long flags;
4075 4075
4076 spin_lock_irqsave(&q->lock, flags); 4076 spin_lock_irqsave(&q->lock, flags);
4077 __wake_up_common(q, mode, nr_exclusive, 0, key); 4077 __wake_up_common(q, mode, nr_exclusive, 0, key);
4078 spin_unlock_irqrestore(&q->lock, flags); 4078 spin_unlock_irqrestore(&q->lock, flags);
4079 } 4079 }
4080 EXPORT_SYMBOL(__wake_up); 4080 EXPORT_SYMBOL(__wake_up);
4081 4081
4082 /* 4082 /*
4083 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 4083 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4084 */ 4084 */
4085 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 4085 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4086 { 4086 {
4087 __wake_up_common(q, mode, 1, 0, NULL); 4087 __wake_up_common(q, mode, 1, 0, NULL);
4088 } 4088 }
4089 4089
4090 /** 4090 /**
4091 * __wake_up_sync - wake up threads blocked on a waitqueue. 4091 * __wake_up_sync - wake up threads blocked on a waitqueue.
4092 * @q: the waitqueue 4092 * @q: the waitqueue
4093 * @mode: which threads 4093 * @mode: which threads
4094 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4094 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4095 * 4095 *
4096 * The sync wakeup differs in that the waker knows that it will schedule 4096 * The sync wakeup differs in that the waker knows that it will schedule
4097 * away soon, so while the target thread will be woken up, it will not 4097 * away soon, so while the target thread will be woken up, it will not
4098 * be migrated to another CPU - ie. the two threads are 'synchronized' 4098 * be migrated to another CPU - ie. the two threads are 'synchronized'
4099 * with each other. This can prevent needless bouncing between CPUs. 4099 * with each other. This can prevent needless bouncing between CPUs.
4100 * 4100 *
4101 * On UP it can prevent extra preemption. 4101 * On UP it can prevent extra preemption.
4102 */ 4102 */
4103 void fastcall 4103 void fastcall
4104 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 4104 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4105 { 4105 {
4106 unsigned long flags; 4106 unsigned long flags;
4107 int sync = 1; 4107 int sync = 1;
4108 4108
4109 if (unlikely(!q)) 4109 if (unlikely(!q))
4110 return; 4110 return;
4111 4111
4112 if (unlikely(!nr_exclusive)) 4112 if (unlikely(!nr_exclusive))
4113 sync = 0; 4113 sync = 0;
4114 4114
4115 spin_lock_irqsave(&q->lock, flags); 4115 spin_lock_irqsave(&q->lock, flags);
4116 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 4116 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
4117 spin_unlock_irqrestore(&q->lock, flags); 4117 spin_unlock_irqrestore(&q->lock, flags);
4118 } 4118 }
4119 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4119 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
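
Editor's note: __wake_up_common() is the engine behind the wake_up*() family; it walks the waitqueue and stops early once enough WQ_FLAG_EXCLUSIVE waiters have been woken. A typical producer/consumer use of that API, sketched as a kernel-code fragment with illustrative names, pairs wait_event_interruptible() on one side with wake_up() on the other:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_ready;

/* Consumer: sleeps on the waitqueue until demo_ready becomes true,
 * returning -ERESTARTSYS if a signal wakes it first. */
static int demo_consumer(void)
{
	return wait_event_interruptible(demo_wq, demo_ready != 0);
}

/* Producer: publish the condition, then wake the waiters.  wait_event
 * waiters are non-exclusive, so all of them are woken here. */
static void demo_producer(void)
{
	demo_ready = 1;
	wake_up(&demo_wq);
}
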
4120 4120
4121 void complete(struct completion *x) 4121 void complete(struct completion *x)
4122 { 4122 {
4123 unsigned long flags; 4123 unsigned long flags;
4124 4124
4125 spin_lock_irqsave(&x->wait.lock, flags); 4125 spin_lock_irqsave(&x->wait.lock, flags);
4126 x->done++; 4126 x->done++;
4127 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 4127 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
4128 1, 0, NULL); 4128 1, 0, NULL);
4129 spin_unlock_irqrestore(&x->wait.lock, flags); 4129 spin_unlock_irqrestore(&x->wait.lock, flags);
4130 } 4130 }
4131 EXPORT_SYMBOL(complete); 4131 EXPORT_SYMBOL(complete);
4132 4132
4133 void complete_all(struct completion *x) 4133 void complete_all(struct completion *x)
4134 { 4134 {
4135 unsigned long flags; 4135 unsigned long flags;
4136 4136
4137 spin_lock_irqsave(&x->wait.lock, flags); 4137 spin_lock_irqsave(&x->wait.lock, flags);
4138 x->done += UINT_MAX/2; 4138 x->done += UINT_MAX/2;
4139 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 4139 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
4140 0, 0, NULL); 4140 0, 0, NULL);
4141 spin_unlock_irqrestore(&x->wait.lock, flags); 4141 spin_unlock_irqrestore(&x->wait.lock, flags);
4142 } 4142 }
4143 EXPORT_SYMBOL(complete_all); 4143 EXPORT_SYMBOL(complete_all);
4144 4144
4145 static inline long __sched 4145 static inline long __sched
4146 do_wait_for_common(struct completion *x, long timeout, int state) 4146 do_wait_for_common(struct completion *x, long timeout, int state)
4147 { 4147 {
4148 if (!x->done) { 4148 if (!x->done) {
4149 DECLARE_WAITQUEUE(wait, current); 4149 DECLARE_WAITQUEUE(wait, current);
4150 4150
4151 wait.flags |= WQ_FLAG_EXCLUSIVE; 4151 wait.flags |= WQ_FLAG_EXCLUSIVE;
4152 __add_wait_queue_tail(&x->wait, &wait); 4152 __add_wait_queue_tail(&x->wait, &wait);
4153 do { 4153 do {
4154 if (state == TASK_INTERRUPTIBLE && 4154 if (state == TASK_INTERRUPTIBLE &&
4155 signal_pending(current)) { 4155 signal_pending(current)) {
4156 __remove_wait_queue(&x->wait, &wait); 4156 __remove_wait_queue(&x->wait, &wait);
4157 return -ERESTARTSYS; 4157 return -ERESTARTSYS;
4158 } 4158 }
4159 __set_current_state(state); 4159 __set_current_state(state);
4160 spin_unlock_irq(&x->wait.lock); 4160 spin_unlock_irq(&x->wait.lock);
4161 timeout = schedule_timeout(timeout); 4161 timeout = schedule_timeout(timeout);
4162 spin_lock_irq(&x->wait.lock); 4162 spin_lock_irq(&x->wait.lock);
4163 if (!timeout) { 4163 if (!timeout) {
4164 __remove_wait_queue(&x->wait, &wait); 4164 __remove_wait_queue(&x->wait, &wait);
4165 return timeout; 4165 return timeout;
4166 } 4166 }
4167 } while (!x->done); 4167 } while (!x->done);
4168 __remove_wait_queue(&x->wait, &wait); 4168 __remove_wait_queue(&x->wait, &wait);
4169 } 4169 }
4170 x->done--; 4170 x->done--;
4171 return timeout; 4171 return timeout;
4172 } 4172 }
4173 4173
4174 static long __sched 4174 static long __sched
4175 wait_for_common(struct completion *x, long timeout, int state) 4175 wait_for_common(struct completion *x, long timeout, int state)
4176 { 4176 {
4177 might_sleep(); 4177 might_sleep();
4178 4178
4179 spin_lock_irq(&x->wait.lock); 4179 spin_lock_irq(&x->wait.lock);
4180 timeout = do_wait_for_common(x, timeout, state); 4180 timeout = do_wait_for_common(x, timeout, state);
4181 spin_unlock_irq(&x->wait.lock); 4181 spin_unlock_irq(&x->wait.lock);
4182 return timeout; 4182 return timeout;
4183 } 4183 }
4184 4184
4185 void __sched wait_for_completion(struct completion *x) 4185 void __sched wait_for_completion(struct completion *x)
4186 { 4186 {
4187 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4187 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4188 } 4188 }
4189 EXPORT_SYMBOL(wait_for_completion); 4189 EXPORT_SYMBOL(wait_for_completion);
4190 4190
4191 unsigned long __sched 4191 unsigned long __sched
4192 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4192 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4193 { 4193 {
4194 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 4194 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4195 } 4195 }
4196 EXPORT_SYMBOL(wait_for_completion_timeout); 4196 EXPORT_SYMBOL(wait_for_completion_timeout);
4197 4197
4198 int __sched wait_for_completion_interruptible(struct completion *x) 4198 int __sched wait_for_completion_interruptible(struct completion *x)
4199 { 4199 {
4200 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4200 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4201 if (t == -ERESTARTSYS) 4201 if (t == -ERESTARTSYS)
4202 return t; 4202 return t;
4203 return 0; 4203 return 0;
4204 } 4204 }
4205 EXPORT_SYMBOL(wait_for_completion_interruptible); 4205 EXPORT_SYMBOL(wait_for_completion_interruptible);
4206 4206
4207 unsigned long __sched 4207 unsigned long __sched
4208 wait_for_completion_interruptible_timeout(struct completion *x, 4208 wait_for_completion_interruptible_timeout(struct completion *x,
4209 unsigned long timeout) 4209 unsigned long timeout)
4210 { 4210 {
4211 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 4211 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4212 } 4212 }
4213 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4213 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
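
Editor's note: completions are the simplest consumer of the code above: complete() bumps x->done and wakes one exclusive waiter, while the wait_for_completion*() variants differ only in task state and timeout handling. A minimal waiter/completer pair, with illustrative names:

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static DECLARE_COMPLETION(demo_done);

/* Waiter: block (uninterruptibly) for up to one second.
 * wait_for_completion_timeout() returns 0 on timeout, otherwise the
 * number of jiffies left when the completion arrived. */
static int demo_wait(void)
{
	return wait_for_completion_timeout(&demo_done, HZ) ? 0 : -ETIMEDOUT;
}

/* Completer: typically runs from another thread or an interrupt handler. */
static void demo_finish(void)
{
	complete(&demo_done);
}
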
4214 4214
4215 static long __sched 4215 static long __sched
4216 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4216 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4217 { 4217 {
4218 unsigned long flags; 4218 unsigned long flags;
4219 wait_queue_t wait; 4219 wait_queue_t wait;
4220 4220
4221 init_waitqueue_entry(&wait, current); 4221 init_waitqueue_entry(&wait, current);
4222 4222
4223 __set_current_state(state); 4223 __set_current_state(state);
4224 4224
4225 spin_lock_irqsave(&q->lock, flags); 4225 spin_lock_irqsave(&q->lock, flags);
4226 __add_wait_queue(q, &wait); 4226 __add_wait_queue(q, &wait);
4227 spin_unlock(&q->lock); 4227 spin_unlock(&q->lock);
4228 timeout = schedule_timeout(timeout); 4228 timeout = schedule_timeout(timeout);
4229 spin_lock_irq(&q->lock); 4229 spin_lock_irq(&q->lock);
4230 __remove_wait_queue(q, &wait); 4230 __remove_wait_queue(q, &wait);
4231 spin_unlock_irqrestore(&q->lock, flags); 4231 spin_unlock_irqrestore(&q->lock, flags);
4232 4232
4233 return timeout; 4233 return timeout;
4234 } 4234 }
4235 4235
4236 void __sched interruptible_sleep_on(wait_queue_head_t *q) 4236 void __sched interruptible_sleep_on(wait_queue_head_t *q)
4237 { 4237 {
4238 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4238 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4239 } 4239 }
4240 EXPORT_SYMBOL(interruptible_sleep_on); 4240 EXPORT_SYMBOL(interruptible_sleep_on);
4241 4241
4242 long __sched 4242 long __sched
4243 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 4243 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4244 { 4244 {
4245 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 4245 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4246 } 4246 }
4247 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 4247 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4248 4248
4249 void __sched sleep_on(wait_queue_head_t *q) 4249 void __sched sleep_on(wait_queue_head_t *q)
4250 { 4250 {
4251 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4251 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4252 } 4252 }
4253 EXPORT_SYMBOL(sleep_on); 4253 EXPORT_SYMBOL(sleep_on);
4254 4254
4255 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 4255 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4256 { 4256 {
4257 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 4257 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4258 } 4258 }
4259 EXPORT_SYMBOL(sleep_on_timeout); 4259 EXPORT_SYMBOL(sleep_on_timeout);
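
Editor's note: the sleep_on() family is kept mainly for old callers. Because the caller tests its condition before calling sleep_on(), a wakeup that arrives between the test and the enqueue is simply lost, which is why newer code uses wait_event() instead. The contrast, sketched with an assumed 'ready' flag:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(legacy_wq);
static int ready;

static void racy_wait(void)
{
	/* RACY: if the wakeup fires after this test but before sleep_on()
	 * has queued us, we sleep forever. */
	if (!ready)
		sleep_on(&legacy_wq);
}

static void safe_wait(void)
{
	/* wait_event() re-tests 'ready' after queueing itself, closing the
	 * window that racy_wait() leaves open. */
	wait_event(legacy_wq, ready);
}
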
4260 4260
4261 #ifdef CONFIG_RT_MUTEXES 4261 #ifdef CONFIG_RT_MUTEXES
4262 4262
4263 /* 4263 /*
4264 * rt_mutex_setprio - set the current priority of a task 4264 * rt_mutex_setprio - set the current priority of a task
4265 * @p: task 4265 * @p: task
4266 * @prio: prio value (kernel-internal form) 4266 * @prio: prio value (kernel-internal form)
4267 * 4267 *
4268 * This function changes the 'effective' priority of a task. It does 4268 * This function changes the 'effective' priority of a task. It does
4269 * not touch ->normal_prio like __setscheduler(). 4269 * not touch ->normal_prio like __setscheduler().
4270 * 4270 *
4271 * Used by the rt_mutex code to implement priority inheritance logic. 4271 * Used by the rt_mutex code to implement priority inheritance logic.
4272 */ 4272 */
4273 void rt_mutex_setprio(struct task_struct *p, int prio) 4273 void rt_mutex_setprio(struct task_struct *p, int prio)
4274 { 4274 {
4275 unsigned long flags; 4275 unsigned long flags;
4276 int oldprio, on_rq, running; 4276 int oldprio, on_rq, running;
4277 struct rq *rq; 4277 struct rq *rq;
4278 const struct sched_class *prev_class = p->sched_class; 4278 const struct sched_class *prev_class = p->sched_class;
4279 4279
4280 BUG_ON(prio < 0 || prio > MAX_PRIO); 4280 BUG_ON(prio < 0 || prio > MAX_PRIO);
4281 4281
4282 rq = task_rq_lock(p, &flags); 4282 rq = task_rq_lock(p, &flags);
4283 update_rq_clock(rq); 4283 update_rq_clock(rq);
4284 4284
4285 oldprio = p->prio; 4285 oldprio = p->prio;
4286 on_rq = p->se.on_rq; 4286 on_rq = p->se.on_rq;
4287 running = task_current(rq, p); 4287 running = task_current(rq, p);
4288 if (on_rq) { 4288 if (on_rq) {
4289 dequeue_task(rq, p, 0); 4289 dequeue_task(rq, p, 0);
4290 if (running) 4290 if (running)
4291 p->sched_class->put_prev_task(rq, p); 4291 p->sched_class->put_prev_task(rq, p);
4292 } 4292 }
4293 4293
4294 if (rt_prio(prio)) 4294 if (rt_prio(prio))
4295 p->sched_class = &rt_sched_class; 4295 p->sched_class = &rt_sched_class;
4296 else 4296 else
4297 p->sched_class = &fair_sched_class; 4297 p->sched_class = &fair_sched_class;
4298 4298
4299 p->prio = prio; 4299 p->prio = prio;
4300 4300
4301 if (on_rq) { 4301 if (on_rq) {
4302 if (running) 4302 if (running)
4303 p->sched_class->set_curr_task(rq); 4303 p->sched_class->set_curr_task(rq);
4304 4304
4305 enqueue_task(rq, p, 0); 4305 enqueue_task(rq, p, 0);
4306 4306
4307 check_class_changed(rq, p, prev_class, oldprio, running); 4307 check_class_changed(rq, p, prev_class, oldprio, running);
4308 } 4308 }
4309 task_rq_unlock(rq, &flags); 4309 task_rq_unlock(rq, &flags);
4310 } 4310 }
4311 4311
4312 #endif 4312 #endif
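
Editor's note: rt_mutex_setprio() is the hook through which priority inheritance actually boosts a lock holder: the rt_mutex code computes the highest priority among the waiters and applies it here without touching ->normal_prio, so the boost unwinds automatically once the lock is released. From userspace the same machinery is reached via PI futexes; a hedged glibc-level sketch, assuming a pthreads build with priority-inheritance support:

#include <pthread.h>
#include <stdio.h>

/* Create a mutex whose owner is boosted to the priority of its highest
 * waiter -- the userspace face of the rt_mutex boosting shown above. */
int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t m;

	pthread_mutexattr_init(&attr);
	if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT)) {
		fprintf(stderr, "PI mutexes not supported here\n");
		return 1;
	}
	pthread_mutex_init(&m, &attr);

	pthread_mutex_lock(&m);		/* while held, waiters may boost us */
	pthread_mutex_unlock(&m);	/* any boost is undone on unlock */

	pthread_mutex_destroy(&m);
	pthread_mutexattr_destroy(&attr);
	return 0;
}
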
4313 4313
4314 void set_user_nice(struct task_struct *p, long nice) 4314 void set_user_nice(struct task_struct *p, long nice)
4315 { 4315 {
4316 int old_prio, delta, on_rq; 4316 int old_prio, delta, on_rq;
4317 unsigned long flags; 4317 unsigned long flags;
4318 struct rq *rq; 4318 struct rq *rq;
4319 4319
4320 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 4320 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4321 return; 4321 return;
4322 /* 4322 /*
4323 * We have to be careful, if called from sys_setpriority(), 4323 * We have to be careful, if called from sys_setpriority(),
4324 * the task might be in the middle of scheduling on another CPU. 4324 * the task might be in the middle of scheduling on another CPU.
4325 */ 4325 */
4326 rq = task_rq_lock(p, &flags); 4326 rq = task_rq_lock(p, &flags);
4327 update_rq_clock(rq); 4327 update_rq_clock(rq);
4328 /* 4328 /*
4329 * The RT priorities are set via sched_setscheduler(), but we still 4329 * The RT priorities are set via sched_setscheduler(), but we still
4330 * allow the 'normal' nice value to be set - but as expected 4330 * allow the 'normal' nice value to be set - but as expected
4331 * it won't have any effect on scheduling until the task is 4331 * it won't have any effect on scheduling until the task is
4332 * SCHED_FIFO/SCHED_RR: 4332 * SCHED_FIFO/SCHED_RR:
4333 */ 4333 */
4334 if (task_has_rt_policy(p)) { 4334 if (task_has_rt_policy(p)) {
4335 p->static_prio = NICE_TO_PRIO(nice); 4335 p->static_prio = NICE_TO_PRIO(nice);
4336 goto out_unlock; 4336 goto out_unlock;
4337 } 4337 }
4338 on_rq = p->se.on_rq; 4338 on_rq = p->se.on_rq;
4339 if (on_rq) 4339 if (on_rq)
4340 dequeue_task(rq, p, 0); 4340 dequeue_task(rq, p, 0);
4341 4341
4342 p->static_prio = NICE_TO_PRIO(nice); 4342 p->static_prio = NICE_TO_PRIO(nice);
4343 set_load_weight(p); 4343 set_load_weight(p);
4344 old_prio = p->prio; 4344 old_prio = p->prio;
4345 p->prio = effective_prio(p); 4345 p->prio = effective_prio(p);
4346 delta = p->prio - old_prio; 4346 delta = p->prio - old_prio;
4347 4347
4348 if (on_rq) { 4348 if (on_rq) {
4349 enqueue_task(rq, p, 0); 4349 enqueue_task(rq, p, 0);
4350 /* 4350 /*
4351 * If the task increased its priority or is running and 4351 * If the task increased its priority or is running and
4352 * lowered its priority, then reschedule its CPU: 4352 * lowered its priority, then reschedule its CPU:
4353 */ 4353 */
4354 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4354 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4355 resched_task(rq->curr); 4355 resched_task(rq->curr);
4356 } 4356 }
4357 out_unlock: 4357 out_unlock:
4358 task_rq_unlock(rq, &flags); 4358 task_rq_unlock(rq, &flags);
4359 } 4359 }
4360 EXPORT_SYMBOL(set_user_nice); 4360 EXPORT_SYMBOL(set_user_nice);
4361 4361
4362 /* 4362 /*
4363 * can_nice - check if a task can reduce its nice value 4363 * can_nice - check if a task can reduce its nice value
4364 * @p: task 4364 * @p: task
4365 * @nice: nice value 4365 * @nice: nice value
4366 */ 4366 */
4367 int can_nice(const struct task_struct *p, const int nice) 4367 int can_nice(const struct task_struct *p, const int nice)
4368 { 4368 {
4369 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4369 /* convert nice value [19,-20] to rlimit style value [1,40] */
4370 int nice_rlim = 20 - nice; 4370 int nice_rlim = 20 - nice;
4371 4371
4372 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4372 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
4373 capable(CAP_SYS_NICE)); 4373 capable(CAP_SYS_NICE));
4374 } 4374 }
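
Editor's note: can_nice() converts the requested nice value onto the 1..40 scale used by RLIMIT_NICE (nice 19 -> 1, nice -20 -> 40), so an unprivileged task may lower its nice value only while 20 - nice stays within that rlimit. A tiny worked example of the conversion:

#include <stdio.h>

/* Same mapping as can_nice(): nice 19 -> 1, nice 0 -> 20, nice -20 -> 40. */
static int nice_to_rlim(int nice_val)
{
	return 20 - nice_val;
}

int main(void)
{
	/* With RLIMIT_NICE == 25 a task may go as low as nice -5, not -6. */
	int rlim_cur = 25;

	printf("nice -5 allowed: %d\n", nice_to_rlim(-5) <= rlim_cur);
	printf("nice -6 allowed: %d\n", nice_to_rlim(-6) <= rlim_cur);
	return 0;
}
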
4375 4375
4376 #ifdef __ARCH_WANT_SYS_NICE 4376 #ifdef __ARCH_WANT_SYS_NICE
4377 4377
4378 /* 4378 /*
4379 * sys_nice - change the priority of the current process. 4379 * sys_nice - change the priority of the current process.
4380 * @increment: priority increment 4380 * @increment: priority increment
4381 * 4381 *
4382 * sys_setpriority is a more generic, but much slower function that 4382 * sys_setpriority is a more generic, but much slower function that
4383 * does similar things. 4383 * does similar things.
4384 */ 4384 */
4385 asmlinkage long sys_nice(int increment) 4385 asmlinkage long sys_nice(int increment)
4386 { 4386 {
4387 long nice, retval; 4387 long nice, retval;
4388 4388
4389 /* 4389 /*
4390 * Setpriority might change our priority at the same moment. 4390 * Setpriority might change our priority at the same moment.
4391 * We don't have to worry. Conceptually one call occurs first 4391 * We don't have to worry. Conceptually one call occurs first
4392 * and we have a single winner. 4392 * and we have a single winner.
4393 */ 4393 */
4394 if (increment < -40) 4394 if (increment < -40)
4395 increment = -40; 4395 increment = -40;
4396 if (increment > 40) 4396 if (increment > 40)
4397 increment = 40; 4397 increment = 40;
4398 4398
4399 nice = PRIO_TO_NICE(current->static_prio) + increment; 4399 nice = PRIO_TO_NICE(current->static_prio) + increment;
4400 if (nice < -20) 4400 if (nice < -20)
4401 nice = -20; 4401 nice = -20;
4402 if (nice > 19) 4402 if (nice > 19)
4403 nice = 19; 4403 nice = 19;
4404 4404
4405 if (increment < 0 && !can_nice(current, nice)) 4405 if (increment < 0 && !can_nice(current, nice))
4406 return -EPERM; 4406 return -EPERM;
4407 4407
4408 retval = security_task_setnice(current, nice); 4408 retval = security_task_setnice(current, nice);
4409 if (retval) 4409 if (retval)
4410 return retval; 4410 return retval;
4411 4411
4412 set_user_nice(current, nice); 4412 set_user_nice(current, nice);
4413 return 0; 4413 return 0;
4414 } 4414 }
4415 4415
4416 #endif 4416 #endif
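
Editor's note: the comment above points at sys_setpriority() as the slower but more general interface; from userspace the difference is nice(2) versus setpriority(2)/getpriority(2), which can also target other processes, process groups, or users. A short sketch using the glibc wrappers, with error handling kept minimal:

#include <sys/time.h>
#include <sys/resource.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	int cur;

	/* Raise our own nice value via the generic syscall rather than
	 * nice(2); PRIO_PROCESS with who == 0 means "this process". */
	if (setpriority(PRIO_PROCESS, 0, 10) == -1) {
		perror("setpriority");
		return 1;
	}

	/* getpriority() can legitimately return -1, so clear errno first. */
	errno = 0;
	cur = getpriority(PRIO_PROCESS, 0);
	if (cur == -1 && errno)
		perror("getpriority");
	else
		printf("current nice value: %d\n", cur);
	return 0;
}
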
4417 4417
4418 /** 4418 /**
4419 * task_prio - return the priority value of a given task. 4419 * task_prio - return the priority value of a given task.
4420 * @p: the task in question. 4420 * @p: the task in question.
4421 * 4421 *
4422 * This is the priority value as seen by users in /proc. 4422 * This is the priority value as seen by users in /proc.
4423 * RT tasks are offset by -200. Normal tasks are centered 4423 * RT tasks are offset by -200. Normal tasks are centered
4424 * around 0, value goes from -16 to +15. 4424 * around 0, value goes from -16 to +15.
4425 */ 4425 */
4426 int task_prio(const struct task_struct *p) 4426 int task_prio(const struct task_struct *p)
4427 { 4427 {
4428 return p->prio - MAX_RT_PRIO; 4428 return p->prio - MAX_RT_PRIO;
4429 } 4429 }
4430 4430
4431 /** 4431 /**
4432 * task_nice - return the nice value of a given task. 4432 * task_nice - return the nice value of a given task.
4433 * @p: the task in question. 4433 * @p: the task in question.
4434 */ 4434 */
4435 int task_nice(const struct task_struct *p) 4435 int task_nice(const struct task_struct *p)
4436 { 4436 {
4437 return TASK_NICE(p); 4437 return TASK_NICE(p);
4438 } 4438 }
4439 EXPORT_SYMBOL_GPL(task_nice); 4439 EXPORT_SYMBOL_GPL(task_nice);
4440 4440
4441 /** 4441 /**
4442 * idle_cpu - is a given cpu idle currently? 4442 * idle_cpu - is a given cpu idle currently?
4443 * @cpu: the processor in question. 4443 * @cpu: the processor in question.
4444 */ 4444 */
4445 int idle_cpu(int cpu) 4445 int idle_cpu(int cpu)
4446 { 4446 {
4447 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4447 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4448 } 4448 }
4449 4449
4450 /** 4450 /**
4451 * idle_task - return the idle task for a given cpu. 4451 * idle_task - return the idle task for a given cpu.
4452 * @cpu: the processor in question. 4452 * @cpu: the processor in question.
4453 */ 4453 */
4454 struct task_struct *idle_task(int cpu) 4454 struct task_struct *idle_task(int cpu)
4455 { 4455 {
4456 return cpu_rq(cpu)->idle; 4456 return cpu_rq(cpu)->idle;
4457 } 4457 }
4458 4458
4459 /** 4459 /**
4460 * find_process_by_pid - find a process with a matching PID value. 4460 * find_process_by_pid - find a process with a matching PID value.
4461 * @pid: the pid in question. 4461 * @pid: the pid in question.
4462 */ 4462 */
4463 static struct task_struct *find_process_by_pid(pid_t pid) 4463 static struct task_struct *find_process_by_pid(pid_t pid)
4464 { 4464 {
4465 return pid ? find_task_by_vpid(pid) : current; 4465 return pid ? find_task_by_vpid(pid) : current;
4466 } 4466 }
4467 4467
4468 /* Actually do priority change: must hold rq lock. */ 4468 /* Actually do priority change: must hold rq lock. */
4469 static void 4469 static void
4470 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4470 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4471 { 4471 {
4472 BUG_ON(p->se.on_rq); 4472 BUG_ON(p->se.on_rq);
4473 4473
4474 p->policy = policy; 4474 p->policy = policy;
4475 switch (p->policy) { 4475 switch (p->policy) {
4476 case SCHED_NORMAL: 4476 case SCHED_NORMAL:
4477 case SCHED_BATCH: 4477 case SCHED_BATCH:
4478 case SCHED_IDLE: 4478 case SCHED_IDLE:
4479 p->sched_class = &fair_sched_class; 4479 p->sched_class = &fair_sched_class;
4480 break; 4480 break;
4481 case SCHED_FIFO: 4481 case SCHED_FIFO:
4482 case SCHED_RR: 4482 case SCHED_RR:
4483 p->sched_class = &rt_sched_class; 4483 p->sched_class = &rt_sched_class;
4484 break; 4484 break;
4485 } 4485 }
4486 4486
4487 p->rt_priority = prio; 4487 p->rt_priority = prio;
4488 p->normal_prio = normal_prio(p); 4488 p->normal_prio = normal_prio(p);
4489 /* we are holding p->pi_lock already */ 4489 /* we are holding p->pi_lock already */
4490 p->prio = rt_mutex_getprio(p); 4490 p->prio = rt_mutex_getprio(p);
4491 set_load_weight(p); 4491 set_load_weight(p);
4492 } 4492 }
4493 4493
4494 /** 4494 /**
4495 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4495 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4496 * @p: the task in question. 4496 * @p: the task in question.
4497 * @policy: new policy. 4497 * @policy: new policy.
4498 * @param: structure containing the new RT priority. 4498 * @param: structure containing the new RT priority.
4499 * 4499 *
4500 * NOTE that the task may be already dead. 4500 * NOTE that the task may be already dead.
4501 */ 4501 */
4502 int sched_setscheduler(struct task_struct *p, int policy, 4502 int sched_setscheduler(struct task_struct *p, int policy,
4503 struct sched_param *param) 4503 struct sched_param *param)
4504 { 4504 {
4505 int retval, oldprio, oldpolicy = -1, on_rq, running; 4505 int retval, oldprio, oldpolicy = -1, on_rq, running;
4506 unsigned long flags; 4506 unsigned long flags;
4507 const struct sched_class *prev_class = p->sched_class; 4507 const struct sched_class *prev_class = p->sched_class;
4508 struct rq *rq; 4508 struct rq *rq;
4509 4509
4510 /* may grab non-irq protected spin_locks */ 4510 /* may grab non-irq protected spin_locks */
4511 BUG_ON(in_interrupt()); 4511 BUG_ON(in_interrupt());
4512 recheck: 4512 recheck:
4513 /* double check policy once rq lock held */ 4513 /* double check policy once rq lock held */
4514 if (policy < 0) 4514 if (policy < 0)
4515 policy = oldpolicy = p->policy; 4515 policy = oldpolicy = p->policy;
4516 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4516 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4517 policy != SCHED_NORMAL && policy != SCHED_BATCH && 4517 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4518 policy != SCHED_IDLE) 4518 policy != SCHED_IDLE)
4519 return -EINVAL; 4519 return -EINVAL;
4520 /* 4520 /*
4521 * Valid priorities for SCHED_FIFO and SCHED_RR are 4521 * Valid priorities for SCHED_FIFO and SCHED_RR are
4522 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4522 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4523 * SCHED_BATCH and SCHED_IDLE is 0. 4523 * SCHED_BATCH and SCHED_IDLE is 0.
4524 */ 4524 */
4525 if (param->sched_priority < 0 || 4525 if (param->sched_priority < 0 ||
4526 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4526 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4527 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4527 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4528 return -EINVAL; 4528 return -EINVAL;
4529 if (rt_policy(policy) != (param->sched_priority != 0)) 4529 if (rt_policy(policy) != (param->sched_priority != 0))
4530 return -EINVAL; 4530 return -EINVAL;
4531 4531
4532 /* 4532 /*
4533 * Allow unprivileged RT tasks to decrease priority: 4533 * Allow unprivileged RT tasks to decrease priority:
4534 */ 4534 */
4535 if (!capable(CAP_SYS_NICE)) { 4535 if (!capable(CAP_SYS_NICE)) {
4536 if (rt_policy(policy)) { 4536 if (rt_policy(policy)) {
4537 unsigned long rlim_rtprio; 4537 unsigned long rlim_rtprio;
4538 4538
4539 if (!lock_task_sighand(p, &flags)) 4539 if (!lock_task_sighand(p, &flags))
4540 return -ESRCH; 4540 return -ESRCH;
4541 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4541 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4542 unlock_task_sighand(p, &flags); 4542 unlock_task_sighand(p, &flags);
4543 4543
4544 /* can't set/change the rt policy */ 4544 /* can't set/change the rt policy */
4545 if (policy != p->policy && !rlim_rtprio) 4545 if (policy != p->policy && !rlim_rtprio)
4546 return -EPERM; 4546 return -EPERM;
4547 4547
4548 /* can't increase priority */ 4548 /* can't increase priority */
4549 if (param->sched_priority > p->rt_priority && 4549 if (param->sched_priority > p->rt_priority &&
4550 param->sched_priority > rlim_rtprio) 4550 param->sched_priority > rlim_rtprio)
4551 return -EPERM; 4551 return -EPERM;
4552 } 4552 }
4553 /* 4553 /*
4554 * Like positive nice levels, don't allow tasks to 4554 * Like positive nice levels, don't allow tasks to
4555 * move out of SCHED_IDLE either: 4555 * move out of SCHED_IDLE either:
4556 */ 4556 */
4557 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4557 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4558 return -EPERM; 4558 return -EPERM;
4559 4559
4560 /* can't change other user's priorities */ 4560 /* can't change other user's priorities */
4561 if ((current->euid != p->euid) && 4561 if ((current->euid != p->euid) &&
4562 (current->euid != p->uid)) 4562 (current->euid != p->uid))
4563 return -EPERM; 4563 return -EPERM;
4564 } 4564 }
4565 4565
4566 retval = security_task_setscheduler(p, policy, param); 4566 retval = security_task_setscheduler(p, policy, param);
4567 if (retval) 4567 if (retval)
4568 return retval; 4568 return retval;
4569 /* 4569 /*
4570 * make sure no PI-waiters arrive (or leave) while we are 4570 * make sure no PI-waiters arrive (or leave) while we are
4571 * changing the priority of the task: 4571 * changing the priority of the task:
4572 */ 4572 */
4573 spin_lock_irqsave(&p->pi_lock, flags); 4573 spin_lock_irqsave(&p->pi_lock, flags);
4574 /* 4574 /*
4575 * To be able to change p->policy safely, the appropriate 4575 * To be able to change p->policy safely, the appropriate
4576 * runqueue lock must be held. 4576 * runqueue lock must be held.
4577 */ 4577 */
4578 rq = __task_rq_lock(p); 4578 rq = __task_rq_lock(p);
4579 /* recheck policy now with rq lock held */ 4579 /* recheck policy now with rq lock held */
4580 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4580 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4581 policy = oldpolicy = -1; 4581 policy = oldpolicy = -1;
4582 __task_rq_unlock(rq); 4582 __task_rq_unlock(rq);
4583 spin_unlock_irqrestore(&p->pi_lock, flags); 4583 spin_unlock_irqrestore(&p->pi_lock, flags);
4584 goto recheck; 4584 goto recheck;
4585 } 4585 }
4586 update_rq_clock(rq); 4586 update_rq_clock(rq);
4587 on_rq = p->se.on_rq; 4587 on_rq = p->se.on_rq;
4588 running = task_current(rq, p); 4588 running = task_current(rq, p);
4589 if (on_rq) { 4589 if (on_rq) {
4590 deactivate_task(rq, p, 0); 4590 deactivate_task(rq, p, 0);
4591 if (running) 4591 if (running)
4592 p->sched_class->put_prev_task(rq, p); 4592 p->sched_class->put_prev_task(rq, p);
4593 } 4593 }
4594 4594
4595 oldprio = p->prio; 4595 oldprio = p->prio;
4596 __setscheduler(rq, p, policy, param->sched_priority); 4596 __setscheduler(rq, p, policy, param->sched_priority);
4597 4597
4598 if (on_rq) { 4598 if (on_rq) {
4599 if (running) 4599 if (running)
4600 p->sched_class->set_curr_task(rq); 4600 p->sched_class->set_curr_task(rq);
4601 4601
4602 activate_task(rq, p, 0); 4602 activate_task(rq, p, 0);
4603 4603
4604 check_class_changed(rq, p, prev_class, oldprio, running); 4604 check_class_changed(rq, p, prev_class, oldprio, running);
4605 } 4605 }
4606 __task_rq_unlock(rq); 4606 __task_rq_unlock(rq);
4607 spin_unlock_irqrestore(&p->pi_lock, flags); 4607 spin_unlock_irqrestore(&p->pi_lock, flags);
4608 4608
4609 rt_mutex_adjust_pi(p); 4609 rt_mutex_adjust_pi(p);
4610 4610
4611 return 0; 4611 return 0;
4612 } 4612 }
4613 EXPORT_SYMBOL_GPL(sched_setscheduler); 4613 EXPORT_SYMBOL_GPL(sched_setscheduler);
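
Editor's note: sched_setscheduler() is also the in-kernel API (exported just above) that kernel code can use to make a thread real-time. A typical call looks like the sketch below; the priority of 50 is an arbitrary illustrative choice, not a recommendation.

#include <linux/sched.h>

/* Promote a kernel thread to SCHED_FIFO at a mid-range RT priority. */
static int make_task_fifo(struct task_struct *tsk)
{
	struct sched_param sp = { .sched_priority = 50 };

	return sched_setscheduler(tsk, SCHED_FIFO, &sp);
}
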
4614 4614
4615 static int 4615 static int
4616 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4616 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4617 { 4617 {
4618 struct sched_param lparam; 4618 struct sched_param lparam;
4619 struct task_struct *p; 4619 struct task_struct *p;
4620 int retval; 4620 int retval;
4621 4621
4622 if (!param || pid < 0) 4622 if (!param || pid < 0)
4623 return -EINVAL; 4623 return -EINVAL;
4624 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4624 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4625 return -EFAULT; 4625 return -EFAULT;
4626 4626
4627 rcu_read_lock(); 4627 rcu_read_lock();
4628 retval = -ESRCH; 4628 retval = -ESRCH;
4629 p = find_process_by_pid(pid); 4629 p = find_process_by_pid(pid);
4630 if (p != NULL) 4630 if (p != NULL)
4631 retval = sched_setscheduler(p, policy, &lparam); 4631 retval = sched_setscheduler(p, policy, &lparam);
4632 rcu_read_unlock(); 4632 rcu_read_unlock();
4633 4633
4634 return retval; 4634 return retval;
4635 } 4635 }
4636 4636
4637 /** 4637 /**
4638 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4638 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4639 * @pid: the pid in question. 4639 * @pid: the pid in question.
4640 * @policy: new policy. 4640 * @policy: new policy.
4641 * @param: structure containing the new RT priority. 4641 * @param: structure containing the new RT priority.
4642 */ 4642 */
4643 asmlinkage long 4643 asmlinkage long
4644 sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4644 sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4645 { 4645 {
4646 /* negative values for policy are not valid */ 4646 /* negative values for policy are not valid */
4647 if (policy < 0) 4647 if (policy < 0)
4648 return -EINVAL; 4648 return -EINVAL;
4649 4649
4650 return do_sched_setscheduler(pid, policy, param); 4650 return do_sched_setscheduler(pid, policy, param);
4651 } 4651 }
4652 4652
4653 /** 4653 /**
4654 * sys_sched_setparam - set/change the RT priority of a thread 4654 * sys_sched_setparam - set/change the RT priority of a thread
4655 * @pid: the pid in question. 4655 * @pid: the pid in question.
4656 * @param: structure containing the new RT priority. 4656 * @param: structure containing the new RT priority.
4657 */ 4657 */
4658 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 4658 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4659 { 4659 {
4660 return do_sched_setscheduler(pid, -1, param); 4660 return do_sched_setscheduler(pid, -1, param);
4661 } 4661 }
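
Editor's note: from userspace the two syscalls above are usually reached through the glibc wrappers. A minimal caller that switches itself to SCHED_FIFO at the maximum allowed priority follows; per the permission checks earlier in sched_setscheduler(), it needs CAP_SYS_NICE or a suitable RLIMIT_RTPRIO.

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp;

	sp.sched_priority = sched_get_priority_max(SCHED_FIFO);
	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");
		return 1;
	}
	printf("now SCHED_FIFO at priority %d\n", sp.sched_priority);
	return 0;
}
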
4662 4662
4663 /** 4663 /**
4664 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4664 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4665 * @pid: the pid in question. 4665 * @pid: the pid in question.
4666 */ 4666 */
4667 asmlinkage long sys_sched_getscheduler(pid_t pid) 4667 asmlinkage long sys_sched_getscheduler(pid_t pid)
4668 { 4668 {
4669 struct task_struct *p; 4669 struct task_struct *p;
4670 int retval; 4670 int retval;
4671 4671
4672 if (pid < 0) 4672 if (pid < 0)
4673 return -EINVAL; 4673 return -EINVAL;
4674 4674
4675 retval = -ESRCH; 4675 retval = -ESRCH;
4676 read_lock(&tasklist_lock); 4676 read_lock(&tasklist_lock);
4677 p = find_process_by_pid(pid); 4677 p = find_process_by_pid(pid);
4678 if (p) { 4678 if (p) {
4679 retval = security_task_getscheduler(p); 4679 retval = security_task_getscheduler(p);
4680 if (!retval) 4680 if (!retval)
4681 retval = p->policy; 4681 retval = p->policy;
4682 } 4682 }
4683 read_unlock(&tasklist_lock); 4683 read_unlock(&tasklist_lock);
4684 return retval; 4684 return retval;
4685 } 4685 }
4686 4686
4687 /** 4687 /**
4688 * sys_sched_getscheduler - get the RT priority of a thread 4688 * sys_sched_getscheduler - get the RT priority of a thread
4689 * @pid: the pid in question. 4689 * @pid: the pid in question.
4690 * @param: structure containing the RT priority. 4690 * @param: structure containing the RT priority.
4691 */ 4691 */
4692 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 4692 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4693 { 4693 {
4694 struct sched_param lp; 4694 struct sched_param lp;
4695 struct task_struct *p; 4695 struct task_struct *p;
4696 int retval; 4696 int retval;
4697 4697
4698 if (!param || pid < 0) 4698 if (!param || pid < 0)
4699 return -EINVAL; 4699 return -EINVAL;
4700 4700
4701 read_lock(&tasklist_lock); 4701 read_lock(&tasklist_lock);
4702 p = find_process_by_pid(pid); 4702 p = find_process_by_pid(pid);
4703 retval = -ESRCH; 4703 retval = -ESRCH;
4704 if (!p) 4704 if (!p)
4705 goto out_unlock; 4705 goto out_unlock;
4706 4706
4707 retval = security_task_getscheduler(p); 4707 retval = security_task_getscheduler(p);
4708 if (retval) 4708 if (retval)
4709 goto out_unlock; 4709 goto out_unlock;
4710 4710
4711 lp.sched_priority = p->rt_priority; 4711 lp.sched_priority = p->rt_priority;
4712 read_unlock(&tasklist_lock); 4712 read_unlock(&tasklist_lock);
4713 4713
4714 /* 4714 /*
4715 * This one might sleep, we cannot do it with a spinlock held ... 4715 * This one might sleep, we cannot do it with a spinlock held ...
4716 */ 4716 */
4717 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4717 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4718 4718
4719 return retval; 4719 return retval;
4720 4720
4721 out_unlock: 4721 out_unlock:
4722 read_unlock(&tasklist_lock); 4722 read_unlock(&tasklist_lock);
4723 return retval; 4723 return retval;
4724 } 4724 }
4725 4725
4726 long sched_setaffinity(pid_t pid, cpumask_t new_mask) 4726 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4727 { 4727 {
4728 cpumask_t cpus_allowed; 4728 cpumask_t cpus_allowed;
4729 struct task_struct *p; 4729 struct task_struct *p;
4730 int retval; 4730 int retval;
4731 4731
4732 get_online_cpus(); 4732 get_online_cpus();
4733 read_lock(&tasklist_lock); 4733 read_lock(&tasklist_lock);
4734 4734
4735 p = find_process_by_pid(pid); 4735 p = find_process_by_pid(pid);
4736 if (!p) { 4736 if (!p) {
4737 read_unlock(&tasklist_lock); 4737 read_unlock(&tasklist_lock);
4738 put_online_cpus(); 4738 put_online_cpus();
4739 return -ESRCH; 4739 return -ESRCH;
4740 } 4740 }
4741 4741
4742 /* 4742 /*
4743 * It is not safe to call set_cpus_allowed with the 4743 * It is not safe to call set_cpus_allowed with the
4744 * tasklist_lock held. We will bump the task_struct's 4744 * tasklist_lock held. We will bump the task_struct's
4745 * usage count and then drop tasklist_lock. 4745 * usage count and then drop tasklist_lock.
4746 */ 4746 */
4747 get_task_struct(p); 4747 get_task_struct(p);
4748 read_unlock(&tasklist_lock); 4748 read_unlock(&tasklist_lock);
4749 4749
4750 retval = -EPERM; 4750 retval = -EPERM;
4751 if ((current->euid != p->euid) && (current->euid != p->uid) && 4751 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4752 !capable(CAP_SYS_NICE)) 4752 !capable(CAP_SYS_NICE))
4753 goto out_unlock; 4753 goto out_unlock;
4754 4754
4755 retval = security_task_setscheduler(p, 0, NULL); 4755 retval = security_task_setscheduler(p, 0, NULL);
4756 if (retval) 4756 if (retval)
4757 goto out_unlock; 4757 goto out_unlock;
4758 4758
4759 cpus_allowed = cpuset_cpus_allowed(p); 4759 cpus_allowed = cpuset_cpus_allowed(p);
4760 cpus_and(new_mask, new_mask, cpus_allowed); 4760 cpus_and(new_mask, new_mask, cpus_allowed);
4761 again: 4761 again:
4762 retval = set_cpus_allowed(p, new_mask); 4762 retval = set_cpus_allowed(p, new_mask);
4763 4763
4764 if (!retval) { 4764 if (!retval) {
4765 cpus_allowed = cpuset_cpus_allowed(p); 4765 cpus_allowed = cpuset_cpus_allowed(p);
4766 if (!cpus_subset(new_mask, cpus_allowed)) { 4766 if (!cpus_subset(new_mask, cpus_allowed)) {
4767 /* 4767 /*
4768 * We must have raced with a concurrent cpuset 4768 * We must have raced with a concurrent cpuset
4769 * update. Just reset the cpus_allowed to the 4769 * update. Just reset the cpus_allowed to the
4770 * cpuset's cpus_allowed 4770 * cpuset's cpus_allowed
4771 */ 4771 */
4772 new_mask = cpus_allowed; 4772 new_mask = cpus_allowed;
4773 goto again; 4773 goto again;
4774 } 4774 }
4775 } 4775 }
4776 out_unlock: 4776 out_unlock:
4777 put_task_struct(p); 4777 put_task_struct(p);
4778 put_online_cpus(); 4778 put_online_cpus();
4779 return retval; 4779 return retval;
4780 } 4780 }
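
sched_setaffinity() above is the kernel half of the sched_setaffinity(2) syscall; a minimal userspace sketch through the glibc wrapper (CPU 0 and pid 0, meaning the calling thread, are illustrative choices, not part of this change):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
    cpu_set_t set;

    CPU_ZERO(&set);
    CPU_SET(0, &set);                       /* allow CPU 0 only */

    /* pid 0 means "the calling thread" */
    if (sched_setaffinity(0, sizeof(set), &set) == -1) {
        perror("sched_setaffinity");
        return 1;
    }
    return 0;
}

On the kernel side the retry loop above then clips the requested mask against the task's cpuset and repeats if a concurrent cpuset update races with set_cpus_allowed().
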
4781 4781
4782 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4782 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4783 cpumask_t *new_mask) 4783 cpumask_t *new_mask)
4784 { 4784 {
4785 if (len < sizeof(cpumask_t)) { 4785 if (len < sizeof(cpumask_t)) {
4786 memset(new_mask, 0, sizeof(cpumask_t)); 4786 memset(new_mask, 0, sizeof(cpumask_t));
4787 } else if (len > sizeof(cpumask_t)) { 4787 } else if (len > sizeof(cpumask_t)) {
4788 len = sizeof(cpumask_t); 4788 len = sizeof(cpumask_t);
4789 } 4789 }
4790 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4790 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4791 } 4791 }
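
The length handling in get_user_cpu_mask() is easy to misread; a userspace analogue of the same clamp (uint64_t standing in for cpumask_t) shows that a short buffer zero-fills the tail of the mask while a long one is silently truncated:

#include <stdint.h>
#include <string.h>

static void copy_clamped_mask(uint64_t *kmask, const void *ubuf, size_t len)
{
    if (len > sizeof(*kmask))
        len = sizeof(*kmask);               /* extra user bytes are ignored */
    else if (len < sizeof(*kmask))
        memset(kmask, 0, sizeof(*kmask));   /* zero-fill the uncopied tail */
    memcpy(kmask, ubuf, len);
}
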
4792 4792
4793 /** 4793 /**
4794 * sys_sched_setaffinity - set the cpu affinity of a process 4794 * sys_sched_setaffinity - set the cpu affinity of a process
4795 * @pid: pid of the process 4795 * @pid: pid of the process
4796 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4796 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4797 * @user_mask_ptr: user-space pointer to the new cpu mask 4797 * @user_mask_ptr: user-space pointer to the new cpu mask
4798 */ 4798 */
4799 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 4799 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4800 unsigned long __user *user_mask_ptr) 4800 unsigned long __user *user_mask_ptr)
4801 { 4801 {
4802 cpumask_t new_mask; 4802 cpumask_t new_mask;
4803 int retval; 4803 int retval;
4804 4804
4805 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 4805 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4806 if (retval) 4806 if (retval)
4807 return retval; 4807 return retval;
4808 4808
4809 return sched_setaffinity(pid, new_mask); 4809 return sched_setaffinity(pid, new_mask);
4810 } 4810 }
4811 4811
4812 /* 4812 /*
4813 * Represents all CPUs present in the system 4813 * Represents all CPUs present in the system
4814 * In systems capable of hotplug, this map could dynamically grow 4814 * In systems capable of hotplug, this map could dynamically grow
4815 * as new CPUs are detected via any platform-specific 4815 * as new CPUs are detected via any platform-specific
4816 * method, such as ACPI. 4816 * method, such as ACPI.
4817 */ 4817 */
4818 4818
4819 cpumask_t cpu_present_map __read_mostly; 4819 cpumask_t cpu_present_map __read_mostly;
4820 EXPORT_SYMBOL(cpu_present_map); 4820 EXPORT_SYMBOL(cpu_present_map);
4821 4821
4822 #ifndef CONFIG_SMP 4822 #ifndef CONFIG_SMP
4823 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; 4823 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4824 EXPORT_SYMBOL(cpu_online_map); 4824 EXPORT_SYMBOL(cpu_online_map);
4825 4825
4826 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; 4826 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4827 EXPORT_SYMBOL(cpu_possible_map); 4827 EXPORT_SYMBOL(cpu_possible_map);
4828 #endif 4828 #endif
4829 4829
4830 long sched_getaffinity(pid_t pid, cpumask_t *mask) 4830 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4831 { 4831 {
4832 struct task_struct *p; 4832 struct task_struct *p;
4833 int retval; 4833 int retval;
4834 4834
4835 get_online_cpus(); 4835 get_online_cpus();
4836 read_lock(&tasklist_lock); 4836 read_lock(&tasklist_lock);
4837 4837
4838 retval = -ESRCH; 4838 retval = -ESRCH;
4839 p = find_process_by_pid(pid); 4839 p = find_process_by_pid(pid);
4840 if (!p) 4840 if (!p)
4841 goto out_unlock; 4841 goto out_unlock;
4842 4842
4843 retval = security_task_getscheduler(p); 4843 retval = security_task_getscheduler(p);
4844 if (retval) 4844 if (retval)
4845 goto out_unlock; 4845 goto out_unlock;
4846 4846
4847 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 4847 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4848 4848
4849 out_unlock: 4849 out_unlock:
4850 read_unlock(&tasklist_lock); 4850 read_unlock(&tasklist_lock);
4851 put_online_cpus(); 4851 put_online_cpus();
4852 4852
4853 return retval; 4853 return retval;
4854 } 4854 }
4855 4855
4856 /** 4856 /**
4857 * sys_sched_getaffinity - get the cpu affinity of a process 4857 * sys_sched_getaffinity - get the cpu affinity of a process
4858 * @pid: pid of the process 4858 * @pid: pid of the process
4859 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4859 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4860 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4860 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4861 */ 4861 */
4862 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 4862 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4863 unsigned long __user *user_mask_ptr) 4863 unsigned long __user *user_mask_ptr)
4864 { 4864 {
4865 int ret; 4865 int ret;
4866 cpumask_t mask; 4866 cpumask_t mask;
4867 4867
4868 if (len < sizeof(cpumask_t)) 4868 if (len < sizeof(cpumask_t))
4869 return -EINVAL; 4869 return -EINVAL;
4870 4870
4871 ret = sched_getaffinity(pid, &mask); 4871 ret = sched_getaffinity(pid, &mask);
4872 if (ret < 0) 4872 if (ret < 0)
4873 return ret; 4873 return ret;
4874 4874
4875 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 4875 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4876 return -EFAULT; 4876 return -EFAULT;
4877 4877
4878 return sizeof(cpumask_t); 4878 return sizeof(cpumask_t);
4879 } 4879 }
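
From userspace the call is usually made through glibc, which maps the raw syscall's sizeof(cpumask_t) return value to 0 on success; a minimal sketch (CPU_COUNT() assumes a reasonably recent glibc):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
    cpu_set_t set;

    if (sched_getaffinity(0, sizeof(set), &set) == -1) {
        perror("sched_getaffinity");
        return 1;
    }
    printf("runnable on %d CPU(s)\n", CPU_COUNT(&set));
    return 0;
}
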
4880 4880
4881 /** 4881 /**
4882 * sys_sched_yield - yield the current processor to other threads. 4882 * sys_sched_yield - yield the current processor to other threads.
4883 * 4883 *
4884 * This function yields the current CPU to other tasks. If there are no 4884 * This function yields the current CPU to other tasks. If there are no
4885 * other threads running on this CPU then this function will return. 4885 * other threads running on this CPU then this function will return.
4886 */ 4886 */
4887 asmlinkage long sys_sched_yield(void) 4887 asmlinkage long sys_sched_yield(void)
4888 { 4888 {
4889 struct rq *rq = this_rq_lock(); 4889 struct rq *rq = this_rq_lock();
4890 4890
4891 schedstat_inc(rq, yld_count); 4891 schedstat_inc(rq, yld_count);
4892 current->sched_class->yield_task(rq); 4892 current->sched_class->yield_task(rq);
4893 4893
4894 /* 4894 /*
4895 * Since we are going to call schedule() anyway, there's 4895 * Since we are going to call schedule() anyway, there's
4896 * no need to preempt or enable interrupts: 4896 * no need to preempt or enable interrupts:
4897 */ 4897 */
4898 __release(rq->lock); 4898 __release(rq->lock);
4899 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4899 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4900 _raw_spin_unlock(&rq->lock); 4900 _raw_spin_unlock(&rq->lock);
4901 preempt_enable_no_resched(); 4901 preempt_enable_no_resched();
4902 4902
4903 schedule(); 4903 schedule();
4904 4904
4905 return 0; 4905 return 0;
4906 } 4906 }
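
sched_yield(3) is the userspace entry point to sys_sched_yield(); a small sketch of the usual polite-spin pattern (the atomic flag is a placeholder for whatever condition the real code polls):

#include <sched.h>
#include <stdatomic.h>

/* Spin until another thread sets *flag, yielding the CPU on each pass
 * so sys_sched_yield() can run other tasks on this runqueue. */
static void wait_for_flag(atomic_int *flag)
{
    while (!atomic_load(flag))
        sched_yield();
}
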
4907 4907
4908 static void __cond_resched(void) 4908 static void __cond_resched(void)
4909 { 4909 {
4910 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 4910 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4911 __might_sleep(__FILE__, __LINE__); 4911 __might_sleep(__FILE__, __LINE__);
4912 #endif 4912 #endif
4913 /* 4913 /*
4914 * The BKS might be reacquired before we have dropped 4914 * The BKS might be reacquired before we have dropped
4915 * PREEMPT_ACTIVE, which could trigger a second 4915 * PREEMPT_ACTIVE, which could trigger a second
4916 * cond_resched() call. 4916 * cond_resched() call.
4917 */ 4917 */
4918 do { 4918 do {
4919 add_preempt_count(PREEMPT_ACTIVE); 4919 add_preempt_count(PREEMPT_ACTIVE);
4920 schedule(); 4920 schedule();
4921 sub_preempt_count(PREEMPT_ACTIVE); 4921 sub_preempt_count(PREEMPT_ACTIVE);
4922 } while (need_resched()); 4922 } while (need_resched());
4923 } 4923 }
4924 4924
4925 #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) 4925 #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
4926 int __sched _cond_resched(void) 4926 int __sched _cond_resched(void)
4927 { 4927 {
4928 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4928 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4929 system_state == SYSTEM_RUNNING) { 4929 system_state == SYSTEM_RUNNING) {
4930 __cond_resched(); 4930 __cond_resched();
4931 return 1; 4931 return 1;
4932 } 4932 }
4933 return 0; 4933 return 0;
4934 } 4934 }
4935 EXPORT_SYMBOL(_cond_resched); 4935 EXPORT_SYMBOL(_cond_resched);
4936 #endif 4936 #endif
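
The intended caller pattern for _cond_resched() is through the cond_resched() wrapper, dropped into long loops that run in process context; a kernel-side sketch with placeholder names (scan_items and the per-item work are not from this file):

#include <linux/list.h>
#include <linux/sched.h>

/* hypothetical long walk in process context */
static void scan_items(struct list_head *head)
{
    struct list_head *pos;

    list_for_each(pos, head) {
        /* ... per-item work that may take a while ... */
        cond_resched();     /* may call __cond_resched() and schedule() */
    }
}
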
4937 4937
4938 /* 4938 /*
4939 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4939 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4940 * call schedule, and on return reacquire the lock. 4940 * call schedule, and on return reacquire the lock.
4941 * 4941 *
4942 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4942 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4943 * operations here to prevent schedule() from being called twice (once via 4943 * operations here to prevent schedule() from being called twice (once via
4944 * spin_unlock(), once by hand). 4944 * spin_unlock(), once by hand).
4945 */ 4945 */
4946 int cond_resched_lock(spinlock_t *lock) 4946 int cond_resched_lock(spinlock_t *lock)
4947 { 4947 {
4948 int resched = need_resched() && system_state == SYSTEM_RUNNING;
4948 int ret = 0; 4949 int ret = 0;
4949 4950
4950 if (need_lockbreak(lock)) { 4951 if (spin_needbreak(lock) || resched) {
4951 spin_unlock(lock); 4952 spin_unlock(lock);
4952 cpu_relax(); 4953 if (resched && need_resched())
4953 ret = 1; 4954 __cond_resched();
4954 spin_lock(lock); 4955 else
4955 } 4956 cpu_relax();
4956 if (need_resched() && system_state == SYSTEM_RUNNING) {
4957 spin_release(&lock->dep_map, 1, _THIS_IP_);
4958 _raw_spin_unlock(lock);
4959 preempt_enable_no_resched();
4960 __cond_resched();
4961 ret = 1; 4957 ret = 1;
4962 spin_lock(lock); 4958 spin_lock(lock);
4963 } 4959 }
4964 return ret; 4960 return ret;
4965 } 4961 }
4966 EXPORT_SYMBOL(cond_resched_lock); 4962 EXPORT_SYMBOL(cond_resched_lock);
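
The effect of the rewrite is easiest to see from a caller: a long scan under a spinlock calls cond_resched_lock() at every step, and the lock is now released either when spin_needbreak() reports a waiter or when a reschedule is due. A kernel-side sketch with placeholder work:

#include <linux/spinlock.h>
#include <linux/sched.h>

static void long_scan(spinlock_t *lock)
{
    int i;

    spin_lock(lock);
    for (i = 0; i < 10000; i++) {
        /* ... one unit of work protected by the lock ... */
        if (cond_resched_lock(lock)) {
            /* The lock was dropped and retaken: any state derived
             * from it must be revalidated before continuing. */
        }
    }
    spin_unlock(lock);
}
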
4967 4963
4968 int __sched cond_resched_softirq(void) 4964 int __sched cond_resched_softirq(void)
4969 { 4965 {
4970 BUG_ON(!in_softirq()); 4966 BUG_ON(!in_softirq());
4971 4967
4972 if (need_resched() && system_state == SYSTEM_RUNNING) { 4968 if (need_resched() && system_state == SYSTEM_RUNNING) {
4973 local_bh_enable(); 4969 local_bh_enable();
4974 __cond_resched(); 4970 __cond_resched();
4975 local_bh_disable(); 4971 local_bh_disable();
4976 return 1; 4972 return 1;
4977 } 4973 }
4978 return 0; 4974 return 0;
4979 } 4975 }
4980 EXPORT_SYMBOL(cond_resched_softirq); 4976 EXPORT_SYMBOL(cond_resched_softirq);
4981 4977
4982 /** 4978 /**
4983 * yield - yield the current processor to other threads. 4979 * yield - yield the current processor to other threads.
4984 * 4980 *
4985 * This is a shortcut for kernel-space yielding - it marks the 4981 * This is a shortcut for kernel-space yielding - it marks the
4986 * thread runnable and calls sys_sched_yield(). 4982 * thread runnable and calls sys_sched_yield().
4987 */ 4983 */
4988 void __sched yield(void) 4984 void __sched yield(void)
4989 { 4985 {
4990 set_current_state(TASK_RUNNING); 4986 set_current_state(TASK_RUNNING);
4991 sys_sched_yield(); 4987 sys_sched_yield();
4992 } 4988 }
4993 EXPORT_SYMBOL(yield); 4989 EXPORT_SYMBOL(yield);
4994 4990
4995 /* 4991 /*
4996 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4992 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4997 * that process accounting knows that this is a task in IO wait state. 4993 * that process accounting knows that this is a task in IO wait state.
4998 * 4994 *
4999 * But don't do that if it is a deliberate, throttling IO wait (this task 4995 * But don't do that if it is a deliberate, throttling IO wait (this task
5000 * has set its backing_dev_info: the queue against which it should throttle) 4996 * has set its backing_dev_info: the queue against which it should throttle)
5001 */ 4997 */
5002 void __sched io_schedule(void) 4998 void __sched io_schedule(void)
5003 { 4999 {
5004 struct rq *rq = &__raw_get_cpu_var(runqueues); 5000 struct rq *rq = &__raw_get_cpu_var(runqueues);
5005 5001
5006 delayacct_blkio_start(); 5002 delayacct_blkio_start();
5007 atomic_inc(&rq->nr_iowait); 5003 atomic_inc(&rq->nr_iowait);
5008 schedule(); 5004 schedule();
5009 atomic_dec(&rq->nr_iowait); 5005 atomic_dec(&rq->nr_iowait);
5010 delayacct_blkio_end(); 5006 delayacct_blkio_end();
5011 } 5007 }
5012 EXPORT_SYMBOL(io_schedule); 5008 EXPORT_SYMBOL(io_schedule);
5013 5009
5014 long __sched io_schedule_timeout(long timeout) 5010 long __sched io_schedule_timeout(long timeout)
5015 { 5011 {
5016 struct rq *rq = &__raw_get_cpu_var(runqueues); 5012 struct rq *rq = &__raw_get_cpu_var(runqueues);
5017 long ret; 5013 long ret;
5018 5014
5019 delayacct_blkio_start(); 5015 delayacct_blkio_start();
5020 atomic_inc(&rq->nr_iowait); 5016 atomic_inc(&rq->nr_iowait);
5021 ret = schedule_timeout(timeout); 5017 ret = schedule_timeout(timeout);
5022 atomic_dec(&rq->nr_iowait); 5018 atomic_dec(&rq->nr_iowait);
5023 delayacct_blkio_end(); 5019 delayacct_blkio_end();
5024 return ret; 5020 return ret;
5025 } 5021 }
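
A sketch of the usual io_schedule() caller pattern: sleep in TASK_UNINTERRUPTIBLE on an I/O event so the time shows up as iowait and in delayacct. The wait queue and the done flag are placeholders:

#include <linux/sched.h>
#include <linux/wait.h>

static void wait_for_io(wait_queue_head_t *wq, int *done)
{
    DEFINE_WAIT(wait);

    for (;;) {
        prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
        if (*done)
            break;
        io_schedule();      /* bumps rq->nr_iowait around schedule() */
    }
    finish_wait(wq, &wait);
}
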
5026 5022
5027 /** 5023 /**
5028 * sys_sched_get_priority_max - return maximum RT priority. 5024 * sys_sched_get_priority_max - return maximum RT priority.
5029 * @policy: scheduling class. 5025 * @policy: scheduling class.
5030 * 5026 *
5031 * this syscall returns the maximum rt_priority that can be used 5027 * this syscall returns the maximum rt_priority that can be used
5032 * by a given scheduling class. 5028 * by a given scheduling class.
5033 */ 5029 */
5034 asmlinkage long sys_sched_get_priority_max(int policy) 5030 asmlinkage long sys_sched_get_priority_max(int policy)
5035 { 5031 {
5036 int ret = -EINVAL; 5032 int ret = -EINVAL;
5037 5033
5038 switch (policy) { 5034 switch (policy) {
5039 case SCHED_FIFO: 5035 case SCHED_FIFO:
5040 case SCHED_RR: 5036 case SCHED_RR:
5041 ret = MAX_USER_RT_PRIO-1; 5037 ret = MAX_USER_RT_PRIO-1;
5042 break; 5038 break;
5043 case SCHED_NORMAL: 5039 case SCHED_NORMAL:
5044 case SCHED_BATCH: 5040 case SCHED_BATCH:
5045 case SCHED_IDLE: 5041 case SCHED_IDLE:
5046 ret = 0; 5042 ret = 0;
5047 break; 5043 break;
5048 } 5044 }
5049 return ret; 5045 return ret;
5050 } 5046 }
5051 5047
5052 /** 5048 /**
5053 * sys_sched_get_priority_min - return minimum RT priority. 5049 * sys_sched_get_priority_min - return minimum RT priority.
5054 * @policy: scheduling class. 5050 * @policy: scheduling class.
5055 * 5051 *
5056 * this syscall returns the minimum rt_priority that can be used 5052 * this syscall returns the minimum rt_priority that can be used
5057 * by a given scheduling class. 5053 * by a given scheduling class.
5058 */ 5054 */
5059 asmlinkage long sys_sched_get_priority_min(int policy) 5055 asmlinkage long sys_sched_get_priority_min(int policy)
5060 { 5056 {
5061 int ret = -EINVAL; 5057 int ret = -EINVAL;
5062 5058
5063 switch (policy) { 5059 switch (policy) {
5064 case SCHED_FIFO: 5060 case SCHED_FIFO:
5065 case SCHED_RR: 5061 case SCHED_RR:
5066 ret = 1; 5062 ret = 1;
5067 break; 5063 break;
5068 case SCHED_NORMAL: 5064 case SCHED_NORMAL:
5069 case SCHED_BATCH: 5065 case SCHED_BATCH:
5070 case SCHED_IDLE: 5066 case SCHED_IDLE:
5071 ret = 0; 5067 ret = 0;
5072 } 5068 }
5073 return ret; 5069 return ret;
5074 } 5070 }
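
Userspace reads the same limits through sched_get_priority_min(3)/sched_get_priority_max(3); for SCHED_FIFO the printed range is typically 1..99, i.e. up to MAX_USER_RT_PRIO-1:

#include <sched.h>
#include <stdio.h>

int main(void)
{
    printf("SCHED_FIFO priority range: %d..%d\n",
           sched_get_priority_min(SCHED_FIFO),
           sched_get_priority_max(SCHED_FIFO));
    return 0;
}
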
5075 5071
5076 /** 5072 /**
5077 * sys_sched_rr_get_interval - return the default timeslice of a process. 5073 * sys_sched_rr_get_interval - return the default timeslice of a process.
5078 * @pid: pid of the process. 5074 * @pid: pid of the process.
5079 * @interval: userspace pointer to the timeslice value. 5075 * @interval: userspace pointer to the timeslice value.
5080 * 5076 *
5081 * this syscall writes the default timeslice value of a given process 5077 * this syscall writes the default timeslice value of a given process
5082 * into the user-space timespec buffer. A value of '0' means infinity. 5078 * into the user-space timespec buffer. A value of '0' means infinity.
5083 */ 5079 */
5084 asmlinkage 5080 asmlinkage
5085 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 5081 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
5086 { 5082 {
5087 struct task_struct *p; 5083 struct task_struct *p;
5088 unsigned int time_slice; 5084 unsigned int time_slice;
5089 int retval; 5085 int retval;
5090 struct timespec t; 5086 struct timespec t;
5091 5087
5092 if (pid < 0) 5088 if (pid < 0)
5093 return -EINVAL; 5089 return -EINVAL;
5094 5090
5095 retval = -ESRCH; 5091 retval = -ESRCH;
5096 read_lock(&tasklist_lock); 5092 read_lock(&tasklist_lock);
5097 p = find_process_by_pid(pid); 5093 p = find_process_by_pid(pid);
5098 if (!p) 5094 if (!p)
5099 goto out_unlock; 5095 goto out_unlock;
5100 5096
5101 retval = security_task_getscheduler(p); 5097 retval = security_task_getscheduler(p);
5102 if (retval) 5098 if (retval)
5103 goto out_unlock; 5099 goto out_unlock;
5104 5100
5105 /* 5101 /*
5106 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER 5102 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
5107 * tasks that are on an otherwise idle runqueue: 5103 * tasks that are on an otherwise idle runqueue:
5108 */ 5104 */
5109 time_slice = 0; 5105 time_slice = 0;
5110 if (p->policy == SCHED_RR) { 5106 if (p->policy == SCHED_RR) {
5111 time_slice = DEF_TIMESLICE; 5107 time_slice = DEF_TIMESLICE;
5112 } else { 5108 } else {
5113 struct sched_entity *se = &p->se; 5109 struct sched_entity *se = &p->se;
5114 unsigned long flags; 5110 unsigned long flags;
5115 struct rq *rq; 5111 struct rq *rq;
5116 5112
5117 rq = task_rq_lock(p, &flags); 5113 rq = task_rq_lock(p, &flags);
5118 if (rq->cfs.load.weight) 5114 if (rq->cfs.load.weight)
5119 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 5115 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5120 task_rq_unlock(rq, &flags); 5116 task_rq_unlock(rq, &flags);
5121 } 5117 }
5122 read_unlock(&tasklist_lock); 5118 read_unlock(&tasklist_lock);
5123 jiffies_to_timespec(time_slice, &t); 5119 jiffies_to_timespec(time_slice, &t);
5124 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5120 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5125 return retval; 5121 return retval;
5126 5122
5127 out_unlock: 5123 out_unlock:
5128 read_unlock(&tasklist_lock); 5124 read_unlock(&tasklist_lock);
5129 return retval; 5125 return retval;
5130 } 5126 }
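
The matching userspace call is sched_rr_get_interval(2); as documented above, a SCHED_OTHER task on an otherwise idle runqueue reports a zero timeslice:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec ts;

    if (sched_rr_get_interval(0, &ts) == -1) {  /* 0 = calling process */
        perror("sched_rr_get_interval");
        return 1;
    }
    printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
    return 0;
}
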
5131 5127
5132 static const char stat_nam[] = "RSDTtZX"; 5128 static const char stat_nam[] = "RSDTtZX";
5133 5129
5134 void sched_show_task(struct task_struct *p) 5130 void sched_show_task(struct task_struct *p)
5135 { 5131 {
5136 unsigned long free = 0; 5132 unsigned long free = 0;
5137 unsigned state; 5133 unsigned state;
5138 5134
5139 state = p->state ? __ffs(p->state) + 1 : 0; 5135 state = p->state ? __ffs(p->state) + 1 : 0;
5140 printk(KERN_INFO "%-13.13s %c", p->comm, 5136 printk(KERN_INFO "%-13.13s %c", p->comm,
5141 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5137 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5142 #if BITS_PER_LONG == 32 5138 #if BITS_PER_LONG == 32
5143 if (state == TASK_RUNNING) 5139 if (state == TASK_RUNNING)
5144 printk(KERN_CONT " running "); 5140 printk(KERN_CONT " running ");
5145 else 5141 else
5146 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 5142 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5147 #else 5143 #else
5148 if (state == TASK_RUNNING) 5144 if (state == TASK_RUNNING)
5149 printk(KERN_CONT " running task "); 5145 printk(KERN_CONT " running task ");
5150 else 5146 else
5151 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5147 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5152 #endif 5148 #endif
5153 #ifdef CONFIG_DEBUG_STACK_USAGE 5149 #ifdef CONFIG_DEBUG_STACK_USAGE
5154 { 5150 {
5155 unsigned long *n = end_of_stack(p); 5151 unsigned long *n = end_of_stack(p);
5156 while (!*n) 5152 while (!*n)
5157 n++; 5153 n++;
5158 free = (unsigned long)n - (unsigned long)end_of_stack(p); 5154 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5159 } 5155 }
5160 #endif 5156 #endif
5161 printk(KERN_CONT "%5lu %5d %6d\n", free, 5157 printk(KERN_CONT "%5lu %5d %6d\n", free,
5162 task_pid_nr(p), task_pid_nr(p->real_parent)); 5158 task_pid_nr(p), task_pid_nr(p->real_parent));
5163 5159
5164 show_stack(p, NULL); 5160 show_stack(p, NULL);
5165 } 5161 }
5166 5162
5167 void show_state_filter(unsigned long state_filter) 5163 void show_state_filter(unsigned long state_filter)
5168 { 5164 {
5169 struct task_struct *g, *p; 5165 struct task_struct *g, *p;
5170 5166
5171 #if BITS_PER_LONG == 32 5167 #if BITS_PER_LONG == 32
5172 printk(KERN_INFO 5168 printk(KERN_INFO
5173 " task PC stack pid father\n"); 5169 " task PC stack pid father\n");
5174 #else 5170 #else
5175 printk(KERN_INFO 5171 printk(KERN_INFO
5176 " task PC stack pid father\n"); 5172 " task PC stack pid father\n");
5177 #endif 5173 #endif
5178 read_lock(&tasklist_lock); 5174 read_lock(&tasklist_lock);
5179 do_each_thread(g, p) { 5175 do_each_thread(g, p) {
5180 /* 5176 /*
5181 * reset the NMI-timeout, listing all files on a slow 5177 * reset the NMI-timeout, listing all files on a slow
5182 * console might take a lot of time: 5178 * console might take a lot of time:
5183 */ 5179 */
5184 touch_nmi_watchdog(); 5180 touch_nmi_watchdog();
5185 if (!state_filter || (p->state & state_filter)) 5181 if (!state_filter || (p->state & state_filter))
5186 sched_show_task(p); 5182 sched_show_task(p);
5187 } while_each_thread(g, p); 5183 } while_each_thread(g, p);
5188 5184
5189 touch_all_softlockup_watchdogs(); 5185 touch_all_softlockup_watchdogs();
5190 5186
5191 #ifdef CONFIG_SCHED_DEBUG 5187 #ifdef CONFIG_SCHED_DEBUG
5192 sysrq_sched_debug_show(); 5188 sysrq_sched_debug_show();
5193 #endif 5189 #endif
5194 read_unlock(&tasklist_lock); 5190 read_unlock(&tasklist_lock);
5195 /* 5191 /*
5196 * Only show locks if all tasks are dumped: 5192 * Only show locks if all tasks are dumped:
5197 */ 5193 */
5198 if (state_filter == -1) 5194 if (state_filter == -1)
5199 debug_show_all_locks(); 5195 debug_show_all_locks();
5200 } 5196 }
5201 5197
5202 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 5198 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5203 { 5199 {
5204 idle->sched_class = &idle_sched_class; 5200 idle->sched_class = &idle_sched_class;
5205 } 5201 }
5206 5202
5207 /** 5203 /**
5208 * init_idle - set up an idle thread for a given CPU 5204 * init_idle - set up an idle thread for a given CPU
5209 * @idle: task in question 5205 * @idle: task in question
5210 * @cpu: cpu the idle task belongs to 5206 * @cpu: cpu the idle task belongs to
5211 * 5207 *
5212 * NOTE: this function does not set the idle thread's NEED_RESCHED 5208 * NOTE: this function does not set the idle thread's NEED_RESCHED
5213 * flag, to make booting more robust. 5209 * flag, to make booting more robust.
5214 */ 5210 */
5215 void __cpuinit init_idle(struct task_struct *idle, int cpu) 5211 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5216 { 5212 {
5217 struct rq *rq = cpu_rq(cpu); 5213 struct rq *rq = cpu_rq(cpu);
5218 unsigned long flags; 5214 unsigned long flags;
5219 5215
5220 __sched_fork(idle); 5216 __sched_fork(idle);
5221 idle->se.exec_start = sched_clock(); 5217 idle->se.exec_start = sched_clock();
5222 5218
5223 idle->prio = idle->normal_prio = MAX_PRIO; 5219 idle->prio = idle->normal_prio = MAX_PRIO;
5224 idle->cpus_allowed = cpumask_of_cpu(cpu); 5220 idle->cpus_allowed = cpumask_of_cpu(cpu);
5225 __set_task_cpu(idle, cpu); 5221 __set_task_cpu(idle, cpu);
5226 5222
5227 spin_lock_irqsave(&rq->lock, flags); 5223 spin_lock_irqsave(&rq->lock, flags);
5228 rq->curr = rq->idle = idle; 5224 rq->curr = rq->idle = idle;
5229 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5225 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5230 idle->oncpu = 1; 5226 idle->oncpu = 1;
5231 #endif 5227 #endif
5232 spin_unlock_irqrestore(&rq->lock, flags); 5228 spin_unlock_irqrestore(&rq->lock, flags);
5233 5229
5234 /* Set the preempt count _outside_ the spinlocks! */ 5230 /* Set the preempt count _outside_ the spinlocks! */
5235 task_thread_info(idle)->preempt_count = 0; 5231 task_thread_info(idle)->preempt_count = 0;
5236 5232
5237 /* 5233 /*
5238 * The idle tasks have their own, simple scheduling class: 5234 * The idle tasks have their own, simple scheduling class:
5239 */ 5235 */
5240 idle->sched_class = &idle_sched_class; 5236 idle->sched_class = &idle_sched_class;
5241 } 5237 }
5242 5238
5243 /* 5239 /*
5244 * In a system that switches off the HZ timer nohz_cpu_mask 5240 * In a system that switches off the HZ timer nohz_cpu_mask
5245 * indicates which cpus entered this state. This is used 5241 * indicates which cpus entered this state. This is used
5246 * in the rcu update to wait only for active cpus. For system 5242 * in the rcu update to wait only for active cpus. For system
5247 * which do not switch off the HZ timer nohz_cpu_mask should 5243 * which do not switch off the HZ timer nohz_cpu_mask should
5248 * always be CPU_MASK_NONE. 5244 * always be CPU_MASK_NONE.
5249 */ 5245 */
5250 cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 5246 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5251 5247
5252 /* 5248 /*
5253 * Increase the granularity value when there are more CPUs, 5249 * Increase the granularity value when there are more CPUs,
5254 * because with more CPUs the 'effective latency' as visible 5250 * because with more CPUs the 'effective latency' as visible
5255 * to users decreases. But the relationship is not linear, 5251 * to users decreases. But the relationship is not linear,
5256 * so pick a second-best guess by going with the log2 of the 5252 * so pick a second-best guess by going with the log2 of the
5257 * number of CPUs. 5253 * number of CPUs.
5258 * 5254 *
5259 * This idea comes from the SD scheduler of Con Kolivas: 5255 * This idea comes from the SD scheduler of Con Kolivas:
5260 */ 5256 */
5261 static inline void sched_init_granularity(void) 5257 static inline void sched_init_granularity(void)
5262 { 5258 {
5263 unsigned int factor = 1 + ilog2(num_online_cpus()); 5259 unsigned int factor = 1 + ilog2(num_online_cpus());
5264 const unsigned long limit = 200000000; 5260 const unsigned long limit = 200000000;
5265 5261
5266 sysctl_sched_min_granularity *= factor; 5262 sysctl_sched_min_granularity *= factor;
5267 if (sysctl_sched_min_granularity > limit) 5263 if (sysctl_sched_min_granularity > limit)
5268 sysctl_sched_min_granularity = limit; 5264 sysctl_sched_min_granularity = limit;
5269 5265
5270 sysctl_sched_latency *= factor; 5266 sysctl_sched_latency *= factor;
5271 if (sysctl_sched_latency > limit) 5267 if (sysctl_sched_latency > limit)
5272 sysctl_sched_latency = limit; 5268 sysctl_sched_latency = limit;
5273 5269
5274 sysctl_sched_wakeup_granularity *= factor; 5270 sysctl_sched_wakeup_granularity *= factor;
5275 sysctl_sched_batch_wakeup_granularity *= factor; 5271 sysctl_sched_batch_wakeup_granularity *= factor;
5276 } 5272 }
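
A userspace sketch of the scaling rule above: the factor is 1 + log2(ncpus) and every scaled tunable is capped at 200 ms. The 20 ms base latency used below is an assumed default, not taken from this file:

#include <stdio.h>

static unsigned int ilog2_u(unsigned int v)
{
    unsigned int r = 0;

    while (v >>= 1)
        r++;
    return r;
}

int main(void)
{
    const unsigned long limit = 200000000UL;        /* 200 ms in ns */
    const unsigned long base_latency = 20000000UL;  /* assumed 20 ms */
    unsigned int cpus;

    for (cpus = 1; cpus <= 64; cpus *= 2) {
        unsigned int factor = 1 + ilog2_u(cpus);
        unsigned long scaled = base_latency * factor;

        if (scaled > limit)
            scaled = limit;
        printf("%2u CPUs -> factor %u, latency %lu ns\n",
               cpus, factor, scaled);
    }
    return 0;
}
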
5277 5273
5278 #ifdef CONFIG_SMP 5274 #ifdef CONFIG_SMP
5279 /* 5275 /*
5280 * This is how migration works: 5276 * This is how migration works:
5281 * 5277 *
5282 * 1) we queue a struct migration_req structure in the source CPU's 5278 * 1) we queue a struct migration_req structure in the source CPU's
5283 * runqueue and wake up that CPU's migration thread. 5279 * runqueue and wake up that CPU's migration thread.
5284 * 2) we down() the locked semaphore => thread blocks. 5280 * 2) we down() the locked semaphore => thread blocks.
5285 * 3) migration thread wakes up (implicitly it forces the migrated 5281 * 3) migration thread wakes up (implicitly it forces the migrated
5286 * thread off the CPU) 5282 * thread off the CPU)
5287 * 4) it gets the migration request and checks whether the migrated 5283 * 4) it gets the migration request and checks whether the migrated
5288 * task is still in the wrong runqueue. 5284 * task is still in the wrong runqueue.
5289 * 5) if it's in the wrong runqueue then the migration thread removes 5285 * 5) if it's in the wrong runqueue then the migration thread removes
5290 * it and puts it into the right queue. 5286 * it and puts it into the right queue.
5291 * 6) migration thread up()s the semaphore. 5287 * 6) migration thread up()s the semaphore.
5292 * 7) we wake up and the migration is done. 5288 * 7) we wake up and the migration is done.
5293 */ 5289 */
5294 5290
5295 /* 5291 /*
5296 * Change a given task's CPU affinity. Migrate the thread to a 5292 * Change a given task's CPU affinity. Migrate the thread to a
5297 * proper CPU and schedule it away if the CPU it's executing on 5293 * proper CPU and schedule it away if the CPU it's executing on
5298 * is removed from the allowed bitmask. 5294 * is removed from the allowed bitmask.
5299 * 5295 *
5300 * NOTE: the caller must have a valid reference to the task, the 5296 * NOTE: the caller must have a valid reference to the task, the
5301 * task must not exit() & deallocate itself prematurely. The 5297 * task must not exit() & deallocate itself prematurely. The
5302 * call is not atomic; no spinlocks may be held. 5298 * call is not atomic; no spinlocks may be held.
5303 */ 5299 */
5304 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 5300 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5305 { 5301 {
5306 struct migration_req req; 5302 struct migration_req req;
5307 unsigned long flags; 5303 unsigned long flags;
5308 struct rq *rq; 5304 struct rq *rq;
5309 int ret = 0; 5305 int ret = 0;
5310 5306
5311 rq = task_rq_lock(p, &flags); 5307 rq = task_rq_lock(p, &flags);
5312 if (!cpus_intersects(new_mask, cpu_online_map)) { 5308 if (!cpus_intersects(new_mask, cpu_online_map)) {
5313 ret = -EINVAL; 5309 ret = -EINVAL;
5314 goto out; 5310 goto out;
5315 } 5311 }
5316 5312
5317 if (p->sched_class->set_cpus_allowed) 5313 if (p->sched_class->set_cpus_allowed)
5318 p->sched_class->set_cpus_allowed(p, &new_mask); 5314 p->sched_class->set_cpus_allowed(p, &new_mask);
5319 else { 5315 else {
5320 p->cpus_allowed = new_mask; 5316 p->cpus_allowed = new_mask;
5321 p->rt.nr_cpus_allowed = cpus_weight(new_mask); 5317 p->rt.nr_cpus_allowed = cpus_weight(new_mask);
5322 } 5318 }
5323 5319
5324 /* Can the task run on the task's current CPU? If so, we're done */ 5320 /* Can the task run on the task's current CPU? If so, we're done */
5325 if (cpu_isset(task_cpu(p), new_mask)) 5321 if (cpu_isset(task_cpu(p), new_mask))
5326 goto out; 5322 goto out;
5327 5323
5328 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 5324 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
5329 /* Need help from migration thread: drop lock and wait. */ 5325 /* Need help from migration thread: drop lock and wait. */
5330 task_rq_unlock(rq, &flags); 5326 task_rq_unlock(rq, &flags);
5331 wake_up_process(rq->migration_thread); 5327 wake_up_process(rq->migration_thread);
5332 wait_for_completion(&req.done); 5328 wait_for_completion(&req.done);
5333 tlb_migrate_finish(p->mm); 5329 tlb_migrate_finish(p->mm);
5334 return 0; 5330 return 0;
5335 } 5331 }
5336 out: 5332 out:
5337 task_rq_unlock(rq, &flags); 5333 task_rq_unlock(rq, &flags);
5338 5334
5339 return ret; 5335 return ret;
5340 } 5336 }
5341 EXPORT_SYMBOL_GPL(set_cpus_allowed); 5337 EXPORT_SYMBOL_GPL(set_cpus_allowed);
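
Because set_cpus_allowed() is exported, a module can pin a helper thread with it; a sketch under the era's by-value cpumask API (the thread function and name format are placeholders, and kthread_bind() is the more common choice for strictly per-cpu threads):

#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/sched.h>

static struct task_struct *start_pinned_thread(int (*fn)(void *), int cpu)
{
    struct task_struct *t = kthread_create(fn, NULL, "pinned/%d", cpu);

    if (!IS_ERR(t)) {
        set_cpus_allowed(t, cpumask_of_cpu(cpu));   /* may migrate t */
        wake_up_process(t);
    }
    return t;
}
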
5342 5338
5343 /* 5339 /*
5344 * Move (not current) task off this cpu, onto dest cpu. We're doing 5340 * Move (not current) task off this cpu, onto dest cpu. We're doing
5345 * this because either it can't run here any more (set_cpus_allowed() 5341 * this because either it can't run here any more (set_cpus_allowed()
5346 * away from this CPU, or CPU going down), or because we're 5342 * away from this CPU, or CPU going down), or because we're
5347 * attempting to rebalance this task on exec (sched_exec). 5343 * attempting to rebalance this task on exec (sched_exec).
5348 * 5344 *
5349 * So we race with normal scheduler movements, but that's OK, as long 5345 * So we race with normal scheduler movements, but that's OK, as long
5350 * as the task is no longer on this CPU. 5346 * as the task is no longer on this CPU.
5351 * 5347 *
5352 * Returns non-zero if task was successfully migrated. 5348 * Returns non-zero if task was successfully migrated.
5353 */ 5349 */
5354 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 5350 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5355 { 5351 {
5356 struct rq *rq_dest, *rq_src; 5352 struct rq *rq_dest, *rq_src;
5357 int ret = 0, on_rq; 5353 int ret = 0, on_rq;
5358 5354
5359 if (unlikely(cpu_is_offline(dest_cpu))) 5355 if (unlikely(cpu_is_offline(dest_cpu)))
5360 return ret; 5356 return ret;
5361 5357
5362 rq_src = cpu_rq(src_cpu); 5358 rq_src = cpu_rq(src_cpu);
5363 rq_dest = cpu_rq(dest_cpu); 5359 rq_dest = cpu_rq(dest_cpu);
5364 5360
5365 double_rq_lock(rq_src, rq_dest); 5361 double_rq_lock(rq_src, rq_dest);
5366 /* Already moved. */ 5362 /* Already moved. */
5367 if (task_cpu(p) != src_cpu) 5363 if (task_cpu(p) != src_cpu)
5368 goto out; 5364 goto out;
5369 /* Affinity changed (again). */ 5365 /* Affinity changed (again). */
5370 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 5366 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5371 goto out; 5367 goto out;
5372 5368
5373 on_rq = p->se.on_rq; 5369 on_rq = p->se.on_rq;
5374 if (on_rq) 5370 if (on_rq)
5375 deactivate_task(rq_src, p, 0); 5371 deactivate_task(rq_src, p, 0);
5376 5372
5377 set_task_cpu(p, dest_cpu); 5373 set_task_cpu(p, dest_cpu);
5378 if (on_rq) { 5374 if (on_rq) {
5379 activate_task(rq_dest, p, 0); 5375 activate_task(rq_dest, p, 0);
5380 check_preempt_curr(rq_dest, p); 5376 check_preempt_curr(rq_dest, p);
5381 } 5377 }
5382 ret = 1; 5378 ret = 1;
5383 out: 5379 out:
5384 double_rq_unlock(rq_src, rq_dest); 5380 double_rq_unlock(rq_src, rq_dest);
5385 return ret; 5381 return ret;
5386 } 5382 }
5387 5383
5388 /* 5384 /*
5389 * migration_thread - this is a high-priority system thread that performs 5385 * migration_thread - this is a high-priority system thread that performs
5390 * thread migration by bumping the thread off its CPU then 'pushing' it onto 5386 * thread migration by bumping the thread off its CPU then 'pushing' it onto
5391 * another runqueue. 5387 * another runqueue.
5392 */ 5388 */
5393 static int migration_thread(void *data) 5389 static int migration_thread(void *data)
5394 { 5390 {
5395 int cpu = (long)data; 5391 int cpu = (long)data;
5396 struct rq *rq; 5392 struct rq *rq;
5397 5393
5398 rq = cpu_rq(cpu); 5394 rq = cpu_rq(cpu);
5399 BUG_ON(rq->migration_thread != current); 5395 BUG_ON(rq->migration_thread != current);
5400 5396
5401 set_current_state(TASK_INTERRUPTIBLE); 5397 set_current_state(TASK_INTERRUPTIBLE);
5402 while (!kthread_should_stop()) { 5398 while (!kthread_should_stop()) {
5403 struct migration_req *req; 5399 struct migration_req *req;
5404 struct list_head *head; 5400 struct list_head *head;
5405 5401
5406 spin_lock_irq(&rq->lock); 5402 spin_lock_irq(&rq->lock);
5407 5403
5408 if (cpu_is_offline(cpu)) { 5404 if (cpu_is_offline(cpu)) {
5409 spin_unlock_irq(&rq->lock); 5405 spin_unlock_irq(&rq->lock);
5410 goto wait_to_die; 5406 goto wait_to_die;
5411 } 5407 }
5412 5408
5413 if (rq->active_balance) { 5409 if (rq->active_balance) {
5414 active_load_balance(rq, cpu); 5410 active_load_balance(rq, cpu);
5415 rq->active_balance = 0; 5411 rq->active_balance = 0;
5416 } 5412 }
5417 5413
5418 head = &rq->migration_queue; 5414 head = &rq->migration_queue;
5419 5415
5420 if (list_empty(head)) { 5416 if (list_empty(head)) {
5421 spin_unlock_irq(&rq->lock); 5417 spin_unlock_irq(&rq->lock);
5422 schedule(); 5418 schedule();
5423 set_current_state(TASK_INTERRUPTIBLE); 5419 set_current_state(TASK_INTERRUPTIBLE);
5424 continue; 5420 continue;
5425 } 5421 }
5426 req = list_entry(head->next, struct migration_req, list); 5422 req = list_entry(head->next, struct migration_req, list);
5427 list_del_init(head->next); 5423 list_del_init(head->next);
5428 5424
5429 spin_unlock(&rq->lock); 5425 spin_unlock(&rq->lock);
5430 __migrate_task(req->task, cpu, req->dest_cpu); 5426 __migrate_task(req->task, cpu, req->dest_cpu);
5431 local_irq_enable(); 5427 local_irq_enable();
5432 5428
5433 complete(&req->done); 5429 complete(&req->done);
5434 } 5430 }
5435 __set_current_state(TASK_RUNNING); 5431 __set_current_state(TASK_RUNNING);
5436 return 0; 5432 return 0;
5437 5433
5438 wait_to_die: 5434 wait_to_die:
5439 /* Wait for kthread_stop */ 5435 /* Wait for kthread_stop */
5440 set_current_state(TASK_INTERRUPTIBLE); 5436 set_current_state(TASK_INTERRUPTIBLE);
5441 while (!kthread_should_stop()) { 5437 while (!kthread_should_stop()) {
5442 schedule(); 5438 schedule();
5443 set_current_state(TASK_INTERRUPTIBLE); 5439 set_current_state(TASK_INTERRUPTIBLE);
5444 } 5440 }
5445 __set_current_state(TASK_RUNNING); 5441 __set_current_state(TASK_RUNNING);
5446 return 0; 5442 return 0;
5447 } 5443 }
5448 5444
5449 #ifdef CONFIG_HOTPLUG_CPU 5445 #ifdef CONFIG_HOTPLUG_CPU
5450 5446
5451 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) 5447 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5452 { 5448 {
5453 int ret; 5449 int ret;
5454 5450
5455 local_irq_disable(); 5451 local_irq_disable();
5456 ret = __migrate_task(p, src_cpu, dest_cpu); 5452 ret = __migrate_task(p, src_cpu, dest_cpu);
5457 local_irq_enable(); 5453 local_irq_enable();
5458 return ret; 5454 return ret;
5459 } 5455 }
5460 5456
5461 /* 5457 /*
5462 * Figure out where task on dead CPU should go, use force if necessary. 5458 * Figure out where task on dead CPU should go, use force if necessary.
5463 * NOTE: interrupts should be disabled by the caller 5459 * NOTE: interrupts should be disabled by the caller
5464 */ 5460 */
5465 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5461 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5466 { 5462 {
5467 unsigned long flags; 5463 unsigned long flags;
5468 cpumask_t mask; 5464 cpumask_t mask;
5469 struct rq *rq; 5465 struct rq *rq;
5470 int dest_cpu; 5466 int dest_cpu;
5471 5467
5472 do { 5468 do {
5473 /* On same node? */ 5469 /* On same node? */
5474 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 5470 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5475 cpus_and(mask, mask, p->cpus_allowed); 5471 cpus_and(mask, mask, p->cpus_allowed);
5476 dest_cpu = any_online_cpu(mask); 5472 dest_cpu = any_online_cpu(mask);
5477 5473
5478 /* On any allowed CPU? */ 5474 /* On any allowed CPU? */
5479 if (dest_cpu == NR_CPUS) 5475 if (dest_cpu == NR_CPUS)
5480 dest_cpu = any_online_cpu(p->cpus_allowed); 5476 dest_cpu = any_online_cpu(p->cpus_allowed);
5481 5477
5482 /* No more Mr. Nice Guy. */ 5478 /* No more Mr. Nice Guy. */
5483 if (dest_cpu == NR_CPUS) { 5479 if (dest_cpu == NR_CPUS) {
5484 cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); 5480 cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
5485 /* 5481 /*
5486 * Try to stay on the same cpuset, where the 5482 * Try to stay on the same cpuset, where the
5487 * current cpuset may be a subset of all cpus. 5483 * current cpuset may be a subset of all cpus.
5488 * The cpuset_cpus_allowed_locked() variant of 5484 * The cpuset_cpus_allowed_locked() variant of
5489 * cpuset_cpus_allowed() will not block. It must be 5485 * cpuset_cpus_allowed() will not block. It must be
5490 * called within calls to cpuset_lock/cpuset_unlock. 5486 * called within calls to cpuset_lock/cpuset_unlock.
5491 */ 5487 */
5492 rq = task_rq_lock(p, &flags); 5488 rq = task_rq_lock(p, &flags);
5493 p->cpus_allowed = cpus_allowed; 5489 p->cpus_allowed = cpus_allowed;
5494 dest_cpu = any_online_cpu(p->cpus_allowed); 5490 dest_cpu = any_online_cpu(p->cpus_allowed);
5495 task_rq_unlock(rq, &flags); 5491 task_rq_unlock(rq, &flags);
5496 5492
5497 /* 5493 /*
5498 * Don't tell them about moving exiting tasks or 5494 * Don't tell them about moving exiting tasks or
5499 * kernel threads (both mm NULL), since they never 5495 * kernel threads (both mm NULL), since they never
5500 * leave kernel. 5496 * leave kernel.
5501 */ 5497 */
5502 if (p->mm && printk_ratelimit()) { 5498 if (p->mm && printk_ratelimit()) {
5503 printk(KERN_INFO "process %d (%s) no " 5499 printk(KERN_INFO "process %d (%s) no "
5504 "longer affine to cpu%d\n", 5500 "longer affine to cpu%d\n",
5505 task_pid_nr(p), p->comm, dead_cpu); 5501 task_pid_nr(p), p->comm, dead_cpu);
5506 } 5502 }
5507 } 5503 }
5508 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 5504 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
5509 } 5505 }
5510 5506
5511 /* 5507 /*
5512 * While a dead CPU has no uninterruptible tasks queued at this point, 5508 * While a dead CPU has no uninterruptible tasks queued at this point,
5513 * it might still have a nonzero ->nr_uninterruptible counter, because 5509 * it might still have a nonzero ->nr_uninterruptible counter, because
5514 * for performance reasons the counter is not strictly tracking tasks to 5510 * for performance reasons the counter is not strictly tracking tasks to
5515 * their home CPUs. So we just add the counter to another CPU's counter, 5511 * their home CPUs. So we just add the counter to another CPU's counter,
5516 * to keep the global sum constant after CPU-down: 5512 * to keep the global sum constant after CPU-down:
5517 */ 5513 */
5518 static void migrate_nr_uninterruptible(struct rq *rq_src) 5514 static void migrate_nr_uninterruptible(struct rq *rq_src)
5519 { 5515 {
5520 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 5516 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5521 unsigned long flags; 5517 unsigned long flags;
5522 5518
5523 local_irq_save(flags); 5519 local_irq_save(flags);
5524 double_rq_lock(rq_src, rq_dest); 5520 double_rq_lock(rq_src, rq_dest);
5525 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5521 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5526 rq_src->nr_uninterruptible = 0; 5522 rq_src->nr_uninterruptible = 0;
5527 double_rq_unlock(rq_src, rq_dest); 5523 double_rq_unlock(rq_src, rq_dest);
5528 local_irq_restore(flags); 5524 local_irq_restore(flags);
5529 } 5525 }
5530 5526
5531 /* Run through task list and migrate tasks from the dead cpu. */ 5527 /* Run through task list and migrate tasks from the dead cpu. */
5532 static void migrate_live_tasks(int src_cpu) 5528 static void migrate_live_tasks(int src_cpu)
5533 { 5529 {
5534 struct task_struct *p, *t; 5530 struct task_struct *p, *t;
5535 5531
5536 read_lock(&tasklist_lock); 5532 read_lock(&tasklist_lock);
5537 5533
5538 do_each_thread(t, p) { 5534 do_each_thread(t, p) {
5539 if (p == current) 5535 if (p == current)
5540 continue; 5536 continue;
5541 5537
5542 if (task_cpu(p) == src_cpu) 5538 if (task_cpu(p) == src_cpu)
5543 move_task_off_dead_cpu(src_cpu, p); 5539 move_task_off_dead_cpu(src_cpu, p);
5544 } while_each_thread(t, p); 5540 } while_each_thread(t, p);
5545 5541
5546 read_unlock(&tasklist_lock); 5542 read_unlock(&tasklist_lock);
5547 } 5543 }
5548 5544
5549 /* 5545 /*
5550 * Schedules idle task to be the next runnable task on current CPU. 5546 * Schedules idle task to be the next runnable task on current CPU.
5551 * It does so by boosting its priority to highest possible. 5547 * It does so by boosting its priority to highest possible.
5552 * Used by CPU offline code. 5548 * Used by CPU offline code.
5553 */ 5549 */
5554 void sched_idle_next(void) 5550 void sched_idle_next(void)
5555 { 5551 {
5556 int this_cpu = smp_processor_id(); 5552 int this_cpu = smp_processor_id();
5557 struct rq *rq = cpu_rq(this_cpu); 5553 struct rq *rq = cpu_rq(this_cpu);
5558 struct task_struct *p = rq->idle; 5554 struct task_struct *p = rq->idle;
5559 unsigned long flags; 5555 unsigned long flags;
5560 5556
5561 /* cpu has to be offline */ 5557 /* cpu has to be offline */
5562 BUG_ON(cpu_online(this_cpu)); 5558 BUG_ON(cpu_online(this_cpu));
5563 5559
5564 /* 5560 /*
5565 * Strictly not necessary since rest of the CPUs are stopped by now 5561 * Strictly not necessary since rest of the CPUs are stopped by now
5566 * and interrupts disabled on the current cpu. 5562 * and interrupts disabled on the current cpu.
5567 */ 5563 */
5568 spin_lock_irqsave(&rq->lock, flags); 5564 spin_lock_irqsave(&rq->lock, flags);
5569 5565
5570 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5566 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5571 5567
5572 update_rq_clock(rq); 5568 update_rq_clock(rq);
5573 activate_task(rq, p, 0); 5569 activate_task(rq, p, 0);
5574 5570
5575 spin_unlock_irqrestore(&rq->lock, flags); 5571 spin_unlock_irqrestore(&rq->lock, flags);
5576 } 5572 }
5577 5573
5578 /* 5574 /*
5579 * Ensures that the idle task is using init_mm right before its cpu goes 5575 * Ensures that the idle task is using init_mm right before its cpu goes
5580 * offline. 5576 * offline.
5581 */ 5577 */
5582 void idle_task_exit(void) 5578 void idle_task_exit(void)
5583 { 5579 {
5584 struct mm_struct *mm = current->active_mm; 5580 struct mm_struct *mm = current->active_mm;
5585 5581
5586 BUG_ON(cpu_online(smp_processor_id())); 5582 BUG_ON(cpu_online(smp_processor_id()));
5587 5583
5588 if (mm != &init_mm) 5584 if (mm != &init_mm)
5589 switch_mm(mm, &init_mm, current); 5585 switch_mm(mm, &init_mm, current);
5590 mmdrop(mm); 5586 mmdrop(mm);
5591 } 5587 }
5592 5588
5593 /* called under rq->lock with disabled interrupts */ 5589 /* called under rq->lock with disabled interrupts */
5594 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5590 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5595 { 5591 {
5596 struct rq *rq = cpu_rq(dead_cpu); 5592 struct rq *rq = cpu_rq(dead_cpu);
5597 5593
5598 /* Must be exiting, otherwise would be on tasklist. */ 5594 /* Must be exiting, otherwise would be on tasklist. */
5599 BUG_ON(!p->exit_state); 5595 BUG_ON(!p->exit_state);
5600 5596
5601 /* Cannot have done final schedule yet: would have vanished. */ 5597 /* Cannot have done final schedule yet: would have vanished. */
5602 BUG_ON(p->state == TASK_DEAD); 5598 BUG_ON(p->state == TASK_DEAD);
5603 5599
5604 get_task_struct(p); 5600 get_task_struct(p);
5605 5601
5606 /* 5602 /*
5607 * Drop lock around migration; if someone else moves it, 5603 * Drop lock around migration; if someone else moves it,
5608 * that's OK. No task can be added to this CPU, so iteration is 5604 * that's OK. No task can be added to this CPU, so iteration is
5609 * fine. 5605 * fine.
5610 */ 5606 */
5611 spin_unlock_irq(&rq->lock); 5607 spin_unlock_irq(&rq->lock);
5612 move_task_off_dead_cpu(dead_cpu, p); 5608 move_task_off_dead_cpu(dead_cpu, p);
5613 spin_lock_irq(&rq->lock); 5609 spin_lock_irq(&rq->lock);
5614 5610
5615 put_task_struct(p); 5611 put_task_struct(p);
5616 } 5612 }
5617 5613
5618 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 5614 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5619 static void migrate_dead_tasks(unsigned int dead_cpu) 5615 static void migrate_dead_tasks(unsigned int dead_cpu)
5620 { 5616 {
5621 struct rq *rq = cpu_rq(dead_cpu); 5617 struct rq *rq = cpu_rq(dead_cpu);
5622 struct task_struct *next; 5618 struct task_struct *next;
5623 5619
5624 for ( ; ; ) { 5620 for ( ; ; ) {
5625 if (!rq->nr_running) 5621 if (!rq->nr_running)
5626 break; 5622 break;
5627 update_rq_clock(rq); 5623 update_rq_clock(rq);
5628 next = pick_next_task(rq, rq->curr); 5624 next = pick_next_task(rq, rq->curr);
5629 if (!next) 5625 if (!next)
5630 break; 5626 break;
5631 migrate_dead(dead_cpu, next); 5627 migrate_dead(dead_cpu, next);
5632 5628
5633 } 5629 }
5634 } 5630 }
5635 #endif /* CONFIG_HOTPLUG_CPU */ 5631 #endif /* CONFIG_HOTPLUG_CPU */
5636 5632
5637 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5633 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5638 5634
5639 static struct ctl_table sd_ctl_dir[] = { 5635 static struct ctl_table sd_ctl_dir[] = {
5640 { 5636 {
5641 .procname = "sched_domain", 5637 .procname = "sched_domain",
5642 .mode = 0555, 5638 .mode = 0555,
5643 }, 5639 },
5644 {0, }, 5640 {0, },
5645 }; 5641 };
5646 5642
5647 static struct ctl_table sd_ctl_root[] = { 5643 static struct ctl_table sd_ctl_root[] = {
5648 { 5644 {
5649 .ctl_name = CTL_KERN, 5645 .ctl_name = CTL_KERN,
5650 .procname = "kernel", 5646 .procname = "kernel",
5651 .mode = 0555, 5647 .mode = 0555,
5652 .child = sd_ctl_dir, 5648 .child = sd_ctl_dir,
5653 }, 5649 },
5654 {0, }, 5650 {0, },
5655 }; 5651 };
5656 5652
5657 static struct ctl_table *sd_alloc_ctl_entry(int n) 5653 static struct ctl_table *sd_alloc_ctl_entry(int n)
5658 { 5654 {
5659 struct ctl_table *entry = 5655 struct ctl_table *entry =
5660 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5656 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5661 5657
5662 return entry; 5658 return entry;
5663 } 5659 }
5664 5660
5665 static void sd_free_ctl_entry(struct ctl_table **tablep) 5661 static void sd_free_ctl_entry(struct ctl_table **tablep)
5666 { 5662 {
5667 struct ctl_table *entry; 5663 struct ctl_table *entry;
5668 5664
5669 /* 5665 /*
5670 * In the intermediate directories, both the child directory and 5666 * In the intermediate directories, both the child directory and
5671 * procname are dynamically allocated and could fail but the mode 5667 * procname are dynamically allocated and could fail but the mode
5672 * will always be set. In the lowest directory the names are 5668 * will always be set. In the lowest directory the names are
5673 * static strings and all have proc handlers. 5669 * static strings and all have proc handlers.
5674 */ 5670 */
5675 for (entry = *tablep; entry->mode; entry++) { 5671 for (entry = *tablep; entry->mode; entry++) {
5676 if (entry->child) 5672 if (entry->child)
5677 sd_free_ctl_entry(&entry->child); 5673 sd_free_ctl_entry(&entry->child);
5678 if (entry->proc_handler == NULL) 5674 if (entry->proc_handler == NULL)
5679 kfree(entry->procname); 5675 kfree(entry->procname);
5680 } 5676 }
5681 5677
5682 kfree(*tablep); 5678 kfree(*tablep);
5683 *tablep = NULL; 5679 *tablep = NULL;
5684 } 5680 }
5685 5681
5686 static void 5682 static void
5687 set_table_entry(struct ctl_table *entry, 5683 set_table_entry(struct ctl_table *entry,
5688 const char *procname, void *data, int maxlen, 5684 const char *procname, void *data, int maxlen,
5689 mode_t mode, proc_handler *proc_handler) 5685 mode_t mode, proc_handler *proc_handler)
5690 { 5686 {
5691 entry->procname = procname; 5687 entry->procname = procname;
5692 entry->data = data; 5688 entry->data = data;
5693 entry->maxlen = maxlen; 5689 entry->maxlen = maxlen;
5694 entry->mode = mode; 5690 entry->mode = mode;
5695 entry->proc_handler = proc_handler; 5691 entry->proc_handler = proc_handler;
5696 } 5692 }
5697 5693
5698 static struct ctl_table * 5694 static struct ctl_table *
5699 sd_alloc_ctl_domain_table(struct sched_domain *sd) 5695 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5700 { 5696 {
5701 struct ctl_table *table = sd_alloc_ctl_entry(12); 5697 struct ctl_table *table = sd_alloc_ctl_entry(12);
5702 5698
5703 if (table == NULL) 5699 if (table == NULL)
5704 return NULL; 5700 return NULL;
5705 5701
5706 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5702 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5707 sizeof(long), 0644, proc_doulongvec_minmax); 5703 sizeof(long), 0644, proc_doulongvec_minmax);
5708 set_table_entry(&table[1], "max_interval", &sd->max_interval, 5704 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5709 sizeof(long), 0644, proc_doulongvec_minmax); 5705 sizeof(long), 0644, proc_doulongvec_minmax);
5710 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5706 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5711 sizeof(int), 0644, proc_dointvec_minmax); 5707 sizeof(int), 0644, proc_dointvec_minmax);
5712 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5708 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5713 sizeof(int), 0644, proc_dointvec_minmax); 5709 sizeof(int), 0644, proc_dointvec_minmax);
5714 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5710 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5715 sizeof(int), 0644, proc_dointvec_minmax); 5711 sizeof(int), 0644, proc_dointvec_minmax);
5716 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5712 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5717 sizeof(int), 0644, proc_dointvec_minmax); 5713 sizeof(int), 0644, proc_dointvec_minmax);
5718 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5714 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5719 sizeof(int), 0644, proc_dointvec_minmax); 5715 sizeof(int), 0644, proc_dointvec_minmax);
5720 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 5716 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5721 sizeof(int), 0644, proc_dointvec_minmax); 5717 sizeof(int), 0644, proc_dointvec_minmax);
5722 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5718 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5723 sizeof(int), 0644, proc_dointvec_minmax); 5719 sizeof(int), 0644, proc_dointvec_minmax);
5724 set_table_entry(&table[9], "cache_nice_tries", 5720 set_table_entry(&table[9], "cache_nice_tries",
5725 &sd->cache_nice_tries, 5721 &sd->cache_nice_tries,
5726 sizeof(int), 0644, proc_dointvec_minmax); 5722 sizeof(int), 0644, proc_dointvec_minmax);
5727 set_table_entry(&table[10], "flags", &sd->flags, 5723 set_table_entry(&table[10], "flags", &sd->flags,
5728 sizeof(int), 0644, proc_dointvec_minmax); 5724 sizeof(int), 0644, proc_dointvec_minmax);
5729 /* &table[11] is terminator */ 5725 /* &table[11] is terminator */
5730 5726
5731 return table; 5727 return table;
5732 } 5728 }
5733 5729
5734 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5730 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5735 { 5731 {
5736 struct ctl_table *entry, *table; 5732 struct ctl_table *entry, *table;
5737 struct sched_domain *sd; 5733 struct sched_domain *sd;
5738 int domain_num = 0, i; 5734 int domain_num = 0, i;
5739 char buf[32]; 5735 char buf[32];
5740 5736
5741 for_each_domain(cpu, sd) 5737 for_each_domain(cpu, sd)
5742 domain_num++; 5738 domain_num++;
5743 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5739 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5744 if (table == NULL) 5740 if (table == NULL)
5745 return NULL; 5741 return NULL;
5746 5742
5747 i = 0; 5743 i = 0;
5748 for_each_domain(cpu, sd) { 5744 for_each_domain(cpu, sd) {
5749 snprintf(buf, 32, "domain%d", i); 5745 snprintf(buf, 32, "domain%d", i);
5750 entry->procname = kstrdup(buf, GFP_KERNEL); 5746 entry->procname = kstrdup(buf, GFP_KERNEL);
5751 entry->mode = 0555; 5747 entry->mode = 0555;
5752 entry->child = sd_alloc_ctl_domain_table(sd); 5748 entry->child = sd_alloc_ctl_domain_table(sd);
5753 entry++; 5749 entry++;
5754 i++; 5750 i++;
5755 } 5751 }
5756 return table; 5752 return table;
5757 } 5753 }
5758 5754
5759 static struct ctl_table_header *sd_sysctl_header; 5755 static struct ctl_table_header *sd_sysctl_header;
5760 static void register_sched_domain_sysctl(void) 5756 static void register_sched_domain_sysctl(void)
5761 { 5757 {
5762 int i, cpu_num = num_online_cpus(); 5758 int i, cpu_num = num_online_cpus();
5763 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5759 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5764 char buf[32]; 5760 char buf[32];
5765 5761
5766 WARN_ON(sd_ctl_dir[0].child); 5762 WARN_ON(sd_ctl_dir[0].child);
5767 sd_ctl_dir[0].child = entry; 5763 sd_ctl_dir[0].child = entry;
5768 5764
5769 if (entry == NULL) 5765 if (entry == NULL)
5770 return; 5766 return;
5771 5767
5772 for_each_online_cpu(i) { 5768 for_each_online_cpu(i) {
5773 snprintf(buf, 32, "cpu%d", i); 5769 snprintf(buf, 32, "cpu%d", i);
5774 entry->procname = kstrdup(buf, GFP_KERNEL); 5770 entry->procname = kstrdup(buf, GFP_KERNEL);
5775 entry->mode = 0555; 5771 entry->mode = 0555;
5776 entry->child = sd_alloc_ctl_cpu_table(i); 5772 entry->child = sd_alloc_ctl_cpu_table(i);
5777 entry++; 5773 entry++;
5778 } 5774 }
5779 5775
5780 WARN_ON(sd_sysctl_header); 5776 WARN_ON(sd_sysctl_header);
5781 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5777 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5782 } 5778 }
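For orientation, the tables built above appear under /proc/sys once register_sysctl_table() runs (assuming the sd_ctl_root/sd_ctl_dir definitions earlier in this file, which hang the tree off kernel/sched_domain). A minimal userspace sketch of reading one of the resulting knobs, with a hypothetical cpu/domain path:

#include <stdio.h>

int main(void)
{
	/* Path layout: kernel/sched_domain/cpu<N>/domain<M>/<field>; cpu0 and
	 * domain0 are just an example and need not exist on a given machine. */
	const char *path = "/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("min_interval: %s", buf);
	if (f)
		fclose(f);
	return 0;
}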
5783 5779
5784 /* may be called multiple times per register */ 5780 /* may be called multiple times per register */
5785 static void unregister_sched_domain_sysctl(void) 5781 static void unregister_sched_domain_sysctl(void)
5786 { 5782 {
5787 if (sd_sysctl_header) 5783 if (sd_sysctl_header)
5788 unregister_sysctl_table(sd_sysctl_header); 5784 unregister_sysctl_table(sd_sysctl_header);
5789 sd_sysctl_header = NULL; 5785 sd_sysctl_header = NULL;
5790 if (sd_ctl_dir[0].child) 5786 if (sd_ctl_dir[0].child)
5791 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5787 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5792 } 5788 }
5793 #else 5789 #else
5794 static void register_sched_domain_sysctl(void) 5790 static void register_sched_domain_sysctl(void)
5795 { 5791 {
5796 } 5792 }
5797 static void unregister_sched_domain_sysctl(void) 5793 static void unregister_sched_domain_sysctl(void)
5798 { 5794 {
5799 } 5795 }
5800 #endif 5796 #endif
5801 5797
5802 /* 5798 /*
5803 * migration_call - callback that gets triggered when a CPU is added. 5799 * migration_call - callback that gets triggered when a CPU is added.
5804 * Here we can start up the necessary migration thread for the new CPU. 5800 * Here we can start up the necessary migration thread for the new CPU.
5805 */ 5801 */
5806 static int __cpuinit 5802 static int __cpuinit
5807 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5803 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5808 { 5804 {
5809 struct task_struct *p; 5805 struct task_struct *p;
5810 int cpu = (long)hcpu; 5806 int cpu = (long)hcpu;
5811 unsigned long flags; 5807 unsigned long flags;
5812 struct rq *rq; 5808 struct rq *rq;
5813 5809
5814 switch (action) { 5810 switch (action) {
5815 5811
5816 case CPU_UP_PREPARE: 5812 case CPU_UP_PREPARE:
5817 case CPU_UP_PREPARE_FROZEN: 5813 case CPU_UP_PREPARE_FROZEN:
5818 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 5814 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5819 if (IS_ERR(p)) 5815 if (IS_ERR(p))
5820 return NOTIFY_BAD; 5816 return NOTIFY_BAD;
5821 kthread_bind(p, cpu); 5817 kthread_bind(p, cpu);
5822 /* Must be high prio: stop_machine expects to yield to it. */ 5818 /* Must be high prio: stop_machine expects to yield to it. */
5823 rq = task_rq_lock(p, &flags); 5819 rq = task_rq_lock(p, &flags);
5824 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5820 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5825 task_rq_unlock(rq, &flags); 5821 task_rq_unlock(rq, &flags);
5826 cpu_rq(cpu)->migration_thread = p; 5822 cpu_rq(cpu)->migration_thread = p;
5827 break; 5823 break;
5828 5824
5829 case CPU_ONLINE: 5825 case CPU_ONLINE:
5830 case CPU_ONLINE_FROZEN: 5826 case CPU_ONLINE_FROZEN:
5831 /* Strictly unnecessary, as first user will wake it. */ 5827 /* Strictly unnecessary, as first user will wake it. */
5832 wake_up_process(cpu_rq(cpu)->migration_thread); 5828 wake_up_process(cpu_rq(cpu)->migration_thread);
5833 5829
5834 /* Update our root-domain */ 5830 /* Update our root-domain */
5835 rq = cpu_rq(cpu); 5831 rq = cpu_rq(cpu);
5836 spin_lock_irqsave(&rq->lock, flags); 5832 spin_lock_irqsave(&rq->lock, flags);
5837 if (rq->rd) { 5833 if (rq->rd) {
5838 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 5834 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5839 cpu_set(cpu, rq->rd->online); 5835 cpu_set(cpu, rq->rd->online);
5840 } 5836 }
5841 spin_unlock_irqrestore(&rq->lock, flags); 5837 spin_unlock_irqrestore(&rq->lock, flags);
5842 break; 5838 break;
5843 5839
5844 #ifdef CONFIG_HOTPLUG_CPU 5840 #ifdef CONFIG_HOTPLUG_CPU
5845 case CPU_UP_CANCELED: 5841 case CPU_UP_CANCELED:
5846 case CPU_UP_CANCELED_FROZEN: 5842 case CPU_UP_CANCELED_FROZEN:
5847 if (!cpu_rq(cpu)->migration_thread) 5843 if (!cpu_rq(cpu)->migration_thread)
5848 break; 5844 break;
5849 /* Unbind it from offline cpu so it can run. Fall thru. */ 5845 /* Unbind it from offline cpu so it can run. Fall thru. */
5850 kthread_bind(cpu_rq(cpu)->migration_thread, 5846 kthread_bind(cpu_rq(cpu)->migration_thread,
5851 any_online_cpu(cpu_online_map)); 5847 any_online_cpu(cpu_online_map));
5852 kthread_stop(cpu_rq(cpu)->migration_thread); 5848 kthread_stop(cpu_rq(cpu)->migration_thread);
5853 cpu_rq(cpu)->migration_thread = NULL; 5849 cpu_rq(cpu)->migration_thread = NULL;
5854 break; 5850 break;
5855 5851
5856 case CPU_DEAD: 5852 case CPU_DEAD:
5857 case CPU_DEAD_FROZEN: 5853 case CPU_DEAD_FROZEN:
5858 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ 5854 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5859 migrate_live_tasks(cpu); 5855 migrate_live_tasks(cpu);
5860 rq = cpu_rq(cpu); 5856 rq = cpu_rq(cpu);
5861 kthread_stop(rq->migration_thread); 5857 kthread_stop(rq->migration_thread);
5862 rq->migration_thread = NULL; 5858 rq->migration_thread = NULL;
5863 /* Idle task back to normal (off runqueue, low prio) */ 5859 /* Idle task back to normal (off runqueue, low prio) */
5864 spin_lock_irq(&rq->lock); 5860 spin_lock_irq(&rq->lock);
5865 update_rq_clock(rq); 5861 update_rq_clock(rq);
5866 deactivate_task(rq, rq->idle, 0); 5862 deactivate_task(rq, rq->idle, 0);
5867 rq->idle->static_prio = MAX_PRIO; 5863 rq->idle->static_prio = MAX_PRIO;
5868 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5864 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5869 rq->idle->sched_class = &idle_sched_class; 5865 rq->idle->sched_class = &idle_sched_class;
5870 migrate_dead_tasks(cpu); 5866 migrate_dead_tasks(cpu);
5871 spin_unlock_irq(&rq->lock); 5867 spin_unlock_irq(&rq->lock);
5872 cpuset_unlock(); 5868 cpuset_unlock();
5873 migrate_nr_uninterruptible(rq); 5869 migrate_nr_uninterruptible(rq);
5874 BUG_ON(rq->nr_running != 0); 5870 BUG_ON(rq->nr_running != 0);
5875 5871
5876 /* 5872 /*
5877 * No need to migrate the tasks: it was best-effort if 5873 * No need to migrate the tasks: it was best-effort if
5878 * they didn't take sched_hotcpu_mutex. Just wake up 5874 * they didn't take sched_hotcpu_mutex. Just wake up
5879 * the requestors. 5875 * the requestors.
5880 */ 5876 */
5881 spin_lock_irq(&rq->lock); 5877 spin_lock_irq(&rq->lock);
5882 while (!list_empty(&rq->migration_queue)) { 5878 while (!list_empty(&rq->migration_queue)) {
5883 struct migration_req *req; 5879 struct migration_req *req;
5884 5880
5885 req = list_entry(rq->migration_queue.next, 5881 req = list_entry(rq->migration_queue.next,
5886 struct migration_req, list); 5882 struct migration_req, list);
5887 list_del_init(&req->list); 5883 list_del_init(&req->list);
5888 complete(&req->done); 5884 complete(&req->done);
5889 } 5885 }
5890 spin_unlock_irq(&rq->lock); 5886 spin_unlock_irq(&rq->lock);
5891 break; 5887 break;
5892 5888
5893 case CPU_DOWN_PREPARE: 5889 case CPU_DOWN_PREPARE:
5894 /* Update our root-domain */ 5890 /* Update our root-domain */
5895 rq = cpu_rq(cpu); 5891 rq = cpu_rq(cpu);
5896 spin_lock_irqsave(&rq->lock, flags); 5892 spin_lock_irqsave(&rq->lock, flags);
5897 if (rq->rd) { 5893 if (rq->rd) {
5898 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 5894 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5899 cpu_clear(cpu, rq->rd->online); 5895 cpu_clear(cpu, rq->rd->online);
5900 } 5896 }
5901 spin_unlock_irqrestore(&rq->lock, flags); 5897 spin_unlock_irqrestore(&rq->lock, flags);
5902 break; 5898 break;
5903 #endif 5899 #endif
5904 } 5900 }
5905 return NOTIFY_OK; 5901 return NOTIFY_OK;
5906 } 5902 }
5907 5903
5908 /* Register at highest priority so that task migration (migrate_all_tasks) 5904 /* Register at highest priority so that task migration (migrate_all_tasks)
5909 * happens before everything else. 5905 * happens before everything else.
5910 */ 5906 */
5911 static struct notifier_block __cpuinitdata migration_notifier = { 5907 static struct notifier_block __cpuinitdata migration_notifier = {
5912 .notifier_call = migration_call, 5908 .notifier_call = migration_call,
5913 .priority = 10 5909 .priority = 10
5914 }; 5910 };
5915 5911
5916 void __init migration_init(void) 5912 void __init migration_init(void)
5917 { 5913 {
5918 void *cpu = (void *)(long)smp_processor_id(); 5914 void *cpu = (void *)(long)smp_processor_id();
5919 int err; 5915 int err;
5920 5916
5921 /* Start one for the boot CPU: */ 5917 /* Start one for the boot CPU: */
5922 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5918 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5923 BUG_ON(err == NOTIFY_BAD); 5919 BUG_ON(err == NOTIFY_BAD);
5924 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5920 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5925 register_cpu_notifier(&migration_notifier); 5921 register_cpu_notifier(&migration_notifier);
5926 } 5922 }
5927 #endif 5923 #endif
5928 5924
5929 #ifdef CONFIG_SMP 5925 #ifdef CONFIG_SMP
5930 5926
5931 /* Number of possible processor ids */ 5927 /* Number of possible processor ids */
5932 int nr_cpu_ids __read_mostly = NR_CPUS; 5928 int nr_cpu_ids __read_mostly = NR_CPUS;
5933 EXPORT_SYMBOL(nr_cpu_ids); 5929 EXPORT_SYMBOL(nr_cpu_ids);
5934 5930
5935 #ifdef CONFIG_SCHED_DEBUG 5931 #ifdef CONFIG_SCHED_DEBUG
5936 5932
5937 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) 5933 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
5938 { 5934 {
5939 struct sched_group *group = sd->groups; 5935 struct sched_group *group = sd->groups;
5940 cpumask_t groupmask; 5936 cpumask_t groupmask;
5941 char str[NR_CPUS]; 5937 char str[NR_CPUS];
5942 5938
5943 cpumask_scnprintf(str, NR_CPUS, sd->span); 5939 cpumask_scnprintf(str, NR_CPUS, sd->span);
5944 cpus_clear(groupmask); 5940 cpus_clear(groupmask);
5945 5941
5946 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5942 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5947 5943
5948 if (!(sd->flags & SD_LOAD_BALANCE)) { 5944 if (!(sd->flags & SD_LOAD_BALANCE)) {
5949 printk("does not load-balance\n"); 5945 printk("does not load-balance\n");
5950 if (sd->parent) 5946 if (sd->parent)
5951 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5947 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5952 " has parent"); 5948 " has parent");
5953 return -1; 5949 return -1;
5954 } 5950 }
5955 5951
5956 printk(KERN_CONT "span %s\n", str); 5952 printk(KERN_CONT "span %s\n", str);
5957 5953
5958 if (!cpu_isset(cpu, sd->span)) { 5954 if (!cpu_isset(cpu, sd->span)) {
5959 printk(KERN_ERR "ERROR: domain->span does not contain " 5955 printk(KERN_ERR "ERROR: domain->span does not contain "
5960 "CPU%d\n", cpu); 5956 "CPU%d\n", cpu);
5961 } 5957 }
5962 if (!cpu_isset(cpu, group->cpumask)) { 5958 if (!cpu_isset(cpu, group->cpumask)) {
5963 printk(KERN_ERR "ERROR: domain->groups does not contain" 5959 printk(KERN_ERR "ERROR: domain->groups does not contain"
5964 " CPU%d\n", cpu); 5960 " CPU%d\n", cpu);
5965 } 5961 }
5966 5962
5967 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5963 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5968 do { 5964 do {
5969 if (!group) { 5965 if (!group) {
5970 printk("\n"); 5966 printk("\n");
5971 printk(KERN_ERR "ERROR: group is NULL\n"); 5967 printk(KERN_ERR "ERROR: group is NULL\n");
5972 break; 5968 break;
5973 } 5969 }
5974 5970
5975 if (!group->__cpu_power) { 5971 if (!group->__cpu_power) {
5976 printk(KERN_CONT "\n"); 5972 printk(KERN_CONT "\n");
5977 printk(KERN_ERR "ERROR: domain->cpu_power not " 5973 printk(KERN_ERR "ERROR: domain->cpu_power not "
5978 "set\n"); 5974 "set\n");
5979 break; 5975 break;
5980 } 5976 }
5981 5977
5982 if (!cpus_weight(group->cpumask)) { 5978 if (!cpus_weight(group->cpumask)) {
5983 printk(KERN_CONT "\n"); 5979 printk(KERN_CONT "\n");
5984 printk(KERN_ERR "ERROR: empty group\n"); 5980 printk(KERN_ERR "ERROR: empty group\n");
5985 break; 5981 break;
5986 } 5982 }
5987 5983
5988 if (cpus_intersects(groupmask, group->cpumask)) { 5984 if (cpus_intersects(groupmask, group->cpumask)) {
5989 printk(KERN_CONT "\n"); 5985 printk(KERN_CONT "\n");
5990 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5986 printk(KERN_ERR "ERROR: repeated CPUs\n");
5991 break; 5987 break;
5992 } 5988 }
5993 5989
5994 cpus_or(groupmask, groupmask, group->cpumask); 5990 cpus_or(groupmask, groupmask, group->cpumask);
5995 5991
5996 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 5992 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5997 printk(KERN_CONT " %s", str); 5993 printk(KERN_CONT " %s", str);
5998 5994
5999 group = group->next; 5995 group = group->next;
6000 } while (group != sd->groups); 5996 } while (group != sd->groups);
6001 printk(KERN_CONT "\n"); 5997 printk(KERN_CONT "\n");
6002 5998
6003 if (!cpus_equal(sd->span, groupmask)) 5999 if (!cpus_equal(sd->span, groupmask))
6004 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6000 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6005 6001
6006 if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) 6002 if (sd->parent && !cpus_subset(groupmask, sd->parent->span))
6007 printk(KERN_ERR "ERROR: parent span is not a superset " 6003 printk(KERN_ERR "ERROR: parent span is not a superset "
6008 "of domain->span\n"); 6004 "of domain->span\n");
6009 return 0; 6005 return 0;
6010 } 6006 }
6011 6007
6012 static void sched_domain_debug(struct sched_domain *sd, int cpu) 6008 static void sched_domain_debug(struct sched_domain *sd, int cpu)
6013 { 6009 {
6014 int level = 0; 6010 int level = 0;
6015 6011
6016 if (!sd) { 6012 if (!sd) {
6017 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6013 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6018 return; 6014 return;
6019 } 6015 }
6020 6016
6021 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6017 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6022 6018
6023 for (;;) { 6019 for (;;) {
6024 if (sched_domain_debug_one(sd, cpu, level)) 6020 if (sched_domain_debug_one(sd, cpu, level))
6025 break; 6021 break;
6026 level++; 6022 level++;
6027 sd = sd->parent; 6023 sd = sd->parent;
6028 if (!sd) 6024 if (!sd)
6029 break; 6025 break;
6030 } 6026 }
6031 } 6027 }
6032 #else 6028 #else
6033 # define sched_domain_debug(sd, cpu) do { } while (0) 6029 # define sched_domain_debug(sd, cpu) do { } while (0)
6034 #endif 6030 #endif
6035 6031
6036 static int sd_degenerate(struct sched_domain *sd) 6032 static int sd_degenerate(struct sched_domain *sd)
6037 { 6033 {
6038 if (cpus_weight(sd->span) == 1) 6034 if (cpus_weight(sd->span) == 1)
6039 return 1; 6035 return 1;
6040 6036
6041 /* Following flags need at least 2 groups */ 6037 /* Following flags need at least 2 groups */
6042 if (sd->flags & (SD_LOAD_BALANCE | 6038 if (sd->flags & (SD_LOAD_BALANCE |
6043 SD_BALANCE_NEWIDLE | 6039 SD_BALANCE_NEWIDLE |
6044 SD_BALANCE_FORK | 6040 SD_BALANCE_FORK |
6045 SD_BALANCE_EXEC | 6041 SD_BALANCE_EXEC |
6046 SD_SHARE_CPUPOWER | 6042 SD_SHARE_CPUPOWER |
6047 SD_SHARE_PKG_RESOURCES)) { 6043 SD_SHARE_PKG_RESOURCES)) {
6048 if (sd->groups != sd->groups->next) 6044 if (sd->groups != sd->groups->next)
6049 return 0; 6045 return 0;
6050 } 6046 }
6051 6047
6052 /* Following flags don't use groups */ 6048 /* Following flags don't use groups */
6053 if (sd->flags & (SD_WAKE_IDLE | 6049 if (sd->flags & (SD_WAKE_IDLE |
6054 SD_WAKE_AFFINE | 6050 SD_WAKE_AFFINE |
6055 SD_WAKE_BALANCE)) 6051 SD_WAKE_BALANCE))
6056 return 0; 6052 return 0;
6057 6053
6058 return 1; 6054 return 1;
6059 } 6055 }
6060 6056
6061 static int 6057 static int
6062 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 6058 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6063 { 6059 {
6064 unsigned long cflags = sd->flags, pflags = parent->flags; 6060 unsigned long cflags = sd->flags, pflags = parent->flags;
6065 6061
6066 if (sd_degenerate(parent)) 6062 if (sd_degenerate(parent))
6067 return 1; 6063 return 1;
6068 6064
6069 if (!cpus_equal(sd->span, parent->span)) 6065 if (!cpus_equal(sd->span, parent->span))
6070 return 0; 6066 return 0;
6071 6067
6072 /* Does parent contain flags not in child? */ 6068 /* Does parent contain flags not in child? */
6073 /* WAKE_BALANCE is a subset of WAKE_AFFINE */ 6069 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
6074 if (cflags & SD_WAKE_AFFINE) 6070 if (cflags & SD_WAKE_AFFINE)
6075 pflags &= ~SD_WAKE_BALANCE; 6071 pflags &= ~SD_WAKE_BALANCE;
6076 /* Flags needing groups don't count if only 1 group in parent */ 6072 /* Flags needing groups don't count if only 1 group in parent */
6077 if (parent->groups == parent->groups->next) { 6073 if (parent->groups == parent->groups->next) {
6078 pflags &= ~(SD_LOAD_BALANCE | 6074 pflags &= ~(SD_LOAD_BALANCE |
6079 SD_BALANCE_NEWIDLE | 6075 SD_BALANCE_NEWIDLE |
6080 SD_BALANCE_FORK | 6076 SD_BALANCE_FORK |
6081 SD_BALANCE_EXEC | 6077 SD_BALANCE_EXEC |
6082 SD_SHARE_CPUPOWER | 6078 SD_SHARE_CPUPOWER |
6083 SD_SHARE_PKG_RESOURCES); 6079 SD_SHARE_PKG_RESOURCES);
6084 } 6080 }
6085 if (~cflags & pflags) 6081 if (~cflags & pflags)
6086 return 0; 6082 return 0;
6087 6083
6088 return 1; 6084 return 1;
6089 } 6085 }
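One detail of sd_parent_degenerate() worth spelling out is the "~cflags & pflags" test: it is non-zero exactly when the parent carries a flag the child does not, i.e. the parent still adds something. A minimal sketch (not part of the kernel source):

static int parent_adds_flags(unsigned long cflags, unsigned long pflags)
{
	/* Bits set in pflags but clear in cflags survive the AND. */
	return (~cflags & pflags) != 0;
}

/*
 * e.g. parent_adds_flags(SD_LOAD_BALANCE,
 *                        SD_LOAD_BALANCE | SD_BALANCE_EXEC) == 1,
 * so that parent would not be folded away as degenerate.
 */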
6090 6086
6091 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6087 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6092 { 6088 {
6093 unsigned long flags; 6089 unsigned long flags;
6094 const struct sched_class *class; 6090 const struct sched_class *class;
6095 6091
6096 spin_lock_irqsave(&rq->lock, flags); 6092 spin_lock_irqsave(&rq->lock, flags);
6097 6093
6098 if (rq->rd) { 6094 if (rq->rd) {
6099 struct root_domain *old_rd = rq->rd; 6095 struct root_domain *old_rd = rq->rd;
6100 6096
6101 for (class = sched_class_highest; class; class = class->next) { 6097 for (class = sched_class_highest; class; class = class->next) {
6102 if (class->leave_domain) 6098 if (class->leave_domain)
6103 class->leave_domain(rq); 6099 class->leave_domain(rq);
6104 } 6100 }
6105 6101
6106 cpu_clear(rq->cpu, old_rd->span); 6102 cpu_clear(rq->cpu, old_rd->span);
6107 cpu_clear(rq->cpu, old_rd->online); 6103 cpu_clear(rq->cpu, old_rd->online);
6108 6104
6109 if (atomic_dec_and_test(&old_rd->refcount)) 6105 if (atomic_dec_and_test(&old_rd->refcount))
6110 kfree(old_rd); 6106 kfree(old_rd);
6111 } 6107 }
6112 6108
6113 atomic_inc(&rd->refcount); 6109 atomic_inc(&rd->refcount);
6114 rq->rd = rd; 6110 rq->rd = rd;
6115 6111
6116 cpu_set(rq->cpu, rd->span); 6112 cpu_set(rq->cpu, rd->span);
6117 if (cpu_isset(rq->cpu, cpu_online_map)) 6113 if (cpu_isset(rq->cpu, cpu_online_map))
6118 cpu_set(rq->cpu, rd->online); 6114 cpu_set(rq->cpu, rd->online);
6119 6115
6120 for (class = sched_class_highest; class; class = class->next) { 6116 for (class = sched_class_highest; class; class = class->next) {
6121 if (class->join_domain) 6117 if (class->join_domain)
6122 class->join_domain(rq); 6118 class->join_domain(rq);
6123 } 6119 }
6124 6120
6125 spin_unlock_irqrestore(&rq->lock, flags); 6121 spin_unlock_irqrestore(&rq->lock, flags);
6126 } 6122 }
6127 6123
6128 static void init_rootdomain(struct root_domain *rd) 6124 static void init_rootdomain(struct root_domain *rd)
6129 { 6125 {
6130 memset(rd, 0, sizeof(*rd)); 6126 memset(rd, 0, sizeof(*rd));
6131 6127
6132 cpus_clear(rd->span); 6128 cpus_clear(rd->span);
6133 cpus_clear(rd->online); 6129 cpus_clear(rd->online);
6134 } 6130 }
6135 6131
6136 static void init_defrootdomain(void) 6132 static void init_defrootdomain(void)
6137 { 6133 {
6138 init_rootdomain(&def_root_domain); 6134 init_rootdomain(&def_root_domain);
6139 atomic_set(&def_root_domain.refcount, 1); 6135 atomic_set(&def_root_domain.refcount, 1);
6140 } 6136 }
6141 6137
6142 static struct root_domain *alloc_rootdomain(void) 6138 static struct root_domain *alloc_rootdomain(void)
6143 { 6139 {
6144 struct root_domain *rd; 6140 struct root_domain *rd;
6145 6141
6146 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 6142 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6147 if (!rd) 6143 if (!rd)
6148 return NULL; 6144 return NULL;
6149 6145
6150 init_rootdomain(rd); 6146 init_rootdomain(rd);
6151 6147
6152 return rd; 6148 return rd;
6153 } 6149 }
6154 6150
6155 /* 6151 /*
6156 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6152 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6157 * hold the hotplug lock. 6153 * hold the hotplug lock.
6158 */ 6154 */
6159 static void 6155 static void
6160 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 6156 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6161 { 6157 {
6162 struct rq *rq = cpu_rq(cpu); 6158 struct rq *rq = cpu_rq(cpu);
6163 struct sched_domain *tmp; 6159 struct sched_domain *tmp;
6164 6160
6165 /* Remove the sched domains which do not contribute to scheduling. */ 6161 /* Remove the sched domains which do not contribute to scheduling. */
6166 for (tmp = sd; tmp; tmp = tmp->parent) { 6162 for (tmp = sd; tmp; tmp = tmp->parent) {
6167 struct sched_domain *parent = tmp->parent; 6163 struct sched_domain *parent = tmp->parent;
6168 if (!parent) 6164 if (!parent)
6169 break; 6165 break;
6170 if (sd_parent_degenerate(tmp, parent)) { 6166 if (sd_parent_degenerate(tmp, parent)) {
6171 tmp->parent = parent->parent; 6167 tmp->parent = parent->parent;
6172 if (parent->parent) 6168 if (parent->parent)
6173 parent->parent->child = tmp; 6169 parent->parent->child = tmp;
6174 } 6170 }
6175 } 6171 }
6176 6172
6177 if (sd && sd_degenerate(sd)) { 6173 if (sd && sd_degenerate(sd)) {
6178 sd = sd->parent; 6174 sd = sd->parent;
6179 if (sd) 6175 if (sd)
6180 sd->child = NULL; 6176 sd->child = NULL;
6181 } 6177 }
6182 6178
6183 sched_domain_debug(sd, cpu); 6179 sched_domain_debug(sd, cpu);
6184 6180
6185 rq_attach_root(rq, rd); 6181 rq_attach_root(rq, rd);
6186 rcu_assign_pointer(rq->sd, sd); 6182 rcu_assign_pointer(rq->sd, sd);
6187 } 6183 }
6188 6184
6189 /* cpus with isolated domains */ 6185 /* cpus with isolated domains */
6190 static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 6186 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6191 6187
6192 /* Setup the mask of cpus configured for isolated domains */ 6188 /* Setup the mask of cpus configured for isolated domains */
6193 static int __init isolated_cpu_setup(char *str) 6189 static int __init isolated_cpu_setup(char *str)
6194 { 6190 {
6195 int ints[NR_CPUS], i; 6191 int ints[NR_CPUS], i;
6196 6192
6197 str = get_options(str, ARRAY_SIZE(ints), ints); 6193 str = get_options(str, ARRAY_SIZE(ints), ints);
6198 cpus_clear(cpu_isolated_map); 6194 cpus_clear(cpu_isolated_map);
6199 for (i = 1; i <= ints[0]; i++) 6195 for (i = 1; i <= ints[0]; i++)
6200 if (ints[i] < NR_CPUS) 6196 if (ints[i] < NR_CPUS)
6201 cpu_set(ints[i], cpu_isolated_map); 6197 cpu_set(ints[i], cpu_isolated_map);
6202 return 1; 6198 return 1;
6203 } 6199 }
6204 6200
6205 __setup("isolcpus=", isolated_cpu_setup); 6201 __setup("isolcpus=", isolated_cpu_setup);
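For reference, the setup hook above consumes a plain comma-separated CPU list from the kernel command line; a hedged sketch of one concrete value:

/*
 * Example (illustrative): booting with
 *
 *	isolcpus=2,3
 *
 * makes get_options() return ints[0] = 2 (count), ints[1] = 2, ints[2] = 3,
 * so CPUs 2 and 3 are set in cpu_isolated_map and left out of the sched
 * domains built over the remaining online CPUs.
 */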
6206 6202
6207 /* 6203 /*
6208 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6204 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6209 * to a function which identifies what group (along with sched group) a CPU 6205 * to a function which identifies what group (along with sched group) a CPU
6210 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS 6206 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
6211 * (due to the fact that we keep track of groups covered with a cpumask_t). 6207 * (due to the fact that we keep track of groups covered with a cpumask_t).
6212 * 6208 *
6213 * init_sched_build_groups will build a circular linked list of the groups 6209 * init_sched_build_groups will build a circular linked list of the groups
6214 * covered by the given span, and will set each group's ->cpumask correctly, 6210 * covered by the given span, and will set each group's ->cpumask correctly,
6215 * and ->cpu_power to 0. 6211 * and ->cpu_power to 0.
6216 */ 6212 */
6217 static void 6213 static void
6218 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, 6214 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
6219 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 6215 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
6220 struct sched_group **sg)) 6216 struct sched_group **sg))
6221 { 6217 {
6222 struct sched_group *first = NULL, *last = NULL; 6218 struct sched_group *first = NULL, *last = NULL;
6223 cpumask_t covered = CPU_MASK_NONE; 6219 cpumask_t covered = CPU_MASK_NONE;
6224 int i; 6220 int i;
6225 6221
6226 for_each_cpu_mask(i, span) { 6222 for_each_cpu_mask(i, span) {
6227 struct sched_group *sg; 6223 struct sched_group *sg;
6228 int group = group_fn(i, cpu_map, &sg); 6224 int group = group_fn(i, cpu_map, &sg);
6229 int j; 6225 int j;
6230 6226
6231 if (cpu_isset(i, covered)) 6227 if (cpu_isset(i, covered))
6232 continue; 6228 continue;
6233 6229
6234 sg->cpumask = CPU_MASK_NONE; 6230 sg->cpumask = CPU_MASK_NONE;
6235 sg->__cpu_power = 0; 6231 sg->__cpu_power = 0;
6236 6232
6237 for_each_cpu_mask(j, span) { 6233 for_each_cpu_mask(j, span) {
6238 if (group_fn(j, cpu_map, NULL) != group) 6234 if (group_fn(j, cpu_map, NULL) != group)
6239 continue; 6235 continue;
6240 6236
6241 cpu_set(j, covered); 6237 cpu_set(j, covered);
6242 cpu_set(j, sg->cpumask); 6238 cpu_set(j, sg->cpumask);
6243 } 6239 }
6244 if (!first) 6240 if (!first)
6245 first = sg; 6241 first = sg;
6246 if (last) 6242 if (last)
6247 last->next = sg; 6243 last->next = sg;
6248 last = sg; 6244 last = sg;
6249 } 6245 }
6250 last->next = first; 6246 last->next = first;
6251 } 6247 }
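To make the invariant at the end of the loop explicit, here is a rough sketch of what init_sched_build_groups() leaves behind for a span whose group_fn yields three groups (illustrative):

/*
 *	first -> sg_A -> sg_B -> sg_C --+
 *	  ^-----------------------------+	(closed by last->next = first)
 *
 * Each sg->cpumask holds exactly the CPUs group_fn() mapped to that group,
 * and each sg->__cpu_power starts at 0 for init_sched_groups_power() to
 * fill in later.
 */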
6252 6248
6253 #define SD_NODES_PER_DOMAIN 16 6249 #define SD_NODES_PER_DOMAIN 16
6254 6250
6255 #ifdef CONFIG_NUMA 6251 #ifdef CONFIG_NUMA
6256 6252
6257 /** 6253 /**
6258 * find_next_best_node - find the next node to include in a sched_domain 6254 * find_next_best_node - find the next node to include in a sched_domain
6259 * @node: node whose sched_domain we're building 6255 * @node: node whose sched_domain we're building
6260 * @used_nodes: nodes already in the sched_domain 6256 * @used_nodes: nodes already in the sched_domain
6261 * 6257 *
6262 * Find the next node to include in a given scheduling domain. Simply 6258 * Find the next node to include in a given scheduling domain. Simply
6263 * finds the closest node not already in the @used_nodes map. 6259 * finds the closest node not already in the @used_nodes map.
6264 * 6260 *
6265 * Should use nodemask_t. 6261 * Should use nodemask_t.
6266 */ 6262 */
6267 static int find_next_best_node(int node, unsigned long *used_nodes) 6263 static int find_next_best_node(int node, unsigned long *used_nodes)
6268 { 6264 {
6269 int i, n, val, min_val, best_node = 0; 6265 int i, n, val, min_val, best_node = 0;
6270 6266
6271 min_val = INT_MAX; 6267 min_val = INT_MAX;
6272 6268
6273 for (i = 0; i < MAX_NUMNODES; i++) { 6269 for (i = 0; i < MAX_NUMNODES; i++) {
6274 /* Start at @node */ 6270 /* Start at @node */
6275 n = (node + i) % MAX_NUMNODES; 6271 n = (node + i) % MAX_NUMNODES;
6276 6272
6277 if (!nr_cpus_node(n)) 6273 if (!nr_cpus_node(n))
6278 continue; 6274 continue;
6279 6275
6280 /* Skip already used nodes */ 6276 /* Skip already used nodes */
6281 if (test_bit(n, used_nodes)) 6277 if (test_bit(n, used_nodes))
6282 continue; 6278 continue;
6283 6279
6284 /* Simple min distance search */ 6280 /* Simple min distance search */
6285 val = node_distance(node, n); 6281 val = node_distance(node, n);
6286 6282
6287 if (val < min_val) { 6283 if (val < min_val) {
6288 min_val = val; 6284 min_val = val;
6289 best_node = n; 6285 best_node = n;
6290 } 6286 }
6291 } 6287 }
6292 6288
6293 set_bit(best_node, used_nodes); 6289 set_bit(best_node, used_nodes);
6294 return best_node; 6290 return best_node;
6295 } 6291 }
6296 6292
6297 /** 6293 /**
6298 * sched_domain_node_span - get a cpumask for a node's sched_domain 6294 * sched_domain_node_span - get a cpumask for a node's sched_domain
6299 * @node: node whose cpumask we're constructing 6295 * @node: node whose cpumask we're constructing
6300 * @size: number of nodes to include in this span 6296 * @size: number of nodes to include in this span
6301 * 6297 *
6302 * Given a node, construct a good cpumask for its sched_domain to span. It 6298 * Given a node, construct a good cpumask for its sched_domain to span. It
6303 * should be one that prevents unnecessary balancing, but also spreads tasks 6299 * should be one that prevents unnecessary balancing, but also spreads tasks
6304 * out optimally. 6300 * out optimally.
6305 */ 6301 */
6306 static cpumask_t sched_domain_node_span(int node) 6302 static cpumask_t sched_domain_node_span(int node)
6307 { 6303 {
6308 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 6304 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
6309 cpumask_t span, nodemask; 6305 cpumask_t span, nodemask;
6310 int i; 6306 int i;
6311 6307
6312 cpus_clear(span); 6308 cpus_clear(span);
6313 bitmap_zero(used_nodes, MAX_NUMNODES); 6309 bitmap_zero(used_nodes, MAX_NUMNODES);
6314 6310
6315 nodemask = node_to_cpumask(node); 6311 nodemask = node_to_cpumask(node);
6316 cpus_or(span, span, nodemask); 6312 cpus_or(span, span, nodemask);
6317 set_bit(node, used_nodes); 6313 set_bit(node, used_nodes);
6318 6314
6319 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6315 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6320 int next_node = find_next_best_node(node, used_nodes); 6316 int next_node = find_next_best_node(node, used_nodes);
6321 6317
6322 nodemask = node_to_cpumask(next_node); 6318 nodemask = node_to_cpumask(next_node);
6323 cpus_or(span, span, nodemask); 6319 cpus_or(span, span, nodemask);
6324 } 6320 }
6325 6321
6326 return span; 6322 return span;
6327 } 6323 }
6328 #endif 6324 #endif
6329 6325
6330 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6326 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6331 6327
6332 /* 6328 /*
6333 * SMT sched-domains: 6329 * SMT sched-domains:
6334 */ 6330 */
6335 #ifdef CONFIG_SCHED_SMT 6331 #ifdef CONFIG_SCHED_SMT
6336 static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6332 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6337 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 6333 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6338 6334
6339 static int 6335 static int
6340 cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 6336 cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6341 { 6337 {
6342 if (sg) 6338 if (sg)
6343 *sg = &per_cpu(sched_group_cpus, cpu); 6339 *sg = &per_cpu(sched_group_cpus, cpu);
6344 return cpu; 6340 return cpu;
6345 } 6341 }
6346 #endif 6342 #endif
6347 6343
6348 /* 6344 /*
6349 * multi-core sched-domains: 6345 * multi-core sched-domains:
6350 */ 6346 */
6351 #ifdef CONFIG_SCHED_MC 6347 #ifdef CONFIG_SCHED_MC
6352 static DEFINE_PER_CPU(struct sched_domain, core_domains); 6348 static DEFINE_PER_CPU(struct sched_domain, core_domains);
6353 static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6349 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6354 #endif 6350 #endif
6355 6351
6356 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6352 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6357 static int 6353 static int
6358 cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 6354 cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6359 { 6355 {
6360 int group; 6356 int group;
6361 cpumask_t mask = per_cpu(cpu_sibling_map, cpu); 6357 cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
6362 cpus_and(mask, mask, *cpu_map); 6358 cpus_and(mask, mask, *cpu_map);
6363 group = first_cpu(mask); 6359 group = first_cpu(mask);
6364 if (sg) 6360 if (sg)
6365 *sg = &per_cpu(sched_group_core, group); 6361 *sg = &per_cpu(sched_group_core, group);
6366 return group; 6362 return group;
6367 } 6363 }
6368 #elif defined(CONFIG_SCHED_MC) 6364 #elif defined(CONFIG_SCHED_MC)
6369 static int 6365 static int
6370 cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 6366 cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6371 { 6367 {
6372 if (sg) 6368 if (sg)
6373 *sg = &per_cpu(sched_group_core, cpu); 6369 *sg = &per_cpu(sched_group_core, cpu);
6374 return cpu; 6370 return cpu;
6375 } 6371 }
6376 #endif 6372 #endif
6377 6373
6378 static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6374 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6379 static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 6375 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6380 6376
6381 static int 6377 static int
6382 cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 6378 cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6383 { 6379 {
6384 int group; 6380 int group;
6385 #ifdef CONFIG_SCHED_MC 6381 #ifdef CONFIG_SCHED_MC
6386 cpumask_t mask = cpu_coregroup_map(cpu); 6382 cpumask_t mask = cpu_coregroup_map(cpu);
6387 cpus_and(mask, mask, *cpu_map); 6383 cpus_and(mask, mask, *cpu_map);
6388 group = first_cpu(mask); 6384 group = first_cpu(mask);
6389 #elif defined(CONFIG_SCHED_SMT) 6385 #elif defined(CONFIG_SCHED_SMT)
6390 cpumask_t mask = per_cpu(cpu_sibling_map, cpu); 6386 cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
6391 cpus_and(mask, mask, *cpu_map); 6387 cpus_and(mask, mask, *cpu_map);
6392 group = first_cpu(mask); 6388 group = first_cpu(mask);
6393 #else 6389 #else
6394 group = cpu; 6390 group = cpu;
6395 #endif 6391 #endif
6396 if (sg) 6392 if (sg)
6397 *sg = &per_cpu(sched_group_phys, group); 6393 *sg = &per_cpu(sched_group_phys, group);
6398 return group; 6394 return group;
6399 } 6395 }
6400 6396
6401 #ifdef CONFIG_NUMA 6397 #ifdef CONFIG_NUMA
6402 /* 6398 /*
6403 * The init_sched_build_groups can't handle what we want to do with node 6399 * The init_sched_build_groups can't handle what we want to do with node
6404 * groups, so roll our own. Now each node has its own list of groups which 6400 * groups, so roll our own. Now each node has its own list of groups which
6405 * gets dynamically allocated. 6401 * gets dynamically allocated.
6406 */ 6402 */
6407 static DEFINE_PER_CPU(struct sched_domain, node_domains); 6403 static DEFINE_PER_CPU(struct sched_domain, node_domains);
6408 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 6404 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6409 6405
6410 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 6406 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6411 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 6407 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6412 6408
6413 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 6409 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6414 struct sched_group **sg) 6410 struct sched_group **sg)
6415 { 6411 {
6416 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); 6412 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6417 int group; 6413 int group;
6418 6414
6419 cpus_and(nodemask, nodemask, *cpu_map); 6415 cpus_and(nodemask, nodemask, *cpu_map);
6420 group = first_cpu(nodemask); 6416 group = first_cpu(nodemask);
6421 6417
6422 if (sg) 6418 if (sg)
6423 *sg = &per_cpu(sched_group_allnodes, group); 6419 *sg = &per_cpu(sched_group_allnodes, group);
6424 return group; 6420 return group;
6425 } 6421 }
6426 6422
6427 static void init_numa_sched_groups_power(struct sched_group *group_head) 6423 static void init_numa_sched_groups_power(struct sched_group *group_head)
6428 { 6424 {
6429 struct sched_group *sg = group_head; 6425 struct sched_group *sg = group_head;
6430 int j; 6426 int j;
6431 6427
6432 if (!sg) 6428 if (!sg)
6433 return; 6429 return;
6434 do { 6430 do {
6435 for_each_cpu_mask(j, sg->cpumask) { 6431 for_each_cpu_mask(j, sg->cpumask) {
6436 struct sched_domain *sd; 6432 struct sched_domain *sd;
6437 6433
6438 sd = &per_cpu(phys_domains, j); 6434 sd = &per_cpu(phys_domains, j);
6439 if (j != first_cpu(sd->groups->cpumask)) { 6435 if (j != first_cpu(sd->groups->cpumask)) {
6440 /* 6436 /*
6441 * Only add "power" once for each 6437 * Only add "power" once for each
6442 * physical package. 6438 * physical package.
6443 */ 6439 */
6444 continue; 6440 continue;
6445 } 6441 }
6446 6442
6447 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 6443 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
6448 } 6444 }
6449 sg = sg->next; 6445 sg = sg->next;
6450 } while (sg != group_head); 6446 } while (sg != group_head);
6451 } 6447 }
6452 #endif 6448 #endif
6453 6449
6454 #ifdef CONFIG_NUMA 6450 #ifdef CONFIG_NUMA
6455 /* Free memory allocated for various sched_group structures */ 6451 /* Free memory allocated for various sched_group structures */
6456 static void free_sched_groups(const cpumask_t *cpu_map) 6452 static void free_sched_groups(const cpumask_t *cpu_map)
6457 { 6453 {
6458 int cpu, i; 6454 int cpu, i;
6459 6455
6460 for_each_cpu_mask(cpu, *cpu_map) { 6456 for_each_cpu_mask(cpu, *cpu_map) {
6461 struct sched_group **sched_group_nodes 6457 struct sched_group **sched_group_nodes
6462 = sched_group_nodes_bycpu[cpu]; 6458 = sched_group_nodes_bycpu[cpu];
6463 6459
6464 if (!sched_group_nodes) 6460 if (!sched_group_nodes)
6465 continue; 6461 continue;
6466 6462
6467 for (i = 0; i < MAX_NUMNODES; i++) { 6463 for (i = 0; i < MAX_NUMNODES; i++) {
6468 cpumask_t nodemask = node_to_cpumask(i); 6464 cpumask_t nodemask = node_to_cpumask(i);
6469 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6465 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6470 6466
6471 cpus_and(nodemask, nodemask, *cpu_map); 6467 cpus_and(nodemask, nodemask, *cpu_map);
6472 if (cpus_empty(nodemask)) 6468 if (cpus_empty(nodemask))
6473 continue; 6469 continue;
6474 6470
6475 if (sg == NULL) 6471 if (sg == NULL)
6476 continue; 6472 continue;
6477 sg = sg->next; 6473 sg = sg->next;
6478 next_sg: 6474 next_sg:
6479 oldsg = sg; 6475 oldsg = sg;
6480 sg = sg->next; 6476 sg = sg->next;
6481 kfree(oldsg); 6477 kfree(oldsg);
6482 if (oldsg != sched_group_nodes[i]) 6478 if (oldsg != sched_group_nodes[i])
6483 goto next_sg; 6479 goto next_sg;
6484 } 6480 }
6485 kfree(sched_group_nodes); 6481 kfree(sched_group_nodes);
6486 sched_group_nodes_bycpu[cpu] = NULL; 6482 sched_group_nodes_bycpu[cpu] = NULL;
6487 } 6483 }
6488 } 6484 }
6489 #else 6485 #else
6490 static void free_sched_groups(const cpumask_t *cpu_map) 6486 static void free_sched_groups(const cpumask_t *cpu_map)
6491 { 6487 {
6492 } 6488 }
6493 #endif 6489 #endif
6494 6490
6495 /* 6491 /*
6496 * Initialize sched groups cpu_power. 6492 * Initialize sched groups cpu_power.
6497 * 6493 *
6498 * cpu_power indicates the capacity of a sched group, which is used while 6494 * cpu_power indicates the capacity of a sched group, which is used while
6499 * distributing the load between different sched groups in a sched domain. 6495 * distributing the load between different sched groups in a sched domain.
6500 * Typically cpu_power for all the groups in a sched domain will be the same unless 6496 * Typically cpu_power for all the groups in a sched domain will be the same unless
6501 * there are asymmetries in the topology. If there are asymmetries, a group 6497 * there are asymmetries in the topology. If there are asymmetries, a group
6502 * having more cpu_power will pick up more load compared to the group having 6498 * having more cpu_power will pick up more load compared to the group having
6503 * less cpu_power. 6499 * less cpu_power.
6504 * 6500 *
6505 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents 6501 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
6506 * the maximum number of tasks a group can handle in the presence of other idle 6502 * the maximum number of tasks a group can handle in the presence of other idle
6507 * or lightly loaded groups in the same sched domain. 6503 * or lightly loaded groups in the same sched domain.
6508 */ 6504 */
6509 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6505 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6510 { 6506 {
6511 struct sched_domain *child; 6507 struct sched_domain *child;
6512 struct sched_group *group; 6508 struct sched_group *group;
6513 6509
6514 WARN_ON(!sd || !sd->groups); 6510 WARN_ON(!sd || !sd->groups);
6515 6511
6516 if (cpu != first_cpu(sd->groups->cpumask)) 6512 if (cpu != first_cpu(sd->groups->cpumask))
6517 return; 6513 return;
6518 6514
6519 child = sd->child; 6515 child = sd->child;
6520 6516
6521 sd->groups->__cpu_power = 0; 6517 sd->groups->__cpu_power = 0;
6522 6518
6523 /* 6519 /*
6524 * For perf policy, if the groups in the child domain share resources 6520 * For perf policy, if the groups in the child domain share resources
6525 * (for example cores sharing some portions of the cache hierarchy 6521 * (for example cores sharing some portions of the cache hierarchy
6526 * or SMT), then set this domain's group cpu_power such that each group 6522 * or SMT), then set this domain's group cpu_power such that each group
6527 * can handle only one task, when there are other idle groups in the 6523 * can handle only one task, when there are other idle groups in the
6528 * same sched domain. 6524 * same sched domain.
6529 */ 6525 */
6530 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 6526 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6531 (child->flags & 6527 (child->flags &
6532 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 6528 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6533 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 6529 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
6534 return; 6530 return;
6535 } 6531 }
6536 6532
6537 /* 6533 /*
6538 * add the cpu_power of each child group to this group's cpu_power 6534 * add the cpu_power of each child group to this group's cpu_power
6539 */ 6535 */
6540 group = child->groups; 6536 group = child->groups;
6541 do { 6537 do {
6542 sg_inc_cpu_power(sd->groups, group->__cpu_power); 6538 sg_inc_cpu_power(sd->groups, group->__cpu_power);
6543 group = group->next; 6539 group = group->next;
6544 } while (group != child->groups); 6540 } while (group != child->groups);
6545 } 6541 }
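A short worked example of the two branches above, assuming SCHED_LOAD_SCALE is 1024 (its usual value on this kernel; the numbers are illustrative only):

/*
 * 1) Performance policy with an SMT/MC child (SD_SHARE_CPUPOWER or
 *    SD_SHARE_PKG_RESOURCES set), or no child at all: the group gets
 *    exactly SCHED_LOAD_SCALE = 1024, i.e. it is treated as able to take
 *    one task while other groups in the domain sit idle, no matter how
 *    many siblings it contains.
 *
 * 2) Otherwise the child group powers are summed: two child groups of
 *    1024 each leave this group with __cpu_power = 2048.
 */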
6546 6542
6547 /* 6543 /*
6548 * Build sched domains for a given set of cpus and attach the sched domains 6544 * Build sched domains for a given set of cpus and attach the sched domains
6549 * to the individual cpus 6545 * to the individual cpus
6550 */ 6546 */
6551 static int build_sched_domains(const cpumask_t *cpu_map) 6547 static int build_sched_domains(const cpumask_t *cpu_map)
6552 { 6548 {
6553 int i; 6549 int i;
6554 struct root_domain *rd; 6550 struct root_domain *rd;
6555 #ifdef CONFIG_NUMA 6551 #ifdef CONFIG_NUMA
6556 struct sched_group **sched_group_nodes = NULL; 6552 struct sched_group **sched_group_nodes = NULL;
6557 int sd_allnodes = 0; 6553 int sd_allnodes = 0;
6558 6554
6559 /* 6555 /*
6560 * Allocate the per-node list of sched groups 6556 * Allocate the per-node list of sched groups
6561 */ 6557 */
6562 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 6558 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
6563 GFP_KERNEL); 6559 GFP_KERNEL);
6564 if (!sched_group_nodes) { 6560 if (!sched_group_nodes) {
6565 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6561 printk(KERN_WARNING "Can not alloc sched group node list\n");
6566 return -ENOMEM; 6562 return -ENOMEM;
6567 } 6563 }
6568 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6564 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6569 #endif 6565 #endif
6570 6566
6571 rd = alloc_rootdomain(); 6567 rd = alloc_rootdomain();
6572 if (!rd) { 6568 if (!rd) {
6573 printk(KERN_WARNING "Cannot alloc root domain\n"); 6569 printk(KERN_WARNING "Cannot alloc root domain\n");
6574 return -ENOMEM; 6570 return -ENOMEM;
6575 } 6571 }
6576 6572
6577 /* 6573 /*
6578 * Set up domains for cpus specified by the cpu_map. 6574 * Set up domains for cpus specified by the cpu_map.
6579 */ 6575 */
6580 for_each_cpu_mask(i, *cpu_map) { 6576 for_each_cpu_mask(i, *cpu_map) {
6581 struct sched_domain *sd = NULL, *p; 6577 struct sched_domain *sd = NULL, *p;
6582 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6578 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6583 6579
6584 cpus_and(nodemask, nodemask, *cpu_map); 6580 cpus_and(nodemask, nodemask, *cpu_map);
6585 6581
6586 #ifdef CONFIG_NUMA 6582 #ifdef CONFIG_NUMA
6587 if (cpus_weight(*cpu_map) > 6583 if (cpus_weight(*cpu_map) >
6588 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6584 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6589 sd = &per_cpu(allnodes_domains, i); 6585 sd = &per_cpu(allnodes_domains, i);
6590 *sd = SD_ALLNODES_INIT; 6586 *sd = SD_ALLNODES_INIT;
6591 sd->span = *cpu_map; 6587 sd->span = *cpu_map;
6592 cpu_to_allnodes_group(i, cpu_map, &sd->groups); 6588 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6593 p = sd; 6589 p = sd;
6594 sd_allnodes = 1; 6590 sd_allnodes = 1;
6595 } else 6591 } else
6596 p = NULL; 6592 p = NULL;
6597 6593
6598 sd = &per_cpu(node_domains, i); 6594 sd = &per_cpu(node_domains, i);
6599 *sd = SD_NODE_INIT; 6595 *sd = SD_NODE_INIT;
6600 sd->span = sched_domain_node_span(cpu_to_node(i)); 6596 sd->span = sched_domain_node_span(cpu_to_node(i));
6601 sd->parent = p; 6597 sd->parent = p;
6602 if (p) 6598 if (p)
6603 p->child = sd; 6599 p->child = sd;
6604 cpus_and(sd->span, sd->span, *cpu_map); 6600 cpus_and(sd->span, sd->span, *cpu_map);
6605 #endif 6601 #endif
6606 6602
6607 p = sd; 6603 p = sd;
6608 sd = &per_cpu(phys_domains, i); 6604 sd = &per_cpu(phys_domains, i);
6609 *sd = SD_CPU_INIT; 6605 *sd = SD_CPU_INIT;
6610 sd->span = nodemask; 6606 sd->span = nodemask;
6611 sd->parent = p; 6607 sd->parent = p;
6612 if (p) 6608 if (p)
6613 p->child = sd; 6609 p->child = sd;
6614 cpu_to_phys_group(i, cpu_map, &sd->groups); 6610 cpu_to_phys_group(i, cpu_map, &sd->groups);
6615 6611
6616 #ifdef CONFIG_SCHED_MC 6612 #ifdef CONFIG_SCHED_MC
6617 p = sd; 6613 p = sd;
6618 sd = &per_cpu(core_domains, i); 6614 sd = &per_cpu(core_domains, i);
6619 *sd = SD_MC_INIT; 6615 *sd = SD_MC_INIT;
6620 sd->span = cpu_coregroup_map(i); 6616 sd->span = cpu_coregroup_map(i);
6621 cpus_and(sd->span, sd->span, *cpu_map); 6617 cpus_and(sd->span, sd->span, *cpu_map);
6622 sd->parent = p; 6618 sd->parent = p;
6623 p->child = sd; 6619 p->child = sd;
6624 cpu_to_core_group(i, cpu_map, &sd->groups); 6620 cpu_to_core_group(i, cpu_map, &sd->groups);
6625 #endif 6621 #endif
6626 6622
6627 #ifdef CONFIG_SCHED_SMT 6623 #ifdef CONFIG_SCHED_SMT
6628 p = sd; 6624 p = sd;
6629 sd = &per_cpu(cpu_domains, i); 6625 sd = &per_cpu(cpu_domains, i);
6630 *sd = SD_SIBLING_INIT; 6626 *sd = SD_SIBLING_INIT;
6631 sd->span = per_cpu(cpu_sibling_map, i); 6627 sd->span = per_cpu(cpu_sibling_map, i);
6632 cpus_and(sd->span, sd->span, *cpu_map); 6628 cpus_and(sd->span, sd->span, *cpu_map);
6633 sd->parent = p; 6629 sd->parent = p;
6634 p->child = sd; 6630 p->child = sd;
6635 cpu_to_cpu_group(i, cpu_map, &sd->groups); 6631 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6636 #endif 6632 #endif
6637 } 6633 }
6638 6634
6639 #ifdef CONFIG_SCHED_SMT 6635 #ifdef CONFIG_SCHED_SMT
6640 /* Set up CPU (sibling) groups */ 6636 /* Set up CPU (sibling) groups */
6641 for_each_cpu_mask(i, *cpu_map) { 6637 for_each_cpu_mask(i, *cpu_map) {
6642 cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); 6638 cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i);
6643 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 6639 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
6644 if (i != first_cpu(this_sibling_map)) 6640 if (i != first_cpu(this_sibling_map))
6645 continue; 6641 continue;
6646 6642
6647 init_sched_build_groups(this_sibling_map, cpu_map, 6643 init_sched_build_groups(this_sibling_map, cpu_map,
6648 &cpu_to_cpu_group); 6644 &cpu_to_cpu_group);
6649 } 6645 }
6650 #endif 6646 #endif
6651 6647
6652 #ifdef CONFIG_SCHED_MC 6648 #ifdef CONFIG_SCHED_MC
6653 /* Set up multi-core groups */ 6649 /* Set up multi-core groups */
6654 for_each_cpu_mask(i, *cpu_map) { 6650 for_each_cpu_mask(i, *cpu_map) {
6655 cpumask_t this_core_map = cpu_coregroup_map(i); 6651 cpumask_t this_core_map = cpu_coregroup_map(i);
6656 cpus_and(this_core_map, this_core_map, *cpu_map); 6652 cpus_and(this_core_map, this_core_map, *cpu_map);
6657 if (i != first_cpu(this_core_map)) 6653 if (i != first_cpu(this_core_map))
6658 continue; 6654 continue;
6659 init_sched_build_groups(this_core_map, cpu_map, 6655 init_sched_build_groups(this_core_map, cpu_map,
6660 &cpu_to_core_group); 6656 &cpu_to_core_group);
6661 } 6657 }
6662 #endif 6658 #endif
6663 6659
6664 /* Set up physical groups */ 6660 /* Set up physical groups */
6665 for (i = 0; i < MAX_NUMNODES; i++) { 6661 for (i = 0; i < MAX_NUMNODES; i++) {
6666 cpumask_t nodemask = node_to_cpumask(i); 6662 cpumask_t nodemask = node_to_cpumask(i);
6667 6663
6668 cpus_and(nodemask, nodemask, *cpu_map); 6664 cpus_and(nodemask, nodemask, *cpu_map);
6669 if (cpus_empty(nodemask)) 6665 if (cpus_empty(nodemask))
6670 continue; 6666 continue;
6671 6667
6672 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); 6668 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6673 } 6669 }
6674 6670
6675 #ifdef CONFIG_NUMA 6671 #ifdef CONFIG_NUMA
6676 /* Set up node groups */ 6672 /* Set up node groups */
6677 if (sd_allnodes) 6673 if (sd_allnodes)
6678 init_sched_build_groups(*cpu_map, cpu_map, 6674 init_sched_build_groups(*cpu_map, cpu_map,
6679 &cpu_to_allnodes_group); 6675 &cpu_to_allnodes_group);
6680 6676
6681 for (i = 0; i < MAX_NUMNODES; i++) { 6677 for (i = 0; i < MAX_NUMNODES; i++) {
6682 /* Set up node groups */ 6678 /* Set up node groups */
6683 struct sched_group *sg, *prev; 6679 struct sched_group *sg, *prev;
6684 cpumask_t nodemask = node_to_cpumask(i); 6680 cpumask_t nodemask = node_to_cpumask(i);
6685 cpumask_t domainspan; 6681 cpumask_t domainspan;
6686 cpumask_t covered = CPU_MASK_NONE; 6682 cpumask_t covered = CPU_MASK_NONE;
6687 int j; 6683 int j;
6688 6684
6689 cpus_and(nodemask, nodemask, *cpu_map); 6685 cpus_and(nodemask, nodemask, *cpu_map);
6690 if (cpus_empty(nodemask)) { 6686 if (cpus_empty(nodemask)) {
6691 sched_group_nodes[i] = NULL; 6687 sched_group_nodes[i] = NULL;
6692 continue; 6688 continue;
6693 } 6689 }
6694 6690
6695 domainspan = sched_domain_node_span(i); 6691 domainspan = sched_domain_node_span(i);
6696 cpus_and(domainspan, domainspan, *cpu_map); 6692 cpus_and(domainspan, domainspan, *cpu_map);
6697 6693
6698 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 6694 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6699 if (!sg) { 6695 if (!sg) {
6700 printk(KERN_WARNING "Can not alloc domain group for " 6696 printk(KERN_WARNING "Can not alloc domain group for "
6701 "node %d\n", i); 6697 "node %d\n", i);
6702 goto error; 6698 goto error;
6703 } 6699 }
6704 sched_group_nodes[i] = sg; 6700 sched_group_nodes[i] = sg;
6705 for_each_cpu_mask(j, nodemask) { 6701 for_each_cpu_mask(j, nodemask) {
6706 struct sched_domain *sd; 6702 struct sched_domain *sd;
6707 6703
6708 sd = &per_cpu(node_domains, j); 6704 sd = &per_cpu(node_domains, j);
6709 sd->groups = sg; 6705 sd->groups = sg;
6710 } 6706 }
6711 sg->__cpu_power = 0; 6707 sg->__cpu_power = 0;
6712 sg->cpumask = nodemask; 6708 sg->cpumask = nodemask;
6713 sg->next = sg; 6709 sg->next = sg;
6714 cpus_or(covered, covered, nodemask); 6710 cpus_or(covered, covered, nodemask);
6715 prev = sg; 6711 prev = sg;
6716 6712
6717 for (j = 0; j < MAX_NUMNODES; j++) { 6713 for (j = 0; j < MAX_NUMNODES; j++) {
6718 cpumask_t tmp, notcovered; 6714 cpumask_t tmp, notcovered;
6719 int n = (i + j) % MAX_NUMNODES; 6715 int n = (i + j) % MAX_NUMNODES;
6720 6716
6721 cpus_complement(notcovered, covered); 6717 cpus_complement(notcovered, covered);
6722 cpus_and(tmp, notcovered, *cpu_map); 6718 cpus_and(tmp, notcovered, *cpu_map);
6723 cpus_and(tmp, tmp, domainspan); 6719 cpus_and(tmp, tmp, domainspan);
6724 if (cpus_empty(tmp)) 6720 if (cpus_empty(tmp))
6725 break; 6721 break;
6726 6722
6727 nodemask = node_to_cpumask(n); 6723 nodemask = node_to_cpumask(n);
6728 cpus_and(tmp, tmp, nodemask); 6724 cpus_and(tmp, tmp, nodemask);
6729 if (cpus_empty(tmp)) 6725 if (cpus_empty(tmp))
6730 continue; 6726 continue;
6731 6727
6732 sg = kmalloc_node(sizeof(struct sched_group), 6728 sg = kmalloc_node(sizeof(struct sched_group),
6733 GFP_KERNEL, i); 6729 GFP_KERNEL, i);
6734 if (!sg) { 6730 if (!sg) {
6735 printk(KERN_WARNING 6731 printk(KERN_WARNING
6736 "Can not alloc domain group for node %d\n", j); 6732 "Can not alloc domain group for node %d\n", j);
6737 goto error; 6733 goto error;
6738 } 6734 }
6739 sg->__cpu_power = 0; 6735 sg->__cpu_power = 0;
6740 sg->cpumask = tmp; 6736 sg->cpumask = tmp;
6741 sg->next = prev->next; 6737 sg->next = prev->next;
6742 cpus_or(covered, covered, tmp); 6738 cpus_or(covered, covered, tmp);
6743 prev->next = sg; 6739 prev->next = sg;
6744 prev = sg; 6740 prev = sg;
6745 } 6741 }
6746 } 6742 }
6747 #endif 6743 #endif
6748 6744
6749 /* Calculate CPU power for physical packages and nodes */ 6745 /* Calculate CPU power for physical packages and nodes */
6750 #ifdef CONFIG_SCHED_SMT 6746 #ifdef CONFIG_SCHED_SMT
6751 for_each_cpu_mask(i, *cpu_map) { 6747 for_each_cpu_mask(i, *cpu_map) {
6752 struct sched_domain *sd = &per_cpu(cpu_domains, i); 6748 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6753 6749
6754 init_sched_groups_power(i, sd); 6750 init_sched_groups_power(i, sd);
6755 } 6751 }
6756 #endif 6752 #endif
6757 #ifdef CONFIG_SCHED_MC 6753 #ifdef CONFIG_SCHED_MC
6758 for_each_cpu_mask(i, *cpu_map) { 6754 for_each_cpu_mask(i, *cpu_map) {
6759 struct sched_domain *sd = &per_cpu(core_domains, i); 6755 struct sched_domain *sd = &per_cpu(core_domains, i);
6760 6756
6761 init_sched_groups_power(i, sd); 6757 init_sched_groups_power(i, sd);
6762 } 6758 }
6763 #endif 6759 #endif
6764 6760
6765 for_each_cpu_mask(i, *cpu_map) { 6761 for_each_cpu_mask(i, *cpu_map) {
6766 struct sched_domain *sd = &per_cpu(phys_domains, i); 6762 struct sched_domain *sd = &per_cpu(phys_domains, i);
6767 6763
6768 init_sched_groups_power(i, sd); 6764 init_sched_groups_power(i, sd);
6769 } 6765 }
6770 6766
6771 #ifdef CONFIG_NUMA 6767 #ifdef CONFIG_NUMA
6772 for (i = 0; i < MAX_NUMNODES; i++) 6768 for (i = 0; i < MAX_NUMNODES; i++)
6773 init_numa_sched_groups_power(sched_group_nodes[i]); 6769 init_numa_sched_groups_power(sched_group_nodes[i]);
6774 6770
6775 if (sd_allnodes) { 6771 if (sd_allnodes) {
6776 struct sched_group *sg; 6772 struct sched_group *sg;
6777 6773
6778 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); 6774 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6779 init_numa_sched_groups_power(sg); 6775 init_numa_sched_groups_power(sg);
6780 } 6776 }
6781 #endif 6777 #endif
6782 6778
6783 /* Attach the domains */ 6779 /* Attach the domains */
6784 for_each_cpu_mask(i, *cpu_map) { 6780 for_each_cpu_mask(i, *cpu_map) {
6785 struct sched_domain *sd; 6781 struct sched_domain *sd;
6786 #ifdef CONFIG_SCHED_SMT 6782 #ifdef CONFIG_SCHED_SMT
6787 sd = &per_cpu(cpu_domains, i); 6783 sd = &per_cpu(cpu_domains, i);
6788 #elif defined(CONFIG_SCHED_MC) 6784 #elif defined(CONFIG_SCHED_MC)
6789 sd = &per_cpu(core_domains, i); 6785 sd = &per_cpu(core_domains, i);
6790 #else 6786 #else
6791 sd = &per_cpu(phys_domains, i); 6787 sd = &per_cpu(phys_domains, i);
6792 #endif 6788 #endif
6793 cpu_attach_domain(sd, rd, i); 6789 cpu_attach_domain(sd, rd, i);
6794 } 6790 }
6795 6791
6796 return 0; 6792 return 0;
6797 6793
6798 #ifdef CONFIG_NUMA 6794 #ifdef CONFIG_NUMA
6799 error: 6795 error:
6800 free_sched_groups(cpu_map); 6796 free_sched_groups(cpu_map);
6801 return -ENOMEM; 6797 return -ENOMEM;
6802 #endif 6798 #endif
6803 } 6799 }
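The NUMA branch above chains each node's sched_groups into a circular singly linked list through sg->next: the first group points back at itself, and every later group is spliced in directly after the previous one. A minimal standalone sketch of that linking pattern (struct group and the helper names are hypothetical, not part of this patch):

	struct group {
		int id;
		struct group *next;
	};

	/* first group for a node: a ring of one */
	static void link_first(struct group *g)
	{
		g->next = g;
	}

	/* splice g into the ring directly after prev */
	static void link_after(struct group *prev, struct group *g)
	{
		g->next = prev->next;
		prev->next = g;
	}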
6804 6800
6805 static cpumask_t *doms_cur; /* current sched domains */ 6801 static cpumask_t *doms_cur; /* current sched domains */
6806 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6802 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6807 6803
6808 /* 6804 /*
6809 * Special case: If a kmalloc of a doms_cur partition (array of 6805 * Special case: If a kmalloc of a doms_cur partition (array of
6810 * cpumask_t) fails, then fall back to a single sched domain, 6806 * cpumask_t) fails, then fall back to a single sched domain,
6811 * as determined by the single cpumask_t fallback_doms. 6807 * as determined by the single cpumask_t fallback_doms.
6812 */ 6808 */
6813 static cpumask_t fallback_doms; 6809 static cpumask_t fallback_doms;
6814 6810
6815 /* 6811 /*
6816 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6812 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6817 * For now this just excludes isolated cpus, but could be used to 6813 * For now this just excludes isolated cpus, but could be used to
6818 * exclude other special cases in the future. 6814 * exclude other special cases in the future.
6819 */ 6815 */
6820 static int arch_init_sched_domains(const cpumask_t *cpu_map) 6816 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6821 { 6817 {
6822 int err; 6818 int err;
6823 6819
6824 ndoms_cur = 1; 6820 ndoms_cur = 1;
6825 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6821 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6826 if (!doms_cur) 6822 if (!doms_cur)
6827 doms_cur = &fallback_doms; 6823 doms_cur = &fallback_doms;
6828 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 6824 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
6829 err = build_sched_domains(doms_cur); 6825 err = build_sched_domains(doms_cur);
6830 register_sched_domain_sysctl(); 6826 register_sched_domain_sysctl();
6831 6827
6832 return err; 6828 return err;
6833 } 6829 }
6834 6830
6835 static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6831 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6836 { 6832 {
6837 free_sched_groups(cpu_map); 6833 free_sched_groups(cpu_map);
6838 } 6834 }
6839 6835
6840 /* 6836 /*
6841 * Detach sched domains from a group of cpus specified in cpu_map 6837 * Detach sched domains from a group of cpus specified in cpu_map
6842 * These cpus will now be attached to the NULL domain 6838 * These cpus will now be attached to the NULL domain
6843 */ 6839 */
6844 static void detach_destroy_domains(const cpumask_t *cpu_map) 6840 static void detach_destroy_domains(const cpumask_t *cpu_map)
6845 { 6841 {
6846 int i; 6842 int i;
6847 6843
6848 unregister_sched_domain_sysctl(); 6844 unregister_sched_domain_sysctl();
6849 6845
6850 for_each_cpu_mask(i, *cpu_map) 6846 for_each_cpu_mask(i, *cpu_map)
6851 cpu_attach_domain(NULL, &def_root_domain, i); 6847 cpu_attach_domain(NULL, &def_root_domain, i);
6852 synchronize_sched(); 6848 synchronize_sched();
6853 arch_destroy_sched_domains(cpu_map); 6849 arch_destroy_sched_domains(cpu_map);
6854 } 6850 }
6855 6851
6856 /* 6852 /*
6857 * Partition sched domains as specified by the 'ndoms_new' 6853 * Partition sched domains as specified by the 'ndoms_new'
6858 * cpumasks in the array doms_new[] of cpumasks. This compares 6854 * cpumasks in the array doms_new[] of cpumasks. This compares
6859 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6855 * doms_new[] to the current sched domain partitioning, doms_cur[].
6860 * It destroys each deleted domain and builds each new domain. 6856 * It destroys each deleted domain and builds each new domain.
6861 * 6857 *
6862 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 6858 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
6863 * The masks don't intersect (don't overlap). We should set up one 6859 * The masks don't intersect (don't overlap). We should set up one
6864 * sched domain for each mask. CPUs not in any of the cpumasks will 6860 * sched domain for each mask. CPUs not in any of the cpumasks will
6865 * not be load balanced. If the same cpumask appears both in the 6861 * not be load balanced. If the same cpumask appears both in the
6866 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6862 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6867 * it as it is. 6863 * it as it is.
6868 * 6864 *
6869 * The passed in 'doms_new' should be kmalloc'd. This routine takes 6865 * The passed in 'doms_new' should be kmalloc'd. This routine takes
6870 * ownership of it and will kfree it when done with it. If the caller 6866 * ownership of it and will kfree it when done with it. If the caller
6871 * failed the kmalloc call, then it can pass in doms_new == NULL, 6867 * failed the kmalloc call, then it can pass in doms_new == NULL,
6872 * and partition_sched_domains() will fall back to the single partition 6868 * and partition_sched_domains() will fall back to the single partition
6873 * 'fallback_doms'. 6869 * 'fallback_doms'.
6874 * 6870 *
6875 * Call with hotplug lock held 6871 * Call with hotplug lock held
6876 */ 6872 */
6877 void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) 6873 void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6878 { 6874 {
6879 int i, j; 6875 int i, j;
6880 6876
6881 lock_doms_cur(); 6877 lock_doms_cur();
6882 6878
6883 /* always unregister in case we don't destroy any domains */ 6879 /* always unregister in case we don't destroy any domains */
6884 unregister_sched_domain_sysctl(); 6880 unregister_sched_domain_sysctl();
6885 6881
6886 if (doms_new == NULL) { 6882 if (doms_new == NULL) {
6887 ndoms_new = 1; 6883 ndoms_new = 1;
6888 doms_new = &fallback_doms; 6884 doms_new = &fallback_doms;
6889 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 6885 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
6890 } 6886 }
6891 6887
6892 /* Destroy deleted domains */ 6888 /* Destroy deleted domains */
6893 for (i = 0; i < ndoms_cur; i++) { 6889 for (i = 0; i < ndoms_cur; i++) {
6894 for (j = 0; j < ndoms_new; j++) { 6890 for (j = 0; j < ndoms_new; j++) {
6895 if (cpus_equal(doms_cur[i], doms_new[j])) 6891 if (cpus_equal(doms_cur[i], doms_new[j]))
6896 goto match1; 6892 goto match1;
6897 } 6893 }
6898 /* no match - a current sched domain not in new doms_new[] */ 6894 /* no match - a current sched domain not in new doms_new[] */
6899 detach_destroy_domains(doms_cur + i); 6895 detach_destroy_domains(doms_cur + i);
6900 match1: 6896 match1:
6901 ; 6897 ;
6902 } 6898 }
6903 6899
6904 /* Build new domains */ 6900 /* Build new domains */
6905 for (i = 0; i < ndoms_new; i++) { 6901 for (i = 0; i < ndoms_new; i++) {
6906 for (j = 0; j < ndoms_cur; j++) { 6902 for (j = 0; j < ndoms_cur; j++) {
6907 if (cpus_equal(doms_new[i], doms_cur[j])) 6903 if (cpus_equal(doms_new[i], doms_cur[j]))
6908 goto match2; 6904 goto match2;
6909 } 6905 }
6910 /* no match - add a new doms_new */ 6906 /* no match - add a new doms_new */
6911 build_sched_domains(doms_new + i); 6907 build_sched_domains(doms_new + i);
6912 match2: 6908 match2:
6913 ; 6909 ;
6914 } 6910 }
6915 6911
6916 /* Remember the new sched domains */ 6912 /* Remember the new sched domains */
6917 if (doms_cur != &fallback_doms) 6913 if (doms_cur != &fallback_doms)
6918 kfree(doms_cur); 6914 kfree(doms_cur);
6919 doms_cur = doms_new; 6915 doms_cur = doms_new;
6920 ndoms_cur = ndoms_new; 6916 ndoms_cur = ndoms_new;
6921 6917
6922 register_sched_domain_sysctl(); 6918 register_sched_domain_sysctl();
6923 6919
6924 unlock_doms_cur(); 6920 unlock_doms_cur();
6925 } 6921 }
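As the block comment above spells out, doms_new must be kmalloc'd and ownership passes to the scheduler, while a NULL argument selects the single fallback_doms partition. A hedged sketch of a caller honouring that contract (example_repartition and the two masks are hypothetical):

	/* Split load balancing into two independent partitions.
	 * Caller holds the hotplug lock, as required above. */
	static void example_repartition(cpumask_t a, cpumask_t b)
	{
		cpumask_t *doms = kmalloc(2 * sizeof(cpumask_t), GFP_KERNEL);

		if (!doms) {
			/* NULL => fall back to the single fallback_doms */
			partition_sched_domains(1, NULL);
			return;
		}
		doms[0] = a;
		doms[1] = b;
		/* the scheduler now owns doms and will kfree() it later */
		partition_sched_domains(2, doms);
	}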
6926 6922
6927 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6923 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6928 static int arch_reinit_sched_domains(void) 6924 static int arch_reinit_sched_domains(void)
6929 { 6925 {
6930 int err; 6926 int err;
6931 6927
6932 get_online_cpus(); 6928 get_online_cpus();
6933 detach_destroy_domains(&cpu_online_map); 6929 detach_destroy_domains(&cpu_online_map);
6934 err = arch_init_sched_domains(&cpu_online_map); 6930 err = arch_init_sched_domains(&cpu_online_map);
6935 put_online_cpus(); 6931 put_online_cpus();
6936 6932
6937 return err; 6933 return err;
6938 } 6934 }
6939 6935
6940 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 6936 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6941 { 6937 {
6942 int ret; 6938 int ret;
6943 6939
6944 if (buf[0] != '0' && buf[0] != '1') 6940 if (buf[0] != '0' && buf[0] != '1')
6945 return -EINVAL; 6941 return -EINVAL;
6946 6942
6947 if (smt) 6943 if (smt)
6948 sched_smt_power_savings = (buf[0] == '1'); 6944 sched_smt_power_savings = (buf[0] == '1');
6949 else 6945 else
6950 sched_mc_power_savings = (buf[0] == '1'); 6946 sched_mc_power_savings = (buf[0] == '1');
6951 6947
6952 ret = arch_reinit_sched_domains(); 6948 ret = arch_reinit_sched_domains();
6953 6949
6954 return ret ? ret : count; 6950 return ret ? ret : count;
6955 } 6951 }
6956 6952
6957 #ifdef CONFIG_SCHED_MC 6953 #ifdef CONFIG_SCHED_MC
6958 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 6954 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6959 { 6955 {
6960 return sprintf(page, "%u\n", sched_mc_power_savings); 6956 return sprintf(page, "%u\n", sched_mc_power_savings);
6961 } 6957 }
6962 static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 6958 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6963 const char *buf, size_t count) 6959 const char *buf, size_t count)
6964 { 6960 {
6965 return sched_power_savings_store(buf, count, 0); 6961 return sched_power_savings_store(buf, count, 0);
6966 } 6962 }
6967 static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 6963 static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6968 sched_mc_power_savings_store); 6964 sched_mc_power_savings_store);
6969 #endif 6965 #endif
6970 6966
6971 #ifdef CONFIG_SCHED_SMT 6967 #ifdef CONFIG_SCHED_SMT
6972 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) 6968 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6973 { 6969 {
6974 return sprintf(page, "%u\n", sched_smt_power_savings); 6970 return sprintf(page, "%u\n", sched_smt_power_savings);
6975 } 6971 }
6976 static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 6972 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6977 const char *buf, size_t count) 6973 const char *buf, size_t count)
6978 { 6974 {
6979 return sched_power_savings_store(buf, count, 1); 6975 return sched_power_savings_store(buf, count, 1);
6980 } 6976 }
6981 static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 6977 static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6982 sched_smt_power_savings_store); 6978 sched_smt_power_savings_store);
6983 #endif 6979 #endif
6984 6980
6985 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6981 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6986 { 6982 {
6987 int err = 0; 6983 int err = 0;
6988 6984
6989 #ifdef CONFIG_SCHED_SMT 6985 #ifdef CONFIG_SCHED_SMT
6990 if (smt_capable()) 6986 if (smt_capable())
6991 err = sysfs_create_file(&cls->kset.kobj, 6987 err = sysfs_create_file(&cls->kset.kobj,
6992 &attr_sched_smt_power_savings.attr); 6988 &attr_sched_smt_power_savings.attr);
6993 #endif 6989 #endif
6994 #ifdef CONFIG_SCHED_MC 6990 #ifdef CONFIG_SCHED_MC
6995 if (!err && mc_capable()) 6991 if (!err && mc_capable())
6996 err = sysfs_create_file(&cls->kset.kobj, 6992 err = sysfs_create_file(&cls->kset.kobj,
6997 &attr_sched_mc_power_savings.attr); 6993 &attr_sched_mc_power_savings.attr);
6998 #endif 6994 #endif
6999 return err; 6995 return err;
7000 } 6996 }
7001 #endif 6997 #endif
7002 6998
7003 /* 6999 /*
7004 * Force a reinitialization of the sched domains hierarchy. The domains 7000 * Force a reinitialization of the sched domains hierarchy. The domains
7005 * and groups cannot be updated in place without racing with the balancing 7001 * and groups cannot be updated in place without racing with the balancing
7006 * code, so we temporarily attach all running cpus to the NULL domain 7002 * code, so we temporarily attach all running cpus to the NULL domain
7007 * which will prevent rebalancing while the sched domains are recalculated. 7003 * which will prevent rebalancing while the sched domains are recalculated.
7008 */ 7004 */
7009 static int update_sched_domains(struct notifier_block *nfb, 7005 static int update_sched_domains(struct notifier_block *nfb,
7010 unsigned long action, void *hcpu) 7006 unsigned long action, void *hcpu)
7011 { 7007 {
7012 switch (action) { 7008 switch (action) {
7013 case CPU_UP_PREPARE: 7009 case CPU_UP_PREPARE:
7014 case CPU_UP_PREPARE_FROZEN: 7010 case CPU_UP_PREPARE_FROZEN:
7015 case CPU_DOWN_PREPARE: 7011 case CPU_DOWN_PREPARE:
7016 case CPU_DOWN_PREPARE_FROZEN: 7012 case CPU_DOWN_PREPARE_FROZEN:
7017 detach_destroy_domains(&cpu_online_map); 7013 detach_destroy_domains(&cpu_online_map);
7018 return NOTIFY_OK; 7014 return NOTIFY_OK;
7019 7015
7020 case CPU_UP_CANCELED: 7016 case CPU_UP_CANCELED:
7021 case CPU_UP_CANCELED_FROZEN: 7017 case CPU_UP_CANCELED_FROZEN:
7022 case CPU_DOWN_FAILED: 7018 case CPU_DOWN_FAILED:
7023 case CPU_DOWN_FAILED_FROZEN: 7019 case CPU_DOWN_FAILED_FROZEN:
7024 case CPU_ONLINE: 7020 case CPU_ONLINE:
7025 case CPU_ONLINE_FROZEN: 7021 case CPU_ONLINE_FROZEN:
7026 case CPU_DEAD: 7022 case CPU_DEAD:
7027 case CPU_DEAD_FROZEN: 7023 case CPU_DEAD_FROZEN:
7028 /* 7024 /*
7029 * Fall through and re-initialise the domains. 7025 * Fall through and re-initialise the domains.
7030 */ 7026 */
7031 break; 7027 break;
7032 default: 7028 default:
7033 return NOTIFY_DONE; 7029 return NOTIFY_DONE;
7034 } 7030 }
7035 7031
7036 /* The hotplug lock is already held by cpu_up/cpu_down */ 7032 /* The hotplug lock is already held by cpu_up/cpu_down */
7037 arch_init_sched_domains(&cpu_online_map); 7033 arch_init_sched_domains(&cpu_online_map);
7038 7034
7039 return NOTIFY_OK; 7035 return NOTIFY_OK;
7040 } 7036 }
7041 7037
7042 void __init sched_init_smp(void) 7038 void __init sched_init_smp(void)
7043 { 7039 {
7044 cpumask_t non_isolated_cpus; 7040 cpumask_t non_isolated_cpus;
7045 7041
7046 get_online_cpus(); 7042 get_online_cpus();
7047 arch_init_sched_domains(&cpu_online_map); 7043 arch_init_sched_domains(&cpu_online_map);
7048 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7044 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7049 if (cpus_empty(non_isolated_cpus)) 7045 if (cpus_empty(non_isolated_cpus))
7050 cpu_set(smp_processor_id(), non_isolated_cpus); 7046 cpu_set(smp_processor_id(), non_isolated_cpus);
7051 put_online_cpus(); 7047 put_online_cpus();
7052 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7048 /* XXX: Theoretical race here - CPU may be hotplugged now */
7053 hotcpu_notifier(update_sched_domains, 0); 7049 hotcpu_notifier(update_sched_domains, 0);
7054 7050
7055 /* Move init over to a non-isolated CPU */ 7051 /* Move init over to a non-isolated CPU */
7056 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7052 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7057 BUG(); 7053 BUG();
7058 sched_init_granularity(); 7054 sched_init_granularity();
7059 7055
7060 #ifdef CONFIG_FAIR_GROUP_SCHED 7056 #ifdef CONFIG_FAIR_GROUP_SCHED
7061 if (nr_cpu_ids == 1) 7057 if (nr_cpu_ids == 1)
7062 return; 7058 return;
7063 7059
7064 lb_monitor_task = kthread_create(load_balance_monitor, NULL, 7060 lb_monitor_task = kthread_create(load_balance_monitor, NULL,
7065 "group_balance"); 7061 "group_balance");
7066 if (!IS_ERR(lb_monitor_task)) { 7062 if (!IS_ERR(lb_monitor_task)) {
7067 lb_monitor_task->flags |= PF_NOFREEZE; 7063 lb_monitor_task->flags |= PF_NOFREEZE;
7068 wake_up_process(lb_monitor_task); 7064 wake_up_process(lb_monitor_task);
7069 } else { 7065 } else {
7070 printk(KERN_ERR "Could not create load balance monitor thread" 7066 printk(KERN_ERR "Could not create load balance monitor thread"
7071 "(error = %ld) \n", PTR_ERR(lb_monitor_task)); 7067 "(error = %ld) \n", PTR_ERR(lb_monitor_task));
7072 } 7068 }
7073 #endif 7069 #endif
7074 } 7070 }
7075 #else 7071 #else
7076 void __init sched_init_smp(void) 7072 void __init sched_init_smp(void)
7077 { 7073 {
7078 sched_init_granularity(); 7074 sched_init_granularity();
7079 } 7075 }
7080 #endif /* CONFIG_SMP */ 7076 #endif /* CONFIG_SMP */
7081 7077
7082 int in_sched_functions(unsigned long addr) 7078 int in_sched_functions(unsigned long addr)
7083 { 7079 {
7084 return in_lock_functions(addr) || 7080 return in_lock_functions(addr) ||
7085 (addr >= (unsigned long)__sched_text_start 7081 (addr >= (unsigned long)__sched_text_start
7086 && addr < (unsigned long)__sched_text_end); 7082 && addr < (unsigned long)__sched_text_end);
7087 } 7083 }
7088 7084
7089 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7085 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7090 { 7086 {
7091 cfs_rq->tasks_timeline = RB_ROOT; 7087 cfs_rq->tasks_timeline = RB_ROOT;
7092 #ifdef CONFIG_FAIR_GROUP_SCHED 7088 #ifdef CONFIG_FAIR_GROUP_SCHED
7093 cfs_rq->rq = rq; 7089 cfs_rq->rq = rq;
7094 #endif 7090 #endif
7095 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7091 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7096 } 7092 }
7097 7093
7098 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7094 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7099 { 7095 {
7100 struct rt_prio_array *array; 7096 struct rt_prio_array *array;
7101 int i; 7097 int i;
7102 7098
7103 array = &rt_rq->active; 7099 array = &rt_rq->active;
7104 for (i = 0; i < MAX_RT_PRIO; i++) { 7100 for (i = 0; i < MAX_RT_PRIO; i++) {
7105 INIT_LIST_HEAD(array->queue + i); 7101 INIT_LIST_HEAD(array->queue + i);
7106 __clear_bit(i, array->bitmap); 7102 __clear_bit(i, array->bitmap);
7107 } 7103 }
7108 /* delimiter for bitsearch: */ 7104 /* delimiter for bitsearch: */
7109 __set_bit(MAX_RT_PRIO, array->bitmap); 7105 __set_bit(MAX_RT_PRIO, array->bitmap);
7110 7106
7111 #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED 7107 #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
7112 rt_rq->highest_prio = MAX_RT_PRIO; 7108 rt_rq->highest_prio = MAX_RT_PRIO;
7113 #endif 7109 #endif
7114 #ifdef CONFIG_SMP 7110 #ifdef CONFIG_SMP
7115 rt_rq->rt_nr_migratory = 0; 7111 rt_rq->rt_nr_migratory = 0;
7116 rt_rq->overloaded = 0; 7112 rt_rq->overloaded = 0;
7117 #endif 7113 #endif
7118 7114
7119 rt_rq->rt_time = 0; 7115 rt_rq->rt_time = 0;
7120 rt_rq->rt_throttled = 0; 7116 rt_rq->rt_throttled = 0;
7121 7117
7122 #ifdef CONFIG_FAIR_GROUP_SCHED 7118 #ifdef CONFIG_FAIR_GROUP_SCHED
7123 rt_rq->rq = rq; 7119 rt_rq->rq = rq;
7124 #endif 7120 #endif
7125 } 7121 }
7126 7122
7127 #ifdef CONFIG_FAIR_GROUP_SCHED 7123 #ifdef CONFIG_FAIR_GROUP_SCHED
7128 static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, 7124 static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7129 struct cfs_rq *cfs_rq, struct sched_entity *se, 7125 struct cfs_rq *cfs_rq, struct sched_entity *se,
7130 int cpu, int add) 7126 int cpu, int add)
7131 { 7127 {
7132 tg->cfs_rq[cpu] = cfs_rq; 7128 tg->cfs_rq[cpu] = cfs_rq;
7133 init_cfs_rq(cfs_rq, rq); 7129 init_cfs_rq(cfs_rq, rq);
7134 cfs_rq->tg = tg; 7130 cfs_rq->tg = tg;
7135 if (add) 7131 if (add)
7136 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7132 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7137 7133
7138 tg->se[cpu] = se; 7134 tg->se[cpu] = se;
7139 se->cfs_rq = &rq->cfs; 7135 se->cfs_rq = &rq->cfs;
7140 se->my_q = cfs_rq; 7136 se->my_q = cfs_rq;
7141 se->load.weight = tg->shares; 7137 se->load.weight = tg->shares;
7142 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 7138 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7143 se->parent = NULL; 7139 se->parent = NULL;
7144 } 7140 }
7145 7141
7146 static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, 7142 static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7147 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 7143 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7148 int cpu, int add) 7144 int cpu, int add)
7149 { 7145 {
7150 tg->rt_rq[cpu] = rt_rq; 7146 tg->rt_rq[cpu] = rt_rq;
7151 init_rt_rq(rt_rq, rq); 7147 init_rt_rq(rt_rq, rq);
7152 rt_rq->tg = tg; 7148 rt_rq->tg = tg;
7153 rt_rq->rt_se = rt_se; 7149 rt_rq->rt_se = rt_se;
7154 if (add) 7150 if (add)
7155 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7151 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7156 7152
7157 tg->rt_se[cpu] = rt_se; 7153 tg->rt_se[cpu] = rt_se;
7158 rt_se->rt_rq = &rq->rt; 7154 rt_se->rt_rq = &rq->rt;
7159 rt_se->my_q = rt_rq; 7155 rt_se->my_q = rt_rq;
7160 rt_se->parent = NULL; 7156 rt_se->parent = NULL;
7161 INIT_LIST_HEAD(&rt_se->run_list); 7157 INIT_LIST_HEAD(&rt_se->run_list);
7162 } 7158 }
7163 #endif 7159 #endif
7164 7160
7165 void __init sched_init(void) 7161 void __init sched_init(void)
7166 { 7162 {
7167 int highest_cpu = 0; 7163 int highest_cpu = 0;
7168 int i, j; 7164 int i, j;
7169 7165
7170 #ifdef CONFIG_SMP 7166 #ifdef CONFIG_SMP
7171 init_defrootdomain(); 7167 init_defrootdomain();
7172 #endif 7168 #endif
7173 7169
7174 #ifdef CONFIG_FAIR_GROUP_SCHED 7170 #ifdef CONFIG_FAIR_GROUP_SCHED
7175 list_add(&init_task_group.list, &task_groups); 7171 list_add(&init_task_group.list, &task_groups);
7176 #endif 7172 #endif
7177 7173
7178 for_each_possible_cpu(i) { 7174 for_each_possible_cpu(i) {
7179 struct rq *rq; 7175 struct rq *rq;
7180 7176
7181 rq = cpu_rq(i); 7177 rq = cpu_rq(i);
7182 spin_lock_init(&rq->lock); 7178 spin_lock_init(&rq->lock);
7183 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 7179 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7184 rq->nr_running = 0; 7180 rq->nr_running = 0;
7185 rq->clock = 1; 7181 rq->clock = 1;
7186 init_cfs_rq(&rq->cfs, rq); 7182 init_cfs_rq(&rq->cfs, rq);
7187 init_rt_rq(&rq->rt, rq); 7183 init_rt_rq(&rq->rt, rq);
7188 #ifdef CONFIG_FAIR_GROUP_SCHED 7184 #ifdef CONFIG_FAIR_GROUP_SCHED
7189 init_task_group.shares = init_task_group_load; 7185 init_task_group.shares = init_task_group_load;
7190 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7186 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7191 init_tg_cfs_entry(rq, &init_task_group, 7187 init_tg_cfs_entry(rq, &init_task_group,
7192 &per_cpu(init_cfs_rq, i), 7188 &per_cpu(init_cfs_rq, i),
7193 &per_cpu(init_sched_entity, i), i, 1); 7189 &per_cpu(init_sched_entity, i), i, 1);
7194 7190
7195 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ 7191 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
7196 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7192 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7197 init_tg_rt_entry(rq, &init_task_group, 7193 init_tg_rt_entry(rq, &init_task_group,
7198 &per_cpu(init_rt_rq, i), 7194 &per_cpu(init_rt_rq, i),
7199 &per_cpu(init_sched_rt_entity, i), i, 1); 7195 &per_cpu(init_sched_rt_entity, i), i, 1);
7200 #endif 7196 #endif
7201 rq->rt_period_expire = 0; 7197 rq->rt_period_expire = 0;
7202 rq->rt_throttled = 0; 7198 rq->rt_throttled = 0;
7203 7199
7204 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7200 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7205 rq->cpu_load[j] = 0; 7201 rq->cpu_load[j] = 0;
7206 #ifdef CONFIG_SMP 7202 #ifdef CONFIG_SMP
7207 rq->sd = NULL; 7203 rq->sd = NULL;
7208 rq->rd = NULL; 7204 rq->rd = NULL;
7209 rq->active_balance = 0; 7205 rq->active_balance = 0;
7210 rq->next_balance = jiffies; 7206 rq->next_balance = jiffies;
7211 rq->push_cpu = 0; 7207 rq->push_cpu = 0;
7212 rq->cpu = i; 7208 rq->cpu = i;
7213 rq->migration_thread = NULL; 7209 rq->migration_thread = NULL;
7214 INIT_LIST_HEAD(&rq->migration_queue); 7210 INIT_LIST_HEAD(&rq->migration_queue);
7215 rq_attach_root(rq, &def_root_domain); 7211 rq_attach_root(rq, &def_root_domain);
7216 #endif 7212 #endif
7217 init_rq_hrtick(rq); 7213 init_rq_hrtick(rq);
7218 atomic_set(&rq->nr_iowait, 0); 7214 atomic_set(&rq->nr_iowait, 0);
7219 highest_cpu = i; 7215 highest_cpu = i;
7220 } 7216 }
7221 7217
7222 set_load_weight(&init_task); 7218 set_load_weight(&init_task);
7223 7219
7224 #ifdef CONFIG_PREEMPT_NOTIFIERS 7220 #ifdef CONFIG_PREEMPT_NOTIFIERS
7225 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7221 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7226 #endif 7222 #endif
7227 7223
7228 #ifdef CONFIG_SMP 7224 #ifdef CONFIG_SMP
7229 nr_cpu_ids = highest_cpu + 1; 7225 nr_cpu_ids = highest_cpu + 1;
7230 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 7226 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
7231 #endif 7227 #endif
7232 7228
7233 #ifdef CONFIG_RT_MUTEXES 7229 #ifdef CONFIG_RT_MUTEXES
7234 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 7230 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
7235 #endif 7231 #endif
7236 7232
7237 /* 7233 /*
7238 * The boot idle thread does lazy MMU switching as well: 7234 * The boot idle thread does lazy MMU switching as well:
7239 */ 7235 */
7240 atomic_inc(&init_mm.mm_count); 7236 atomic_inc(&init_mm.mm_count);
7241 enter_lazy_tlb(&init_mm, current); 7237 enter_lazy_tlb(&init_mm, current);
7242 7238
7243 /* 7239 /*
7244 * Make us the idle thread. Technically, schedule() should not be 7240 * Make us the idle thread. Technically, schedule() should not be
7245 * called from this thread, however somewhere below it might be, 7241 * called from this thread, however somewhere below it might be,
7246 * but because we are the idle thread, we just pick up running again 7242 * but because we are the idle thread, we just pick up running again
7247 * when this runqueue becomes "idle". 7243 * when this runqueue becomes "idle".
7248 */ 7244 */
7249 init_idle(current, smp_processor_id()); 7245 init_idle(current, smp_processor_id());
7250 /* 7246 /*
7251 * During early bootup we pretend to be a normal task: 7247 * During early bootup we pretend to be a normal task:
7252 */ 7248 */
7253 current->sched_class = &fair_sched_class; 7249 current->sched_class = &fair_sched_class;
7254 } 7250 }
7255 7251
7256 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 7252 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
7257 void __might_sleep(char *file, int line) 7253 void __might_sleep(char *file, int line)
7258 { 7254 {
7259 #ifdef in_atomic 7255 #ifdef in_atomic
7260 static unsigned long prev_jiffy; /* ratelimiting */ 7256 static unsigned long prev_jiffy; /* ratelimiting */
7261 7257
7262 if ((in_atomic() || irqs_disabled()) && 7258 if ((in_atomic() || irqs_disabled()) &&
7263 system_state == SYSTEM_RUNNING && !oops_in_progress) { 7259 system_state == SYSTEM_RUNNING && !oops_in_progress) {
7264 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7260 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7265 return; 7261 return;
7266 prev_jiffy = jiffies; 7262 prev_jiffy = jiffies;
7267 printk(KERN_ERR "BUG: sleeping function called from invalid" 7263 printk(KERN_ERR "BUG: sleeping function called from invalid"
7268 " context at %s:%d\n", file, line); 7264 " context at %s:%d\n", file, line);
7269 printk("in_atomic():%d, irqs_disabled():%d\n", 7265 printk("in_atomic():%d, irqs_disabled():%d\n",
7270 in_atomic(), irqs_disabled()); 7266 in_atomic(), irqs_disabled());
7271 debug_show_held_locks(current); 7267 debug_show_held_locks(current);
7272 if (irqs_disabled()) 7268 if (irqs_disabled())
7273 print_irqtrace_events(current); 7269 print_irqtrace_events(current);
7274 dump_stack(); 7270 dump_stack();
7275 } 7271 }
7276 #endif 7272 #endif
7277 } 7273 }
7278 EXPORT_SYMBOL(__might_sleep); 7274 EXPORT_SYMBOL(__might_sleep);
7279 #endif 7275 #endif
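With CONFIG_DEBUG_SPINLOCK_SLEEP enabled, the check above fires, rate-limited to once per second, whenever might_sleep() is reached in atomic context or with interrupts disabled on a running system. A hedged illustration of the kind of bug it is meant to catch, assuming a kernel where spinlocks bump preempt_count (e.g. CONFIG_PREEMPT); example_lock and the function are hypothetical:

	static DEFINE_SPINLOCK(example_lock);

	static void example_sleep_in_atomic(void)
	{
		spin_lock(&example_lock);	/* in_atomic() becomes true */
		might_sleep();			/* triggers "BUG: sleeping function
						 * called from invalid context" */
		spin_unlock(&example_lock);
	}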
7280 7276
7281 #ifdef CONFIG_MAGIC_SYSRQ 7277 #ifdef CONFIG_MAGIC_SYSRQ
7282 static void normalize_task(struct rq *rq, struct task_struct *p) 7278 static void normalize_task(struct rq *rq, struct task_struct *p)
7283 { 7279 {
7284 int on_rq; 7280 int on_rq;
7285 update_rq_clock(rq); 7281 update_rq_clock(rq);
7286 on_rq = p->se.on_rq; 7282 on_rq = p->se.on_rq;
7287 if (on_rq) 7283 if (on_rq)
7288 deactivate_task(rq, p, 0); 7284 deactivate_task(rq, p, 0);
7289 __setscheduler(rq, p, SCHED_NORMAL, 0); 7285 __setscheduler(rq, p, SCHED_NORMAL, 0);
7290 if (on_rq) { 7286 if (on_rq) {
7291 activate_task(rq, p, 0); 7287 activate_task(rq, p, 0);
7292 resched_task(rq->curr); 7288 resched_task(rq->curr);
7293 } 7289 }
7294 } 7290 }
7295 7291
7296 void normalize_rt_tasks(void) 7292 void normalize_rt_tasks(void)
7297 { 7293 {
7298 struct task_struct *g, *p; 7294 struct task_struct *g, *p;
7299 unsigned long flags; 7295 unsigned long flags;
7300 struct rq *rq; 7296 struct rq *rq;
7301 7297
7302 read_lock_irq(&tasklist_lock); 7298 read_lock_irq(&tasklist_lock);
7303 do_each_thread(g, p) { 7299 do_each_thread(g, p) {
7304 /* 7300 /*
7305 * Only normalize user tasks: 7301 * Only normalize user tasks:
7306 */ 7302 */
7307 if (!p->mm) 7303 if (!p->mm)
7308 continue; 7304 continue;
7309 7305
7310 p->se.exec_start = 0; 7306 p->se.exec_start = 0;
7311 #ifdef CONFIG_SCHEDSTATS 7307 #ifdef CONFIG_SCHEDSTATS
7312 p->se.wait_start = 0; 7308 p->se.wait_start = 0;
7313 p->se.sleep_start = 0; 7309 p->se.sleep_start = 0;
7314 p->se.block_start = 0; 7310 p->se.block_start = 0;
7315 #endif 7311 #endif
7316 task_rq(p)->clock = 0; 7312 task_rq(p)->clock = 0;
7317 7313
7318 if (!rt_task(p)) { 7314 if (!rt_task(p)) {
7319 /* 7315 /*
7320 * Renice negative nice level userspace 7316 * Renice negative nice level userspace
7321 * tasks back to 0: 7317 * tasks back to 0:
7322 */ 7318 */
7323 if (TASK_NICE(p) < 0 && p->mm) 7319 if (TASK_NICE(p) < 0 && p->mm)
7324 set_user_nice(p, 0); 7320 set_user_nice(p, 0);
7325 continue; 7321 continue;
7326 } 7322 }
7327 7323
7328 spin_lock_irqsave(&p->pi_lock, flags); 7324 spin_lock_irqsave(&p->pi_lock, flags);
7329 rq = __task_rq_lock(p); 7325 rq = __task_rq_lock(p);
7330 7326
7331 normalize_task(rq, p); 7327 normalize_task(rq, p);
7332 7328
7333 __task_rq_unlock(rq); 7329 __task_rq_unlock(rq);
7334 spin_unlock_irqrestore(&p->pi_lock, flags); 7330 spin_unlock_irqrestore(&p->pi_lock, flags);
7335 } while_each_thread(g, p); 7331 } while_each_thread(g, p);
7336 7332
7337 read_unlock_irq(&tasklist_lock); 7333 read_unlock_irq(&tasklist_lock);
7338 } 7334 }
7339 7335
7340 #endif /* CONFIG_MAGIC_SYSRQ */ 7336 #endif /* CONFIG_MAGIC_SYSRQ */
7341 7337
7342 #ifdef CONFIG_IA64 7338 #ifdef CONFIG_IA64
7343 /* 7339 /*
7344 * These functions are only useful for the IA64 MCA handling. 7340 * These functions are only useful for the IA64 MCA handling.
7345 * 7341 *
7346 * They can only be called when the whole system has been 7342 * They can only be called when the whole system has been
7347 * stopped - every CPU needs to be quiescent, and no scheduling 7343 * stopped - every CPU needs to be quiescent, and no scheduling
7348 * activity can take place. Using them for anything else would 7344 * activity can take place. Using them for anything else would
7349 * be a serious bug, and as a result, they aren't even visible 7345 * be a serious bug, and as a result, they aren't even visible
7350 * under any other configuration. 7346 * under any other configuration.
7351 */ 7347 */
7352 7348
7353 /** 7349 /**
7354 * curr_task - return the current task for a given cpu. 7350 * curr_task - return the current task for a given cpu.
7355 * @cpu: the processor in question. 7351 * @cpu: the processor in question.
7356 * 7352 *
7357 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7353 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7358 */ 7354 */
7359 struct task_struct *curr_task(int cpu) 7355 struct task_struct *curr_task(int cpu)
7360 { 7356 {
7361 return cpu_curr(cpu); 7357 return cpu_curr(cpu);
7362 } 7358 }
7363 7359
7364 /** 7360 /**
7365 * set_curr_task - set the current task for a given cpu. 7361 * set_curr_task - set the current task for a given cpu.
7366 * @cpu: the processor in question. 7362 * @cpu: the processor in question.
7367 * @p: the task pointer to set. 7363 * @p: the task pointer to set.
7368 * 7364 *
7369 * Description: This function must only be used when non-maskable interrupts 7365 * Description: This function must only be used when non-maskable interrupts
7370 * are serviced on a separate stack. It allows the architecture to switch the 7366 * are serviced on a separate stack. It allows the architecture to switch the
7371 * notion of the current task on a cpu in a non-blocking manner. This function 7367 * notion of the current task on a cpu in a non-blocking manner. This function
7372 * must be called with all CPUs synchronized and interrupts disabled; the 7368 * must be called with all CPUs synchronized and interrupts disabled; the
7373 * caller must save the original value of the current task (see 7369 * caller must save the original value of the current task (see
7374 * curr_task() above) and restore that value before reenabling interrupts and 7370 * curr_task() above) and restore that value before reenabling interrupts and
7375 * re-starting the system. 7371 * re-starting the system.
7376 * 7372 *
7377 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7373 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7378 */ 7374 */
7379 void set_curr_task(int cpu, struct task_struct *p) 7375 void set_curr_task(int cpu, struct task_struct *p)
7380 { 7376 {
7381 cpu_curr(cpu) = p; 7377 cpu_curr(cpu) = p;
7382 } 7378 }
7383 7379
7384 #endif 7380 #endif
7385 7381
7386 #ifdef CONFIG_FAIR_GROUP_SCHED 7382 #ifdef CONFIG_FAIR_GROUP_SCHED
7387 7383
7388 #ifdef CONFIG_SMP 7384 #ifdef CONFIG_SMP
7389 /* 7385 /*
7390 * distribute shares of all task groups among their schedulable entities, 7386 * distribute shares of all task groups among their schedulable entities,
7391 * to reflect load distribution across cpus. 7387 * to reflect load distribution across cpus.
7392 */ 7388 */
7393 static int rebalance_shares(struct sched_domain *sd, int this_cpu) 7389 static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7394 { 7390 {
7395 struct cfs_rq *cfs_rq; 7391 struct cfs_rq *cfs_rq;
7396 struct rq *rq = cpu_rq(this_cpu); 7392 struct rq *rq = cpu_rq(this_cpu);
7397 cpumask_t sdspan = sd->span; 7393 cpumask_t sdspan = sd->span;
7398 int balanced = 1; 7394 int balanced = 1;
7399 7395
7400 /* Walk through all the task groups that we have */ 7396 /* Walk through all the task groups that we have */
7401 for_each_leaf_cfs_rq(rq, cfs_rq) { 7397 for_each_leaf_cfs_rq(rq, cfs_rq) {
7402 int i; 7398 int i;
7403 unsigned long total_load = 0, total_shares; 7399 unsigned long total_load = 0, total_shares;
7404 struct task_group *tg = cfs_rq->tg; 7400 struct task_group *tg = cfs_rq->tg;
7405 7401
7406 /* Gather total task load of this group across cpus */ 7402 /* Gather total task load of this group across cpus */
7407 for_each_cpu_mask(i, sdspan) 7403 for_each_cpu_mask(i, sdspan)
7408 total_load += tg->cfs_rq[i]->load.weight; 7404 total_load += tg->cfs_rq[i]->load.weight;
7409 7405
7410 /* Nothing to do if this group has no load */ 7406 /* Nothing to do if this group has no load */
7411 if (!total_load) 7407 if (!total_load)
7412 continue; 7408 continue;
7413 7409
7414 /* 7410 /*
7415 * tg->shares represents the number of cpu shares the task group 7411 * tg->shares represents the number of cpu shares the task group
7416 * is eligible to hold on a single cpu. On N cpus, it is 7412 * is eligible to hold on a single cpu. On N cpus, it is
7417 * eligible to hold (N * tg->shares) number of cpu shares. 7413 * eligible to hold (N * tg->shares) number of cpu shares.
7418 */ 7414 */
7419 total_shares = tg->shares * cpus_weight(sdspan); 7415 total_shares = tg->shares * cpus_weight(sdspan);
7420 7416
7421 /* 7417 /*
7422 * redistribute total_shares across cpus as per the task load 7418 * redistribute total_shares across cpus as per the task load
7423 * distribution. 7419 * distribution.
7424 */ 7420 */
7425 for_each_cpu_mask(i, sdspan) { 7421 for_each_cpu_mask(i, sdspan) {
7426 unsigned long local_load, local_shares; 7422 unsigned long local_load, local_shares;
7427 7423
7428 local_load = tg->cfs_rq[i]->load.weight; 7424 local_load = tg->cfs_rq[i]->load.weight;
7429 local_shares = (local_load * total_shares) / total_load; 7425 local_shares = (local_load * total_shares) / total_load;
7430 if (!local_shares) 7426 if (!local_shares)
7431 local_shares = MIN_GROUP_SHARES; 7427 local_shares = MIN_GROUP_SHARES;
7432 if (local_shares == tg->se[i]->load.weight) 7428 if (local_shares == tg->se[i]->load.weight)
7433 continue; 7429 continue;
7434 7430
7435 spin_lock_irq(&cpu_rq(i)->lock); 7431 spin_lock_irq(&cpu_rq(i)->lock);
7436 set_se_shares(tg->se[i], local_shares); 7432 set_se_shares(tg->se[i], local_shares);
7437 spin_unlock_irq(&cpu_rq(i)->lock); 7433 spin_unlock_irq(&cpu_rq(i)->lock);
7438 balanced = 0; 7434 balanced = 0;
7439 } 7435 }
7440 } 7436 }
7441 7437
7442 return balanced; 7438 return balanced;
7443 } 7439 }
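The proportional split computed above is easiest to see with concrete numbers. Assuming a group with tg->shares = 1024 on a two-CPU domain (total_shares = 2048) whose load is 3072 on one CPU and 1024 on the other (total_load = 4096), the entities end up with 1536 and 512 shares respectively. The helper below restates just that formula with hypothetical inputs:

	/* local_shares = local_load * total_shares / total_load, floored at
	 * MIN_GROUP_SHARES so no entity ever drops to zero weight. */
	static unsigned long example_local_shares(unsigned long local_load,
						  unsigned long total_load,
						  unsigned long total_shares)
	{
		unsigned long shares = (local_load * total_shares) / total_load;

		return shares ? shares : MIN_GROUP_SHARES;
	}

	/* example_local_shares(3072, 4096, 2048) == 1536
	 * example_local_shares(1024, 4096, 2048) ==  512 */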
7444 7440
7445 /* 7441 /*
7446 * How frequently should we rebalance_shares() across cpus? 7442 * How frequently should we rebalance_shares() across cpus?
7447 * 7443 *
7448 * The more frequently we rebalance shares, the more accurate is the fairness 7444 * The more frequently we rebalance shares, the more accurate is the fairness
7449 * of cpu bandwidth distribution between task groups. However higher frequency 7445 * of cpu bandwidth distribution between task groups. However higher frequency
7450 * also implies increased scheduling overhead. 7446 * also implies increased scheduling overhead.
7451 * 7447 *
7452 * sysctl_sched_min_bal_int_shares represents the minimum interval between 7448 * sysctl_sched_min_bal_int_shares represents the minimum interval between
7453 * consecutive calls to rebalance_shares() in the same sched domain. 7449 * consecutive calls to rebalance_shares() in the same sched domain.
7454 * 7450 *
7455 * sysctl_sched_max_bal_int_shares represents the maximum interval between 7451 * sysctl_sched_max_bal_int_shares represents the maximum interval between
7456 * consecutive calls to rebalance_shares() in the same sched domain. 7452 * consecutive calls to rebalance_shares() in the same sched domain.
7457 * 7453 *
7458 * These settings allow for the appropriate trade-off between accuracy of 7454 * These settings allow for the appropriate trade-off between accuracy of
7459 * fairness and the associated overhead. 7455 * fairness and the associated overhead.
7460 * 7456 *
7461 */ 7457 */
7462 7458
7463 /* default: 8ms, units: milliseconds */ 7459 /* default: 8ms, units: milliseconds */
7464 const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; 7460 const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7465 7461
7466 /* default: 128ms, units: milliseconds */ 7462 /* default: 128ms, units: milliseconds */
7467 const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; 7463 const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
7468 7464
7469 /* kernel thread that runs rebalance_shares() periodically */ 7465 /* kernel thread that runs rebalance_shares() periodically */
7470 static int load_balance_monitor(void *unused) 7466 static int load_balance_monitor(void *unused)
7471 { 7467 {
7472 unsigned int timeout = sysctl_sched_min_bal_int_shares; 7468 unsigned int timeout = sysctl_sched_min_bal_int_shares;
7473 struct sched_param schedparm; 7469 struct sched_param schedparm;
7474 int ret; 7470 int ret;
7475 7471
7476 /* 7472 /*
7477 * We don't want this thread's execution to be limited by the shares 7473 * We don't want this thread's execution to be limited by the shares
7478 * assigned to default group (init_task_group). Hence make it run 7474 * assigned to default group (init_task_group). Hence make it run
7479 * as a SCHED_RR RT task at the lowest priority. 7475 * as a SCHED_RR RT task at the lowest priority.
7480 */ 7476 */
7481 schedparm.sched_priority = 1; 7477 schedparm.sched_priority = 1;
7482 ret = sched_setscheduler(current, SCHED_RR, &schedparm); 7478 ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7483 if (ret) 7479 if (ret)
7484 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" 7480 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
7485 " monitor thread (error = %d) \n", ret); 7481 " monitor thread (error = %d) \n", ret);
7486 7482
7487 while (!kthread_should_stop()) { 7483 while (!kthread_should_stop()) {
7488 int i, cpu, balanced = 1; 7484 int i, cpu, balanced = 1;
7489 7485
7490 /* Prevent cpus going down or coming up */ 7486 /* Prevent cpus going down or coming up */
7491 get_online_cpus(); 7487 get_online_cpus();
7492 /* lockout changes to doms_cur[] array */ 7488 /* lockout changes to doms_cur[] array */
7493 lock_doms_cur(); 7489 lock_doms_cur();
7494 /* 7490 /*
7495 * Enter a rcu read-side critical section to safely walk rq->sd 7491 * Enter a rcu read-side critical section to safely walk rq->sd
7496 * chain on various cpus and to walk task group list 7492 * chain on various cpus and to walk task group list
7497 * (rq->leaf_cfs_rq_list) in rebalance_shares(). 7493 * (rq->leaf_cfs_rq_list) in rebalance_shares().
7498 */ 7494 */
7499 rcu_read_lock(); 7495 rcu_read_lock();
7500 7496
7501 for (i = 0; i < ndoms_cur; i++) { 7497 for (i = 0; i < ndoms_cur; i++) {
7502 cpumask_t cpumap = doms_cur[i]; 7498 cpumask_t cpumap = doms_cur[i];
7503 struct sched_domain *sd = NULL, *sd_prev = NULL; 7499 struct sched_domain *sd = NULL, *sd_prev = NULL;
7504 7500
7505 cpu = first_cpu(cpumap); 7501 cpu = first_cpu(cpumap);
7506 7502
7507 /* Find the highest domain at which to balance shares */ 7503 /* Find the highest domain at which to balance shares */
7508 for_each_domain(cpu, sd) { 7504 for_each_domain(cpu, sd) {
7509 if (!(sd->flags & SD_LOAD_BALANCE)) 7505 if (!(sd->flags & SD_LOAD_BALANCE))
7510 continue; 7506 continue;
7511 sd_prev = sd; 7507 sd_prev = sd;
7512 } 7508 }
7513 7509
7514 sd = sd_prev; 7510 sd = sd_prev;
7515 /* sd == NULL? No load balance reqd in this domain */ 7511 /* sd == NULL? No load balance reqd in this domain */
7516 if (!sd) 7512 if (!sd)
7517 continue; 7513 continue;
7518 7514
7519 balanced &= rebalance_shares(sd, cpu); 7515 balanced &= rebalance_shares(sd, cpu);
7520 } 7516 }
7521 7517
7522 rcu_read_unlock(); 7518 rcu_read_unlock();
7523 7519
7524 unlock_doms_cur(); 7520 unlock_doms_cur();
7525 put_online_cpus(); 7521 put_online_cpus();
7526 7522
7527 if (!balanced) 7523 if (!balanced)
7528 timeout = sysctl_sched_min_bal_int_shares; 7524 timeout = sysctl_sched_min_bal_int_shares;
7529 else if (timeout < sysctl_sched_max_bal_int_shares) 7525 else if (timeout < sysctl_sched_max_bal_int_shares)
7530 timeout *= 2; 7526 timeout *= 2;
7531 7527
7532 msleep_interruptible(timeout); 7528 msleep_interruptible(timeout);
7533 } 7529 }
7534 7530
7535 return 0; 7531 return 0;
7536 } 7532 }
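The monitor above adapts its polling interval: any pass that actually changed a group's shares resets the interval to the minimum, otherwise it doubles until the maximum is reached. A sketch of that backoff policy in isolation (example_next_timeout is hypothetical):

	static unsigned int example_next_timeout(unsigned int timeout, int balanced)
	{
		if (!balanced)
			return sysctl_sched_min_bal_int_shares;	/* 8ms default */
		if (timeout < sysctl_sched_max_bal_int_shares)
			return timeout * 2;			/* back off */
		return timeout;					/* capped, 128ms default */
	}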
7537 #endif /* CONFIG_SMP */ 7533 #endif /* CONFIG_SMP */
7538 7534
7539 static void free_sched_group(struct task_group *tg) 7535 static void free_sched_group(struct task_group *tg)
7540 { 7536 {
7541 int i; 7537 int i;
7542 7538
7543 for_each_possible_cpu(i) { 7539 for_each_possible_cpu(i) {
7544 if (tg->cfs_rq) 7540 if (tg->cfs_rq)
7545 kfree(tg->cfs_rq[i]); 7541 kfree(tg->cfs_rq[i]);
7546 if (tg->se) 7542 if (tg->se)
7547 kfree(tg->se[i]); 7543 kfree(tg->se[i]);
7548 if (tg->rt_rq) 7544 if (tg->rt_rq)
7549 kfree(tg->rt_rq[i]); 7545 kfree(tg->rt_rq[i]);
7550 if (tg->rt_se) 7546 if (tg->rt_se)
7551 kfree(tg->rt_se[i]); 7547 kfree(tg->rt_se[i]);
7552 } 7548 }
7553 7549
7554 kfree(tg->cfs_rq); 7550 kfree(tg->cfs_rq);
7555 kfree(tg->se); 7551 kfree(tg->se);
7556 kfree(tg->rt_rq); 7552 kfree(tg->rt_rq);
7557 kfree(tg->rt_se); 7553 kfree(tg->rt_se);
7558 kfree(tg); 7554 kfree(tg);
7559 } 7555 }
7560 7556
7561 /* allocate runqueue etc for a new task group */ 7557 /* allocate runqueue etc for a new task group */
7562 struct task_group *sched_create_group(void) 7558 struct task_group *sched_create_group(void)
7563 { 7559 {
7564 struct task_group *tg; 7560 struct task_group *tg;
7565 struct cfs_rq *cfs_rq; 7561 struct cfs_rq *cfs_rq;
7566 struct sched_entity *se; 7562 struct sched_entity *se;
7567 struct rt_rq *rt_rq; 7563 struct rt_rq *rt_rq;
7568 struct sched_rt_entity *rt_se; 7564 struct sched_rt_entity *rt_se;
7569 struct rq *rq; 7565 struct rq *rq;
7570 int i; 7566 int i;
7571 7567
7572 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7568 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7573 if (!tg) 7569 if (!tg)
7574 return ERR_PTR(-ENOMEM); 7570 return ERR_PTR(-ENOMEM);
7575 7571
7576 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); 7572 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
7577 if (!tg->cfs_rq) 7573 if (!tg->cfs_rq)
7578 goto err; 7574 goto err;
7579 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7575 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
7580 if (!tg->se) 7576 if (!tg->se)
7581 goto err; 7577 goto err;
7582 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); 7578 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7583 if (!tg->rt_rq) 7579 if (!tg->rt_rq)
7584 goto err; 7580 goto err;
7585 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); 7581 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7586 if (!tg->rt_se) 7582 if (!tg->rt_se)
7587 goto err; 7583 goto err;
7588 7584
7589 tg->shares = NICE_0_LOAD; 7585 tg->shares = NICE_0_LOAD;
7590 tg->rt_ratio = 0; /* XXX */ 7586 tg->rt_ratio = 0; /* XXX */
7591 7587
7592 for_each_possible_cpu(i) { 7588 for_each_possible_cpu(i) {
7593 rq = cpu_rq(i); 7589 rq = cpu_rq(i);
7594 7590
7595 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), 7591 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
7596 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 7592 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7597 if (!cfs_rq) 7593 if (!cfs_rq)
7598 goto err; 7594 goto err;
7599 7595
7600 se = kmalloc_node(sizeof(struct sched_entity), 7596 se = kmalloc_node(sizeof(struct sched_entity),
7601 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 7597 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7602 if (!se) 7598 if (!se)
7603 goto err; 7599 goto err;
7604 7600
7605 rt_rq = kmalloc_node(sizeof(struct rt_rq), 7601 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7606 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 7602 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7607 if (!rt_rq) 7603 if (!rt_rq)
7608 goto err; 7604 goto err;
7609 7605
7610 rt_se = kmalloc_node(sizeof(struct sched_rt_entity), 7606 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
7611 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 7607 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7612 if (!rt_se) 7608 if (!rt_se)
7613 goto err; 7609 goto err;
7614 7610
7615 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); 7611 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7616 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); 7612 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7617 } 7613 }
7618 7614
7619 lock_task_group_list(); 7615 lock_task_group_list();
7620 for_each_possible_cpu(i) { 7616 for_each_possible_cpu(i) {
7621 rq = cpu_rq(i); 7617 rq = cpu_rq(i);
7622 cfs_rq = tg->cfs_rq[i]; 7618 cfs_rq = tg->cfs_rq[i];
7623 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7619 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7624 rt_rq = tg->rt_rq[i]; 7620 rt_rq = tg->rt_rq[i];
7625 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7621 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7626 } 7622 }
7627 list_add_rcu(&tg->list, &task_groups); 7623 list_add_rcu(&tg->list, &task_groups);
7628 unlock_task_group_list(); 7624 unlock_task_group_list();
7629 7625
7630 return tg; 7626 return tg;
7631 7627
7632 err: 7628 err:
7633 free_sched_group(tg); 7629 free_sched_group(tg);
7634 return ERR_PTR(-ENOMEM); 7630 return ERR_PTR(-ENOMEM);
7635 } 7631 }
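A hedged sketch of how these interfaces fit together over a group's lifetime; example_group_lifecycle is hypothetical, and the cgroup/uid plumbing that would normally re-parent the task is omitted:

	static int example_group_lifecycle(struct task_struct *p)
	{
		struct task_group *tg = sched_create_group();

		if (IS_ERR(tg))
			return PTR_ERR(tg);

		/* give the group twice the default CFS weight */
		sched_group_set_shares(tg, 2 * NICE_0_LOAD);

		/* p is assumed to already belong to tg (see sched_move_task's
		 * comment below); this re-attaches its per-cpu runqueues */
		sched_move_task(p);

		/* later, once the group is empty again */
		sched_destroy_group(tg);
		return 0;
	}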
7636 7632
7637 /* rcu callback to free various structures associated with a task group */ 7633 /* rcu callback to free various structures associated with a task group */
7638 static void free_sched_group_rcu(struct rcu_head *rhp) 7634 static void free_sched_group_rcu(struct rcu_head *rhp)
7639 { 7635 {
7640 /* now it should be safe to free those cfs_rqs */ 7636 /* now it should be safe to free those cfs_rqs */
7641 free_sched_group(container_of(rhp, struct task_group, rcu)); 7637 free_sched_group(container_of(rhp, struct task_group, rcu));
7642 } 7638 }
7643 7639
7644 /* Destroy runqueue etc associated with a task group */ 7640 /* Destroy runqueue etc associated with a task group */
7645 void sched_destroy_group(struct task_group *tg) 7641 void sched_destroy_group(struct task_group *tg)
7646 { 7642 {
7647 struct cfs_rq *cfs_rq = NULL; 7643 struct cfs_rq *cfs_rq = NULL;
7648 struct rt_rq *rt_rq = NULL; 7644 struct rt_rq *rt_rq = NULL;
7649 int i; 7645 int i;
7650 7646
7651 lock_task_group_list(); 7647 lock_task_group_list();
7652 for_each_possible_cpu(i) { 7648 for_each_possible_cpu(i) {
7653 cfs_rq = tg->cfs_rq[i]; 7649 cfs_rq = tg->cfs_rq[i];
7654 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7650 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7655 rt_rq = tg->rt_rq[i]; 7651 rt_rq = tg->rt_rq[i];
7656 list_del_rcu(&rt_rq->leaf_rt_rq_list); 7652 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7657 } 7653 }
7658 list_del_rcu(&tg->list); 7654 list_del_rcu(&tg->list);
7659 unlock_task_group_list(); 7655 unlock_task_group_list();
7660 7656
7661 BUG_ON(!cfs_rq); 7657 BUG_ON(!cfs_rq);
7662 7658
7663 /* wait for possible concurrent references to cfs_rqs to complete */ 7659 /* wait for possible concurrent references to cfs_rqs to complete */
7664 call_rcu(&tg->rcu, free_sched_group_rcu); 7660 call_rcu(&tg->rcu, free_sched_group_rcu);
7665 } 7661 }
7666 7662
7667 /* change task's runqueue when it moves between groups. 7663 /* change task's runqueue when it moves between groups.
7668 * The caller of this function should have put the task in its new group 7664 * The caller of this function should have put the task in its new group
7669 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7665 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7670 * reflect its new group. 7666 * reflect its new group.
7671 */ 7667 */
7672 void sched_move_task(struct task_struct *tsk) 7668 void sched_move_task(struct task_struct *tsk)
7673 { 7669 {
7674 int on_rq, running; 7670 int on_rq, running;
7675 unsigned long flags; 7671 unsigned long flags;
7676 struct rq *rq; 7672 struct rq *rq;
7677 7673
7678 rq = task_rq_lock(tsk, &flags); 7674 rq = task_rq_lock(tsk, &flags);
7679 7675
7680 update_rq_clock(rq); 7676 update_rq_clock(rq);
7681 7677
7682 running = task_current(rq, tsk); 7678 running = task_current(rq, tsk);
7683 on_rq = tsk->se.on_rq; 7679 on_rq = tsk->se.on_rq;
7684 7680
7685 if (on_rq) { 7681 if (on_rq) {
7686 dequeue_task(rq, tsk, 0); 7682 dequeue_task(rq, tsk, 0);
7687 if (unlikely(running)) 7683 if (unlikely(running))
7688 tsk->sched_class->put_prev_task(rq, tsk); 7684 tsk->sched_class->put_prev_task(rq, tsk);
7689 } 7685 }
7690 7686
7691 set_task_rq(tsk, task_cpu(tsk)); 7687 set_task_rq(tsk, task_cpu(tsk));
7692 7688
7693 if (on_rq) { 7689 if (on_rq) {
7694 if (unlikely(running)) 7690 if (unlikely(running))
7695 tsk->sched_class->set_curr_task(rq); 7691 tsk->sched_class->set_curr_task(rq);
7696 enqueue_task(rq, tsk, 0); 7692 enqueue_task(rq, tsk, 0);
7697 } 7693 }
7698 7694
7699 task_rq_unlock(rq, &flags); 7695 task_rq_unlock(rq, &flags);
7700 } 7696 }
7701 7697
7702 /* rq->lock to be locked by caller */ 7698 /* rq->lock to be locked by caller */
7703 static void set_se_shares(struct sched_entity *se, unsigned long shares) 7699 static void set_se_shares(struct sched_entity *se, unsigned long shares)
7704 { 7700 {
7705 struct cfs_rq *cfs_rq = se->cfs_rq; 7701 struct cfs_rq *cfs_rq = se->cfs_rq;
7706 struct rq *rq = cfs_rq->rq; 7702 struct rq *rq = cfs_rq->rq;
7707 int on_rq; 7703 int on_rq;
7708 7704
7709 if (!shares) 7705 if (!shares)
7710 shares = MIN_GROUP_SHARES; 7706 shares = MIN_GROUP_SHARES;
7711 7707
7712 on_rq = se->on_rq; 7708 on_rq = se->on_rq;
7713 if (on_rq) { 7709 if (on_rq) {
7714 dequeue_entity(cfs_rq, se, 0); 7710 dequeue_entity(cfs_rq, se, 0);
7715 dec_cpu_load(rq, se->load.weight); 7711 dec_cpu_load(rq, se->load.weight);
7716 } 7712 }
7717 7713
7718 se->load.weight = shares; 7714 se->load.weight = shares;
7719 se->load.inv_weight = div64_64((1ULL<<32), shares); 7715 se->load.inv_weight = div64_64((1ULL<<32), shares);
7720 7716
7721 if (on_rq) { 7717 if (on_rq) {
7722 enqueue_entity(cfs_rq, se, 0); 7718 enqueue_entity(cfs_rq, se, 0);
7723 inc_cpu_load(rq, se->load.weight); 7719 inc_cpu_load(rq, se->load.weight);
7724 } 7720 }
7725 } 7721 }
7726 7722
7727 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7723 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7728 { 7724 {
7729 int i; 7725 int i;
7730 struct cfs_rq *cfs_rq; 7726 struct cfs_rq *cfs_rq;
7731 struct rq *rq; 7727 struct rq *rq;
7732 7728
7733 lock_task_group_list(); 7729 lock_task_group_list();
7734 if (tg->shares == shares) 7730 if (tg->shares == shares)
7735 goto done; 7731 goto done;
7736 7732
7737 if (shares < MIN_GROUP_SHARES) 7733 if (shares < MIN_GROUP_SHARES)
7738 shares = MIN_GROUP_SHARES; 7734 shares = MIN_GROUP_SHARES;
7739 7735
7740 /* 7736 /*
7741 * Prevent any load balance activity (rebalance_shares, 7737 * Prevent any load balance activity (rebalance_shares,
7742 * load_balance_fair) from referring to this group first, 7738 * load_balance_fair) from referring to this group first,
7743 * by taking it off the rq->leaf_cfs_rq_list on each cpu. 7739 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7744 */ 7740 */
7745 for_each_possible_cpu(i) { 7741 for_each_possible_cpu(i) {
7746 cfs_rq = tg->cfs_rq[i]; 7742 cfs_rq = tg->cfs_rq[i];
7747 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7743 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7748 } 7744 }
7749 7745
7750 /* wait for any ongoing reference to this group to finish */ 7746 /* wait for any ongoing reference to this group to finish */
7751 synchronize_sched(); 7747 synchronize_sched();
7752 7748
7753 /* 7749 /*
7754 * Now we are free to modify the group's share on each cpu 7750 * Now we are free to modify the group's share on each cpu
7755 * w/o tripping rebalance_shares or load_balance_fair. 7751 * w/o tripping rebalance_shares or load_balance_fair.
7756 */ 7752 */
7757 tg->shares = shares; 7753 tg->shares = shares;
7758 for_each_possible_cpu(i) { 7754 for_each_possible_cpu(i) {
7759 spin_lock_irq(&cpu_rq(i)->lock); 7755 spin_lock_irq(&cpu_rq(i)->lock);
7760 set_se_shares(tg->se[i], shares); 7756 set_se_shares(tg->se[i], shares);
7761 spin_unlock_irq(&cpu_rq(i)->lock); 7757 spin_unlock_irq(&cpu_rq(i)->lock);
7762 } 7758 }
7763 7759
7764 /* 7760 /*
7765 * Enable load balance activity on this group, by inserting it back on 7761 * Enable load balance activity on this group, by inserting it back on
7766 * each cpu's rq->leaf_cfs_rq_list. 7762 * each cpu's rq->leaf_cfs_rq_list.
7767 */ 7763 */
7768 for_each_possible_cpu(i) { 7764 for_each_possible_cpu(i) {
7769 rq = cpu_rq(i); 7765 rq = cpu_rq(i);
7770 cfs_rq = tg->cfs_rq[i]; 7766 cfs_rq = tg->cfs_rq[i];
7771 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7767 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7772 } 7768 }
7773 done: 7769 done:
7774 unlock_task_group_list(); 7770 unlock_task_group_list();
7775 return 0; 7771 return 0;
7776 } 7772 }
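
The unpublish / synchronize_sched() / modify / republish sequence in sched_group_set_shares() is a general RCU idiom: readers walking rq->leaf_cfs_rq_list either find the group before it is removed or not at all, and the grace period guarantees none of them are still running when the shares are rewritten. A generic, hedged sketch of the same idiom (every "my_*" name is a placeholder, not from this file):

/* Remove / wait-for-readers / modify / re-add, as in sched_group_set_shares(). */
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct my_item {
        struct list_head node;
        unsigned long weight;
};

static LIST_HEAD(my_list);
static DEFINE_MUTEX(my_mutex);

static void my_set_weight(struct my_item *it, unsigned long weight)
{
        mutex_lock(&my_mutex);              /* serialize writers */
        list_del_rcu(&it->node);            /* new readers can no longer find it */
        synchronize_sched();                /* wait out readers that already did */

        it->weight = weight;                /* safe: no reader still sees it */

        list_add_rcu(&it->node, &my_list);  /* republish for readers */
        mutex_unlock(&my_mutex);
}
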
7777 7773
7778 unsigned long sched_group_shares(struct task_group *tg) 7774 unsigned long sched_group_shares(struct task_group *tg)
7779 { 7775 {
7780 return tg->shares; 7776 return tg->shares;
7781 } 7777 }
7782 7778
7783 /* 7779 /*
7784 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio 7780 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
7785 */ 7781 */
7786 int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) 7782 int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
7787 { 7783 {
7788 struct task_group *tgi; 7784 struct task_group *tgi;
7789 unsigned long total = 0; 7785 unsigned long total = 0;
7790 7786
7791 rcu_read_lock(); 7787 rcu_read_lock();
7792 list_for_each_entry_rcu(tgi, &task_groups, list) 7788 list_for_each_entry_rcu(tgi, &task_groups, list)
7793 total += tgi->rt_ratio; 7789 total += tgi->rt_ratio;
7794 rcu_read_unlock(); 7790 rcu_read_unlock();
7795 7791
7796 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) 7792 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
7797 return -EINVAL; 7793 return -EINVAL;
7798 7794
7799 tg->rt_ratio = rt_ratio; 7795 tg->rt_ratio = rt_ratio;
7800 return 0; 7796 return 0;
7801 } 7797 }
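
The check in sched_group_set_rt_ratio() admits the new value only if the sum over all groups, with this group's current contribution swapped for the requested one, stays within sysctl_sched_rt_ratio. A worked example with invented numbers:

/* Worked example of the admission check above; all numbers are invented. */
#include <stdio.h>

int main(void)
{
        unsigned long ratios[] = { 20, 30, 10 };  /* current rt_ratio of each group */
        unsigned long limit = 80;                 /* sysctl_sched_rt_ratio */
        unsigned long requested = 45;             /* proposed new value for group 1 */
        unsigned long total = 0;
        int i;

        for (i = 0; i < 3; i++)
                total += ratios[i];               /* total == 60 */

        /* swap group 1's old contribution (30) for the requested one (45) */
        if (total + requested - ratios[1] > limit)
                printf("rejected: %lu > %lu\n", total + requested - ratios[1], limit);
        else
                printf("accepted: %lu <= %lu\n", total + requested - ratios[1], limit);
        return 0;
}
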
7802 7798
7803 unsigned long sched_group_rt_ratio(struct task_group *tg) 7799 unsigned long sched_group_rt_ratio(struct task_group *tg)
7804 { 7800 {
7805 return tg->rt_ratio; 7801 return tg->rt_ratio;
7806 } 7802 }
7807 7803
7808 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7804 #endif /* CONFIG_FAIR_GROUP_SCHED */
7809 7805
7810 #ifdef CONFIG_FAIR_CGROUP_SCHED 7806 #ifdef CONFIG_FAIR_CGROUP_SCHED
7811 7807
7812 /* return corresponding task_group object of a cgroup */ 7808 /* return corresponding task_group object of a cgroup */
7813 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 7809 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7814 { 7810 {
7815 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7811 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7816 struct task_group, css); 7812 struct task_group, css);
7817 } 7813 }
7818 7814
7819 static struct cgroup_subsys_state * 7815 static struct cgroup_subsys_state *
7820 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 7816 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7821 { 7817 {
7822 struct task_group *tg; 7818 struct task_group *tg;
7823 7819
7824 if (!cgrp->parent) { 7820 if (!cgrp->parent) {
7825 /* This is early initialization for the top cgroup */ 7821 /* This is early initialization for the top cgroup */
7826 init_task_group.css.cgroup = cgrp; 7822 init_task_group.css.cgroup = cgrp;
7827 return &init_task_group.css; 7823 return &init_task_group.css;
7828 } 7824 }
7829 7825
7830 /* we support only 1-level deep hierarchical scheduler atm */ 7826 /* we support only 1-level deep hierarchical scheduler atm */
7831 if (cgrp->parent->parent) 7827 if (cgrp->parent->parent)
7832 return ERR_PTR(-EINVAL); 7828 return ERR_PTR(-EINVAL);
7833 7829
7834 tg = sched_create_group(); 7830 tg = sched_create_group();
7835 if (IS_ERR(tg)) 7831 if (IS_ERR(tg))
7836 return ERR_PTR(-ENOMEM); 7832 return ERR_PTR(-ENOMEM);
7837 7833
7838 /* Bind the cgroup to task_group object we just created */ 7834 /* Bind the cgroup to task_group object we just created */
7839 tg->css.cgroup = cgrp; 7835 tg->css.cgroup = cgrp;
7840 7836
7841 return &tg->css; 7837 return &tg->css;
7842 } 7838 }
7843 7839
7844 static void 7840 static void
7845 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7841 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7846 { 7842 {
7847 struct task_group *tg = cgroup_tg(cgrp); 7843 struct task_group *tg = cgroup_tg(cgrp);
7848 7844
7849 sched_destroy_group(tg); 7845 sched_destroy_group(tg);
7850 } 7846 }
7851 7847
7852 static int 7848 static int
7853 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7849 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7854 struct task_struct *tsk) 7850 struct task_struct *tsk)
7855 { 7851 {
7856 /* We don't support RT-tasks being in separate groups */ 7852 /* We don't support RT-tasks being in separate groups */
7857 if (tsk->sched_class != &fair_sched_class) 7853 if (tsk->sched_class != &fair_sched_class)
7858 return -EINVAL; 7854 return -EINVAL;
7859 7855
7860 return 0; 7856 return 0;
7861 } 7857 }
7862 7858
7863 static void 7859 static void
7864 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7860 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7865 struct cgroup *old_cont, struct task_struct *tsk) 7861 struct cgroup *old_cont, struct task_struct *tsk)
7866 { 7862 {
7867 sched_move_task(tsk); 7863 sched_move_task(tsk);
7868 } 7864 }
7869 7865
7870 static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7866 static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7871 u64 shareval) 7867 u64 shareval)
7872 { 7868 {
7873 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 7869 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
7874 } 7870 }
7875 7871
7876 static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) 7872 static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7877 { 7873 {
7878 struct task_group *tg = cgroup_tg(cgrp); 7874 struct task_group *tg = cgroup_tg(cgrp);
7879 7875
7880 return (u64) tg->shares; 7876 return (u64) tg->shares;
7881 } 7877 }
7882 7878
7883 static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7879 static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7884 u64 rt_ratio_val) 7880 u64 rt_ratio_val)
7885 { 7881 {
7886 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); 7882 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
7887 } 7883 }
7888 7884
7889 static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) 7885 static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
7890 { 7886 {
7891 struct task_group *tg = cgroup_tg(cgrp); 7887 struct task_group *tg = cgroup_tg(cgrp);
7892 7888
7893 return (u64) tg->rt_ratio; 7889 return (u64) tg->rt_ratio;
7894 } 7890 }
7895 7891
7896 static struct cftype cpu_files[] = { 7892 static struct cftype cpu_files[] = {
7897 { 7893 {
7898 .name = "shares", 7894 .name = "shares",
7899 .read_uint = cpu_shares_read_uint, 7895 .read_uint = cpu_shares_read_uint,
7900 .write_uint = cpu_shares_write_uint, 7896 .write_uint = cpu_shares_write_uint,
7901 }, 7897 },
7902 { 7898 {
7903 .name = "rt_ratio", 7899 .name = "rt_ratio",
7904 .read_uint = cpu_rt_ratio_read_uint, 7900 .read_uint = cpu_rt_ratio_read_uint,
7905 .write_uint = cpu_rt_ratio_write_uint, 7901 .write_uint = cpu_rt_ratio_write_uint,
7906 }, 7902 },
7907 }; 7903 };
7908 7904
7909 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7905 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7910 { 7906 {
7911 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 7907 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7912 } 7908 }
7913 7909
7914 struct cgroup_subsys cpu_cgroup_subsys = { 7910 struct cgroup_subsys cpu_cgroup_subsys = {
7915 .name = "cpu", 7911 .name = "cpu",
7916 .create = cpu_cgroup_create, 7912 .create = cpu_cgroup_create,
7917 .destroy = cpu_cgroup_destroy, 7913 .destroy = cpu_cgroup_destroy,
7918 .can_attach = cpu_cgroup_can_attach, 7914 .can_attach = cpu_cgroup_can_attach,
7919 .attach = cpu_cgroup_attach, 7915 .attach = cpu_cgroup_attach,
7920 .populate = cpu_cgroup_populate, 7916 .populate = cpu_cgroup_populate,
7921 .subsys_id = cpu_cgroup_subsys_id, 7917 .subsys_id = cpu_cgroup_subsys_id,
7922 .early_init = 1, 7918 .early_init = 1,
7923 }; 7919 };
7924 7920
7925 #endif /* CONFIG_FAIR_CGROUP_SCHED */ 7921 #endif /* CONFIG_FAIR_CGROUP_SCHED */
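
Once the "cpu" subsystem above is mounted, the cftype handlers are driven by ordinary file I/O on the cgroup filesystem; writing to cpu.shares ends up in cpu_shares_write_uint() and then sched_group_set_shares(). A hedged userspace sketch (the /dev/cgroup mount point and the group name are assumptions, not fixed by this code):

/* Illustrative only: adjust a group's cpu.shares from userspace. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/dev/cgroup/mygroup/cpu.shares";  /* assumed mount/group */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* handled by cpu_shares_write_uint() -> sched_group_set_shares() */
        if (write(fd, "512", 3) != 3)
                perror("write");
        close(fd);
        return 0;
}
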
7926 7922
7927 #ifdef CONFIG_CGROUP_CPUACCT 7923 #ifdef CONFIG_CGROUP_CPUACCT
7928 7924
7929 /* 7925 /*
7930 * CPU accounting code for task groups. 7926 * CPU accounting code for task groups.
7931 * 7927 *
7932 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 7928 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
7933 * (balbir@in.ibm.com). 7929 * (balbir@in.ibm.com).
7934 */ 7930 */
7935 7931
7936 /* track cpu usage of a group of tasks */ 7932 /* track cpu usage of a group of tasks */
7937 struct cpuacct { 7933 struct cpuacct {
7938 struct cgroup_subsys_state css; 7934 struct cgroup_subsys_state css;
7939 /* cpuusage holds pointer to a u64-type object on every cpu */ 7935 /* cpuusage holds pointer to a u64-type object on every cpu */
7940 u64 *cpuusage; 7936 u64 *cpuusage;
7941 }; 7937 };
7942 7938
7943 struct cgroup_subsys cpuacct_subsys; 7939 struct cgroup_subsys cpuacct_subsys;
7944 7940
7945 /* return cpu accounting group corresponding to this container */ 7941 /* return cpu accounting group corresponding to this container */
7946 static inline struct cpuacct *cgroup_ca(struct cgroup *cont) 7942 static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
7947 { 7943 {
7948 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), 7944 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
7949 struct cpuacct, css); 7945 struct cpuacct, css);
7950 } 7946 }
7951 7947
7952 /* return cpu accounting group to which this task belongs */ 7948 /* return cpu accounting group to which this task belongs */
7953 static inline struct cpuacct *task_ca(struct task_struct *tsk) 7949 static inline struct cpuacct *task_ca(struct task_struct *tsk)
7954 { 7950 {
7955 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 7951 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
7956 struct cpuacct, css); 7952 struct cpuacct, css);
7957 } 7953 }
7958 7954
7959 /* create a new cpu accounting group */ 7955 /* create a new cpu accounting group */
7960 static struct cgroup_subsys_state *cpuacct_create( 7956 static struct cgroup_subsys_state *cpuacct_create(
7961 struct cgroup_subsys *ss, struct cgroup *cont) 7957 struct cgroup_subsys *ss, struct cgroup *cont)
7962 { 7958 {
7963 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7959 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
7964 7960
7965 if (!ca) 7961 if (!ca)
7966 return ERR_PTR(-ENOMEM); 7962 return ERR_PTR(-ENOMEM);
7967 7963
7968 ca->cpuusage = alloc_percpu(u64); 7964 ca->cpuusage = alloc_percpu(u64);
7969 if (!ca->cpuusage) { 7965 if (!ca->cpuusage) {
7970 kfree(ca); 7966 kfree(ca);
7971 return ERR_PTR(-ENOMEM); 7967 return ERR_PTR(-ENOMEM);
7972 } 7968 }
7973 7969
7974 return &ca->css; 7970 return &ca->css;
7975 } 7971 }
7976 7972
7977 /* destroy an existing cpu accounting group */ 7973 /* destroy an existing cpu accounting group */
7978 static void 7974 static void
7979 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 7975 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
7980 { 7976 {
7981 struct cpuacct *ca = cgroup_ca(cont); 7977 struct cpuacct *ca = cgroup_ca(cont);
7982 7978
7983 free_percpu(ca->cpuusage); 7979 free_percpu(ca->cpuusage);
7984 kfree(ca); 7980 kfree(ca);
7985 } 7981 }
7986 7982
7987 /* return total cpu usage (in nanoseconds) of a group */ 7983 /* return total cpu usage (in nanoseconds) of a group */
7988 static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) 7984 static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
7989 { 7985 {
7990 struct cpuacct *ca = cgroup_ca(cont); 7986 struct cpuacct *ca = cgroup_ca(cont);
7991 u64 totalcpuusage = 0; 7987 u64 totalcpuusage = 0;
7992 int i; 7988 int i;
7993 7989
7994 for_each_possible_cpu(i) { 7990 for_each_possible_cpu(i) {
7995 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 7991 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
7996 7992
7997 /* 7993 /*
7998 * Take rq->lock to make 64-bit addition safe on 32-bit 7994 * Take rq->lock to make 64-bit addition safe on 32-bit
7999 * platforms. 7995 * platforms.
8000 */ 7996 */
8001 spin_lock_irq(&cpu_rq(i)->lock); 7997 spin_lock_irq(&cpu_rq(i)->lock);
8002 totalcpuusage += *cpuusage; 7998 totalcpuusage += *cpuusage;
8003 spin_unlock_irq(&cpu_rq(i)->lock); 7999 spin_unlock_irq(&cpu_rq(i)->lock);
8004 } 8000 }
8005 8001
8006 return totalcpuusage; 8002 return totalcpuusage;
8007 } 8003 }
8008 8004
8009 static struct cftype files[] = { 8005 static struct cftype files[] = {
8010 { 8006 {
8011 .name = "usage", 8007 .name = "usage",
8012 .read_uint = cpuusage_read, 8008 .read_uint = cpuusage_read,
8013 }, 8009 },
8014 }; 8010 };
8015 8011
8016 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8012 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8017 { 8013 {
8018 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); 8014 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
8019 } 8015 }
8020 8016
8021 /* 8017 /*
8022 * charge this task's execution time to its accounting group. 8018 * charge this task's execution time to its accounting group.
8023 * 8019 *
8024 * called with rq->lock held. 8020 * called with rq->lock held.
8025 */ 8021 */
8026 static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8022 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8027 { 8023 {
8028 struct cpuacct *ca; 8024 struct cpuacct *ca;
8029 8025
8030 if (!cpuacct_subsys.active) 8026 if (!cpuacct_subsys.active)
8031 return; 8027 return;
8032 8028
8033 ca = task_ca(tsk); 8029 ca = task_ca(tsk);
8034 if (ca) { 8030 if (ca) {
8035 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); 8031 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
8036 8032
8037 *cpuusage += cputime; 8033 *cpuusage += cputime;
8038 } 8034 }
8039 } 8035 }
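
cpuacct_charge() runs with rq->lock held (per the comment above), and cpuusage_read() takes the same per-cpu rq->lock around its 64-bit addition; on 32-bit machines that shared lock is what keeps a reader from seeing a half-updated counter. A generic sketch of that pattern (names are placeholders, and the lock is assumed to have been set up with spin_lock_init()):

/* A 64-bit counter kept tear-free on 32-bit by updating and reading
 * it under the same spinlock, as cpuacct does with rq->lock. */
#include <linux/spinlock.h>
#include <linux/types.h>

struct my_counter {
        spinlock_t lock;        /* assumed initialized elsewhere */
        u64 value;
};

static void my_counter_add(struct my_counter *c, u64 delta)
{
        spin_lock(&c->lock);
        c->value += delta;      /* both 32-bit halves update atomically
                                 * with respect to my_counter_read() */
        spin_unlock(&c->lock);
}

static u64 my_counter_read(struct my_counter *c)
{
        u64 v;

        spin_lock(&c->lock);
        v = c->value;
        spin_unlock(&c->lock);
        return v;
}
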
8040 8036
8041 struct cgroup_subsys cpuacct_subsys = { 8037 struct cgroup_subsys cpuacct_subsys = {
8042 .name = "cpuacct", 8038 .name = "cpuacct",
8043 .create = cpuacct_create, 8039 .create = cpuacct_create,
8044 .destroy = cpuacct_destroy, 8040 .destroy = cpuacct_destroy,
8045 .populate = cpuacct_populate, 8041 .populate = cpuacct_populate,
8046 .subsys_id = cpuacct_subsys_id, 8042 .subsys_id = cpuacct_subsys_id,
8047 }; 8043 };
8048 #endif /* CONFIG_CGROUP_CPUACCT */ 8044 #endif /* CONFIG_CGROUP_CPUACCT */
1 /* 1 /*
2 * Copyright (2004) Linus Torvalds 2 * Copyright (2004) Linus Torvalds
3 * 3 *
4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com> 4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
5 * 5 *
6 * Copyright (2004, 2005) Ingo Molnar 6 * Copyright (2004, 2005) Ingo Molnar
7 * 7 *
8 * This file contains the spinlock/rwlock implementations for the 8 * This file contains the spinlock/rwlock implementations for the
9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) 9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
10 * 10 *
11 * Note that some architectures have special knowledge about the 11 * Note that some architectures have special knowledge about the
12 * stack frames of these functions in their profile_pc. If you 12 * stack frames of these functions in their profile_pc. If you
13 * change anything significant here that could change the stack 13 * change anything significant here that could change the stack
14 * frame, contact the architecture maintainers. 14 * frame, contact the architecture maintainers.
15 */ 15 */
16 16
17 #include <linux/linkage.h> 17 #include <linux/linkage.h>
18 #include <linux/preempt.h> 18 #include <linux/preempt.h>
19 #include <linux/spinlock.h> 19 #include <linux/spinlock.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/debug_locks.h> 21 #include <linux/debug_locks.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 23
24 int __lockfunc _spin_trylock(spinlock_t *lock) 24 int __lockfunc _spin_trylock(spinlock_t *lock)
25 { 25 {
26 preempt_disable(); 26 preempt_disable();
27 if (_raw_spin_trylock(lock)) { 27 if (_raw_spin_trylock(lock)) {
28 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); 28 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
29 return 1; 29 return 1;
30 } 30 }
31 31
32 preempt_enable(); 32 preempt_enable();
33 return 0; 33 return 0;
34 } 34 }
35 EXPORT_SYMBOL(_spin_trylock); 35 EXPORT_SYMBOL(_spin_trylock);
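
Callers reach _spin_trylock() through the spin_trylock() wrapper; the usual shape is a fast path when the lock happens to be free and a fallback that defers the work instead of spinning. A hedged sketch (the lock and the two helpers are placeholders):

/* Illustrative spin_trylock() caller; the helpers are empty placeholders. */
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(stats_lock);

static void update_stats_locked(void) { }       /* placeholder */
static void defer_update(void) { }              /* placeholder */

static void try_fast_update(void)
{
        if (spin_trylock(&stats_lock)) {        /* ends up in _spin_trylock() */
                update_stats_locked();          /* uncontended fast path */
                spin_unlock(&stats_lock);
        } else {
                defer_update();                 /* contended: don't spin here */
        }
}
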
36 36
37 int __lockfunc _read_trylock(rwlock_t *lock) 37 int __lockfunc _read_trylock(rwlock_t *lock)
38 { 38 {
39 preempt_disable(); 39 preempt_disable();
40 if (_raw_read_trylock(lock)) { 40 if (_raw_read_trylock(lock)) {
41 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); 41 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
42 return 1; 42 return 1;
43 } 43 }
44 44
45 preempt_enable(); 45 preempt_enable();
46 return 0; 46 return 0;
47 } 47 }
48 EXPORT_SYMBOL(_read_trylock); 48 EXPORT_SYMBOL(_read_trylock);
49 49
50 int __lockfunc _write_trylock(rwlock_t *lock) 50 int __lockfunc _write_trylock(rwlock_t *lock)
51 { 51 {
52 preempt_disable(); 52 preempt_disable();
53 if (_raw_write_trylock(lock)) { 53 if (_raw_write_trylock(lock)) {
54 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); 54 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
55 return 1; 55 return 1;
56 } 56 }
57 57
58 preempt_enable(); 58 preempt_enable();
59 return 0; 59 return 0;
60 } 60 }
61 EXPORT_SYMBOL(_write_trylock); 61 EXPORT_SYMBOL(_write_trylock);
62 62
63 /* 63 /*
64 * If lockdep is enabled then we use the non-preemption spin-ops 64 * If lockdep is enabled then we use the non-preemption spin-ops
65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
66 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 66 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
67 */ 67 */
68 #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ 68 #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 defined(CONFIG_DEBUG_LOCK_ALLOC)
70 69
71 void __lockfunc _read_lock(rwlock_t *lock) 70 void __lockfunc _read_lock(rwlock_t *lock)
72 { 71 {
73 preempt_disable(); 72 preempt_disable();
74 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 73 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
75 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); 74 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
76 } 75 }
77 EXPORT_SYMBOL(_read_lock); 76 EXPORT_SYMBOL(_read_lock);
78 77
79 unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 78 unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
80 { 79 {
81 unsigned long flags; 80 unsigned long flags;
82 81
83 local_irq_save(flags); 82 local_irq_save(flags);
84 preempt_disable(); 83 preempt_disable();
85 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 84 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
86 /* 85 /*
87 * On lockdep we don't want the hand-coded irq-enable of 86 * On lockdep we don't want the hand-coded irq-enable of
88 * _raw_spin_lock_flags() code, because lockdep assumes 87 * _raw_spin_lock_flags() code, because lockdep assumes
89 * that interrupts are not re-enabled during lock-acquire: 88 * that interrupts are not re-enabled during lock-acquire:
90 */ 89 */
91 #ifdef CONFIG_LOCKDEP 90 #ifdef CONFIG_LOCKDEP
92 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 91 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
93 #else 92 #else
94 _raw_spin_lock_flags(lock, &flags); 93 _raw_spin_lock_flags(lock, &flags);
95 #endif 94 #endif
96 return flags; 95 return flags;
97 } 96 }
98 EXPORT_SYMBOL(_spin_lock_irqsave); 97 EXPORT_SYMBOL(_spin_lock_irqsave);
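
Note that _spin_lock_irqsave() returns the saved flags by value; the spin_lock_irqsave() macro assigns that return into the caller's flags word. A minimal usage sketch (my_lock and my_count are placeholders):

/* Minimal spin_lock_irqsave() caller. */
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);
static unsigned long my_count;

static void my_bump(void)
{
        unsigned long flags;

        spin_lock_irqsave(&my_lock, flags);     /* flags = _spin_lock_irqsave(&my_lock) */
        my_count++;                             /* safe against IRQs and other CPUs */
        spin_unlock_irqrestore(&my_lock, flags);
}
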
99 98
100 void __lockfunc _spin_lock_irq(spinlock_t *lock) 99 void __lockfunc _spin_lock_irq(spinlock_t *lock)
101 { 100 {
102 local_irq_disable(); 101 local_irq_disable();
103 preempt_disable(); 102 preempt_disable();
104 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 103 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
105 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 104 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
106 } 105 }
107 EXPORT_SYMBOL(_spin_lock_irq); 106 EXPORT_SYMBOL(_spin_lock_irq);
108 107
109 void __lockfunc _spin_lock_bh(spinlock_t *lock) 108 void __lockfunc _spin_lock_bh(spinlock_t *lock)
110 { 109 {
111 local_bh_disable(); 110 local_bh_disable();
112 preempt_disable(); 111 preempt_disable();
113 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 112 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
114 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 113 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
115 } 114 }
116 EXPORT_SYMBOL(_spin_lock_bh); 115 EXPORT_SYMBOL(_spin_lock_bh);
117 116
118 unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 117 unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
119 { 118 {
120 unsigned long flags; 119 unsigned long flags;
121 120
122 local_irq_save(flags); 121 local_irq_save(flags);
123 preempt_disable(); 122 preempt_disable();
124 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
125 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); 124 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
126 return flags; 125 return flags;
127 } 126 }
128 EXPORT_SYMBOL(_read_lock_irqsave); 127 EXPORT_SYMBOL(_read_lock_irqsave);
129 128
130 void __lockfunc _read_lock_irq(rwlock_t *lock) 129 void __lockfunc _read_lock_irq(rwlock_t *lock)
131 { 130 {
132 local_irq_disable(); 131 local_irq_disable();
133 preempt_disable(); 132 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 133 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); 134 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136 } 135 }
137 EXPORT_SYMBOL(_read_lock_irq); 136 EXPORT_SYMBOL(_read_lock_irq);
138 137
139 void __lockfunc _read_lock_bh(rwlock_t *lock) 138 void __lockfunc _read_lock_bh(rwlock_t *lock)
140 { 139 {
141 local_bh_disable(); 140 local_bh_disable();
142 preempt_disable(); 141 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 142 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); 143 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145 } 144 }
146 EXPORT_SYMBOL(_read_lock_bh); 145 EXPORT_SYMBOL(_read_lock_bh);
147 146
148 unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 147 unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
149 { 148 {
150 unsigned long flags; 149 unsigned long flags;
151 150
152 local_irq_save(flags); 151 local_irq_save(flags);
153 preempt_disable(); 152 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 153 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); 154 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
156 return flags; 155 return flags;
157 } 156 }
158 EXPORT_SYMBOL(_write_lock_irqsave); 157 EXPORT_SYMBOL(_write_lock_irqsave);
159 158
160 void __lockfunc _write_lock_irq(rwlock_t *lock) 159 void __lockfunc _write_lock_irq(rwlock_t *lock)
161 { 160 {
162 local_irq_disable(); 161 local_irq_disable();
163 preempt_disable(); 162 preempt_disable();
164 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 163 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
165 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); 164 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
166 } 165 }
167 EXPORT_SYMBOL(_write_lock_irq); 166 EXPORT_SYMBOL(_write_lock_irq);
168 167
169 void __lockfunc _write_lock_bh(rwlock_t *lock) 168 void __lockfunc _write_lock_bh(rwlock_t *lock)
170 { 169 {
171 local_bh_disable(); 170 local_bh_disable();
172 preempt_disable(); 171 preempt_disable();
173 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 172 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
174 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); 173 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
175 } 174 }
176 EXPORT_SYMBOL(_write_lock_bh); 175 EXPORT_SYMBOL(_write_lock_bh);
177 176
178 void __lockfunc _spin_lock(spinlock_t *lock) 177 void __lockfunc _spin_lock(spinlock_t *lock)
179 { 178 {
180 preempt_disable(); 179 preempt_disable();
181 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 180 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
182 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 181 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
183 } 182 }
184 183
185 EXPORT_SYMBOL(_spin_lock); 184 EXPORT_SYMBOL(_spin_lock);
186 185
187 void __lockfunc _write_lock(rwlock_t *lock) 186 void __lockfunc _write_lock(rwlock_t *lock)
188 { 187 {
189 preempt_disable(); 188 preempt_disable();
190 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 189 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
191 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); 190 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
192 } 191 }
193 192
194 EXPORT_SYMBOL(_write_lock); 193 EXPORT_SYMBOL(_write_lock);
195 194
196 #else /* CONFIG_PREEMPT: */ 195 #else /* CONFIG_PREEMPT: */
197 196
198 /* 197 /*
199 * This could be a long-held lock. We both prepare to spin for a long 198 * This could be a long-held lock. We both prepare to spin for a long
200 * time (making _this_ CPU preemptable if possible), and we also signal 199 * time (making _this_ CPU preemptable if possible), and we also signal
201 * towards that other CPU that it should break the lock ASAP. 200 * towards that other CPU that it should break the lock ASAP.
202 * 201 *
203 * (We do this in a function because inlining it would be excessive.) 202 * (We do this in a function because inlining it would be excessive.)
204 */ 203 */
205 204
206 #define BUILD_LOCK_OPS(op, locktype) \ 205 #define BUILD_LOCK_OPS(op, locktype) \
207 void __lockfunc _##op##_lock(locktype##_t *lock) \ 206 void __lockfunc _##op##_lock(locktype##_t *lock) \
208 { \ 207 { \
209 for (;;) { \ 208 for (;;) { \
210 preempt_disable(); \ 209 preempt_disable(); \
211 if (likely(_raw_##op##_trylock(lock))) \ 210 if (likely(_raw_##op##_trylock(lock))) \
212 break; \ 211 break; \
213 preempt_enable(); \ 212 preempt_enable(); \
214 \ 213 \
215 if (!(lock)->break_lock) \ 214 if (!(lock)->break_lock) \
216 (lock)->break_lock = 1; \ 215 (lock)->break_lock = 1; \
217 while (!op##_can_lock(lock) && (lock)->break_lock) \ 216 while (!op##_can_lock(lock) && (lock)->break_lock) \
218 _raw_##op##_relax(&lock->raw_lock); \ 217 _raw_##op##_relax(&lock->raw_lock); \
219 } \ 218 } \
220 (lock)->break_lock = 0; \ 219 (lock)->break_lock = 0; \
221 } \ 220 } \
222 \ 221 \
223 EXPORT_SYMBOL(_##op##_lock); \ 222 EXPORT_SYMBOL(_##op##_lock); \
224 \ 223 \
225 unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ 224 unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
226 { \ 225 { \
227 unsigned long flags; \ 226 unsigned long flags; \
228 \ 227 \
229 for (;;) { \ 228 for (;;) { \
230 preempt_disable(); \ 229 preempt_disable(); \
231 local_irq_save(flags); \ 230 local_irq_save(flags); \
232 if (likely(_raw_##op##_trylock(lock))) \ 231 if (likely(_raw_##op##_trylock(lock))) \
233 break; \ 232 break; \
234 local_irq_restore(flags); \ 233 local_irq_restore(flags); \
235 preempt_enable(); \ 234 preempt_enable(); \
236 \ 235 \
237 if (!(lock)->break_lock) \ 236 if (!(lock)->break_lock) \
238 (lock)->break_lock = 1; \ 237 (lock)->break_lock = 1; \
239 while (!op##_can_lock(lock) && (lock)->break_lock) \ 238 while (!op##_can_lock(lock) && (lock)->break_lock) \
240 _raw_##op##_relax(&lock->raw_lock); \ 239 _raw_##op##_relax(&lock->raw_lock); \
241 } \ 240 } \
242 (lock)->break_lock = 0; \ 241 (lock)->break_lock = 0; \
243 return flags; \ 242 return flags; \
244 } \ 243 } \
245 \ 244 \
246 EXPORT_SYMBOL(_##op##_lock_irqsave); \ 245 EXPORT_SYMBOL(_##op##_lock_irqsave); \
247 \ 246 \
248 void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ 247 void __lockfunc _##op##_lock_irq(locktype##_t *lock) \
249 { \ 248 { \
250 _##op##_lock_irqsave(lock); \ 249 _##op##_lock_irqsave(lock); \
251 } \ 250 } \
252 \ 251 \
253 EXPORT_SYMBOL(_##op##_lock_irq); \ 252 EXPORT_SYMBOL(_##op##_lock_irq); \
254 \ 253 \
255 void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ 254 void __lockfunc _##op##_lock_bh(locktype##_t *lock) \
256 { \ 255 { \
257 unsigned long flags; \ 256 unsigned long flags; \
258 \ 257 \
259 /* */ \ 258 /* */ \
260 /* Careful: we must exclude softirqs too, hence the */ \ 259 /* Careful: we must exclude softirqs too, hence the */ \
261 /* irq-disabling. We use the generic preemption-aware */ \ 260 /* irq-disabling. We use the generic preemption-aware */ \
262 /* function: */ \ 261 /* function: */ \
263 /**/ \ 262 /**/ \
264 flags = _##op##_lock_irqsave(lock); \ 263 flags = _##op##_lock_irqsave(lock); \
265 local_bh_disable(); \ 264 local_bh_disable(); \
266 local_irq_restore(flags); \ 265 local_irq_restore(flags); \
267 } \ 266 } \
268 \ 267 \
269 EXPORT_SYMBOL(_##op##_lock_bh) 268 EXPORT_SYMBOL(_##op##_lock_bh)
270 269
271 /* 270 /*
272 * Build preemption-friendly versions of the following 271 * Build preemption-friendly versions of the following
273 * lock-spinning functions: 272 * lock-spinning functions:
274 * 273 *
275 * _[spin|read|write]_lock() 274 * _[spin|read|write]_lock()
276 * _[spin|read|write]_lock_irq() 275 * _[spin|read|write]_lock_irq()
277 * _[spin|read|write]_lock_irqsave() 276 * _[spin|read|write]_lock_irqsave()
278 * _[spin|read|write]_lock_bh() 277 * _[spin|read|write]_lock_bh()
279 */ 278 */
280 BUILD_LOCK_OPS(spin, spinlock); 279 BUILD_LOCK_OPS(spin, spinlock);
281 BUILD_LOCK_OPS(read, rwlock); 280 BUILD_LOCK_OPS(read, rwlock);
282 BUILD_LOCK_OPS(write, rwlock); 281 BUILD_LOCK_OPS(write, rwlock);
283 282
284 #endif /* CONFIG_PREEMPT */ 283 #endif /* CONFIG_PREEMPT */
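
The break_lock handshake built by BUILD_LOCK_OPS above exists to support lock-break loops such as cond_resched_lock(); after this commit, callers ask spin_needbreak() rather than poking break_lock directly, and on !CONFIG_GENERIC_LOCKBREAK builds that falls back to __raw_spin_is_contended(). A hedged sketch of the caller-side pattern (the work helpers are placeholders):

/* A lock-break loop in the style of cond_resched_lock() callers. */
#include <linux/sched.h>
#include <linux/spinlock.h>

static int have_more_work(void) { return 0; }   /* placeholder */
static void do_one_unit(void) { }               /* placeholder */

static void drain_work(spinlock_t *lock)
{
        spin_lock(lock);
        while (have_more_work()) {
                do_one_unit();
                if (need_resched() || spin_needbreak(lock)) {
                        spin_unlock(lock);      /* give waiters a chance */
                        cond_resched();
                        spin_lock(lock);
                }
        }
        spin_unlock(lock);
}
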
285 284
286 #ifdef CONFIG_DEBUG_LOCK_ALLOC 285 #ifdef CONFIG_DEBUG_LOCK_ALLOC
287 286
288 void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) 287 void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
289 { 288 {
290 preempt_disable(); 289 preempt_disable();
291 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
292 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
293 } 292 }
294 293
295 EXPORT_SYMBOL(_spin_lock_nested); 294 EXPORT_SYMBOL(_spin_lock_nested);
296 unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 295 unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
297 { 296 {
298 unsigned long flags; 297 unsigned long flags;
299 298
300 local_irq_save(flags); 299 local_irq_save(flags);
301 preempt_disable(); 300 preempt_disable();
302 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 301 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
303 /* 302 /*
304 * On lockdep we don't want the hand-coded irq-enable of 303 * On lockdep we don't want the hand-coded irq-enable of
305 * _raw_spin_lock_flags() code, because lockdep assumes 304 * _raw_spin_lock_flags() code, because lockdep assumes
306 * that interrupts are not re-enabled during lock-acquire: 305 * that interrupts are not re-enabled during lock-acquire:
307 */ 306 */
308 #ifdef CONFIG_LOCKDEP 307 #ifdef CONFIG_LOCKDEP
309 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 308 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
310 #else 309 #else
311 _raw_spin_lock_flags(lock, &flags); 310 _raw_spin_lock_flags(lock, &flags);
312 #endif 311 #endif
313 return flags; 312 return flags;
314 } 313 }
315 314
316 EXPORT_SYMBOL(_spin_lock_irqsave_nested); 315 EXPORT_SYMBOL(_spin_lock_irqsave_nested);
317 316
318 #endif 317 #endif
319 318
320 void __lockfunc _spin_unlock(spinlock_t *lock) 319 void __lockfunc _spin_unlock(spinlock_t *lock)
321 { 320 {
322 spin_release(&lock->dep_map, 1, _RET_IP_); 321 spin_release(&lock->dep_map, 1, _RET_IP_);
323 _raw_spin_unlock(lock); 322 _raw_spin_unlock(lock);
324 preempt_enable(); 323 preempt_enable();
325 } 324 }
326 EXPORT_SYMBOL(_spin_unlock); 325 EXPORT_SYMBOL(_spin_unlock);
327 326
328 void __lockfunc _write_unlock(rwlock_t *lock) 327 void __lockfunc _write_unlock(rwlock_t *lock)
329 { 328 {
330 rwlock_release(&lock->dep_map, 1, _RET_IP_); 329 rwlock_release(&lock->dep_map, 1, _RET_IP_);
331 _raw_write_unlock(lock); 330 _raw_write_unlock(lock);
332 preempt_enable(); 331 preempt_enable();
333 } 332 }
334 EXPORT_SYMBOL(_write_unlock); 333 EXPORT_SYMBOL(_write_unlock);
335 334
336 void __lockfunc _read_unlock(rwlock_t *lock) 335 void __lockfunc _read_unlock(rwlock_t *lock)
337 { 336 {
338 rwlock_release(&lock->dep_map, 1, _RET_IP_); 337 rwlock_release(&lock->dep_map, 1, _RET_IP_);
339 _raw_read_unlock(lock); 338 _raw_read_unlock(lock);
340 preempt_enable(); 339 preempt_enable();
341 } 340 }
342 EXPORT_SYMBOL(_read_unlock); 341 EXPORT_SYMBOL(_read_unlock);
343 342
344 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 343 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
345 { 344 {
346 spin_release(&lock->dep_map, 1, _RET_IP_); 345 spin_release(&lock->dep_map, 1, _RET_IP_);
347 _raw_spin_unlock(lock); 346 _raw_spin_unlock(lock);
348 local_irq_restore(flags); 347 local_irq_restore(flags);
349 preempt_enable(); 348 preempt_enable();
350 } 349 }
351 EXPORT_SYMBOL(_spin_unlock_irqrestore); 350 EXPORT_SYMBOL(_spin_unlock_irqrestore);
352 351
353 void __lockfunc _spin_unlock_irq(spinlock_t *lock) 352 void __lockfunc _spin_unlock_irq(spinlock_t *lock)
354 { 353 {
355 spin_release(&lock->dep_map, 1, _RET_IP_); 354 spin_release(&lock->dep_map, 1, _RET_IP_);
356 _raw_spin_unlock(lock); 355 _raw_spin_unlock(lock);
357 local_irq_enable(); 356 local_irq_enable();
358 preempt_enable(); 357 preempt_enable();
359 } 358 }
360 EXPORT_SYMBOL(_spin_unlock_irq); 359 EXPORT_SYMBOL(_spin_unlock_irq);
361 360
362 void __lockfunc _spin_unlock_bh(spinlock_t *lock) 361 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
363 { 362 {
364 spin_release(&lock->dep_map, 1, _RET_IP_); 363 spin_release(&lock->dep_map, 1, _RET_IP_);
365 _raw_spin_unlock(lock); 364 _raw_spin_unlock(lock);
366 preempt_enable_no_resched(); 365 preempt_enable_no_resched();
367 local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 366 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
368 } 367 }
369 EXPORT_SYMBOL(_spin_unlock_bh); 368 EXPORT_SYMBOL(_spin_unlock_bh);
370 369
371 void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 370 void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
372 { 371 {
373 rwlock_release(&lock->dep_map, 1, _RET_IP_); 372 rwlock_release(&lock->dep_map, 1, _RET_IP_);
374 _raw_read_unlock(lock); 373 _raw_read_unlock(lock);
375 local_irq_restore(flags); 374 local_irq_restore(flags);
376 preempt_enable(); 375 preempt_enable();
377 } 376 }
378 EXPORT_SYMBOL(_read_unlock_irqrestore); 377 EXPORT_SYMBOL(_read_unlock_irqrestore);
379 378
380 void __lockfunc _read_unlock_irq(rwlock_t *lock) 379 void __lockfunc _read_unlock_irq(rwlock_t *lock)
381 { 380 {
382 rwlock_release(&lock->dep_map, 1, _RET_IP_); 381 rwlock_release(&lock->dep_map, 1, _RET_IP_);
383 _raw_read_unlock(lock); 382 _raw_read_unlock(lock);
384 local_irq_enable(); 383 local_irq_enable();
385 preempt_enable(); 384 preempt_enable();
386 } 385 }
387 EXPORT_SYMBOL(_read_unlock_irq); 386 EXPORT_SYMBOL(_read_unlock_irq);
388 387
389 void __lockfunc _read_unlock_bh(rwlock_t *lock) 388 void __lockfunc _read_unlock_bh(rwlock_t *lock)
390 { 389 {
391 rwlock_release(&lock->dep_map, 1, _RET_IP_); 390 rwlock_release(&lock->dep_map, 1, _RET_IP_);
392 _raw_read_unlock(lock); 391 _raw_read_unlock(lock);
393 preempt_enable_no_resched(); 392 preempt_enable_no_resched();
394 local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 393 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
395 } 394 }
396 EXPORT_SYMBOL(_read_unlock_bh); 395 EXPORT_SYMBOL(_read_unlock_bh);
397 396
398 void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 397 void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
399 { 398 {
400 rwlock_release(&lock->dep_map, 1, _RET_IP_); 399 rwlock_release(&lock->dep_map, 1, _RET_IP_);
401 _raw_write_unlock(lock); 400 _raw_write_unlock(lock);
402 local_irq_restore(flags); 401 local_irq_restore(flags);
403 preempt_enable(); 402 preempt_enable();
404 } 403 }
405 EXPORT_SYMBOL(_write_unlock_irqrestore); 404 EXPORT_SYMBOL(_write_unlock_irqrestore);
406 405
407 void __lockfunc _write_unlock_irq(rwlock_t *lock) 406 void __lockfunc _write_unlock_irq(rwlock_t *lock)
408 { 407 {
409 rwlock_release(&lock->dep_map, 1, _RET_IP_); 408 rwlock_release(&lock->dep_map, 1, _RET_IP_);
410 _raw_write_unlock(lock); 409 _raw_write_unlock(lock);
411 local_irq_enable(); 410 local_irq_enable();
412 preempt_enable(); 411 preempt_enable();
413 } 412 }
414 EXPORT_SYMBOL(_write_unlock_irq); 413 EXPORT_SYMBOL(_write_unlock_irq);
415 414
416 void __lockfunc _write_unlock_bh(rwlock_t *lock) 415 void __lockfunc _write_unlock_bh(rwlock_t *lock)
417 { 416 {
418 rwlock_release(&lock->dep_map, 1, _RET_IP_); 417 rwlock_release(&lock->dep_map, 1, _RET_IP_);
419 _raw_write_unlock(lock); 418 _raw_write_unlock(lock);
420 preempt_enable_no_resched(); 419 preempt_enable_no_resched();
421 local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 420 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
422 } 421 }
423 EXPORT_SYMBOL(_write_unlock_bh); 422 EXPORT_SYMBOL(_write_unlock_bh);
424 423
425 int __lockfunc _spin_trylock_bh(spinlock_t *lock) 424 int __lockfunc _spin_trylock_bh(spinlock_t *lock)
426 { 425 {
427 local_bh_disable(); 426 local_bh_disable();
428 preempt_disable(); 427 preempt_disable();
429 if (_raw_spin_trylock(lock)) { 428 if (_raw_spin_trylock(lock)) {
430 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); 429 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
431 return 1; 430 return 1;
432 } 431 }
433 432
434 preempt_enable_no_resched(); 433 preempt_enable_no_resched();
435 local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 434 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
436 return 0; 435 return 0;
437 } 436 }
438 EXPORT_SYMBOL(_spin_trylock_bh); 437 EXPORT_SYMBOL(_spin_trylock_bh);
439 438
440 int in_lock_functions(unsigned long addr) 439 int in_lock_functions(unsigned long addr)
441 { 440 {
442 /* Linker adds these: start and end of __lockfunc functions */ 441 /* Linker adds these: start and end of __lockfunc functions */
443 extern char __lock_text_start[], __lock_text_end[]; 442 extern char __lock_text_start[], __lock_text_end[];
444 443
445 return addr >= (unsigned long)__lock_text_start 444 return addr >= (unsigned long)__lock_text_start
446 && addr < (unsigned long)__lock_text_end; 445 && addr < (unsigned long)__lock_text_end;
447 } 446 }
448 EXPORT_SYMBOL(in_lock_functions); 447 EXPORT_SYMBOL(in_lock_functions);
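
in_lock_functions() is what lets an architecture's profile_pc() (mentioned in the header comment of this file) attribute a sample taken inside one of these __lockfunc helpers to its caller instead. A rough, hedged sketch of that kind of check; recovering the caller's PC is architecture-specific and only stubbed here:

/* Sketch only: how a profile_pc()-style helper might use in_lock_functions(). */
#include <linux/spinlock.h>
#include <asm/ptrace.h>

static unsigned long caller_pc_from_stack(struct pt_regs *regs)
{
        return 0;       /* placeholder for the arch-specific unwind step */
}

unsigned long example_profile_pc(struct pt_regs *regs)
{
        unsigned long pc = instruction_pointer(regs);

        if (in_lock_functions(pc))              /* sampled inside a __lockfunc helper */
                pc = caller_pc_from_stack(regs);
        return pc;
}
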
449 448
1 /* 1 /*
2 * linux/mm/memory.c 2 * linux/mm/memory.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * demand-loading started 01.12.91 - seems it is high on the list of 8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus 9 * things wanted, and it should be easy to implement. - Linus
10 */ 10 */
11 11
12 /* 12 /*
13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared 13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
14 * pages started 02.12.91, seems to work. - Linus. 14 * pages started 02.12.91, seems to work. - Linus.
15 * 15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it 16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as 17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see. 18 * far as I could see.
19 * 19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them. 20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */ 21 */
22 22
23 /* 23 /*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and 24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well.. 25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. 26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now. 27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root. 28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */ 29 */
30 30
31 /* 31 /*
32 * 05.04.94 - Multi-page memory management added for v1.1. 32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk) 33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 * 34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG 35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de) 36 * (Gerhard.Wichert@pdb.siemens.de)
37 * 37 *
38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) 38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39 */ 39 */
40 40
41 #include <linux/kernel_stat.h> 41 #include <linux/kernel_stat.h>
42 #include <linux/mm.h> 42 #include <linux/mm.h>
43 #include <linux/hugetlb.h> 43 #include <linux/hugetlb.h>
44 #include <linux/mman.h> 44 #include <linux/mman.h>
45 #include <linux/swap.h> 45 #include <linux/swap.h>
46 #include <linux/highmem.h> 46 #include <linux/highmem.h>
47 #include <linux/pagemap.h> 47 #include <linux/pagemap.h>
48 #include <linux/rmap.h> 48 #include <linux/rmap.h>
49 #include <linux/module.h> 49 #include <linux/module.h>
50 #include <linux/delayacct.h> 50 #include <linux/delayacct.h>
51 #include <linux/init.h> 51 #include <linux/init.h>
52 #include <linux/writeback.h> 52 #include <linux/writeback.h>
53 53
54 #include <asm/pgalloc.h> 54 #include <asm/pgalloc.h>
55 #include <asm/uaccess.h> 55 #include <asm/uaccess.h>
56 #include <asm/tlb.h> 56 #include <asm/tlb.h>
57 #include <asm/tlbflush.h> 57 #include <asm/tlbflush.h>
58 #include <asm/pgtable.h> 58 #include <asm/pgtable.h>
59 59
60 #include <linux/swapops.h> 60 #include <linux/swapops.h>
61 #include <linux/elf.h> 61 #include <linux/elf.h>
62 62
63 #ifndef CONFIG_NEED_MULTIPLE_NODES 63 #ifndef CONFIG_NEED_MULTIPLE_NODES
64 /* use the per-pgdat data instead for discontigmem - mbligh */ 64 /* use the per-pgdat data instead for discontigmem - mbligh */
65 unsigned long max_mapnr; 65 unsigned long max_mapnr;
66 struct page *mem_map; 66 struct page *mem_map;
67 67
68 EXPORT_SYMBOL(max_mapnr); 68 EXPORT_SYMBOL(max_mapnr);
69 EXPORT_SYMBOL(mem_map); 69 EXPORT_SYMBOL(mem_map);
70 #endif 70 #endif
71 71
72 unsigned long num_physpages; 72 unsigned long num_physpages;
73 /* 73 /*
74 * A number of key systems in x86 including ioremap() rely on the assumption 74 * A number of key systems in x86 including ioremap() rely on the assumption
75 * that high_memory defines the upper bound on direct map memory, then end 75 * that high_memory defines the upper bound on direct map memory, then end
76 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and 76 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
77 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL 77 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
78 * and ZONE_HIGHMEM. 78 * and ZONE_HIGHMEM.
79 */ 79 */
80 void * high_memory; 80 void * high_memory;
81 81
82 EXPORT_SYMBOL(num_physpages); 82 EXPORT_SYMBOL(num_physpages);
83 EXPORT_SYMBOL(high_memory); 83 EXPORT_SYMBOL(high_memory);
84 84
85 int randomize_va_space __read_mostly = 1; 85 int randomize_va_space __read_mostly = 1;
86 86
87 static int __init disable_randmaps(char *s) 87 static int __init disable_randmaps(char *s)
88 { 88 {
89 randomize_va_space = 0; 89 randomize_va_space = 0;
90 return 1; 90 return 1;
91 } 91 }
92 __setup("norandmaps", disable_randmaps); 92 __setup("norandmaps", disable_randmaps);
93 93
94 94
95 /* 95 /*
96 * If a p?d_bad entry is found while walking page tables, report 96 * If a p?d_bad entry is found while walking page tables, report
97 * the error, before resetting entry to p?d_none. Usually (but 97 * the error, before resetting entry to p?d_none. Usually (but
98 * very seldom) called out from the p?d_none_or_clear_bad macros. 98 * very seldom) called out from the p?d_none_or_clear_bad macros.
99 */ 99 */
100 100
101 void pgd_clear_bad(pgd_t *pgd) 101 void pgd_clear_bad(pgd_t *pgd)
102 { 102 {
103 pgd_ERROR(*pgd); 103 pgd_ERROR(*pgd);
104 pgd_clear(pgd); 104 pgd_clear(pgd);
105 } 105 }
106 106
107 void pud_clear_bad(pud_t *pud) 107 void pud_clear_bad(pud_t *pud)
108 { 108 {
109 pud_ERROR(*pud); 109 pud_ERROR(*pud);
110 pud_clear(pud); 110 pud_clear(pud);
111 } 111 }
112 112
113 void pmd_clear_bad(pmd_t *pmd) 113 void pmd_clear_bad(pmd_t *pmd)
114 { 114 {
115 pmd_ERROR(*pmd); 115 pmd_ERROR(*pmd);
116 pmd_clear(pmd); 116 pmd_clear(pmd);
117 } 117 }
118 118
119 /* 119 /*
120 * Note: this doesn't free the actual pages themselves. That 120 * Note: this doesn't free the actual pages themselves. That
121 * has been handled earlier when unmapping all the memory regions. 121 * has been handled earlier when unmapping all the memory regions.
122 */ 122 */
123 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) 123 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
124 { 124 {
125 struct page *page = pmd_page(*pmd); 125 struct page *page = pmd_page(*pmd);
126 pmd_clear(pmd); 126 pmd_clear(pmd);
127 pte_lock_deinit(page); 127 pte_lock_deinit(page);
128 pte_free_tlb(tlb, page); 128 pte_free_tlb(tlb, page);
129 dec_zone_page_state(page, NR_PAGETABLE); 129 dec_zone_page_state(page, NR_PAGETABLE);
130 tlb->mm->nr_ptes--; 130 tlb->mm->nr_ptes--;
131 } 131 }
132 132
133 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 133 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
134 unsigned long addr, unsigned long end, 134 unsigned long addr, unsigned long end,
135 unsigned long floor, unsigned long ceiling) 135 unsigned long floor, unsigned long ceiling)
136 { 136 {
137 pmd_t *pmd; 137 pmd_t *pmd;
138 unsigned long next; 138 unsigned long next;
139 unsigned long start; 139 unsigned long start;
140 140
141 start = addr; 141 start = addr;
142 pmd = pmd_offset(pud, addr); 142 pmd = pmd_offset(pud, addr);
143 do { 143 do {
144 next = pmd_addr_end(addr, end); 144 next = pmd_addr_end(addr, end);
145 if (pmd_none_or_clear_bad(pmd)) 145 if (pmd_none_or_clear_bad(pmd))
146 continue; 146 continue;
147 free_pte_range(tlb, pmd); 147 free_pte_range(tlb, pmd);
148 } while (pmd++, addr = next, addr != end); 148 } while (pmd++, addr = next, addr != end);
149 149
150 start &= PUD_MASK; 150 start &= PUD_MASK;
151 if (start < floor) 151 if (start < floor)
152 return; 152 return;
153 if (ceiling) { 153 if (ceiling) {
154 ceiling &= PUD_MASK; 154 ceiling &= PUD_MASK;
155 if (!ceiling) 155 if (!ceiling)
156 return; 156 return;
157 } 157 }
158 if (end - 1 > ceiling - 1) 158 if (end - 1 > ceiling - 1)
159 return; 159 return;
160 160
161 pmd = pmd_offset(pud, start); 161 pmd = pmd_offset(pud, start);
162 pud_clear(pud); 162 pud_clear(pud);
163 pmd_free_tlb(tlb, pmd); 163 pmd_free_tlb(tlb, pmd);
164 } 164 }
165 165
166 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 166 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
167 unsigned long addr, unsigned long end, 167 unsigned long addr, unsigned long end,
168 unsigned long floor, unsigned long ceiling) 168 unsigned long floor, unsigned long ceiling)
169 { 169 {
170 pud_t *pud; 170 pud_t *pud;
171 unsigned long next; 171 unsigned long next;
172 unsigned long start; 172 unsigned long start;
173 173
174 start = addr; 174 start = addr;
175 pud = pud_offset(pgd, addr); 175 pud = pud_offset(pgd, addr);
176 do { 176 do {
177 next = pud_addr_end(addr, end); 177 next = pud_addr_end(addr, end);
178 if (pud_none_or_clear_bad(pud)) 178 if (pud_none_or_clear_bad(pud))
179 continue; 179 continue;
180 free_pmd_range(tlb, pud, addr, next, floor, ceiling); 180 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
181 } while (pud++, addr = next, addr != end); 181 } while (pud++, addr = next, addr != end);
182 182
183 start &= PGDIR_MASK; 183 start &= PGDIR_MASK;
184 if (start < floor) 184 if (start < floor)
185 return; 185 return;
186 if (ceiling) { 186 if (ceiling) {
187 ceiling &= PGDIR_MASK; 187 ceiling &= PGDIR_MASK;
188 if (!ceiling) 188 if (!ceiling)
189 return; 189 return;
190 } 190 }
191 if (end - 1 > ceiling - 1) 191 if (end - 1 > ceiling - 1)
192 return; 192 return;
193 193
194 pud = pud_offset(pgd, start); 194 pud = pud_offset(pgd, start);
195 pgd_clear(pgd); 195 pgd_clear(pgd);
196 pud_free_tlb(tlb, pud); 196 pud_free_tlb(tlb, pud);
197 } 197 }
198 198
199 /* 199 /*
200 * This function frees user-level page tables of a process. 200 * This function frees user-level page tables of a process.
201 * 201 *
202 * Must be called with pagetable lock held. 202 * Must be called with pagetable lock held.
203 */ 203 */
204 void free_pgd_range(struct mmu_gather **tlb, 204 void free_pgd_range(struct mmu_gather **tlb,
205 unsigned long addr, unsigned long end, 205 unsigned long addr, unsigned long end,
206 unsigned long floor, unsigned long ceiling) 206 unsigned long floor, unsigned long ceiling)
207 { 207 {
208 pgd_t *pgd; 208 pgd_t *pgd;
209 unsigned long next; 209 unsigned long next;
210 unsigned long start; 210 unsigned long start;
211 211
212 /* 212 /*
213 * The next few lines have given us lots of grief... 213 * The next few lines have given us lots of grief...
214 * 214 *
215 * Why are we testing PMD* at this top level? Because often 215 * Why are we testing PMD* at this top level? Because often
216 * there will be no work to do at all, and we'd prefer not to 216 * there will be no work to do at all, and we'd prefer not to
217 * go all the way down to the bottom just to discover that. 217 * go all the way down to the bottom just to discover that.
218 * 218 *
219 * Why all these "- 1"s? Because 0 represents both the bottom 219 * Why all these "- 1"s? Because 0 represents both the bottom
220 * of the address space and the top of it (using -1 for the 220 * of the address space and the top of it (using -1 for the
221 * top wouldn't help much: the masks would do the wrong thing). 221 * top wouldn't help much: the masks would do the wrong thing).
222 * The rule is that addr 0 and floor 0 refer to the bottom of 222 * The rule is that addr 0 and floor 0 refer to the bottom of
223 * the address space, but end 0 and ceiling 0 refer to the top. 223 * the address space, but end 0 and ceiling 0 refer to the top.
224 * Comparisons need to use "end - 1" and "ceiling - 1" (though 224 * Comparisons need to use "end - 1" and "ceiling - 1" (though
225 * that end 0 case should be mythical). 225 * that end 0 case should be mythical).
226 * 226 *
227 * Wherever addr is brought up or ceiling brought down, we must 227 * Wherever addr is brought up or ceiling brought down, we must
228 * be careful to reject "the opposite 0" before it confuses the 228 * be careful to reject "the opposite 0" before it confuses the
229 * subsequent tests. But what about where end is brought down 229 * subsequent tests. But what about where end is brought down
230 * by PMD_SIZE below? no, end can't go down to 0 there. 230 * by PMD_SIZE below? no, end can't go down to 0 there.
231 * 231 *
232 * Whereas we round start (addr) and ceiling down, by different 232 * Whereas we round start (addr) and ceiling down, by different
233 * masks at different levels, in order to test whether a table 233 * masks at different levels, in order to test whether a table
234 * now has no other vmas using it, so can be freed, we don't 234 * now has no other vmas using it, so can be freed, we don't
235 * bother to round floor or end up - the tests don't need that. 235 * bother to round floor or end up - the tests don't need that.
236 */ 236 */
237 237
238 addr &= PMD_MASK; 238 addr &= PMD_MASK;
239 if (addr < floor) { 239 if (addr < floor) {
240 addr += PMD_SIZE; 240 addr += PMD_SIZE;
241 if (!addr) 241 if (!addr)
242 return; 242 return;
243 } 243 }
244 if (ceiling) { 244 if (ceiling) {
245 ceiling &= PMD_MASK; 245 ceiling &= PMD_MASK;
246 if (!ceiling) 246 if (!ceiling)
247 return; 247 return;
248 } 248 }
249 if (end - 1 > ceiling - 1) 249 if (end - 1 > ceiling - 1)
250 end -= PMD_SIZE; 250 end -= PMD_SIZE;
251 if (addr > end - 1) 251 if (addr > end - 1)
252 return; 252 return;
253 253
254 start = addr; 254 start = addr;
255 pgd = pgd_offset((*tlb)->mm, addr); 255 pgd = pgd_offset((*tlb)->mm, addr);
256 do { 256 do {
257 next = pgd_addr_end(addr, end); 257 next = pgd_addr_end(addr, end);
258 if (pgd_none_or_clear_bad(pgd)) 258 if (pgd_none_or_clear_bad(pgd))
259 continue; 259 continue;
260 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 260 free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
261 } while (pgd++, addr = next, addr != end); 261 } while (pgd++, addr = next, addr != end);
262 } 262 }
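
A worked example of the "end - 1 > ceiling - 1" trick discussed in the comment above: ceiling == 0 stands for the top of the address space, and subtracting 1 makes it compare as the largest possible value, so a top-of-space ceiling never truncates end. Plain userspace arithmetic with invented addresses:

/* Illustrates the wraparound comparison used by free_pgd_range(). */
#include <stdio.h>

int main(void)
{
        unsigned long end = 0x7fff0000UL;         /* invented unmap end */
        unsigned long ceiling_top = 0;            /* 0 means "top of address space" */
        unsigned long ceiling_low = 0x40000000UL; /* a real, lower ceiling */

        /* 0 - 1 wraps to ULONG_MAX, so a top-of-space ceiling never truncates */
        printf("%d\n", end - 1 > ceiling_top - 1);   /* prints 0 */
        /* a lower ceiling does force "end -= PMD_SIZE" in free_pgd_range() */
        printf("%d\n", end - 1 > ceiling_low - 1);   /* prints 1 */
        return 0;
}
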
263 263
264 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, 264 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
265 unsigned long floor, unsigned long ceiling) 265 unsigned long floor, unsigned long ceiling)
266 { 266 {
267 while (vma) { 267 while (vma) {
268 struct vm_area_struct *next = vma->vm_next; 268 struct vm_area_struct *next = vma->vm_next;
269 unsigned long addr = vma->vm_start; 269 unsigned long addr = vma->vm_start;
270 270
271 /* 271 /*
272 * Hide vma from rmap and vmtruncate before freeing pgtables 272 * Hide vma from rmap and vmtruncate before freeing pgtables
273 */ 273 */
274 anon_vma_unlink(vma); 274 anon_vma_unlink(vma);
275 unlink_file_vma(vma); 275 unlink_file_vma(vma);
276 276
277 if (is_vm_hugetlb_page(vma)) { 277 if (is_vm_hugetlb_page(vma)) {
278 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 278 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
279 floor, next? next->vm_start: ceiling); 279 floor, next? next->vm_start: ceiling);
280 } else { 280 } else {
281 /* 281 /*
282 * Optimization: gather nearby vmas into one call down 282 * Optimization: gather nearby vmas into one call down
283 */ 283 */
284 while (next && next->vm_start <= vma->vm_end + PMD_SIZE 284 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
285 && !is_vm_hugetlb_page(next)) { 285 && !is_vm_hugetlb_page(next)) {
286 vma = next; 286 vma = next;
287 next = vma->vm_next; 287 next = vma->vm_next;
288 anon_vma_unlink(vma); 288 anon_vma_unlink(vma);
289 unlink_file_vma(vma); 289 unlink_file_vma(vma);
290 } 290 }
291 free_pgd_range(tlb, addr, vma->vm_end, 291 free_pgd_range(tlb, addr, vma->vm_end,
292 floor, next? next->vm_start: ceiling); 292 floor, next? next->vm_start: ceiling);
293 } 293 }
294 vma = next; 294 vma = next;
295 } 295 }
296 } 296 }
297 297
298 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 298 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
299 { 299 {
300 struct page *new = pte_alloc_one(mm, address); 300 struct page *new = pte_alloc_one(mm, address);
301 if (!new) 301 if (!new)
302 return -ENOMEM; 302 return -ENOMEM;
303 303
304 pte_lock_init(new); 304 pte_lock_init(new);
305 spin_lock(&mm->page_table_lock); 305 spin_lock(&mm->page_table_lock);
306 if (pmd_present(*pmd)) { /* Another has populated it */ 306 if (pmd_present(*pmd)) { /* Another has populated it */
307 pte_lock_deinit(new); 307 pte_lock_deinit(new);
308 pte_free(new); 308 pte_free(new);
309 } else { 309 } else {
310 mm->nr_ptes++; 310 mm->nr_ptes++;
311 inc_zone_page_state(new, NR_PAGETABLE); 311 inc_zone_page_state(new, NR_PAGETABLE);
312 pmd_populate(mm, pmd, new); 312 pmd_populate(mm, pmd, new);
313 } 313 }
314 spin_unlock(&mm->page_table_lock); 314 spin_unlock(&mm->page_table_lock);
315 return 0; 315 return 0;
316 } 316 }
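__pte_alloc() above allocates the new page table before taking mm->page_table_lock and only then re-checks pmd_present(), so a racing thread that populated the pmd first simply wins and the loser frees its allocation. A hypothetical user-space sketch of that allocate-then-recheck shape (names invented, a pthread mutex standing in for the kernel spinlock):

#include <pthread.h>
#include <stdlib.h>

struct slot {
	pthread_mutex_t lock;	/* stands in for mm->page_table_lock */
	void *table;
};

static int populate_slot(struct slot *s, size_t size)
{
	void *new = malloc(size);	/* 1. allocate before taking the lock */
	if (!new)
		return -1;

	pthread_mutex_lock(&s->lock);
	if (s->table)			/* 2. another thread already populated it */
		free(new);
	else
		s->table = new;		/* 3. we were first: publish the table */
	pthread_mutex_unlock(&s->lock);
	return 0;
}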
317 317
318 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) 318 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
319 { 319 {
320 pte_t *new = pte_alloc_one_kernel(&init_mm, address); 320 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
321 if (!new) 321 if (!new)
322 return -ENOMEM; 322 return -ENOMEM;
323 323
324 spin_lock(&init_mm.page_table_lock); 324 spin_lock(&init_mm.page_table_lock);
325 if (pmd_present(*pmd)) /* Another has populated it */ 325 if (pmd_present(*pmd)) /* Another has populated it */
326 pte_free_kernel(new); 326 pte_free_kernel(new);
327 else 327 else
328 pmd_populate_kernel(&init_mm, pmd, new); 328 pmd_populate_kernel(&init_mm, pmd, new);
329 spin_unlock(&init_mm.page_table_lock); 329 spin_unlock(&init_mm.page_table_lock);
330 return 0; 330 return 0;
331 } 331 }
332 332
333 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) 333 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
334 { 334 {
335 if (file_rss) 335 if (file_rss)
336 add_mm_counter(mm, file_rss, file_rss); 336 add_mm_counter(mm, file_rss, file_rss);
337 if (anon_rss) 337 if (anon_rss)
338 add_mm_counter(mm, anon_rss, anon_rss); 338 add_mm_counter(mm, anon_rss, anon_rss);
339 } 339 }
340 340
341 /* 341 /*
342 * This function is called to print an error when a bad pte 342 * This function is called to print an error when a bad pte
343 * is found. For example, we might have a PFN-mapped pte in 343 * is found. For example, we might have a PFN-mapped pte in
344 * a region that doesn't allow it. 344 * a region that doesn't allow it.
345 * 345 *
346 * The calling function must still handle the error. 346 * The calling function must still handle the error.
347 */ 347 */
348 void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) 348 void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
349 { 349 {
350 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 350 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
351 "vm_flags = %lx, vaddr = %lx\n", 351 "vm_flags = %lx, vaddr = %lx\n",
352 (long long)pte_val(pte), 352 (long long)pte_val(pte),
353 (vma->vm_mm == current->mm ? current->comm : "???"), 353 (vma->vm_mm == current->mm ? current->comm : "???"),
354 vma->vm_flags, vaddr); 354 vma->vm_flags, vaddr);
355 dump_stack(); 355 dump_stack();
356 } 356 }
357 357
358 static inline int is_cow_mapping(unsigned int flags) 358 static inline int is_cow_mapping(unsigned int flags)
359 { 359 {
360 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 360 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
361 } 361 }
362 362
363 /* 363 /*
364 * This function gets the "struct page" associated with a pte. 364 * This function gets the "struct page" associated with a pte.
365 * 365 *
366 * NOTE! Some mappings do not have "struct pages". A raw PFN mapping 366 * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
367 * will have each page table entry just pointing to a raw page frame 367 * will have each page table entry just pointing to a raw page frame
368 * number, and as far as the VM layer is concerned, those do not have 368 * number, and as far as the VM layer is concerned, those do not have
369 * pages associated with them - even if the PFN might point to memory 369 * pages associated with them - even if the PFN might point to memory
370 * that otherwise is perfectly fine and has a "struct page". 370 * that otherwise is perfectly fine and has a "struct page".
371 * 371 *
372 * The way we recognize those mappings is through the rules set up 372 * The way we recognize those mappings is through the rules set up
373 * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, 373 * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
374 * and the vm_pgoff will point to the first PFN mapped: thus every 374 * and the vm_pgoff will point to the first PFN mapped: thus every
375 * page that is a raw mapping will always honor the rule 375 * page that is a raw mapping will always honor the rule
376 * 376 *
377 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) 377 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
378 * 378 *
379 * and if that isn't true, the page has been COW'ed (in which case it 379 * and if that isn't true, the page has been COW'ed (in which case it
380 * _does_ have a "struct page" associated with it even if it is in a 380 * _does_ have a "struct page" associated with it even if it is in a
381 * VM_PFNMAP range). 381 * VM_PFNMAP range).
382 */ 382 */
383 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) 383 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
384 { 384 {
385 unsigned long pfn = pte_pfn(pte); 385 unsigned long pfn = pte_pfn(pte);
386 386
387 if (unlikely(vma->vm_flags & VM_PFNMAP)) { 387 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
388 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; 388 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
389 if (pfn == vma->vm_pgoff + off) 389 if (pfn == vma->vm_pgoff + off)
390 return NULL; 390 return NULL;
391 if (!is_cow_mapping(vma->vm_flags)) 391 if (!is_cow_mapping(vma->vm_flags))
392 return NULL; 392 return NULL;
393 } 393 }
394 394
395 #ifdef CONFIG_DEBUG_VM 395 #ifdef CONFIG_DEBUG_VM
396 /* 396 /*
397 * Add some anal sanity checks for now. Eventually, 397 * Add some anal sanity checks for now. Eventually,
398 * we should just do "return pfn_to_page(pfn)", but 398 * we should just do "return pfn_to_page(pfn)", but
399 * in the meantime we check that we get a valid pfn, 399 * in the meantime we check that we get a valid pfn,
400 * and that the resulting page looks ok. 400 * and that the resulting page looks ok.
401 */ 401 */
402 if (unlikely(!pfn_valid(pfn))) { 402 if (unlikely(!pfn_valid(pfn))) {
403 print_bad_pte(vma, pte, addr); 403 print_bad_pte(vma, pte, addr);
404 return NULL; 404 return NULL;
405 } 405 }
406 #endif 406 #endif
407 407
408 /* 408 /*
409 * NOTE! We still have PageReserved() pages in the page 409 * NOTE! We still have PageReserved() pages in the page
410 * tables. 410 * tables.
411 * 411 *
412 * The ZERO_PAGE() pages and various VDSO mappings can 412 * The ZERO_PAGE() pages and various VDSO mappings can
413 * cause them to exist. 413 * cause them to exist.
414 */ 414 */
415 return pfn_to_page(pfn); 415 return pfn_to_page(pfn);
416 } 416 }
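The linearity rule quoted in the comment above can be checked with concrete numbers. A hypothetical worked example, assuming 4K pages (all addresses invented):

#include <assert.h>

int main(void)
{
	unsigned long page_shift = 12;			/* assumed 4K pages */
	unsigned long vm_start   = 0x40000000UL;
	unsigned long vm_pgoff   = 0x80000UL;		/* first mapped pfn */
	unsigned long addr       = 0x40003000UL;	/* 3 pages into the vma */

	unsigned long off = (addr - vm_start) >> page_shift;	/* == 3 */
	unsigned long expected_pfn = vm_pgoff + off;		/* == 0x80003 */

	assert(expected_pfn == 0x80003UL);
	/* A pte at this address whose pfn is 0x80003 is a raw mapping with no
	 * struct page; any other pfn means the page was COWed and has one. */
	return 0;
}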
417 417
418 /* 418 /*
419 * copy one vm_area from one task to the other. Assumes the page tables 419 * copy one vm_area from one task to the other. Assumes the page tables
420 * already present in the new task to be cleared in the whole range 420 * already present in the new task to be cleared in the whole range
421 * covered by this vma. 421 * covered by this vma.
422 */ 422 */
423 423
424 static inline void 424 static inline void
425 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 425 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
426 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 426 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
427 unsigned long addr, int *rss) 427 unsigned long addr, int *rss)
428 { 428 {
429 unsigned long vm_flags = vma->vm_flags; 429 unsigned long vm_flags = vma->vm_flags;
430 pte_t pte = *src_pte; 430 pte_t pte = *src_pte;
431 struct page *page; 431 struct page *page;
432 432
433 /* pte contains position in swap or file, so copy. */ 433 /* pte contains position in swap or file, so copy. */
434 if (unlikely(!pte_present(pte))) { 434 if (unlikely(!pte_present(pte))) {
435 if (!pte_file(pte)) { 435 if (!pte_file(pte)) {
436 swp_entry_t entry = pte_to_swp_entry(pte); 436 swp_entry_t entry = pte_to_swp_entry(pte);
437 437
438 swap_duplicate(entry); 438 swap_duplicate(entry);
439 /* make sure dst_mm is on swapoff's mmlist. */ 439 /* make sure dst_mm is on swapoff's mmlist. */
440 if (unlikely(list_empty(&dst_mm->mmlist))) { 440 if (unlikely(list_empty(&dst_mm->mmlist))) {
441 spin_lock(&mmlist_lock); 441 spin_lock(&mmlist_lock);
442 if (list_empty(&dst_mm->mmlist)) 442 if (list_empty(&dst_mm->mmlist))
443 list_add(&dst_mm->mmlist, 443 list_add(&dst_mm->mmlist,
444 &src_mm->mmlist); 444 &src_mm->mmlist);
445 spin_unlock(&mmlist_lock); 445 spin_unlock(&mmlist_lock);
446 } 446 }
447 if (is_write_migration_entry(entry) && 447 if (is_write_migration_entry(entry) &&
448 is_cow_mapping(vm_flags)) { 448 is_cow_mapping(vm_flags)) {
449 /* 449 /*
450 * COW mappings require pages in both parent 450 * COW mappings require pages in both parent
451 * and child to be set to read. 451 * and child to be set to read.
452 */ 452 */
453 make_migration_entry_read(&entry); 453 make_migration_entry_read(&entry);
454 pte = swp_entry_to_pte(entry); 454 pte = swp_entry_to_pte(entry);
455 set_pte_at(src_mm, addr, src_pte, pte); 455 set_pte_at(src_mm, addr, src_pte, pte);
456 } 456 }
457 } 457 }
458 goto out_set_pte; 458 goto out_set_pte;
459 } 459 }
460 460
461 /* 461 /*
462 * If it's a COW mapping, write protect it both 462 * If it's a COW mapping, write protect it both
463 * in the parent and the child 463 * in the parent and the child
464 */ 464 */
465 if (is_cow_mapping(vm_flags)) { 465 if (is_cow_mapping(vm_flags)) {
466 ptep_set_wrprotect(src_mm, addr, src_pte); 466 ptep_set_wrprotect(src_mm, addr, src_pte);
467 pte = pte_wrprotect(pte); 467 pte = pte_wrprotect(pte);
468 } 468 }
469 469
470 /* 470 /*
471 * If it's a shared mapping, mark it clean in 471 * If it's a shared mapping, mark it clean in
472 * the child 472 * the child
473 */ 473 */
474 if (vm_flags & VM_SHARED) 474 if (vm_flags & VM_SHARED)
475 pte = pte_mkclean(pte); 475 pte = pte_mkclean(pte);
476 pte = pte_mkold(pte); 476 pte = pte_mkold(pte);
477 477
478 page = vm_normal_page(vma, addr, pte); 478 page = vm_normal_page(vma, addr, pte);
479 if (page) { 479 if (page) {
480 get_page(page); 480 get_page(page);
481 page_dup_rmap(page, vma, addr); 481 page_dup_rmap(page, vma, addr);
482 rss[!!PageAnon(page)]++; 482 rss[!!PageAnon(page)]++;
483 } 483 }
484 484
485 out_set_pte: 485 out_set_pte:
486 set_pte_at(dst_mm, addr, dst_pte, pte); 486 set_pte_at(dst_mm, addr, dst_pte, pte);
487 } 487 }
488 488
489 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 489 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
490 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 490 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
491 unsigned long addr, unsigned long end) 491 unsigned long addr, unsigned long end)
492 { 492 {
493 pte_t *src_pte, *dst_pte; 493 pte_t *src_pte, *dst_pte;
494 spinlock_t *src_ptl, *dst_ptl; 494 spinlock_t *src_ptl, *dst_ptl;
495 int progress = 0; 495 int progress = 0;
496 int rss[2]; 496 int rss[2];
497 497
498 again: 498 again:
499 rss[1] = rss[0] = 0; 499 rss[1] = rss[0] = 0;
500 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 500 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
501 if (!dst_pte) 501 if (!dst_pte)
502 return -ENOMEM; 502 return -ENOMEM;
503 src_pte = pte_offset_map_nested(src_pmd, addr); 503 src_pte = pte_offset_map_nested(src_pmd, addr);
504 src_ptl = pte_lockptr(src_mm, src_pmd); 504 src_ptl = pte_lockptr(src_mm, src_pmd);
505 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 505 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
506 arch_enter_lazy_mmu_mode(); 506 arch_enter_lazy_mmu_mode();
507 507
508 do { 508 do {
509 /* 509 /*
510 * We are holding two locks at this point - either of them 510 * We are holding two locks at this point - either of them
511 * could generate latencies in another task on another CPU. 511 * could generate latencies in another task on another CPU.
512 */ 512 */
513 if (progress >= 32) { 513 if (progress >= 32) {
514 progress = 0; 514 progress = 0;
515 if (need_resched() || 515 if (need_resched() ||
516 need_lockbreak(src_ptl) || 516 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
517 need_lockbreak(dst_ptl))
518 break; 517 break;
519 } 518 }
520 if (pte_none(*src_pte)) { 519 if (pte_none(*src_pte)) {
521 progress++; 520 progress++;
522 continue; 521 continue;
523 } 522 }
524 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); 523 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
525 progress += 8; 524 progress += 8;
526 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 525 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
527 526
528 arch_leave_lazy_mmu_mode(); 527 arch_leave_lazy_mmu_mode();
529 spin_unlock(src_ptl); 528 spin_unlock(src_ptl);
530 pte_unmap_nested(src_pte - 1); 529 pte_unmap_nested(src_pte - 1);
531 add_mm_rss(dst_mm, rss[0], rss[1]); 530 add_mm_rss(dst_mm, rss[0], rss[1]);
532 pte_unmap_unlock(dst_pte - 1, dst_ptl); 531 pte_unmap_unlock(dst_pte - 1, dst_ptl);
533 cond_resched(); 532 cond_resched();
534 if (addr != end) 533 if (addr != end)
535 goto again; 534 goto again;
536 return 0; 535 return 0;
537 } 536 }
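The latency check above uses spin_needbreak() on both page-table locks. One plausible shape for such a helper, shown purely as a sketch (the real definition lives in the locking/scheduler headers, not in this file, and may differ in detail):

#include <linux/spinlock.h>

/* Sketch only: report contention from the lock implementation when kernel
 * preemption is configured, and never ask to break the lock otherwise. */
static inline int spin_needbreak_sketch(spinlock_t *lock)
{
#ifdef CONFIG_PREEMPT
	return spin_is_contended(lock);
#else
	return 0;
#endif
}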
538 537
539 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 538 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
540 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, 539 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
541 unsigned long addr, unsigned long end) 540 unsigned long addr, unsigned long end)
542 { 541 {
543 pmd_t *src_pmd, *dst_pmd; 542 pmd_t *src_pmd, *dst_pmd;
544 unsigned long next; 543 unsigned long next;
545 544
546 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); 545 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
547 if (!dst_pmd) 546 if (!dst_pmd)
548 return -ENOMEM; 547 return -ENOMEM;
549 src_pmd = pmd_offset(src_pud, addr); 548 src_pmd = pmd_offset(src_pud, addr);
550 do { 549 do {
551 next = pmd_addr_end(addr, end); 550 next = pmd_addr_end(addr, end);
552 if (pmd_none_or_clear_bad(src_pmd)) 551 if (pmd_none_or_clear_bad(src_pmd))
553 continue; 552 continue;
554 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 553 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
555 vma, addr, next)) 554 vma, addr, next))
556 return -ENOMEM; 555 return -ENOMEM;
557 } while (dst_pmd++, src_pmd++, addr = next, addr != end); 556 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
558 return 0; 557 return 0;
559 } 558 }
560 559
561 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 560 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
562 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, 561 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
563 unsigned long addr, unsigned long end) 562 unsigned long addr, unsigned long end)
564 { 563 {
565 pud_t *src_pud, *dst_pud; 564 pud_t *src_pud, *dst_pud;
566 unsigned long next; 565 unsigned long next;
567 566
568 dst_pud = pud_alloc(dst_mm, dst_pgd, addr); 567 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
569 if (!dst_pud) 568 if (!dst_pud)
570 return -ENOMEM; 569 return -ENOMEM;
571 src_pud = pud_offset(src_pgd, addr); 570 src_pud = pud_offset(src_pgd, addr);
572 do { 571 do {
573 next = pud_addr_end(addr, end); 572 next = pud_addr_end(addr, end);
574 if (pud_none_or_clear_bad(src_pud)) 573 if (pud_none_or_clear_bad(src_pud))
575 continue; 574 continue;
576 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, 575 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
577 vma, addr, next)) 576 vma, addr, next))
578 return -ENOMEM; 577 return -ENOMEM;
579 } while (dst_pud++, src_pud++, addr = next, addr != end); 578 } while (dst_pud++, src_pud++, addr = next, addr != end);
580 return 0; 579 return 0;
581 } 580 }
582 581
583 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 582 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
584 struct vm_area_struct *vma) 583 struct vm_area_struct *vma)
585 { 584 {
586 pgd_t *src_pgd, *dst_pgd; 585 pgd_t *src_pgd, *dst_pgd;
587 unsigned long next; 586 unsigned long next;
588 unsigned long addr = vma->vm_start; 587 unsigned long addr = vma->vm_start;
589 unsigned long end = vma->vm_end; 588 unsigned long end = vma->vm_end;
590 589
591 /* 590 /*
592 * Don't copy ptes where a page fault will fill them correctly. 591 * Don't copy ptes where a page fault will fill them correctly.
593 * Fork becomes much lighter when there are big shared or private 592 * Fork becomes much lighter when there are big shared or private
594 * readonly mappings. The tradeoff is that copy_page_range is more 593 * readonly mappings. The tradeoff is that copy_page_range is more
595 * efficient than faulting. 594 * efficient than faulting.
596 */ 595 */
597 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { 596 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
598 if (!vma->anon_vma) 597 if (!vma->anon_vma)
599 return 0; 598 return 0;
600 } 599 }
601 600
602 if (is_vm_hugetlb_page(vma)) 601 if (is_vm_hugetlb_page(vma))
603 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 602 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
604 603
605 dst_pgd = pgd_offset(dst_mm, addr); 604 dst_pgd = pgd_offset(dst_mm, addr);
606 src_pgd = pgd_offset(src_mm, addr); 605 src_pgd = pgd_offset(src_mm, addr);
607 do { 606 do {
608 next = pgd_addr_end(addr, end); 607 next = pgd_addr_end(addr, end);
609 if (pgd_none_or_clear_bad(src_pgd)) 608 if (pgd_none_or_clear_bad(src_pgd))
610 continue; 609 continue;
611 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 610 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
612 vma, addr, next)) 611 vma, addr, next))
613 return -ENOMEM; 612 return -ENOMEM;
614 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 613 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
615 return 0; 614 return 0;
616 } 615 }
617 616
618 static unsigned long zap_pte_range(struct mmu_gather *tlb, 617 static unsigned long zap_pte_range(struct mmu_gather *tlb,
619 struct vm_area_struct *vma, pmd_t *pmd, 618 struct vm_area_struct *vma, pmd_t *pmd,
620 unsigned long addr, unsigned long end, 619 unsigned long addr, unsigned long end,
621 long *zap_work, struct zap_details *details) 620 long *zap_work, struct zap_details *details)
622 { 621 {
623 struct mm_struct *mm = tlb->mm; 622 struct mm_struct *mm = tlb->mm;
624 pte_t *pte; 623 pte_t *pte;
625 spinlock_t *ptl; 624 spinlock_t *ptl;
626 int file_rss = 0; 625 int file_rss = 0;
627 int anon_rss = 0; 626 int anon_rss = 0;
628 627
629 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 628 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
630 arch_enter_lazy_mmu_mode(); 629 arch_enter_lazy_mmu_mode();
631 do { 630 do {
632 pte_t ptent = *pte; 631 pte_t ptent = *pte;
633 if (pte_none(ptent)) { 632 if (pte_none(ptent)) {
634 (*zap_work)--; 633 (*zap_work)--;
635 continue; 634 continue;
636 } 635 }
637 636
638 (*zap_work) -= PAGE_SIZE; 637 (*zap_work) -= PAGE_SIZE;
639 638
640 if (pte_present(ptent)) { 639 if (pte_present(ptent)) {
641 struct page *page; 640 struct page *page;
642 641
643 page = vm_normal_page(vma, addr, ptent); 642 page = vm_normal_page(vma, addr, ptent);
644 if (unlikely(details) && page) { 643 if (unlikely(details) && page) {
645 /* 644 /*
646 * unmap_shared_mapping_pages() wants to 645 * unmap_shared_mapping_pages() wants to
647 * invalidate cache without truncating: 646 * invalidate cache without truncating:
648 * unmap shared but keep private pages. 647 * unmap shared but keep private pages.
649 */ 648 */
650 if (details->check_mapping && 649 if (details->check_mapping &&
651 details->check_mapping != page->mapping) 650 details->check_mapping != page->mapping)
652 continue; 651 continue;
653 /* 652 /*
654 * Each page->index must be checked when 653 * Each page->index must be checked when
655 * invalidating or truncating nonlinear. 654 * invalidating or truncating nonlinear.
656 */ 655 */
657 if (details->nonlinear_vma && 656 if (details->nonlinear_vma &&
658 (page->index < details->first_index || 657 (page->index < details->first_index ||
659 page->index > details->last_index)) 658 page->index > details->last_index))
660 continue; 659 continue;
661 } 660 }
662 ptent = ptep_get_and_clear_full(mm, addr, pte, 661 ptent = ptep_get_and_clear_full(mm, addr, pte,
663 tlb->fullmm); 662 tlb->fullmm);
664 tlb_remove_tlb_entry(tlb, pte, addr); 663 tlb_remove_tlb_entry(tlb, pte, addr);
665 if (unlikely(!page)) 664 if (unlikely(!page))
666 continue; 665 continue;
667 if (unlikely(details) && details->nonlinear_vma 666 if (unlikely(details) && details->nonlinear_vma
668 && linear_page_index(details->nonlinear_vma, 667 && linear_page_index(details->nonlinear_vma,
669 addr) != page->index) 668 addr) != page->index)
670 set_pte_at(mm, addr, pte, 669 set_pte_at(mm, addr, pte,
671 pgoff_to_pte(page->index)); 670 pgoff_to_pte(page->index));
672 if (PageAnon(page)) 671 if (PageAnon(page))
673 anon_rss--; 672 anon_rss--;
674 else { 673 else {
675 if (pte_dirty(ptent)) 674 if (pte_dirty(ptent))
676 set_page_dirty(page); 675 set_page_dirty(page);
677 if (pte_young(ptent)) 676 if (pte_young(ptent))
678 SetPageReferenced(page); 677 SetPageReferenced(page);
679 file_rss--; 678 file_rss--;
680 } 679 }
681 page_remove_rmap(page, vma); 680 page_remove_rmap(page, vma);
682 tlb_remove_page(tlb, page); 681 tlb_remove_page(tlb, page);
683 continue; 682 continue;
684 } 683 }
685 /* 684 /*
686 * If details->check_mapping, we leave swap entries; 685 * If details->check_mapping, we leave swap entries;
687 * if details->nonlinear_vma, we leave file entries. 686 * if details->nonlinear_vma, we leave file entries.
688 */ 687 */
689 if (unlikely(details)) 688 if (unlikely(details))
690 continue; 689 continue;
691 if (!pte_file(ptent)) 690 if (!pte_file(ptent))
692 free_swap_and_cache(pte_to_swp_entry(ptent)); 691 free_swap_and_cache(pte_to_swp_entry(ptent));
693 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 692 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
694 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 693 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
695 694
696 add_mm_rss(mm, file_rss, anon_rss); 695 add_mm_rss(mm, file_rss, anon_rss);
697 arch_leave_lazy_mmu_mode(); 696 arch_leave_lazy_mmu_mode();
698 pte_unmap_unlock(pte - 1, ptl); 697 pte_unmap_unlock(pte - 1, ptl);
699 698
700 return addr; 699 return addr;
701 } 700 }
702 701
703 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 702 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
704 struct vm_area_struct *vma, pud_t *pud, 703 struct vm_area_struct *vma, pud_t *pud,
705 unsigned long addr, unsigned long end, 704 unsigned long addr, unsigned long end,
706 long *zap_work, struct zap_details *details) 705 long *zap_work, struct zap_details *details)
707 { 706 {
708 pmd_t *pmd; 707 pmd_t *pmd;
709 unsigned long next; 708 unsigned long next;
710 709
711 pmd = pmd_offset(pud, addr); 710 pmd = pmd_offset(pud, addr);
712 do { 711 do {
713 next = pmd_addr_end(addr, end); 712 next = pmd_addr_end(addr, end);
714 if (pmd_none_or_clear_bad(pmd)) { 713 if (pmd_none_or_clear_bad(pmd)) {
715 (*zap_work)--; 714 (*zap_work)--;
716 continue; 715 continue;
717 } 716 }
718 next = zap_pte_range(tlb, vma, pmd, addr, next, 717 next = zap_pte_range(tlb, vma, pmd, addr, next,
719 zap_work, details); 718 zap_work, details);
720 } while (pmd++, addr = next, (addr != end && *zap_work > 0)); 719 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
721 720
722 return addr; 721 return addr;
723 } 722 }
724 723
725 static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 724 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
726 struct vm_area_struct *vma, pgd_t *pgd, 725 struct vm_area_struct *vma, pgd_t *pgd,
727 unsigned long addr, unsigned long end, 726 unsigned long addr, unsigned long end,
728 long *zap_work, struct zap_details *details) 727 long *zap_work, struct zap_details *details)
729 { 728 {
730 pud_t *pud; 729 pud_t *pud;
731 unsigned long next; 730 unsigned long next;
732 731
733 pud = pud_offset(pgd, addr); 732 pud = pud_offset(pgd, addr);
734 do { 733 do {
735 next = pud_addr_end(addr, end); 734 next = pud_addr_end(addr, end);
736 if (pud_none_or_clear_bad(pud)) { 735 if (pud_none_or_clear_bad(pud)) {
737 (*zap_work)--; 736 (*zap_work)--;
738 continue; 737 continue;
739 } 738 }
740 next = zap_pmd_range(tlb, vma, pud, addr, next, 739 next = zap_pmd_range(tlb, vma, pud, addr, next,
741 zap_work, details); 740 zap_work, details);
742 } while (pud++, addr = next, (addr != end && *zap_work > 0)); 741 } while (pud++, addr = next, (addr != end && *zap_work > 0));
743 742
744 return addr; 743 return addr;
745 } 744 }
746 745
747 static unsigned long unmap_page_range(struct mmu_gather *tlb, 746 static unsigned long unmap_page_range(struct mmu_gather *tlb,
748 struct vm_area_struct *vma, 747 struct vm_area_struct *vma,
749 unsigned long addr, unsigned long end, 748 unsigned long addr, unsigned long end,
750 long *zap_work, struct zap_details *details) 749 long *zap_work, struct zap_details *details)
751 { 750 {
752 pgd_t *pgd; 751 pgd_t *pgd;
753 unsigned long next; 752 unsigned long next;
754 753
755 if (details && !details->check_mapping && !details->nonlinear_vma) 754 if (details && !details->check_mapping && !details->nonlinear_vma)
756 details = NULL; 755 details = NULL;
757 756
758 BUG_ON(addr >= end); 757 BUG_ON(addr >= end);
759 tlb_start_vma(tlb, vma); 758 tlb_start_vma(tlb, vma);
760 pgd = pgd_offset(vma->vm_mm, addr); 759 pgd = pgd_offset(vma->vm_mm, addr);
761 do { 760 do {
762 next = pgd_addr_end(addr, end); 761 next = pgd_addr_end(addr, end);
763 if (pgd_none_or_clear_bad(pgd)) { 762 if (pgd_none_or_clear_bad(pgd)) {
764 (*zap_work)--; 763 (*zap_work)--;
765 continue; 764 continue;
766 } 765 }
767 next = zap_pud_range(tlb, vma, pgd, addr, next, 766 next = zap_pud_range(tlb, vma, pgd, addr, next,
768 zap_work, details); 767 zap_work, details);
769 } while (pgd++, addr = next, (addr != end && *zap_work > 0)); 768 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
770 tlb_end_vma(tlb, vma); 769 tlb_end_vma(tlb, vma);
771 770
772 return addr; 771 return addr;
773 } 772 }
774 773
775 #ifdef CONFIG_PREEMPT 774 #ifdef CONFIG_PREEMPT
776 # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) 775 # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
777 #else 776 #else
778 /* No preempt: go for improved straight-line efficiency */ 777 /* No preempt: go for improved straight-line efficiency */
779 # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) 778 # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
780 #endif 779 #endif
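For scale, assuming 4K pages these evaluate to 32KB of zap work per batch with CONFIG_PREEMPT and 4MB per batch without it.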
781 780
782 /** 781 /**
783 * unmap_vmas - unmap a range of memory covered by a list of vma's 782 * unmap_vmas - unmap a range of memory covered by a list of vma's
784 * @tlbp: address of the caller's struct mmu_gather 783 * @tlbp: address of the caller's struct mmu_gather
785 * @vma: the starting vma 784 * @vma: the starting vma
786 * @start_addr: virtual address at which to start unmapping 785 * @start_addr: virtual address at which to start unmapping
787 * @end_addr: virtual address at which to end unmapping 786 * @end_addr: virtual address at which to end unmapping
788 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here 787 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
789 * @details: details of nonlinear truncation or shared cache invalidation 788 * @details: details of nonlinear truncation or shared cache invalidation
790 * 789 *
791 * Returns the end address of the unmapping (restart addr if interrupted). 790 * Returns the end address of the unmapping (restart addr if interrupted).
792 * 791 *
793 * Unmap all pages in the vma list. 792 * Unmap all pages in the vma list.
794 * 793 *
795 * We aim to not hold locks for too long (for scheduling latency reasons). 794 * We aim to not hold locks for too long (for scheduling latency reasons).
796 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to 795 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
797 * return the ending mmu_gather to the caller. 796 * return the ending mmu_gather to the caller.
798 * 797 *
799 * Only addresses between `start' and `end' will be unmapped. 798 * Only addresses between `start' and `end' will be unmapped.
800 * 799 *
801 * The VMA list must be sorted in ascending virtual address order. 800 * The VMA list must be sorted in ascending virtual address order.
802 * 801 *
803 * unmap_vmas() assumes that the caller will flush the whole unmapped address 802 * unmap_vmas() assumes that the caller will flush the whole unmapped address
804 * range after unmap_vmas() returns. So the only responsibility here is to 803 * range after unmap_vmas() returns. So the only responsibility here is to
805 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 804 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
806 * drops the lock and schedules. 805 * drops the lock and schedules.
807 */ 806 */
808 unsigned long unmap_vmas(struct mmu_gather **tlbp, 807 unsigned long unmap_vmas(struct mmu_gather **tlbp,
809 struct vm_area_struct *vma, unsigned long start_addr, 808 struct vm_area_struct *vma, unsigned long start_addr,
810 unsigned long end_addr, unsigned long *nr_accounted, 809 unsigned long end_addr, unsigned long *nr_accounted,
811 struct zap_details *details) 810 struct zap_details *details)
812 { 811 {
813 long zap_work = ZAP_BLOCK_SIZE; 812 long zap_work = ZAP_BLOCK_SIZE;
814 unsigned long tlb_start = 0; /* For tlb_finish_mmu */ 813 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
815 int tlb_start_valid = 0; 814 int tlb_start_valid = 0;
816 unsigned long start = start_addr; 815 unsigned long start = start_addr;
817 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 816 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
818 int fullmm = (*tlbp)->fullmm; 817 int fullmm = (*tlbp)->fullmm;
819 818
820 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 819 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
821 unsigned long end; 820 unsigned long end;
822 821
823 start = max(vma->vm_start, start_addr); 822 start = max(vma->vm_start, start_addr);
824 if (start >= vma->vm_end) 823 if (start >= vma->vm_end)
825 continue; 824 continue;
826 end = min(vma->vm_end, end_addr); 825 end = min(vma->vm_end, end_addr);
827 if (end <= vma->vm_start) 826 if (end <= vma->vm_start)
828 continue; 827 continue;
829 828
830 if (vma->vm_flags & VM_ACCOUNT) 829 if (vma->vm_flags & VM_ACCOUNT)
831 *nr_accounted += (end - start) >> PAGE_SHIFT; 830 *nr_accounted += (end - start) >> PAGE_SHIFT;
832 831
833 while (start != end) { 832 while (start != end) {
834 if (!tlb_start_valid) { 833 if (!tlb_start_valid) {
835 tlb_start = start; 834 tlb_start = start;
836 tlb_start_valid = 1; 835 tlb_start_valid = 1;
837 } 836 }
838 837
839 if (unlikely(is_vm_hugetlb_page(vma))) { 838 if (unlikely(is_vm_hugetlb_page(vma))) {
840 unmap_hugepage_range(vma, start, end); 839 unmap_hugepage_range(vma, start, end);
841 zap_work -= (end - start) / 840 zap_work -= (end - start) /
842 (HPAGE_SIZE / PAGE_SIZE); 841 (HPAGE_SIZE / PAGE_SIZE);
843 start = end; 842 start = end;
844 } else 843 } else
845 start = unmap_page_range(*tlbp, vma, 844 start = unmap_page_range(*tlbp, vma,
846 start, end, &zap_work, details); 845 start, end, &zap_work, details);
847 846
848 if (zap_work > 0) { 847 if (zap_work > 0) {
849 BUG_ON(start != end); 848 BUG_ON(start != end);
850 break; 849 break;
851 } 850 }
852 851
853 tlb_finish_mmu(*tlbp, tlb_start, start); 852 tlb_finish_mmu(*tlbp, tlb_start, start);
854 853
855 if (need_resched() || 854 if (need_resched() ||
856 (i_mmap_lock && need_lockbreak(i_mmap_lock))) { 855 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
857 if (i_mmap_lock) { 856 if (i_mmap_lock) {
858 *tlbp = NULL; 857 *tlbp = NULL;
859 goto out; 858 goto out;
860 } 859 }
861 cond_resched(); 860 cond_resched();
862 } 861 }
863 862
864 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); 863 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
865 tlb_start_valid = 0; 864 tlb_start_valid = 0;
866 zap_work = ZAP_BLOCK_SIZE; 865 zap_work = ZAP_BLOCK_SIZE;
867 } 866 }
868 } 867 }
869 out: 868 out:
870 return start; /* which is now the end (or restart) address */ 869 return start; /* which is now the end (or restart) address */
871 } 870 }
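unmap_vmas() above spreads its work over ZAP_BLOCK_SIZE-sized batches, finishing the mmu_gather and rescheduling between batches. A hypothetical user-space sketch of that budget-and-yield pattern (the names and the 32KB budget are invented for the example):

#include <sched.h>

#define EX_BUDGET (8 * 4096L)		/* small batch, like the preemptible case */
#define EX_PAGE   4096L

static void zap_one_page(char *p)
{
	(void)p;			/* stands in for tearing down one page */
}

static void zap_range(char *start, char *end)
{
	long budget = EX_BUDGET;

	while (start < end) {
		zap_one_page(start);
		start += EX_PAGE;
		budget -= EX_PAGE;
		if (budget <= 0 && start < end) {
			sched_yield();	/* give waiters a chance, then refill */
			budget = EX_BUDGET;
		}
	}
}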
872 871
873 /** 872 /**
874 * zap_page_range - remove user pages in a given range 873 * zap_page_range - remove user pages in a given range
875 * @vma: vm_area_struct holding the applicable pages 874 * @vma: vm_area_struct holding the applicable pages
876 * @address: starting address of pages to zap 875 * @address: starting address of pages to zap
877 * @size: number of bytes to zap 876 * @size: number of bytes to zap
878 * @details: details of nonlinear truncation or shared cache invalidation 877 * @details: details of nonlinear truncation or shared cache invalidation
879 */ 878 */
880 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, 879 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
881 unsigned long size, struct zap_details *details) 880 unsigned long size, struct zap_details *details)
882 { 881 {
883 struct mm_struct *mm = vma->vm_mm; 882 struct mm_struct *mm = vma->vm_mm;
884 struct mmu_gather *tlb; 883 struct mmu_gather *tlb;
885 unsigned long end = address + size; 884 unsigned long end = address + size;
886 unsigned long nr_accounted = 0; 885 unsigned long nr_accounted = 0;
887 886
888 lru_add_drain(); 887 lru_add_drain();
889 tlb = tlb_gather_mmu(mm, 0); 888 tlb = tlb_gather_mmu(mm, 0);
890 update_hiwater_rss(mm); 889 update_hiwater_rss(mm);
891 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 890 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
892 if (tlb) 891 if (tlb)
893 tlb_finish_mmu(tlb, address, end); 892 tlb_finish_mmu(tlb, address, end);
894 return end; 893 return end;
895 } 894 }
896 895
897 /* 896 /*
898 * Do a quick page-table lookup for a single page. 897 * Do a quick page-table lookup for a single page.
899 */ 898 */
900 struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 899 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
901 unsigned int flags) 900 unsigned int flags)
902 { 901 {
903 pgd_t *pgd; 902 pgd_t *pgd;
904 pud_t *pud; 903 pud_t *pud;
905 pmd_t *pmd; 904 pmd_t *pmd;
906 pte_t *ptep, pte; 905 pte_t *ptep, pte;
907 spinlock_t *ptl; 906 spinlock_t *ptl;
908 struct page *page; 907 struct page *page;
909 struct mm_struct *mm = vma->vm_mm; 908 struct mm_struct *mm = vma->vm_mm;
910 909
911 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 910 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
912 if (!IS_ERR(page)) { 911 if (!IS_ERR(page)) {
913 BUG_ON(flags & FOLL_GET); 912 BUG_ON(flags & FOLL_GET);
914 goto out; 913 goto out;
915 } 914 }
916 915
917 page = NULL; 916 page = NULL;
918 pgd = pgd_offset(mm, address); 917 pgd = pgd_offset(mm, address);
919 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 918 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
920 goto no_page_table; 919 goto no_page_table;
921 920
922 pud = pud_offset(pgd, address); 921 pud = pud_offset(pgd, address);
923 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 922 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
924 goto no_page_table; 923 goto no_page_table;
925 924
926 pmd = pmd_offset(pud, address); 925 pmd = pmd_offset(pud, address);
927 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 926 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
928 goto no_page_table; 927 goto no_page_table;
929 928
930 if (pmd_huge(*pmd)) { 929 if (pmd_huge(*pmd)) {
931 BUG_ON(flags & FOLL_GET); 930 BUG_ON(flags & FOLL_GET);
932 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 931 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
933 goto out; 932 goto out;
934 } 933 }
935 934
936 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 935 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
937 if (!ptep) 936 if (!ptep)
938 goto out; 937 goto out;
939 938
940 pte = *ptep; 939 pte = *ptep;
941 if (!pte_present(pte)) 940 if (!pte_present(pte))
942 goto unlock; 941 goto unlock;
943 if ((flags & FOLL_WRITE) && !pte_write(pte)) 942 if ((flags & FOLL_WRITE) && !pte_write(pte))
944 goto unlock; 943 goto unlock;
945 page = vm_normal_page(vma, address, pte); 944 page = vm_normal_page(vma, address, pte);
946 if (unlikely(!page)) 945 if (unlikely(!page))
947 goto unlock; 946 goto unlock;
948 947
949 if (flags & FOLL_GET) 948 if (flags & FOLL_GET)
950 get_page(page); 949 get_page(page);
951 if (flags & FOLL_TOUCH) { 950 if (flags & FOLL_TOUCH) {
952 if ((flags & FOLL_WRITE) && 951 if ((flags & FOLL_WRITE) &&
953 !pte_dirty(pte) && !PageDirty(page)) 952 !pte_dirty(pte) && !PageDirty(page))
954 set_page_dirty(page); 953 set_page_dirty(page);
955 mark_page_accessed(page); 954 mark_page_accessed(page);
956 } 955 }
957 unlock: 956 unlock:
958 pte_unmap_unlock(ptep, ptl); 957 pte_unmap_unlock(ptep, ptl);
959 out: 958 out:
960 return page; 959 return page;
961 960
962 no_page_table: 961 no_page_table:
963 /* 962 /*
964 * When core dumping an enormous anonymous area that nobody 963 * When core dumping an enormous anonymous area that nobody
965 * has touched so far, we don't want to allocate page tables. 964 * has touched so far, we don't want to allocate page tables.
966 */ 965 */
967 if (flags & FOLL_ANON) { 966 if (flags & FOLL_ANON) {
968 page = ZERO_PAGE(0); 967 page = ZERO_PAGE(0);
969 if (flags & FOLL_GET) 968 if (flags & FOLL_GET)
970 get_page(page); 969 get_page(page);
971 BUG_ON(flags & FOLL_WRITE); 970 BUG_ON(flags & FOLL_WRITE);
972 } 971 }
973 return page; 972 return page;
974 } 973 }
975 974
976 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 975 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
977 unsigned long start, int len, int write, int force, 976 unsigned long start, int len, int write, int force,
978 struct page **pages, struct vm_area_struct **vmas) 977 struct page **pages, struct vm_area_struct **vmas)
979 { 978 {
980 int i; 979 int i;
981 unsigned int vm_flags; 980 unsigned int vm_flags;
982 981
983 /* 982 /*
984 * Require read or write permissions. 983 * Require read or write permissions.
985 * If 'force' is set, we only require the "MAY" flags. 984 * If 'force' is set, we only require the "MAY" flags.
986 */ 985 */
987 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 986 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
988 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 987 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
989 i = 0; 988 i = 0;
990 989
991 do { 990 do {
992 struct vm_area_struct *vma; 991 struct vm_area_struct *vma;
993 unsigned int foll_flags; 992 unsigned int foll_flags;
994 993
995 vma = find_extend_vma(mm, start); 994 vma = find_extend_vma(mm, start);
996 if (!vma && in_gate_area(tsk, start)) { 995 if (!vma && in_gate_area(tsk, start)) {
997 unsigned long pg = start & PAGE_MASK; 996 unsigned long pg = start & PAGE_MASK;
998 struct vm_area_struct *gate_vma = get_gate_vma(tsk); 997 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
999 pgd_t *pgd; 998 pgd_t *pgd;
1000 pud_t *pud; 999 pud_t *pud;
1001 pmd_t *pmd; 1000 pmd_t *pmd;
1002 pte_t *pte; 1001 pte_t *pte;
1003 if (write) /* user gate pages are read-only */ 1002 if (write) /* user gate pages are read-only */
1004 return i ? : -EFAULT; 1003 return i ? : -EFAULT;
1005 if (pg > TASK_SIZE) 1004 if (pg > TASK_SIZE)
1006 pgd = pgd_offset_k(pg); 1005 pgd = pgd_offset_k(pg);
1007 else 1006 else
1008 pgd = pgd_offset_gate(mm, pg); 1007 pgd = pgd_offset_gate(mm, pg);
1009 BUG_ON(pgd_none(*pgd)); 1008 BUG_ON(pgd_none(*pgd));
1010 pud = pud_offset(pgd, pg); 1009 pud = pud_offset(pgd, pg);
1011 BUG_ON(pud_none(*pud)); 1010 BUG_ON(pud_none(*pud));
1012 pmd = pmd_offset(pud, pg); 1011 pmd = pmd_offset(pud, pg);
1013 if (pmd_none(*pmd)) 1012 if (pmd_none(*pmd))
1014 return i ? : -EFAULT; 1013 return i ? : -EFAULT;
1015 pte = pte_offset_map(pmd, pg); 1014 pte = pte_offset_map(pmd, pg);
1016 if (pte_none(*pte)) { 1015 if (pte_none(*pte)) {
1017 pte_unmap(pte); 1016 pte_unmap(pte);
1018 return i ? : -EFAULT; 1017 return i ? : -EFAULT;
1019 } 1018 }
1020 if (pages) { 1019 if (pages) {
1021 struct page *page = vm_normal_page(gate_vma, start, *pte); 1020 struct page *page = vm_normal_page(gate_vma, start, *pte);
1022 pages[i] = page; 1021 pages[i] = page;
1023 if (page) 1022 if (page)
1024 get_page(page); 1023 get_page(page);
1025 } 1024 }
1026 pte_unmap(pte); 1025 pte_unmap(pte);
1027 if (vmas) 1026 if (vmas)
1028 vmas[i] = gate_vma; 1027 vmas[i] = gate_vma;
1029 i++; 1028 i++;
1030 start += PAGE_SIZE; 1029 start += PAGE_SIZE;
1031 len--; 1030 len--;
1032 continue; 1031 continue;
1033 } 1032 }
1034 1033
1035 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1034 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
1036 || !(vm_flags & vma->vm_flags)) 1035 || !(vm_flags & vma->vm_flags))
1037 return i ? : -EFAULT; 1036 return i ? : -EFAULT;
1038 1037
1039 if (is_vm_hugetlb_page(vma)) { 1038 if (is_vm_hugetlb_page(vma)) {
1040 i = follow_hugetlb_page(mm, vma, pages, vmas, 1039 i = follow_hugetlb_page(mm, vma, pages, vmas,
1041 &start, &len, i, write); 1040 &start, &len, i, write);
1042 continue; 1041 continue;
1043 } 1042 }
1044 1043
1045 foll_flags = FOLL_TOUCH; 1044 foll_flags = FOLL_TOUCH;
1046 if (pages) 1045 if (pages)
1047 foll_flags |= FOLL_GET; 1046 foll_flags |= FOLL_GET;
1048 if (!write && !(vma->vm_flags & VM_LOCKED) && 1047 if (!write && !(vma->vm_flags & VM_LOCKED) &&
1049 (!vma->vm_ops || (!vma->vm_ops->nopage && 1048 (!vma->vm_ops || (!vma->vm_ops->nopage &&
1050 !vma->vm_ops->fault))) 1049 !vma->vm_ops->fault)))
1051 foll_flags |= FOLL_ANON; 1050 foll_flags |= FOLL_ANON;
1052 1051
1053 do { 1052 do {
1054 struct page *page; 1053 struct page *page;
1055 1054
1056 /* 1055 /*
1057 * If tsk is ooming, cut off its access to large memory 1056 * If tsk is ooming, cut off its access to large memory
1058 * allocations. It has a pending SIGKILL, but it can't 1057 * allocations. It has a pending SIGKILL, but it can't
1059 * be processed until returning to user space. 1058 * be processed until returning to user space.
1060 */ 1059 */
1061 if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) 1060 if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
1062 return -ENOMEM; 1061 return -ENOMEM;
1063 1062
1064 if (write) 1063 if (write)
1065 foll_flags |= FOLL_WRITE; 1064 foll_flags |= FOLL_WRITE;
1066 1065
1067 cond_resched(); 1066 cond_resched();
1068 while (!(page = follow_page(vma, start, foll_flags))) { 1067 while (!(page = follow_page(vma, start, foll_flags))) {
1069 int ret; 1068 int ret;
1070 ret = handle_mm_fault(mm, vma, start, 1069 ret = handle_mm_fault(mm, vma, start,
1071 foll_flags & FOLL_WRITE); 1070 foll_flags & FOLL_WRITE);
1072 if (ret & VM_FAULT_ERROR) { 1071 if (ret & VM_FAULT_ERROR) {
1073 if (ret & VM_FAULT_OOM) 1072 if (ret & VM_FAULT_OOM)
1074 return i ? i : -ENOMEM; 1073 return i ? i : -ENOMEM;
1075 else if (ret & VM_FAULT_SIGBUS) 1074 else if (ret & VM_FAULT_SIGBUS)
1076 return i ? i : -EFAULT; 1075 return i ? i : -EFAULT;
1077 BUG(); 1076 BUG();
1078 } 1077 }
1079 if (ret & VM_FAULT_MAJOR) 1078 if (ret & VM_FAULT_MAJOR)
1080 tsk->maj_flt++; 1079 tsk->maj_flt++;
1081 else 1080 else
1082 tsk->min_flt++; 1081 tsk->min_flt++;
1083 1082
1084 /* 1083 /*
1085 * The VM_FAULT_WRITE bit tells us that 1084 * The VM_FAULT_WRITE bit tells us that
1086 * do_wp_page has broken COW when necessary, 1085 * do_wp_page has broken COW when necessary,
1087 * even if maybe_mkwrite decided not to set 1086 * even if maybe_mkwrite decided not to set
1088 * pte_write. We can thus safely do subsequent 1087 * pte_write. We can thus safely do subsequent
1089 * page lookups as if they were reads. 1088 * page lookups as if they were reads.
1090 */ 1089 */
1091 if (ret & VM_FAULT_WRITE) 1090 if (ret & VM_FAULT_WRITE)
1092 foll_flags &= ~FOLL_WRITE; 1091 foll_flags &= ~FOLL_WRITE;
1093 1092
1094 cond_resched(); 1093 cond_resched();
1095 } 1094 }
1096 if (pages) { 1095 if (pages) {
1097 pages[i] = page; 1096 pages[i] = page;
1098 1097
1099 flush_anon_page(vma, page, start); 1098 flush_anon_page(vma, page, start);
1100 flush_dcache_page(page); 1099 flush_dcache_page(page);
1101 } 1100 }
1102 if (vmas) 1101 if (vmas)
1103 vmas[i] = vma; 1102 vmas[i] = vma;
1104 i++; 1103 i++;
1105 start += PAGE_SIZE; 1104 start += PAGE_SIZE;
1106 len--; 1105 len--;
1107 } while (len && start < vma->vm_end); 1106 } while (len && start < vma->vm_end);
1108 } while (len); 1107 } while (len);
1109 return i; 1108 return i;
1110 } 1109 }
1111 EXPORT_SYMBOL(get_user_pages); 1110 EXPORT_SYMBOL(get_user_pages);
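The vm_flags mask computed at the top of get_user_pages() above collapses neatly: without force the real READ/WRITE bit is required, with force the MAY bit alone suffices. A worked example with invented flag values (the real VM_* constants differ):

#include <assert.h>

#define EX_VM_READ	0x1u
#define EX_VM_WRITE	0x2u
#define EX_VM_MAYREAD	0x10u
#define EX_VM_MAYWRITE	0x20u

static unsigned int required_flags(int write, int force)
{
	unsigned int f = write ? (EX_VM_WRITE | EX_VM_MAYWRITE)
			       : (EX_VM_READ | EX_VM_MAYREAD);
	f &= force ? (EX_VM_MAYREAD | EX_VM_MAYWRITE)
		   : (EX_VM_READ | EX_VM_WRITE);
	return f;
}

int main(void)
{
	assert(required_flags(1, 0) == EX_VM_WRITE);	/* plain write: need VM_WRITE */
	assert(required_flags(1, 1) == EX_VM_MAYWRITE);	/* forced write: VM_MAYWRITE is enough */
	assert(required_flags(0, 0) == EX_VM_READ);
	assert(required_flags(0, 1) == EX_VM_MAYREAD);
	return 0;
}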
1112 1111
1113 pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) 1112 pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
1114 { 1113 {
1115 pgd_t * pgd = pgd_offset(mm, addr); 1114 pgd_t * pgd = pgd_offset(mm, addr);
1116 pud_t * pud = pud_alloc(mm, pgd, addr); 1115 pud_t * pud = pud_alloc(mm, pgd, addr);
1117 if (pud) { 1116 if (pud) {
1118 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1117 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1119 if (pmd) 1118 if (pmd)
1120 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1119 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1121 } 1120 }
1122 return NULL; 1121 return NULL;
1123 } 1122 }
1124 1123
1125 /* 1124 /*
1126 * This is the old fallback for page remapping. 1125 * This is the old fallback for page remapping.
1127 * 1126 *
1128 * For historical reasons, it only allows reserved pages. Only 1127 * For historical reasons, it only allows reserved pages. Only
1129 * old drivers should use this, and they needed to mark their 1128 * old drivers should use this, and they needed to mark their
1130 * pages reserved for the old functions anyway. 1129 * pages reserved for the old functions anyway.
1131 */ 1130 */
1132 static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) 1131 static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
1133 { 1132 {
1134 int retval; 1133 int retval;
1135 pte_t *pte; 1134 pte_t *pte;
1136 spinlock_t *ptl; 1135 spinlock_t *ptl;
1137 1136
1138 retval = -EINVAL; 1137 retval = -EINVAL;
1139 if (PageAnon(page)) 1138 if (PageAnon(page))
1140 goto out; 1139 goto out;
1141 retval = -ENOMEM; 1140 retval = -ENOMEM;
1142 flush_dcache_page(page); 1141 flush_dcache_page(page);
1143 pte = get_locked_pte(mm, addr, &ptl); 1142 pte = get_locked_pte(mm, addr, &ptl);
1144 if (!pte) 1143 if (!pte)
1145 goto out; 1144 goto out;
1146 retval = -EBUSY; 1145 retval = -EBUSY;
1147 if (!pte_none(*pte)) 1146 if (!pte_none(*pte))
1148 goto out_unlock; 1147 goto out_unlock;
1149 1148
1150 /* Ok, finally just insert the thing.. */ 1149 /* Ok, finally just insert the thing.. */
1151 get_page(page); 1150 get_page(page);
1152 inc_mm_counter(mm, file_rss); 1151 inc_mm_counter(mm, file_rss);
1153 page_add_file_rmap(page); 1152 page_add_file_rmap(page);
1154 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1153 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1155 1154
1156 retval = 0; 1155 retval = 0;
1157 out_unlock: 1156 out_unlock:
1158 pte_unmap_unlock(pte, ptl); 1157 pte_unmap_unlock(pte, ptl);
1159 out: 1158 out:
1160 return retval; 1159 return retval;
1161 } 1160 }
1162 1161
1163 /** 1162 /**
1164 * vm_insert_page - insert single page into user vma 1163 * vm_insert_page - insert single page into user vma
1165 * @vma: user vma to map to 1164 * @vma: user vma to map to
1166 * @addr: target user address of this page 1165 * @addr: target user address of this page
1167 * @page: source kernel page 1166 * @page: source kernel page
1168 * 1167 *
1169 * This allows drivers to insert individual pages they've allocated 1168 * This allows drivers to insert individual pages they've allocated
1170 * into a user vma. 1169 * into a user vma.
1171 * 1170 *
1172 * The page has to be a nice clean _individual_ kernel allocation. 1171 * The page has to be a nice clean _individual_ kernel allocation.
1173 * If you allocate a compound page, you need to have marked it as 1172 * If you allocate a compound page, you need to have marked it as
1174 * such (__GFP_COMP), or manually just split the page up yourself 1173 * such (__GFP_COMP), or manually just split the page up yourself
1175 * (see split_page()). 1174 * (see split_page()).
1176 * 1175 *
1177 * NOTE! Traditionally this was done with "remap_pfn_range()" which 1176 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1178 * took an arbitrary page protection parameter. This doesn't allow 1177 * took an arbitrary page protection parameter. This doesn't allow
1179 * that. Your vma protection will have to be set up correctly, which 1178 * that. Your vma protection will have to be set up correctly, which
1180 * means that if you want a shared writable mapping, you'd better 1179 * means that if you want a shared writable mapping, you'd better
1181 * ask for a shared writable mapping! 1180 * ask for a shared writable mapping!
1182 * 1181 *
1183 * The page does not need to be reserved. 1182 * The page does not need to be reserved.
1184 */ 1183 */
1185 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) 1184 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
1186 { 1185 {
1187 if (addr < vma->vm_start || addr >= vma->vm_end) 1186 if (addr < vma->vm_start || addr >= vma->vm_end)
1188 return -EFAULT; 1187 return -EFAULT;
1189 if (!page_count(page)) 1188 if (!page_count(page))
1190 return -EINVAL; 1189 return -EINVAL;
1191 vma->vm_flags |= VM_INSERTPAGE; 1190 vma->vm_flags |= VM_INSERTPAGE;
1192 return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); 1191 return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
1193 } 1192 }
1194 EXPORT_SYMBOL(vm_insert_page); 1193 EXPORT_SYMBOL(vm_insert_page);
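A hypothetical sketch of the driver-side usage the comment above describes: an invented driver hands a single, individually allocated page to userspace from its mmap() method (teardown and reference handling are elided for the sketch):

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static int exdrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct page *page = alloc_page(GFP_KERNEL);	/* one page, not compound */

	if (!page)
		return -ENOMEM;
	/* vm_insert_page() keeps vma->vm_page_prot as the caller set it up,
	 * so a shared writable mapping must have been requested as such. */
	return vm_insert_page(vma, vma->vm_start, page);
}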
1195 1194
1196 /** 1195 /**
1197 * vm_insert_pfn - insert single pfn into user vma 1196 * vm_insert_pfn - insert single pfn into user vma
1198 * @vma: user vma to map to 1197 * @vma: user vma to map to
1199 * @addr: target user address of this page 1198 * @addr: target user address of this page
1200 * @pfn: source kernel pfn 1199 * @pfn: source kernel pfn
1201 * 1200 *
1202 * Similar to vm_insert_page, this allows drivers to insert individual pages 1201 * Similar to vm_insert_page, this allows drivers to insert individual pages
1203 * they've allocated into a user vma. Same comments apply. 1202 * they've allocated into a user vma. Same comments apply.
1204 * 1203 *
1205 * This function should only be called from a vm_ops->fault handler, and 1204 * This function should only be called from a vm_ops->fault handler, and
1206 * in that case the handler should return NULL. 1205 * in that case the handler should return NULL.
1207 */ 1206 */
1208 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1207 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1209 unsigned long pfn) 1208 unsigned long pfn)
1210 { 1209 {
1211 struct mm_struct *mm = vma->vm_mm; 1210 struct mm_struct *mm = vma->vm_mm;
1212 int retval; 1211 int retval;
1213 pte_t *pte, entry; 1212 pte_t *pte, entry;
1214 spinlock_t *ptl; 1213 spinlock_t *ptl;
1215 1214
1216 BUG_ON(!(vma->vm_flags & VM_PFNMAP)); 1215 BUG_ON(!(vma->vm_flags & VM_PFNMAP));
1217 BUG_ON(is_cow_mapping(vma->vm_flags)); 1216 BUG_ON(is_cow_mapping(vma->vm_flags));
1218 1217
1219 retval = -ENOMEM; 1218 retval = -ENOMEM;
1220 pte = get_locked_pte(mm, addr, &ptl); 1219 pte = get_locked_pte(mm, addr, &ptl);
1221 if (!pte) 1220 if (!pte)
1222 goto out; 1221 goto out;
1223 retval = -EBUSY; 1222 retval = -EBUSY;
1224 if (!pte_none(*pte)) 1223 if (!pte_none(*pte))
1225 goto out_unlock; 1224 goto out_unlock;
1226 1225
1227 /* Ok, finally just insert the thing.. */ 1226 /* Ok, finally just insert the thing.. */
1228 entry = pfn_pte(pfn, vma->vm_page_prot); 1227 entry = pfn_pte(pfn, vma->vm_page_prot);
1229 set_pte_at(mm, addr, pte, entry); 1228 set_pte_at(mm, addr, pte, entry);
1230 update_mmu_cache(vma, addr, entry); 1229 update_mmu_cache(vma, addr, entry);
1231 1230
1232 retval = 0; 1231 retval = 0;
1233 out_unlock: 1232 out_unlock:
1234 pte_unmap_unlock(pte, ptl); 1233 pte_unmap_unlock(pte, ptl);
1235 1234
1236 out: 1235 out:
1237 return retval; 1236 return retval;
1238 } 1237 }
1239 EXPORT_SYMBOL(vm_insert_pfn); 1238 EXPORT_SYMBOL(vm_insert_pfn);
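A hypothetical fragment of the fault-time usage the comment above refers to: an invented driver translates the faulting address into a device pfn and installs it directly. exdrv_base_pfn, the naming, and the surrounding fault plumbing are all assumptions, not taken from this commit:

#include <linux/mm.h>

static unsigned long exdrv_base_pfn;	/* set at probe time in the real driver */

static int exdrv_install_pfn(struct vm_area_struct *vma, unsigned long address)
{
	unsigned long off = (address - vma->vm_start) >> PAGE_SHIFT;

	/* The vma was created with VM_PFNMAP and a non-COW protection at
	 * mmap() time, matching the BUG_ON checks in vm_insert_pfn(). */
	return vm_insert_pfn(vma, address & PAGE_MASK, exdrv_base_pfn + off);
}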
1240 1239
1241 /* 1240 /*
1242 * maps a range of physical memory into the requested pages. the old 1241 * maps a range of physical memory into the requested pages. the old
1243 * mappings are removed. any references to nonexistent pages results 1242 * mappings are removed. any references to nonexistent pages results
1244 * in null mappings (currently treated as "copy-on-access") 1243 * in null mappings (currently treated as "copy-on-access")
1245 */ 1244 */
1246 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, 1245 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1247 unsigned long addr, unsigned long end, 1246 unsigned long addr, unsigned long end,
1248 unsigned long pfn, pgprot_t prot) 1247 unsigned long pfn, pgprot_t prot)
1249 { 1248 {
1250 pte_t *pte; 1249 pte_t *pte;
1251 spinlock_t *ptl; 1250 spinlock_t *ptl;
1252 1251
1253 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 1252 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1254 if (!pte) 1253 if (!pte)
1255 return -ENOMEM; 1254 return -ENOMEM;
1256 arch_enter_lazy_mmu_mode(); 1255 arch_enter_lazy_mmu_mode();
1257 do { 1256 do {
1258 BUG_ON(!pte_none(*pte)); 1257 BUG_ON(!pte_none(*pte));
1259 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); 1258 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1260 pfn++; 1259 pfn++;
1261 } while (pte++, addr += PAGE_SIZE, addr != end); 1260 } while (pte++, addr += PAGE_SIZE, addr != end);
1262 arch_leave_lazy_mmu_mode(); 1261 arch_leave_lazy_mmu_mode();
1263 pte_unmap_unlock(pte - 1, ptl); 1262 pte_unmap_unlock(pte - 1, ptl);
1264 return 0; 1263 return 0;
1265 } 1264 }
1266 1265
1267 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, 1266 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1268 unsigned long addr, unsigned long end, 1267 unsigned long addr, unsigned long end,
1269 unsigned long pfn, pgprot_t prot) 1268 unsigned long pfn, pgprot_t prot)
1270 { 1269 {
1271 pmd_t *pmd; 1270 pmd_t *pmd;
1272 unsigned long next; 1271 unsigned long next;
1273 1272
1274 pfn -= addr >> PAGE_SHIFT; 1273 pfn -= addr >> PAGE_SHIFT;
1275 pmd = pmd_alloc(mm, pud, addr); 1274 pmd = pmd_alloc(mm, pud, addr);
1276 if (!pmd) 1275 if (!pmd)
1277 return -ENOMEM; 1276 return -ENOMEM;
1278 do { 1277 do {
1279 next = pmd_addr_end(addr, end); 1278 next = pmd_addr_end(addr, end);
1280 if (remap_pte_range(mm, pmd, addr, next, 1279 if (remap_pte_range(mm, pmd, addr, next,
1281 pfn + (addr >> PAGE_SHIFT), prot)) 1280 pfn + (addr >> PAGE_SHIFT), prot))
1282 return -ENOMEM; 1281 return -ENOMEM;
1283 } while (pmd++, addr = next, addr != end); 1282 } while (pmd++, addr = next, addr != end);
1284 return 0; 1283 return 0;
1285 } 1284 }
1286 1285
1287 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, 1286 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1288 unsigned long addr, unsigned long end, 1287 unsigned long addr, unsigned long end,
1289 unsigned long pfn, pgprot_t prot) 1288 unsigned long pfn, pgprot_t prot)
1290 { 1289 {
1291 pud_t *pud; 1290 pud_t *pud;
1292 unsigned long next; 1291 unsigned long next;
1293 1292
1294 pfn -= addr >> PAGE_SHIFT; 1293 pfn -= addr >> PAGE_SHIFT;
1295 pud = pud_alloc(mm, pgd, addr); 1294 pud = pud_alloc(mm, pgd, addr);
1296 if (!pud) 1295 if (!pud)
1297 return -ENOMEM; 1296 return -ENOMEM;
1298 do { 1297 do {
1299 next = pud_addr_end(addr, end); 1298 next = pud_addr_end(addr, end);
1300 if (remap_pmd_range(mm, pud, addr, next, 1299 if (remap_pmd_range(mm, pud, addr, next,
1301 pfn + (addr >> PAGE_SHIFT), prot)) 1300 pfn + (addr >> PAGE_SHIFT), prot))
1302 return -ENOMEM; 1301 return -ENOMEM;
1303 } while (pud++, addr = next, addr != end); 1302 } while (pud++, addr = next, addr != end);
1304 return 0; 1303 return 0;
1305 } 1304 }
1306 1305
1307 /** 1306 /**
1308 * remap_pfn_range - remap kernel memory to userspace 1307 * remap_pfn_range - remap kernel memory to userspace
1309 * @vma: user vma to map to 1308 * @vma: user vma to map to
1310 * @addr: target user address to start at 1309 * @addr: target user address to start at
1311 * @pfn: physical address of kernel memory 1310 * @pfn: physical address of kernel memory
1312 * @size: size of map area 1311 * @size: size of map area
1313 * @prot: page protection flags for this mapping 1312 * @prot: page protection flags for this mapping
1314 * 1313 *
1315 * Note: this is only safe if the mm semaphore is held when called. 1314 * Note: this is only safe if the mm semaphore is held when called.
1316 */ 1315 */
1317 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 1316 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1318 unsigned long pfn, unsigned long size, pgprot_t prot) 1317 unsigned long pfn, unsigned long size, pgprot_t prot)
1319 { 1318 {
1320 pgd_t *pgd; 1319 pgd_t *pgd;
1321 unsigned long next; 1320 unsigned long next;
1322 unsigned long end = addr + PAGE_ALIGN(size); 1321 unsigned long end = addr + PAGE_ALIGN(size);
1323 struct mm_struct *mm = vma->vm_mm; 1322 struct mm_struct *mm = vma->vm_mm;
1324 int err; 1323 int err;
1325 1324
1326 /* 1325 /*
1327 * Physically remapped pages are special. Tell the 1326 * Physically remapped pages are special. Tell the
1328 * rest of the world about it: 1327 * rest of the world about it:
1329 * VM_IO tells people not to look at these pages 1328 * VM_IO tells people not to look at these pages
1330 * (accesses can have side effects). 1329 * (accesses can have side effects).
1331 * VM_RESERVED is specified all over the place, because 1330 * VM_RESERVED is specified all over the place, because
1332 * in 2.4 it kept swapout's vma scan off this vma; but 1331 * in 2.4 it kept swapout's vma scan off this vma; but
1333 * in 2.6 the LRU scan won't even find its pages, so this 1332 * in 2.6 the LRU scan won't even find its pages, so this
1334 * flag means no more than count its pages in reserved_vm, 1333 * flag means no more than count its pages in reserved_vm,
1335 * and omit it from core dump, even when VM_IO is turned off. 1334 * and omit it from core dump, even when VM_IO is turned off.
1336 * VM_PFNMAP tells the core MM that the base pages are just 1335 * VM_PFNMAP tells the core MM that the base pages are just
1337 * raw PFN mappings, and do not have a "struct page" associated 1336 * raw PFN mappings, and do not have a "struct page" associated
1338 * with them. 1337 * with them.
1339 * 1338 *
1340 * There's a horrible special case to handle copy-on-write 1339 * There's a horrible special case to handle copy-on-write
1341 * behaviour that some programs depend on. We mark the "original" 1340 * behaviour that some programs depend on. We mark the "original"
1342 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 1341 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
1343 */ 1342 */
1344 if (is_cow_mapping(vma->vm_flags)) { 1343 if (is_cow_mapping(vma->vm_flags)) {
1345 if (addr != vma->vm_start || end != vma->vm_end) 1344 if (addr != vma->vm_start || end != vma->vm_end)
1346 return -EINVAL; 1345 return -EINVAL;
1347 vma->vm_pgoff = pfn; 1346 vma->vm_pgoff = pfn;
1348 } 1347 }
1349 1348
1350 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1349 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1351 1350
1352 BUG_ON(addr >= end); 1351 BUG_ON(addr >= end);
1353 pfn -= addr >> PAGE_SHIFT; 1352 pfn -= addr >> PAGE_SHIFT;
1354 pgd = pgd_offset(mm, addr); 1353 pgd = pgd_offset(mm, addr);
1355 flush_cache_range(vma, addr, end); 1354 flush_cache_range(vma, addr, end);
1356 do { 1355 do {
1357 next = pgd_addr_end(addr, end); 1356 next = pgd_addr_end(addr, end);
1358 err = remap_pud_range(mm, pgd, addr, next, 1357 err = remap_pud_range(mm, pgd, addr, next,
1359 pfn + (addr >> PAGE_SHIFT), prot); 1358 pfn + (addr >> PAGE_SHIFT), prot);
1360 if (err) 1359 if (err)
1361 break; 1360 break;
1362 } while (pgd++, addr = next, addr != end); 1361 } while (pgd++, addr = next, addr != end);
1363 return err; 1362 return err;
1364 } 1363 }
1365 EXPORT_SYMBOL(remap_pfn_range); 1364 EXPORT_SYMBOL(remap_pfn_range);
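
A typical caller of remap_pfn_range() is a character driver's ->mmap method, which runs with the mm semaphore held as the kernel-doc above requires. The sketch below is illustrative only; mydev_mmap() and the struct mydev fields are hypothetical.

	#include <linux/fs.h>
	#include <linux/mm.h>

	struct mydev {			/* hypothetical per-device state */
		unsigned long buf_phys;	/* physical base of a contiguous buffer */
		unsigned long buf_size;
	};

	static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct mydev *dev = file->private_data;
		unsigned long len = vma->vm_end - vma->vm_start;

		if (len > dev->buf_size)
			return -EINVAL;

		/* remap_pfn_range() sets VM_IO | VM_RESERVED | VM_PFNMAP itself */
		return remap_pfn_range(vma, vma->vm_start,
				       dev->buf_phys >> PAGE_SHIFT,
				       len, vma->vm_page_prot);
	}
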
1366 1365
1367 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, 1366 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1368 unsigned long addr, unsigned long end, 1367 unsigned long addr, unsigned long end,
1369 pte_fn_t fn, void *data) 1368 pte_fn_t fn, void *data)
1370 { 1369 {
1371 pte_t *pte; 1370 pte_t *pte;
1372 int err; 1371 int err;
1373 struct page *pmd_page; 1372 struct page *pmd_page;
1374 spinlock_t *uninitialized_var(ptl); 1373 spinlock_t *uninitialized_var(ptl);
1375 1374
1376 pte = (mm == &init_mm) ? 1375 pte = (mm == &init_mm) ?
1377 pte_alloc_kernel(pmd, addr) : 1376 pte_alloc_kernel(pmd, addr) :
1378 pte_alloc_map_lock(mm, pmd, addr, &ptl); 1377 pte_alloc_map_lock(mm, pmd, addr, &ptl);
1379 if (!pte) 1378 if (!pte)
1380 return -ENOMEM; 1379 return -ENOMEM;
1381 1380
1382 BUG_ON(pmd_huge(*pmd)); 1381 BUG_ON(pmd_huge(*pmd));
1383 1382
1384 pmd_page = pmd_page(*pmd); 1383 pmd_page = pmd_page(*pmd);
1385 1384
1386 do { 1385 do {
1387 err = fn(pte, pmd_page, addr, data); 1386 err = fn(pte, pmd_page, addr, data);
1388 if (err) 1387 if (err)
1389 break; 1388 break;
1390 } while (pte++, addr += PAGE_SIZE, addr != end); 1389 } while (pte++, addr += PAGE_SIZE, addr != end);
1391 1390
1392 if (mm != &init_mm) 1391 if (mm != &init_mm)
1393 pte_unmap_unlock(pte-1, ptl); 1392 pte_unmap_unlock(pte-1, ptl);
1394 return err; 1393 return err;
1395 } 1394 }
1396 1395
1397 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, 1396 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1398 unsigned long addr, unsigned long end, 1397 unsigned long addr, unsigned long end,
1399 pte_fn_t fn, void *data) 1398 pte_fn_t fn, void *data)
1400 { 1399 {
1401 pmd_t *pmd; 1400 pmd_t *pmd;
1402 unsigned long next; 1401 unsigned long next;
1403 int err; 1402 int err;
1404 1403
1405 pmd = pmd_alloc(mm, pud, addr); 1404 pmd = pmd_alloc(mm, pud, addr);
1406 if (!pmd) 1405 if (!pmd)
1407 return -ENOMEM; 1406 return -ENOMEM;
1408 do { 1407 do {
1409 next = pmd_addr_end(addr, end); 1408 next = pmd_addr_end(addr, end);
1410 err = apply_to_pte_range(mm, pmd, addr, next, fn, data); 1409 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
1411 if (err) 1410 if (err)
1412 break; 1411 break;
1413 } while (pmd++, addr = next, addr != end); 1412 } while (pmd++, addr = next, addr != end);
1414 return err; 1413 return err;
1415 } 1414 }
1416 1415
1417 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, 1416 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
1418 unsigned long addr, unsigned long end, 1417 unsigned long addr, unsigned long end,
1419 pte_fn_t fn, void *data) 1418 pte_fn_t fn, void *data)
1420 { 1419 {
1421 pud_t *pud; 1420 pud_t *pud;
1422 unsigned long next; 1421 unsigned long next;
1423 int err; 1422 int err;
1424 1423
1425 pud = pud_alloc(mm, pgd, addr); 1424 pud = pud_alloc(mm, pgd, addr);
1426 if (!pud) 1425 if (!pud)
1427 return -ENOMEM; 1426 return -ENOMEM;
1428 do { 1427 do {
1429 next = pud_addr_end(addr, end); 1428 next = pud_addr_end(addr, end);
1430 err = apply_to_pmd_range(mm, pud, addr, next, fn, data); 1429 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
1431 if (err) 1430 if (err)
1432 break; 1431 break;
1433 } while (pud++, addr = next, addr != end); 1432 } while (pud++, addr = next, addr != end);
1434 return err; 1433 return err;
1435 } 1434 }
1436 1435
1437 /* 1436 /*
1438 * Scan a region of virtual memory, filling in page tables as necessary 1437 * Scan a region of virtual memory, filling in page tables as necessary
1439 * and calling a provided function on each leaf page table. 1438 * and calling a provided function on each leaf page table.
1440 */ 1439 */
1441 int apply_to_page_range(struct mm_struct *mm, unsigned long addr, 1440 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1442 unsigned long size, pte_fn_t fn, void *data) 1441 unsigned long size, pte_fn_t fn, void *data)
1443 { 1442 {
1444 pgd_t *pgd; 1443 pgd_t *pgd;
1445 unsigned long next; 1444 unsigned long next;
1446 unsigned long end = addr + size; 1445 unsigned long end = addr + size;
1447 int err; 1446 int err;
1448 1447
1449 BUG_ON(addr >= end); 1448 BUG_ON(addr >= end);
1450 pgd = pgd_offset(mm, addr); 1449 pgd = pgd_offset(mm, addr);
1451 do { 1450 do {
1452 next = pgd_addr_end(addr, end); 1451 next = pgd_addr_end(addr, end);
1453 err = apply_to_pud_range(mm, pgd, addr, next, fn, data); 1452 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
1454 if (err) 1453 if (err)
1455 break; 1454 break;
1456 } while (pgd++, addr = next, addr != end); 1455 } while (pgd++, addr = next, addr != end);
1457 return err; 1456 return err;
1458 } 1457 }
1459 EXPORT_SYMBOL_GPL(apply_to_page_range); 1458 EXPORT_SYMBOL_GPL(apply_to_page_range);
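
apply_to_page_range() walks (and allocates, where needed) the page tables for a range and hands every leaf pte to the pte_fn_t callback. Below is a minimal, hedged example of such a callback; count_present_pte() and count_present() are hypothetical names, not kernel functions.

	#include <linux/mm.h>

	/* Hypothetical pte_fn_t: count how many leaf ptes in the range are populated. */
	static int count_present_pte(pte_t *pte, struct page *pmd_page,
				     unsigned long addr, void *data)
	{
		unsigned long *count = data;

		if (!pte_none(*pte))
			(*count)++;
		return 0;		/* a non-zero return stops the walk */
	}

	static unsigned long count_present(struct mm_struct *mm,
					   unsigned long addr, unsigned long size)
	{
		unsigned long count = 0;

		apply_to_page_range(mm, addr, size, count_present_pte, &count);
		return count;
	}
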
1460 1459
1461 /* 1460 /*
1462 * handle_pte_fault chooses the page fault handler according to an entry 1461 * handle_pte_fault chooses the page fault handler according to an entry
1463 * which was read non-atomically. Before making any commitment, on 1462 * which was read non-atomically. Before making any commitment, on
1464 * those architectures or configurations (e.g. i386 with PAE) which 1463 * those architectures or configurations (e.g. i386 with PAE) which
1465 * might give a mix of unmatched parts, do_swap_page and do_file_page 1464 * might give a mix of unmatched parts, do_swap_page and do_file_page
1466 * must check under lock before unmapping the pte and proceeding 1465 * must check under lock before unmapping the pte and proceeding
1467 * (but do_wp_page is only called after already making such a check; 1466 * (but do_wp_page is only called after already making such a check;
1468 * and do_anonymous_page and do_no_page can safely check later on). 1467 * and do_anonymous_page and do_no_page can safely check later on).
1469 */ 1468 */
1470 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 1469 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1471 pte_t *page_table, pte_t orig_pte) 1470 pte_t *page_table, pte_t orig_pte)
1472 { 1471 {
1473 int same = 1; 1472 int same = 1;
1474 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 1473 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1475 if (sizeof(pte_t) > sizeof(unsigned long)) { 1474 if (sizeof(pte_t) > sizeof(unsigned long)) {
1476 spinlock_t *ptl = pte_lockptr(mm, pmd); 1475 spinlock_t *ptl = pte_lockptr(mm, pmd);
1477 spin_lock(ptl); 1476 spin_lock(ptl);
1478 same = pte_same(*page_table, orig_pte); 1477 same = pte_same(*page_table, orig_pte);
1479 spin_unlock(ptl); 1478 spin_unlock(ptl);
1480 } 1479 }
1481 #endif 1480 #endif
1482 pte_unmap(page_table); 1481 pte_unmap(page_table);
1483 return same; 1482 return same;
1484 } 1483 }
1485 1484
1486 /* 1485 /*
1487 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when 1486 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
1488 * servicing faults for write access. In the normal case, we always want 1487 * servicing faults for write access. In the normal case, we always want
1489 * pte_mkwrite. But get_user_pages can cause write faults for mappings 1488 * pte_mkwrite. But get_user_pages can cause write faults for mappings
1490 * that do not have writing enabled, when used by access_process_vm. 1489 * that do not have writing enabled, when used by access_process_vm.
1491 */ 1490 */
1492 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) 1491 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1493 { 1492 {
1494 if (likely(vma->vm_flags & VM_WRITE)) 1493 if (likely(vma->vm_flags & VM_WRITE))
1495 pte = pte_mkwrite(pte); 1494 pte = pte_mkwrite(pte);
1496 return pte; 1495 return pte;
1497 } 1496 }
1498 1497
1499 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 1498 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
1500 { 1499 {
1501 /* 1500 /*
1502 * If the source page was a PFN mapping, we don't have 1501 * If the source page was a PFN mapping, we don't have
1503 * a "struct page" for it. We do a best-effort copy by 1502 * a "struct page" for it. We do a best-effort copy by
1504 * just copying from the original user address. If that 1503 * just copying from the original user address. If that
1505 * fails, we just zero-fill it. Live with it. 1504 * fails, we just zero-fill it. Live with it.
1506 */ 1505 */
1507 if (unlikely(!src)) { 1506 if (unlikely(!src)) {
1508 void *kaddr = kmap_atomic(dst, KM_USER0); 1507 void *kaddr = kmap_atomic(dst, KM_USER0);
1509 void __user *uaddr = (void __user *)(va & PAGE_MASK); 1508 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1510 1509
1511 /* 1510 /*
1512 * This really shouldn't fail, because the page is there 1511 * This really shouldn't fail, because the page is there
1513 * in the page tables. But it might just be unreadable, 1512 * in the page tables. But it might just be unreadable,
1514 * in which case we just give up and fill the result with 1513 * in which case we just give up and fill the result with
1515 * zeroes. 1514 * zeroes.
1516 */ 1515 */
1517 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 1516 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1518 memset(kaddr, 0, PAGE_SIZE); 1517 memset(kaddr, 0, PAGE_SIZE);
1519 kunmap_atomic(kaddr, KM_USER0); 1518 kunmap_atomic(kaddr, KM_USER0);
1520 flush_dcache_page(dst); 1519 flush_dcache_page(dst);
1521 return; 1520 return;
1522 1521
1523 } 1522 }
1524 copy_user_highpage(dst, src, va, vma); 1523 copy_user_highpage(dst, src, va, vma);
1525 } 1524 }
1526 1525
1527 /* 1526 /*
1528 * This routine handles present pages, when users try to write 1527 * This routine handles present pages, when users try to write
1529 * to a shared page. It is done by copying the page to a new address 1528 * to a shared page. It is done by copying the page to a new address
1530 * and decrementing the shared-page counter for the old page. 1529 * and decrementing the shared-page counter for the old page.
1531 * 1530 *
1532 * Note that this routine assumes that the protection checks have been 1531 * Note that this routine assumes that the protection checks have been
1533 * done by the caller (the low-level page fault routine in most cases). 1532 * done by the caller (the low-level page fault routine in most cases).
1534 * Thus we can safely just mark it writable once we've done any necessary 1533 * Thus we can safely just mark it writable once we've done any necessary
1535 * COW. 1534 * COW.
1536 * 1535 *
1537 * We also mark the page dirty at this point even though the page will 1536 * We also mark the page dirty at this point even though the page will
1538 * change only once the write actually happens. This avoids a few races, 1537 * change only once the write actually happens. This avoids a few races,
1539 * and potentially makes it more efficient. 1538 * and potentially makes it more efficient.
1540 * 1539 *
1541 * We enter with non-exclusive mmap_sem (to exclude vma changes, 1540 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1542 * but allow concurrent faults), with pte both mapped and locked. 1541 * but allow concurrent faults), with pte both mapped and locked.
1543 * We return with mmap_sem still held, but pte unmapped and unlocked. 1542 * We return with mmap_sem still held, but pte unmapped and unlocked.
1544 */ 1543 */
1545 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 1544 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1546 unsigned long address, pte_t *page_table, pmd_t *pmd, 1545 unsigned long address, pte_t *page_table, pmd_t *pmd,
1547 spinlock_t *ptl, pte_t orig_pte) 1546 spinlock_t *ptl, pte_t orig_pte)
1548 { 1547 {
1549 struct page *old_page, *new_page; 1548 struct page *old_page, *new_page;
1550 pte_t entry; 1549 pte_t entry;
1551 int reuse = 0, ret = 0; 1550 int reuse = 0, ret = 0;
1552 int page_mkwrite = 0; 1551 int page_mkwrite = 0;
1553 struct page *dirty_page = NULL; 1552 struct page *dirty_page = NULL;
1554 1553
1555 old_page = vm_normal_page(vma, address, orig_pte); 1554 old_page = vm_normal_page(vma, address, orig_pte);
1556 if (!old_page) 1555 if (!old_page)
1557 goto gotten; 1556 goto gotten;
1558 1557
1559 /* 1558 /*
1560 * Take out anonymous pages first, anonymous shared vmas are 1559 * Take out anonymous pages first, anonymous shared vmas are
1561 * not dirty accountable. 1560 * not dirty accountable.
1562 */ 1561 */
1563 if (PageAnon(old_page)) { 1562 if (PageAnon(old_page)) {
1564 if (!TestSetPageLocked(old_page)) { 1563 if (!TestSetPageLocked(old_page)) {
1565 reuse = can_share_swap_page(old_page); 1564 reuse = can_share_swap_page(old_page);
1566 unlock_page(old_page); 1565 unlock_page(old_page);
1567 } 1566 }
1568 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 1567 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1569 (VM_WRITE|VM_SHARED))) { 1568 (VM_WRITE|VM_SHARED))) {
1570 /* 1569 /*
1571 * Only catch write-faults on shared writable pages; 1570 * Only catch write-faults on shared writable pages;
1572 * read-only shared pages can get COWed by 1571 * read-only shared pages can get COWed by
1573 * get_user_pages(.write=1, .force=1). 1572 * get_user_pages(.write=1, .force=1).
1574 */ 1573 */
1575 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 1574 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1576 /* 1575 /*
1577 * Notify the address space that the page is about to 1576 * Notify the address space that the page is about to
1578 * become writable so that it can prohibit this or wait 1577 * become writable so that it can prohibit this or wait
1579 * for the page to get into an appropriate state. 1578 * for the page to get into an appropriate state.
1580 * 1579 *
1581 * We do this without the lock held, so that it can 1580 * We do this without the lock held, so that it can
1582 * sleep if it needs to. 1581 * sleep if it needs to.
1583 */ 1582 */
1584 page_cache_get(old_page); 1583 page_cache_get(old_page);
1585 pte_unmap_unlock(page_table, ptl); 1584 pte_unmap_unlock(page_table, ptl);
1586 1585
1587 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) 1586 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
1588 goto unwritable_page; 1587 goto unwritable_page;
1589 1588
1590 /* 1589 /*
1591 * Since we dropped the lock we need to revalidate 1590 * Since we dropped the lock we need to revalidate
1592 * the PTE as someone else may have changed it. If 1591 * the PTE as someone else may have changed it. If
1593 * they did, we just return, as we can count on the 1592 * they did, we just return, as we can count on the
1594 * MMU to tell us if they didn't also make it writable. 1593 * MMU to tell us if they didn't also make it writable.
1595 */ 1594 */
1596 page_table = pte_offset_map_lock(mm, pmd, address, 1595 page_table = pte_offset_map_lock(mm, pmd, address,
1597 &ptl); 1596 &ptl);
1598 page_cache_release(old_page); 1597 page_cache_release(old_page);
1599 if (!pte_same(*page_table, orig_pte)) 1598 if (!pte_same(*page_table, orig_pte))
1600 goto unlock; 1599 goto unlock;
1601 1600
1602 page_mkwrite = 1; 1601 page_mkwrite = 1;
1603 } 1602 }
1604 dirty_page = old_page; 1603 dirty_page = old_page;
1605 get_page(dirty_page); 1604 get_page(dirty_page);
1606 reuse = 1; 1605 reuse = 1;
1607 } 1606 }
1608 1607
1609 if (reuse) { 1608 if (reuse) {
1610 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1609 flush_cache_page(vma, address, pte_pfn(orig_pte));
1611 entry = pte_mkyoung(orig_pte); 1610 entry = pte_mkyoung(orig_pte);
1612 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1611 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1613 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 1612 if (ptep_set_access_flags(vma, address, page_table, entry,1))
1614 update_mmu_cache(vma, address, entry); 1613 update_mmu_cache(vma, address, entry);
1615 ret |= VM_FAULT_WRITE; 1614 ret |= VM_FAULT_WRITE;
1616 goto unlock; 1615 goto unlock;
1617 } 1616 }
1618 1617
1619 /* 1618 /*
1620 * Ok, we need to copy. Oh, well.. 1619 * Ok, we need to copy. Oh, well..
1621 */ 1620 */
1622 page_cache_get(old_page); 1621 page_cache_get(old_page);
1623 gotten: 1622 gotten:
1624 pte_unmap_unlock(page_table, ptl); 1623 pte_unmap_unlock(page_table, ptl);
1625 1624
1626 if (unlikely(anon_vma_prepare(vma))) 1625 if (unlikely(anon_vma_prepare(vma)))
1627 goto oom; 1626 goto oom;
1628 VM_BUG_ON(old_page == ZERO_PAGE(0)); 1627 VM_BUG_ON(old_page == ZERO_PAGE(0));
1629 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1628 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1630 if (!new_page) 1629 if (!new_page)
1631 goto oom; 1630 goto oom;
1632 cow_user_page(new_page, old_page, address, vma); 1631 cow_user_page(new_page, old_page, address, vma);
1633 1632
1634 /* 1633 /*
1635 * Re-check the pte - we dropped the lock 1634 * Re-check the pte - we dropped the lock
1636 */ 1635 */
1637 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 1636 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1638 if (likely(pte_same(*page_table, orig_pte))) { 1637 if (likely(pte_same(*page_table, orig_pte))) {
1639 if (old_page) { 1638 if (old_page) {
1640 page_remove_rmap(old_page, vma); 1639 page_remove_rmap(old_page, vma);
1641 if (!PageAnon(old_page)) { 1640 if (!PageAnon(old_page)) {
1642 dec_mm_counter(mm, file_rss); 1641 dec_mm_counter(mm, file_rss);
1643 inc_mm_counter(mm, anon_rss); 1642 inc_mm_counter(mm, anon_rss);
1644 } 1643 }
1645 } else 1644 } else
1646 inc_mm_counter(mm, anon_rss); 1645 inc_mm_counter(mm, anon_rss);
1647 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1646 flush_cache_page(vma, address, pte_pfn(orig_pte));
1648 entry = mk_pte(new_page, vma->vm_page_prot); 1647 entry = mk_pte(new_page, vma->vm_page_prot);
1649 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1648 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1650 /* 1649 /*
1651 * Clear the pte entry and flush it first, before updating the 1650 * Clear the pte entry and flush it first, before updating the
1652 * pte with the new entry. This will avoid a race condition 1651 * pte with the new entry. This will avoid a race condition
1653 * seen in the presence of one thread doing SMC and another 1652 * seen in the presence of one thread doing SMC and another
1654 * thread doing COW. 1653 * thread doing COW.
1655 */ 1654 */
1656 ptep_clear_flush(vma, address, page_table); 1655 ptep_clear_flush(vma, address, page_table);
1657 set_pte_at(mm, address, page_table, entry); 1656 set_pte_at(mm, address, page_table, entry);
1658 update_mmu_cache(vma, address, entry); 1657 update_mmu_cache(vma, address, entry);
1659 lru_cache_add_active(new_page); 1658 lru_cache_add_active(new_page);
1660 page_add_new_anon_rmap(new_page, vma, address); 1659 page_add_new_anon_rmap(new_page, vma, address);
1661 1660
1662 /* Free the old page.. */ 1661 /* Free the old page.. */
1663 new_page = old_page; 1662 new_page = old_page;
1664 ret |= VM_FAULT_WRITE; 1663 ret |= VM_FAULT_WRITE;
1665 } 1664 }
1666 if (new_page) 1665 if (new_page)
1667 page_cache_release(new_page); 1666 page_cache_release(new_page);
1668 if (old_page) 1667 if (old_page)
1669 page_cache_release(old_page); 1668 page_cache_release(old_page);
1670 unlock: 1669 unlock:
1671 pte_unmap_unlock(page_table, ptl); 1670 pte_unmap_unlock(page_table, ptl);
1672 if (dirty_page) { 1671 if (dirty_page) {
1673 if (vma->vm_file) 1672 if (vma->vm_file)
1674 file_update_time(vma->vm_file); 1673 file_update_time(vma->vm_file);
1675 1674
1676 /* 1675 /*
1677 * Yes, Virginia, this is actually required to prevent a race 1676 * Yes, Virginia, this is actually required to prevent a race
1678 * with clear_page_dirty_for_io() from clearing the page dirty 1677 * with clear_page_dirty_for_io() from clearing the page dirty
1679 * bit after it clears all dirty ptes, but before a racing 1678 * bit after it clears all dirty ptes, but before a racing
1680 * do_wp_page installs a dirty pte. 1679 * do_wp_page installs a dirty pte.
1681 * 1680 *
1682 * do_no_page is protected similarly. 1681 * do_no_page is protected similarly.
1683 */ 1682 */
1684 wait_on_page_locked(dirty_page); 1683 wait_on_page_locked(dirty_page);
1685 set_page_dirty_balance(dirty_page, page_mkwrite); 1684 set_page_dirty_balance(dirty_page, page_mkwrite);
1686 put_page(dirty_page); 1685 put_page(dirty_page);
1687 } 1686 }
1688 return ret; 1687 return ret;
1689 oom: 1688 oom:
1690 if (old_page) 1689 if (old_page)
1691 page_cache_release(old_page); 1690 page_cache_release(old_page);
1692 return VM_FAULT_OOM; 1691 return VM_FAULT_OOM;
1693 1692
1694 unwritable_page: 1693 unwritable_page:
1695 page_cache_release(old_page); 1694 page_cache_release(old_page);
1696 return VM_FAULT_SIGBUS; 1695 return VM_FAULT_SIGBUS;
1697 } 1696 }
1698 1697
1699 /* 1698 /*
1700 * Helper functions for unmap_mapping_range(). 1699 * Helper functions for unmap_mapping_range().
1701 * 1700 *
1702 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ 1701 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
1703 * 1702 *
1704 * We have to restart searching the prio_tree whenever we drop the lock, 1703 * We have to restart searching the prio_tree whenever we drop the lock,
1705 * since the iterator is only valid while the lock is held, and anyway 1704 * since the iterator is only valid while the lock is held, and anyway
1706 * a later vma might be split and reinserted earlier while lock dropped. 1705 * a later vma might be split and reinserted earlier while lock dropped.
1707 * 1706 *
1708 * The list of nonlinear vmas could be handled more efficiently, using 1707 * The list of nonlinear vmas could be handled more efficiently, using
1709 * a placeholder, but handle it in the same way until a need is shown. 1708 * a placeholder, but handle it in the same way until a need is shown.
1710 * It is important to search the prio_tree before nonlinear list: a vma 1709 * It is important to search the prio_tree before nonlinear list: a vma
1711 * may become nonlinear and be shifted from prio_tree to nonlinear list 1710 * may become nonlinear and be shifted from prio_tree to nonlinear list
1712 * while the lock is dropped; but never shifted from list to prio_tree. 1711 * while the lock is dropped; but never shifted from list to prio_tree.
1713 * 1712 *
1714 * In order to make forward progress despite restarting the search, 1713 * In order to make forward progress despite restarting the search,
1715 * vm_truncate_count is used to mark a vma as now dealt with, so we can 1714 * vm_truncate_count is used to mark a vma as now dealt with, so we can
1716 * quickly skip it next time around. Since the prio_tree search only 1715 * quickly skip it next time around. Since the prio_tree search only
1717 * shows us those vmas affected by unmapping the range in question, we 1716 * shows us those vmas affected by unmapping the range in question, we
1718 * can't efficiently keep all vmas in step with mapping->truncate_count: 1717 * can't efficiently keep all vmas in step with mapping->truncate_count:
1719 * so instead reset them all whenever it wraps back to 0 (then go to 1). 1718 * so instead reset them all whenever it wraps back to 0 (then go to 1).
1720 * mapping->truncate_count and vma->vm_truncate_count are protected by 1719 * mapping->truncate_count and vma->vm_truncate_count are protected by
1721 * i_mmap_lock. 1720 * i_mmap_lock.
1722 * 1721 *
1723 * In order to make forward progress despite repeatedly restarting some 1722 * In order to make forward progress despite repeatedly restarting some
1724 * large vma, note the restart_addr from unmap_vmas when it breaks out: 1723 * large vma, note the restart_addr from unmap_vmas when it breaks out:
1725 * and restart from that address when we reach that vma again. It might 1724 * and restart from that address when we reach that vma again. It might
1726 * have been split or merged, shrunk or extended, but never shifted: so 1725 * have been split or merged, shrunk or extended, but never shifted: so
1727 * restart_addr remains valid so long as it remains in the vma's range. 1726 * restart_addr remains valid so long as it remains in the vma's range.
1728 * unmap_mapping_range forces truncate_count to leap over page-aligned 1727 * unmap_mapping_range forces truncate_count to leap over page-aligned
1729 * values so we can save vma's restart_addr in its truncate_count field. 1728 * values so we can save vma's restart_addr in its truncate_count field.
1730 */ 1729 */
1731 #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) 1730 #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
1732 1731
1733 static void reset_vma_truncate_counts(struct address_space *mapping) 1732 static void reset_vma_truncate_counts(struct address_space *mapping)
1734 { 1733 {
1735 struct vm_area_struct *vma; 1734 struct vm_area_struct *vma;
1736 struct prio_tree_iter iter; 1735 struct prio_tree_iter iter;
1737 1736
1738 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) 1737 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
1739 vma->vm_truncate_count = 0; 1738 vma->vm_truncate_count = 0;
1740 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1739 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1741 vma->vm_truncate_count = 0; 1740 vma->vm_truncate_count = 0;
1742 } 1741 }
1743 1742
1744 static int unmap_mapping_range_vma(struct vm_area_struct *vma, 1743 static int unmap_mapping_range_vma(struct vm_area_struct *vma,
1745 unsigned long start_addr, unsigned long end_addr, 1744 unsigned long start_addr, unsigned long end_addr,
1746 struct zap_details *details) 1745 struct zap_details *details)
1747 { 1746 {
1748 unsigned long restart_addr; 1747 unsigned long restart_addr;
1749 int need_break; 1748 int need_break;
1750 1749
1751 /* 1750 /*
1752 * files that support invalidating or truncating portions of the 1751 * files that support invalidating or truncating portions of the
1753 * file from under mmapped areas must have their ->fault function 1752 * file from under mmapped areas must have their ->fault function
1754 * return a locked page (and set VM_FAULT_LOCKED in the return). 1753 * return a locked page (and set VM_FAULT_LOCKED in the return).
1755 * This provides synchronisation against concurrent unmapping here. 1754 * This provides synchronisation against concurrent unmapping here.
1756 */ 1755 */
1757 1756
1758 again: 1757 again:
1759 restart_addr = vma->vm_truncate_count; 1758 restart_addr = vma->vm_truncate_count;
1760 if (is_restart_addr(restart_addr) && start_addr < restart_addr) { 1759 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
1761 start_addr = restart_addr; 1760 start_addr = restart_addr;
1762 if (start_addr >= end_addr) { 1761 if (start_addr >= end_addr) {
1763 /* Top of vma has been split off since last time */ 1762 /* Top of vma has been split off since last time */
1764 vma->vm_truncate_count = details->truncate_count; 1763 vma->vm_truncate_count = details->truncate_count;
1765 return 0; 1764 return 0;
1766 } 1765 }
1767 } 1766 }
1768 1767
1769 restart_addr = zap_page_range(vma, start_addr, 1768 restart_addr = zap_page_range(vma, start_addr,
1770 end_addr - start_addr, details); 1769 end_addr - start_addr, details);
1771 need_break = need_resched() || 1770 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
1772 need_lockbreak(details->i_mmap_lock);
1773 1771
1774 if (restart_addr >= end_addr) { 1772 if (restart_addr >= end_addr) {
1775 /* We have now completed this vma: mark it so */ 1773 /* We have now completed this vma: mark it so */
1776 vma->vm_truncate_count = details->truncate_count; 1774 vma->vm_truncate_count = details->truncate_count;
1777 if (!need_break) 1775 if (!need_break)
1778 return 0; 1776 return 0;
1779 } else { 1777 } else {
1780 /* Note restart_addr in vma's truncate_count field */ 1778 /* Note restart_addr in vma's truncate_count field */
1781 vma->vm_truncate_count = restart_addr; 1779 vma->vm_truncate_count = restart_addr;
1782 if (!need_break) 1780 if (!need_break)
1783 goto again; 1781 goto again;
1784 } 1782 }
1785 1783
1786 spin_unlock(details->i_mmap_lock); 1784 spin_unlock(details->i_mmap_lock);
1787 cond_resched(); 1785 cond_resched();
1788 spin_lock(details->i_mmap_lock); 1786 spin_lock(details->i_mmap_lock);
1789 return -EINTR; 1787 return -EINTR;
1790 } 1788 }
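
The need_break computation above is where this file picks up spin_needbreak() in place of need_lockbreak(), and the tail of the function shows the idiom it feeds: drop i_mmap_lock, cond_resched(), retake the lock and report -EINTR so the caller restarts. A generic, hedged sketch of that lockbreak idiom, with a placeholder loop body, looks like this:

	#include <linux/spinlock.h>
	#include <linux/sched.h>

	/*
	 * Hedged sketch: process nr_items under "lock", voluntarily dropping it
	 * whenever we need to reschedule or another CPU is waiting for the lock.
	 */
	static void long_walk(spinlock_t *lock, int nr_items)
	{
		int i;

		spin_lock(lock);
		for (i = 0; i < nr_items; i++) {
			/* ... process item i while holding the lock ... */
			if (need_resched() || spin_needbreak(lock)) {
				spin_unlock(lock);
				cond_resched();
				spin_lock(lock);
			}
		}
		spin_unlock(lock);
	}
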
1791 1789
1792 static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 1790 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
1793 struct zap_details *details) 1791 struct zap_details *details)
1794 { 1792 {
1795 struct vm_area_struct *vma; 1793 struct vm_area_struct *vma;
1796 struct prio_tree_iter iter; 1794 struct prio_tree_iter iter;
1797 pgoff_t vba, vea, zba, zea; 1795 pgoff_t vba, vea, zba, zea;
1798 1796
1799 restart: 1797 restart:
1800 vma_prio_tree_foreach(vma, &iter, root, 1798 vma_prio_tree_foreach(vma, &iter, root,
1801 details->first_index, details->last_index) { 1799 details->first_index, details->last_index) {
1802 /* Skip quickly over those we have already dealt with */ 1800 /* Skip quickly over those we have already dealt with */
1803 if (vma->vm_truncate_count == details->truncate_count) 1801 if (vma->vm_truncate_count == details->truncate_count)
1804 continue; 1802 continue;
1805 1803
1806 vba = vma->vm_pgoff; 1804 vba = vma->vm_pgoff;
1807 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 1805 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
1808 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ 1806 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
1809 zba = details->first_index; 1807 zba = details->first_index;
1810 if (zba < vba) 1808 if (zba < vba)
1811 zba = vba; 1809 zba = vba;
1812 zea = details->last_index; 1810 zea = details->last_index;
1813 if (zea > vea) 1811 if (zea > vea)
1814 zea = vea; 1812 zea = vea;
1815 1813
1816 if (unmap_mapping_range_vma(vma, 1814 if (unmap_mapping_range_vma(vma,
1817 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 1815 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
1818 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 1816 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
1819 details) < 0) 1817 details) < 0)
1820 goto restart; 1818 goto restart;
1821 } 1819 }
1822 } 1820 }
1823 1821
1824 static inline void unmap_mapping_range_list(struct list_head *head, 1822 static inline void unmap_mapping_range_list(struct list_head *head,
1825 struct zap_details *details) 1823 struct zap_details *details)
1826 { 1824 {
1827 struct vm_area_struct *vma; 1825 struct vm_area_struct *vma;
1828 1826
1829 /* 1827 /*
1830 * In nonlinear VMAs there is no correspondence between virtual address 1828 * In nonlinear VMAs there is no correspondence between virtual address
1831 * offset and file offset. So we must perform an exhaustive search 1829 * offset and file offset. So we must perform an exhaustive search
1832 * across *all* the pages in each nonlinear VMA, not just the pages 1830 * across *all* the pages in each nonlinear VMA, not just the pages
1833 * whose virtual address lies outside the file truncation point. 1831 * whose virtual address lies outside the file truncation point.
1834 */ 1832 */
1835 restart: 1833 restart:
1836 list_for_each_entry(vma, head, shared.vm_set.list) { 1834 list_for_each_entry(vma, head, shared.vm_set.list) {
1837 /* Skip quickly over those we have already dealt with */ 1835 /* Skip quickly over those we have already dealt with */
1838 if (vma->vm_truncate_count == details->truncate_count) 1836 if (vma->vm_truncate_count == details->truncate_count)
1839 continue; 1837 continue;
1840 details->nonlinear_vma = vma; 1838 details->nonlinear_vma = vma;
1841 if (unmap_mapping_range_vma(vma, vma->vm_start, 1839 if (unmap_mapping_range_vma(vma, vma->vm_start,
1842 vma->vm_end, details) < 0) 1840 vma->vm_end, details) < 0)
1843 goto restart; 1841 goto restart;
1844 } 1842 }
1845 } 1843 }
1846 1844
1847 /** 1845 /**
1848 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. 1846 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
1849 * @mapping: the address space containing mmaps to be unmapped. 1847 * @mapping: the address space containing mmaps to be unmapped.
1850 * @holebegin: byte in first page to unmap, relative to the start of 1848 * @holebegin: byte in first page to unmap, relative to the start of
1851 * the underlying file. This will be rounded down to a PAGE_SIZE 1849 * the underlying file. This will be rounded down to a PAGE_SIZE
1852 * boundary. Note that this is different from vmtruncate(), which 1850 * boundary. Note that this is different from vmtruncate(), which
1853 * must keep the partial page. In contrast, we must get rid of 1851 * must keep the partial page. In contrast, we must get rid of
1854 * partial pages. 1852 * partial pages.
1855 * @holelen: size of prospective hole in bytes. This will be rounded 1853 * @holelen: size of prospective hole in bytes. This will be rounded
1856 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the 1854 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
1857 * end of the file. 1855 * end of the file.
1858 * @even_cows: 1 when truncating a file, unmap even private COWed pages; 1856 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
1859 * but 0 when invalidating pagecache, don't throw away private data. 1857 * but 0 when invalidating pagecache, don't throw away private data.
1860 */ 1858 */
1861 void unmap_mapping_range(struct address_space *mapping, 1859 void unmap_mapping_range(struct address_space *mapping,
1862 loff_t const holebegin, loff_t const holelen, int even_cows) 1860 loff_t const holebegin, loff_t const holelen, int even_cows)
1863 { 1861 {
1864 struct zap_details details; 1862 struct zap_details details;
1865 pgoff_t hba = holebegin >> PAGE_SHIFT; 1863 pgoff_t hba = holebegin >> PAGE_SHIFT;
1866 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 1864 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1867 1865
1868 /* Check for overflow. */ 1866 /* Check for overflow. */
1869 if (sizeof(holelen) > sizeof(hlen)) { 1867 if (sizeof(holelen) > sizeof(hlen)) {
1870 long long holeend = 1868 long long holeend =
1871 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 1869 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1872 if (holeend & ~(long long)ULONG_MAX) 1870 if (holeend & ~(long long)ULONG_MAX)
1873 hlen = ULONG_MAX - hba + 1; 1871 hlen = ULONG_MAX - hba + 1;
1874 } 1872 }
1875 1873
1876 details.check_mapping = even_cows? NULL: mapping; 1874 details.check_mapping = even_cows? NULL: mapping;
1877 details.nonlinear_vma = NULL; 1875 details.nonlinear_vma = NULL;
1878 details.first_index = hba; 1876 details.first_index = hba;
1879 details.last_index = hba + hlen - 1; 1877 details.last_index = hba + hlen - 1;
1880 if (details.last_index < details.first_index) 1878 if (details.last_index < details.first_index)
1881 details.last_index = ULONG_MAX; 1879 details.last_index = ULONG_MAX;
1882 details.i_mmap_lock = &mapping->i_mmap_lock; 1880 details.i_mmap_lock = &mapping->i_mmap_lock;
1883 1881
1884 spin_lock(&mapping->i_mmap_lock); 1882 spin_lock(&mapping->i_mmap_lock);
1885 1883
1886 /* Protect against endless unmapping loops */ 1884 /* Protect against endless unmapping loops */
1887 mapping->truncate_count++; 1885 mapping->truncate_count++;
1888 if (unlikely(is_restart_addr(mapping->truncate_count))) { 1886 if (unlikely(is_restart_addr(mapping->truncate_count))) {
1889 if (mapping->truncate_count == 0) 1887 if (mapping->truncate_count == 0)
1890 reset_vma_truncate_counts(mapping); 1888 reset_vma_truncate_counts(mapping);
1891 mapping->truncate_count++; 1889 mapping->truncate_count++;
1892 } 1890 }
1893 details.truncate_count = mapping->truncate_count; 1891 details.truncate_count = mapping->truncate_count;
1894 1892
1895 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 1893 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
1896 unmap_mapping_range_tree(&mapping->i_mmap, &details); 1894 unmap_mapping_range_tree(&mapping->i_mmap, &details);
1897 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 1895 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
1898 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 1896 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
1899 spin_unlock(&mapping->i_mmap_lock); 1897 spin_unlock(&mapping->i_mmap_lock);
1900 } 1898 }
1901 EXPORT_SYMBOL(unmap_mapping_range); 1899 EXPORT_SYMBOL(unmap_mapping_range);
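
The even_cows argument documented above is the knob most callers care about: truncation passes 1 so private COWed copies are dropped too, while pagecache invalidation passes 0 to preserve private data. A hedged one-line illustration, where invalidate_mappings() is a hypothetical helper:

	#include <linux/fs.h>
	#include <linux/mm.h>

	/*
	 * Hypothetical helper: drop all user mappings of [start, start + len)
	 * of an inode's pagecache while keeping private COWed pages (even_cows == 0).
	 */
	static void invalidate_mappings(struct inode *inode, loff_t start, loff_t len)
	{
		unmap_mapping_range(inode->i_mapping, start, len, 0);
	}
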
1902 1900
1903 /** 1901 /**
1904 * vmtruncate - unmap mappings "freed" by truncate() syscall 1902 * vmtruncate - unmap mappings "freed" by truncate() syscall
1905 * @inode: inode of the file used 1903 * @inode: inode of the file used
1906 * @offset: file offset to start truncating 1904 * @offset: file offset to start truncating
1907 * 1905 *
1908 * NOTE! We have to be ready to update the memory sharing 1906 * NOTE! We have to be ready to update the memory sharing
1909 * between the file and the memory map for a potential last 1907 * between the file and the memory map for a potential last
1910 * incomplete page. Ugly, but necessary. 1908 * incomplete page. Ugly, but necessary.
1911 */ 1909 */
1912 int vmtruncate(struct inode * inode, loff_t offset) 1910 int vmtruncate(struct inode * inode, loff_t offset)
1913 { 1911 {
1914 struct address_space *mapping = inode->i_mapping; 1912 struct address_space *mapping = inode->i_mapping;
1915 unsigned long limit; 1913 unsigned long limit;
1916 1914
1917 if (inode->i_size < offset) 1915 if (inode->i_size < offset)
1918 goto do_expand; 1916 goto do_expand;
1919 /* 1917 /*
1920 * truncation of in-use swapfiles is disallowed - it would cause 1918 * truncation of in-use swapfiles is disallowed - it would cause
1921 * subsequent swapout to scribble on the now-freed blocks. 1919 * subsequent swapout to scribble on the now-freed blocks.
1922 */ 1920 */
1923 if (IS_SWAPFILE(inode)) 1921 if (IS_SWAPFILE(inode))
1924 goto out_busy; 1922 goto out_busy;
1925 i_size_write(inode, offset); 1923 i_size_write(inode, offset);
1926 1924
1927 /* 1925 /*
1928 * unmap_mapping_range is called twice, first simply for efficiency 1926 * unmap_mapping_range is called twice, first simply for efficiency
1929 * so that truncate_inode_pages does fewer single-page unmaps. However 1927 * so that truncate_inode_pages does fewer single-page unmaps. However
1930 * after this first call, and before truncate_inode_pages finishes, 1928 * after this first call, and before truncate_inode_pages finishes,
1931 * it is possible for private pages to be COWed, which remain after 1929 * it is possible for private pages to be COWed, which remain after
1932 * truncate_inode_pages finishes, hence the second unmap_mapping_range 1930 * truncate_inode_pages finishes, hence the second unmap_mapping_range
1933 * call must be made for correctness. 1931 * call must be made for correctness.
1934 */ 1932 */
1935 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); 1933 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1936 truncate_inode_pages(mapping, offset); 1934 truncate_inode_pages(mapping, offset);
1937 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); 1935 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1938 goto out_truncate; 1936 goto out_truncate;
1939 1937
1940 do_expand: 1938 do_expand:
1941 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 1939 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1942 if (limit != RLIM_INFINITY && offset > limit) 1940 if (limit != RLIM_INFINITY && offset > limit)
1943 goto out_sig; 1941 goto out_sig;
1944 if (offset > inode->i_sb->s_maxbytes) 1942 if (offset > inode->i_sb->s_maxbytes)
1945 goto out_big; 1943 goto out_big;
1946 i_size_write(inode, offset); 1944 i_size_write(inode, offset);
1947 1945
1948 out_truncate: 1946 out_truncate:
1949 if (inode->i_op && inode->i_op->truncate) 1947 if (inode->i_op && inode->i_op->truncate)
1950 inode->i_op->truncate(inode); 1948 inode->i_op->truncate(inode);
1951 return 0; 1949 return 0;
1952 out_sig: 1950 out_sig:
1953 send_sig(SIGXFSZ, current, 0); 1951 send_sig(SIGXFSZ, current, 0);
1954 out_big: 1952 out_big:
1955 return -EFBIG; 1953 return -EFBIG;
1956 out_busy: 1954 out_busy:
1957 return -ETXTBSY; 1955 return -ETXTBSY;
1958 } 1956 }
1959 EXPORT_SYMBOL(vmtruncate); 1957 EXPORT_SYMBOL(vmtruncate);
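
vmtruncate() is normally reached from a filesystem's ->setattr when ATTR_SIZE changes; inode_setattr() calls it internally in this kernel, so a simple filesystem can just delegate. A hedged sketch of such a ->setattr, with myfs_setattr() being a hypothetical filesystem method:

	#include <linux/fs.h>

	/* Hedged sketch of a hypothetical filesystem's ->setattr. */
	static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int err = inode_change_ok(inode, attr);

		if (err)
			return err;
		/*
		 * inode_setattr() handles ATTR_SIZE via vmtruncate(), which performs
		 * the unmap_mapping_range()/truncate_inode_pages() sequence shown above.
		 */
		return inode_setattr(inode, attr);
	}
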
1960 1958
1961 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 1959 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1962 { 1960 {
1963 struct address_space *mapping = inode->i_mapping; 1961 struct address_space *mapping = inode->i_mapping;
1964 1962
1965 /* 1963 /*
1966 * If the underlying filesystem is not going to provide 1964 * If the underlying filesystem is not going to provide
1967 * a way to truncate a range of blocks (punch a hole) - 1965 * a way to truncate a range of blocks (punch a hole) -
1968 * we should return failure right now. 1966 * we should return failure right now.
1969 */ 1967 */
1970 if (!inode->i_op || !inode->i_op->truncate_range) 1968 if (!inode->i_op || !inode->i_op->truncate_range)
1971 return -ENOSYS; 1969 return -ENOSYS;
1972 1970
1973 mutex_lock(&inode->i_mutex); 1971 mutex_lock(&inode->i_mutex);
1974 down_write(&inode->i_alloc_sem); 1972 down_write(&inode->i_alloc_sem);
1975 unmap_mapping_range(mapping, offset, (end - offset), 1); 1973 unmap_mapping_range(mapping, offset, (end - offset), 1);
1976 truncate_inode_pages_range(mapping, offset, end); 1974 truncate_inode_pages_range(mapping, offset, end);
1977 unmap_mapping_range(mapping, offset, (end - offset), 1); 1975 unmap_mapping_range(mapping, offset, (end - offset), 1);
1978 inode->i_op->truncate_range(inode, offset, end); 1976 inode->i_op->truncate_range(inode, offset, end);
1979 up_write(&inode->i_alloc_sem); 1977 up_write(&inode->i_alloc_sem);
1980 mutex_unlock(&inode->i_mutex); 1978 mutex_unlock(&inode->i_mutex);
1981 1979
1982 return 0; 1980 return 0;
1983 } 1981 }
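
vmtruncate_range() only succeeds when the filesystem provides ->truncate_range, which is how madvise(MADV_REMOVE) punches holes. A hedged sketch of wiring up such a hook; myfs_truncate_range() and myfs_inode_ops are hypothetical:

	#include <linux/fs.h>

	/* Hypothetical hole-punching hook: free the blocks backing [start, end]. */
	static void myfs_truncate_range(struct inode *inode, loff_t start, loff_t end)
	{
		/* ... release the filesystem blocks and pagecache for the range ... */
	}

	static const struct inode_operations myfs_inode_ops = {
		.truncate_range	= myfs_truncate_range,
	};
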
1984 1982
1985 /** 1983 /**
1986 * swapin_readahead - swap in pages in hope we need them soon 1984 * swapin_readahead - swap in pages in hope we need them soon
1987 * @entry: swap entry of this memory 1985 * @entry: swap entry of this memory
1988 * @addr: address to start 1986 * @addr: address to start
1989 * @vma: user vma this address belongs to 1987 * @vma: user vma this address belongs to
1990 * 1988 *
1991 * Primitive swap readahead code. We simply read an aligned block of 1989 * Primitive swap readahead code. We simply read an aligned block of
1992 * (1 << page_cluster) entries in the swap area. This method is chosen 1990 * (1 << page_cluster) entries in the swap area. This method is chosen
1993 * because it doesn't cost us any seek time. We also make sure to queue 1991 * because it doesn't cost us any seek time. We also make sure to queue
1994 * the 'original' request together with the readahead ones... 1992 * the 'original' request together with the readahead ones...
1995 * 1993 *
1996 * This has been extended to use the NUMA policies from the mm triggering 1994 * This has been extended to use the NUMA policies from the mm triggering
1997 * the readahead. 1995 * the readahead.
1998 * 1996 *
1999 * Caller must hold down_read on the vma->vm_mm if vma is not NULL. 1997 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
2000 */ 1998 */
2001 void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) 1999 void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
2002 { 2000 {
2003 #ifdef CONFIG_NUMA 2001 #ifdef CONFIG_NUMA
2004 struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; 2002 struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
2005 #endif 2003 #endif
2006 int i, num; 2004 int i, num;
2007 struct page *new_page; 2005 struct page *new_page;
2008 unsigned long offset; 2006 unsigned long offset;
2009 2007
2010 /* 2008 /*
2011 * Get the number of swap handles we should do readahead I/O on. 2009 * Get the number of swap handles we should do readahead I/O on.
2012 */ 2010 */
2013 num = valid_swaphandles(entry, &offset); 2011 num = valid_swaphandles(entry, &offset);
2014 for (i = 0; i < num; offset++, i++) { 2012 for (i = 0; i < num; offset++, i++) {
2015 /* Ok, do the async read-ahead now */ 2013 /* Ok, do the async read-ahead now */
2016 new_page = read_swap_cache_async(swp_entry(swp_type(entry), 2014 new_page = read_swap_cache_async(swp_entry(swp_type(entry),
2017 offset), vma, addr); 2015 offset), vma, addr);
2018 if (!new_page) 2016 if (!new_page)
2019 break; 2017 break;
2020 page_cache_release(new_page); 2018 page_cache_release(new_page);
2021 #ifdef CONFIG_NUMA 2019 #ifdef CONFIG_NUMA
2022 /* 2020 /*
2023 * Find the next applicable VMA for the NUMA policy. 2021 * Find the next applicable VMA for the NUMA policy.
2024 */ 2022 */
2025 addr += PAGE_SIZE; 2023 addr += PAGE_SIZE;
2026 if (addr == 0) 2024 if (addr == 0)
2027 vma = NULL; 2025 vma = NULL;
2028 if (vma) { 2026 if (vma) {
2029 if (addr >= vma->vm_end) { 2027 if (addr >= vma->vm_end) {
2030 vma = next_vma; 2028 vma = next_vma;
2031 next_vma = vma ? vma->vm_next : NULL; 2029 next_vma = vma ? vma->vm_next : NULL;
2032 } 2030 }
2033 if (vma && addr < vma->vm_start) 2031 if (vma && addr < vma->vm_start)
2034 vma = NULL; 2032 vma = NULL;
2035 } else { 2033 } else {
2036 if (next_vma && addr >= next_vma->vm_start) { 2034 if (next_vma && addr >= next_vma->vm_start) {
2037 vma = next_vma; 2035 vma = next_vma;
2038 next_vma = vma->vm_next; 2036 next_vma = vma->vm_next;
2039 } 2037 }
2040 } 2038 }
2041 #endif 2039 #endif
2042 } 2040 }
2043 lru_add_drain(); /* Push any new pages onto the LRU now */ 2041 lru_add_drain(); /* Push any new pages onto the LRU now */
2044 } 2042 }
2045 2043
2046 /* 2044 /*
2047 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2045 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2048 * but allow concurrent faults), and pte mapped but not yet locked. 2046 * but allow concurrent faults), and pte mapped but not yet locked.
2049 * We return with mmap_sem still held, but pte unmapped and unlocked. 2047 * We return with mmap_sem still held, but pte unmapped and unlocked.
2050 */ 2048 */
2051 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2049 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2052 unsigned long address, pte_t *page_table, pmd_t *pmd, 2050 unsigned long address, pte_t *page_table, pmd_t *pmd,
2053 int write_access, pte_t orig_pte) 2051 int write_access, pte_t orig_pte)
2054 { 2052 {
2055 spinlock_t *ptl; 2053 spinlock_t *ptl;
2056 struct page *page; 2054 struct page *page;
2057 swp_entry_t entry; 2055 swp_entry_t entry;
2058 pte_t pte; 2056 pte_t pte;
2059 int ret = 0; 2057 int ret = 0;
2060 2058
2061 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2059 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2062 goto out; 2060 goto out;
2063 2061
2064 entry = pte_to_swp_entry(orig_pte); 2062 entry = pte_to_swp_entry(orig_pte);
2065 if (is_migration_entry(entry)) { 2063 if (is_migration_entry(entry)) {
2066 migration_entry_wait(mm, pmd, address); 2064 migration_entry_wait(mm, pmd, address);
2067 goto out; 2065 goto out;
2068 } 2066 }
2069 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2067 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2070 page = lookup_swap_cache(entry); 2068 page = lookup_swap_cache(entry);
2071 if (!page) { 2069 if (!page) {
2072 grab_swap_token(); /* Contend for token _before_ read-in */ 2070 grab_swap_token(); /* Contend for token _before_ read-in */
2073 swapin_readahead(entry, address, vma); 2071 swapin_readahead(entry, address, vma);
2074 page = read_swap_cache_async(entry, vma, address); 2072 page = read_swap_cache_async(entry, vma, address);
2075 if (!page) { 2073 if (!page) {
2076 /* 2074 /*
2077 * Back out if somebody else faulted in this pte 2075 * Back out if somebody else faulted in this pte
2078 * while we released the pte lock. 2076 * while we released the pte lock.
2079 */ 2077 */
2080 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2078 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2081 if (likely(pte_same(*page_table, orig_pte))) 2079 if (likely(pte_same(*page_table, orig_pte)))
2082 ret = VM_FAULT_OOM; 2080 ret = VM_FAULT_OOM;
2083 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2081 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2084 goto unlock; 2082 goto unlock;
2085 } 2083 }
2086 2084
2087 /* Had to read the page from swap area: Major fault */ 2085 /* Had to read the page from swap area: Major fault */
2088 ret = VM_FAULT_MAJOR; 2086 ret = VM_FAULT_MAJOR;
2089 count_vm_event(PGMAJFAULT); 2087 count_vm_event(PGMAJFAULT);
2090 } 2088 }
2091 2089
2092 mark_page_accessed(page); 2090 mark_page_accessed(page);
2093 lock_page(page); 2091 lock_page(page);
2094 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2092 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2095 2093
2096 /* 2094 /*
2097 * Back out if somebody else already faulted in this pte. 2095 * Back out if somebody else already faulted in this pte.
2098 */ 2096 */
2099 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2097 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2100 if (unlikely(!pte_same(*page_table, orig_pte))) 2098 if (unlikely(!pte_same(*page_table, orig_pte)))
2101 goto out_nomap; 2099 goto out_nomap;
2102 2100
2103 if (unlikely(!PageUptodate(page))) { 2101 if (unlikely(!PageUptodate(page))) {
2104 ret = VM_FAULT_SIGBUS; 2102 ret = VM_FAULT_SIGBUS;
2105 goto out_nomap; 2103 goto out_nomap;
2106 } 2104 }
2107 2105
2108 /* The page isn't present yet, go ahead with the fault. */ 2106 /* The page isn't present yet, go ahead with the fault. */
2109 2107
2110 inc_mm_counter(mm, anon_rss); 2108 inc_mm_counter(mm, anon_rss);
2111 pte = mk_pte(page, vma->vm_page_prot); 2109 pte = mk_pte(page, vma->vm_page_prot);
2112 if (write_access && can_share_swap_page(page)) { 2110 if (write_access && can_share_swap_page(page)) {
2113 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2111 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2114 write_access = 0; 2112 write_access = 0;
2115 } 2113 }
2116 2114
2117 flush_icache_page(vma, page); 2115 flush_icache_page(vma, page);
2118 set_pte_at(mm, address, page_table, pte); 2116 set_pte_at(mm, address, page_table, pte);
2119 page_add_anon_rmap(page, vma, address); 2117 page_add_anon_rmap(page, vma, address);
2120 2118
2121 swap_free(entry); 2119 swap_free(entry);
2122 if (vm_swap_full()) 2120 if (vm_swap_full())
2123 remove_exclusive_swap_page(page); 2121 remove_exclusive_swap_page(page);
2124 unlock_page(page); 2122 unlock_page(page);
2125 2123
2126 if (write_access) { 2124 if (write_access) {
2127 /* XXX: We could OR the do_wp_page code with this one? */ 2125 /* XXX: We could OR the do_wp_page code with this one? */
2128 if (do_wp_page(mm, vma, address, 2126 if (do_wp_page(mm, vma, address,
2129 page_table, pmd, ptl, pte) & VM_FAULT_OOM) 2127 page_table, pmd, ptl, pte) & VM_FAULT_OOM)
2130 ret = VM_FAULT_OOM; 2128 ret = VM_FAULT_OOM;
2131 goto out; 2129 goto out;
2132 } 2130 }
2133 2131
2134 /* No need to invalidate - it was non-present before */ 2132 /* No need to invalidate - it was non-present before */
2135 update_mmu_cache(vma, address, pte); 2133 update_mmu_cache(vma, address, pte);
2136 unlock: 2134 unlock:
2137 pte_unmap_unlock(page_table, ptl); 2135 pte_unmap_unlock(page_table, ptl);
2138 out: 2136 out:
2139 return ret; 2137 return ret;
2140 out_nomap: 2138 out_nomap:
2141 pte_unmap_unlock(page_table, ptl); 2139 pte_unmap_unlock(page_table, ptl);
2142 unlock_page(page); 2140 unlock_page(page);
2143 page_cache_release(page); 2141 page_cache_release(page);
2144 return ret; 2142 return ret;
2145 } 2143 }
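
The swap-in path above does the slow read_swap_cache_async() work with the pte lock dropped, then retakes the lock and re-checks pte_same() against the pte it started from, backing out if another thread faulted the page in meanwhile. Below is a minimal userspace sketch of that snapshot-and-revalidate pattern, assuming nothing beyond pthreads; the names (cache_slot, slow_populate, fault_in) are invented for illustration and are not kernel API.

/* Hedged sketch: do the expensive work outside the lock, revalidate under it.
 * Invented names; build with: cc -pthread revalidate.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static int *cache_slot;			/* analogous to the pte being filled */

static int *slow_populate(void)		/* analogous to read_swap_cache_async() */
{
	int *p = malloc(sizeof(*p));
	if (p)
		*p = 42;
	return p;
}

static void *fault_in(void *arg)
{
	int *snapshot, *page;

	pthread_mutex_lock(&slot_lock);
	snapshot = cache_slot;		/* like orig_pte: what we saw at the start */
	pthread_mutex_unlock(&slot_lock);

	page = slow_populate();		/* slow work with the lock dropped */

	pthread_mutex_lock(&slot_lock);
	if (cache_slot == snapshot)	/* like pte_same(*page_table, orig_pte) */
		cache_slot = page;
	else
		free(page);		/* somebody else faulted it in: back out */
	pthread_mutex_unlock(&slot_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, fault_in, NULL);
	pthread_create(&b, NULL, fault_in, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("slot = %d\n", cache_slot ? *cache_slot : -1);
	return 0;
}

The point of the pattern is that the expensive step never runs under the lock, at the cost of occasionally throwing the result away.
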
2146 2144
2147 /* 2145 /*
2148 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2146 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2149 * but allow concurrent faults), and pte mapped but not yet locked. 2147 * but allow concurrent faults), and pte mapped but not yet locked.
2150 * We return with mmap_sem still held, but pte unmapped and unlocked. 2148 * We return with mmap_sem still held, but pte unmapped and unlocked.
2151 */ 2149 */
2152 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 2150 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2153 unsigned long address, pte_t *page_table, pmd_t *pmd, 2151 unsigned long address, pte_t *page_table, pmd_t *pmd,
2154 int write_access) 2152 int write_access)
2155 { 2153 {
2156 struct page *page; 2154 struct page *page;
2157 spinlock_t *ptl; 2155 spinlock_t *ptl;
2158 pte_t entry; 2156 pte_t entry;
2159 2157
2160 /* Allocate our own private page. */ 2158 /* Allocate our own private page. */
2161 pte_unmap(page_table); 2159 pte_unmap(page_table);
2162 2160
2163 if (unlikely(anon_vma_prepare(vma))) 2161 if (unlikely(anon_vma_prepare(vma)))
2164 goto oom; 2162 goto oom;
2165 page = alloc_zeroed_user_highpage_movable(vma, address); 2163 page = alloc_zeroed_user_highpage_movable(vma, address);
2166 if (!page) 2164 if (!page)
2167 goto oom; 2165 goto oom;
2168 2166
2169 entry = mk_pte(page, vma->vm_page_prot); 2167 entry = mk_pte(page, vma->vm_page_prot);
2170 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2168 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2171 2169
2172 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2170 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2173 if (!pte_none(*page_table)) 2171 if (!pte_none(*page_table))
2174 goto release; 2172 goto release;
2175 inc_mm_counter(mm, anon_rss); 2173 inc_mm_counter(mm, anon_rss);
2176 lru_cache_add_active(page); 2174 lru_cache_add_active(page);
2177 page_add_new_anon_rmap(page, vma, address); 2175 page_add_new_anon_rmap(page, vma, address);
2178 set_pte_at(mm, address, page_table, entry); 2176 set_pte_at(mm, address, page_table, entry);
2179 2177
2180 /* No need to invalidate - it was non-present before */ 2178 /* No need to invalidate - it was non-present before */
2181 update_mmu_cache(vma, address, entry); 2179 update_mmu_cache(vma, address, entry);
2182 unlock: 2180 unlock:
2183 pte_unmap_unlock(page_table, ptl); 2181 pte_unmap_unlock(page_table, ptl);
2184 return 0; 2182 return 0;
2185 release: 2183 release:
2186 page_cache_release(page); 2184 page_cache_release(page);
2187 goto unlock; 2185 goto unlock;
2188 oom: 2186 oom:
2189 return VM_FAULT_OOM; 2187 return VM_FAULT_OOM;
2190 } 2188 }
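
do_anonymous_page() above hands out a freshly zeroed private page the first time an anonymous address is touched. As a hedged, userspace-only demonstration (not part of this patch), the program below maps an anonymous region and shows that the page materialises, zero-filled, only when it is first accessed.

/* Hedged demo: the first touch of an anonymous mapping takes the
 * do_anonymous_page() path and yields a zero-filled private page. */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	printf("first byte before touching: %d\n", p[0]);	/* read fault */
	p[0] = 'x';						/* write fault */
	printf("first byte after write:     %c\n", p[0]);

	munmap(p, len);
	return 0;
}
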
2191 2189
2192 /* 2190 /*
2193 * __do_fault() tries to create a new page mapping. It aggressively 2191 * __do_fault() tries to create a new page mapping. It aggressively
2194 * tries to share with existing pages, but makes a separate copy if 2192 * tries to share with existing pages, but makes a separate copy if
2195 * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid 2193 * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
2196 * the next page fault. 2194 * the next page fault.
2197 * 2195 *
2198 * As this is called only for pages that do not currently exist, we 2196 * As this is called only for pages that do not currently exist, we
2199 * do not need to flush old virtual caches or the TLB. 2197 * do not need to flush old virtual caches or the TLB.
2200 * 2198 *
2201 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2199 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2202 * but allow concurrent faults), and pte neither mapped nor locked. 2200 * but allow concurrent faults), and pte neither mapped nor locked.
2203 * We return with mmap_sem still held, but pte unmapped and unlocked. 2201 * We return with mmap_sem still held, but pte unmapped and unlocked.
2204 */ 2202 */
2205 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2203 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2206 unsigned long address, pmd_t *pmd, 2204 unsigned long address, pmd_t *pmd,
2207 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 2205 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
2208 { 2206 {
2209 pte_t *page_table; 2207 pte_t *page_table;
2210 spinlock_t *ptl; 2208 spinlock_t *ptl;
2211 struct page *page; 2209 struct page *page;
2212 pte_t entry; 2210 pte_t entry;
2213 int anon = 0; 2211 int anon = 0;
2214 struct page *dirty_page = NULL; 2212 struct page *dirty_page = NULL;
2215 struct vm_fault vmf; 2213 struct vm_fault vmf;
2216 int ret; 2214 int ret;
2217 int page_mkwrite = 0; 2215 int page_mkwrite = 0;
2218 2216
2219 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2217 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
2220 vmf.pgoff = pgoff; 2218 vmf.pgoff = pgoff;
2221 vmf.flags = flags; 2219 vmf.flags = flags;
2222 vmf.page = NULL; 2220 vmf.page = NULL;
2223 2221
2224 BUG_ON(vma->vm_flags & VM_PFNMAP); 2222 BUG_ON(vma->vm_flags & VM_PFNMAP);
2225 2223
2226 if (likely(vma->vm_ops->fault)) { 2224 if (likely(vma->vm_ops->fault)) {
2227 ret = vma->vm_ops->fault(vma, &vmf); 2225 ret = vma->vm_ops->fault(vma, &vmf);
2228 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2226 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2229 return ret; 2227 return ret;
2230 } else { 2228 } else {
2231 /* Legacy ->nopage path */ 2229 /* Legacy ->nopage path */
2232 ret = 0; 2230 ret = 0;
2233 vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); 2231 vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
2234 /* no page was available -- either SIGBUS or OOM */ 2232 /* no page was available -- either SIGBUS or OOM */
2235 if (unlikely(vmf.page == NOPAGE_SIGBUS)) 2233 if (unlikely(vmf.page == NOPAGE_SIGBUS))
2236 return VM_FAULT_SIGBUS; 2234 return VM_FAULT_SIGBUS;
2237 else if (unlikely(vmf.page == NOPAGE_OOM)) 2235 else if (unlikely(vmf.page == NOPAGE_OOM))
2238 return VM_FAULT_OOM; 2236 return VM_FAULT_OOM;
2239 } 2237 }
2240 2238
2241 /* 2239 /*
2242 * For consistency in subsequent calls, make the faulted page always 2240 * For consistency in subsequent calls, make the faulted page always
2243 * locked. 2241 * locked.
2244 */ 2242 */
2245 if (unlikely(!(ret & VM_FAULT_LOCKED))) 2243 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2246 lock_page(vmf.page); 2244 lock_page(vmf.page);
2247 else 2245 else
2248 VM_BUG_ON(!PageLocked(vmf.page)); 2246 VM_BUG_ON(!PageLocked(vmf.page));
2249 2247
2250 /* 2248 /*
2251 * Should we do an early C-O-W break? 2249 * Should we do an early C-O-W break?
2252 */ 2250 */
2253 page = vmf.page; 2251 page = vmf.page;
2254 if (flags & FAULT_FLAG_WRITE) { 2252 if (flags & FAULT_FLAG_WRITE) {
2255 if (!(vma->vm_flags & VM_SHARED)) { 2253 if (!(vma->vm_flags & VM_SHARED)) {
2256 anon = 1; 2254 anon = 1;
2257 if (unlikely(anon_vma_prepare(vma))) { 2255 if (unlikely(anon_vma_prepare(vma))) {
2258 ret = VM_FAULT_OOM; 2256 ret = VM_FAULT_OOM;
2259 goto out; 2257 goto out;
2260 } 2258 }
2261 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 2259 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
2262 vma, address); 2260 vma, address);
2263 if (!page) { 2261 if (!page) {
2264 ret = VM_FAULT_OOM; 2262 ret = VM_FAULT_OOM;
2265 goto out; 2263 goto out;
2266 } 2264 }
2267 copy_user_highpage(page, vmf.page, address, vma); 2265 copy_user_highpage(page, vmf.page, address, vma);
2268 } else { 2266 } else {
2269 /* 2267 /*
2270 * If the page will be shareable, see if the backing 2268 * If the page will be shareable, see if the backing
2271 * address space wants to know that the page is about 2269 * address space wants to know that the page is about
2272 * to become writable 2270 * to become writable
2273 */ 2271 */
2274 if (vma->vm_ops->page_mkwrite) { 2272 if (vma->vm_ops->page_mkwrite) {
2275 unlock_page(page); 2273 unlock_page(page);
2276 if (vma->vm_ops->page_mkwrite(vma, page) < 0) { 2274 if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
2277 ret = VM_FAULT_SIGBUS; 2275 ret = VM_FAULT_SIGBUS;
2278 anon = 1; /* no anon but release vmf.page */ 2276 anon = 1; /* no anon but release vmf.page */
2279 goto out_unlocked; 2277 goto out_unlocked;
2280 } 2278 }
2281 lock_page(page); 2279 lock_page(page);
2282 /* 2280 /*
2283 * XXX: this is not quite right (racy vs 2281 * XXX: this is not quite right (racy vs
2284 * invalidate) to unlock and relock the page 2282 * invalidate) to unlock and relock the page
2285 * like this, however a better fix requires 2283 * like this, however a better fix requires
2286 * reworking page_mkwrite locking API, which 2284 * reworking page_mkwrite locking API, which
2287 * is better done later. 2285 * is better done later.
2288 */ 2286 */
2289 if (!page->mapping) { 2287 if (!page->mapping) {
2290 ret = 0; 2288 ret = 0;
2291 anon = 1; /* no anon but release vmf.page */ 2289 anon = 1; /* no anon but release vmf.page */
2292 goto out; 2290 goto out;
2293 } 2291 }
2294 page_mkwrite = 1; 2292 page_mkwrite = 1;
2295 } 2293 }
2296 } 2294 }
2297 2295
2298 } 2296 }
2299 2297
2300 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2298 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2301 2299
2302 /* 2300 /*
2303 * This silly early PAGE_DIRTY setting removes a race 2301 * This silly early PAGE_DIRTY setting removes a race
2304 * due to the bad i386 page protection. But it's valid 2302 * due to the bad i386 page protection. But it's valid
2305 * for other architectures too. 2303 * for other architectures too.
2306 * 2304 *
2307 * Note that if write_access is true, we either now have 2305 * Note that if write_access is true, we either now have
2308 * an exclusive copy of the page, or this is a shared mapping, 2306 * an exclusive copy of the page, or this is a shared mapping,
2309 * so we can make it writable and dirty to avoid having to 2307 * so we can make it writable and dirty to avoid having to
2310 * handle that later. 2308 * handle that later.
2311 */ 2309 */
2312 /* Only go through if we didn't race with anybody else... */ 2310 /* Only go through if we didn't race with anybody else... */
2313 if (likely(pte_same(*page_table, orig_pte))) { 2311 if (likely(pte_same(*page_table, orig_pte))) {
2314 flush_icache_page(vma, page); 2312 flush_icache_page(vma, page);
2315 entry = mk_pte(page, vma->vm_page_prot); 2313 entry = mk_pte(page, vma->vm_page_prot);
2316 if (flags & FAULT_FLAG_WRITE) 2314 if (flags & FAULT_FLAG_WRITE)
2317 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2315 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2318 set_pte_at(mm, address, page_table, entry); 2316 set_pte_at(mm, address, page_table, entry);
2319 if (anon) { 2317 if (anon) {
2320 inc_mm_counter(mm, anon_rss); 2318 inc_mm_counter(mm, anon_rss);
2321 lru_cache_add_active(page); 2319 lru_cache_add_active(page);
2322 page_add_new_anon_rmap(page, vma, address); 2320 page_add_new_anon_rmap(page, vma, address);
2323 } else { 2321 } else {
2324 inc_mm_counter(mm, file_rss); 2322 inc_mm_counter(mm, file_rss);
2325 page_add_file_rmap(page); 2323 page_add_file_rmap(page);
2326 if (flags & FAULT_FLAG_WRITE) { 2324 if (flags & FAULT_FLAG_WRITE) {
2327 dirty_page = page; 2325 dirty_page = page;
2328 get_page(dirty_page); 2326 get_page(dirty_page);
2329 } 2327 }
2330 } 2328 }
2331 2329
2332 /* no need to invalidate: a not-present page won't be cached */ 2330 /* no need to invalidate: a not-present page won't be cached */
2333 update_mmu_cache(vma, address, entry); 2331 update_mmu_cache(vma, address, entry);
2334 } else { 2332 } else {
2335 if (anon) 2333 if (anon)
2336 page_cache_release(page); 2334 page_cache_release(page);
2337 else 2335 else
2338 anon = 1; /* no anon but release faulted_page */ 2336 anon = 1; /* no anon but release faulted_page */
2339 } 2337 }
2340 2338
2341 pte_unmap_unlock(page_table, ptl); 2339 pte_unmap_unlock(page_table, ptl);
2342 2340
2343 out: 2341 out:
2344 unlock_page(vmf.page); 2342 unlock_page(vmf.page);
2345 out_unlocked: 2343 out_unlocked:
2346 if (anon) 2344 if (anon)
2347 page_cache_release(vmf.page); 2345 page_cache_release(vmf.page);
2348 else if (dirty_page) { 2346 else if (dirty_page) {
2349 if (vma->vm_file) 2347 if (vma->vm_file)
2350 file_update_time(vma->vm_file); 2348 file_update_time(vma->vm_file);
2351 2349
2352 set_page_dirty_balance(dirty_page, page_mkwrite); 2350 set_page_dirty_balance(dirty_page, page_mkwrite);
2353 put_page(dirty_page); 2351 put_page(dirty_page);
2354 } 2352 }
2355 2353
2356 return ret; 2354 return ret;
2357 } 2355 }
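
The FAULT_FLAG_WRITE branch above performs the early copy-on-write break: for a private mapping it allocates its own page and copies the faulted one into it, so the file-backed original is never dirtied. A hedged userspace illustration of the same visible behaviour, independent of the kernel internals:

/* Hedged illustration of the early COW break in __do_fault(): a write through
 * a MAP_PRIVATE file mapping lands in a private copy, never in the file. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	char tmpl[] = "/tmp/cow-demo-XXXXXX";
	int fd = mkstemp(tmpl);
	char buf[8] = {0};
	char *map;

	if (fd < 0 || write(fd, "original", 8) != 8) {
		perror("setup");
		return 1;
	}

	map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memcpy(map, "modified", 8);		/* write fault: private copy is made */

	pread(fd, buf, 8, 0);
	printf("mapping sees: %.8s, file still holds: %.8s\n", map, buf);

	munmap(map, 4096);
	close(fd);
	unlink(tmpl);
	return 0;
}
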
2358 2356
2359 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2357 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2360 unsigned long address, pte_t *page_table, pmd_t *pmd, 2358 unsigned long address, pte_t *page_table, pmd_t *pmd,
2361 int write_access, pte_t orig_pte) 2359 int write_access, pte_t orig_pte)
2362 { 2360 {
2363 pgoff_t pgoff = (((address & PAGE_MASK) 2361 pgoff_t pgoff = (((address & PAGE_MASK)
2364 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2362 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2365 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); 2363 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
2366 2364
2367 pte_unmap(page_table); 2365 pte_unmap(page_table);
2368 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2366 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2369 } 2367 }
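
do_linear_fault() only has to translate the faulting address into a file page offset before handing off to __do_fault(); the arithmetic is pgoff = (((address & PAGE_MASK) - vm_start) >> PAGE_SHIFT) + vm_pgoff. A standalone sketch of that calculation, with made-up addresses:

/* Hedged sketch of the pgoff arithmetic in do_linear_fault().
 * All values are made up; only the formula matches the kernel code. */
#include <stdio.h>

#define PAGE_SHIFT	12UL
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x7f0000000000UL;	/* mapping starts here */
	unsigned long vm_pgoff = 16;			/* mapping begins at file page 16 */
	unsigned long address  = 0x7f0000003abcUL;	/* faulting address */

	unsigned long pgoff = (((address & PAGE_MASK) - vm_start) >> PAGE_SHIFT)
			      + vm_pgoff;

	printf("fault at %#lx -> file page %lu (file offset %#lx)\n",
	       address, pgoff, pgoff << PAGE_SHIFT);
	return 0;
}
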
2370 2368
2371 2369
2372 /* 2370 /*
2373 * do_no_pfn() tries to create a new page mapping for a page without 2371 * do_no_pfn() tries to create a new page mapping for a page without
2374 * a struct page backing it 2372 * a struct page backing it
2375 * 2373 *
2376 * As this is called only for pages that do not currently exist, we 2374 * As this is called only for pages that do not currently exist, we
2377 * do not need to flush old virtual caches or the TLB. 2375 * do not need to flush old virtual caches or the TLB.
2378 * 2376 *
2379 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2377 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2380 * but allow concurrent faults), and pte mapped but not yet locked. 2378 * but allow concurrent faults), and pte mapped but not yet locked.
2381 * We return with mmap_sem still held, but pte unmapped and unlocked. 2379 * We return with mmap_sem still held, but pte unmapped and unlocked.
2382 * 2380 *
2383 * It is expected that the ->nopfn handler always returns the same pfn 2381 * It is expected that the ->nopfn handler always returns the same pfn
2384 * for a given virtual mapping. 2382 * for a given virtual mapping.
2385 * 2383 *
2386 * Mark this `noinline' to prevent it from bloating the main pagefault code. 2384 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2387 */ 2385 */
2388 static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, 2386 static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2389 unsigned long address, pte_t *page_table, pmd_t *pmd, 2387 unsigned long address, pte_t *page_table, pmd_t *pmd,
2390 int write_access) 2388 int write_access)
2391 { 2389 {
2392 spinlock_t *ptl; 2390 spinlock_t *ptl;
2393 pte_t entry; 2391 pte_t entry;
2394 unsigned long pfn; 2392 unsigned long pfn;
2395 2393
2396 pte_unmap(page_table); 2394 pte_unmap(page_table);
2397 BUG_ON(!(vma->vm_flags & VM_PFNMAP)); 2395 BUG_ON(!(vma->vm_flags & VM_PFNMAP));
2398 BUG_ON(is_cow_mapping(vma->vm_flags)); 2396 BUG_ON(is_cow_mapping(vma->vm_flags));
2399 2397
2400 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); 2398 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2401 if (unlikely(pfn == NOPFN_OOM)) 2399 if (unlikely(pfn == NOPFN_OOM))
2402 return VM_FAULT_OOM; 2400 return VM_FAULT_OOM;
2403 else if (unlikely(pfn == NOPFN_SIGBUS)) 2401 else if (unlikely(pfn == NOPFN_SIGBUS))
2404 return VM_FAULT_SIGBUS; 2402 return VM_FAULT_SIGBUS;
2405 else if (unlikely(pfn == NOPFN_REFAULT)) 2403 else if (unlikely(pfn == NOPFN_REFAULT))
2406 return 0; 2404 return 0;
2407 2405
2408 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2406 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2409 2407
2410 /* Only go through if we didn't race with anybody else... */ 2408 /* Only go through if we didn't race with anybody else... */
2411 if (pte_none(*page_table)) { 2409 if (pte_none(*page_table)) {
2412 entry = pfn_pte(pfn, vma->vm_page_prot); 2410 entry = pfn_pte(pfn, vma->vm_page_prot);
2413 if (write_access) 2411 if (write_access)
2414 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2412 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2415 set_pte_at(mm, address, page_table, entry); 2413 set_pte_at(mm, address, page_table, entry);
2416 } 2414 }
2417 pte_unmap_unlock(page_table, ptl); 2415 pte_unmap_unlock(page_table, ptl);
2418 return 0; 2416 return 0;
2419 } 2417 }
2420 2418
2421 /* 2419 /*
2422 * Fault of a previously existing named mapping. Repopulate the pte 2420 * Fault of a previously existing named mapping. Repopulate the pte
2423 * from the encoded file_pte if possible. This enables swappable 2421 * from the encoded file_pte if possible. This enables swappable
2424 * nonlinear vmas. 2422 * nonlinear vmas.
2425 * 2423 *
2426 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2424 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2427 * but allow concurrent faults), and pte mapped but not yet locked. 2425 * but allow concurrent faults), and pte mapped but not yet locked.
2428 * We return with mmap_sem still held, but pte unmapped and unlocked. 2426 * We return with mmap_sem still held, but pte unmapped and unlocked.
2429 */ 2427 */
2430 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2428 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2431 unsigned long address, pte_t *page_table, pmd_t *pmd, 2429 unsigned long address, pte_t *page_table, pmd_t *pmd,
2432 int write_access, pte_t orig_pte) 2430 int write_access, pte_t orig_pte)
2433 { 2431 {
2434 unsigned int flags = FAULT_FLAG_NONLINEAR | 2432 unsigned int flags = FAULT_FLAG_NONLINEAR |
2435 (write_access ? FAULT_FLAG_WRITE : 0); 2433 (write_access ? FAULT_FLAG_WRITE : 0);
2436 pgoff_t pgoff; 2434 pgoff_t pgoff;
2437 2435
2438 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2436 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2439 return 0; 2437 return 0;
2440 2438
2441 if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || 2439 if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
2442 !(vma->vm_flags & VM_CAN_NONLINEAR))) { 2440 !(vma->vm_flags & VM_CAN_NONLINEAR))) {
2443 /* 2441 /*
2444 * Page table corrupted: show pte and kill process. 2442 * Page table corrupted: show pte and kill process.
2445 */ 2443 */
2446 print_bad_pte(vma, orig_pte, address); 2444 print_bad_pte(vma, orig_pte, address);
2447 return VM_FAULT_OOM; 2445 return VM_FAULT_OOM;
2448 } 2446 }
2449 2447
2450 pgoff = pte_to_pgoff(orig_pte); 2448 pgoff = pte_to_pgoff(orig_pte);
2451 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2449 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2452 } 2450 }
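
Nonlinear vmas are created by remap_file_pages(), which scatters file pages within a mapping and leaves the file offset encoded in the pte itself; do_nonlinear_fault() above decodes it with pte_to_pgoff(). A hedged userspace sketch of setting one up (remap_file_pages() matches this kernel era; it is deprecated, though still emulated, on modern kernels):

/* Hedged sketch: rewire the first virtual page of a MAP_SHARED mapping to
 * back onto file page 1, creating a nonlinear mapping. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char tmpl[] = "/tmp/nonlinear-XXXXXX";
	int fd = mkstemp(tmpl);
	char *map;

	if (fd < 0 || ftruncate(fd, 2 * page) < 0) {
		perror("setup");
		return 1;
	}
	pwrite(fd, "page0", 5, 0);
	pwrite(fd, "page1", 5, page);

	map = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Point the first virtual page at file page 1 instead of file page 0. */
	if (remap_file_pages(map, page, 0, 1, 0) < 0)
		perror("remap_file_pages");

	printf("first virtual page now reads: %.5s\n", map);

	munmap(map, 2 * page);
	close(fd);
	unlink(tmpl);
	return 0;
}
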
2453 2451
2454 /* 2452 /*
2455 * These routines also need to handle stuff like marking pages dirty 2453 * These routines also need to handle stuff like marking pages dirty
2456 * and/or accessed for architectures that don't do it in hardware (most 2454 * and/or accessed for architectures that don't do it in hardware (most
2457 * RISC architectures). The early dirtying is also good on the i386. 2455 * RISC architectures). The early dirtying is also good on the i386.
2458 * 2456 *
2459 * There is also a hook called "update_mmu_cache()" that architectures 2457 * There is also a hook called "update_mmu_cache()" that architectures
2460 * with external mmu caches can use to update those (ie the Sparc or 2458 * with external mmu caches can use to update those (ie the Sparc or
2461 * PowerPC hashed page tables that act as extended TLBs). 2459 * PowerPC hashed page tables that act as extended TLBs).
2462 * 2460 *
2463 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2461 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2464 * but allow concurrent faults), and pte mapped but not yet locked. 2462 * but allow concurrent faults), and pte mapped but not yet locked.
2465 * We return with mmap_sem still held, but pte unmapped and unlocked. 2463 * We return with mmap_sem still held, but pte unmapped and unlocked.
2466 */ 2464 */
2467 static inline int handle_pte_fault(struct mm_struct *mm, 2465 static inline int handle_pte_fault(struct mm_struct *mm,
2468 struct vm_area_struct *vma, unsigned long address, 2466 struct vm_area_struct *vma, unsigned long address,
2469 pte_t *pte, pmd_t *pmd, int write_access) 2467 pte_t *pte, pmd_t *pmd, int write_access)
2470 { 2468 {
2471 pte_t entry; 2469 pte_t entry;
2472 spinlock_t *ptl; 2470 spinlock_t *ptl;
2473 2471
2474 entry = *pte; 2472 entry = *pte;
2475 if (!pte_present(entry)) { 2473 if (!pte_present(entry)) {
2476 if (pte_none(entry)) { 2474 if (pte_none(entry)) {
2477 if (vma->vm_ops) { 2475 if (vma->vm_ops) {
2478 if (vma->vm_ops->fault || vma->vm_ops->nopage) 2476 if (vma->vm_ops->fault || vma->vm_ops->nopage)
2479 return do_linear_fault(mm, vma, address, 2477 return do_linear_fault(mm, vma, address,
2480 pte, pmd, write_access, entry); 2478 pte, pmd, write_access, entry);
2481 if (unlikely(vma->vm_ops->nopfn)) 2479 if (unlikely(vma->vm_ops->nopfn))
2482 return do_no_pfn(mm, vma, address, pte, 2480 return do_no_pfn(mm, vma, address, pte,
2483 pmd, write_access); 2481 pmd, write_access);
2484 } 2482 }
2485 return do_anonymous_page(mm, vma, address, 2483 return do_anonymous_page(mm, vma, address,
2486 pte, pmd, write_access); 2484 pte, pmd, write_access);
2487 } 2485 }
2488 if (pte_file(entry)) 2486 if (pte_file(entry))
2489 return do_nonlinear_fault(mm, vma, address, 2487 return do_nonlinear_fault(mm, vma, address,
2490 pte, pmd, write_access, entry); 2488 pte, pmd, write_access, entry);
2491 return do_swap_page(mm, vma, address, 2489 return do_swap_page(mm, vma, address,
2492 pte, pmd, write_access, entry); 2490 pte, pmd, write_access, entry);
2493 } 2491 }
2494 2492
2495 ptl = pte_lockptr(mm, pmd); 2493 ptl = pte_lockptr(mm, pmd);
2496 spin_lock(ptl); 2494 spin_lock(ptl);
2497 if (unlikely(!pte_same(*pte, entry))) 2495 if (unlikely(!pte_same(*pte, entry)))
2498 goto unlock; 2496 goto unlock;
2499 if (write_access) { 2497 if (write_access) {
2500 if (!pte_write(entry)) 2498 if (!pte_write(entry))
2501 return do_wp_page(mm, vma, address, 2499 return do_wp_page(mm, vma, address,
2502 pte, pmd, ptl, entry); 2500 pte, pmd, ptl, entry);
2503 entry = pte_mkdirty(entry); 2501 entry = pte_mkdirty(entry);
2504 } 2502 }
2505 entry = pte_mkyoung(entry); 2503 entry = pte_mkyoung(entry);
2506 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { 2504 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
2507 update_mmu_cache(vma, address, entry); 2505 update_mmu_cache(vma, address, entry);
2508 } else { 2506 } else {
2509 /* 2507 /*
2510 * This is needed only for protection faults but the arch code 2508 * This is needed only for protection faults but the arch code
2511 * is not yet telling us if this is a protection fault or not. 2509 * is not yet telling us if this is a protection fault or not.
2512 * This still avoids useless tlb flushes for .text page faults 2510 * This still avoids useless tlb flushes for .text page faults
2513 * with threads. 2511 * with threads.
2514 */ 2512 */
2515 if (write_access) 2513 if (write_access)
2516 flush_tlb_page(vma, address); 2514 flush_tlb_page(vma, address);
2517 } 2515 }
2518 unlock: 2516 unlock:
2519 pte_unmap_unlock(pte, ptl); 2517 pte_unmap_unlock(pte, ptl);
2520 return 0; 2518 return 0;
2521 } 2519 }
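
handle_pte_fault() is the dispatcher: an empty pte goes to do_linear_fault() if the vma has a ->fault/->nopage method, to do_no_pfn() for ->nopfn, and to do_anonymous_page() otherwise; a non-present file pte goes to do_nonlinear_fault() and any other non-present pte to do_swap_page(); present ptes are handled inline (write-protect break or dirty/young update). A hedged restatement of that decision tree as a standalone C function, with every type and flag invented for illustration:

/* Hedged restatement of the handle_pte_fault() dispatch; every type and
 * name below is invented, only the decision order matches the kernel code. */
#include <stdio.h>

enum fault_path { LINEAR, NO_PFN, ANONYMOUS, NONLINEAR, SWAP, INLINE_PRESENT };

struct pte_state {
	int present;		/* pte_present()			*/
	int none;		/* pte_none()				*/
	int file;		/* pte_file()				*/
	int vma_has_fault_op;	/* vma->vm_ops->fault or ->nopage	*/
	int vma_has_nopfn_op;	/* vma->vm_ops->nopfn			*/
};

static enum fault_path dispatch(const struct pte_state *s)
{
	if (!s->present) {
		if (s->none) {
			if (s->vma_has_fault_op)
				return LINEAR;
			if (s->vma_has_nopfn_op)
				return NO_PFN;
			return ANONYMOUS;
		}
		if (s->file)
			return NONLINEAR;
		return SWAP;
	}
	return INLINE_PRESENT;	/* do_wp_page() or dirty/young update */
}

int main(void)
{
	struct pte_state anon_touch = { .present = 0, .none = 1 };
	struct pte_state swapped = { .present = 0, .none = 0 };

	printf("first touch of an anon page -> path %d\n", (int)dispatch(&anon_touch));
	printf("swapped-out pte             -> path %d\n", (int)dispatch(&swapped));
	return 0;
}
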
2522 2520
2523 /* 2521 /*
2524 * By the time we get here, we already hold the mm semaphore 2522 * By the time we get here, we already hold the mm semaphore
2525 */ 2523 */
2526 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2524 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2527 unsigned long address, int write_access) 2525 unsigned long address, int write_access)
2528 { 2526 {
2529 pgd_t *pgd; 2527 pgd_t *pgd;
2530 pud_t *pud; 2528 pud_t *pud;
2531 pmd_t *pmd; 2529 pmd_t *pmd;
2532 pte_t *pte; 2530 pte_t *pte;
2533 2531
2534 __set_current_state(TASK_RUNNING); 2532 __set_current_state(TASK_RUNNING);
2535 2533
2536 count_vm_event(PGFAULT); 2534 count_vm_event(PGFAULT);
2537 2535
2538 if (unlikely(is_vm_hugetlb_page(vma))) 2536 if (unlikely(is_vm_hugetlb_page(vma)))
2539 return hugetlb_fault(mm, vma, address, write_access); 2537 return hugetlb_fault(mm, vma, address, write_access);
2540 2538
2541 pgd = pgd_offset(mm, address); 2539 pgd = pgd_offset(mm, address);
2542 pud = pud_alloc(mm, pgd, address); 2540 pud = pud_alloc(mm, pgd, address);
2543 if (!pud) 2541 if (!pud)
2544 return VM_FAULT_OOM; 2542 return VM_FAULT_OOM;
2545 pmd = pmd_alloc(mm, pud, address); 2543 pmd = pmd_alloc(mm, pud, address);
2546 if (!pmd) 2544 if (!pmd)
2547 return VM_FAULT_OOM; 2545 return VM_FAULT_OOM;
2548 pte = pte_alloc_map(mm, pmd, address); 2546 pte = pte_alloc_map(mm, pmd, address);
2549 if (!pte) 2547 if (!pte)
2550 return VM_FAULT_OOM; 2548 return VM_FAULT_OOM;
2551 2549
2552 return handle_pte_fault(mm, vma, address, pte, pmd, write_access); 2550 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2553 } 2551 }
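
handle_mm_fault() walks pgd -> pud -> pmd -> pte, allocating each missing level on the way down and returning VM_FAULT_OOM if any allocation fails, before delegating to handle_pte_fault(). A hedged userspace sketch of the same allocate-on-descend walk over an invented three-level table:

/* Hedged sketch of the allocate-on-descend walk in handle_mm_fault(),
 * over an invented three-level table keyed by 9-bit indices. */
#include <stdio.h>
#include <stdlib.h>

#define LEVEL_BITS	9
#define LEVEL_SIZE	(1UL << LEVEL_BITS)
#define LEVEL_MASK	(LEVEL_SIZE - 1)

struct level { void *slot[LEVEL_SIZE]; };

static void *descend(struct level *top, unsigned long addr)
{
	unsigned long i0 = (addr >> (2 * LEVEL_BITS)) & LEVEL_MASK;
	unsigned long i1 = (addr >> LEVEL_BITS) & LEVEL_MASK;
	unsigned long i2 = addr & LEVEL_MASK;
	struct level *mid, *leaf;

	if (!top->slot[i0])			/* like pud_alloc() */
		top->slot[i0] = calloc(1, sizeof(struct level));
	mid = top->slot[i0];
	if (!mid)
		return NULL;			/* VM_FAULT_OOM analogue */

	if (!mid->slot[i1])			/* like pmd_alloc() */
		mid->slot[i1] = calloc(1, sizeof(struct level));
	leaf = mid->slot[i1];
	if (!leaf)
		return NULL;

	return &leaf->slot[i2];			/* like pte_alloc_map() */
}

int main(void)
{
	struct level top = { { 0 } };
	void **pte = descend(&top, 0x123456UL);

	printf("leaf slot for 0x123456 is at %p\n", (void *)pte);
	return 0;
}
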
2554 2552
2555 #ifndef __PAGETABLE_PUD_FOLDED 2553 #ifndef __PAGETABLE_PUD_FOLDED
2556 /* 2554 /*
2557 * Allocate page upper directory. 2555 * Allocate page upper directory.
2558 * We've already handled the fast-path in-line. 2556 * We've already handled the fast-path in-line.
2559 */ 2557 */
2560 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 2558 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2561 { 2559 {
2562 pud_t *new = pud_alloc_one(mm, address); 2560 pud_t *new = pud_alloc_one(mm, address);
2563 if (!new) 2561 if (!new)
2564 return -ENOMEM; 2562 return -ENOMEM;
2565 2563
2566 spin_lock(&mm->page_table_lock); 2564 spin_lock(&mm->page_table_lock);
2567 if (pgd_present(*pgd)) /* Another has populated it */ 2565 if (pgd_present(*pgd)) /* Another has populated it */
2568 pud_free(new); 2566 pud_free(new);
2569 else 2567 else
2570 pgd_populate(mm, pgd, new); 2568 pgd_populate(mm, pgd, new);
2571 spin_unlock(&mm->page_table_lock); 2569 spin_unlock(&mm->page_table_lock);
2572 return 0; 2570 return 0;
2573 } 2571 }
2574 #endif /* __PAGETABLE_PUD_FOLDED */ 2572 #endif /* __PAGETABLE_PUD_FOLDED */
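
__pud_alloc() above, and __pmd_alloc() below, both follow the same idiom: allocate the new table before taking page_table_lock, then check under the lock whether another thread populated the slot first, and free the spare if so. A hedged pthread sketch of that idiom with invented names:

/* Hedged sketch of the __pud_alloc() idiom: allocate without the lock,
 * install under the lock only if still empty, otherwise free the spare. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *shared_entry;			/* the pgd slot analogue */

static void *populate_once(void)
{
	void *new = calloc(1, 64);		/* pud_alloc_one() analogue */

	if (!new)
		return NULL;			/* -ENOMEM analogue */

	pthread_mutex_lock(&table_lock);
	if (shared_entry)			/* "another has populated it" */
		free(new);
	else
		shared_entry = new;		/* pgd_populate() analogue */
	pthread_mutex_unlock(&table_lock);

	return shared_entry;
}

static void *worker(void *arg)
{
	return populate_once();
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker, NULL);
	pthread_create(&t2, NULL, worker, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("entry installed exactly once: %p\n", shared_entry);
	return 0;
}

Allocating outside the lock keeps page_table_lock hold times short; the cost of the occasional wasted allocation is accepted in exchange.
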
2575 2573
2576 #ifndef __PAGETABLE_PMD_FOLDED 2574 #ifndef __PAGETABLE_PMD_FOLDED
2577 /* 2575 /*
2578 * Allocate page middle directory. 2576 * Allocate page middle directory.
2579 * We've already handled the fast-path in-line. 2577 * We've already handled the fast-path in-line.
2580 */ 2578 */
2581 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 2579 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2582 { 2580 {
2583 pmd_t *new = pmd_alloc_one(mm, address); 2581 pmd_t *new = pmd_alloc_one(mm, address);
2584 if (!new) 2582 if (!new)
2585 return -ENOMEM; 2583 return -ENOMEM;
2586 2584
2587 spin_lock(&mm->page_table_lock); 2585 spin_lock(&mm->page_table_lock);
2588 #ifndef __ARCH_HAS_4LEVEL_HACK 2586 #ifndef __ARCH_HAS_4LEVEL_HACK
2589 if (pud_present(*pud)) /* Another has populated it */ 2587 if (pud_present(*pud)) /* Another has populated it */
2590 pmd_free(new); 2588 pmd_free(new);
2591 else 2589 else
2592 pud_populate(mm, pud, new); 2590 pud_populate(mm, pud, new);
2593 #else 2591 #else
2594 if (pgd_present(*pud)) /* Another has populated it */ 2592 if (pgd_present(*pud)) /* Another has populated it */
2595 pmd_free(new); 2593 pmd_free(new);
2596 else 2594 else
2597 pgd_populate(mm, pud, new); 2595 pgd_populate(mm, pud, new);
2598 #endif /* __ARCH_HAS_4LEVEL_HACK */ 2596 #endif /* __ARCH_HAS_4LEVEL_HACK */
2599 spin_unlock(&mm->page_table_lock); 2597 spin_unlock(&mm->page_table_lock);
2600 return 0; 2598 return 0;
2601 } 2599 }
2602 #endif /* __PAGETABLE_PMD_FOLDED */ 2600 #endif /* __PAGETABLE_PMD_FOLDED */
2603 2601
2604 int make_pages_present(unsigned long addr, unsigned long end) 2602 int make_pages_present(unsigned long addr, unsigned long end)
2605 { 2603 {
2606 int ret, len, write; 2604 int ret, len, write;
2607 struct vm_area_struct * vma; 2605 struct vm_area_struct * vma;
2608 2606
2609 vma = find_vma(current->mm, addr); 2607 vma = find_vma(current->mm, addr);
2610 if (!vma) 2608 if (!vma)
2611 return -1; 2609 return -1;
2612 write = (vma->vm_flags & VM_WRITE) != 0; 2610 write = (vma->vm_flags & VM_WRITE) != 0;
2613 BUG_ON(addr >= end); 2611 BUG_ON(addr >= end);
2614 BUG_ON(end > vma->vm_end); 2612 BUG_ON(end > vma->vm_end);
2615 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 2613 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2616 ret = get_user_pages(current, current->mm, addr, 2614 ret = get_user_pages(current, current->mm, addr,
2617 len, write, 0, NULL, NULL); 2615 len, write, 0, NULL, NULL);
2618 if (ret < 0) 2616 if (ret < 0)
2619 return ret; 2617 return ret;
2620 return ret == len ? 0 : -1; 2618 return ret == len ? 0 : -1;
2621 } 2619 }
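
The length computation in make_pages_present(), len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE, counts how many pages the half-open range [addr, end) touches. A small hedged check of that arithmetic with arbitrary values:

/* Hedged check of the page-count arithmetic in make_pages_present(). */
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned long pages_spanned(unsigned long addr, unsigned long end)
{
	return DIV_ROUND_UP(end, PAGE_SIZE) - addr / PAGE_SIZE;
}

int main(void)
{
	/* 0x1100..0x3100 touches pages 1, 2 and 3 -> 3 pages. */
	printf("%lu\n", pages_spanned(0x1100, 0x3100));
	/* A range inside a single page still counts as 1. */
	printf("%lu\n", pages_spanned(0x2010, 0x2020));
	return 0;
}
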
2622 2620
2623 /* 2621 /*
2624 * Map a vmalloc()-space virtual address to the physical page. 2622 * Map a vmalloc()-space virtual address to the physical page.
2625 */ 2623 */
2626 struct page * vmalloc_to_page(void * vmalloc_addr) 2624 struct page * vmalloc_to_page(void * vmalloc_addr)
2627 { 2625 {
2628 unsigned long addr = (unsigned long) vmalloc_addr; 2626 unsigned long addr = (unsigned long) vmalloc_addr;
2629 struct page *page = NULL; 2627 struct page *page = NULL;
2630 pgd_t *pgd = pgd_offset_k(addr); 2628 pgd_t *pgd = pgd_offset_k(addr);
2631 pud_t *pud; 2629 pud_t *pud;
2632 pmd_t *pmd; 2630 pmd_t *pmd;
2633 pte_t *ptep, pte; 2631 pte_t *ptep, pte;
2634 2632
2635 if (!pgd_none(*pgd)) { 2633 if (!pgd_none(*pgd)) {
2636 pud = pud_offset(pgd, addr); 2634 pud = pud_offset(pgd, addr);
2637 if (!pud_none(*pud)) { 2635 if (!pud_none(*pud)) {
2638 pmd = pmd_offset(pud, addr); 2636 pmd = pmd_offset(pud, addr);
2639 if (!pmd_none(*pmd)) { 2637 if (!pmd_none(*pmd)) {
2640 ptep = pte_offset_map(pmd, addr); 2638 ptep = pte_offset_map(pmd, addr);
2641 pte = *ptep; 2639 pte = *ptep;
2642 if (pte_present(pte)) 2640 if (pte_present(pte))
2643 page = pte_page(pte); 2641 page = pte_page(pte);
2644 pte_unmap(ptep); 2642 pte_unmap(ptep);
2645 } 2643 }
2646 } 2644 }
2647 } 2645 }
2648 return page; 2646 return page;
2649 } 2647 }
2650 2648
2651 EXPORT_SYMBOL(vmalloc_to_page); 2649 EXPORT_SYMBOL(vmalloc_to_page);
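
vmalloc_to_page() descends the kernel page tables level by level, giving up at the first empty entry and returning the struct page only if the final pte is present. A hedged userspace analogue, on kernels that expose /proc/self/pagemap, is to look up the present bit and frame number backing one of your own virtual pages (the pagemap interface itself postdates this patch slightly):

/* Hedged userspace analogue of vmalloc_to_page(): query the pfn backing one
 * of our own virtual addresses via /proc/self/pagemap. Root may be required
 * to see a non-zero frame number on recent kernels. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *buf = malloc(page);
	uint64_t entry;
	int fd;

	if (!buf)
		return 1;
	buf[0] = 1;				/* fault the page in first */

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0) {
		perror("open pagemap");
		return 1;
	}

	/* One 64-bit entry per virtual page, indexed by vaddr / page size. */
	if (pread(fd, &entry, sizeof(entry),
		  ((uintptr_t)buf / page) * sizeof(entry)) != sizeof(entry)) {
		perror("pread");
		return 1;
	}

	printf("present=%d pfn=%#llx\n",
	       (int)((entry >> 63) & 1),
	       (unsigned long long)(entry & ((1ULL << 55) - 1)));

	close(fd);
	free(buf);
	return 0;
}
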
2652 2650
2653 /* 2651 /*
2654 * Map a vmalloc()-space virtual address to the physical page frame number. 2652 * Map a vmalloc()-space virtual address to the physical page frame number.
2655 */ 2653 */
2656 unsigned long vmalloc_to_pfn(void * vmalloc_addr) 2654 unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2657 { 2655 {
2658 return page_to_pfn(vmalloc_to_page(vmalloc_addr)); 2656 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
2659 } 2657 }
2660 2658
2661 EXPORT_SYMBOL(vmalloc_to_pfn); 2659 EXPORT_SYMBOL(vmalloc_to_pfn);
2662 2660
2663 #if !defined(__HAVE_ARCH_GATE_AREA) 2661 #if !defined(__HAVE_ARCH_GATE_AREA)
2664 2662
2665 #if defined(AT_SYSINFO_EHDR) 2663 #if defined(AT_SYSINFO_EHDR)
2666 static struct vm_area_struct gate_vma; 2664 static struct vm_area_struct gate_vma;
2667 2665
2668 static int __init gate_vma_init(void) 2666 static int __init gate_vma_init(void)
2669 { 2667 {
2670 gate_vma.vm_mm = NULL; 2668 gate_vma.vm_mm = NULL;
2671 gate_vma.vm_start = FIXADDR_USER_START; 2669 gate_vma.vm_start = FIXADDR_USER_START;
2672 gate_vma.vm_end = FIXADDR_USER_END; 2670 gate_vma.vm_end = FIXADDR_USER_END;
2673 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; 2671 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
2674 gate_vma.vm_page_prot = __P101; 2672 gate_vma.vm_page_prot = __P101;
2675 /* 2673 /*
2676 * Make sure the vDSO gets into every core dump. 2674 * Make sure the vDSO gets into every core dump.
2677 * Dumping its contents makes post-mortem fully interpretable later 2675 * Dumping its contents makes post-mortem fully interpretable later
2678 * without matching up the same kernel and hardware config to see 2676 * without matching up the same kernel and hardware config to see
2679 * what PC values meant. 2677 * what PC values meant.
2680 */ 2678 */
2681 gate_vma.vm_flags |= VM_ALWAYSDUMP; 2679 gate_vma.vm_flags |= VM_ALWAYSDUMP;
2682 return 0; 2680 return 0;
2683 } 2681 }
2684 __initcall(gate_vma_init); 2682 __initcall(gate_vma_init);
2685 #endif 2683 #endif
2686 2684
2687 struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 2685 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
2688 { 2686 {
2689 #ifdef AT_SYSINFO_EHDR 2687 #ifdef AT_SYSINFO_EHDR
2690 return &gate_vma; 2688 return &gate_vma;
2691 #else 2689 #else
2692 return NULL; 2690 return NULL;
2693 #endif 2691 #endif
2694 } 2692 }
2695 2693
2696 int in_gate_area_no_task(unsigned long addr) 2694 int in_gate_area_no_task(unsigned long addr)
2697 { 2695 {
2698 #ifdef AT_SYSINFO_EHDR 2696 #ifdef AT_SYSINFO_EHDR
2699 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) 2697 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
2700 return 1; 2698 return 1;
2701 #endif 2699 #endif
2702 return 0; 2700 return 0;
2703 } 2701 }
2704 2702
2705 #endif /* __HAVE_ARCH_GATE_AREA */ 2703 #endif /* __HAVE_ARCH_GATE_AREA */
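
The gate vma describes the kernel-provided mapping (the vsyscall/vDSO page) that exists in every process without belonging to any mm, which is why get_gate_vma() and in_gate_area_no_task() are needed for /proc and core dumps. A hedged way to see the corresponding entries from userspace is to scan /proc/self/maps:

/* Hedged demo: the mappings that get_gate_vma() describes show up in
 * /proc/self/maps as [vdso] / [vsyscall] entries. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/self/maps", "r");
	char line[512];

	if (!f) {
		perror("fopen");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "[vdso]") || strstr(line, "[vsyscall]"))
			fputs(line, stdout);
	}

	fclose(f);
	return 0;
}
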
2706 2704
2707 /* 2705 /*
2708 * Access another process' address space. 2706 * Access another process' address space.
2709 * Source/target buffer must be kernel space, 2707 * Source/target buffer must be kernel space,
2710 * Do not walk the page table directly, use get_user_pages 2708 * Do not walk the page table directly, use get_user_pages
2711 */ 2709 */
2712 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) 2710 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2713 { 2711 {
2714 struct mm_struct *mm; 2712 struct mm_struct *mm;
2715 struct vm_area_struct *vma; 2713 struct vm_area_struct *vma;
2716 struct page *page; 2714 struct page *page;
2717 void *old_buf = buf; 2715 void *old_buf = buf;
2718 2716
2719 mm = get_task_mm(tsk); 2717 mm = get_task_mm(tsk);
2720 if (!mm) 2718 if (!mm)
2721 return 0; 2719 return 0;
2722 2720
2723 down_read(&mm->mmap_sem); 2721 down_read(&mm->mmap_sem);
2724 /* ignore errors, just check how much was successfully transferred */ 2722 /* ignore errors, just check how much was successfully transferred */
2725 while (len) { 2723 while (len) {
2726 int bytes, ret, offset; 2724 int bytes, ret, offset;
2727 void *maddr; 2725 void *maddr;
2728 2726
2729 ret = get_user_pages(tsk, mm, addr, 1, 2727 ret = get_user_pages(tsk, mm, addr, 1,
2730 write, 1, &page, &vma); 2728 write, 1, &page, &vma);
2731 if (ret <= 0) 2729 if (ret <= 0)
2732 break; 2730 break;
2733 2731
2734 bytes = len; 2732 bytes = len;
2735 offset = addr & (PAGE_SIZE-1); 2733 offset = addr & (PAGE_SIZE-1);
2736 if (bytes > PAGE_SIZE-offset) 2734 if (bytes > PAGE_SIZE-offset)
2737 bytes = PAGE_SIZE-offset; 2735 bytes = PAGE_SIZE-offset;
2738 2736
2739 maddr = kmap(page); 2737 maddr = kmap(page);
2740 if (write) { 2738 if (write) {
2741 copy_to_user_page(vma, page, addr, 2739 copy_to_user_page(vma, page, addr,
2742 maddr + offset, buf, bytes); 2740 maddr + offset, buf, bytes);
2743 set_page_dirty_lock(page); 2741 set_page_dirty_lock(page);
2744 } else { 2742 } else {
2745 copy_from_user_page(vma, page, addr, 2743 copy_from_user_page(vma, page, addr,
2746 buf, maddr + offset, bytes); 2744 buf, maddr + offset, bytes);
2747 } 2745 }
2748 kunmap(page); 2746 kunmap(page);
2749 page_cache_release(page); 2747 page_cache_release(page);
2750 len -= bytes; 2748 len -= bytes;
2751 buf += bytes; 2749 buf += bytes;
2752 addr += bytes; 2750 addr += bytes;
2753 } 2751 }
2754 up_read(&mm->mmap_sem); 2752 up_read(&mm->mmap_sem);
2755 mmput(mm); 2753 mmput(mm);
2756 2754
2757 return buf - old_buf; 2755 return buf - old_buf;
2758 } 2756 }
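
access_process_vm() splits the transfer on page boundaries: each iteration computes offset = addr & (PAGE_SIZE-1) and caps bytes at PAGE_SIZE - offset, so a single get_user_pages() page is never overrun. A hedged standalone sketch of just that chunking arithmetic:

/* Hedged sketch of the page-boundary chunking used by access_process_vm(). */
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long addr = 0x1ff0;	/* starts 16 bytes before a page end */
	unsigned long len  = 100;

	while (len) {
		unsigned long offset = addr & (PAGE_SIZE - 1);
		unsigned long bytes  = len;

		if (bytes > PAGE_SIZE - offset)
			bytes = PAGE_SIZE - offset;

		printf("copy %3lu bytes at %#lx (page %#lx, offset %lu)\n",
		       bytes, addr, addr & ~(PAGE_SIZE - 1), offset);

		len  -= bytes;
		addr += bytes;
	}
	return 0;
}
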
2759 2757