Doug / smarc-fsl-linux-kernel | Embedian Git Server

Commit 2b2811178e85553405b86e3fe78357b9b95889ce

Authored by Seth Jennings 2013-07-11 07:05:03 +0800

Committed by Linus Torvalds 2013-07-11 09:11:34 +0800

Exists in smarc-imx_3.14.28_1.0.0_ga and in 1 other branch

zswap: add to mm/

zswap is a thin backend for frontswap that takes pages that are in the
process of being swapped out and attempts to compress them and store
them in a RAM-based memory pool.  This can result in a significant I/O
reduction on the swap device and, in the case where decompressing from
RAM is faster than reading from the swap device, can also improve
workload performance.

It also has support for evicting swap pages that are currently
compressed in zswap to the swap device on an LRU(ish) basis.  This
functionality makes zswap a true cache in that, once the cache is full,
the oldest pages can be moved out of zswap to the swap device so newer
pages can be compressed and stored in zswap.

This patch adds the zswap driver to mm/

Signed-off-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Dan Magenheimer <dan.magenheimer@oracle.com>
Cc: Robert Jennings <rcj@linux.vnet.ibm.com>
Cc: Jenifer Hopper <jhopper@us.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Hansen <dave@sr71.net>
Cc: Joe Perches <joe@perches.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Cody P Schafer <cody@linux.vnet.ibm.com>
Cc: Hugh Dickens <hughd@google.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 964 additions and 0 deletions Inline Diff

mm/Kconfig
mm/Makefile
mm/zswap.c

mm/Kconfig

Diff comments View file @ 2b28111

 config SELECT_MEMORY_MODEL
 	def_bool y
 	depends on ARCH_SELECT_MEMORY_MODEL
 choice
 	prompt "Memory model"
 	depends on SELECT_MEMORY_MODEL
 	default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
 	default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
 	default FLATMEM_MANUAL
 config FLATMEM_MANUAL
 	bool "Flat Memory"
 	depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
 	help
 	  This option allows you to change some of the ways that
 	  Linux manages its memory internally.  Most users will
 	  only have one option here: FLATMEM.  This is normal
 	  and a correct option.
 	  Some users of more advanced features like NUMA and
 	  memory hotplug may have different options here.
 	  DISCONTIGMEM is a more mature, better tested system,
 	  but is incompatible with memory hotplug and may suffer
 	  decreased performance over SPARSEMEM.  If unsure between
 	  "Sparse Memory" and "Discontiguous Memory", choose
 	  "Discontiguous Memory".
 	  If unsure, choose this option (Flat Memory) over any other.
 config DISCONTIGMEM_MANUAL
 	bool "Discontiguous Memory"
 	depends on ARCH_DISCONTIGMEM_ENABLE
 	help
 	  This option provides enhanced support for discontiguous
 	  memory systems, over FLATMEM.  These systems have holes
 	  in their physical address spaces, and this option provides
 	  more efficient handling of these holes.  However, the vast
 	  majority of hardware has quite flat address spaces, and
 	  can have degraded performance from the extra overhead that
 	  this option imposes.
 	  Many NUMA configurations will have this as the only option.
 	  If unsure, choose "Flat Memory" over this option.
 config SPARSEMEM_MANUAL
 	bool "Sparse Memory"
 	depends on ARCH_SPARSEMEM_ENABLE
 	help
 	  This will be the only option for some systems, including
 	  memory hotplug systems.  This is normal.
 	  For many other systems, this will be an alternative to
 	  "Discontiguous Memory".  This option provides some potential
 	  performance benefits, along with decreased code complexity,
 	  but it is newer, and more experimental.
 	  If unsure, choose "Discontiguous Memory" or "Flat Memory"
 	  over this option.
 endchoice
 config DISCONTIGMEM
 	def_bool y
 	depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
 config SPARSEMEM
 	def_bool y
 	depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
 config FLATMEM
 	def_bool y
 	depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
 config FLAT_NODE_MEM_MAP
 	def_bool y
 	depends on !SPARSEMEM
 #
 # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
 # to represent different areas of memory.  This variable allows
 # those dependencies to exist individually.
 #
 config NEED_MULTIPLE_NODES
 	def_bool y
 	depends on DISCONTIGMEM || NUMA
 config HAVE_MEMORY_PRESENT
 	def_bool y
 	depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
 #
 # SPARSEMEM_EXTREME (which is the default) does some bootmem
 # allocations when memory_present() is called.  If this cannot
 # be done on your architecture, select this option.  However,
 # statically allocating the mem_section[] array can potentially
 # consume vast quantities of .bss, so be careful.
 #
 # This option will also potentially produce smaller runtime code
 # with gcc 3.4 and later.
 #
 config SPARSEMEM_STATIC
 	bool
 #
 # Architecture platforms which require a two level mem_section in SPARSEMEM
 # must select this option. This is usually for architecture platforms with
 # an extremely sparse physical address space.
 #
 config SPARSEMEM_EXTREME
 	def_bool y
 	depends on SPARSEMEM && !SPARSEMEM_STATIC
 config SPARSEMEM_VMEMMAP_ENABLE
 	bool
 config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
 	def_bool y
 	depends on SPARSEMEM && X86_64
 config SPARSEMEM_VMEMMAP
 	bool "Sparse Memory virtual memmap"
 	depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
 	default y
 	help
 	 SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
 	 pfn_to_page and page_to_pfn operations.  This is the most
 	 efficient option when sufficient kernel resources are available.
 config HAVE_MEMBLOCK
 	boolean
 config HAVE_MEMBLOCK_NODE_MAP
 	boolean
 config ARCH_DISCARD_MEMBLOCK
 	boolean
 config NO_BOOTMEM
 	boolean
 config MEMORY_ISOLATION
 	boolean
 config MOVABLE_NODE
 	boolean "Enable to assign a node which has only movable memory"
 	depends on HAVE_MEMBLOCK
 	depends on NO_BOOTMEM
 	depends on X86_64
 	depends on NUMA
 	default n
 	help
 	  Allow a node to have only movable memory.  Pages used by the kernel,
 	  such as direct mapping pages cannot be migrated.  So the corresponding
 	  memory device cannot be hotplugged.  This option allows users to
 	  online all the memory of a node as movable memory so that the whole
 	  node can be hotplugged.  Users who don't use the memory hotplug
 	  feature are fine with this option on since they don't online memory
 	  as movable.
 	  Say Y here if you want to hotplug a whole node.
 	  Say N here if you want kernel to use memory on all nodes evenly.
 #
 # Only be set on architectures that have completely implemented memory hotplug
 # feature. If you are not sure, don't touch it.
 #
 config HAVE_BOOTMEM_INFO_NODE
 	def_bool n
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
 	depends on ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
 config MEMORY_HOTPLUG_SPARSE
 	def_bool y
 	depends on SPARSEMEM && MEMORY_HOTPLUG
 config MEMORY_HOTREMOVE
 	bool "Allow for memory hot remove"
 	select MEMORY_ISOLATION
 	select HAVE_BOOTMEM_INFO_NODE if X86_64
 	depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
 	depends on MIGRATION
 #
 # If we have space for more page flags then we can enable additional
 # optimizations and functionality.
 #
 # Regular Sparsemem takes page flag bits for the sectionid if it does not
 # use a virtual memmap. Disable extended page flags for 32 bit platforms
 # that require the use of a sectionid in the page flags.
 #
 config PAGEFLAGS_EXTENDED
 	def_bool y
 	depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
 # Default to 4 for wider testing, though 8 might be more appropriate.
 # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
 # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
 # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
 #
 config SPLIT_PTLOCK_CPUS
 	int
 	default "999999" if ARM && !CPU_CACHE_VIPT
 	default "999999" if PARISC && !PA20
 	default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
 	default "4"
 #
 # support for memory balloon compaction
 config BALLOON_COMPACTION
 	bool "Allow for balloon memory compaction/migration"
 	def_bool y
 	depends on COMPACTION && VIRTIO_BALLOON
 	help
 	  Memory fragmentation introduced by ballooning might reduce
 	  significantly the number of 2MB contiguous memory blocks that can be
 	  used within a guest, thus imposing performance penalties associated
 	  with the reduced number of transparent huge pages that could be used
 	  by the guest workload. Allowing the compaction & migration for memory
 	  pages enlisted as being part of memory balloon devices avoids the
 	  scenario aforementioned and helps improving memory defragmentation.
 #
 # support for memory compaction
 config COMPACTION
 	bool "Allow for memory compaction"
 	def_bool y
 	select MIGRATION
 	depends on MMU
 	help
 	  Allows the compaction of memory for the allocation of huge pages.
 #
 # support for page migration
 #
 config MIGRATION
 	bool "Page migration"
 	def_bool y
 	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful in
 	  two situations. The first is on NUMA systems to put pages nearer
 	  to the processors accessing. The second is when allocating huge
 	  pages as migration can relocate pages to satisfy a huge page
 	  allocation instead of reclaiming.
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
 config ZONE_DMA_FLAG
 	int
 	default "0" if !ZONE_DMA
 	default "1"
 config BOUNCE
 	bool "Enable bounce buffers"
 	default y
 	depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
 	help
 	  Enable bounce buffers for devices that cannot access
 	  the full range of memory available to the CPU. Enabled
 	  by default when ZONE_DMA or HIGHMEM is selected, but you
 	  may say n to override this.
 # On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
 # have more than 4GB of memory, but we don't currently use the IOTLB to present
 # a 32-bit address to OHCI.  So we need to use a bounce pool instead.
 #
 # We also use the bounce pool to provide stable page writes for jbd.  jbd
 # initiates buffer writeback without locking the page or setting PG_writeback,
 # and fixing that behavior (a second time; jbd2 doesn't have this problem) is
 # a major rework effort.  Instead, use the bounce buffer to snapshot pages
 # (until jbd goes away).  The only jbd user is ext3.
 config NEED_BOUNCE_POOL
 	bool
 	default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD)
 config NR_QUICK
 	int
 	depends on QUICKLIST
 	default "2" if AVR32
 	default "1"
 config VIRT_TO_BUS
 	bool
 	help
 	  An architecture should select this if it implements the
 	  deprecated interface virt_to_bus().  All new architectures
 	  should probably not select this.
 config MMU_NOTIFIER
 	bool
 config KSM
 	bool "Enable KSM for page merging"
 	depends on MMU
 	help
 	  Enable Kernel Samepage Merging: KSM periodically scans those areas
 	  of an application's address space that an app has advised may be
 	  mergeable.  When it finds pages of identical content, it replaces
 	  the many instances by a single page with that content, so
 	  saving memory until one or another app needs to modify the content.
 	  Recommended for use with KVM, or with other duplicative applications.
 	  See Documentation/vm/ksm.txt for more information: KSM is inactive
 	  until a program has madvised that an area is MADV_MERGEABLE, and
 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
 config DEFAULT_MMAP_MIN_ADDR
         int "Low address space to protect from user allocation"
 	depends on MMU
         default 4096
         help
 	  This is the portion of low virtual memory which should be protected
 	  from userspace allocation.  Keeping a user from writing to low pages
 	  can help reduce the impact of kernel NULL pointer bugs.
 	  For most ia64, ppc64 and x86 users with lots of address space
 	  a value of 65536 is reasonable and should cause no problems.
 	  On arm and other archs it should not be higher than 32768.
 	  Programs which use vm86 functionality or have some need to map
 	  this low address space will need CAP_SYS_RAWIO or disable this
 	  protection by setting the value to 0.
 	  This value can be changed after boot using the
 	  /proc/sys/vm/mmap_min_addr tunable.
 config ARCH_SUPPORTS_MEMORY_FAILURE
 	bool
 config MEMORY_FAILURE
 	depends on MMU
 	depends on ARCH_SUPPORTS_MEMORY_FAILURE
 	bool "Enable recovery from hardware memory errors"
 	select MEMORY_ISOLATION
 	help
 	  Enables code to recover from some memory failures on systems
 	  with MCA recovery. This allows a system to continue running
 	  even when some of its memory has uncorrected errors. This requires
 	  special hardware support and typically ECC memory.
 config HWPOISON_INJECT
 	tristate "HWPoison pages injector"
 	depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
 	select PROC_PAGE_MONITOR
 config NOMMU_INITIAL_TRIM_EXCESS
 	int "Turn on mmap() excess space trimming before booting"
 	depends on !MMU
 	default 1
 	help
 	  The NOMMU mmap() frequently needs to allocate large contiguous chunks
 	  of memory on which to store mappings, but it can only ask the system
 	  allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
 	  more than it requires.  To deal with this, mmap() is able to trim off
 	  the excess and return it to the allocator.
 	  If trimming is enabled, the excess is trimmed off and returned to the
 	  system allocator, which can cause extra fragmentation, particularly
 	  if there are a lot of transient processes.
 	  If trimming is disabled, the excess is kept, but not used, which for
 	  long-term mappings means that the space is wasted.
 	  Trimming can be dynamically controlled through a sysctl option
 	  (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
 	  excess pages there must be before trimming should occur, or zero if
 	  no trimming is to occur.
 	  This option specifies the initial value of this option.  The default
 	  of 1 says that all excess pages should be trimmed.
 	  See Documentation/nommu-mmap.txt for more information.
 config TRANSPARENT_HUGEPAGE
 	bool "Transparent Hugepage Support"
 	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select COMPACTION
 	help
 	  Transparent Hugepages allows the kernel to use huge pages and
 	  huge tlb transparently to the applications whenever possible.
 	  This feature can improve computing performance to certain
 	  applications by speeding up page faults during memory
 	  allocation, by reducing the number of tlb misses and by speeding
 	  up the pagetable walking.
 	  If memory constrained on embedded, you may want to say N.
 choice
 	prompt "Transparent Hugepage Support sysfs defaults"
 	depends on TRANSPARENT_HUGEPAGE
 	default TRANSPARENT_HUGEPAGE_ALWAYS
 	help
 	  Selects the sysfs defaults for Transparent Hugepage Support.
 	config TRANSPARENT_HUGEPAGE_ALWAYS
 		bool "always"
 	help
 	  Enabling Transparent Hugepage always, can increase the
 	  memory footprint of applications without a guaranteed
 	  benefit but it will work automatically for all applications.
 	config TRANSPARENT_HUGEPAGE_MADVISE
 		bool "madvise"
 	help
 	  Enabling Transparent Hugepage madvise, will only provide a
 	  performance improvement benefit to the applications using
 	  madvise(MADV_HUGEPAGE) but it won't risk to increase the
 	  memory footprint of applications without a guaranteed
 	  benefit.
 endchoice
 config CROSS_MEMORY_ATTACH
 	bool "Cross Memory Support"
 	depends on MMU
 	default y
 	help
 	  Enabling this option adds the system calls process_vm_readv and
 	  process_vm_writev which allow a process with the correct privileges
 	  to directly read from or write to another process's address space.
 	  See the man page for more details.
 #
 # UP and nommu archs use km based percpu allocator
 #
 config NEED_PER_CPU_KM
 	depends on !SMP
 	bool
 	default y
 config CLEANCACHE
 	bool "Enable cleancache driver to cache clean pages if tmem is present"
 	default n
 	help
 	  Cleancache can be thought of as a page-granularity victim cache
 	  for clean pages that the kernel's pageframe replacement algorithm
 	  (PFRA) would like to keep around, but can't since there isn't enough
 	  memory.  So when the PFRA "evicts" a page, it first attempts to use
 	  cleancache code to put the data contained in that page into
 	  "transcendent memory", memory that is not directly accessible or
 	  addressable by the kernel and is of unknown and possibly
 	  time-varying size.  And when a cleancache-enabled
 	  filesystem wishes to access a page in a file on disk, it first
 	  checks cleancache to see if it already contains it; if it does,
 	  the page is copied into the kernel and a disk access is avoided.
 	  When a transcendent memory driver is available (such as zcache or
 	  Xen transcendent memory), a significant I/O reduction
 	  may be achieved.  When none is available, all cleancache calls
 	  are reduced to a single pointer-compare-against-NULL resulting
 	  in a negligible performance hit.
 	  If unsure, say Y to enable cleancache
 config FRONTSWAP
 	bool "Enable frontswap to cache swap pages if tmem is present"
 	depends on SWAP
 	default n
 	help
 	  Frontswap is so named because it can be thought of as the opposite
 	  of a "backing" store for a swap device.  The data is stored into
 	  "transcendent memory", memory that is not directly accessible or
 	  addressable by the kernel and is of unknown and possibly
 	  time-varying size.  When space in transcendent memory is available,
 	  a significant swap I/O reduction may be achieved.  When none is
 	  available, all frontswap calls are reduced to a single pointer-
 	  compare-against-NULL resulting in a negligible performance hit
 	  and swap data is stored as normal on the matching swap device.
 	  If unsure, say Y to enable frontswap.
 config ZBUD
 	tristate
 	default n
 	help
 	  A special purpose allocator for storing compressed pages.
 	  It is designed to store up to two compressed pages per physical
 	  page.  While this design limits storage density, it has simple and
 	  deterministic reclaim properties that make it preferable to a higher
 	  density approach when reclaim will be used.
+config ZSWAP
+	bool "Compressed cache for swap pages (EXPERIMENTAL)"
+	depends on FRONTSWAP && CRYPTO=y
+	select CRYPTO_LZO
+	select ZBUD
+	default n
+	help
+	  A lightweight compressed cache for swap pages.  It takes
+	  pages that are in the process of being swapped out and attempts to
+	  compress them into a dynamically allocated RAM-based memory pool.
+	  This can result in a significant I/O reduction on swap device and,
+	  in the case where decompressing from RAM is faster than swap device
+	  reads, can also improve workload performance.
+	  This is marked experimental because it is a new feature (as of
+	  v3.11) that interacts heavily with memory reclaim.  While these
+	  interactions don't cause any known issues on simple memory setups,
+	  they have not been fully explored on the large set of potential
+	  configurations and workloads that exist.
 config MEM_SOFT_DIRTY
 	bool "Track memory changes"
 	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
 	select PROC_PAGE_MONITOR
 	help
 	  This option enables memory changes tracking by introducing a
 	  soft-dirty bit on pte-s. This bit is set when someone writes
 	  into a page just as regular dirty bit, but unlike the latter
 	  it can be cleared by hand.
 	  See Documentation/vm/soft-dirty.txt for more details.

mm/Makefile

Diff comments View file @ 2b28111

 #
 # Makefile for the linux memory manager.
 #
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   vmalloc.o pagewalk.o pgtable-generic.o
 ifdef CONFIG_CROSS_MEMORY_ATTACH
 mmu-$(CONFIG_MMU)	+= process_vm_access.o
 endif
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o balloon_compaction.o \
 			   interval_tree.o $(mmu-y)
 obj-y += init-mm.o
 ifdef CONFIG_NO_BOOTMEM
 	obj-y		+= nobootmem.o
 else
 	obj-y		+= bootmem.o
 endif
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
+obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
 obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZBUD)	+= zbud.o

mm/zswap.c

Diff comments View file @ 2b28111

File was created	1	/*
	2	* zswap.c - zswap driver file
	3	*
	4	* zswap is a backend for frontswap that takes pages that are in the process
	5	* of being swapped out and attempts to compress and store them in a
	6	* RAM-based memory pool. This can result in a significant I/O reduction on
	7	* the swap device and, in the case where decompressing from RAM is faster
	8	* than reading from the swap device, can also improve workload performance.
	9	*
	10	* Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
	11	*
	12	* This program is free software; you can redistribute it and/or
	13	* modify it under the terms of the GNU General Public License
	14	* as published by the Free Software Foundation; either version 2
	15	* of the License, or (at your option) any later version.
	16	*
	17	* This program is distributed in the hope that it will be useful,
	18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	* GNU General Public License for more details.
	21	*/
	22
	23	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
	24
	25	#include <linux/module.h>
	26	#include <linux/cpu.h>
	27	#include <linux/highmem.h>
	28	#include <linux/slab.h>
	29	#include <linux/spinlock.h>
	30	#include <linux/types.h>
	31	#include <linux/atomic.h>
	32	#include <linux/frontswap.h>
	33	#include <linux/rbtree.h>
	34	#include <linux/swap.h>
	35	#include <linux/crypto.h>
	36	#include <linux/mempool.h>
	37	#include <linux/zbud.h>
	38
	39	#include <linux/mm_types.h>
	40	#include <linux/page-flags.h>
	41	#include <linux/swapops.h>
	42	#include <linux/writeback.h>
	43	#include <linux/pagemap.h>
	44
	45	/*********************************
	46	* statistics
	47	**********************************/
	48	/* Number of memory pages used by the compressed pool */
	49	static u64 zswap_pool_pages;
	50	/* The number of compressed pages currently stored in zswap */
	51	static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
	52
	53	/*
	54	* The statistics below are not protected from concurrent access for
	55	* performance reasons so they may not be 100% accurate. However,
	56	* they do provide useful information on roughly how many times a
	57	* certain event is occurring.
	58	*/
	59
	60	/* Pool limit was hit (see zswap_max_pool_percent) */
	61	static u64 zswap_pool_limit_hit;
	62	/* Pages written back when pool limit was reached */
	63	static u64 zswap_written_back_pages;
	64	/* Store failed due to a reclaim failure after pool limit was reached */
	65	static u64 zswap_reject_reclaim_fail;
	66	/* Compressed page was too big for the allocator to (optimally) store */
	67	static u64 zswap_reject_compress_poor;
	68	/* Store failed because underlying allocator could not get memory */
	69	static u64 zswap_reject_alloc_fail;
	70	/* Store failed because the entry metadata could not be allocated (rare) */
	71	static u64 zswap_reject_kmemcache_fail;
	72	/* Duplicate store was encountered (rare) */
	73	static u64 zswap_duplicate_entry;
	74
	75	/*********************************
	76	* tunables
	77	**********************************/
	78	/* Enable/disable zswap (disabled by default, fixed at boot for now) */
	79	static bool zswap_enabled __read_mostly;
	80	module_param_named(enabled, zswap_enabled, bool, 0);
	81
	82	/* Compressor to be used by zswap (fixed at boot for now) */
	83	#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
	84	static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
	85	module_param_named(compressor, zswap_compressor, charp, 0);
	86
	87	/* The maximum percentage of memory that the compressed pool can occupy */
	88	static unsigned int zswap_max_pool_percent = 20;
	89	module_param_named(max_pool_percent,
	90	zswap_max_pool_percent, uint, 0644);
	91
	92	/*********************************
	93	* compression functions
	94	**********************************/
	95	/* per-cpu compression transforms */
	96	static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
	97
	98	enum comp_op {
	99	ZSWAP_COMPOP_COMPRESS,
	100	ZSWAP_COMPOP_DECOMPRESS
	101	};
	102
	103	static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
	104	u8 *dst, unsigned int *dlen)
	105	{
	106	struct crypto_comp *tfm;
	107	int ret;
	108
	109	tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
	110	switch (op) {
	111	case ZSWAP_COMPOP_COMPRESS:
	112	ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
	113	break;
	114	case ZSWAP_COMPOP_DECOMPRESS:
	115	ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
	116	break;
	117	default:
	118	ret = -EINVAL;
	119	}
	120
	121	put_cpu();
	122	return ret;
	123	}
	124
	125	static int __init zswap_comp_init(void)
	126	{
	127	if (!crypto_has_comp(zswap_compressor, 0, 0)) {
	128	pr_info("%s compressor not available\n", zswap_compressor);
	129	/* fall back to default compressor */
	130	zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
	131	if (!crypto_has_comp(zswap_compressor, 0, 0))
	132	/* can't even load the default compressor */
	133	return -ENODEV;
	134	}
	135	pr_info("using %s compressor\n", zswap_compressor);
	136
	137	/* alloc percpu transforms */
	138	zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
	139	if (!zswap_comp_pcpu_tfms)
	140	return -ENOMEM;
	141	return 0;
	142	}
	143
	144	static void zswap_comp_exit(void)
	145	{
	146	/* free percpu transforms */
	147	if (zswap_comp_pcpu_tfms)
	148	free_percpu(zswap_comp_pcpu_tfms);
	149	}
	150
	151	/*********************************
	152	* data structures
	153	**********************************/
	154	/*
	155	* struct zswap_entry
	156	*
	157	* This structure contains the metadata for tracking a single compressed
	158	* page within zswap.
	159	*
	160	* rbnode - links the entry into red-black tree for the appropriate swap type
	161	* refcount - the number of outstanding references to the entry. This is needed
	162	* to protect against premature freeing of the entry by
	163	* concurrent calls to load, invalidate, and writeback. The lock
	164	* for the zswap_tree structure that contains the entry must
	165	* be held while changing the refcount. Since the lock must
	166	* be held, there is no reason to also make refcount atomic.
	167	* offset - the swap offset for the entry. Index into the red-black tree.
	168	* handle - zbud allocation handle that stores the compressed page data
	169	* length - the length in bytes of the compressed page data. Needed during
	170	* decompression
	171	*/
	172	struct zswap_entry {
	173	struct rb_node rbnode;
	174	pgoff_t offset;
	175	int refcount;
	176	unsigned int length;
	177	unsigned long handle;
	178	};
	179
	180	struct zswap_header {
	181	swp_entry_t swpentry;
	182	};
	183
	184	/*
	185	* The tree lock in the zswap_tree struct protects a few things:
	186	* - the rbtree
	187	* - the refcount field of each entry in the tree
	188	*/
	189	struct zswap_tree {
	190	struct rb_root rbroot;
	191	spinlock_t lock;
	192	struct zbud_pool *pool;
	193	};
	194
	195	static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
	196
	197	/*********************************
	198	* zswap entry functions
	199	**********************************/
	200	static struct kmem_cache *zswap_entry_cache;
	201
	202	static int zswap_entry_cache_create(void)
	203	{
	204	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	205	return (zswap_entry_cache == NULL);
	206	}
	207
	208	static void zswap_entry_cache_destory(void)
	209	{
	210	kmem_cache_destroy(zswap_entry_cache);
	211	}
	212
	213	static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
	214	{
	215	struct zswap_entry *entry;
	216	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
	217	if (!entry)
	218	return NULL;
	219	entry->refcount = 1;
	220	return entry;
	221	}
	222
	223	static void zswap_entry_cache_free(struct zswap_entry *entry)
	224	{
	225	kmem_cache_free(zswap_entry_cache, entry);
	226	}
	227
	228	/* caller must hold the tree lock */
	229	static void zswap_entry_get(struct zswap_entry *entry)
	230	{
	231	entry->refcount++;
	232	}
	233
	234	/* caller must hold the tree lock */
	235	static int zswap_entry_put(struct zswap_entry *entry)
	236	{
	237	entry->refcount--;
	238	return entry->refcount;
	239	}
	240
	241	/*********************************
	242	* rbtree functions
	243	**********************************/
	244	static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
	245	{
	246	struct rb_node *node = root->rb_node;
	247	struct zswap_entry *entry;
	248
	249	while (node) {
	250	entry = rb_entry(node, struct zswap_entry, rbnode);
	251	if (entry->offset > offset)
	252	node = node->rb_left;
	253	else if (entry->offset < offset)
	254	node = node->rb_right;
	255	else
	256	return entry;
	257	}
	258	return NULL;
	259	}
	260
	261	/*
	262	* In the case that an entry with the same offset is found, a pointer to
	263	* the existing entry is stored in dupentry and the function returns -EEXIST
	264	*/
	265	static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
	266	struct zswap_entry **dupentry)
	267	{
	268	struct rb_node **link = &root->rb_node, *parent = NULL;
	269	struct zswap_entry *myentry;
	270
	271	while (*link) {
	272	parent = *link;
	273	myentry = rb_entry(parent, struct zswap_entry, rbnode);
	274	if (myentry->offset > entry->offset)
	275	link = &(*link)->rb_left;
	276	else if (myentry->offset < entry->offset)
	277	link = &(*link)->rb_right;
	278	else {
	279	*dupentry = myentry;
	280	return -EEXIST;
	281	}
	282	}
	283	rb_link_node(&entry->rbnode, parent, link);
	284	rb_insert_color(&entry->rbnode, root);
	285	return 0;
	286	}
	287
	288	/*********************************
	289	* per-cpu code
	290	**********************************/
	291	static DEFINE_PER_CPU(u8 *, zswap_dstmem);
	292
	293	static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
	294	{
	295	struct crypto_comp *tfm;
	296	u8 *dst;
	297
	298	switch (action) {
	299	case CPU_UP_PREPARE:
	300	tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
	301	if (IS_ERR(tfm)) {
	302	pr_err("can't allocate compressor transform\n");
	303	return NOTIFY_BAD;
	304	}
	305	*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
	306	dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
	307	if (!dst) {
	308	pr_err("can't allocate compressor buffer\n");
	309	crypto_free_comp(tfm);
	310	*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
	311	return NOTIFY_BAD;
	312	}
	313	per_cpu(zswap_dstmem, cpu) = dst;
	314	break;
	315	case CPU_DEAD:
	316	case CPU_UP_CANCELED:
	317	tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
	318	if (tfm) {
	319	crypto_free_comp(tfm);
	320	*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
	321	}
	322	dst = per_cpu(zswap_dstmem, cpu);
	323	kfree(dst);
	324	per_cpu(zswap_dstmem, cpu) = NULL;
	325	break;
	326	default:
	327	break;
	328	}
	329	return NOTIFY_OK;
	330	}
	331
	332	static int zswap_cpu_notifier(struct notifier_block *nb,
	333	unsigned long action, void *pcpu)
	334	{
	335	unsigned long cpu = (unsigned long)pcpu;
	336	return __zswap_cpu_notifier(action, cpu);
	337	}
	338
	339	static struct notifier_block zswap_cpu_notifier_block = {
	340	.notifier_call = zswap_cpu_notifier
	341	};
	342
	343	static int zswap_cpu_init(void)
	344	{
	345	unsigned long cpu;
	346
	347	get_online_cpus();
	348	for_each_online_cpu(cpu)
	349	if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
	350	goto cleanup;
	351	register_cpu_notifier(&zswap_cpu_notifier_block);
	352	put_online_cpus();
	353	return 0;
	354
	355	cleanup:
	356	for_each_online_cpu(cpu)
	357	__zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
	358	put_online_cpus();
	359	return -ENOMEM;
	360	}
	361
	362	/*********************************
	363	* helpers
	364	**********************************/
	365	static bool zswap_is_full(void)
	366	{
	367	return (totalram_pages * zswap_max_pool_percent / 100 <
	368	zswap_pool_pages);
	369	}
	370
	371	/*
	372	* Carries out the common pattern of freeing and entry's zsmalloc allocation,
	373	* freeing the entry itself, and decrementing the number of stored pages.
	374	*/
	375	static void zswap_free_entry(struct zswap_tree tree, struct zswap_entry entry)
	376	{
	377	zbud_free(tree->pool, entry->handle);
	378	zswap_entry_cache_free(entry);
	379	atomic_dec(&zswap_stored_pages);
	380	zswap_pool_pages = zbud_get_pool_size(tree->pool);
	381	}
	382
	383	/*********************************
	384	* writeback code
	385	**********************************/
	/* Result codes returned by zswap_get_swap_cache_page(). */
	enum zswap_get_swap_ret {
		/* a new, locked page was added to the swap cache */
		ZSWAP_SWAPCACHE_NEW,
		/* the page was already present in the swap cache */
		ZSWAP_SWAPCACHE_EXIST,
		/* no page could be obtained */
		ZSWAP_SWAPCACHE_NOMEM,
	};
	392
	393	/*
	394	* zswap_get_swap_cache_page
	395	*
	396	* This is an adaption of read_swap_cache_async()
	397	*
	398	* This function tries to find a page with the given swap entry
	399	* in the swapper_space address space (the swap cache). If the page
	400	* is found, it is returned in retpage. Otherwise, a page is allocated,
	401	* added to the swap cache, and returned in retpage.
	402	*
	403	* If success, the swap cache page is returned in retpage
	404	* Returns 0 if page was already in the swap cache, page is not locked
	405	* Returns 1 if the new page needs to be populated, page is locked
	406	* Returns <0 on error
	407	*/
	408	static int zswap_get_swap_cache_page(swp_entry_t entry,
	409	struct page **retpage)
	410	{
	411	struct page found_page, new_page = NULL;
	412	struct address_space *swapper_space = &swapper_spaces[swp_type(entry)];
	413	int err;
	414
	415	*retpage = NULL;
	416	do {
	417	/*
	418	* First check the swap cache. Since this is normally
	419	* called after lookup_swap_cache() failed, re-calling
	420	* that would confuse statistics.
	421	*/
	422	found_page = find_get_page(swapper_space, entry.val);
	423	if (found_page)
	424	break;
	425
	426	/*
	427	* Get a new page to read into from swap.
	428	*/
	429	if (!new_page) {
	430	new_page = alloc_page(GFP_KERNEL);
	431	if (!new_page)
	432	break; /* Out of memory */
	433	}
	434
	435	/*
	436	* call radix_tree_preload() while we can wait.
	437	*/
	438	err = radix_tree_preload(GFP_KERNEL);
	439	if (err)
	440	break;
	441
	442	/*
	443	* Swap entry may have been freed since our caller observed it.
	444	*/
	445	err = swapcache_prepare(entry);
	446	if (err == -EEXIST) { /* seems racy */
	447	radix_tree_preload_end();
	448	continue;
	449	}
	450	if (err) { /* swp entry is obsolete ? */
	451	radix_tree_preload_end();
	452	break;
	453	}
	454
	455	/* May fail (-ENOMEM) if radix-tree node allocation failed. */
	456	__set_page_locked(new_page);
	457	SetPageSwapBacked(new_page);
	458	err = __add_to_swap_cache(new_page, entry);
	459	if (likely(!err)) {
	460	radix_tree_preload_end();
	461	lru_cache_add_anon(new_page);
	462	*retpage = new_page;
	463	return ZSWAP_SWAPCACHE_NEW;
	464	}
	465	radix_tree_preload_end();
	466	ClearPageSwapBacked(new_page);
	467	__clear_page_locked(new_page);
	468	/*
	469	* add_to_swap_cache() doesn't return -EEXIST, so we can safely
	470	* clear SWAP_HAS_CACHE flag.
	471	*/
	472	swapcache_free(entry, NULL);
	473	} while (err != -ENOMEM);
	474
	475	if (new_page)
	476	page_cache_release(new_page);
	477	if (!found_page)
	478	return ZSWAP_SWAPCACHE_NOMEM;
	479	*retpage = found_page;
	480	return ZSWAP_SWAPCACHE_EXIST;
	481	}
	482
	483	/*
	484	* Attempts to free an entry by adding a page to the swap cache,
	485	* decompressing the entry data into the page, and issuing a
	486	* bio write to write the page back to the swap device.
	487	*
	488	* This can be thought of as a "resumed writeback" of the page
	489	* to the swap device. We are basically resuming the same swap
	490	* writeback path that was intercepted with the frontswap_store()
	491	* in the first place. After the page has been decompressed into
	492	* the swap cache, the compressed version stored by zswap can be
	493	* freed.
	494	*/
	495	static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
	496	{
	497	struct zswap_header *zhdr;
	498	swp_entry_t swpentry;
	499	struct zswap_tree *tree;
	500	pgoff_t offset;
	501	struct zswap_entry *entry;
	502	struct page *page;
	503	u8 src, dst;
	504	unsigned int dlen;
	505	int ret, refcount;
	506	struct writeback_control wbc = {
	507	.sync_mode = WB_SYNC_NONE,
	508	};
	509
	510	/* extract swpentry from data */
	511	zhdr = zbud_map(pool, handle);
	512	swpentry = zhdr->swpentry; /* here */
	513	zbud_unmap(pool, handle);
	514	tree = zswap_trees[swp_type(swpentry)];
	515	offset = swp_offset(swpentry);
	516	BUG_ON(pool != tree->pool);
	517
	518	/* find and ref zswap entry */
	519	spin_lock(&tree->lock);
	520	entry = zswap_rb_search(&tree->rbroot, offset);
	521	if (!entry) {
	522	/* entry was invalidated */
	523	spin_unlock(&tree->lock);
	524	return 0;
	525	}
	526	zswap_entry_get(entry);
	527	spin_unlock(&tree->lock);
	528	BUG_ON(offset != entry->offset);
	529
	530	/* try to allocate swap cache page */
	531	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	532	case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
	533	ret = -ENOMEM;
	534	goto fail;
	535
	536	case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
	537	/* page is already in the swap cache, ignore for now */
	538	page_cache_release(page);
	539	ret = -EEXIST;
	540	goto fail;
	541
	542	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
	543	/* decompress */
	544	dlen = PAGE_SIZE;
	545	src = (u8 *)zbud_map(tree->pool, entry->handle) +
	546	sizeof(struct zswap_header);
	547	dst = kmap_atomic(page);
	548	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
	549	entry->length, dst, &dlen);
	550	kunmap_atomic(dst);
	551	zbud_unmap(tree->pool, entry->handle);
	552	BUG_ON(ret);
	553	BUG_ON(dlen != PAGE_SIZE);
	554
	555	/* page is up to date */
	556	SetPageUptodate(page);
	557	}
	558
	559	/* start writeback */
	560	__swap_writepage(page, &wbc, end_swap_bio_write);
	561	page_cache_release(page);
	562	zswap_written_back_pages++;
	563
	564	spin_lock(&tree->lock);
	565
	566	/* drop local reference */
	567	zswap_entry_put(entry);
	568	/* drop the initial reference from entry creation */
	569	refcount = zswap_entry_put(entry);
	570
	571	/*
	572	* There are three possible values for refcount here:
	573	* (1) refcount is 1, load is in progress, unlink from rbtree,
	574	* load will free
	575	* (2) refcount is 0, (normal case) entry is valid,
	576	* remove from rbtree and free entry
	577	* (3) refcount is -1, invalidate happened during writeback,
	578	* free entry
	579	*/
	580	if (refcount >= 0) {
	581	/* no invalidate yet, remove from rbtree */
	582	rb_erase(&entry->rbnode, &tree->rbroot);
	583	}
	584	spin_unlock(&tree->lock);
	585	if (refcount <= 0) {
	586	/* free the entry */
	587	zswap_free_entry(tree, entry);
	588	return 0;
	589	}
	590	return -EAGAIN;
	591
	592	fail:
	593	spin_lock(&tree->lock);
	594	zswap_entry_put(entry);
	595	spin_unlock(&tree->lock);
	596	return ret;
	597	}
	598
	599	/*********************************
	600	* frontswap hooks
	601	**********************************/
	602	/* attempts to compress and store an single page */
	603	static int zswap_frontswap_store(unsigned type, pgoff_t offset,
	604	struct page *page)
	605	{
	606	struct zswap_tree *tree = zswap_trees[type];
	607	struct zswap_entry entry, dupentry;
	608	int ret;
	609	unsigned int dlen = PAGE_SIZE, len;
	610	unsigned long handle;
	611	char *buf;
	612	u8 src, dst;
	613	struct zswap_header *zhdr;
	614
	615	if (!tree) {
	616	ret = -ENODEV;
	617	goto reject;
	618	}
	619
	620	/* reclaim space if needed */
	621	if (zswap_is_full()) {
	622	zswap_pool_limit_hit++;
	623	if (zbud_reclaim_page(tree->pool, 8)) {
	624	zswap_reject_reclaim_fail++;
	625	ret = -ENOMEM;
	626	goto reject;
	627	}
	628	}
	629
	630	/* allocate entry */
	631	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	632	if (!entry) {
	633	zswap_reject_kmemcache_fail++;
	634	ret = -ENOMEM;
	635	goto reject;
	636	}
	637
	638	/* compress */
	639	dst = get_cpu_var(zswap_dstmem);
	640	src = kmap_atomic(page);
	641	ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
	642	kunmap_atomic(src);
	643	if (ret) {
	644	ret = -EINVAL;
	645	goto freepage;
	646	}
	647
	648	/* store */
	649	len = dlen + sizeof(struct zswap_header);
	650	ret = zbud_alloc(tree->pool, len, __GFP_NORETRY \| __GFP_NOWARN,
	651	&handle);
	652	if (ret == -ENOSPC) {
	653	zswap_reject_compress_poor++;
	654	goto freepage;
	655	}
	656	if (ret) {
	657	zswap_reject_alloc_fail++;
	658	goto freepage;
	659	}
	660	zhdr = zbud_map(tree->pool, handle);
	661	zhdr->swpentry = swp_entry(type, offset);
	662	buf = (u8 *)(zhdr + 1);
	663	memcpy(buf, dst, dlen);
	664	zbud_unmap(tree->pool, handle);
	665	put_cpu_var(zswap_dstmem);
	666
	667	/* populate entry */
	668	entry->offset = offset;
	669	entry->handle = handle;
	670	entry->length = dlen;
	671
	672	/* map */
	673	spin_lock(&tree->lock);
	674	do {
	675	ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
	676	if (ret == -EEXIST) {
	677	zswap_duplicate_entry++;
	678	/* remove from rbtree */
	679	rb_erase(&dupentry->rbnode, &tree->rbroot);
	680	if (!zswap_entry_put(dupentry)) {
	681	/* free */
	682	zswap_free_entry(tree, dupentry);
	683	}
	684	}
	685	} while (ret == -EEXIST);
	686	spin_unlock(&tree->lock);
	687
	688	/* update stats */
	689	atomic_inc(&zswap_stored_pages);
	690	zswap_pool_pages = zbud_get_pool_size(tree->pool);
	691
	692	return 0;
	693
	694	freepage:
	695	put_cpu_var(zswap_dstmem);
	696	zswap_entry_cache_free(entry);
	697	reject:
	698	return ret;
	699	}
	700
	701	/*
	702	* returns 0 if the page was successfully decompressed
	703	* return -1 on entry not found or error
	704	*/
	705	static int zswap_frontswap_load(unsigned type, pgoff_t offset,
	706	struct page *page)
	707	{
	708	struct zswap_tree *tree = zswap_trees[type];
	709	struct zswap_entry *entry;
	710	u8 src, dst;
	711	unsigned int dlen;
	712	int refcount, ret;
	713
	714	/* find */
	715	spin_lock(&tree->lock);
	716	entry = zswap_rb_search(&tree->rbroot, offset);
	717	if (!entry) {
	718	/* entry was written back */
	719	spin_unlock(&tree->lock);
	720	return -1;
	721	}
	722	zswap_entry_get(entry);
	723	spin_unlock(&tree->lock);
	724
	725	/* decompress */
	726	dlen = PAGE_SIZE;
	727	src = (u8 *)zbud_map(tree->pool, entry->handle) +
	728	sizeof(struct zswap_header);
	729	dst = kmap_atomic(page);
	730	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
	731	dst, &dlen);
	732	kunmap_atomic(dst);
	733	zbud_unmap(tree->pool, entry->handle);
	734	BUG_ON(ret);
	735
	736	spin_lock(&tree->lock);
	737	refcount = zswap_entry_put(entry);
	738	if (likely(refcount)) {
	739	spin_unlock(&tree->lock);
	740	return 0;
	741	}
	742	spin_unlock(&tree->lock);
	743
	744	/*
	745	* We don't have to unlink from the rbtree because
	746	* zswap_writeback_entry() or zswap_frontswap_invalidate page()
	747	* has already done this for us if we are the last reference.
	748	*/
	749	/* free */
	750
	751	zswap_free_entry(tree, entry);
	752
	753	return 0;
	754	}
	755
	756	/* frees an entry in zswap */
	757	static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
	758	{
	759	struct zswap_tree *tree = zswap_trees[type];
	760	struct zswap_entry *entry;
	761	int refcount;
	762
	763	/* find */
	764	spin_lock(&tree->lock);
	765	entry = zswap_rb_search(&tree->rbroot, offset);
	766	if (!entry) {
	767	/* entry was written back */
	768	spin_unlock(&tree->lock);
	769	return;
	770	}
	771
	772	/* remove from rbtree */
	773	rb_erase(&entry->rbnode, &tree->rbroot);
	774
	775	/* drop the initial reference from entry creation */
	776	refcount = zswap_entry_put(entry);
	777
	778	spin_unlock(&tree->lock);
	779
	780	if (refcount) {
	781	/* writeback in progress, writeback will free */
	782	return;
	783	}
	784
	785	/* free */
	786	zswap_free_entry(tree, entry);
	787	}
	788
	789	/* frees all zswap entries for the given swap type */
	790	static void zswap_frontswap_invalidate_area(unsigned type)
	791	{
	792	struct zswap_tree *tree = zswap_trees[type];
	793	struct rb_node *node;
	794	struct zswap_entry *entry;
	795
	796	if (!tree)
	797	return;
	798
	799	/* walk the tree and free everything */
	800	spin_lock(&tree->lock);
	801	/*
	802	* TODO: Even though this code should not be executed because
	803	* the try_to_unuse() in swapoff should have emptied the tree,
	804	* it is very wasteful to rebalance the tree after every
	805	* removal when we are freeing the whole tree.
	806	*
	807	* If post-order traversal code is ever added to the rbtree
	808	* implementation, it should be used here.
	809	*/
	810	while ((node = rb_first(&tree->rbroot))) {
	811	entry = rb_entry(node, struct zswap_entry, rbnode);
	812	rb_erase(&entry->rbnode, &tree->rbroot);
	813	zbud_free(tree->pool, entry->handle);
	814	zswap_entry_cache_free(entry);
	815	atomic_dec(&zswap_stored_pages);
	816	}
	817	tree->rbroot = RB_ROOT;
	818	spin_unlock(&tree->lock);
	819	}
	820
	821	static struct zbud_ops zswap_zbud_ops = {
	822	.evict = zswap_writeback_entry
	823	};
	824
	825	static void zswap_frontswap_init(unsigned type)
	826	{
	827	struct zswap_tree *tree;
	828
	829	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
	830	if (!tree)
	831	goto err;
	832	tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
	833	if (!tree->pool)
	834	goto freetree;
	835	tree->rbroot = RB_ROOT;
	836	spin_lock_init(&tree->lock);
	837	zswap_trees[type] = tree;
	838	return;
	839
	840	freetree:
	841	kfree(tree);
	842	err:
	843	pr_err("alloc failed, zswap disabled for swap type %d\n", type);
	844	}
	845
	846	static struct frontswap_ops zswap_frontswap_ops = {
	847	.store = zswap_frontswap_store,
	848	.load = zswap_frontswap_load,
	849	.invalidate_page = zswap_frontswap_invalidate_page,
	850	.invalidate_area = zswap_frontswap_invalidate_area,
	851	.init = zswap_frontswap_init
	852	};
	853
	854	/*********************************
	855	* debugfs functions
	856	**********************************/
	857	#ifdef CONFIG_DEBUG_FS
	858	#include <linux/debugfs.h>
	859
	860	static struct dentry *zswap_debugfs_root;
	861
	862	static int __init zswap_debugfs_init(void)
	863	{
	864	if (!debugfs_initialized())
	865	return -ENODEV;
	866
	867	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
	868	if (!zswap_debugfs_root)
	869	return -ENOMEM;
	870
	871	debugfs_create_u64("pool_limit_hit", S_IRUGO,
	872	zswap_debugfs_root, &zswap_pool_limit_hit);
	873	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
	874	zswap_debugfs_root, &zswap_reject_reclaim_fail);
	875	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
	876	zswap_debugfs_root, &zswap_reject_alloc_fail);
	877	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
	878	zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	879	debugfs_create_u64("reject_compress_poor", S_IRUGO,
	880	zswap_debugfs_root, &zswap_reject_compress_poor);
	881	debugfs_create_u64("written_back_pages", S_IRUGO,
	882	zswap_debugfs_root, &zswap_written_back_pages);
	883	debugfs_create_u64("duplicate_entry", S_IRUGO,
	884	zswap_debugfs_root, &zswap_duplicate_entry);
	885	debugfs_create_u64("pool_pages", S_IRUGO,
	886	zswap_debugfs_root, &zswap_pool_pages);
	887	debugfs_create_atomic_t("stored_pages", S_IRUGO,
	888	zswap_debugfs_root, &zswap_stored_pages);
	889
	890	return 0;
	891	}
	892
	893	static void __exit zswap_debugfs_exit(void)
	894	{
	895	debugfs_remove_recursive(zswap_debugfs_root);
	896	}
	897	#else
	898	static int __init zswap_debugfs_init(void)
	899	{
	900	return 0;
	901	}
	902
	903	static void __exit zswap_debugfs_exit(void) { }
	904	#endif
	905
	906	/*********************************
	907	* module init and exit
	908	**********************************/
	909	static int __init init_zswap(void)
	910	{
	911	if (!zswap_enabled)
	912	return 0;
	913
	914	pr_info("loading zswap\n");
	915	if (zswap_entry_cache_create()) {
	916	pr_err("entry cache creation failed\n");
	917	goto error;
	918	}
	919	if (zswap_comp_init()) {
	920	pr_err("compressor initialization failed\n");
	921	goto compfail;
	922	}
	923	if (zswap_cpu_init()) {
	924	pr_err("per-cpu initialization failed\n");
	925	goto pcpufail;
	926	}
	927	frontswap_register_ops(&zswap_frontswap_ops);
	928	if (zswap_debugfs_init())
	929	pr_warn("debugfs initialization failed\n");
	930	return 0;
	931	pcpufail:
	932	zswap_comp_exit();
	933	compfail:
	934	zswap_entry_cache_destory();
	935	error:
	936	return -ENOMEM;
	937	}
	938	/* must be late so crypto has time to come up */
	939	late_initcall(init_zswap);
	940
	941	MODULE_LICENSE("GPL");
	942	MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
	943	MODULE_DESCRIPTION("Compressed cache for swap pages");
	944