Commit 29f233cfffe7fbc6672938117ce7e4154a2f515f

Authored by Dan Magenheimer
Committed by Konrad Rzeszutek Wilk
1 parent 38b5faf4b1

mm: frontswap: core frontswap functionality

This patch, 3of4, provides the core frontswap code that interfaces between
the hooks in the swap subsystem and a frontswap backend via frontswap_ops.

---
New file added: mm/frontswap.c

[v14: add support for writethrough, per suggestion by aarcange@redhat.com]
[v11: sjenning@linux.vnet.ibm.com: s/puts/failed_puts/]
[v10: sjenning@linux.vnet.ibm.com: fix debugfs calls on 32-bit]
[v9: akpm@linux-foundation.org: change "flush" to "invalidate", part 1]
[v9: akpm@linux-foundation.org: mark some statics __read_mostly]
[v9: akpm@linux-foundation.org: add clarifying comments]
[v9: akpm@linux-foundation.org: no need to loop repeating try_to_unuse]
[v9: error27@gmail.com: remove superfluous check for NULL]
[v8: rebase to 3.0-rc4]
[v8: kamezawa.hiroyu@jp.fujitsu.com: add comment to clarify find_next_to_unuse]
[v7: rebase to 3.0-rc3]
[v7: JBeulich@novell.com: use new static inlines, no-ops if not config'd]
[v6: rebase to 3.1-rc1]
[v6: lliubbo@gmail.com: use vzalloc]
[v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails]
[v6: konrad.wilk@oracl.com: various checks and code clarifications/comments]
[v4: rebase to 2.6.39]
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Acked-by: Jan Beulich <JBeulich@novell.com>
Acked-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Rik Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
[v12: Squashed s/flush/invalidate/ in]
[v15: A bit of cleanup and separate DEBUGFS]
Signed-off-by: Konrad Wilk <konrad.wilk@oracle.com>

Showing 1 changed file with 314 additions and 0 deletions Side-by-side Diff

  1 +/*
  2 + * Frontswap frontend
  3 + *
  4 + * This code provides the generic "frontend" layer to call a matching
  5 + * "backend" driver implementation of frontswap. See
  6 + * Documentation/vm/frontswap.txt for more information.
  7 + *
  8 + * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
  9 + * Author: Dan Magenheimer
  10 + *
  11 + * This work is licensed under the terms of the GNU GPL, version 2.
  12 + */
  13 +
  14 +#include <linux/mm.h>
  15 +#include <linux/mman.h>
  16 +#include <linux/swap.h>
  17 +#include <linux/swapops.h>
  18 +#include <linux/proc_fs.h>
  19 +#include <linux/security.h>
  20 +#include <linux/capability.h>
  21 +#include <linux/module.h>
  22 +#include <linux/uaccess.h>
  23 +#include <linux/debugfs.h>
  24 +#include <linux/frontswap.h>
  25 +#include <linux/swapfile.h>
  26 +
  27 +/*
  28 + * frontswap_ops is set by frontswap_register_ops to contain the pointers
  29 + * to the frontswap "backend" implementation functions.
  30 + */
  31 +static struct frontswap_ops frontswap_ops __read_mostly;
  32 +
  33 +/*
  34 + * This global enablement flag reduces overhead on systems where frontswap_ops
  35 + * has not been registered, so is preferred to the slower alternative: a
  36 + * function call that checks a non-global.
  37 + */
  38 +bool frontswap_enabled __read_mostly;
  39 +EXPORT_SYMBOL(frontswap_enabled);
  40 +
  41 +/*
  42 + * If enabled, frontswap_put will return failure even on success. As
  43 + * a result, the swap subsystem will always write the page to swap, in
  44 + * effect converting frontswap into a writethrough cache. In this mode,
  45 + * there is no direct reduction in swap writes, but a frontswap backend
  46 + * can unilaterally "reclaim" any pages in use with no data loss, thus
  47 + * providing increases control over maximum memory usage due to frontswap.
  48 + */
  49 +static bool frontswap_writethrough_enabled __read_mostly;
  50 +
#ifdef CONFIG_DEBUG_FS
/*
 * Statistics exported under /sys/kernel/debug/frontswap (when debugfs is
 * properly configured).  They are informational only, so the increments
 * are deliberately not protected against races.
 */
static u64 frontswap_gets;
static u64 frontswap_succ_puts;
static u64 frontswap_failed_puts;
static u64 frontswap_invalidates;

static inline void inc_frontswap_gets(void)
{
	++frontswap_gets;
}

static inline void inc_frontswap_succ_puts(void)
{
	++frontswap_succ_puts;
}

static inline void inc_frontswap_failed_puts(void)
{
	++frontswap_failed_puts;
}

static inline void inc_frontswap_invalidates(void)
{
	++frontswap_invalidates;
}
#else
/* Without debugfs there is nothing to count: the helpers compile away. */
static inline void inc_frontswap_gets(void)
{
}

static inline void inc_frontswap_succ_puts(void)
{
}

static inline void inc_frontswap_failed_puts(void)
{
}

static inline void inc_frontswap_invalidates(void)
{
}
#endif
  80 +/*
  81 + * Register operations for frontswap, returning previous thus allowing
  82 + * detection of multiple backends and possible nesting.
  83 + */
  84 +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
  85 +{
  86 + struct frontswap_ops old = frontswap_ops;
  87 +
  88 + frontswap_ops = *ops;
  89 + frontswap_enabled = true;
  90 + return old;
  91 +}
  92 +EXPORT_SYMBOL(frontswap_register_ops);
  93 +
  94 +/*
  95 + * Enable/disable frontswap writethrough (see above).
  96 + */
  97 +void frontswap_writethrough(bool enable)
  98 +{
  99 + frontswap_writethrough_enabled = enable;
  100 +}
  101 +EXPORT_SYMBOL(frontswap_writethrough);
  102 +
  103 +/*
  104 + * Called when a swap device is swapon'd.
  105 + */
  106 +void __frontswap_init(unsigned type)
  107 +{
  108 + struct swap_info_struct *sis = swap_info[type];
  109 +
  110 + BUG_ON(sis == NULL);
  111 + if (sis->frontswap_map == NULL)
  112 + return;
  113 + if (frontswap_enabled)
  114 + (*frontswap_ops.init)(type);
  115 +}
  116 +EXPORT_SYMBOL(__frontswap_init);
  117 +
  118 +/*
  119 + * "Put" data from a page to frontswap and associate it with the page's
  120 + * swaptype and offset. Page must be locked and in the swap cache.
  121 + * If frontswap already contains a page with matching swaptype and
  122 + * offset, the frontswap implmentation may either overwrite the data and
  123 + * return success or invalidate the page from frontswap and return failure.
  124 + */
  125 +int __frontswap_put_page(struct page *page)
  126 +{
  127 + int ret = -1, dup = 0;
  128 + swp_entry_t entry = { .val = page_private(page), };
  129 + int type = swp_type(entry);
  130 + struct swap_info_struct *sis = swap_info[type];
  131 + pgoff_t offset = swp_offset(entry);
  132 +
  133 + BUG_ON(!PageLocked(page));
  134 + BUG_ON(sis == NULL);
  135 + if (frontswap_test(sis, offset))
  136 + dup = 1;
  137 + ret = (*frontswap_ops.put_page)(type, offset, page);
  138 + if (ret == 0) {
  139 + frontswap_set(sis, offset);
  140 + inc_frontswap_succ_puts();
  141 + if (!dup)
  142 + atomic_inc(&sis->frontswap_pages);
  143 + } else if (dup) {
  144 + /*
  145 + failed dup always results in automatic invalidate of
  146 + the (older) page from frontswap
  147 + */
  148 + frontswap_clear(sis, offset);
  149 + atomic_dec(&sis->frontswap_pages);
  150 + inc_frontswap_failed_puts();
  151 + } else
  152 + inc_frontswap_failed_puts();
  153 + if (frontswap_writethrough_enabled)
  154 + /* report failure so swap also writes to swap device */
  155 + ret = -1;
  156 + return ret;
  157 +}
  158 +EXPORT_SYMBOL(__frontswap_put_page);
  159 +
  160 +/*
  161 + * "Get" data from frontswap associated with swaptype and offset that were
  162 + * specified when the data was put to frontswap and use it to fill the
  163 + * specified page with data. Page must be locked and in the swap cache.
  164 + */
  165 +int __frontswap_get_page(struct page *page)
  166 +{
  167 + int ret = -1;
  168 + swp_entry_t entry = { .val = page_private(page), };
  169 + int type = swp_type(entry);
  170 + struct swap_info_struct *sis = swap_info[type];
  171 + pgoff_t offset = swp_offset(entry);
  172 +
  173 + BUG_ON(!PageLocked(page));
  174 + BUG_ON(sis == NULL);
  175 + if (frontswap_test(sis, offset))
  176 + ret = (*frontswap_ops.get_page)(type, offset, page);
  177 + if (ret == 0)
  178 + inc_frontswap_gets();
  179 + return ret;
  180 +}
  181 +EXPORT_SYMBOL(__frontswap_get_page);
  182 +
  183 +/*
  184 + * Invalidate any data from frontswap associated with the specified swaptype
  185 + * and offset so that a subsequent "get" will fail.
  186 + */
  187 +void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
  188 +{
  189 + struct swap_info_struct *sis = swap_info[type];
  190 +
  191 + BUG_ON(sis == NULL);
  192 + if (frontswap_test(sis, offset)) {
  193 + (*frontswap_ops.invalidate_page)(type, offset);
  194 + atomic_dec(&sis->frontswap_pages);
  195 + frontswap_clear(sis, offset);
  196 + inc_frontswap_invalidates();
  197 + }
  198 +}
  199 +EXPORT_SYMBOL(__frontswap_invalidate_page);
  200 +
  201 +/*
  202 + * Invalidate all data from frontswap associated with all offsets for the
  203 + * specified swaptype.
  204 + */
  205 +void __frontswap_invalidate_area(unsigned type)
  206 +{
  207 + struct swap_info_struct *sis = swap_info[type];
  208 +
  209 + BUG_ON(sis == NULL);
  210 + if (sis->frontswap_map == NULL)
  211 + return;
  212 + (*frontswap_ops.invalidate_area)(type);
  213 + atomic_set(&sis->frontswap_pages, 0);
  214 + memset(sis->frontswap_map, 0, sis->max / sizeof(long));
  215 +}
  216 +EXPORT_SYMBOL(__frontswap_invalidate_area);
  217 +
  218 +/*
  219 + * Frontswap, like a true swap device, may unnecessarily retain pages
  220 + * under certain circumstances; "shrink" frontswap is essentially a
  221 + * "partial swapoff" and works by calling try_to_unuse to attempt to
  222 + * unuse enough frontswap pages to attempt to -- subject to memory
  223 + * constraints -- reduce the number of pages in frontswap to the
  224 + * number given in the parameter target_pages.
  225 + */
  226 +void frontswap_shrink(unsigned long target_pages)
  227 +{
  228 + struct swap_info_struct *si = NULL;
  229 + int si_frontswap_pages;
  230 + unsigned long total_pages = 0, total_pages_to_unuse;
  231 + unsigned long pages = 0, pages_to_unuse = 0;
  232 + int type;
  233 + bool locked = false;
  234 +
  235 + /*
  236 + * we don't want to hold swap_lock while doing a very
  237 + * lengthy try_to_unuse, but swap_list may change
  238 + * so restart scan from swap_list.head each time
  239 + */
  240 + spin_lock(&swap_lock);
  241 + locked = true;
  242 + total_pages = 0;
  243 + for (type = swap_list.head; type >= 0; type = si->next) {
  244 + si = swap_info[type];
  245 + total_pages += atomic_read(&si->frontswap_pages);
  246 + }
  247 + if (total_pages <= target_pages)
  248 + goto out;
  249 + total_pages_to_unuse = total_pages - target_pages;
  250 + for (type = swap_list.head; type >= 0; type = si->next) {
  251 + si = swap_info[type];
  252 + si_frontswap_pages = atomic_read(&si->frontswap_pages);
  253 + if (total_pages_to_unuse < si_frontswap_pages)
  254 + pages = pages_to_unuse = total_pages_to_unuse;
  255 + else {
  256 + pages = si_frontswap_pages;
  257 + pages_to_unuse = 0; /* unuse all */
  258 + }
  259 + /* ensure there is enough RAM to fetch pages from frontswap */
  260 + if (security_vm_enough_memory_mm(current->mm, pages))
  261 + continue;
  262 + vm_unacct_memory(pages);
  263 + break;
  264 + }
  265 + if (type < 0)
  266 + goto out;
  267 + locked = false;
  268 + spin_unlock(&swap_lock);
  269 + try_to_unuse(type, true, pages_to_unuse);
  270 +out:
  271 + if (locked)
  272 + spin_unlock(&swap_lock);
  273 + return;
  274 +}
  275 +EXPORT_SYMBOL(frontswap_shrink);
  276 +
  277 +/*
  278 + * Count and return the number of frontswap pages across all
  279 + * swap devices. This is exported so that backend drivers can
  280 + * determine current usage without reading debugfs.
  281 + */
  282 +unsigned long frontswap_curr_pages(void)
  283 +{
  284 + int type;
  285 + unsigned long totalpages = 0;
  286 + struct swap_info_struct *si = NULL;
  287 +
  288 + spin_lock(&swap_lock);
  289 + for (type = swap_list.head; type >= 0; type = si->next) {
  290 + si = swap_info[type];
  291 + totalpages += atomic_read(&si->frontswap_pages);
  292 + }
  293 + spin_unlock(&swap_lock);
  294 + return totalpages;
  295 +}
  296 +EXPORT_SYMBOL(frontswap_curr_pages);
  297 +
  298 +static int __init init_frontswap(void)
  299 +{
  300 +#ifdef CONFIG_DEBUG_FS
  301 + struct dentry *root = debugfs_create_dir("frontswap", NULL);
  302 + if (root == NULL)
  303 + return -ENXIO;
  304 + debugfs_create_u64("gets", S_IRUGO, root, &frontswap_gets);
  305 + debugfs_create_u64("succ_puts", S_IRUGO, root, &frontswap_succ_puts);
  306 + debugfs_create_u64("failed_puts", S_IRUGO, root,
  307 + &frontswap_failed_puts);
  308 + debugfs_create_u64("invalidates", S_IRUGO,
  309 + root, &frontswap_invalidates);
  310 +#endif
  311 + return 0;
  312 +}
  313 +
  314 +module_init(init_frontswap);