Commit 27a7faa0779dd13729196c1a818c294f44bbd1ee

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent c077719be8

memcg: swap cgroup for remembering usage

For accounting swap, we need a record per swap entry, at least.

This patch adds the following functions.
  - swap_cgroup_swapon() .... called from swapon
  - swap_cgroup_swapoff() ... called at the end of swapoff.

  - swap_cgroup_record() .... record information of swap entry.
  - lookup_swap_cgroup() .... lookup information of swap entry.

This patch just implements "how to record information".  It adds no actual
method for limiting the usage of swap.  These routines use a flat table to
record and look up.  A "wise" lookup system like a radix-tree requires memory
allocation for new records, but swap-out is usually called under memory
shortage (or when memcg hits its limit).  So, I used static allocation.
(Maybe dynamic allocation is not very hard, but it adds additional memory
allocation in the memory shortage path.)

Note1: Here we use a pointer to record the information, which means
      8 bytes per swap entry. I think we can reduce this when we
      create an "id of cgroup" in the range of 0-65535 or 0-255.

Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Hugh Dickins <hugh@veritas.com>
Reported-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 242 additions and 0 deletions Side-by-side Diff

include/linux/page_cgroup.h
... ... @@ -105,5 +105,40 @@
105 105 }
106 106  
107 107 #endif
  108 +/*
      + * Swap accounting hooks.
      + *
      + * With CONFIG_CGROUP_MEM_RES_CTLR_SWAP the four functions below are only
      + * declared here and defined elsewhere. Without it they are empty inline
      + * stubs that return NULL / 0 / nothing, so callers can invoke them
      + * unconditionally without an #ifdef of their own.
      + *
      + *   swap_cgroup_record()  - store a mem_cgroup for a swap entry and
      + *                           return the previously stored one
      + *   lookup_swap_cgroup()  - return the mem_cgroup stored for a swap entry
      + *   swap_cgroup_swapon()  - set up the table for a swap type; 0 or -errno
      + *   swap_cgroup_swapoff() - tear the table for a swap type down again
      + */
  109 +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  110 +#include <linux/swap.h>
  111 +extern struct mem_cgroup *
  112 +swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem);
  113 +extern struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent);
  114 +extern int swap_cgroup_swapon(int type, unsigned long max_pages);
  115 +extern void swap_cgroup_swapoff(int type);
  116 +#else /* !CONFIG_CGROUP_MEM_RES_CTLR_SWAP: no-op stubs */
  117 +#include <linux/swap.h>
  118 +
  119 +static inline
  120 +struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
  121 +{
  122 + return NULL; /* nothing is recorded, so there is never an old value */
  123 +}
  124 +
  125 +static inline
  126 +struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
  127 +{
  128 + return NULL; /* no table exists, so no entry has an owner */
  129 +}
  130 +
  131 +static inline int
  132 +swap_cgroup_swapon(int type, unsigned long max_pages)
  133 +{
  134 + return 0; /* nothing to allocate, always "succeeds" */
  135 +}
  136 +
  137 +static inline void swap_cgroup_swapoff(int type)
  138 +{
  139 + return;
  140 +}
  141 +
  142 +#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */
108 143 #endif
... ... @@ -8,6 +8,7 @@
8 8 #include <linux/memory.h>
9 9 #include <linux/vmalloc.h>
10 10 #include <linux/cgroup.h>
  11 +#include <linux/swapops.h>
11 12  
12 13 static void __meminit
13 14 __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
... ... @@ -267,6 +268,202 @@
267 268 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
268 269 {
269 270 return; /* intentionally empty in this build variant (the selecting #if is outside this hunk) */
  271 +}
  272 +
  273 +#endif
  274 +
  275 +
  276 +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  277 +
  278 +static DEFINE_MUTEX(swap_cgroup_mutex); /* held by swapon/swapoff while they change a ctrl's map and length */
  279 +struct swap_cgroup_ctrl { /* bookkeeping for one swap type */
  280 + struct page **map; /* array of backing pages; NULL while the type has no table */
  281 + unsigned long length; /* number of slots in map[] */
  282 +};
  283 +
  284 +struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; /* indexed by swap type */
  285 +
  286 +/*
  287 + * One record per swap entry. Storing a full pointer (8 bytes on 64-bit)
  288 + * seems big; maybe this can be reduced once a small cgroup "id" can be
      + * stored instead of a pointer.
  289 + */
  290 +struct swap_cgroup {
  291 + struct mem_cgroup *val; /* owner recorded for the entry, NULL if none */
  292 +};
  293 +#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) /* records that fit in one backing page */
  294 +#define SC_POS_MASK (SC_PER_PAGE - 1) /* offset & mask == offset % SC_PER_PAGE only while SC_PER_PAGE is a power of two */
  295 +
  296 +/*
  297 + * SwapCgroup implements "lookup" and "exchange" operations.
  298 + * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
  299 + * against SwapCache. At swap_free(), this is accessed directly from swap.
  300 + *
  301 + * This means,
  302 + * - we have no race in "exchange" when we're accessed via SwapCache because
  303 + * SwapCache(and its swp_entry) is under lock.
  304 + * - When called via swap_free(), there is no user of this entry and no race.
  305 + * Then, we don't need lock around "exchange".
  306 + *
  307 + * TODO: we can push these buffers out to HIGHMEM.
  308 + */
  309 +
  310 +/*
  311 + * swap_cgroup_prepare - allocate the backing pages for one swap type.
      + * @type: index into swap_cgroup_ctrl[]; the caller must already have set
      + *        ctrl->map and ctrl->length for it.
      + *
      + * Fills every ctrl->map[] slot with a page requested with __GFP_ZERO.
      + * If an allocation fails, the pages obtained so far are freed again, but
      + * the map[] slots keep their now-stale pointers, so the caller has to
      + * throw the map away.
      + *
      + * Returns 0 on success (or when swap accounting is disabled) and
      + * -ENOMEM when a page could not be allocated.
  312 + */
  313 +static int swap_cgroup_prepare(int type)
  314 +{
  315 + struct page *page;
  316 + struct swap_cgroup_ctrl *ctrl;
  317 + unsigned long idx, max;
  318 +
  319 + if (!do_swap_account)
  320 + return 0;
  321 + ctrl = &swap_cgroup_ctrl[type];
  322 +
  323 + for (idx = 0; idx < ctrl->length; idx++) {
  324 + page = alloc_page(GFP_KERNEL | __GFP_ZERO);
  325 + if (!page)
  326 + goto not_enough_page;
  327 + ctrl->map[idx] = page;
  328 + }
  329 + return 0;
  330 +not_enough_page:
  331 + max = idx; /* slots [0, max) were filled before the failure */
  332 + for (idx = 0; idx < max; idx++)
  333 + __free_page(ctrl->map[idx]);
  334 +
  335 + return -ENOMEM;
  336 +}
  337 +
  338 +/**
  339 + * swap_cgroup_record - record mem_cgroup for this swp_entry.
  340 + * @ent: swap entry to be recorded into
  341 + * @mem: mem_cgroup to be recorded
  342 + *
  343 + * Returns the value that was recorded before (which may be NULL).
  344 + * Also returns NULL, without recording anything, when swap accounting
      + * is disabled.
      + *
      + * The old value is read and the new one written as two plain statements
      + * with no lock taken here, so callers must make sure nobody else touches
      + * the same entry concurrently. Neither the swap type nor the offset is
      + * range-checked; the table for that type must have been set up already.
  345 + */
  346 +struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
  347 +{
  348 + int type = swp_type(ent);
  349 + unsigned long offset = swp_offset(ent);
  350 + unsigned long idx = offset / SC_PER_PAGE; /* which backing page */
  351 + unsigned long pos = offset & SC_POS_MASK; /* which record inside that page */
  352 + struct swap_cgroup_ctrl *ctrl;
  353 + struct page *mappage;
  354 + struct swap_cgroup *sc;
  355 + struct mem_cgroup *old;
  356 +
  357 + if (!do_swap_account)
  358 + return NULL;
  359 +
  360 + ctrl = &swap_cgroup_ctrl[type];
  361 +
  362 + mappage = ctrl->map[idx];
  363 + sc = page_address(mappage);
  364 + sc += pos;
  365 + old = sc->val; /* exchange: fetch the previous owner ... */
  366 + sc->val = mem; /* ... then store the new one */
  367 +
  368 + return old;
  369 +}
  370 +
  371 +/**
  372 + * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
  373 + * @ent: swap entry to be looked up.
  374 + *
  375 + * Returns the mem_cgroup pointer recorded for @ent, which is NULL when
      + * nothing has been recorded. Also returns NULL when swap accounting is
      + * disabled.
      + *
      + * Read-only and lock-free. Neither the swap type nor the offset is
      + * range-checked; the table for that type must have been set up already.
  376 + */
  377 +struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
  378 +{
  379 + int type = swp_type(ent);
  380 + unsigned long offset = swp_offset(ent);
  381 + unsigned long idx = offset / SC_PER_PAGE; /* which backing page */
  382 + unsigned long pos = offset & SC_POS_MASK; /* which record inside that page */
  383 + struct swap_cgroup_ctrl *ctrl;
  384 + struct page *mappage;
  385 + struct swap_cgroup *sc;
  386 + struct mem_cgroup *ret;
  387 +
  388 + if (!do_swap_account)
  389 + return NULL;
  390 +
  391 + ctrl = &swap_cgroup_ctrl[type];
  392 + mappage = ctrl->map[idx];
  393 + sc = page_address(mappage);
  394 + sc += pos;
  395 + ret = sc->val;
  396 + return ret;
  397 +}
  398 +
  399 +int swap_cgroup_swapon(int type, unsigned long max_pages)
  400 +{
  401 + void *array;
  402 + unsigned long array_size;
  403 + unsigned long length;
  404 + struct swap_cgroup_ctrl *ctrl;
  405 +
  406 + if (!do_swap_account)
  407 + return 0;
  408 +
  409 + length = ((max_pages/SC_PER_PAGE) + 1);
  410 + array_size = length * sizeof(void *);
  411 +
  412 + array = vmalloc(array_size);
  413 + if (!array)
  414 + goto nomem;
  415 +
  416 + memset(array, 0, array_size);
  417 + ctrl = &swap_cgroup_ctrl[type];
  418 + mutex_lock(&swap_cgroup_mutex);
  419 + ctrl->length = length;
  420 + ctrl->map = array;
  421 + if (swap_cgroup_prepare(type)) {
  422 + /* memory shortage */
  423 + ctrl->map = NULL;
  424 + ctrl->length = 0;
  425 + vfree(array);
  426 + mutex_unlock(&swap_cgroup_mutex);
  427 + goto nomem;
  428 + }
  429 + mutex_unlock(&swap_cgroup_mutex);
  430 +
  431 + printk(KERN_INFO
  432 + "swap_cgroup: uses %ld bytes of vmalloc for pointer array space"
  433 + " and %ld bytes to hold mem_cgroup pointers on swap\n",
  434 + array_size, length * PAGE_SIZE);
  435 + printk(KERN_INFO
  436 + "swap_cgroup can be disabled by noswapaccount boot option.\n");
  437 +
  438 + return 0;
  439 +nomem:
  440 + printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
  441 + printk(KERN_INFO
  442 + "swap_cgroup can be disabled by noswapaccount boot option\n");
  443 + return -ENOMEM;
  444 +}
  445 +
  446 +void swap_cgroup_swapoff(int type)
  447 +{
  448 + int i;
  449 + struct swap_cgroup_ctrl *ctrl;
  450 +
  451 + if (!do_swap_account)
  452 + return;
  453 +
  454 + mutex_lock(&swap_cgroup_mutex);
  455 + ctrl = &swap_cgroup_ctrl[type];
  456 + if (ctrl->map) {
  457 + for (i = 0; i < ctrl->length; i++) {
  458 + struct page *page = ctrl->map[i];
  459 + if (page)
  460 + __free_page(page);
  461 + }
  462 + vfree(ctrl->map);
  463 + ctrl->map = NULL;
  464 + ctrl->length = 0;
  465 + }
  466 + mutex_unlock(&swap_cgroup_mutex);
270 467 }
271 468  
272 469 #endif
... ... @@ -33,6 +33,7 @@
33 33 #include <asm/pgtable.h>
34 34 #include <asm/tlbflush.h>
35 35 #include <linux/swapops.h>
  36 +#include <linux/page_cgroup.h>
36 37  
37 38 static DEFINE_SPINLOCK(swap_lock);
38 39 static unsigned int nr_swapfiles;
... ... @@ -1494,6 +1495,9 @@
1494 1495 spin_unlock(&swap_lock);
1495 1496 mutex_unlock(&swapon_mutex);
1496 1497 vfree(swap_map);
  1498 + /* Destroy swap account information */
  1499 + swap_cgroup_swapoff(type);
  1500 +
1497 1501 inode = mapping->host;
1498 1502 if (S_ISBLK(inode->i_mode)) {
1499 1503 struct block_device *bdev = I_BDEV(inode);
... ... @@ -1811,6 +1815,11 @@
1811 1815 }
1812 1816 swap_map[page_nr] = SWAP_MAP_BAD;
1813 1817 }
  1818 +
  1819 + error = swap_cgroup_swapon(type, maxpages);
  1820 + if (error)
  1821 + goto bad_swap;
  1822 +
1814 1823 nr_good_pages = swap_header->info.last_page -
1815 1824 swap_header->info.nr_badpages -
1816 1825 1 /* header page */;
... ... @@ -1882,6 +1891,7 @@
1882 1891 bd_release(bdev);
1883 1892 }
1884 1893 destroy_swap_extents(p);
  1894 + swap_cgroup_swapoff(type);
1885 1895 bad_swap_2:
1886 1896 spin_lock(&swap_lock);
1887 1897 p->swap_file = NULL;