Commit 024914477e15ef8b17f271ec47f1bb8a589f0806

Authored by Daisuke Nishimura
Committed by Linus Torvalds
1 parent 8033b97c9b

memcg: move charges of anonymous swap

This patch is another core part of this move-charge-at-task-migration
feature.  It enables moving charges of anonymous swaps.

To move the charge of swap, we need to exchange swap_cgroup's record.

In current implementation, swap_cgroup's record is protected by:

  - page lock: if the entry is on swap cache.
  - swap_lock: if the entry is not on swap cache.

This works well in usual swap-in/out activity.

But this behavior makes the feature of moving swap charge check many
conditions to exchange swap_cgroup's record safely.

So I changed modification of swap_cgroup's record (swap_cgroup_record())
to use xchg, and define a new function to cmpxchg swap_cgroup's record.

This patch also enables moving the charge of non-pte_present but not uncharged
swap caches, which can exist on the swap-out path, by getting the target
pages via find_get_page() as do_mincore() does.

[kosaki.motohiro@jp.fujitsu.com: fix ia64 build]
[akpm@linux-foundation.org: fix typos]
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 6 changed files with 223 additions and 38 deletions Side-by-side Diff

Documentation/cgroups/memory.txt
... ... @@ -420,6 +420,8 @@
420 420  
421 421 Users can move charges associated with a task along with task migration, that
422 422 is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
  423 +This feature is not supported in !CONFIG_MMU environments because of lack of
  424 +page tables.
423 425  
424 426 8.1 Interface
425 427  
include/linux/page_cgroup.h
... ... @@ -118,6 +118,8 @@
118 118 #include <linux/swap.h>
119 119  
120 120 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  121 +extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
  122 + unsigned short old, unsigned short new);
121 123 extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
122 124 extern unsigned short lookup_swap_cgroup(swp_entry_t ent);
123 125 extern int swap_cgroup_swapon(int type, unsigned long max_pages);
include/linux/swap.h
... ... @@ -355,6 +355,7 @@
355 355 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
356 356 extern void
357 357 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
  358 +extern int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep);
358 359 #else
359 360 static inline void
360 361 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
... ... @@ -484,6 +485,14 @@
484 485 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
485 486 {
486 487 }
  488 +
  489 +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
  490 +static inline int
  491 +mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
  492 +{
  493 + return 0;
  494 +}
  495 +#endif
487 496  
488 497 #endif /* CONFIG_SWAP */
489 498 #endif /* __KERNEL__*/
... ... @@ -33,6 +33,7 @@
33 33 #include <linux/rbtree.h>
34 34 #include <linux/slab.h>
35 35 #include <linux/swap.h>
  36 +#include <linux/swapops.h>
36 37 #include <linux/spinlock.h>
37 38 #include <linux/fs.h>
38 39 #include <linux/seq_file.h>
... ... @@ -2270,6 +2271,54 @@
2270 2271 }
2271 2272 rcu_read_unlock();
2272 2273 }
  2274 +
  2275 +/**
  2276 + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
  2277 + * @entry: swap entry to be moved
  2278 + * @from: mem_cgroup which the entry is moved from
  2279 + * @to: mem_cgroup which the entry is moved to
  2280 + *
  2281 + * It succeeds only when the swap_cgroup's record for this entry is the same
  2282 + * as the mem_cgroup's id of @from.
  2283 + *
  2284 + * Returns 0 on success, -EINVAL on failure.
  2285 + *
  2286 + * The caller must have charged to @to, IOW, called res_counter_charge() about
  2287 + * both res and memsw, and called css_get().
  2288 + */
  2289 +static int mem_cgroup_move_swap_account(swp_entry_t entry,
  2290 + struct mem_cgroup *from, struct mem_cgroup *to)
  2291 +{
  2292 + unsigned short old_id, new_id;
  2293 +
  2294 + old_id = css_id(&from->css);
  2295 + new_id = css_id(&to->css);
  2296 +
  2297 + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
  2298 + if (!mem_cgroup_is_root(from))
  2299 + res_counter_uncharge(&from->memsw, PAGE_SIZE);
  2300 + mem_cgroup_swap_statistics(from, false);
  2301 + mem_cgroup_put(from);
  2302 + /*
  2303 + * we charged both to->res and to->memsw, so we should uncharge
  2304 + * to->res.
  2305 + */
  2306 + if (!mem_cgroup_is_root(to))
  2307 + res_counter_uncharge(&to->res, PAGE_SIZE);
  2308 + mem_cgroup_swap_statistics(to, true);
  2309 + mem_cgroup_get(to);
  2310 + css_put(&to->css);
  2311 +
  2312 + return 0;
  2313 + }
  2314 + return -EINVAL;
  2315 +}
  2316 +#else
  2317 +static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
  2318 + struct mem_cgroup *from, struct mem_cgroup *to)
  2319 +{
  2320 + return -EINVAL;
  2321 +}
2273 2322 #endif
2274 2323  
2275 2324 /*
... ... @@ -2949,6 +2998,7 @@
2949 2998 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
2950 2999 }
2951 3000  
  3001 +#ifdef CONFIG_MMU
2952 3002 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
2953 3003 struct cftype *cft, u64 val)
2954 3004 {
... ... @@ -2967,6 +3017,13 @@
2967 3017  
2968 3018 return 0;
2969 3019 }
  3020 +#else
  3021 +static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
  3022 + struct cftype *cft, u64 val)
  3023 +{
  3024 + return -ENOSYS;
  3025 +}
  3026 +#endif
2970 3027  
2971 3028  
2972 3029 /* For read statistics */
... ... @@ -3489,6 +3546,7 @@
3489 3546 return ret;
3490 3547 }
3491 3548  
  3549 +#ifdef CONFIG_MMU
3492 3550 /* Handlers for move charge at task migration. */
3493 3551 #define PRECHARGE_COUNT_AT_ONCE 256
3494 3552 static int mem_cgroup_do_precharge(unsigned long count)
3495 3553  
3496 3554  
3497 3555  
3498 3556  
3499 3557  
3500 3558  
3501 3559  
3502 3560  
3503 3561  
3504 3562  
3505 3563  
3506 3564  
... ... @@ -3544,77 +3602,124 @@
3544 3602 }
3545 3603 return ret;
3546 3604 }
  3605 +#else /* !CONFIG_MMU */
  3606 +static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
  3607 + struct cgroup *cgroup,
  3608 + struct task_struct *p,
  3609 + bool threadgroup)
  3610 +{
  3611 + return 0;
  3612 +}
  3613 +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
  3614 + struct cgroup *cgroup,
  3615 + struct task_struct *p,
  3616 + bool threadgroup)
  3617 +{
  3618 +}
  3619 +static void mem_cgroup_move_task(struct cgroup_subsys *ss,
  3620 + struct cgroup *cont,
  3621 + struct cgroup *old_cont,
  3622 + struct task_struct *p,
  3623 + bool threadgroup)
  3624 +{
  3625 +}
  3626 +#endif
3547 3627  
3548 3628 /**
3549 3629 * is_target_pte_for_mc - check a pte whether it is valid for move charge
3550 3630 * @vma: the vma the pte to be checked belongs
3551 3631 * @addr: the address corresponding to the pte to be checked
3552 3632 * @ptent: the pte to be checked
3553   - * @target: the pointer the target page will be stored(can be NULL)
  3633 + * @target: the pointer the target page or swap ent will be stored(can be NULL)
3554 3634 *
3555 3635 * Returns
3556 3636 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
3557 3637 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
3558 3638 * move charge. if @target is not NULL, the page is stored in target->page
3559 3639 * with extra refcnt got(Callers should handle it).
  3640 + * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  3641 + * target for charge migration. if @target is not NULL, the entry is stored
  3642 + * in target->ent.
3560 3643 *
3561 3644 * Called with pte lock held.
3562 3645 */
3563   -/* We add a new member later. */
3564 3646 union mc_target {
3565 3647 struct page *page;
  3648 + swp_entry_t ent;
3566 3649 };
3567 3650  
3568   -/* We add a new type later. */
3569 3651 enum mc_target_type {
3570 3652 MC_TARGET_NONE, /* not used */
3571 3653 MC_TARGET_PAGE,
  3654 + MC_TARGET_SWAP,
3572 3655 };
3573 3656  
3574 3657 static int is_target_pte_for_mc(struct vm_area_struct *vma,
3575 3658 unsigned long addr, pte_t ptent, union mc_target *target)
3576 3659 {
3577   - struct page *page;
  3660 + struct page *page = NULL;
3578 3661 struct page_cgroup *pc;
3579 3662 int ret = 0;
  3663 + swp_entry_t ent = { .val = 0 };
  3664 + int usage_count = 0;
3580 3665 bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
3581 3666 &mc.to->move_charge_at_immigrate);
3582 3667  
3583   - if (!pte_present(ptent))
  3668 + if (!pte_present(ptent)) {
  3669 + /* TODO: handle swap of shmem/tmpfs */
  3670 + if (pte_none(ptent) || pte_file(ptent))
  3671 + return 0;
  3672 + else if (is_swap_pte(ptent)) {
  3673 + ent = pte_to_swp_entry(ptent);
  3674 + if (!move_anon || non_swap_entry(ent))
  3675 + return 0;
  3676 + usage_count = mem_cgroup_count_swap_user(ent, &page);
  3677 + }
  3678 + } else {
  3679 + page = vm_normal_page(vma, addr, ptent);
  3680 + if (!page || !page_mapped(page))
  3681 + return 0;
  3682 + /*
  3683 + * TODO: We don't move charges of file(including shmem/tmpfs)
  3684 + * pages for now.
  3685 + */
  3686 + if (!move_anon || !PageAnon(page))
  3687 + return 0;
  3688 + if (!get_page_unless_zero(page))
  3689 + return 0;
  3690 + usage_count = page_mapcount(page);
  3691 + }
  3692 + if (usage_count > 1) {
  3693 + /*
  3694 + * TODO: We don't move charges of shared(used by multiple
  3695 + * processes) pages for now.
  3696 + */
  3697 + if (page)
  3698 + put_page(page);
3584 3699 return 0;
3585   -
3586   - page = vm_normal_page(vma, addr, ptent);
3587   - if (!page || !page_mapped(page))
3588   - return 0;
3589   - /*
3590   - * TODO: We don't move charges of file(including shmem/tmpfs) pages for
3591   - * now.
3592   - */
3593   - if (!move_anon || !PageAnon(page))
3594   - return 0;
3595   - /*
3596   - * TODO: We don't move charges of shared(used by multiple processes)
3597   - * pages for now.
3598   - */
3599   - if (page_mapcount(page) > 1)
3600   - return 0;
3601   - if (!get_page_unless_zero(page))
3602   - return 0;
3603   -
3604   - pc = lookup_page_cgroup(page);
3605   - /*
3606   - * Do only loose check w/o page_cgroup lock. mem_cgroup_move_account()
3607   - * checks the pc is valid or not under the lock.
3608   - */
3609   - if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
3610   - ret = MC_TARGET_PAGE;
  3700 + }
  3701 + if (page) {
  3702 + pc = lookup_page_cgroup(page);
  3703 + /*
  3704 + * Do only loose check w/o page_cgroup lock.
  3705 + * mem_cgroup_move_account() checks the pc is valid or not under
  3706 + * the lock.
  3707 + */
  3708 + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
  3709 + ret = MC_TARGET_PAGE;
  3710 + if (target)
  3711 + target->page = page;
  3712 + }
  3713 + if (!ret || !target)
  3714 + put_page(page);
  3715 + }
  3716 + /* fall through */
  3717 + if (ent.val && do_swap_account && !ret &&
  3718 + css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
  3719 + ret = MC_TARGET_SWAP;
3611 3720 if (target)
3612   - target->page = page;
  3721 + target->ent = ent;
3613 3722 }
3614   -
3615   - if (!ret || !target)
3616   - put_page(page);
3617   -
3618 3723 return ret;
3619 3724 }
3620 3725  
... ... @@ -3754,6 +3859,7 @@
3754 3859 int type;
3755 3860 struct page *page;
3756 3861 struct page_cgroup *pc;
  3862 + swp_entry_t ent;
3757 3863  
3758 3864 if (!mc.precharge)
3759 3865 break;
... ... @@ -3774,6 +3880,11 @@
3774 3880 putback_lru_page(page);
3775 3881 put: /* is_target_pte_for_mc() gets the page */
3776 3882 put_page(page);
  3883 + break;
  3884 + case MC_TARGET_SWAP:
  3885 + ent = target.ent;
  3886 + if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to))
  3887 + mc.precharge--;
3777 3888 break;
3778 3889 default:
3779 3890 break;
... ... @@ -335,6 +335,37 @@
335 335 }
336 336  
337 337 /**
  338 + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
  339 + * @ent: swap entry to be cmpxchged
  340 + * @old: old id
  341 + * @new: new id
  342 + *
  343 + * Returns old id at success, 0 at failure.
  344 + * (There is no mem_cgroup using 0 as its id)
  345 + */
  346 +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
  347 + unsigned short old, unsigned short new)
  348 +{
  349 + int type = swp_type(ent);
  350 + unsigned long offset = swp_offset(ent);
  351 + unsigned long idx = offset / SC_PER_PAGE;
  352 + unsigned long pos = offset & SC_POS_MASK;
  353 + struct swap_cgroup_ctrl *ctrl;
  354 + struct page *mappage;
  355 + struct swap_cgroup *sc;
  356 +
  357 + ctrl = &swap_cgroup_ctrl[type];
  358 +
  359 + mappage = ctrl->map[idx];
  360 + sc = page_address(mappage);
  361 + sc += pos;
  362 + if (cmpxchg(&sc->id, old, new) == old)
  363 + return old;
  364 + else
  365 + return 0;
  366 +}
  367 +
  368 +/**
338 369 * swap_cgroup_record - record mem_cgroup for this swp_entry.
339 370 * @ent: swap entry to be recorded into
340 371 * @mem: mem_cgroup to be recorded
... ... @@ -358,8 +389,7 @@
358 389 mappage = ctrl->map[idx];
359 390 sc = page_address(mappage);
360 391 sc += pos;
361   - old = sc->id;
362   - sc->id = id;
  392 + old = xchg(&sc->id, id);
363 393  
364 394 return old;
365 395 }
... ... @@ -723,6 +723,37 @@
723 723 return p != NULL;
724 724 }
725 725  
  726 +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
  727 +/**
  728 + * mem_cgroup_count_swap_user - count the user of a swap entry
  729 + * @ent: the swap entry to be checked
  730 + * @pagep: the pointer for the swap cache page of the entry to be stored
  731 + *
  732 + * Returns the number of the user of the swap entry. The number is valid only
  733 + * for swaps of anonymous pages.
  734 + * If the entry is found on swap cache, the page is stored to pagep with
  735 + * refcount of it being incremented.
  736 + */
  737 +int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
  738 +{
  739 + struct page *page;
  740 + struct swap_info_struct *p;
  741 + int count = 0;
  742 +
  743 + page = find_get_page(&swapper_space, ent.val);
  744 + if (page)
  745 + count += page_mapcount(page);
  746 + p = swap_info_get(ent);
  747 + if (p) {
  748 + count += swap_count(p->swap_map[swp_offset(ent)]);
  749 + spin_unlock(&swap_lock);
  750 + }
  751 +
  752 + *pagep = page;
  753 + return count;
  754 +}
  755 +#endif
  756 +
726 757 #ifdef CONFIG_HIBERNATION
727 758 /*
728 759 * Find the swap type that corresponds to given device (if any).