Commit 4ffef5feff4e4240e767d2f1144b1634a41762e3

Authored by Daisuke Nishimura
Committed by Linus Torvalds
1 parent 7dc74be032

memcg: move charges of anonymous page

This patch is the core part of the move-charge-at-task-migration feature.
It implements the functions that move charges of anonymous pages mapped only
by the target task.

Implementation:
- define struct move_charge_struct and a variable of it (mc) to remember the
  count of precharges and other information.
- At can_attach(), get the anon_rss of the target mm, call __mem_cgroup_try_charge()
  repeatedly, and count up mc.precharge.
- At attach(), parse the page tables, find each target page to be moved, and
  call mem_cgroup_move_account() on it.
- Cancel all precharges if mc.precharge > 0 on failure or at the end of the
  task move (the resulting call sequence is sketched below).
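
A condensed sketch of that sequence, distilled from the hunks below (the
function and field names match the patch, but arguments, locking and error
paths are trimmed, so treat it as an overview rather than a compilable
excerpt):

    /* can_attach(): record the pending move and pre-charge the destination */
    mc.from = from;
    mc.to = mem;
    mc.precharge = 0;
    ret = mem_cgroup_precharge_mc(mm);   /* one __mem_cgroup_try_charge() per movable pte */
    if (ret)
            mem_cgroup_clear_mc();       /* cancel every precharge on failure */

    /* cancel_attach(): the migration was aborted, drop all precharges */
    mem_cgroup_clear_mc();

    /* attach, i.e. mem_cgroup_move_task(): consume the precharges */
    mem_cgroup_move_charge(mm);          /* walk the ptes, mem_cgroup_move_account() per target page */
    mem_cgroup_clear_mc();               /* uncharge any precharge left over */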

[akpm@linux-foundation.org: a little simplification]
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 284 additions and 10 deletions

... ... @@ -21,6 +21,7 @@
21 21 #include <linux/memcontrol.h>
22 22 #include <linux/cgroup.h>
23 23 #include <linux/mm.h>
  24 +#include <linux/hugetlb.h>
24 25 #include <linux/pagemap.h>
25 26 #include <linux/smp.h>
26 27 #include <linux/page-flags.h>
27 28  
... ... @@ -243,9 +244,17 @@
243 244 * left-shifted bitmap of these types.
244 245 */
245 246 enum move_type {
  247 + MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
246 248 NR_MOVE_TYPE,
247 249 };
248 250  
  251 +/* "mc" and its members are protected by cgroup_mutex */
  252 +static struct move_charge_struct {
  253 + struct mem_cgroup *from;
  254 + struct mem_cgroup *to;
  255 + unsigned long precharge;
  256 +} mc;
  257 +
249 258 /*
250 259 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
251 260 * limit reclaim to prevent infinite loops, if they ever occur.
... ... @@ -1513,7 +1522,7 @@
1513 1522 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1514 1523 * if they exceeds softlimit.
1515 1524 */
1516   - if (mem_cgroup_soft_limit_check(mem))
  1525 + if (page && mem_cgroup_soft_limit_check(mem))
1517 1526 mem_cgroup_update_tree(mem, page);
1518 1527 done:
1519 1528 return 0;
... ... @@ -1690,8 +1699,9 @@
1690 1699 /*
1691 1700 * We charges against "to" which may not have any tasks. Then, "to"
1692 1701 * can be under rmdir(). But in current implementation, caller of
1693   - * this function is just force_empty() and it's garanteed that
1694   - * "to" is never removed. So, we don't check rmdir status here.
  1702 + * this function is just force_empty() and move charge, so it's
  1703 + * garanteed that "to" is never removed. So, we don't check rmdir
  1704 + * status here.
1695 1705 */
1696 1706 }
1697 1707  
1698 1708  
1699 1709  
... ... @@ -3428,11 +3438,171 @@
3428 3438 }
3429 3439  
3430 3440 /* Handlers for move charge at task migration. */
3431   -static int mem_cgroup_can_move_charge(void)
  3441 +static int mem_cgroup_do_precharge(void)
3432 3442 {
  3443 + int ret = -ENOMEM;
  3444 + struct mem_cgroup *mem = mc.to;
  3445 +
  3446 + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, NULL);
  3447 + if (ret || !mem)
  3448 + return -ENOMEM;
  3449 +
  3450 + mc.precharge++;
  3451 + return ret;
  3452 +}
  3453 +
  3454 +/**
  3455 + * is_target_pte_for_mc - check a pte whether it is valid for move charge
  3456 + * @vma: the vma the pte to be checked belongs
  3457 + * @addr: the address corresponding to the pte to be checked
  3458 + * @ptent: the pte to be checked
  3459 + * @target: the pointer the target page will be stored(can be NULL)
  3460 + *
  3461 + * Returns
  3462 + * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
  3463 + * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
  3464 + * move charge. if @target is not NULL, the page is stored in target->page
  3465 + * with extra refcnt got(Callers should handle it).
  3466 + *
  3467 + * Called with pte lock held.
  3468 + */
  3469 +/* We add a new member later. */
  3470 +union mc_target {
  3471 + struct page *page;
  3472 +};
  3473 +
  3474 +/* We add a new type later. */
  3475 +enum mc_target_type {
  3476 + MC_TARGET_NONE, /* not used */
  3477 + MC_TARGET_PAGE,
  3478 +};
  3479 +
  3480 +static int is_target_pte_for_mc(struct vm_area_struct *vma,
  3481 + unsigned long addr, pte_t ptent, union mc_target *target)
  3482 +{
  3483 + struct page *page;
  3484 + struct page_cgroup *pc;
  3485 + int ret = 0;
  3486 + bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
  3487 + &mc.to->move_charge_at_immigrate);
  3488 +
  3489 + if (!pte_present(ptent))
  3490 + return 0;
  3491 +
  3492 + page = vm_normal_page(vma, addr, ptent);
  3493 + if (!page || !page_mapped(page))
  3494 + return 0;
  3495 + /*
  3496 + * TODO: We don't move charges of file(including shmem/tmpfs) pages for
  3497 + * now.
  3498 + */
  3499 + if (!move_anon || !PageAnon(page))
  3500 + return 0;
  3501 + /*
  3502 + * TODO: We don't move charges of shared(used by multiple processes)
  3503 + * pages for now.
  3504 + */
  3505 + if (page_mapcount(page) > 1)
  3506 + return 0;
  3507 + if (!get_page_unless_zero(page))
  3508 + return 0;
  3509 +
  3510 + pc = lookup_page_cgroup(page);
  3511 + /*
  3512 + * Do only loose check w/o page_cgroup lock. mem_cgroup_move_account()
  3513 + * checks the pc is valid or not under the lock.
  3514 + */
  3515 + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
  3516 + ret = MC_TARGET_PAGE;
  3517 + if (target)
  3518 + target->page = page;
  3519 + }
  3520 +
  3521 + if (!ret || !target)
  3522 + put_page(page);
  3523 +
  3524 + return ret;
  3525 +}
  3526 +
  3527 +static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
  3528 + unsigned long addr, unsigned long end,
  3529 + struct mm_walk *walk)
  3530 +{
  3531 + struct vm_area_struct *vma = walk->private;
  3532 + pte_t *pte;
  3533 + spinlock_t *ptl;
  3534 +
  3535 + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  3536 + for (; addr != end; pte++, addr += PAGE_SIZE)
  3537 + if (is_target_pte_for_mc(vma, addr, *pte, NULL))
  3538 + mc.precharge++; /* increment precharge temporarily */
  3539 + pte_unmap_unlock(pte - 1, ptl);
  3540 + cond_resched();
  3541 +
3433 3542 return 0;
3434 3543 }
3435 3544  
  3545 +static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
  3546 +{
  3547 + unsigned long precharge;
  3548 + struct vm_area_struct *vma;
  3549 +
  3550 + down_read(&mm->mmap_sem);
  3551 + for (vma = mm->mmap; vma; vma = vma->vm_next) {
  3552 + struct mm_walk mem_cgroup_count_precharge_walk = {
  3553 + .pmd_entry = mem_cgroup_count_precharge_pte_range,
  3554 + .mm = mm,
  3555 + .private = vma,
  3556 + };
  3557 + if (is_vm_hugetlb_page(vma))
  3558 + continue;
  3559 + /* TODO: We don't move charges of shmem/tmpfs pages for now. */
  3560 + if (vma->vm_flags & VM_SHARED)
  3561 + continue;
  3562 + walk_page_range(vma->vm_start, vma->vm_end,
  3563 + &mem_cgroup_count_precharge_walk);
  3564 + }
  3565 + up_read(&mm->mmap_sem);
  3566 +
  3567 + precharge = mc.precharge;
  3568 + mc.precharge = 0;
  3569 +
  3570 + return precharge;
  3571 +}
  3572 +
  3573 +#define PRECHARGE_AT_ONCE 256
  3574 +static int mem_cgroup_precharge_mc(struct mm_struct *mm)
  3575 +{
  3576 + int ret = 0;
  3577 + int count = PRECHARGE_AT_ONCE;
  3578 + unsigned long precharge = mem_cgroup_count_precharge(mm);
  3579 +
  3580 + while (!ret && precharge--) {
  3581 + if (signal_pending(current)) {
  3582 + ret = -EINTR;
  3583 + break;
  3584 + }
  3585 + if (!count--) {
  3586 + count = PRECHARGE_AT_ONCE;
  3587 + cond_resched();
  3588 + }
  3589 + ret = mem_cgroup_do_precharge();
  3590 + }
  3591 +
  3592 + return ret;
  3593 +}
  3594 +
  3595 +static void mem_cgroup_clear_mc(void)
  3596 +{
  3597 + /* we must uncharge all the leftover precharges from mc.to */
  3598 + while (mc.precharge) {
  3599 + mem_cgroup_cancel_charge(mc.to);
  3600 + mc.precharge--;
  3601 + }
  3602 + mc.from = NULL;
  3603 + mc.to = NULL;
  3604 +}
  3605 +
3436 3606 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
3437 3607 struct cgroup *cgroup,
3438 3608 struct task_struct *p,
3439 3609  
3440 3610  
... ... @@ -3450,11 +3620,19 @@
3450 3620 mm = get_task_mm(p);
3451 3621 if (!mm)
3452 3622 return 0;
3453   -
3454 3623 /* We move charges only when we move a owner of the mm */
3455   - if (mm->owner == p)
3456   - ret = mem_cgroup_can_move_charge();
  3624 + if (mm->owner == p) {
  3625 + VM_BUG_ON(mc.from);
  3626 + VM_BUG_ON(mc.to);
  3627 + VM_BUG_ON(mc.precharge);
  3628 + mc.from = from;
  3629 + mc.to = mem;
  3630 + mc.precharge = 0;
3457 3631  
  3632 + ret = mem_cgroup_precharge_mc(mm);
  3633 + if (ret)
  3634 + mem_cgroup_clear_mc();
  3635 + }
3458 3636 mmput(mm);
3459 3637 }
3460 3638 return ret;
3461 3639  
3462 3640  
3463 3641  
3464 3642  
... ... @@ -3465,19 +3643,115 @@
3465 3643 struct task_struct *p,
3466 3644 bool threadgroup)
3467 3645 {
  3646 + mem_cgroup_clear_mc();
3468 3647 }
3469 3648  
3470   -static void mem_cgroup_move_charge(void)
  3649 +static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
  3650 + unsigned long addr, unsigned long end,
  3651 + struct mm_walk *walk)
3471 3652 {
  3653 + int ret = 0;
  3654 + struct vm_area_struct *vma = walk->private;
  3655 + pte_t *pte;
  3656 + spinlock_t *ptl;
  3657 +
  3658 +retry:
  3659 + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  3660 + for (; addr != end; addr += PAGE_SIZE) {
  3661 + pte_t ptent = *(pte++);
  3662 + union mc_target target;
  3663 + int type;
  3664 + struct page *page;
  3665 + struct page_cgroup *pc;
  3666 +
  3667 + if (!mc.precharge)
  3668 + break;
  3669 +
  3670 + type = is_target_pte_for_mc(vma, addr, ptent, &target);
  3671 + switch (type) {
  3672 + case MC_TARGET_PAGE:
  3673 + page = target.page;
  3674 + if (isolate_lru_page(page))
  3675 + goto put;
  3676 + pc = lookup_page_cgroup(page);
  3677 + if (!mem_cgroup_move_account(pc, mc.from, mc.to)) {
  3678 + css_put(&mc.to->css);
  3679 + mc.precharge--;
  3680 + }
  3681 + putback_lru_page(page);
  3682 +put: /* is_target_pte_for_mc() gets the page */
  3683 + put_page(page);
  3684 + break;
  3685 + default:
  3686 + break;
  3687 + }
  3688 + }
  3689 + pte_unmap_unlock(pte - 1, ptl);
  3690 + cond_resched();
  3691 +
  3692 + if (addr != end) {
  3693 + /*
  3694 + * We have consumed all precharges we got in can_attach().
  3695 + * We try charge one by one, but don't do any additional
  3696 + * charges to mc.to if we have failed in charge once in attach()
  3697 + * phase.
  3698 + */
  3699 + ret = mem_cgroup_do_precharge();
  3700 + if (!ret)
  3701 + goto retry;
  3702 + }
  3703 +
  3704 + return ret;
3472 3705 }
3473 3706  
  3707 +static void mem_cgroup_move_charge(struct mm_struct *mm)
  3708 +{
  3709 + struct vm_area_struct *vma;
  3710 +
  3711 + lru_add_drain_all();
  3712 + down_read(&mm->mmap_sem);
  3713 + for (vma = mm->mmap; vma; vma = vma->vm_next) {
  3714 + int ret;
  3715 + struct mm_walk mem_cgroup_move_charge_walk = {
  3716 + .pmd_entry = mem_cgroup_move_charge_pte_range,
  3717 + .mm = mm,
  3718 + .private = vma,
  3719 + };
  3720 + if (is_vm_hugetlb_page(vma))
  3721 + continue;
  3722 + /* TODO: We don't move charges of shmem/tmpfs pages for now. */
  3723 + if (vma->vm_flags & VM_SHARED)
  3724 + continue;
  3725 + ret = walk_page_range(vma->vm_start, vma->vm_end,
  3726 + &mem_cgroup_move_charge_walk);
  3727 + if (ret)
  3728 + /*
  3729 + * means we have consumed all precharges and failed in
  3730 + * doing additional charge. Just abandon here.
  3731 + */
  3732 + break;
  3733 + }
  3734 + up_read(&mm->mmap_sem);
  3735 +}
  3736 +
3474 3737 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3475 3738 struct cgroup *cont,
3476 3739 struct cgroup *old_cont,
3477 3740 struct task_struct *p,
3478 3741 bool threadgroup)
3479 3742 {
3480   - mem_cgroup_move_charge();
  3743 + struct mm_struct *mm;
  3744 +
  3745 + if (!mc.to)
  3746 + /* no need to move charge */
  3747 + return;
  3748 +
  3749 + mm = get_task_mm(p);
  3750 + if (mm) {
  3751 + mem_cgroup_move_charge(mm);
  3752 + mmput(mm);
  3753 + }
  3754 + mem_cgroup_clear_mc();
3481 3755 }
3482 3756  
3483 3757 struct cgroup_subsys mem_cgroup_subsys = {