Commit 4ffef5feff4e4240e767d2f1144b1634a41762e3
Committed by
Linus Torvalds
1 parent
7dc74be032
Exists in
master
and in
7 other branches
memcg: move charges of anonymous page
This patch is the core part of this move-charge-at-task-migration feature. It implements functions to move charges of anonymous pages mapped only by the target task. Implementation: - define struct move_charge_struct and a variable of it (mc) to remember the count of pre-charges and other information. - At can_attach(), get anon_rss of the target mm, call __mem_cgroup_try_charge() repeatedly and count up mc.precharge. - At attach(), parse the page table, find a target page to be moved, and call mem_cgroup_move_account() about the page. - Cancel all precharges if mc.precharge > 0 on failure or at the end of task move. [akpm@linux-foundation.org: a little simplification] Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 284 additions and 10 deletions Side-by-side Diff
mm/memcontrol.c
... | ... | @@ -21,6 +21,7 @@ |
21 | 21 | #include <linux/memcontrol.h> |
22 | 22 | #include <linux/cgroup.h> |
23 | 23 | #include <linux/mm.h> |
24 | +#include <linux/hugetlb.h> | |
24 | 25 | #include <linux/pagemap.h> |
25 | 26 | #include <linux/smp.h> |
26 | 27 | #include <linux/page-flags.h> |
27 | 28 | |
... | ... | @@ -243,9 +244,17 @@ |
243 | 244 | * left-shifted bitmap of these types. |
244 | 245 | */ |
245 | 246 | enum move_type { |
247 | + MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | |
246 | 248 | NR_MOVE_TYPE, |
247 | 249 | }; |
248 | 250 | |
251 | +/* "mc" and its members are protected by cgroup_mutex */ | |
252 | +static struct move_charge_struct { | |
253 | + struct mem_cgroup *from; | |
254 | + struct mem_cgroup *to; | |
255 | + unsigned long precharge; | |
256 | +} mc; | |
257 | + | |
249 | 258 | /* |
250 | 259 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
251 | 260 | * limit reclaim to prevent infinite loops, if they ever occur. |
... | ... | @@ -1513,7 +1522,7 @@ |
1513 | 1522 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
1514 | 1523 | * if they exceeds softlimit. |
1515 | 1524 | */ |
1516 | - if (mem_cgroup_soft_limit_check(mem)) | |
1525 | + if (page && mem_cgroup_soft_limit_check(mem)) | |
1517 | 1526 | mem_cgroup_update_tree(mem, page); |
1518 | 1527 | done: |
1519 | 1528 | return 0; |
... | ... | @@ -1690,8 +1699,9 @@ |
1690 | 1699 | /* |
1691 | 1700 | * We charges against "to" which may not have any tasks. Then, "to" |
1692 | 1701 | * can be under rmdir(). But in current implementation, caller of |
1693 | - * this function is just force_empty() and it's garanteed that | |
1694 | - * "to" is never removed. So, we don't check rmdir status here. | |
1702 | + * this function is just force_empty() and move charge, so it's | |
1703 | + * garanteed that "to" is never removed. So, we don't check rmdir | |
1704 | + * status here. | |
1695 | 1705 | */ |
1696 | 1706 | } |
1697 | 1707 | |
1698 | 1708 | |
1699 | 1709 | |
... | ... | @@ -3428,11 +3438,171 @@ |
3428 | 3438 | } |
3429 | 3439 | |
3430 | 3440 | /* Handlers for move charge at task migration. */ |
3431 | -static int mem_cgroup_can_move_charge(void) | |
3441 | +static int mem_cgroup_do_precharge(void) | |
3432 | 3442 | { |
3443 | + int ret = -ENOMEM; | |
3444 | + struct mem_cgroup *mem = mc.to; | |
3445 | + | |
3446 | + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, NULL); | |
3447 | + if (ret || !mem) | |
3448 | + return -ENOMEM; | |
3449 | + | |
3450 | + mc.precharge++; | |
3451 | + return ret; | |
3452 | +} | |
3453 | + | |
3454 | +/** | |
3455 | + * is_target_pte_for_mc - check a pte whether it is valid for move charge | |
3456 | + * @vma: the vma the pte to be checked belongs | |
3457 | + * @addr: the address corresponding to the pte to be checked | |
3458 | + * @ptent: the pte to be checked | |
3459 | + * @target: the pointer the target page will be stored(can be NULL) | |
3460 | + * | |
3461 | + * Returns | |
3462 | + * 0(MC_TARGET_NONE): if the pte is not a target for move charge. | |
3463 | + * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | |
3464 | + * move charge. if @target is not NULL, the page is stored in target->page | |
3465 | + * with extra refcnt got(Callers should handle it). | |
3466 | + * | |
3467 | + * Called with pte lock held. | |
3468 | + */ | |
3469 | +/* We add a new member later. */ | |
3470 | +union mc_target { | |
3471 | + struct page *page; | |
3472 | +}; | |
3473 | + | |
3474 | +/* We add a new type later. */ | |
3475 | +enum mc_target_type { | |
3476 | + MC_TARGET_NONE, /* not used */ | |
3477 | + MC_TARGET_PAGE, | |
3478 | +}; | |
3479 | + | |
3480 | +static int is_target_pte_for_mc(struct vm_area_struct *vma, | |
3481 | + unsigned long addr, pte_t ptent, union mc_target *target) | |
3482 | +{ | |
3483 | + struct page *page; | |
3484 | + struct page_cgroup *pc; | |
3485 | + int ret = 0; | |
3486 | + bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | |
3487 | + &mc.to->move_charge_at_immigrate); | |
3488 | + | |
3489 | + if (!pte_present(ptent)) | |
3490 | + return 0; | |
3491 | + | |
3492 | + page = vm_normal_page(vma, addr, ptent); | |
3493 | + if (!page || !page_mapped(page)) | |
3494 | + return 0; | |
3495 | + /* | |
3496 | + * TODO: We don't move charges of file(including shmem/tmpfs) pages for | |
3497 | + * now. | |
3498 | + */ | |
3499 | + if (!move_anon || !PageAnon(page)) | |
3500 | + return 0; | |
3501 | + /* | |
3502 | + * TODO: We don't move charges of shared(used by multiple processes) | |
3503 | + * pages for now. | |
3504 | + */ | |
3505 | + if (page_mapcount(page) > 1) | |
3506 | + return 0; | |
3507 | + if (!get_page_unless_zero(page)) | |
3508 | + return 0; | |
3509 | + | |
3510 | + pc = lookup_page_cgroup(page); | |
3511 | + /* | |
3512 | + * Do only loose check w/o page_cgroup lock. mem_cgroup_move_account() | |
3513 | + * checks the pc is valid or not under the lock. | |
3514 | + */ | |
3515 | + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | |
3516 | + ret = MC_TARGET_PAGE; | |
3517 | + if (target) | |
3518 | + target->page = page; | |
3519 | + } | |
3520 | + | |
3521 | + if (!ret || !target) | |
3522 | + put_page(page); | |
3523 | + | |
3524 | + return ret; | |
3525 | +} | |
3526 | + | |
3527 | +static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |
3528 | + unsigned long addr, unsigned long end, | |
3529 | + struct mm_walk *walk) | |
3530 | +{ | |
3531 | + struct vm_area_struct *vma = walk->private; | |
3532 | + pte_t *pte; | |
3533 | + spinlock_t *ptl; | |
3534 | + | |
3535 | + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | |
3536 | + for (; addr != end; pte++, addr += PAGE_SIZE) | |
3537 | + if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | |
3538 | + mc.precharge++; /* increment precharge temporarily */ | |
3539 | + pte_unmap_unlock(pte - 1, ptl); | |
3540 | + cond_resched(); | |
3541 | + | |
3433 | 3542 | return 0; |
3434 | 3543 | } |
3435 | 3544 | |
3545 | +static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |
3546 | +{ | |
3547 | + unsigned long precharge; | |
3548 | + struct vm_area_struct *vma; | |
3549 | + | |
3550 | + down_read(&mm->mmap_sem); | |
3551 | + for (vma = mm->mmap; vma; vma = vma->vm_next) { | |
3552 | + struct mm_walk mem_cgroup_count_precharge_walk = { | |
3553 | + .pmd_entry = mem_cgroup_count_precharge_pte_range, | |
3554 | + .mm = mm, | |
3555 | + .private = vma, | |
3556 | + }; | |
3557 | + if (is_vm_hugetlb_page(vma)) | |
3558 | + continue; | |
3559 | + /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | |
3560 | + if (vma->vm_flags & VM_SHARED) | |
3561 | + continue; | |
3562 | + walk_page_range(vma->vm_start, vma->vm_end, | |
3563 | + &mem_cgroup_count_precharge_walk); | |
3564 | + } | |
3565 | + up_read(&mm->mmap_sem); | |
3566 | + | |
3567 | + precharge = mc.precharge; | |
3568 | + mc.precharge = 0; | |
3569 | + | |
3570 | + return precharge; | |
3571 | +} | |
3572 | + | |
3573 | +#define PRECHARGE_AT_ONCE 256 | |
3574 | +static int mem_cgroup_precharge_mc(struct mm_struct *mm) | |
3575 | +{ | |
3576 | + int ret = 0; | |
3577 | + int count = PRECHARGE_AT_ONCE; | |
3578 | + unsigned long precharge = mem_cgroup_count_precharge(mm); | |
3579 | + | |
3580 | + while (!ret && precharge--) { | |
3581 | + if (signal_pending(current)) { | |
3582 | + ret = -EINTR; | |
3583 | + break; | |
3584 | + } | |
3585 | + if (!count--) { | |
3586 | + count = PRECHARGE_AT_ONCE; | |
3587 | + cond_resched(); | |
3588 | + } | |
3589 | + ret = mem_cgroup_do_precharge(); | |
3590 | + } | |
3591 | + | |
3592 | + return ret; | |
3593 | +} | |
3594 | + | |
3595 | +static void mem_cgroup_clear_mc(void) | |
3596 | +{ | |
3597 | + /* we must uncharge all the leftover precharges from mc.to */ | |
3598 | + while (mc.precharge) { | |
3599 | + mem_cgroup_cancel_charge(mc.to); | |
3600 | + mc.precharge--; | |
3601 | + } | |
3602 | + mc.from = NULL; | |
3603 | + mc.to = NULL; | |
3604 | +} | |
3605 | + | |
3436 | 3606 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
3437 | 3607 | struct cgroup *cgroup, |
3438 | 3608 | struct task_struct *p, |
3439 | 3609 | |
3440 | 3610 | |
... | ... | @@ -3450,11 +3620,19 @@ |
3450 | 3620 | mm = get_task_mm(p); |
3451 | 3621 | if (!mm) |
3452 | 3622 | return 0; |
3453 | - | |
3454 | 3623 | /* We move charges only when we move a owner of the mm */ |
3455 | - if (mm->owner == p) | |
3456 | - ret = mem_cgroup_can_move_charge(); | |
3624 | + if (mm->owner == p) { | |
3625 | + VM_BUG_ON(mc.from); | |
3626 | + VM_BUG_ON(mc.to); | |
3627 | + VM_BUG_ON(mc.precharge); | |
3628 | + mc.from = from; | |
3629 | + mc.to = mem; | |
3630 | + mc.precharge = 0; | |
3457 | 3631 | |
3632 | + ret = mem_cgroup_precharge_mc(mm); | |
3633 | + if (ret) | |
3634 | + mem_cgroup_clear_mc(); | |
3635 | + } | |
3458 | 3636 | mmput(mm); |
3459 | 3637 | } |
3460 | 3638 | return ret; |
3461 | 3639 | |
3462 | 3640 | |
3463 | 3641 | |
3464 | 3642 | |
... | ... | @@ -3465,19 +3643,115 @@ |
3465 | 3643 | struct task_struct *p, |
3466 | 3644 | bool threadgroup) |
3467 | 3645 | { |
3646 | + mem_cgroup_clear_mc(); | |
3468 | 3647 | } |
3469 | 3648 | |
3470 | -static void mem_cgroup_move_charge(void) | |
3649 | +static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |
3650 | + unsigned long addr, unsigned long end, | |
3651 | + struct mm_walk *walk) | |
3471 | 3652 | { |
3653 | + int ret = 0; | |
3654 | + struct vm_area_struct *vma = walk->private; | |
3655 | + pte_t *pte; | |
3656 | + spinlock_t *ptl; | |
3657 | + | |
3658 | +retry: | |
3659 | + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | |
3660 | + for (; addr != end; addr += PAGE_SIZE) { | |
3661 | + pte_t ptent = *(pte++); | |
3662 | + union mc_target target; | |
3663 | + int type; | |
3664 | + struct page *page; | |
3665 | + struct page_cgroup *pc; | |
3666 | + | |
3667 | + if (!mc.precharge) | |
3668 | + break; | |
3669 | + | |
3670 | + type = is_target_pte_for_mc(vma, addr, ptent, &target); | |
3671 | + switch (type) { | |
3672 | + case MC_TARGET_PAGE: | |
3673 | + page = target.page; | |
3674 | + if (isolate_lru_page(page)) | |
3675 | + goto put; | |
3676 | + pc = lookup_page_cgroup(page); | |
3677 | + if (!mem_cgroup_move_account(pc, mc.from, mc.to)) { | |
3678 | + css_put(&mc.to->css); | |
3679 | + mc.precharge--; | |
3680 | + } | |
3681 | + putback_lru_page(page); | |
3682 | +put: /* is_target_pte_for_mc() gets the page */ | |
3683 | + put_page(page); | |
3684 | + break; | |
3685 | + default: | |
3686 | + break; | |
3687 | + } | |
3688 | + } | |
3689 | + pte_unmap_unlock(pte - 1, ptl); | |
3690 | + cond_resched(); | |
3691 | + | |
3692 | + if (addr != end) { | |
3693 | + /* | |
3694 | + * We have consumed all precharges we got in can_attach(). | |
3695 | + * We try charge one by one, but don't do any additional | |
3696 | + * charges to mc.to if we have failed in charge once in attach() | |
3697 | + * phase. | |
3698 | + */ | |
3699 | + ret = mem_cgroup_do_precharge(); | |
3700 | + if (!ret) | |
3701 | + goto retry; | |
3702 | + } | |
3703 | + | |
3704 | + return ret; | |
3472 | 3705 | } |
3473 | 3706 | |
3707 | +static void mem_cgroup_move_charge(struct mm_struct *mm) | |
3708 | +{ | |
3709 | + struct vm_area_struct *vma; | |
3710 | + | |
3711 | + lru_add_drain_all(); | |
3712 | + down_read(&mm->mmap_sem); | |
3713 | + for (vma = mm->mmap; vma; vma = vma->vm_next) { | |
3714 | + int ret; | |
3715 | + struct mm_walk mem_cgroup_move_charge_walk = { | |
3716 | + .pmd_entry = mem_cgroup_move_charge_pte_range, | |
3717 | + .mm = mm, | |
3718 | + .private = vma, | |
3719 | + }; | |
3720 | + if (is_vm_hugetlb_page(vma)) | |
3721 | + continue; | |
3722 | + /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | |
3723 | + if (vma->vm_flags & VM_SHARED) | |
3724 | + continue; | |
3725 | + ret = walk_page_range(vma->vm_start, vma->vm_end, | |
3726 | + &mem_cgroup_move_charge_walk); | |
3727 | + if (ret) | |
3728 | + /* | |
3729 | + * means we have consumed all precharges and failed in | |
3730 | + * doing additional charge. Just abandon here. | |
3731 | + */ | |
3732 | + break; | |
3733 | + } | |
3734 | + up_read(&mm->mmap_sem); | |
3735 | +} | |
3736 | + | |
3474 | 3737 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
3475 | 3738 | struct cgroup *cont, |
3476 | 3739 | struct cgroup *old_cont, |
3477 | 3740 | struct task_struct *p, |
3478 | 3741 | bool threadgroup) |
3479 | 3742 | { |
3480 | - mem_cgroup_move_charge(); | |
3743 | + struct mm_struct *mm; | |
3744 | + | |
3745 | + if (!mc.to) | |
3746 | + /* no need to move charge */ | |
3747 | + return; | |
3748 | + | |
3749 | + mm = get_task_mm(p); | |
3750 | + if (mm) { | |
3751 | + mem_cgroup_move_charge(mm); | |
3752 | + mmput(mm); | |
3753 | + } | |
3754 | + mem_cgroup_clear_mc(); | |
3481 | 3755 | } |
3482 | 3756 | |
3483 | 3757 | struct cgroup_subsys mem_cgroup_subsys = { |