Commit 024914477e15ef8b17f271ec47f1bb8a589f0806
Committed by
Linus Torvalds
1 parent
8033b97c9b
Exists in
master
and in
4 other branches
memcg: move charges of anonymous swap
This patch is another core part of this move-charge-at-task-migration feature. It enables moving charges of anonymous swaps. To move the charge of swap, we need to exchange swap_cgroup's record. In the current implementation, swap_cgroup's record is protected by: - page lock: if the entry is on swap cache. - swap_lock: if the entry is not on swap cache. This works well in usual swap-in/out activity. But this behavior makes the feature of moving swap charge check many conditions to exchange swap_cgroup's record safely. So I changed modification of swap_cgroup's record (swap_cgroup_record()) to use xchg, and defined a new function to cmpxchg swap_cgroup's record. This patch also enables moving charge of non pte_present but not uncharged swap caches, which can exist on the swap-out path, by getting the target pages via find_get_page() as do_mincore() does. [kosaki.motohiro@jp.fujitsu.com: fix ia64 build] [akpm@linux-foundation.org: fix typos] Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 6 changed files with 223 additions and 38 deletions Side-by-side Diff
Documentation/cgroups/memory.txt
... | ... | @@ -420,6 +420,8 @@ |
420 | 420 | |
421 | 421 | Users can move charges associated with a task along with task migration, that |
422 | 422 | is, uncharge task's pages from the old cgroup and charge them to the new cgroup. |
423 | +This feature is not supported in !CONFIG_MMU environments because of lack of | |
424 | +page tables. | |
423 | 425 | |
424 | 426 | 8.1 Interface |
425 | 427 |
include/linux/page_cgroup.h
... | ... | @@ -118,6 +118,8 @@ |
118 | 118 | #include <linux/swap.h> |
119 | 119 | |
120 | 120 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
121 | +extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | |
122 | + unsigned short old, unsigned short new); | |
121 | 123 | extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); |
122 | 124 | extern unsigned short lookup_swap_cgroup(swp_entry_t ent); |
123 | 125 | extern int swap_cgroup_swapon(int type, unsigned long max_pages); |
include/linux/swap.h
... | ... | @@ -355,6 +355,7 @@ |
355 | 355 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
356 | 356 | extern void |
357 | 357 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); |
358 | +extern int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep); | |
358 | 359 | #else |
359 | 360 | static inline void |
360 | 361 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) |
... | ... | @@ -484,6 +485,14 @@ |
484 | 485 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) |
485 | 486 | { |
486 | 487 | } |
488 | + | |
489 | +#ifdef CONFIG_CGROUP_MEM_RES_CTLR | |
490 | +static inline int | |
491 | +mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | |
492 | +{ | |
493 | + return 0; | |
494 | +} | |
495 | +#endif | |
487 | 496 | |
488 | 497 | #endif /* CONFIG_SWAP */ |
489 | 498 | #endif /* __KERNEL__*/ |
mm/memcontrol.c
... | ... | @@ -33,6 +33,7 @@ |
33 | 33 | #include <linux/rbtree.h> |
34 | 34 | #include <linux/slab.h> |
35 | 35 | #include <linux/swap.h> |
36 | +#include <linux/swapops.h> | |
36 | 37 | #include <linux/spinlock.h> |
37 | 38 | #include <linux/fs.h> |
38 | 39 | #include <linux/seq_file.h> |
... | ... | @@ -2270,6 +2271,54 @@ |
2270 | 2271 | } |
2271 | 2272 | rcu_read_unlock(); |
2272 | 2273 | } |
2274 | + | |
2275 | +/** | |
2276 | + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | |
2277 | + * @entry: swap entry to be moved | |
2278 | + * @from: mem_cgroup which the entry is moved from | |
2279 | + * @to: mem_cgroup which the entry is moved to | |
2280 | + * | |
2281 | + * It succeeds only when the swap_cgroup's record for this entry is the same | |
2282 | + * as the mem_cgroup's id of @from. | |
2283 | + * | |
2284 | + * Returns 0 on success, -EINVAL on failure. | |
2285 | + * | |
2286 | + * The caller must have charged to @to, IOW, called res_counter_charge() about | |
2287 | + * both res and memsw, and called css_get(). | |
2288 | + */ | |
2289 | +static int mem_cgroup_move_swap_account(swp_entry_t entry, | |
2290 | + struct mem_cgroup *from, struct mem_cgroup *to) | |
2291 | +{ | |
2292 | + unsigned short old_id, new_id; | |
2293 | + | |
2294 | + old_id = css_id(&from->css); | |
2295 | + new_id = css_id(&to->css); | |
2296 | + | |
2297 | + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | |
2298 | + if (!mem_cgroup_is_root(from)) | |
2299 | + res_counter_uncharge(&from->memsw, PAGE_SIZE); | |
2300 | + mem_cgroup_swap_statistics(from, false); | |
2301 | + mem_cgroup_put(from); | |
2302 | + /* | |
2303 | + * we charged both to->res and to->memsw, so we should uncharge | |
2304 | + * to->res. | |
2305 | + */ | |
2306 | + if (!mem_cgroup_is_root(to)) | |
2307 | + res_counter_uncharge(&to->res, PAGE_SIZE); | |
2308 | + mem_cgroup_swap_statistics(to, true); | |
2309 | + mem_cgroup_get(to); | |
2310 | + css_put(&to->css); | |
2311 | + | |
2312 | + return 0; | |
2313 | + } | |
2314 | + return -EINVAL; | |
2315 | +} | |
2316 | +#else | |
2317 | +static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |
2318 | + struct mem_cgroup *from, struct mem_cgroup *to) | |
2319 | +{ | |
2320 | + return -EINVAL; | |
2321 | +} | |
2273 | 2322 | #endif |
2274 | 2323 | |
2275 | 2324 | /* |
... | ... | @@ -2949,6 +2998,7 @@ |
2949 | 2998 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; |
2950 | 2999 | } |
2951 | 3000 | |
3001 | +#ifdef CONFIG_MMU | |
2952 | 3002 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, |
2953 | 3003 | struct cftype *cft, u64 val) |
2954 | 3004 | { |
... | ... | @@ -2967,6 +3017,13 @@ |
2967 | 3017 | |
2968 | 3018 | return 0; |
2969 | 3019 | } |
3020 | +#else | |
3021 | +static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |
3022 | + struct cftype *cft, u64 val) | |
3023 | +{ | |
3024 | + return -ENOSYS; | |
3025 | +} | |
3026 | +#endif | |
2970 | 3027 | |
2971 | 3028 | |
2972 | 3029 | /* For read statistics */ |
... | ... | @@ -3489,6 +3546,7 @@ |
3489 | 3546 | return ret; |
3490 | 3547 | } |
3491 | 3548 | |
3549 | +#ifdef CONFIG_MMU | |
3492 | 3550 | /* Handlers for move charge at task migration. */ |
3493 | 3551 | #define PRECHARGE_COUNT_AT_ONCE 256 |
3494 | 3552 | static int mem_cgroup_do_precharge(unsigned long count) |
3495 | 3553 | |
3496 | 3554 | |
3497 | 3555 | |
3498 | 3556 | |
3499 | 3557 | |
3500 | 3558 | |
3501 | 3559 | |
3502 | 3560 | |
3503 | 3561 | |
3504 | 3562 | |
3505 | 3563 | |
3506 | 3564 | |
... | ... | @@ -3544,77 +3602,124 @@ |
3544 | 3602 | } |
3545 | 3603 | return ret; |
3546 | 3604 | } |
3605 | +#else /* !CONFIG_MMU */ | |
3606 | +static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |
3607 | + struct cgroup *cgroup, | |
3608 | + struct task_struct *p, | |
3609 | + bool threadgroup) | |
3610 | +{ | |
3611 | + return 0; | |
3612 | +} | |
3613 | +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | |
3614 | + struct cgroup *cgroup, | |
3615 | + struct task_struct *p, | |
3616 | + bool threadgroup) | |
3617 | +{ | |
3618 | +} | |
3619 | +static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |
3620 | + struct cgroup *cont, | |
3621 | + struct cgroup *old_cont, | |
3622 | + struct task_struct *p, | |
3623 | + bool threadgroup) | |
3624 | +{ | |
3625 | +} | |
3626 | +#endif | |
3547 | 3627 | |
3548 | 3628 | /** |
3549 | 3629 | * is_target_pte_for_mc - check a pte whether it is valid for move charge |
3550 | 3630 | * @vma: the vma the pte to be checked belongs |
3551 | 3631 | * @addr: the address corresponding to the pte to be checked |
3552 | 3632 | * @ptent: the pte to be checked |
3553 | - * @target: the pointer the target page will be stored(can be NULL) | |
3633 | + * @target: the pointer the target page or swap ent will be stored(can be NULL) | |
3554 | 3634 | * |
3555 | 3635 | * Returns |
3556 | 3636 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. |
3557 | 3637 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for |
3558 | 3638 | * move charge. if @target is not NULL, the page is stored in target->page |
3559 | 3639 | * with extra refcnt got(Callers should handle it). |
3640 | + * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | |
3641 | + * target for charge migration. if @target is not NULL, the entry is stored | |
3642 | + * in target->ent. | |
3560 | 3643 | * |
3561 | 3644 | * Called with pte lock held. |
3562 | 3645 | */ |
3563 | -/* We add a new member later. */ | |
3564 | 3646 | union mc_target { |
3565 | 3647 | struct page *page; |
3648 | + swp_entry_t ent; | |
3566 | 3649 | }; |
3567 | 3650 | |
3568 | -/* We add a new type later. */ | |
3569 | 3651 | enum mc_target_type { |
3570 | 3652 | MC_TARGET_NONE, /* not used */ |
3571 | 3653 | MC_TARGET_PAGE, |
3654 | + MC_TARGET_SWAP, | |
3572 | 3655 | }; |
3573 | 3656 | |
3574 | 3657 | static int is_target_pte_for_mc(struct vm_area_struct *vma, |
3575 | 3658 | unsigned long addr, pte_t ptent, union mc_target *target) |
3576 | 3659 | { |
3577 | - struct page *page; | |
3660 | + struct page *page = NULL; | |
3578 | 3661 | struct page_cgroup *pc; |
3579 | 3662 | int ret = 0; |
3663 | + swp_entry_t ent = { .val = 0 }; | |
3664 | + int usage_count = 0; | |
3580 | 3665 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, |
3581 | 3666 | &mc.to->move_charge_at_immigrate); |
3582 | 3667 | |
3583 | - if (!pte_present(ptent)) | |
3668 | + if (!pte_present(ptent)) { | |
3669 | + /* TODO: handle swap of shmem/tmpfs */ | 
3670 | + if (pte_none(ptent) || pte_file(ptent)) | |
3671 | + return 0; | |
3672 | + else if (is_swap_pte(ptent)) { | |
3673 | + ent = pte_to_swp_entry(ptent); | |
3674 | + if (!move_anon || non_swap_entry(ent)) | |
3675 | + return 0; | |
3676 | + usage_count = mem_cgroup_count_swap_user(ent, &page); | |
3677 | + } | |
3678 | + } else { | |
3679 | + page = vm_normal_page(vma, addr, ptent); | |
3680 | + if (!page || !page_mapped(page)) | |
3681 | + return 0; | |
3682 | + /* | |
3683 | + * TODO: We don't move charges of file(including shmem/tmpfs) | |
3684 | + * pages for now. | |
3685 | + */ | |
3686 | + if (!move_anon || !PageAnon(page)) | |
3687 | + return 0; | |
3688 | + if (!get_page_unless_zero(page)) | |
3689 | + return 0; | |
3690 | + usage_count = page_mapcount(page); | |
3691 | + } | |
3692 | + if (usage_count > 1) { | |
3693 | + /* | |
3694 | + * TODO: We don't move charges of shared(used by multiple | |
3695 | + * processes) pages for now. | |
3696 | + */ | |
3697 | + if (page) | |
3698 | + put_page(page); | |
3584 | 3699 | return 0; |
3585 | - | |
3586 | - page = vm_normal_page(vma, addr, ptent); | |
3587 | - if (!page || !page_mapped(page)) | |
3588 | - return 0; | |
3589 | - /* | |
3590 | - * TODO: We don't move charges of file(including shmem/tmpfs) pages for | |
3591 | - * now. | |
3592 | - */ | |
3593 | - if (!move_anon || !PageAnon(page)) | |
3594 | - return 0; | |
3595 | - /* | |
3596 | - * TODO: We don't move charges of shared(used by multiple processes) | |
3597 | - * pages for now. | |
3598 | - */ | |
3599 | - if (page_mapcount(page) > 1) | |
3600 | - return 0; | |
3601 | - if (!get_page_unless_zero(page)) | |
3602 | - return 0; | |
3603 | - | |
3604 | - pc = lookup_page_cgroup(page); | |
3605 | - /* | |
3606 | - * Do only loose check w/o page_cgroup lock. mem_cgroup_move_account() | |
3607 | - * checks the pc is valid or not under the lock. | |
3608 | - */ | |
3609 | - if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | |
3610 | - ret = MC_TARGET_PAGE; | |
3700 | + } | |
3701 | + if (page) { | |
3702 | + pc = lookup_page_cgroup(page); | |
3703 | + /* | |
3704 | + * Do only loose check w/o page_cgroup lock. | |
3705 | + * mem_cgroup_move_account() checks the pc is valid or not under | |
3706 | + * the lock. | |
3707 | + */ | |
3708 | + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | |
3709 | + ret = MC_TARGET_PAGE; | |
3710 | + if (target) | |
3711 | + target->page = page; | |
3712 | + } | |
3713 | + if (!ret || !target) | |
3714 | + put_page(page); | |
3715 | + } | |
3716 | + /* fall through */ | 
3717 | + if (ent.val && do_swap_account && !ret && | |
3718 | + css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | |
3719 | + ret = MC_TARGET_SWAP; | |
3611 | 3720 | if (target) |
3612 | - target->page = page; | |
3721 | + target->ent = ent; | |
3613 | 3722 | } |
3614 | - | |
3615 | - if (!ret || !target) | |
3616 | - put_page(page); | |
3617 | - | |
3618 | 3723 | return ret; |
3619 | 3724 | } |
3620 | 3725 | |
... | ... | @@ -3754,6 +3859,7 @@ |
3754 | 3859 | int type; |
3755 | 3860 | struct page *page; |
3756 | 3861 | struct page_cgroup *pc; |
3862 | + swp_entry_t ent; | |
3757 | 3863 | |
3758 | 3864 | if (!mc.precharge) |
3759 | 3865 | break; |
... | ... | @@ -3774,6 +3880,11 @@ |
3774 | 3880 | putback_lru_page(page); |
3775 | 3881 | put: /* is_target_pte_for_mc() gets the page */ |
3776 | 3882 | put_page(page); |
3883 | + break; | |
3884 | + case MC_TARGET_SWAP: | |
3885 | + ent = target.ent; | |
3886 | + if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) | |
3887 | + mc.precharge--; | |
3777 | 3888 | break; |
3778 | 3889 | default: |
3779 | 3890 | break; |
mm/page_cgroup.c
... | ... | @@ -335,6 +335,37 @@ |
335 | 335 | } |
336 | 336 | |
337 | 337 | /** |
338 | + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | |
339 | + * @ent: swap entry to be cmpxchged | 
340 | + * @old: old id | |
341 | + * @new: new id | |
342 | + * | |
343 | + * Returns old id at success, 0 at failure. | |
344 | + * (There is no mem_cgroup using 0 as its id) | 
345 | + */ | |
346 | +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | |
347 | + unsigned short old, unsigned short new) | |
348 | +{ | |
349 | + int type = swp_type(ent); | |
350 | + unsigned long offset = swp_offset(ent); | |
351 | + unsigned long idx = offset / SC_PER_PAGE; | |
352 | + unsigned long pos = offset & SC_POS_MASK; | |
353 | + struct swap_cgroup_ctrl *ctrl; | |
354 | + struct page *mappage; | |
355 | + struct swap_cgroup *sc; | |
356 | + | |
357 | + ctrl = &swap_cgroup_ctrl[type]; | |
358 | + | |
359 | + mappage = ctrl->map[idx]; | |
360 | + sc = page_address(mappage); | |
361 | + sc += pos; | |
362 | + if (cmpxchg(&sc->id, old, new) == old) | |
363 | + return old; | |
364 | + else | |
365 | + return 0; | |
366 | +} | |
367 | + | |
368 | +/** | |
338 | 369 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
339 | 370 | * @ent: swap entry to be recorded into |
340 | 371 | * @mem: mem_cgroup to be recorded |
... | ... | @@ -358,8 +389,7 @@ |
358 | 389 | mappage = ctrl->map[idx]; |
359 | 390 | sc = page_address(mappage); |
360 | 391 | sc += pos; |
361 | - old = sc->id; | |
362 | - sc->id = id; | |
392 | + old = xchg(&sc->id, id); | |
363 | 393 | |
364 | 394 | return old; |
365 | 395 | } |
mm/swapfile.c
... | ... | @@ -723,6 +723,37 @@ |
723 | 723 | return p != NULL; |
724 | 724 | } |
725 | 725 | |
726 | +#ifdef CONFIG_CGROUP_MEM_RES_CTLR | |
727 | +/** | |
728 | + * mem_cgroup_count_swap_user - count the user of a swap entry | |
729 | + * @ent: the swap entry to be checked | |
730 | + * @pagep: the pointer for the swap cache page of the entry to be stored | |
731 | + * | |
732 | + * Returns the number of the user of the swap entry. The number is valid only | |
733 | + * for swaps of anonymous pages. | |
734 | + * If the entry is found on swap cache, the page is stored to pagep with | |
735 | + * refcount of it being incremented. | |
736 | + */ | |
737 | +int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | |
738 | +{ | |
739 | + struct page *page; | |
740 | + struct swap_info_struct *p; | |
741 | + int count = 0; | |
742 | + | |
743 | + page = find_get_page(&swapper_space, ent.val); | |
744 | + if (page) | |
745 | + count += page_mapcount(page); | |
746 | + p = swap_info_get(ent); | |
747 | + if (p) { | |
748 | + count += swap_count(p->swap_map[swp_offset(ent)]); | |
749 | + spin_unlock(&swap_lock); | |
750 | + } | |
751 | + | |
752 | + *pagep = page; | |
753 | + return count; | |
754 | +} | |
755 | +#endif | |
756 | + | |
726 | 757 | #ifdef CONFIG_HIBERNATION |
727 | 758 | /* |
728 | 759 | * Find the swap type that corresponds to given device (if any). |