Commit 7a81b88cb53e335ff7d019e6398c95792c817d93

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent 0b82ac37b8

memcg: introduce charge-commit-cancel style of functions

There is a small race in do_swap_page().  At the point where the swapped-in
page would be charged, its mapcount can be greater than 0 (so the charge is
skipped).  But at the same time, some process sharing the page can call unmap,
taking the mapcount from 1 to 0, and the page is uncharged.

      CPUA                             CPUB
        mapcount == 1.
  (1) charge if mapcount == 0          zap_pte_range()
                                       (2) mapcount 1 => 0.
                                       (3) uncharge(). (success)
  (4) set page's rmap()
      mapcount 0 => 1

As a result, the accounting for this swap page is leaked (the page ends up
mapped but uncharged).

To fix this, I added a new interface:
  - charge
    account PAGE_SIZE against the res_counter and try to free pages if necessary.
  - commit
    register the page_cgroup and add it to the LRU if necessary.
  - cancel
    uncharge PAGE_SIZE because do_swap_page() failed.

     CPUA
  (1) charge (always)
  (2) set the page's rmap (mapcount > 0)
  (3) after set_pte(), commit the charge; the commit decides whether the
      charge was actually needed and uncharges if it was not.

This protocol uses the PCG_USED bit on page_cgroup to avoid over-accounting.
The usual mem_cgroup_charge_common() still does charge -> commit in one step.
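
For illustration, a minimal caller-side sketch of the try_charge ->
commit / cancel protocol, modelled on the do_swap_page() changes in the
diff below.  The wrapper and example_pte_still_matches() are illustrative
names only (not part of the patch); locking and the real fault handling
are omitted.

  static int example_swapin_charge(struct page *page, struct mm_struct *mm,
                                   struct vm_area_struct *vma,
                                   unsigned long addr)
  {
          struct mem_cgroup *ptr = NULL;

          /* (1) charge: account PAGE_SIZE to mm's cgroup, reclaiming if needed */
          if (mem_cgroup_try_charge(mm, GFP_KERNEL, &ptr) == -ENOMEM)
                  return VM_FAULT_OOM;

          /* stand-in for the real pte_same()/lock checks in do_swap_page() */
          if (!example_pte_still_matches()) {
                  /* cancel: drop the PAGE_SIZE charge taken above */
                  mem_cgroup_cancel_charge_swapin(ptr);
                  return 0;
          }

          /* (2) make the page mapped */
          page_add_anon_rmap(page, vma, addr);
          /* (3) commit: mark the page_cgroup USED,
           *     or uncharge if it is already USED */
          mem_cgroup_commit_charge_swapin(page, ptr);
          return 0;
  }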

This patch also adds the following functions to clarify all charge points.

  - mem_cgroup_newpage_charge() .... replacement for mem_cgroup_charge(),
	called against newly allocated anonymous pages.

  - mem_cgroup_charge_migrate_fixup()
	called only from remove_migration_ptes(). We will have to rewrite
	this later (this patch just keeps the old behavior); this function
	will be removed by an additional patch that makes migration clearer.

This is good for clarifying "what we do".

Then, we have the following four charge points (mapped to their entry
functions below):
  - newpage
  - swap-in
  - add-to-cache
  - migration
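
For reference, a sketch of how these charge points map onto the entry
points in the diff below (signatures taken from the patch;
mem_cgroup_cache_charge() is pre-existing and unchanged here):

  - newpage      -> mem_cgroup_newpage_charge(page, mm, gfp_mask)
  - swap-in      -> mem_cgroup_try_charge(mm, gfp_mask, &memcg),
                    then mem_cgroup_commit_charge_swapin(page, memcg)
                    or   mem_cgroup_cancel_charge_swapin(memcg)
  - add-to-cache -> mem_cgroup_cache_charge(page, mm, gfp_mask)
  - migration    -> mem_cgroup_charge_migrate_fixup(page, mm, gfp_mask)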

[akpm@linux-foundation.org: add missing inline directives to stubs]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

5 changed files with 170 additions and 41 deletions

include/linux/memcontrol.h
... ... @@ -27,8 +27,17 @@
27 27  
28 28 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
29 29  
30   -extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
  30 +extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
31 31 gfp_t gfp_mask);
  32 +extern int mem_cgroup_charge_migrate_fixup(struct page *page,
  33 + struct mm_struct *mm, gfp_t gfp_mask);
  34 +/* for swap handling */
  35 +extern int mem_cgroup_try_charge(struct mm_struct *mm,
  36 + gfp_t gfp_mask, struct mem_cgroup **ptr);
  37 +extern void mem_cgroup_commit_charge_swapin(struct page *page,
  38 + struct mem_cgroup *ptr);
  39 +extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr);
  40 +
32 41 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
33 42 gfp_t gfp_mask);
34 43 extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
... ... @@ -71,7 +80,9 @@
71 80  
72 81  
73 82 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
74   -static inline int mem_cgroup_charge(struct page *page,
  83 +struct mem_cgroup;
  84 +
  85 +static inline int mem_cgroup_newpage_charge(struct page *page,
75 86 struct mm_struct *mm, gfp_t gfp_mask)
76 87 {
77 88 return 0;
... ... @@ -81,6 +92,27 @@
81 92 struct mm_struct *mm, gfp_t gfp_mask)
82 93 {
83 94 return 0;
  95 +}
  96 +
  97 +static inline int mem_cgroup_charge_migrate_fixup(struct page *page,
  98 + struct mm_struct *mm, gfp_t gfp_mask)
  99 +{
  100 + return 0;
  101 +}
  102 +
  103 +static inline int mem_cgroup_try_charge(struct mm_struct *mm,
  104 + gfp_t gfp_mask, struct mem_cgroup **ptr)
  105 +{
  106 + return 0;
  107 +}
  108 +
  109 +static inline void mem_cgroup_commit_charge_swapin(struct page *page,
  110 + struct mem_cgroup *ptr)
  111 +{
  112 +}
  113 +
  114 +static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr)
  115 +{
84 116 }
85 117  
86 118 static inline void mem_cgroup_uncharge_page(struct page *page)

mm/memcontrol.c
... ... @@ -467,35 +467,31 @@
467 467 return nr_taken;
468 468 }
469 469  
470   -/*
471   - * Charge the memory controller for page usage.
472   - * Return
473   - * 0 if the charge was successful
474   - * < 0 if the cgroup is over its limit
  470 +
  471 +/**
  472 + * mem_cgroup_try_charge - get charge of PAGE_SIZE.
  473 + * @mm: an mm_struct which is charged against. (when *memcg is NULL)
  474 + * @gfp_mask: gfp_mask for reclaim.
  475 + * @memcg: a pointer to memory cgroup which is charged against.
  476 + *
  477 + * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
  478 + * memory cgroup from @mm is got and stored in *memcg.
  479 + *
  480 + * Returns 0 if success. -ENOMEM at failure.
475 481 */
476   -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
477   - gfp_t gfp_mask, enum charge_type ctype,
478   - struct mem_cgroup *memcg)
  482 +
  483 +int mem_cgroup_try_charge(struct mm_struct *mm,
  484 + gfp_t gfp_mask, struct mem_cgroup **memcg)
479 485 {
480 486 struct mem_cgroup *mem;
481   - struct page_cgroup *pc;
482   - unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
483   - struct mem_cgroup_per_zone *mz;
484   - unsigned long flags;
485   -
486   - pc = lookup_page_cgroup(page);
487   - /* can happen at boot */
488   - if (unlikely(!pc))
489   - return 0;
490   - prefetchw(pc);
  487 + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
491 488 /*
492 489 * We always charge the cgroup the mm_struct belongs to.
493 490 * The mm_struct's mem_cgroup changes on task migration if the
494 491 * thread group leader migrates. It's possible that mm is not
495 492 * set, if so charge the init_mm (happens for pagecache usage).
496 493 */
497   -
498   - if (likely(!memcg)) {
  494 + if (likely(!*memcg)) {
499 495 rcu_read_lock();
500 496 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
501 497 if (unlikely(!mem)) {
... ... @@ -506,15 +502,17 @@
506 502 * For every charge from the cgroup, increment reference count
507 503 */
508 504 css_get(&mem->css);
  505 + *memcg = mem;
509 506 rcu_read_unlock();
510 507 } else {
511   - mem = memcg;
512   - css_get(&memcg->css);
  508 + mem = *memcg;
  509 + css_get(&mem->css);
513 510 }
514 511  
  512 +
515 513 while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
516 514 if (!(gfp_mask & __GFP_WAIT))
517   - goto out;
  515 + goto nomem;
518 516  
519 517 if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
520 518 continue;
... ... @@ -531,18 +529,37 @@
531 529  
532 530 if (!nr_retries--) {
533 531 mem_cgroup_out_of_memory(mem, gfp_mask);
534   - goto out;
  532 + goto nomem;
535 533 }
536 534 }
  535 + return 0;
  536 +nomem:
  537 + css_put(&mem->css);
  538 + return -ENOMEM;
  539 +}
537 540  
  541 +/*
  542 + * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be
  543 + * USED state. If already USED, uncharge and return.
  544 + */
538 545  
  546 +static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
  547 + struct page_cgroup *pc,
  548 + enum charge_type ctype)
  549 +{
  550 + struct mem_cgroup_per_zone *mz;
  551 + unsigned long flags;
  552 +
  553 + /* try_charge() can return NULL to *memcg, taking care of it. */
  554 + if (!mem)
  555 + return;
  556 +
539 557 lock_page_cgroup(pc);
540 558 if (unlikely(PageCgroupUsed(pc))) {
541 559 unlock_page_cgroup(pc);
542 560 res_counter_uncharge(&mem->res, PAGE_SIZE);
543 561 css_put(&mem->css);
544   -
545   - goto done;
  562 + return;
546 563 }
547 564 pc->mem_cgroup = mem;
548 565 /*
... ... @@ -557,15 +574,39 @@
557 574 __mem_cgroup_add_list(mz, pc);
558 575 spin_unlock_irqrestore(&mz->lru_lock, flags);
559 576 unlock_page_cgroup(pc);
  577 +}
560 578  
561   -done:
  579 +/*
  580 + * Charge the memory controller for page usage.
  581 + * Return
  582 + * 0 if the charge was successful
  583 + * < 0 if the cgroup is over its limit
  584 + */
  585 +static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
  586 + gfp_t gfp_mask, enum charge_type ctype,
  587 + struct mem_cgroup *memcg)
  588 +{
  589 + struct mem_cgroup *mem;
  590 + struct page_cgroup *pc;
  591 + int ret;
  592 +
  593 + pc = lookup_page_cgroup(page);
  594 + /* can happen at boot */
  595 + if (unlikely(!pc))
  596 + return 0;
  597 + prefetchw(pc);
  598 +
  599 + mem = memcg;
  600 + ret = mem_cgroup_try_charge(mm, gfp_mask, &mem);
  601 + if (ret)
  602 + return ret;
  603 +
  604 + __mem_cgroup_commit_charge(mem, pc, ctype);
562 605 return 0;
563   -out:
564   - css_put(&mem->css);
565   - return -ENOMEM;
566 606 }
567 607  
568   -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
  608 +int mem_cgroup_newpage_charge(struct page *page,
  609 + struct mm_struct *mm, gfp_t gfp_mask)
569 610 {
570 611 if (mem_cgroup_subsys.disabled)
571 612 return 0;
... ... @@ -586,6 +627,34 @@
586 627 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
587 628 }
588 629  
  630 +/*
  631 + * same as mem_cgroup_newpage_charge(), now.
  632 + * But what we assume is different from newpage, and this is special case.
  633 + * treat this in special function. easy for maintenance.
  634 + */
  635 +
  636 +int mem_cgroup_charge_migrate_fixup(struct page *page,
  637 + struct mm_struct *mm, gfp_t gfp_mask)
  638 +{
  639 + if (mem_cgroup_subsys.disabled)
  640 + return 0;
  641 +
  642 + if (PageCompound(page))
  643 + return 0;
  644 +
  645 + if (page_mapped(page) || (page->mapping && !PageAnon(page)))
  646 + return 0;
  647 +
  648 + if (unlikely(!mm))
  649 + mm = &init_mm;
  650 +
  651 + return mem_cgroup_charge_common(page, mm, gfp_mask,
  652 + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
  653 +}
  654 +
  655 +
  656 +
  657 +
589 658 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
590 659 gfp_t gfp_mask)
591 660 {
... ... @@ -627,6 +696,30 @@
627 696 return mem_cgroup_charge_common(page, mm, gfp_mask,
628 697 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
629 698 }
  699 +
  700 +
  701 +void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
  702 +{
  703 + struct page_cgroup *pc;
  704 +
  705 + if (mem_cgroup_subsys.disabled)
  706 + return;
  707 + if (!ptr)
  708 + return;
  709 + pc = lookup_page_cgroup(page);
  710 + __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
  711 +}
  712 +
  713 +void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
  714 +{
  715 + if (mem_cgroup_subsys.disabled)
  716 + return;
  717 + if (!mem)
  718 + return;
  719 + res_counter_uncharge(&mem->res, PAGE_SIZE);
  720 + css_put(&mem->css);
  721 +}
  722 +
630 723  
631 724 /*
632 725 * uncharge if !page_mapped(page)

mm/memory.c
... ... @@ -2000,7 +2000,7 @@
2000 2000 cow_user_page(new_page, old_page, address, vma);
2001 2001 __SetPageUptodate(new_page);
2002 2002  
2003   - if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
  2003 + if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2004 2004 goto oom_free_new;
2005 2005  
2006 2006 /*
... ... @@ -2392,6 +2392,7 @@
2392 2392 struct page *page;
2393 2393 swp_entry_t entry;
2394 2394 pte_t pte;
  2395 + struct mem_cgroup *ptr = NULL;
2395 2396 int ret = 0;
2396 2397  
2397 2398 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
... ... @@ -2430,7 +2431,7 @@
2430 2431 lock_page(page);
2431 2432 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2432 2433  
2433   - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
  2434 + if (mem_cgroup_try_charge(mm, GFP_KERNEL, &ptr) == -ENOMEM) {
2434 2435 ret = VM_FAULT_OOM;
2435 2436 unlock_page(page);
2436 2437 goto out;
... ... @@ -2460,6 +2461,7 @@
2460 2461 flush_icache_page(vma, page);
2461 2462 set_pte_at(mm, address, page_table, pte);
2462 2463 page_add_anon_rmap(page, vma, address);
  2464 + mem_cgroup_commit_charge_swapin(page, ptr);
2463 2465  
2464 2466 swap_free(entry);
2465 2467 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
... ... @@ -2480,7 +2482,7 @@
2480 2482 out:
2481 2483 return ret;
2482 2484 out_nomap:
2483   - mem_cgroup_uncharge_page(page);
  2485 + mem_cgroup_cancel_charge_swapin(ptr);
2484 2486 pte_unmap_unlock(page_table, ptl);
2485 2487 unlock_page(page);
2486 2488 page_cache_release(page);
... ... @@ -2510,7 +2512,7 @@
2510 2512 goto oom;
2511 2513 __SetPageUptodate(page);
2512 2514  
2513   - if (mem_cgroup_charge(page, mm, GFP_KERNEL))
  2515 + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
2514 2516 goto oom_free_page;
2515 2517  
2516 2518 entry = mk_pte(page, vma->vm_page_prot);
... ... @@ -2601,7 +2603,7 @@
2601 2603 ret = VM_FAULT_OOM;
2602 2604 goto out;
2603 2605 }
2604   - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
  2606 + if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
2605 2607 ret = VM_FAULT_OOM;
2606 2608 page_cache_release(page);
2607 2609 goto out;

mm/migrate.c
... ... @@ -133,7 +133,7 @@
133 133 * be reliable, and this charge can actually fail: oh well, we don't
134 134 * make the situation any worse by proceeding as if it had succeeded.
135 135 */
136   - mem_cgroup_charge(new, mm, GFP_ATOMIC);
  136 + mem_cgroup_charge_migrate_fixup(new, mm, GFP_ATOMIC);
137 137  
138 138 get_page(new);
139 139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));

mm/swapfile.c
... ... @@ -690,17 +690,18 @@
690 690 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
691 691 unsigned long addr, swp_entry_t entry, struct page *page)
692 692 {
  693 + struct mem_cgroup *ptr = NULL;
693 694 spinlock_t *ptl;
694 695 pte_t *pte;
695 696 int ret = 1;
696 697  
697   - if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
  698 + if (mem_cgroup_try_charge(vma->vm_mm, GFP_KERNEL, &ptr))
698 699 ret = -ENOMEM;
699 700  
700 701 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
701 702 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
702 703 if (ret > 0)
703   - mem_cgroup_uncharge_page(page);
  704 + mem_cgroup_cancel_charge_swapin(ptr);
704 705 ret = 0;
705 706 goto out;
706 707 }
... ... @@ -710,6 +711,7 @@
710 711 set_pte_at(vma->vm_mm, addr, pte,
711 712 pte_mkold(mk_pte(page, vma->vm_page_prot)));
712 713 page_add_anon_rmap(page, vma, addr);
  714 + mem_cgroup_commit_charge_swapin(page, ptr);
713 715 swap_free(entry);
714 716 /*
715 717 * Move the page to the active list so it is not