Commit 6d1fdc48938cd51a3964778d78f27cb26c8eb55d

Authored by Johannes Weiner
Committed by Linus Torvalds
1 parent b6b6cc72bc

memcg: sanitize __mem_cgroup_try_charge() call protocol

Some callsites pass a memcg directly, some callsites pass an mm that
then has to be translated to a memcg.  This makes for a terrible
function interface.

Just push the mm-to-memcg translation into the respective callsites and
always pass a memcg to mem_cgroup_try_charge().
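
For illustration only (this sketch is not part of the patch; it mirrors
what the new mem_cgroup_try_charge_mm() helper below does, with mm,
gfp_mask, nr_pages and oom assumed to be in scope), an mm-based callsite
now reads roughly:

	struct mem_cgroup *memcg;
	int ret;

	memcg = get_mem_cgroup_from_mm(mm);	/* mm -> memcg translation */
	ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
	css_put(&memcg->css);			/* drop the lookup reference */
	if (ret == -EINTR)
		memcg = root_mem_cgroup;	/* charge was bypassed to root */
	else if (ret)
		memcg = NULL;			/* -ENOMEM */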

[mhocko@suse.cz: add charge mm helper]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 102 additions and 105 deletions (side-by-side diff)

... ... @@ -2575,7 +2575,7 @@
2575 2575 }
2576 2576  
2577 2577  
2578   -/* See __mem_cgroup_try_charge() for details */
  2578 +/* See mem_cgroup_try_charge() for details */
2579 2579 enum {
2580 2580 CHARGE_OK, /* success */
2581 2581 CHARGE_RETRY, /* need to retry but retry is not bad */
... ... @@ -2648,45 +2648,34 @@
2648 2648 return CHARGE_NOMEM;
2649 2649 }
2650 2650  
2651   -/*
2652   - * __mem_cgroup_try_charge() does
2653   - * 1. detect memcg to be charged against from passed *mm and *ptr,
2654   - * 2. update res_counter
2655   - * 3. call memory reclaim if necessary.
  2651 +/**
  2652 + * mem_cgroup_try_charge - try charging a memcg
  2653 + * @memcg: memcg to charge
  2654 + * @nr_pages: number of pages to charge
  2655 + * @oom: trigger OOM if reclaim fails
2656 2656 *
2657   - * In some special case, if the task is fatal, fatal_signal_pending() or
2658   - * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
2659   - * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
2660   - * as possible without any hazards. 2: all pages should have a valid
2661   - * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
2662   - * pointer, that is treated as a charge to root_mem_cgroup.
2663   - *
2664   - * So __mem_cgroup_try_charge() will return
2665   - * 0 ... on success, filling *ptr with a valid memcg pointer.
2666   - * -ENOMEM ... charge failure because of resource limits.
2667   - * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
2668   - *
2669   - * Unlike the exported interface, an "oom" parameter is added. if oom==true,
2670   - * the oom-killer can be invoked.
  2657 + * Returns 0 if @memcg was charged successfully, -EINTR if the charge
  2658 + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
2671 2659 */
2672   -static int __mem_cgroup_try_charge(struct mm_struct *mm,
2673   - gfp_t gfp_mask,
2674   - unsigned int nr_pages,
2675   - struct mem_cgroup **ptr,
2676   - bool oom)
  2660 +static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
  2661 + gfp_t gfp_mask,
  2662 + unsigned int nr_pages,
  2663 + bool oom)
2677 2664 {
2678 2665 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2679 2666 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2680   - struct mem_cgroup *memcg = NULL;
2681 2667 int ret;
2682 2668  
  2669 + if (mem_cgroup_is_root(memcg))
  2670 + goto done;
2683 2671 /*
2684   - * Unlike gloval-vm's OOM-kill, we're not in memory shortage
2685   - * in system level. So, allow to go ahead dying process in addition to
2686   - * MEMDIE process.
  2672 + * Unlike in global OOM situations, memcg is not in a physical
  2673 + * memory shortage. Allow dying and OOM-killed tasks to
  2674 + * bypass the last charges so that they can exit quickly and
  2675 + * free their memory.
2687 2676 */
2688   - if (unlikely(test_thread_flag(TIF_MEMDIE)
2689   - || fatal_signal_pending(current)))
  2677 + if (unlikely(test_thread_flag(TIF_MEMDIE) ||
  2678 + fatal_signal_pending(current)))
2690 2679 goto bypass;
2691 2680  
2692 2681 if (unlikely(task_in_memcg_oom(current)))
... ... @@ -2695,14 +2684,6 @@
2695 2684 if (gfp_mask & __GFP_NOFAIL)
2696 2685 oom = false;
2697 2686 again:
2698   - if (*ptr) { /* css should be a valid one */
2699   - memcg = *ptr;
2700   - css_get(&memcg->css);
2701   - } else {
2702   - memcg = get_mem_cgroup_from_mm(mm);
2703   - }
2704   - if (mem_cgroup_is_root(memcg))
2705   - goto done;
2706 2687 if (consume_stock(memcg, nr_pages))
2707 2688 goto done;
2708 2689  
... ... @@ -2710,10 +2691,8 @@
2710 2691 bool invoke_oom = oom && !nr_oom_retries;
2711 2692  
2712 2693 /* If killed, bypass charge */
2713   - if (fatal_signal_pending(current)) {
2714   - css_put(&memcg->css);
  2694 + if (fatal_signal_pending(current))
2715 2695 goto bypass;
2716   - }
2717 2696  
2718 2697 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
2719 2698 nr_pages, invoke_oom);
... ... @@ -2722,17 +2701,12 @@
2722 2701 break;
2723 2702 case CHARGE_RETRY: /* not in OOM situation but retry */
2724 2703 batch = nr_pages;
2725   - css_put(&memcg->css);
2726   - memcg = NULL;
2727 2704 goto again;
2728 2705 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2729   - css_put(&memcg->css);
2730 2706 goto nomem;
2731 2707 case CHARGE_NOMEM: /* OOM routine works */
2732   - if (!oom || invoke_oom) {
2733   - css_put(&memcg->css);
  2708 + if (!oom || invoke_oom)
2734 2709 goto nomem;
2735   - }
2736 2710 nr_oom_retries--;
2737 2711 break;
2738 2712 }
... ... @@ -2741,19 +2715,43 @@
2741 2715 if (batch > nr_pages)
2742 2716 refill_stock(memcg, batch - nr_pages);
2743 2717 done:
2744   - css_put(&memcg->css);
2745   - *ptr = memcg;
2746 2718 return 0;
2747 2719 nomem:
2748   - if (!(gfp_mask & __GFP_NOFAIL)) {
2749   - *ptr = NULL;
  2720 + if (!(gfp_mask & __GFP_NOFAIL))
2750 2721 return -ENOMEM;
2751   - }
2752 2722 bypass:
2753   - *ptr = root_mem_cgroup;
2754 2723 return -EINTR;
2755 2724 }
2756 2725  
  2726 +/**
  2727 + * mem_cgroup_try_charge_mm - try charging a mm
  2728 + * @mm: mm_struct to charge
  2729 + * @nr_pages: number of pages to charge
  2730 + * @oom: trigger OOM if reclaim fails
  2731 + *
  2732 + * Returns the charged mem_cgroup associated with the given mm_struct or
  2733 + * NULL if the charge failed.
  2734 + */
  2735 +static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
  2736 + gfp_t gfp_mask,
  2737 + unsigned int nr_pages,
  2738 + bool oom)
  2739 +
  2740 +{
  2741 + struct mem_cgroup *memcg;
  2742 + int ret;
  2743 +
  2744 + memcg = get_mem_cgroup_from_mm(mm);
  2745 + ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
  2746 + css_put(&memcg->css);
  2747 + if (ret == -EINTR)
  2748 + memcg = root_mem_cgroup;
  2749 + else if (ret)
  2750 + memcg = NULL;
  2751 +
  2752 + return memcg;
  2753 +}
  2754 +
2757 2755 /*
2758 2756 * Sometimes we have to undo a charge we got by try_charge().
2759 2757 * This function is for that and does the uncharge, putting the css's refcnt.
... ... @@ -2949,20 +2947,17 @@
2949 2947 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2950 2948 {
2951 2949 struct res_counter *fail_res;
2952   - struct mem_cgroup *_memcg;
2953 2950 int ret = 0;
2954 2951  
2955 2952 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2956 2953 if (ret)
2957 2954 return ret;
2958 2955  
2959   - _memcg = memcg;
2960   - ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
2961   - &_memcg, oom_gfp_allowed(gfp));
2962   -
  2956 + ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
  2957 + oom_gfp_allowed(gfp));
2963 2958 if (ret == -EINTR) {
2964 2959 /*
2965   - * __mem_cgroup_try_charge() chosed to bypass to root due to
  2960 + * mem_cgroup_try_charge() chose to bypass to root due to
2966 2961 * OOM kill or fatal signal. Since our only options are to
2967 2962 * either fail the allocation or charge it to this cgroup, do
2968 2963 * it as a temporary condition. But we can't fail. From a
... ... @@ -2972,7 +2967,7 @@
2972 2967 *
2973 2968 * This condition will only trigger if the task entered
2974 2969 * memcg_charge_kmem in a sane state, but was OOM-killed during
2975   - * __mem_cgroup_try_charge() above. Tasks that were already
  2970 + * mem_cgroup_try_charge() above. Tasks that were already
2976 2971 * dying when the allocation triggers should have been already
2977 2972 * directed to the root cgroup in memcontrol.h
2978 2973 */
... ... @@ -3826,10 +3821,9 @@
3826 3821 int mem_cgroup_newpage_charge(struct page *page,
3827 3822 struct mm_struct *mm, gfp_t gfp_mask)
3828 3823 {
3829   - struct mem_cgroup *memcg = NULL;
3830 3824 unsigned int nr_pages = 1;
  3825 + struct mem_cgroup *memcg;
3831 3826 bool oom = true;
3832   - int ret;
3833 3827  
3834 3828 if (mem_cgroup_disabled())
3835 3829 return 0;
... ... @@ -3848,9 +3842,9 @@
3848 3842 oom = false;
3849 3843 }
3850 3844  
3851   - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
3852   - if (ret == -ENOMEM)
3853   - return ret;
  3845 + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
  3846 + if (!memcg)
  3847 + return -ENOMEM;
3854 3848 __mem_cgroup_commit_charge(memcg, page, nr_pages,
3855 3849 MEM_CGROUP_CHARGE_TYPE_ANON, false);
3856 3850 return 0;
... ... @@ -3867,7 +3861,7 @@
3867 3861 gfp_t mask,
3868 3862 struct mem_cgroup **memcgp)
3869 3863 {
3870   - struct mem_cgroup *memcg;
  3864 + struct mem_cgroup *memcg = NULL;
3871 3865 struct page_cgroup *pc;
3872 3866 int ret;
3873 3867  
... ... @@ -3880,31 +3874,29 @@
3880 3874 * in turn serializes uncharging.
3881 3875 */
3882 3876 if (PageCgroupUsed(pc))
3883   - return 0;
3884   - if (!do_swap_account)
3885   - goto charge_cur_mm;
3886   - memcg = try_get_mem_cgroup_from_page(page);
  3877 + goto out;
  3878 + if (do_swap_account)
  3879 + memcg = try_get_mem_cgroup_from_page(page);
3887 3880 if (!memcg)
3888   - goto charge_cur_mm;
3889   - *memcgp = memcg;
3890   - ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
  3881 + memcg = get_mem_cgroup_from_mm(mm);
  3882 + ret = mem_cgroup_try_charge(memcg, mask, 1, true);
3891 3883 css_put(&memcg->css);
3892 3884 if (ret == -EINTR)
3893   - ret = 0;
3894   - return ret;
3895   -charge_cur_mm:
3896   - ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
3897   - if (ret == -EINTR)
3898   - ret = 0;
3899   - return ret;
  3885 + memcg = root_mem_cgroup;
  3886 + else if (ret)
  3887 + return ret;
  3888 +out:
  3889 + *memcgp = memcg;
  3890 + return 0;
3900 3891 }
3901 3892  
3902 3893 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3903 3894 gfp_t gfp_mask, struct mem_cgroup **memcgp)
3904 3895 {
3905   - *memcgp = NULL;
3906   - if (mem_cgroup_disabled())
  3896 + if (mem_cgroup_disabled()) {
  3897 + *memcgp = NULL;
3907 3898 return 0;
  3899 + }
3908 3900 /*
3909 3901 * A racing thread's fault, or swapoff, may have already
3910 3902 * updated the pte, and even removed page from swap cache: in
... ... @@ -3912,12 +3904,13 @@
3912 3904 * there's also a KSM case which does need to charge the page.
3913 3905 */
3914 3906 if (!PageSwapCache(page)) {
3915   - int ret;
  3907 + struct mem_cgroup *memcg;
3916 3908  
3917   - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
3918   - if (ret == -EINTR)
3919   - ret = 0;
3920   - return ret;
  3909 + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
  3910 + if (!memcg)
  3911 + return -ENOMEM;
  3912 + *memcgp = memcg;
  3913 + return 0;
3921 3914 }
3922 3915 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
3923 3916 }
... ... @@ -3964,8 +3957,8 @@
3964 3957 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
3965 3958 gfp_t gfp_mask)
3966 3959 {
3967   - struct mem_cgroup *memcg = NULL;
3968 3960 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
  3961 + struct mem_cgroup *memcg;
3969 3962 int ret;
3970 3963  
3971 3964 if (mem_cgroup_disabled())
... ... @@ -3973,23 +3966,28 @@
3973 3966 if (PageCompound(page))
3974 3967 return 0;
3975 3968  
3976   - if (!PageSwapCache(page)) {
3977   - /*
3978   - * Page cache insertions can happen without an actual
3979   - * task context, e.g. during disk probing on boot.
3980   - */
3981   - if (!mm)
3982   - memcg = root_mem_cgroup;
3983   - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
3984   - if (ret != -ENOMEM)
3985   - __mem_cgroup_commit_charge(memcg, page, 1, type, false);
3986   - } else { /* page is swapcache/shmem */
  3969 + if (PageSwapCache(page)) { /* shmem */
3987 3970 ret = __mem_cgroup_try_charge_swapin(mm, page,
3988 3971 gfp_mask, &memcg);
3989   - if (!ret)
3990   - __mem_cgroup_commit_charge_swapin(page, memcg, type);
  3972 + if (ret)
  3973 + return ret;
  3974 + __mem_cgroup_commit_charge_swapin(page, memcg, type);
  3975 + return 0;
3991 3976 }
3992   - return ret;
  3977 +
  3978 + /*
  3979 + * Page cache insertions can happen without an actual mm
  3980 + * context, e.g. during disk probing on boot.
  3981 + */
  3982 + if (unlikely(!mm))
  3983 + memcg = root_mem_cgroup;
  3984 + else {
  3985 + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
  3986 + if (!memcg)
  3987 + return -ENOMEM;
  3988 + }
  3989 + __mem_cgroup_commit_charge(memcg, page, 1, type, false);
  3990 + return 0;
3993 3991 }
3994 3992  
3995 3993 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
... ... @@ -6601,8 +6599,7 @@
6601 6599 batch_count = PRECHARGE_COUNT_AT_ONCE;
6602 6600 cond_resched();
6603 6601 }
6604   - ret = __mem_cgroup_try_charge(NULL,
6605   - GFP_KERNEL, 1, &memcg, false);
  6602 + ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
6606 6603 if (ret)
6607 6604 /* mem_cgroup_clear_mc() will do uncharge later */
6608 6605 return ret;