Commit f817ed48535ac6510ebae7c4116f24a5f9268834

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent 0753b0ef3b

memcg: move all accounting to parent at rmdir()

This patch provides a function to move the accounting information of a page
between mem_cgroups and rewrites force_empty to make use of it.

This moving of page_cgroup is done with:
 - the lru_lock of the source/destination mem_cgroup held, and
 - lock_page_cgroup() held.

Consequently, a routine which touches pc->mem_cgroup without holding
lock_page_cgroup() must confirm that pc->mem_cgroup is still valid.
Typical code looks like the following.

(while page is not under lock_page())
	mem = pc->mem_cgroup;
	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	if (pc->mem_cgroup == mem)
		..... /* some list handling */
	spin_unlock_irqrestore(&mz->lru_lock, flags);

Of course, the better way is
	lock_page_cgroup(pc);
	....
	unlock_page_cgroup(pc);

But you should check the lock nesting and avoid deadlocks (see the sketch below).
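
For reference, the move path added below nests the locks the other way
around (lru_lock outside, page_cgroup lock inside), which is why
mem_cgroup_move_account() uses trylock_page_cgroup() and spin_trylock()
rather than blocking lock calls.  A rough sketch of that nesting,
condensed from mem_cgroup_move_parent()/mem_cgroup_move_account() in the
diff (not literal code):

	spin_lock_irqsave(&from_mz->lru_lock, flags);
	if (trylock_page_cgroup(pc)) {
		if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
			/* the destination lru_lock is also trylocked here,
			 * then pc is moved from "from" to "to" */
		}
		unlock_page_cgroup(pc);
	}
	spin_unlock_irqrestore(&from_mz->lru_lock, flags);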

If you handle page_cgroups taken from a mem_cgroup's LRU under mz->lru_lock,
you don't have to worry about what pc->mem_cgroup points to.
Moved pages are added to the head of the LRU, not to the tail.

Expected users of this routine are:
  - force_empty (rmdir)
  - moving tasks between cgroups (for moving account information)
  - hierarchy (maybe useful)

force_empty (rmdir) uses this move_account() and moves pages to the cgroup's
parent.  This "move" will not cause OOM (an "oom" parameter was added to
try_charge()); see the sketch below.

If the parent is busy (not enough memory), force_empty calls
try_to_free_mem_cgroup_pages() to reduce usage.

The purpose of this behavior (sketched below) is to:
  - fix the "forget all" behavior of force_empty and avoid leaking accounting
  - keep pages in memory as much as possible by moving them first and freeing
    them only if necessary
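
Condensed, the new force_empty flow looks like this (a sketch of
mem_cgroup_force_empty() from the diff, not literal code):

	move_account:
		while (mem->res.usage > 0) {
			/* walk every node/zone/LRU and move pages to the parent */
			ret = mem_cgroup_force_empty_list(mem, mz, lru);
			if (ret == -ENOMEM)	/* parent has no room */
				goto try_to_free;
		}
		return 0;
	try_to_free:
		if (shrink)			/* already tried freeing once */
			return -EBUSY;
		shrink = 1;
		/* reclaim pages from this cgroup, then retry the move */
		try_to_free_mem_cgroup_pages(mem, GFP_HIGHUSER_MOVABLE);
		goto move_account;	/* some pages may be mlocked/busy */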

A switch to change the behavior of force_empty to
  - free first, move only if necessary, or
  - free everything and return -EBUSY if there are mlocked/busy pages
is under consideration.  (I'll add it if someone requests it.)

This patch also removes the memory.force_empty file, a brutal debug-only interface.

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 214 additions and 75 deletions

Documentation/controllers/memory.txt
... ... @@ -207,12 +207,6 @@
207 207 The memory.stat file gives accounting information. Now, the number of
208 208 caches, RSS and Active pages/Inactive pages are shown.
209 209  
210   -The memory.force_empty gives an interface to drop *all* charges by force.
211   -
212   -# echo 1 > memory.force_empty
213   -
214   -will drop all charges in cgroup. Currently, this is maintained for test.
215   -
216 210 4. Testing
217 211  
218 212 Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11].
... ... @@ -242,8 +236,10 @@
242 236  
243 237 A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
244 238 cgroup might have some charge associated with it, even though all
245   -tasks have migrated away from it. Such charges are automatically dropped at
246   -rmdir() if there are no tasks.
  239 +tasks have migrated away from it.
  240 +Such charges are moved to its parent as much as possible and freed if parent
  241 +is full. Both of RSS and CACHES are moved to parent.
  242 +If both of them are busy, rmdir() returns -EBUSY.
247 243  
248 244 5. TODO
249 245  
mm/memcontrol.c
... ... @@ -257,7 +257,7 @@
257 257 }
258 258  
259 259 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
260   - struct page_cgroup *pc)
  260 + struct page_cgroup *pc, bool hot)
261 261 {
262 262 int lru = LRU_BASE;
263 263  
... ... @@ -271,7 +271,10 @@
271 271 }
272 272  
273 273 MEM_CGROUP_ZSTAT(mz, lru) += 1;
274   - list_add(&pc->lru, &mz->lists[lru]);
  274 + if (hot)
  275 + list_add(&pc->lru, &mz->lists[lru]);
  276 + else
  277 + list_add_tail(&pc->lru, &mz->lists[lru]);
275 278  
276 279 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
277 280 }
278 281  
... ... @@ -467,21 +470,12 @@
467 470 return nr_taken;
468 471 }
469 472  
470   -
471   -/**
472   - * mem_cgroup_try_charge - get charge of PAGE_SIZE.
473   - * @mm: an mm_struct which is charged against. (when *memcg is NULL)
474   - * @gfp_mask: gfp_mask for reclaim.
475   - * @memcg: a pointer to memory cgroup which is charged against.
476   - *
477   - * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
478   - * memory cgroup from @mm is got and stored in *memcg.
479   - *
480   - * Returns 0 if success. -ENOMEM at failure.
  473 +/*
  474 + * Unlike exported interface, "oom" parameter is added. if oom==true,
  475 + * oom-killer can be invoked.
481 476 */
482   -
483   -int mem_cgroup_try_charge(struct mm_struct *mm,
484   - gfp_t gfp_mask, struct mem_cgroup **memcg)
  477 +static int __mem_cgroup_try_charge(struct mm_struct *mm,
  478 + gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
485 479 {
486 480 struct mem_cgroup *mem;
487 481 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
... ... @@ -528,7 +522,8 @@
528 522 continue;
529 523  
530 524 if (!nr_retries--) {
531   - mem_cgroup_out_of_memory(mem, gfp_mask);
  525 + if (oom)
  526 + mem_cgroup_out_of_memory(mem, gfp_mask);
532 527 goto nomem;
533 528 }
534 529 }
... ... @@ -538,6 +533,25 @@
538 533 return -ENOMEM;
539 534 }
540 535  
  536 +/**
  537 + * mem_cgroup_try_charge - get charge of PAGE_SIZE.
  538 + * @mm: an mm_struct which is charged against. (when *memcg is NULL)
  539 + * @gfp_mask: gfp_mask for reclaim.
  540 + * @memcg: a pointer to memory cgroup which is charged against.
  541 + *
  542 + * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
  543 + * memory cgroup from @mm is got and stored in *memcg.
  544 + *
  545 + * Returns 0 if success. -ENOMEM at failure.
  546 + * This call can invoke OOM-Killer.
  547 + */
  548 +
  549 +int mem_cgroup_try_charge(struct mm_struct *mm,
  550 + gfp_t mask, struct mem_cgroup **memcg)
  551 +{
  552 + return __mem_cgroup_try_charge(mm, mask, memcg, true);
  553 +}
  554 +
541 555 /*
542 556 * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be
543 557 * USED state. If already USED, uncharge and return.
... ... @@ -571,12 +585,110 @@
571 585 mz = page_cgroup_zoneinfo(pc);
572 586  
573 587 spin_lock_irqsave(&mz->lru_lock, flags);
574   - __mem_cgroup_add_list(mz, pc);
  588 + __mem_cgroup_add_list(mz, pc, true);
575 589 spin_unlock_irqrestore(&mz->lru_lock, flags);
576 590 unlock_page_cgroup(pc);
577 591 }
578 592  
  593 +/**
  594 + * mem_cgroup_move_account - move account of the page
  595 + * @pc: page_cgroup of the page.
  596 + * @from: mem_cgroup which the page is moved from.
  597 + * @to: mem_cgroup which the page is moved to. @from != @to.
  598 + *
  599 + * The caller must confirm following.
  600 + * 1. disable irq.
  601 + * 2. lru_lock of old mem_cgroup(@from) should be held.
  602 + *
  603 + * returns 0 at success,
  604 + * returns -EBUSY when lock is busy or "pc" is unstable.
  605 + *
  606 + * This function does "uncharge" from old cgroup but doesn't do "charge" to
  607 + * new cgroup. It should be done by a caller.
  608 + */
  609 +
  610 +static int mem_cgroup_move_account(struct page_cgroup *pc,
  611 + struct mem_cgroup *from, struct mem_cgroup *to)
  612 +{
  613 + struct mem_cgroup_per_zone *from_mz, *to_mz;
  614 + int nid, zid;
  615 + int ret = -EBUSY;
  616 +
  617 + VM_BUG_ON(!irqs_disabled());
  618 + VM_BUG_ON(from == to);
  619 +
  620 + nid = page_cgroup_nid(pc);
  621 + zid = page_cgroup_zid(pc);
  622 + from_mz = mem_cgroup_zoneinfo(from, nid, zid);
  623 + to_mz = mem_cgroup_zoneinfo(to, nid, zid);
  624 +
  625 +
  626 + if (!trylock_page_cgroup(pc))
  627 + return ret;
  628 +
  629 + if (!PageCgroupUsed(pc))
  630 + goto out;
  631 +
  632 + if (pc->mem_cgroup != from)
  633 + goto out;
  634 +
  635 + if (spin_trylock(&to_mz->lru_lock)) {
  636 + __mem_cgroup_remove_list(from_mz, pc);
  637 + css_put(&from->css);
  638 + res_counter_uncharge(&from->res, PAGE_SIZE);
  639 + pc->mem_cgroup = to;
  640 + css_get(&to->css);
  641 + __mem_cgroup_add_list(to_mz, pc, false);
  642 + ret = 0;
  643 + spin_unlock(&to_mz->lru_lock);
  644 + }
  645 +out:
  646 + unlock_page_cgroup(pc);
  647 + return ret;
  648 +}
  649 +
579 650 /*
  651 + * move charges to its parent.
  652 + */
  653 +
  654 +static int mem_cgroup_move_parent(struct page_cgroup *pc,
  655 + struct mem_cgroup *child,
  656 + gfp_t gfp_mask)
  657 +{
  658 + struct cgroup *cg = child->css.cgroup;
  659 + struct cgroup *pcg = cg->parent;
  660 + struct mem_cgroup *parent;
  661 + struct mem_cgroup_per_zone *mz;
  662 + unsigned long flags;
  663 + int ret;
  664 +
  665 + /* Is ROOT ? */
  666 + if (!pcg)
  667 + return -EINVAL;
  668 +
  669 + parent = mem_cgroup_from_cont(pcg);
  670 +
  671 + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
  672 + if (ret)
  673 + return ret;
  674 +
  675 + mz = mem_cgroup_zoneinfo(child,
  676 + page_cgroup_nid(pc), page_cgroup_zid(pc));
  677 +
  678 + spin_lock_irqsave(&mz->lru_lock, flags);
  679 + ret = mem_cgroup_move_account(pc, child, parent);
  680 + spin_unlock_irqrestore(&mz->lru_lock, flags);
  681 +
  682 + /* drop extra refcnt */
  683 + css_put(&parent->css);
  684 + /* uncharge if move fails */
  685 + if (ret)
  686 + res_counter_uncharge(&parent->res, PAGE_SIZE);
  687 +
  688 + return ret;
  689 +}
  690 +
  691 +/*
580 692 * Charge the memory controller for page usage.
581 693 * Return
582 694 * 0 if the charge was successful
... ... @@ -597,7 +709,7 @@
597 709 prefetchw(pc);
598 710  
599 711 mem = memcg;
600   - ret = mem_cgroup_try_charge(mm, gfp_mask, &mem);
  712 + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
601 713 if (ret)
602 714 return ret;
603 715  
604 716  
605 717  
606 718  
607 719  
608 720  
609 721  
610 722  
611 723  
... ... @@ -899,46 +1011,52 @@
899 1011 * This routine traverse page_cgroup in given list and drop them all.
900 1012 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
901 1013 */
902   -#define FORCE_UNCHARGE_BATCH (128)
903   -static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
  1014 +static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
904 1015 struct mem_cgroup_per_zone *mz,
905 1016 enum lru_list lru)
906 1017 {
907   - struct page_cgroup *pc;
908   - struct page *page;
909   - int count = FORCE_UNCHARGE_BATCH;
  1018 + struct page_cgroup *pc, *busy;
910 1019 unsigned long flags;
  1020 + unsigned long loop;
911 1021 struct list_head *list;
  1022 + int ret = 0;
912 1023  
913 1024 list = &mz->lists[lru];
914 1025  
915   - spin_lock_irqsave(&mz->lru_lock, flags);
916   - while (!list_empty(list)) {
917   - pc = list_entry(list->prev, struct page_cgroup, lru);
918   - page = pc->page;
919   - if (!PageCgroupUsed(pc))
  1026 + loop = MEM_CGROUP_ZSTAT(mz, lru);
  1027 + /* give some margin against EBUSY etc...*/
  1028 + loop += 256;
  1029 + busy = NULL;
  1030 + while (loop--) {
  1031 + ret = 0;
  1032 + spin_lock_irqsave(&mz->lru_lock, flags);
  1033 + if (list_empty(list)) {
  1034 + spin_unlock_irqrestore(&mz->lru_lock, flags);
920 1035 break;
921   - get_page(page);
  1036 + }
  1037 + pc = list_entry(list->prev, struct page_cgroup, lru);
  1038 + if (busy == pc) {
  1039 + list_move(&pc->lru, list);
  1040 + busy = 0;
  1041 + spin_unlock_irqrestore(&mz->lru_lock, flags);
  1042 + continue;
  1043 + }
922 1044 spin_unlock_irqrestore(&mz->lru_lock, flags);
923   - /*
924   - * Check if this page is on LRU. !LRU page can be found
925   - * if it's under page migration.
926   - */
927   - if (PageLRU(page)) {
928   - __mem_cgroup_uncharge_common(page,
929   - MEM_CGROUP_CHARGE_TYPE_FORCE);
930   - put_page(page);
931   - if (--count <= 0) {
932   - count = FORCE_UNCHARGE_BATCH;
933   - cond_resched();
934   - }
935   - } else {
936   - spin_lock_irqsave(&mz->lru_lock, flags);
  1045 +
  1046 + ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
  1047 + if (ret == -ENOMEM)
937 1048 break;
938   - }
939   - spin_lock_irqsave(&mz->lru_lock, flags);
  1049 +
  1050 + if (ret == -EBUSY || ret == -EINVAL) {
  1051 + /* found lock contention or "pc" is obsolete. */
  1052 + busy = pc;
  1053 + cond_resched();
  1054 + } else
  1055 + busy = NULL;
940 1056 }
941   - spin_unlock_irqrestore(&mz->lru_lock, flags);
  1057 + if (!ret && !list_empty(list))
  1058 + return -EBUSY;
  1059 + return ret;
942 1060 }
943 1061  
944 1062 /*
... ... @@ -947,34 +1065,68 @@
947 1065 */
948 1066 static int mem_cgroup_force_empty(struct mem_cgroup *mem)
949 1067 {
950   - int ret = -EBUSY;
951   - int node, zid;
  1068 + int ret;
  1069 + int node, zid, shrink;
  1070 + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
952 1071  
953 1072 css_get(&mem->css);
954   - /*
955   - * page reclaim code (kswapd etc..) will move pages between
956   - * active_list <-> inactive_list while we don't take a lock.
957   - * So, we have to do loop here until all lists are empty.
958   - */
  1073 +
  1074 + shrink = 0;
  1075 +move_account:
959 1076 while (mem->res.usage > 0) {
  1077 + ret = -EBUSY;
960 1078 if (atomic_read(&mem->css.cgroup->count) > 0)
961 1079 goto out;
  1080 +
962 1081 /* This is for making all *used* pages to be on LRU. */
963 1082 lru_add_drain_all();
964   - for_each_node_state(node, N_POSSIBLE)
965   - for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  1083 + ret = 0;
  1084 + for_each_node_state(node, N_POSSIBLE) {
  1085 + for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
966 1086 struct mem_cgroup_per_zone *mz;
967 1087 enum lru_list l;
968 1088 mz = mem_cgroup_zoneinfo(mem, node, zid);
969   - for_each_lru(l)
970   - mem_cgroup_force_empty_list(mem, mz, l);
  1089 + for_each_lru(l) {
  1090 + ret = mem_cgroup_force_empty_list(mem,
  1091 + mz, l);
  1092 + if (ret)
  1093 + break;
  1094 + }
971 1095 }
  1096 + if (ret)
  1097 + break;
  1098 + }
  1099 + /* it seems parent cgroup doesn't have enough mem */
  1100 + if (ret == -ENOMEM)
  1101 + goto try_to_free;
972 1102 cond_resched();
973 1103 }
974 1104 ret = 0;
975 1105 out:
976 1106 css_put(&mem->css);
977 1107 return ret;
  1108 +
  1109 +try_to_free:
  1110 + /* returns EBUSY if we come here twice. */
  1111 + if (shrink) {
  1112 + ret = -EBUSY;
  1113 + goto out;
  1114 + }
  1115 + /* try to free all pages in this cgroup */
  1116 + shrink = 1;
  1117 + while (nr_retries && mem->res.usage > 0) {
  1118 + int progress;
  1119 + progress = try_to_free_mem_cgroup_pages(mem,
  1120 + GFP_HIGHUSER_MOVABLE);
  1121 + if (!progress)
  1122 + nr_retries--;
  1123 +
  1124 + }
  1125 + /* try move_account...there may be some *locked* pages. */
  1126 + if (mem->res.usage)
  1127 + goto move_account;
  1128 + ret = 0;
  1129 + goto out;
978 1130 }
979 1131  
980 1132 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
... ... @@ -1023,11 +1175,6 @@
1023 1175 return 0;
1024 1176 }
1025 1177  
1026   -static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
1027   -{
1028   - return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
1029   -}
1030   -
1031 1178 static const struct mem_cgroup_stat_desc {
1032 1179 const char *msg;
1033 1180 u64 unit;
... ... @@ -1102,10 +1249,6 @@
1102 1249 .private = RES_FAILCNT,
1103 1250 .trigger = mem_cgroup_reset,
1104 1251 .read_u64 = mem_cgroup_read,
1105   - },
1106   - {
1107   - .name = "force_empty",
1108   - .trigger = mem_force_empty_write,
1109 1252 },
1110 1253 {
1111 1254 .name = "stat",