Commit f817ed48535ac6510ebae7c4116f24a5f9268834
Committed by
Linus Torvalds
1 parent
0753b0ef3b
Exists in
master
and in
4 other branches
memcg: move all accounting to parent at rmdir()
This patch provides a function to move account information of a page between mem_cgroups and rewrite force_empty to make use of this. This moving of page_cgroup is done under - lru_lock of source/destination mem_cgroup is held. - lock_page_cgroup() is held. Then, a routine which touches pc->mem_cgroup without lock_page_cgroup() should confirm pc->mem_cgroup is still valid or not. Typical code can be the following. (while page is not under lock_page()) mem = pc->mem_cgroup; mz = page_cgroup_zoneinfo(pc) spin_lock_irqsave(&mz->lru_lock); if (pc->mem_cgroup == mem) ...../* some list handling */ spin_unlock_irqrestore(&mz->lru_lock); Of course, a better way is lock_page_cgroup(pc); .... unlock_page_cgroup(pc); But you should confirm the nesting of locks and avoid deadlock. If you treat page_cgroup from mem_cgroup's LRU under mz->lru_lock, you don't have to worry about what pc->mem_cgroup points to. Moved pages are added to the head of the lru, not to the tail. Expected users of this routine are: - force_empty (rmdir) - moving tasks between cgroups (for moving account information.) - hierarchy (maybe useful.) force_empty(rmdir) uses this move_account and moves pages to its parent. This "move" will not cause OOM (I added an "oom" parameter to try_charge().) If the parent is busy (not enough memory), force_empty calls try_to_free_page() and reduces usage. The purpose of this behavior is - Fix the "forget all" behavior of force_empty and avoid leaks of accounting. - By "moving first, free if necessary", keep pages in memory as much as possible. Adding a switch to change the behavior of force_empty to - free first, move if necessary - free all; if there are mlocked/busy pages, return -EBUSY. is under consideration. (I'll add it if someone requests.) This patch also removes the memory.force_empty file, a brutal debug-only interface.
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Menage <menage@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 2 changed files with 214 additions and 75 deletions Side-by-side Diff
Documentation/controllers/memory.txt
... | ... | @@ -207,12 +207,6 @@ |
207 | 207 | The memory.stat file gives accounting information. Now, the number of |
208 | 208 | caches, RSS and Active pages/Inactive pages are shown. |
209 | 209 | |
210 | -The memory.force_empty gives an interface to drop *all* charges by force. | |
211 | - | |
212 | -# echo 1 > memory.force_empty | |
213 | - | |
214 | -will drop all charges in cgroup. Currently, this is maintained for test. | |
215 | - | |
216 | 210 | 4. Testing |
217 | 211 | |
218 | 212 | Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. |
... | ... | @@ -242,8 +236,10 @@ |
242 | 236 | |
243 | 237 | A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a |
244 | 238 | cgroup might have some charge associated with it, even though all |
245 | -tasks have migrated away from it. Such charges are automatically dropped at | |
246 | -rmdir() if there are no tasks. | |
239 | +tasks have migrated away from it. | |
240 | +Such charges are moved to its parent as much as possible and freed if parent | |
241 | +is full. Both of RSS and CACHES are moved to parent. | |
242 | +If both of them are busy, rmdir() returns -EBUSY. | |
247 | 243 | |
248 | 244 | 5. TODO |
249 | 245 |
mm/memcontrol.c
... | ... | @@ -257,7 +257,7 @@ |
257 | 257 | } |
258 | 258 | |
259 | 259 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, |
260 | - struct page_cgroup *pc) | |
260 | + struct page_cgroup *pc, bool hot) | |
261 | 261 | { |
262 | 262 | int lru = LRU_BASE; |
263 | 263 | |
... | ... | @@ -271,7 +271,10 @@ |
271 | 271 | } |
272 | 272 | |
273 | 273 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
274 | - list_add(&pc->lru, &mz->lists[lru]); | |
274 | + if (hot) | |
275 | + list_add(&pc->lru, &mz->lists[lru]); | |
276 | + else | |
277 | + list_add_tail(&pc->lru, &mz->lists[lru]); | |
275 | 278 | |
276 | 279 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); |
277 | 280 | } |
278 | 281 | |
... | ... | @@ -467,21 +470,12 @@ |
467 | 470 | return nr_taken; |
468 | 471 | } |
469 | 472 | |
470 | - | |
471 | -/** | |
472 | - * mem_cgroup_try_charge - get charge of PAGE_SIZE. | |
473 | - * @mm: an mm_struct which is charged against. (when *memcg is NULL) | |
474 | - * @gfp_mask: gfp_mask for reclaim. | |
475 | - * @memcg: a pointer to memory cgroup which is charged against. | |
476 | - * | |
477 | - * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated | |
478 | - * memory cgroup from @mm is got and stored in *memcg. | |
479 | - * | |
480 | - * Returns 0 if success. -ENOMEM at failure. | |
473 | +/* | |
474 | + * Unlike exported interface, "oom" parameter is added. if oom==true, | |
475 | + * oom-killer can be invoked. | |
481 | 476 | */ |
482 | - | |
483 | -int mem_cgroup_try_charge(struct mm_struct *mm, | |
484 | - gfp_t gfp_mask, struct mem_cgroup **memcg) | |
477 | +static int __mem_cgroup_try_charge(struct mm_struct *mm, | |
478 | + gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) | |
485 | 479 | { |
486 | 480 | struct mem_cgroup *mem; |
487 | 481 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
... | ... | @@ -528,7 +522,8 @@ |
528 | 522 | continue; |
529 | 523 | |
530 | 524 | if (!nr_retries--) { |
531 | - mem_cgroup_out_of_memory(mem, gfp_mask); | |
525 | + if (oom) | |
526 | + mem_cgroup_out_of_memory(mem, gfp_mask); | |
532 | 527 | goto nomem; |
533 | 528 | } |
534 | 529 | } |
... | ... | @@ -538,6 +533,25 @@ |
538 | 533 | return -ENOMEM; |
539 | 534 | } |
540 | 535 | |
536 | +/** | |
537 | + * mem_cgroup_try_charge - get charge of PAGE_SIZE. | |
538 | + * @mm: an mm_struct which is charged against. (when *memcg is NULL) | |
539 | + * @gfp_mask: gfp_mask for reclaim. | |
540 | + * @memcg: a pointer to memory cgroup which is charged against. | |
541 | + * | |
542 | + * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated | |
543 | + * memory cgroup from @mm is got and stored in *memcg. | |
544 | + * | |
545 | + * Returns 0 if success. -ENOMEM at failure. | |
546 | + * This call can invoke OOM-Killer. | |
547 | + */ | |
548 | + | |
549 | +int mem_cgroup_try_charge(struct mm_struct *mm, | |
550 | + gfp_t mask, struct mem_cgroup **memcg) | |
551 | +{ | |
552 | + return __mem_cgroup_try_charge(mm, mask, memcg, true); | |
553 | +} | |
554 | + | |
541 | 555 | /* |
542 | 556 | * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be |
543 | 557 | * USED state. If already USED, uncharge and return. |
544 | 558 | |
545 | 559 | |
... | ... | @@ -571,12 +585,110 @@ |
571 | 585 | mz = page_cgroup_zoneinfo(pc); |
572 | 586 | |
573 | 587 | spin_lock_irqsave(&mz->lru_lock, flags); |
574 | - __mem_cgroup_add_list(mz, pc); | |
588 | + __mem_cgroup_add_list(mz, pc, true); | |
575 | 589 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
576 | 590 | unlock_page_cgroup(pc); |
577 | 591 | } |
578 | 592 | |
593 | +/** | |
594 | + * mem_cgroup_move_account - move account of the page | |
595 | + * @pc: page_cgroup of the page. | |
596 | + * @from: mem_cgroup which the page is moved from. | |
597 | + * @to: mem_cgroup which the page is moved to. @from != @to. | |
598 | + * | |
599 | + * The caller must confirm following. | |
600 | + * 1. disable irq. | |
601 | + * 2. lru_lock of old mem_cgroup(@from) should be held. | |
602 | + * | |
603 | + * returns 0 at success, | |
604 | + * returns -EBUSY when lock is busy or "pc" is unstable. | |
605 | + * | |
606 | + * This function does "uncharge" from old cgroup but doesn't do "charge" to | |
607 | + * new cgroup. It should be done by a caller. | |
608 | + */ | |
609 | + | |
610 | +static int mem_cgroup_move_account(struct page_cgroup *pc, | |
611 | + struct mem_cgroup *from, struct mem_cgroup *to) | |
612 | +{ | |
613 | + struct mem_cgroup_per_zone *from_mz, *to_mz; | |
614 | + int nid, zid; | |
615 | + int ret = -EBUSY; | |
616 | + | |
617 | + VM_BUG_ON(!irqs_disabled()); | |
618 | + VM_BUG_ON(from == to); | |
619 | + | |
620 | + nid = page_cgroup_nid(pc); | |
621 | + zid = page_cgroup_zid(pc); | |
622 | + from_mz = mem_cgroup_zoneinfo(from, nid, zid); | |
623 | + to_mz = mem_cgroup_zoneinfo(to, nid, zid); | |
624 | + | |
625 | + | |
626 | + if (!trylock_page_cgroup(pc)) | |
627 | + return ret; | |
628 | + | |
629 | + if (!PageCgroupUsed(pc)) | |
630 | + goto out; | |
631 | + | |
632 | + if (pc->mem_cgroup != from) | |
633 | + goto out; | |
634 | + | |
635 | + if (spin_trylock(&to_mz->lru_lock)) { | |
636 | + __mem_cgroup_remove_list(from_mz, pc); | |
637 | + css_put(&from->css); | |
638 | + res_counter_uncharge(&from->res, PAGE_SIZE); | |
639 | + pc->mem_cgroup = to; | |
640 | + css_get(&to->css); | |
641 | + __mem_cgroup_add_list(to_mz, pc, false); | |
642 | + ret = 0; | |
643 | + spin_unlock(&to_mz->lru_lock); | |
644 | + } | |
645 | +out: | |
646 | + unlock_page_cgroup(pc); | |
647 | + return ret; | |
648 | +} | |
649 | + | |
579 | 650 | /* |
651 | + * move charges to its parent. | |
652 | + */ | |
653 | + | |
654 | +static int mem_cgroup_move_parent(struct page_cgroup *pc, | |
655 | + struct mem_cgroup *child, | |
656 | + gfp_t gfp_mask) | |
657 | +{ | |
658 | + struct cgroup *cg = child->css.cgroup; | |
659 | + struct cgroup *pcg = cg->parent; | |
660 | + struct mem_cgroup *parent; | |
661 | + struct mem_cgroup_per_zone *mz; | |
662 | + unsigned long flags; | |
663 | + int ret; | |
664 | + | |
665 | + /* Is ROOT ? */ | |
666 | + if (!pcg) | |
667 | + return -EINVAL; | |
668 | + | |
669 | + parent = mem_cgroup_from_cont(pcg); | |
670 | + | |
671 | + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | |
672 | + if (ret) | |
673 | + return ret; | |
674 | + | |
675 | + mz = mem_cgroup_zoneinfo(child, | |
676 | + page_cgroup_nid(pc), page_cgroup_zid(pc)); | |
677 | + | |
678 | + spin_lock_irqsave(&mz->lru_lock, flags); | |
679 | + ret = mem_cgroup_move_account(pc, child, parent); | |
680 | + spin_unlock_irqrestore(&mz->lru_lock, flags); | |
681 | + | |
682 | + /* drop extra refcnt */ | |
683 | + css_put(&parent->css); | |
684 | + /* uncharge if move fails */ | |
685 | + if (ret) | |
686 | + res_counter_uncharge(&parent->res, PAGE_SIZE); | |
687 | + | |
688 | + return ret; | |
689 | +} | |
690 | + | |
691 | +/* | |
580 | 692 | * Charge the memory controller for page usage. |
581 | 693 | * Return |
582 | 694 | * 0 if the charge was successful |
... | ... | @@ -597,7 +709,7 @@ |
597 | 709 | prefetchw(pc); |
598 | 710 | |
599 | 711 | mem = memcg; |
600 | - ret = mem_cgroup_try_charge(mm, gfp_mask, &mem); | |
712 | + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | |
601 | 713 | if (ret) |
602 | 714 | return ret; |
603 | 715 | |
604 | 716 | |
605 | 717 | |
606 | 718 | |
607 | 719 | |
608 | 720 | |
609 | 721 | |
610 | 722 | |
611 | 723 | |
... | ... | @@ -899,46 +1011,52 @@ |
899 | 1011 | * This routine traverse page_cgroup in given list and drop them all. |
900 | 1012 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
901 | 1013 | */ |
902 | -#define FORCE_UNCHARGE_BATCH (128) | |
903 | -static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |
1014 | +static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |
904 | 1015 | struct mem_cgroup_per_zone *mz, |
905 | 1016 | enum lru_list lru) |
906 | 1017 | { |
907 | - struct page_cgroup *pc; | |
908 | - struct page *page; | |
909 | - int count = FORCE_UNCHARGE_BATCH; | |
1018 | + struct page_cgroup *pc, *busy; | |
910 | 1019 | unsigned long flags; |
1020 | + unsigned long loop; | |
911 | 1021 | struct list_head *list; |
1022 | + int ret = 0; | |
912 | 1023 | |
913 | 1024 | list = &mz->lists[lru]; |
914 | 1025 | |
915 | - spin_lock_irqsave(&mz->lru_lock, flags); | |
916 | - while (!list_empty(list)) { | |
917 | - pc = list_entry(list->prev, struct page_cgroup, lru); | |
918 | - page = pc->page; | |
919 | - if (!PageCgroupUsed(pc)) | |
1026 | + loop = MEM_CGROUP_ZSTAT(mz, lru); | |
1027 | + /* give some margin against EBUSY etc...*/ | |
1028 | + loop += 256; | |
1029 | + busy = NULL; | |
1030 | + while (loop--) { | |
1031 | + ret = 0; | |
1032 | + spin_lock_irqsave(&mz->lru_lock, flags); | |
1033 | + if (list_empty(list)) { | |
1034 | + spin_unlock_irqrestore(&mz->lru_lock, flags); | |
920 | 1035 | break; |
921 | - get_page(page); | |
1036 | + } | |
1037 | + pc = list_entry(list->prev, struct page_cgroup, lru); | |
1038 | + if (busy == pc) { | |
1039 | + list_move(&pc->lru, list); | |
1040 | + busy = 0; | |
1041 | + spin_unlock_irqrestore(&mz->lru_lock, flags); | |
1042 | + continue; | |
1043 | + } | |
922 | 1044 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
923 | - /* | |
924 | - * Check if this page is on LRU. !LRU page can be found | |
925 | - * if it's under page migration. | |
926 | - */ | |
927 | - if (PageLRU(page)) { | |
928 | - __mem_cgroup_uncharge_common(page, | |
929 | - MEM_CGROUP_CHARGE_TYPE_FORCE); | |
930 | - put_page(page); | |
931 | - if (--count <= 0) { | |
932 | - count = FORCE_UNCHARGE_BATCH; | |
933 | - cond_resched(); | |
934 | - } | |
935 | - } else { | |
936 | - spin_lock_irqsave(&mz->lru_lock, flags); | |
1045 | + | |
1046 | + ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); | |
1047 | + if (ret == -ENOMEM) | |
937 | 1048 | break; |
938 | - } | |
939 | - spin_lock_irqsave(&mz->lru_lock, flags); | |
1049 | + | |
1050 | + if (ret == -EBUSY || ret == -EINVAL) { | |
1051 | + /* found lock contention or "pc" is obsolete. */ | |
1052 | + busy = pc; | |
1053 | + cond_resched(); | |
1054 | + } else | |
1055 | + busy = NULL; | |
940 | 1056 | } |
941 | - spin_unlock_irqrestore(&mz->lru_lock, flags); | |
1057 | + if (!ret && !list_empty(list)) | |
1058 | + return -EBUSY; | |
1059 | + return ret; | |
942 | 1060 | } |
943 | 1061 | |
944 | 1062 | /* |
945 | 1063 | |
946 | 1064 | |
947 | 1065 | |
948 | 1066 | |
949 | 1067 | |
950 | 1068 | |
951 | 1069 | |
... | ... | @@ -947,34 +1065,68 @@ |
947 | 1065 | */ |
948 | 1066 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) |
949 | 1067 | { |
950 | - int ret = -EBUSY; | |
951 | - int node, zid; | |
1068 | + int ret; | |
1069 | + int node, zid, shrink; | |
1070 | + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | |
952 | 1071 | |
953 | 1072 | css_get(&mem->css); |
954 | - /* | |
955 | - * page reclaim code (kswapd etc..) will move pages between | |
956 | - * active_list <-> inactive_list while we don't take a lock. | |
957 | - * So, we have to do loop here until all lists are empty. | |
958 | - */ | |
1073 | + | |
1074 | + shrink = 0; | |
1075 | +move_account: | |
959 | 1076 | while (mem->res.usage > 0) { |
1077 | + ret = -EBUSY; | |
960 | 1078 | if (atomic_read(&mem->css.cgroup->count) > 0) |
961 | 1079 | goto out; |
1080 | + | |
962 | 1081 | /* This is for making all *used* pages to be on LRU. */ |
963 | 1082 | lru_add_drain_all(); |
964 | - for_each_node_state(node, N_POSSIBLE) | |
965 | - for (zid = 0; zid < MAX_NR_ZONES; zid++) { | |
1083 | + ret = 0; | |
1084 | + for_each_node_state(node, N_POSSIBLE) { | |
1085 | + for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | |
966 | 1086 | struct mem_cgroup_per_zone *mz; |
967 | 1087 | enum lru_list l; |
968 | 1088 | mz = mem_cgroup_zoneinfo(mem, node, zid); |
969 | - for_each_lru(l) | |
970 | - mem_cgroup_force_empty_list(mem, mz, l); | |
1089 | + for_each_lru(l) { | |
1090 | + ret = mem_cgroup_force_empty_list(mem, | |
1091 | + mz, l); | |
1092 | + if (ret) | |
1093 | + break; | |
1094 | + } | |
971 | 1095 | } |
1096 | + if (ret) | |
1097 | + break; | |
1098 | + } | |
1099 | + /* it seems parent cgroup doesn't have enough mem */ | |
1100 | + if (ret == -ENOMEM) | |
1101 | + goto try_to_free; | |
972 | 1102 | cond_resched(); |
973 | 1103 | } |
974 | 1104 | ret = 0; |
975 | 1105 | out: |
976 | 1106 | css_put(&mem->css); |
977 | 1107 | return ret; |
1108 | + | |
1109 | +try_to_free: | |
1110 | + /* returns EBUSY if we come here twice. */ | |
1111 | + if (shrink) { | |
1112 | + ret = -EBUSY; | |
1113 | + goto out; | |
1114 | + } | |
1115 | + /* try to free all pages in this cgroup */ | |
1116 | + shrink = 1; | |
1117 | + while (nr_retries && mem->res.usage > 0) { | |
1118 | + int progress; | |
1119 | + progress = try_to_free_mem_cgroup_pages(mem, | |
1120 | + GFP_HIGHUSER_MOVABLE); | |
1121 | + if (!progress) | |
1122 | + nr_retries--; | |
1123 | + | |
1124 | + } | |
1125 | + /* try move_account...there may be some *locked* pages. */ | |
1126 | + if (mem->res.usage) | |
1127 | + goto move_account; | |
1128 | + ret = 0; | |
1129 | + goto out; | |
978 | 1130 | } |
979 | 1131 | |
980 | 1132 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
... | ... | @@ -1023,11 +1175,6 @@ |
1023 | 1175 | return 0; |
1024 | 1176 | } |
1025 | 1177 | |
1026 | -static int mem_force_empty_write(struct cgroup *cont, unsigned int event) | |
1027 | -{ | |
1028 | - return mem_cgroup_force_empty(mem_cgroup_from_cont(cont)); | |
1029 | -} | |
1030 | - | |
1031 | 1178 | static const struct mem_cgroup_stat_desc { |
1032 | 1179 | const char *msg; |
1033 | 1180 | u64 unit; |
... | ... | @@ -1102,10 +1249,6 @@ |
1102 | 1249 | .private = RES_FAILCNT, |
1103 | 1250 | .trigger = mem_cgroup_reset, |
1104 | 1251 | .read_u64 = mem_cgroup_read, |
1105 | - }, | |
1106 | - { | |
1107 | - .name = "force_empty", | |
1108 | - .trigger = mem_force_empty_write, | |
1109 | 1252 | }, |
1110 | 1253 | { |
1111 | 1254 | .name = "stat", |