Commit 6d12e2d8ddbe653d80ea4f71578481c1bc933025
Committed by
Linus Torvalds
1 parent
c0149530d0
Exists in
master
and in
4 other branches
per-zone and reclaim enhancements for memory controller: per-zone active inactive counter
This patch adds per-zone status in memory cgroup. These values are often read (as per-zone values) by page reclaiming. In the current design, a per-zone stat is just an unsigned long value, not an atomic value, because these values are modified only under lru_lock. (So, atomic ops are not necessary.) This patch adds ACTIVE and INACTIVE per-zone status values. For handling per-zone status, this patch adds struct mem_cgroup_per_zone { ... } and some helper functions. This will be useful for adding per-zone objects to mem_cgroup. This patch sets the memory controller's early_init to 0 so that kmalloc() can be called during initialization. Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: David Rientjes <rientjes@google.com> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Kirill Korotaev <dev@sw.ru> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Paul Menage <menage@google.com> Cc: Pavel Emelianov <xemul@openvz.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 154 additions and 7 deletions Side-by-side Diff
mm/memcontrol.c
... | ... | @@ -78,6 +78,31 @@ |
78 | 78 | } |
79 | 79 | |
80 | 80 | /* |
81 | + * per-zone information in memory controller. | |
82 | + */ | |
83 | + | |
84 | +enum mem_cgroup_zstat_index { | |
85 | + MEM_CGROUP_ZSTAT_ACTIVE, | |
86 | + MEM_CGROUP_ZSTAT_INACTIVE, | |
87 | + | |
88 | + NR_MEM_CGROUP_ZSTAT, | |
89 | +}; | |
90 | + | |
91 | +struct mem_cgroup_per_zone { | |
92 | + unsigned long count[NR_MEM_CGROUP_ZSTAT]; | |
93 | +}; | |
94 | +/* Macro for accessing counter */ | |
95 | +#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | |
96 | + | |
97 | +struct mem_cgroup_per_node { | |
98 | + struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | |
99 | +}; | |
100 | + | |
101 | +struct mem_cgroup_lru_info { | |
102 | + struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; | |
103 | +}; | |
104 | + | |
105 | +/* | |
81 | 106 | * The memory controller data structure. The memory controller controls both |
82 | 107 | * page cache and RSS per cgroup. We would eventually like to provide |
83 | 108 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
... | ... | @@ -101,6 +126,7 @@ |
101 | 126 | */ |
102 | 127 | struct list_head active_list; |
103 | 128 | struct list_head inactive_list; |
129 | + struct mem_cgroup_lru_info info; | |
104 | 130 | /* |
105 | 131 | * spin_lock to protect the per cgroup LRU |
106 | 132 | */ |
... | ... | @@ -158,6 +184,7 @@ |
158 | 184 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
159 | 185 | }; |
160 | 186 | |
187 | + | |
161 | 188 | /* |
162 | 189 | * Always modified under lru lock. Then, not necessary to preempt_disable() |
163 | 190 | */ |
164 | 191 | |
165 | 192 | |
... | ... | @@ -173,9 +200,40 @@ |
173 | 200 | MEM_CGROUP_STAT_CACHE, val); |
174 | 201 | else |
175 | 202 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); |
203 | +} | |
176 | 204 | |
205 | +static inline struct mem_cgroup_per_zone * | |
206 | +mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | |
207 | +{ | |
208 | + BUG_ON(!mem->info.nodeinfo[nid]); | |
209 | + return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | |
177 | 210 | } |
178 | 211 | |
212 | +static inline struct mem_cgroup_per_zone * | |
213 | +page_cgroup_zoneinfo(struct page_cgroup *pc) | |
214 | +{ | |
215 | + struct mem_cgroup *mem = pc->mem_cgroup; | |
216 | + int nid = page_cgroup_nid(pc); | |
217 | + int zid = page_cgroup_zid(pc); | |
218 | + | |
219 | + return mem_cgroup_zoneinfo(mem, nid, zid); | |
220 | +} | |
221 | + | |
222 | +static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, | |
223 | + enum mem_cgroup_zstat_index idx) | |
224 | +{ | |
225 | + int nid, zid; | |
226 | + struct mem_cgroup_per_zone *mz; | |
227 | + u64 total = 0; | |
228 | + | |
229 | + for_each_online_node(nid) | |
230 | + for (zid = 0; zid < MAX_NR_ZONES; zid++) { | |
231 | + mz = mem_cgroup_zoneinfo(mem, nid, zid); | |
232 | + total += MEM_CGROUP_ZSTAT(mz, idx); | |
233 | + } | |
234 | + return total; | |
235 | +} | |
236 | + | |
179 | 237 | static struct mem_cgroup init_mem_cgroup; |
180 | 238 | |
181 | 239 | static inline |
182 | 240 | |
183 | 241 | |
184 | 242 | |
... | ... | @@ -286,12 +344,51 @@ |
286 | 344 | return ret; |
287 | 345 | } |
288 | 346 | |
347 | +static void __mem_cgroup_remove_list(struct page_cgroup *pc) | |
348 | +{ | |
349 | + int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | |
350 | + struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | |
351 | + | |
352 | + if (from) | |
353 | + MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | |
354 | + else | |
355 | + MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | |
356 | + | |
357 | + mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); | |
358 | + list_del_init(&pc->lru); | |
359 | +} | |
360 | + | |
361 | +static void __mem_cgroup_add_list(struct page_cgroup *pc) | |
362 | +{ | |
363 | + int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | |
364 | + struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | |
365 | + | |
366 | + if (!to) { | |
367 | + MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | |
368 | + list_add(&pc->lru, &pc->mem_cgroup->inactive_list); | |
369 | + } else { | |
370 | + MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | |
371 | + list_add(&pc->lru, &pc->mem_cgroup->active_list); | |
372 | + } | |
373 | + mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); | |
374 | +} | |
375 | + | |
289 | 376 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) |
290 | 377 | { |
378 | + int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | |
379 | + struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | |
380 | + | |
381 | + if (from) | |
382 | + MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | |
383 | + else | |
384 | + MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | |
385 | + | |
291 | 386 | if (active) { |
387 | + MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | |
292 | 388 | pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; |
293 | 389 | list_move(&pc->lru, &pc->mem_cgroup->active_list); |
294 | 390 | } else { |
391 | + MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | |
295 | 392 | pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; |
296 | 393 | list_move(&pc->lru, &pc->mem_cgroup->inactive_list); |
297 | 394 | } |
... | ... | @@ -501,8 +598,7 @@ |
501 | 598 | |
502 | 599 | spin_lock_irqsave(&mem->lru_lock, flags); |
503 | 600 | /* Update statistics vector */ |
504 | - mem_cgroup_charge_statistics(mem, pc->flags, true); | |
505 | - list_add(&pc->lru, &mem->active_list); | |
601 | + __mem_cgroup_add_list(pc); | |
506 | 602 | spin_unlock_irqrestore(&mem->lru_lock, flags); |
507 | 603 | |
508 | 604 | done: |
509 | 605 | |
... | ... | @@ -571,13 +667,13 @@ |
571 | 667 | css_put(&mem->css); |
572 | 668 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
573 | 669 | spin_lock_irqsave(&mem->lru_lock, flags); |
574 | - list_del_init(&pc->lru); | |
575 | - mem_cgroup_charge_statistics(mem, pc->flags, false); | |
670 | + __mem_cgroup_remove_list(pc); | |
576 | 671 | spin_unlock_irqrestore(&mem->lru_lock, flags); |
577 | 672 | kfree(pc); |
578 | 673 | } |
579 | 674 | } |
580 | 675 | } |
676 | + | |
581 | 677 | /* |
582 | 678 | * Returns non-zero if a page (under migration) has valid page_cgroup member. |
583 | 679 | * Refcnt of page_cgroup is incremented. |
584 | 680 | |
585 | 681 | |
586 | 682 | |
... | ... | @@ -609,16 +705,26 @@ |
609 | 705 | void mem_cgroup_page_migration(struct page *page, struct page *newpage) |
610 | 706 | { |
611 | 707 | struct page_cgroup *pc; |
708 | + struct mem_cgroup *mem; | |
709 | + unsigned long flags; | |
612 | 710 | retry: |
613 | 711 | pc = page_get_page_cgroup(page); |
614 | 712 | if (!pc) |
615 | 713 | return; |
714 | + mem = pc->mem_cgroup; | |
616 | 715 | if (clear_page_cgroup(page, pc) != pc) |
617 | 716 | goto retry; |
717 | + | |
718 | + spin_lock_irqsave(&mem->lru_lock, flags); | |
719 | + | |
720 | + __mem_cgroup_remove_list(pc); | |
618 | 721 | pc->page = newpage; |
619 | 722 | lock_page_cgroup(newpage); |
620 | 723 | page_assign_page_cgroup(newpage, pc); |
621 | 724 | unlock_page_cgroup(newpage); |
725 | + __mem_cgroup_add_list(pc); | |
726 | + | |
727 | + spin_unlock_irqrestore(&mem->lru_lock, flags); | |
622 | 728 | return; |
623 | 729 | } |
624 | 730 | |
... | ... | @@ -648,8 +754,7 @@ |
648 | 754 | if (clear_page_cgroup(page, pc) == pc) { |
649 | 755 | css_put(&mem->css); |
650 | 756 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
651 | - list_del_init(&pc->lru); | |
652 | - mem_cgroup_charge_statistics(mem, pc->flags, false); | |
757 | + __mem_cgroup_remove_list(pc); | |
653 | 758 | kfree(pc); |
654 | 759 | } else /* being uncharged ? ...do relax */ |
655 | 760 | break; |
... | ... | @@ -828,6 +933,17 @@ |
828 | 933 | seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, |
829 | 934 | (long long)val); |
830 | 935 | } |
936 | + /* showing # of active pages */ | |
937 | + { | |
938 | + unsigned long active, inactive; | |
939 | + | |
940 | + inactive = mem_cgroup_get_all_zonestat(mem_cont, | |
941 | + MEM_CGROUP_ZSTAT_INACTIVE); | |
942 | + active = mem_cgroup_get_all_zonestat(mem_cont, | |
943 | + MEM_CGROUP_ZSTAT_ACTIVE); | |
944 | + seq_printf(m, "active %ld\n", (active) * PAGE_SIZE); | |
945 | + seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE); | |
946 | + } | |
831 | 947 | return 0; |
832 | 948 | } |
833 | 949 | |
834 | 950 | |
... | ... | @@ -881,12 +997,25 @@ |
881 | 997 | }, |
882 | 998 | }; |
883 | 999 | |
1000 | +static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |
1001 | +{ | |
1002 | + struct mem_cgroup_per_node *pn; | |
1003 | + | |
1004 | + pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node); | |
1005 | + if (!pn) | |
1006 | + return 1; | |
1007 | + mem->info.nodeinfo[node] = pn; | |
1008 | + memset(pn, 0, sizeof(*pn)); | |
1009 | + return 0; | |
1010 | +} | |
1011 | + | |
884 | 1012 | static struct mem_cgroup init_mem_cgroup; |
885 | 1013 | |
886 | 1014 | static struct cgroup_subsys_state * |
887 | 1015 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
888 | 1016 | { |
889 | 1017 | struct mem_cgroup *mem; |
1018 | + int node; | |
890 | 1019 | |
891 | 1020 | if (unlikely((cont->parent) == NULL)) { |
892 | 1021 | mem = &init_mem_cgroup; |
893 | 1022 | |
... | ... | @@ -902,7 +1031,19 @@ |
902 | 1031 | INIT_LIST_HEAD(&mem->inactive_list); |
903 | 1032 | spin_lock_init(&mem->lru_lock); |
904 | 1033 | mem->control_type = MEM_CGROUP_TYPE_ALL; |
1034 | + memset(&mem->info, 0, sizeof(mem->info)); | |
1035 | + | |
1036 | + for_each_node_state(node, N_POSSIBLE) | |
1037 | + if (alloc_mem_cgroup_per_zone_info(mem, node)) | |
1038 | + goto free_out; | |
1039 | + | |
905 | 1040 | return &mem->css; |
1041 | +free_out: | |
1042 | + for_each_node_state(node, N_POSSIBLE) | |
1043 | + kfree(mem->info.nodeinfo[node]); | |
1044 | + if (cont->parent != NULL) | |
1045 | + kfree(mem); | |
1046 | + return NULL; | |
906 | 1047 | } |
907 | 1048 | |
908 | 1049 | static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, |
... | ... | @@ -915,6 +1056,12 @@ |
915 | 1056 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
916 | 1057 | struct cgroup *cont) |
917 | 1058 | { |
1059 | + int node; | |
1060 | + struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | |
1061 | + | |
1062 | + for_each_node_state(node, N_POSSIBLE) | |
1063 | + kfree(mem->info.nodeinfo[node]); | |
1064 | + | |
918 | 1065 | kfree(mem_cgroup_from_cont(cont)); |
919 | 1066 | } |
920 | 1067 | |
... | ... | @@ -967,6 +1114,6 @@ |
967 | 1114 | .destroy = mem_cgroup_destroy, |
968 | 1115 | .populate = mem_cgroup_populate, |
969 | 1116 | .attach = mem_cgroup_move_task, |
970 | - .early_init = 1, | |
1117 | + .early_init = 0, | |
971 | 1118 | }; |