Commit 6d12e2d8ddbe653d80ea4f71578481c1bc933025

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent c0149530d0

per-zone and reclaim enhancements for memory controller: per-zone active inactive counter

This patch adds per-zone status to the memory cgroup.  These values are read
frequently (as per-zone values) by the page reclaim path.

In the current design, each per-zone stat is a plain unsigned long rather than
an atomic value, because the counters are modified only under lru_lock.  (So,
atomic ops are not necessary.)
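
As a minimal sketch of that discipline (illustrative only; zstat_add() is a
hypothetical helper, not part of this patch), every writer follows this
pattern:

  /* Hypothetical helper, for illustration only. */
  static void zstat_add(struct mem_cgroup *mem, struct mem_cgroup_per_zone *mz,
                        enum mem_cgroup_zstat_index idx, long delta)
  {
          unsigned long flags;

          spin_lock_irqsave(&mem->lru_lock, flags);
          /* Plain, non-atomic arithmetic; lru_lock serializes all writers. */
          MEM_CGROUP_ZSTAT(mz, idx) += delta;
          spin_unlock_irqrestore(&mem->lru_lock, flags);
  }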

This patch adds ACTIVE and INACTIVE per-zone status values.

To handle per-zone status, this patch adds
  struct mem_cgroup_per_zone {
		...
  }
and some helper functions. This will be useful for adding further per-zone
objects to mem_cgroup.
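
For orientation, a per-zone counter is reached as
mem->info.nodeinfo[nid]->zoneinfo[zid].count[idx].  A hedged reader sketch
(zstat_inactive() is hypothetical; mem_cgroup_zoneinfo() and
MEM_CGROUP_ZSTAT() are the helpers added by the diff below):

  /*
   * Hypothetical reader: number of inactive pages this cgroup has on
   * node "nid", zone "zid". The value may be stale unless the caller
   * holds mem->lru_lock.
   */
  static unsigned long zstat_inactive(struct mem_cgroup *mem, int nid, int zid)
  {
          struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

          return MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
  }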

This patch changes the memory controller's early_init to 0 so that kmalloc()
can be called during initialization.
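
The ordering constraint behind this, sketched against the generic boot
sequence (an assumption-level summary, not patch code):

  /*
   * start_kernel()
   *   cgroup_init_early()   <- creates subsystems with early_init == 1;
   *                            slab is not up, so the kmalloc_node() in
   *                            alloc_mem_cgroup_per_zone_info() cannot run
   *   mem_init(), kmem_cache_init()
   *   cgroup_init()         <- creates early_init == 0 subsystems;
   *                            kmalloc()/kmalloc_node() are now available
   */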

Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Paul Menage <menage@google.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file, mm/memcontrol.c, with 154 additions and 7 deletions

@@ -78,6 +78,31 @@
 }
 
 /*
+ * per-zone information in memory controller.
+ */
+
+enum mem_cgroup_zstat_index {
+	MEM_CGROUP_ZSTAT_ACTIVE,
+	MEM_CGROUP_ZSTAT_INACTIVE,
+
+	NR_MEM_CGROUP_ZSTAT,
+};
+
+struct mem_cgroup_per_zone {
+	unsigned long count[NR_MEM_CGROUP_ZSTAT];
+};
+/* Macro for accessing counter */
+#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
+
+struct mem_cgroup_per_node {
+	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_lru_info {
+	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
+};
+
+/*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -101,6 +126,7 @@
 	 */
 	struct list_head active_list;
 	struct list_head inactive_list;
+	struct mem_cgroup_lru_info info;
 	/*
 	 * spin_lock to protect the per cgroup LRU
 	 */
@@ -158,6 +184,7 @@
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
 };
 
+
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
@@ -173,9 +200,40 @@
 					MEM_CGROUP_STAT_CACHE, val);
 	else
 		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
+}
 
+static inline struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+	BUG_ON(!mem->info.nodeinfo[nid]);
+	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
 }
 
+static inline struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem = pc->mem_cgroup;
+	int nid = page_cgroup_nid(pc);
+	int zid = page_cgroup_zid(pc);
+
+	return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
+					enum mem_cgroup_zstat_index idx)
+{
+	int nid, zid;
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+
+	for_each_online_node(nid)
+		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+			mz = mem_cgroup_zoneinfo(mem, nid, zid);
+			total += MEM_CGROUP_ZSTAT(mz, idx);
+		}
+	return total;
+}
+
 static struct mem_cgroup init_mem_cgroup;
 
 static inline
@@ -286,12 +344,51 @@
 	return ret;
 }
 
+static void __mem_cgroup_remove_list(struct page_cgroup *pc)
+{
+	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+
+	if (from)
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
+	else
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+
+	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
+	list_del_init(&pc->lru);
+}
+
+static void __mem_cgroup_add_list(struct page_cgroup *pc)
+{
+	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+
+	if (!to) {
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
+		list_add(&pc->lru, &pc->mem_cgroup->inactive_list);
+	} else {
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
+		list_add(&pc->lru, &pc->mem_cgroup->active_list);
+	}
+	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
+}
+
 static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
 {
+	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+
+	if (from)
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
+	else
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+
 	if (active) {
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
 		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
 		list_move(&pc->lru, &pc->mem_cgroup->active_list);
 	} else {
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
 		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
 		list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
 	}
@@ -501,8 +598,7 @@
 
 	spin_lock_irqsave(&mem->lru_lock, flags);
 	/* Update statistics vector */
-	mem_cgroup_charge_statistics(mem, pc->flags, true);
-	list_add(&pc->lru, &mem->active_list);
+	__mem_cgroup_add_list(pc);
 	spin_unlock_irqrestore(&mem->lru_lock, flags);
 
 done:
@@ -571,13 +667,13 @@
 			css_put(&mem->css);
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			spin_lock_irqsave(&mem->lru_lock, flags);
-			list_del_init(&pc->lru);
-			mem_cgroup_charge_statistics(mem, pc->flags, false);
+			__mem_cgroup_remove_list(pc);
 			spin_unlock_irqrestore(&mem->lru_lock, flags);
 			kfree(pc);
 		}
 	}
 }
+
 /*
  * Returns non-zero if a page (under migration) has valid page_cgroup member.
  * Refcnt of page_cgroup is incremented.
@@ -609,16 +705,26 @@
 void mem_cgroup_page_migration(struct page *page, struct page *newpage)
 {
 	struct page_cgroup *pc;
+	struct mem_cgroup *mem;
+	unsigned long flags;
 retry:
 	pc = page_get_page_cgroup(page);
 	if (!pc)
 		return;
+	mem = pc->mem_cgroup;
 	if (clear_page_cgroup(page, pc) != pc)
 		goto retry;
+
+	spin_lock_irqsave(&mem->lru_lock, flags);
+
+	__mem_cgroup_remove_list(pc);
 	pc->page = newpage;
 	lock_page_cgroup(newpage);
 	page_assign_page_cgroup(newpage, pc);
 	unlock_page_cgroup(newpage);
+	__mem_cgroup_add_list(pc);
+
+	spin_unlock_irqrestore(&mem->lru_lock, flags);
 	return;
 }
 
@@ -648,8 +754,7 @@
 		if (clear_page_cgroup(page, pc) == pc) {
 			css_put(&mem->css);
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
-			list_del_init(&pc->lru);
-			mem_cgroup_charge_statistics(mem, pc->flags, false);
+			__mem_cgroup_remove_list(pc);
 			kfree(pc);
 		} else	/* being uncharged ? ...do relax */
 			break;
@@ -828,6 +933,17 @@
 		seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
 				(long long)val);
 	}
+	/* showing # of active pages */
+	{
+		unsigned long active, inactive;
+
+		inactive = mem_cgroup_get_all_zonestat(mem_cont,
+						MEM_CGROUP_ZSTAT_INACTIVE);
+		active = mem_cgroup_get_all_zonestat(mem_cont,
+						MEM_CGROUP_ZSTAT_ACTIVE);
+		seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
+		seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
+	}
 	return 0;
 }
 
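
With the hunk above, the controller's stat file reports the new totals.  A
hedged example of the resulting output (mount path and numbers are purely
illustrative):

  # cat /cgroups/0/memory.stat
  cache 94208
  rss 3096576
  active 2893824
  inactive 196608

Both new values are reported in bytes (page counts multiplied by PAGE_SIZE),
summed over all online nodes and zones by mem_cgroup_get_all_zonestat().
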
@@ -881,12 +997,25 @@
 	},
 };
 
+static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+{
+	struct mem_cgroup_per_node *pn;
+
+	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
+	if (!pn)
+		return 1;
+	mem->info.nodeinfo[node] = pn;
+	memset(pn, 0, sizeof(*pn));
+	return 0;
+}
+
 static struct mem_cgroup init_mem_cgroup;
 
 static struct cgroup_subsys_state *
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct mem_cgroup *mem;
+	int node;
 
 	if (unlikely((cont->parent) == NULL)) {
 		mem = &init_mem_cgroup;
@@ -902,7 +1031,19 @@
 	INIT_LIST_HEAD(&mem->inactive_list);
 	spin_lock_init(&mem->lru_lock);
 	mem->control_type = MEM_CGROUP_TYPE_ALL;
+	memset(&mem->info, 0, sizeof(mem->info));
+
+	for_each_node_state(node, N_POSSIBLE)
+		if (alloc_mem_cgroup_per_zone_info(mem, node))
+			goto free_out;
+
 	return &mem->css;
+free_out:
+	for_each_node_state(node, N_POSSIBLE)
+		kfree(mem->info.nodeinfo[node]);
+	if (cont->parent != NULL)
+		kfree(mem);
+	return NULL;
 }
 
 static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
@@ -915,6 +1056,12 @@
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 				struct cgroup *cont)
 {
+	int node;
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+	for_each_node_state(node, N_POSSIBLE)
+		kfree(mem->info.nodeinfo[node]);
+
 	kfree(mem_cgroup_from_cont(cont));
 }
 
@@ -967,6 +1114,6 @@
 	.destroy = mem_cgroup_destroy,
 	.populate = mem_cgroup_populate,
 	.attach = mem_cgroup_move_task,
-	.early_init = 1,
+	.early_init = 0,
 };