Commit c62b1a3b31b5e27a6c5c2e91cc5ce05fdb6344d0

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent 6a6135b64f

memcg: use generic percpu instead of private implementation

When the per-cpu counter for memcg was implemented, the dynamic percpu
allocator was not very good.  Now we have a good one and useful macros.
This patch replaces memcg's private percpu counter implementation with the
generic dynamic percpu allocator.
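
For reference, the generic percpu pattern this conversion moves to looks
roughly like the sketch below.  This is only an illustration with made-up
'foo' names and FOO_NSTATS; the real memcg changes are in the diff further
down.

/* Hypothetical sketch of the generic dynamic percpu counter pattern. */
struct foo_stat_cpu {
	s64 count[FOO_NSTATS];		/* each cpu gets its own copy */
};

struct foo {
	struct foo_stat_cpu *stat;	/* percpu pointer, not an embedded array */
};

static int foo_init(struct foo *f)
{
	/* NUMA-aware: each cpu's copy lives on that cpu's local node */
	f->stat = alloc_percpu(struct foo_stat_cpu);
	return f->stat ? 0 : -ENOMEM;
}

static void foo_add(struct foo *f, int idx, int val)
{
	this_cpu_add(f->stat->count[idx], val);	/* update this cpu's copy */
}

static s64 foo_read(struct foo *f, int idx)
{
	int cpu;
	s64 val = 0;

	for_each_possible_cpu(cpu)	/* readers sum every cpu's copy */
		val += per_cpu(f->stat->count[idx], cpu);
	return val;
}

static void foo_free(struct foo *f)
{
	free_percpu(f->stat);
}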

The benefits are
	- We can remove the private implementation.
	- The counters become NUMA-aware. (The current ones are not.)
	- sizeof(struct mem_cgroup) gets smaller, so struct mem_cgroup may
	  fit into a page on small configs.
	- For basic performance numbers, see below.

 [Before]
 # size mm/memcontrol.o
   text    data     bss     dec     hex filename
  24373    2528    4132   31033    7939 mm/memcontrol.o

 [page-fault-throughput test on 8-cpu SMP, in the root cgroup]
 # /root/bin/perf stat -a -e page-faults,cache-misses --repeat 5 ./multi-fault-fork 8

 Performance counter stats for './multi-fault-fork 8' (5 runs):

       45878618  page-faults                ( +-   0.110% )
      602635826  cache-misses               ( +-   0.105% )

   61.005373262  seconds time elapsed   ( +-   0.004% )

 Then cache-misses per page fault = 602635826 / 45878618 = 13.14

 [After]
 # size mm/memcontrol.o
   text    data     bss     dec     hex filename
  23913    2528    4132   30573    776d mm/memcontrol.o
 # /root/bin/perf stat -a -e page-faults,cache-misses --repeat 5 ./multi-fault-fork 8

 Performance counter stats for './multi-fault-fork 8' (5 runs):

       48179400  page-faults                ( +-   0.271% )
      588628407  cache-misses               ( +-   0.136% )

   61.004615021  seconds time elapsed   ( +-   0.004% )

 Then cache-misses per page fault = 588628407 / 48179400 = 12.22

 Text size is reduced.
 The performance improvement is small and will be invisible in real-world
 applications, but the result shows that the patch has some positive effect
 even on (small) SMP.

Here is a test program I used.

 1. fork() a process on each cpu.
 2. do page faults repeatedly in each process.
 3. after 60 seconds, kill all children and exit.

(Step 3 is necessary for getting stable data; this is an improvement over
the previous version.)

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * To avoid contention on the page table lock, the FAULT area is
 * sparse.  If FAULT_LENGTH is too large for your cpus, decrease it.
 */
#define FAULT_LENGTH	(2 * 1024 * 1024)
#define PAGE_SIZE	4096
#define MAXNUM		(128)

void alarm_handler(int sig)
{
}

void *worker(int cpu, int ppid)
{
	void *start, *end;
	char *c;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);

	/* MAP_ANONYMOUS: pass fd == -1 for portability */
	start = mmap(NULL, FAULT_LENGTH, PROT_READ|PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (start == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	end = start + FAULT_LENGTH;

	pause();
	//fprintf(stderr, "run%d", cpu);
	while (1) {
		for (c = (char*)start; (void *)c < end; c += PAGE_SIZE)
			*c = 0;
		madvise(start, FAULT_LENGTH, MADV_DONTNEED);
	}
	return NULL;
}

int main(int argc, char *argv[])
{
	int num, i, ret, pid, status;
	int pids[MAXNUM];

	if (argc < 2)
		return 0;

	setpgid(0, 0);
	signal(SIGALRM, alarm_handler);
	num = atoi(argv[1]);
	if (num > MAXNUM)	/* keep within pids[] bounds */
		num = MAXNUM;
	pid = getpid();

	for (i = 0; i < num; ++i) {
		ret = fork();
		if (!ret) {
			worker(i, pid);
			exit(0);
		}
		pids[i] = ret;
	}
	sleep(1);
	kill(-pid, SIGALRM);
	sleep(60);
	for (i = 0; i < num; i++)
		kill(pids[i], SIGKILL);
	for (i = 0; i < num; i++)
		waitpid(pids[i], &status, 0);
	return 0;
}

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 63 additions and 121 deletions

... ... @@ -89,55 +89,9 @@
89 89  
90 90 struct mem_cgroup_stat_cpu {
91 91 s64 count[MEM_CGROUP_STAT_NSTATS];
92   -} ____cacheline_aligned_in_smp;
93   -
94   -struct mem_cgroup_stat {
95   - struct mem_cgroup_stat_cpu cpustat[0];
96 92 };
97 93  
98   -static inline void
99   -__mem_cgroup_stat_set_safe(struct mem_cgroup_stat_cpu *stat,
100   - enum mem_cgroup_stat_index idx, s64 val)
101   -{
102   - stat->count[idx] = val;
103   -}
104   -
105   -static inline s64
106   -__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
107   - enum mem_cgroup_stat_index idx)
108   -{
109   - return stat->count[idx];
110   -}
111   -
112 94 /*
113   - * For accounting under irq disable, no need for increment preempt count.
114   - */
115   -static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
116   - enum mem_cgroup_stat_index idx, int val)
117   -{
118   - stat->count[idx] += val;
119   -}
120   -
121   -static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
122   - enum mem_cgroup_stat_index idx)
123   -{
124   - int cpu;
125   - s64 ret = 0;
126   - for_each_possible_cpu(cpu)
127   - ret += stat->cpustat[cpu].count[idx];
128   - return ret;
129   -}
130   -
131   -static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
132   -{
133   - s64 ret;
134   -
135   - ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
136   - ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
137   - return ret;
138   -}
139   -
140   -/*
141 95 * per-zone information in memory controller.
142 96 */
143 97 struct mem_cgroup_per_zone {
144 98  
... ... @@ -270,9 +224,9 @@
270 224 unsigned long move_charge_at_immigrate;
271 225  
272 226 /*
273   - * statistics. This must be placed at the end of memcg.
  227 + * percpu counter.
274 228 */
275   - struct mem_cgroup_stat stat;
  229 + struct mem_cgroup_stat_cpu *stat;
276 230 };
277 231  
278 232 /* Stuffs for move charges at task migration. */
... ... @@ -441,19 +395,14 @@
441 395 static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
442 396 {
443 397 bool ret = false;
444   - int cpu;
445 398 s64 val;
446   - struct mem_cgroup_stat_cpu *cpustat;
447 399  
448   - cpu = get_cpu();
449   - cpustat = &mem->stat.cpustat[cpu];
450   - val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_SOFTLIMIT);
  400 + val = this_cpu_read(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT]);
451 401 if (unlikely(val < 0)) {
452   - __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT,
  402 + this_cpu_write(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT],
453 403 SOFTLIMIT_EVENTS_THRESH);
454 404 ret = true;
455 405 }
456   - put_cpu();
457 406 return ret;
458 407 }
459 408  
... ... @@ -549,17 +498,31 @@
549 498 return mz;
550 499 }
551 500  
  501 +static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
  502 + enum mem_cgroup_stat_index idx)
  503 +{
  504 + int cpu;
  505 + s64 val = 0;
  506 +
  507 + for_each_possible_cpu(cpu)
  508 + val += per_cpu(mem->stat->count[idx], cpu);
  509 + return val;
  510 +}
  511 +
  512 +static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
  513 +{
  514 + s64 ret;
  515 +
  516 + ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
  517 + ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
  518 + return ret;
  519 +}
  520 +
552 521 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
553 522 bool charge)
554 523 {
555 524 int val = (charge) ? 1 : -1;
556   - struct mem_cgroup_stat *stat = &mem->stat;
557   - struct mem_cgroup_stat_cpu *cpustat;
558   - int cpu = get_cpu();
559   -
560   - cpustat = &stat->cpustat[cpu];
561   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
562   - put_cpu();
  525 + this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
563 526 }
564 527  
565 528 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
... ... @@ -567,26 +530,22 @@
567 530 bool charge)
568 531 {
569 532 int val = (charge) ? 1 : -1;
570   - struct mem_cgroup_stat *stat = &mem->stat;
571   - struct mem_cgroup_stat_cpu *cpustat;
572   - int cpu = get_cpu();
573 533  
574   - cpustat = &stat->cpustat[cpu];
  534 + preempt_disable();
  535 +
575 536 if (PageCgroupCache(pc))
576   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
  537 + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
577 538 else
578   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
  539 + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
579 540  
580 541 if (charge)
581   - __mem_cgroup_stat_add_safe(cpustat,
582   - MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
  542 + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
583 543 else
584   - __mem_cgroup_stat_add_safe(cpustat,
585   - MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
586   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
587   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
  544 + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
  545 + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT]);
  546 + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS]);
588 547  
589   - put_cpu();
  548 + preempt_enable();
590 549 }
591 550  
592 551 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
... ... @@ -1244,7 +1203,7 @@
1244 1203 }
1245 1204 }
1246 1205 }
1247   - if (!mem_cgroup_local_usage(&victim->stat)) {
  1206 + if (!mem_cgroup_local_usage(victim)) {
1248 1207 /* this cgroup's local usage == 0 */
1249 1208 css_put(&victim->css);
1250 1209 continue;
... ... @@ -1310,9 +1269,6 @@
1310 1269 void mem_cgroup_update_file_mapped(struct page *page, int val)
1311 1270 {
1312 1271 struct mem_cgroup *mem;
1313   - struct mem_cgroup_stat *stat;
1314   - struct mem_cgroup_stat_cpu *cpustat;
1315   - int cpu;
1316 1272 struct page_cgroup *pc;
1317 1273  
1318 1274 pc = lookup_page_cgroup(page);
... ... @@ -1328,13 +1284,10 @@
1328 1284 goto done;
1329 1285  
1330 1286 /*
1331   - * Preemption is already disabled, we don't need get_cpu()
  1287 + * Preemption is already disabled. We can use __this_cpu_xxx
1332 1288 */
1333   - cpu = smp_processor_id();
1334   - stat = &mem->stat;
1335   - cpustat = &stat->cpustat[cpu];
  1289 + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
1336 1290  
1337   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
1338 1291 done:
1339 1292 unlock_page_cgroup(pc);
1340 1293 }
... ... @@ -1761,9 +1714,6 @@
1761 1714 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1762 1715 {
1763 1716 struct page *page;
1764   - int cpu;
1765   - struct mem_cgroup_stat *stat;
1766   - struct mem_cgroup_stat_cpu *cpustat;
1767 1717  
1768 1718 VM_BUG_ON(from == to);
1769 1719 VM_BUG_ON(PageLRU(pc->page));
... ... @@ -1773,18 +1723,11 @@
1773 1723  
1774 1724 page = pc->page;
1775 1725 if (page_mapped(page) && !PageAnon(page)) {
1776   - cpu = smp_processor_id();
1777   - /* Update mapped_file data for mem_cgroup "from" */
1778   - stat = &from->stat;
1779   - cpustat = &stat->cpustat[cpu];
1780   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1781   - -1);
1782   -
1783   - /* Update mapped_file data for mem_cgroup "to" */
1784   - stat = &to->stat;
1785   - cpustat = &stat->cpustat[cpu];
1786   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1787   - 1);
  1726 + /* Update mapped_file data for mem_cgroup */
  1727 + preempt_disable();
  1728 + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  1729 + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  1730 + preempt_enable();
1788 1731 }
1789 1732 mem_cgroup_charge_statistics(from, pc, false);
1790 1733 if (uncharge)
... ... @@ -2885,7 +2828,7 @@
2885 2828 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2886 2829 {
2887 2830 struct mem_cgroup_idx_data *d = data;
2888   - d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
  2831 + d->val += mem_cgroup_read_stat(mem, d->idx);
2889 2832 return 0;
2890 2833 }
2891 2834  
... ... @@ -3134,18 +3077,18 @@
3134 3077 s64 val;
3135 3078  
3136 3079 /* per cpu stat */
3137   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
  3080 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3138 3081 s->stat[MCS_CACHE] += val * PAGE_SIZE;
3139   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
  3082 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3140 3083 s->stat[MCS_RSS] += val * PAGE_SIZE;
3141   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
  3084 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3142 3085 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3143   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
  3086 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
3144 3087 s->stat[MCS_PGPGIN] += val;
3145   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
  3088 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
3146 3089 s->stat[MCS_PGPGOUT] += val;
3147 3090 if (do_swap_account) {
3148   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
  3091 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3149 3092 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3150 3093 }
3151 3094  
... ... @@ -3276,19 +3219,14 @@
3276 3219 static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
3277 3220 {
3278 3221 bool ret = false;
3279   - int cpu;
3280 3222 s64 val;
3281   - struct mem_cgroup_stat_cpu *cpustat;
3282 3223  
3283   - cpu = get_cpu();
3284   - cpustat = &mem->stat.cpustat[cpu];
3285   - val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
  3224 + val = this_cpu_read(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS]);
3286 3225 if (unlikely(val < 0)) {
3287   - __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
  3226 + this_cpu_write(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS],
3288 3227 THRESHOLDS_EVENTS_THRESH);
3289 3228 ret = true;
3290 3229 }
3291   - put_cpu();
3292 3230 return ret;
3293 3231 }
3294 3232  
... ... @@ -3676,17 +3614,12 @@
3676 3614 kfree(mem->info.nodeinfo[node]);
3677 3615 }
3678 3616  
3679   -static int mem_cgroup_size(void)
3680   -{
3681   - int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
3682   - return sizeof(struct mem_cgroup) + cpustat_size;
3683   -}
3684   -
3685 3617 static struct mem_cgroup *mem_cgroup_alloc(void)
3686 3618 {
3687 3619 struct mem_cgroup *mem;
3688   - int size = mem_cgroup_size();
  3620 + int size = sizeof(struct mem_cgroup);
3689 3621  
  3622 + /* Can be very big if MAX_NUMNODES is very big */
3690 3623 if (size < PAGE_SIZE)
3691 3624 mem = kmalloc(size, GFP_KERNEL);
3692 3625 else
... ... @@ -3694,6 +3627,14 @@
3694 3627  
3695 3628 if (mem)
3696 3629 memset(mem, 0, size);
  3630 + mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
  3631 + if (!mem->stat) {
  3632 + if (size < PAGE_SIZE)
  3633 + kfree(mem);
  3634 + else
  3635 + vfree(mem);
  3636 + mem = NULL;
  3637 + }
3697 3638 return mem;
3698 3639 }
3699 3640  
... ... @@ -3718,7 +3659,8 @@
3718 3659 for_each_node_state(node, N_POSSIBLE)
3719 3660 free_mem_cgroup_per_zone_info(mem, node);
3720 3661  
3721   - if (mem_cgroup_size() < PAGE_SIZE)
  3662 + free_percpu(mem->stat);
  3663 + if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3722 3664 kfree(mem);
3723 3665 else
3724 3666 vfree(mem);