Commit c62b1a3b31b5e27a6c5c2e91cc5ce05fdb6344d0

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent 6a6135b64f

memcg: use generic percpu instead of private implementation

When the per-cpu counter for memcg was implemented, the dynamic percpu
allocator was not very good.  Now we have a good one and useful macros.
This patch replaces memcg's private percpu counter implementation with the
generic dynamic percpu allocator.
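
For reference, the generic percpu pattern this conversion moves to looks
roughly like the sketch below.  This is only an illustration with made-up
'foo' names and FOO_NSTATS; the real memcg changes are in the diff further
down.

/* Hypothetical sketch of the generic dynamic percpu counter pattern. */
struct foo_stat_cpu {
	s64 count[FOO_NSTATS];		/* each cpu gets its own copy */
};

struct foo {
	struct foo_stat_cpu *stat;	/* percpu pointer, not an embedded array */
};

static int foo_init(struct foo *f)
{
	/* NUMA-aware: each cpu's copy lives on that cpu's local node */
	f->stat = alloc_percpu(struct foo_stat_cpu);
	return f->stat ? 0 : -ENOMEM;
}

static void foo_add(struct foo *f, int idx, int val)
{
	this_cpu_add(f->stat->count[idx], val);	/* update this cpu's copy */
}

static s64 foo_read(struct foo *f, int idx)
{
	int cpu;
	s64 val = 0;

	for_each_possible_cpu(cpu)	/* readers sum every cpu's copy */
		val += per_cpu(f->stat->count[idx], cpu);
	return val;
}

static void foo_free(struct foo *f)
{
	free_percpu(f->stat);
}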

The benefits are
	- We can remove the private implementation.
	- The counters become NUMA-aware. (The current ones are not.)
	- sizeof(struct mem_cgroup) gets smaller, so struct mem_cgroup may
	  fit into a page on small configs.
	- For basic performance numbers, see below.

 [Before]
 # size mm/memcontrol.o
   text    data     bss     dec     hex filename
  24373    2528    4132   31033    7939 mm/memcontrol.o

 [page-fault-throughput test on 8-cpu SMP, in the root cgroup]
 # /root/bin/perf stat -a -e page-faults,cache-misses --repeat 5 ./multi-fault-fork 8

 Performance counter stats for './multi-fault-fork 8' (5 runs):

       45878618  page-faults                ( +-   0.110% )
      602635826  cache-misses               ( +-   0.105% )

   61.005373262  seconds time elapsed   ( +-   0.004% )

 Then cache-misses per page fault = 602635826 / 45878618 = 13.14

 [After]
 # size mm/memcontrol.o
   text    data     bss     dec     hex filename
  23913    2528    4132   30573    776d mm/memcontrol.o
 # /root/bin/perf stat -a -e page-faults,cache-misses --repeat 5 ./multi-fault-fork 8

 Performance counter stats for './multi-fault-fork 8' (5 runs):

       48179400  page-faults                ( +-   0.271% )
      588628407  cache-misses               ( +-   0.136% )

   61.004615021  seconds time elapsed   ( +-   0.004% )

 Then cache-misses per page fault = 588628407 / 48179400 = 12.22

 Text size is reduced.
 The performance improvement is small and will be invisible in real-world
 applications, but the result shows that the patch has some positive effect
 even on (small) SMP.

Here is a test program I used.

 1. fork() a process on each cpu.
 2. do page faults repeatedly in each process.
 3. after 60 seconds, kill all children and exit.

(Step 3 is necessary for getting stable data; this is an improvement over
the previous version.)

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * To avoid contention on the page table lock, the FAULT area is
 * sparse.  If FAULT_LENGTH is too large for your cpus, decrease it.
 */
#define FAULT_LENGTH	(2 * 1024 * 1024)
#define PAGE_SIZE	4096
#define MAXNUM		(128)

void alarm_handler(int sig)
{
}

void *worker(int cpu, int ppid)
{
	void *start, *end;
	char *c;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);

	/* MAP_ANONYMOUS: pass fd == -1 for portability */
	start = mmap(NULL, FAULT_LENGTH, PROT_READ|PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (start == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	end = start + FAULT_LENGTH;

	pause();
	//fprintf(stderr, "run%d", cpu);
	while (1) {
		for (c = (char*)start; (void *)c < end; c += PAGE_SIZE)
			*c = 0;
		madvise(start, FAULT_LENGTH, MADV_DONTNEED);
	}
	return NULL;
}

int main(int argc, char *argv[])
{
	int num, i, ret, pid, status;
	int pids[MAXNUM];

	if (argc < 2)
		return 0;

	setpgid(0, 0);
	signal(SIGALRM, alarm_handler);
	num = atoi(argv[1]);
	if (num > MAXNUM)	/* keep within pids[] bounds */
		num = MAXNUM;
	pid = getpid();

	for (i = 0; i < num; ++i) {
		ret = fork();
		if (!ret) {
			worker(i, pid);
			exit(0);
		}
		pids[i] = ret;
	}
	sleep(1);
	kill(-pid, SIGALRM);
	sleep(60);
	for (i = 0; i < num; i++)
		kill(pids[i], SIGKILL);
	for (i = 0; i < num; i++)
		waitpid(pids[i], &status, 0);
	return 0;
}

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 63 additions and 121 deletions

... ... @@ -89,55 +89,9 @@
89 89  
90 90 struct mem_cgroup_stat_cpu {
91 91 s64 count[MEM_CGROUP_STAT_NSTATS];
92   -} ____cacheline_aligned_in_smp;
93   -
94   -struct mem_cgroup_stat {
95   - struct mem_cgroup_stat_cpu cpustat[0];
96 92 };
97 93  
98   -static inline void
99   -__mem_cgroup_stat_set_safe(struct mem_cgroup_stat_cpu *stat,
100   - enum mem_cgroup_stat_index idx, s64 val)
101   -{
102   - stat->count[idx] = val;
103   -}
104   -
105   -static inline s64
106   -__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
107   - enum mem_cgroup_stat_index idx)
108   -{
109   - return stat->count[idx];
110   -}
111   -
112 94 /*
113   - * For accounting under irq disable, no need for increment preempt count.
114   - */
115   -static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
116   - enum mem_cgroup_stat_index idx, int val)
117   -{
118   - stat->count[idx] += val;
119   -}
120   -
121   -static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
122   - enum mem_cgroup_stat_index idx)
123   -{
124   - int cpu;
125   - s64 ret = 0;
126   - for_each_possible_cpu(cpu)
127   - ret += stat->cpustat[cpu].count[idx];
128   - return ret;
129   -}
130   -
131   -static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
132   -{
133   - s64 ret;
134   -
135   - ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
136   - ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
137   - return ret;
138   -}
139   -
140   -/*
141 95 * per-zone information in memory controller.
142 96 */
143 97 struct mem_cgroup_per_zone {
144 98  
... ... @@ -270,9 +224,9 @@
270 224 unsigned long move_charge_at_immigrate;
271 225  
272 226 /*
273   - * statistics. This must be placed at the end of memcg.
  227 + * percpu counter.
274 228 */
275   - struct mem_cgroup_stat stat;
  229 + struct mem_cgroup_stat_cpu *stat;
276 230 };
277 231  
278 232 /* Stuffs for move charges at task migration. */
... ... @@ -441,19 +395,14 @@
441 395 static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
442 396 {
443 397 bool ret = false;
444   - int cpu;
445 398 s64 val;
446   - struct mem_cgroup_stat_cpu *cpustat;
447 399  
448   - cpu = get_cpu();
449   - cpustat = &mem->stat.cpustat[cpu];
450   - val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_SOFTLIMIT);
  400 + val = this_cpu_read(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT]);
451 401 if (unlikely(val < 0)) {
452   - __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT,
  402 + this_cpu_write(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT],
453 403 SOFTLIMIT_EVENTS_THRESH);
454 404 ret = true;
455 405 }
456   - put_cpu();
457 406 return ret;
458 407 }
459 408  
... ... @@ -549,17 +498,31 @@
549 498 return mz;
550 499 }
551 500  
  501 +static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
  502 + enum mem_cgroup_stat_index idx)
  503 +{
  504 + int cpu;
  505 + s64 val = 0;
  506 +
  507 + for_each_possible_cpu(cpu)
  508 + val += per_cpu(mem->stat->count[idx], cpu);
  509 + return val;
  510 +}
  511 +
  512 +static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
  513 +{
  514 + s64 ret;
  515 +
  516 + ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
  517 + ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
  518 + return ret;
  519 +}
  520 +
552 521 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
553 522 bool charge)
554 523 {
555 524 int val = (charge) ? 1 : -1;
556   - struct mem_cgroup_stat *stat = &mem->stat;
557   - struct mem_cgroup_stat_cpu *cpustat;
558   - int cpu = get_cpu();
559   -
560   - cpustat = &stat->cpustat[cpu];
561   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
562   - put_cpu();
  525 + this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
563 526 }
564 527  
565 528 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
... ... @@ -567,26 +530,22 @@
567 530 bool charge)
568 531 {
569 532 int val = (charge) ? 1 : -1;
570   - struct mem_cgroup_stat *stat = &mem->stat;
571   - struct mem_cgroup_stat_cpu *cpustat;
572   - int cpu = get_cpu();
573 533  
574   - cpustat = &stat->cpustat[cpu];
  534 + preempt_disable();
  535 +
575 536 if (PageCgroupCache(pc))
576   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
  537 + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
577 538 else
578   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
  539 + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
579 540  
580 541 if (charge)
581   - __mem_cgroup_stat_add_safe(cpustat,
582   - MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
  542 + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
583 543 else
584   - __mem_cgroup_stat_add_safe(cpustat,
585   - MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
586   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
587   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
  544 + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
  545 + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_SOFTLIMIT]);
  546 + __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS]);
588 547  
589   - put_cpu();
  548 + preempt_enable();
590 549 }
591 550  
592 551 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
... ... @@ -1244,7 +1203,7 @@
1244 1203 }
1245 1204 }
1246 1205 }
1247   - if (!mem_cgroup_local_usage(&victim->stat)) {
  1206 + if (!mem_cgroup_local_usage(victim)) {
1248 1207 /* this cgroup's local usage == 0 */
1249 1208 css_put(&victim->css);
1250 1209 continue;
... ... @@ -1310,9 +1269,6 @@
1310 1269 void mem_cgroup_update_file_mapped(struct page *page, int val)
1311 1270 {
1312 1271 struct mem_cgroup *mem;
1313   - struct mem_cgroup_stat *stat;
1314   - struct mem_cgroup_stat_cpu *cpustat;
1315   - int cpu;
1316 1272 struct page_cgroup *pc;
1317 1273  
1318 1274 pc = lookup_page_cgroup(page);
... ... @@ -1328,13 +1284,10 @@
1328 1284 goto done;
1329 1285  
1330 1286 /*
1331   - * Preemption is already disabled, we don't need get_cpu()
  1287 + * Preemption is already disabled. We can use __this_cpu_xxx
1332 1288 */
1333   - cpu = smp_processor_id();
1334   - stat = &mem->stat;
1335   - cpustat = &stat->cpustat[cpu];
  1289 + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
1336 1290  
1337   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
1338 1291 done:
1339 1292 unlock_page_cgroup(pc);
1340 1293 }
... ... @@ -1761,9 +1714,6 @@
1761 1714 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1762 1715 {
1763 1716 struct page *page;
1764   - int cpu;
1765   - struct mem_cgroup_stat *stat;
1766   - struct mem_cgroup_stat_cpu *cpustat;
1767 1717  
1768 1718 VM_BUG_ON(from == to);
1769 1719 VM_BUG_ON(PageLRU(pc->page));
... ... @@ -1773,18 +1723,11 @@
1773 1723  
1774 1724 page = pc->page;
1775 1725 if (page_mapped(page) && !PageAnon(page)) {
1776   - cpu = smp_processor_id();
1777   - /* Update mapped_file data for mem_cgroup "from" */
1778   - stat = &from->stat;
1779   - cpustat = &stat->cpustat[cpu];
1780   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1781   - -1);
1782   -
1783   - /* Update mapped_file data for mem_cgroup "to" */
1784   - stat = &to->stat;
1785   - cpustat = &stat->cpustat[cpu];
1786   - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1787   - 1);
  1726 + /* Update mapped_file data for mem_cgroup */
  1727 + preempt_disable();
  1728 + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  1729 + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  1730 + preempt_enable();
1788 1731 }
1789 1732 mem_cgroup_charge_statistics(from, pc, false);
1790 1733 if (uncharge)
... ... @@ -2885,7 +2828,7 @@
2885 2828 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2886 2829 {
2887 2830 struct mem_cgroup_idx_data *d = data;
2888   - d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
  2831 + d->val += mem_cgroup_read_stat(mem, d->idx);
2889 2832 return 0;
2890 2833 }
2891 2834  
... ... @@ -3134,18 +3077,18 @@
3134 3077 s64 val;
3135 3078  
3136 3079 /* per cpu stat */
3137   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
  3080 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3138 3081 s->stat[MCS_CACHE] += val * PAGE_SIZE;
3139   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
  3082 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3140 3083 s->stat[MCS_RSS] += val * PAGE_SIZE;
3141   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
  3084 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3142 3085 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3143   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
  3086 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
3144 3087 s->stat[MCS_PGPGIN] += val;
3145   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
  3088 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
3146 3089 s->stat[MCS_PGPGOUT] += val;
3147 3090 if (do_swap_account) {
3148   - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
  3091 + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3149 3092 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3150 3093 }
3151 3094  
... ... @@ -3276,19 +3219,14 @@
3276 3219 static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
3277 3220 {
3278 3221 bool ret = false;
3279   - int cpu;
3280 3222 s64 val;
3281   - struct mem_cgroup_stat_cpu *cpustat;
3282 3223  
3283   - cpu = get_cpu();
3284   - cpustat = &mem->stat.cpustat[cpu];
3285   - val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
  3224 + val = this_cpu_read(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS]);
3286 3225 if (unlikely(val < 0)) {
3287   - __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
  3226 + this_cpu_write(mem->stat->count[MEM_CGROUP_STAT_THRESHOLDS],
3288 3227 THRESHOLDS_EVENTS_THRESH);
3289 3228 ret = true;
3290 3229 }
3291   - put_cpu();
3292 3230 return ret;
3293 3231 }
3294 3232  
... ... @@ -3676,17 +3614,12 @@
3676 3614 kfree(mem->info.nodeinfo[node]);
3677 3615 }
3678 3616  
3679   -static int mem_cgroup_size(void)
3680   -{
3681   - int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
3682   - return sizeof(struct mem_cgroup) + cpustat_size;
3683   -}
3684   -
3685 3617 static struct mem_cgroup *mem_cgroup_alloc(void)
3686 3618 {
3687 3619 struct mem_cgroup *mem;
3688   - int size = mem_cgroup_size();
  3620 + int size = sizeof(struct mem_cgroup);
3689 3621  
  3622 + /* Can be very big if MAX_NUMNODES is very big */
3690 3623 if (size < PAGE_SIZE)
3691 3624 mem = kmalloc(size, GFP_KERNEL);
3692 3625 else
... ... @@ -3694,6 +3627,14 @@
3694 3627  
3695 3628 if (mem)
3696 3629 memset(mem, 0, size);
  3630 + mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
  3631 + if (!mem->stat) {
  3632 + if (size < PAGE_SIZE)
  3633 + kfree(mem);
  3634 + else
  3635 + vfree(mem);
  3636 + mem = NULL;
  3637 + }
3697 3638 return mem;
3698 3639 }
3699 3640  
... ... @@ -3718,7 +3659,8 @@
3718 3659 for_each_node_state(node, N_POSSIBLE)
3719 3660 free_mem_cgroup_per_zone_info(mem, node);
3720 3661  
3721   - if (mem_cgroup_size() < PAGE_SIZE)
  3662 + free_percpu(mem->stat);
  3663 + if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3722 3664 kfree(mem);
3723 3665 else
3724 3666 vfree(mem);