Commit 0eea10301708c64a6b793894c156e21ddd15eb64

Authored by Balbir Singh
Committed by Linus Torvalds
1 parent 66e1707bc3

Memory controller: improve user interface

Change the interface to use bytes instead of pages.  Page sizes can vary
across platforms and configurations.  A new strategy routine has been added
to the resource counters infrastructure to format the data as desired.

Suggested by David Rientjes, Andrew Morton and Herbert Poetzl

Tested on a UML setup with the config for memory control enabled.
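
The changelog above describes the new strategy hook only in general terms. For
illustration (not part of this commit), here is a minimal userspace sketch of
the read side of that pattern, with hypothetical names (counter_read,
format_bytes): if a formatting callback is supplied it decides the
presentation, otherwise the raw 64-bit value is printed.

    #include <stdio.h>

    /* Optional per-controller formatter, same shape as the new
     * read_strategy hook added to res_counter_read() below. */
    static int format_bytes(unsigned long long val, char *s)
    {
        return sprintf(s, "%llu Bytes\n", val);
    }

    /* Fallback behaviour of the sketch: use the callback if given,
     * otherwise print the raw 64-bit value. */
    static int counter_read(unsigned long long val, char *s,
                            int (*read_strategy)(unsigned long long, char *))
    {
        if (read_strategy)
            return read_strategy(val, s);
        return sprintf(s, "%llu\n", val);
    }

    int main(void)
    {
        char buf[64];

        counter_read(4194304ULL, buf, format_bytes);  /* "4194304 Bytes" */
        fputs(buf, stdout);
        counter_read(4194304ULL, buf, NULL);          /* "4194304" */
        fputs(buf, stdout);
        return 0;
    }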

[kamezawa.hiroyu@jp.fujitsu.com: possible race fix in res_counter]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 81 additions and 31 deletions

Documentation/controllers/memory.txt
... ... @@ -165,12 +165,31 @@
165 165  
166 166 Since now we're in the 0 cgroup,
167 167 We can alter the memory limit:
168   -# echo -n 6000 > /cgroups/0/memory.limit
  168 +# echo -n 4M > /cgroups/0/memory.limit_in_bytes
169 169  
  170 +NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
  171 +mega or gigabytes.
  172 +
  173 +# cat /cgroups/0/memory.limit_in_bytes
  174 +4194304 Bytes
  175 +
  176 +NOTE: The interface has now changed to display the usage in bytes
  177 +instead of pages
  178 +
170 179 We can check the usage:
171   -# cat /cgroups/0/memory.usage
172   -25
  180 +# cat /cgroups/0/memory.usage_in_bytes
  181 +1216512 Bytes
173 182  
  183 +A successful write to this file does not guarantee a successful set of
  184 +this limit to the value written into the file. This can be due to a
  185 +number of factors, such as rounding up to page boundaries or the total
  186 +availability of memory on the system. The user is required to re-read
  187 +this file after a write to guarantee the value committed by the kernel.
  188 +
  189 +# echo -n 1 > memory.limit_in_bytes
  190 +# cat memory.limit_in_bytes
  191 +4096 Bytes
  192 +
174 193 The memory.failcnt field gives the number of times that the cgroup limit was
175 194 exceeded.
176 195  
... ... @@ -206,8 +225,8 @@
206 225 tasks have migrated away from it. If some pages are still left, after following
207 226 the steps listed in sections 4.1 and 4.2, check the Swap Cache usage in
208 227 /proc/meminfo to see if the Swap Cache usage is showing up in the
209   -cgroups memory.usage counter. A simple test of swapoff -a and swapon -a
210   -should free any pending Swap Cache usage.
  228 +cgroups memory.usage_in_bytes counter. A simple test of swapoff -a and
  229 +swapon -a should free any pending Swap Cache usage.
211 230  
212 231 4.4 Choosing what to account -- Page Cache (unmapped) vs RSS (mapped)?
213 232  
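
For illustration (not part of the commit), a userspace C sketch of the
write-then-re-read pattern the documentation above asks for. The /cgroups/0
path is taken from the example above and assumes the cgroup filesystem is
mounted there; error handling is kept minimal.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "/cgroups/0/memory.limit_in_bytes";
        char buf[64];
        ssize_t n;
        int fd;

        fd = open(path, O_WRONLY);
        if (fd < 0)
            return 1;
        /* Ask for 4M; the k/K/m/M/g/G suffixes are parsed by the kernel. */
        if (write(fd, "4M", 2) < 0)
            perror("write");
        close(fd);

        /* Re-read: the committed value may have been rounded up to a
         * page boundary, so it need not equal what was written. */
        fd = open(path, O_RDONLY);
        if (fd < 0)
            return 1;
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
            buf[n] = '\0';
            printf("committed limit: %s", buf);
        }
        close(fd);
        return 0;
    }
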
include/linux/res_counter.h
... ... @@ -23,15 +23,15 @@
23 23 /*
24 24 * the current resource consumption level
25 25 */
26   - unsigned long usage;
  26 + unsigned long long usage;
27 27 /*
28 28 * the limit that usage cannot exceed
29 29 */
30   - unsigned long limit;
  30 + unsigned long long limit;
31 31 /*
32 32 * the number of unsuccessful attempts to consume the resource
33 33 */
34   - unsigned long failcnt;
  34 + unsigned long long failcnt;
35 35 /*
36 36 * the lock to protect all of the above.
37 37 * the routines below consider this to be IRQ-safe
38 38  
... ... @@ -52,9 +52,11 @@
52 52 */
53 53  
54 54 ssize_t res_counter_read(struct res_counter *counter, int member,
55   - const char __user *buf, size_t nbytes, loff_t *pos);
  55 + const char __user *buf, size_t nbytes, loff_t *pos,
  56 + int (*read_strategy)(unsigned long long val, char *s));
56 57 ssize_t res_counter_write(struct res_counter *counter, int member,
57   - const char __user *buf, size_t nbytes, loff_t *pos);
  58 + const char __user *buf, size_t nbytes, loff_t *pos,
  59 + int (*write_strategy)(char *buf, unsigned long long *val));
58 60  
59 61 /*
60 62 * the field descriptors. one for each member of res_counter
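
For illustration (not part of the commit), a callback with the same shape as
the write_strategy prototype above. parse_bytes() is hypothetical: it uses
strtoull() with a hand-rolled suffix table instead of the kernel's memparse(),
and assumes a 4 KiB page size. The real helper, mem_cgroup_write_strategy(),
appears in the mm/memcontrol.c hunks below.

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096ULL   /* assumption for this sketch */

    /* Parse a value with an optional k/K/m/M/g/G suffix and round it
     * up to a page boundary; return 0 on success, -1 on bad input. */
    static int parse_bytes(char *buf, unsigned long long *val)
    {
        char *end;
        unsigned long long v = strtoull(buf, &end, 10);

        switch (*end) {
        case 'g': case 'G': v <<= 10; /* fall through */
        case 'm': case 'M': v <<= 10; /* fall through */
        case 'k': case 'K': v <<= 10; end++; break;
        case '\0': break;
        default: return -1;
        }
        if (*end != '\0')
            return -1;

        *val = (v + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
        return 0;
    }

    int main(void)
    {
        unsigned long long v;

        if (!parse_bytes("4M", &v))
            printf("%llu\n", v);  /* 4194304 */
        if (!parse_bytes("1", &v))
            printf("%llu\n", v);  /* rounded up to 4096 */
        return 0;
    }
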
kernel/res_counter.c
... ... @@ -16,7 +16,7 @@
16 16 void res_counter_init(struct res_counter *counter)
17 17 {
18 18 spin_lock_init(&counter->lock);
19   - counter->limit = (unsigned long)LONG_MAX;
  19 + counter->limit = (unsigned long long)LLONG_MAX;
20 20 }
21 21  
22 22 int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
... ... @@ -59,8 +59,8 @@
59 59 }
60 60  
61 61  
62   -static inline unsigned long *res_counter_member(struct res_counter *counter,
63   - int member)
  62 +static inline unsigned long long *
  63 +res_counter_member(struct res_counter *counter, int member)
64 64 {
65 65 switch (member) {
66 66 case RES_USAGE:
67 67  
68 68  
69 69  
70 70  
... ... @@ -76,24 +76,30 @@
76 76 }
77 77  
78 78 ssize_t res_counter_read(struct res_counter *counter, int member,
79   - const char __user *userbuf, size_t nbytes, loff_t *pos)
  79 + const char __user *userbuf, size_t nbytes, loff_t *pos,
  80 + int (*read_strategy)(unsigned long long val, char *st_buf))
80 81 {
81   - unsigned long *val;
  82 + unsigned long long *val;
82 83 char buf[64], *s;
83 84  
84 85 s = buf;
85 86 val = res_counter_member(counter, member);
86   - s += sprintf(s, "%lu\n", *val);
  87 + if (read_strategy)
  88 + s += read_strategy(*val, s);
  89 + else
  90 + s += sprintf(s, "%llu\n", *val);
87 91 return simple_read_from_buffer((void __user *)userbuf, nbytes,
88 92 pos, buf, s - buf);
89 93 }
90 94  
91 95 ssize_t res_counter_write(struct res_counter *counter, int member,
92   - const char __user *userbuf, size_t nbytes, loff_t *pos)
  96 + const char __user *userbuf, size_t nbytes, loff_t *pos,
  97 + int (*write_strategy)(char *st_buf, unsigned long long *val))
93 98 {
94 99 int ret;
95 100 char *buf, *end;
96   - unsigned long tmp, *val;
  101 + unsigned long flags;
  102 + unsigned long long tmp, *val;
97 103  
98 104 buf = kmalloc(nbytes + 1, GFP_KERNEL);
99 105 ret = -ENOMEM;
100 106  
101 107  
... ... @@ -106,12 +112,20 @@
106 112 goto out_free;
107 113  
108 114 ret = -EINVAL;
109   - tmp = simple_strtoul(buf, &end, 10);
110   - if (*end != '\0')
111   - goto out_free;
112 115  
  116 + if (write_strategy) {
  117 + if (write_strategy(buf, &tmp)) {
  118 + goto out_free;
  119 + }
  120 + } else {
  121 + tmp = simple_strtoull(buf, &end, 10);
  122 + if (*end != '\0')
  123 + goto out_free;
  124 + }
  125 + spin_lock_irqsave(&counter->lock, flags);
113 126 val = res_counter_member(counter, member);
114 127 *val = tmp;
  128 + spin_unlock_irqrestore(&counter->lock, flags);
115 129 ret = nbytes;
116 130 out_free:
117 131 kfree(buf);
mm/memcontrol.c
... ... @@ -302,7 +302,7 @@
302 302 * If we created the page_cgroup, we should free it on exceeding
303 303 * the cgroup limit.
304 304 */
305   - while (res_counter_charge(&mem->res, 1)) {
  305 + while (res_counter_charge(&mem->res, PAGE_SIZE)) {
306 306 if (try_to_free_mem_cgroup_pages(mem))
307 307 continue;
308 308  
... ... @@ -341,7 +341,7 @@
341 341 kfree(pc);
342 342 pc = race_pc;
343 343 atomic_inc(&pc->ref_cnt);
344   - res_counter_uncharge(&mem->res, 1);
  344 + res_counter_uncharge(&mem->res, PAGE_SIZE);
345 345 css_put(&mem->css);
346 346 goto done;
347 347 }
... ... @@ -384,7 +384,7 @@
384 384 css_put(&mem->css);
385 385 page_assign_page_cgroup(page, NULL);
386 386 unlock_page_cgroup(page);
387   - res_counter_uncharge(&mem->res, 1);
  387 + res_counter_uncharge(&mem->res, PAGE_SIZE);
388 388  
389 389 spin_lock_irqsave(&mem->lru_lock, flags);
390 390 list_del_init(&pc->lru);
391 391  
392 392  
... ... @@ -393,12 +393,26 @@
393 393 }
394 394 }
395 395  
396   -static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
397   - struct file *file, char __user *userbuf, size_t nbytes,
398   - loff_t *ppos)
  396 +int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
399 397 {
  398 + *tmp = memparse(buf, &buf);
  399 + if (*buf != '\0')
  400 + return -EINVAL;
  401 +
  402 + /*
  403 + * Round up the value to the closest page size
  404 + */
  405 + *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
  406 + return 0;
  407 +}
  408 +
  409 +static ssize_t mem_cgroup_read(struct cgroup *cont,
  410 + struct cftype *cft, struct file *file,
  411 + char __user *userbuf, size_t nbytes, loff_t *ppos)
  412 +{
400 413 return res_counter_read(&mem_cgroup_from_cont(cont)->res,
401   - cft->private, userbuf, nbytes, ppos);
  414 + cft->private, userbuf, nbytes, ppos,
  415 + NULL);
402 416 }
403 417  
404 418 static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
405 419  
406 420  
... ... @@ -406,17 +420,18 @@
406 420 size_t nbytes, loff_t *ppos)
407 421 {
408 422 return res_counter_write(&mem_cgroup_from_cont(cont)->res,
409   - cft->private, userbuf, nbytes, ppos);
  423 + cft->private, userbuf, nbytes, ppos,
  424 + mem_cgroup_write_strategy);
410 425 }
411 426  
412 427 static struct cftype mem_cgroup_files[] = {
413 428 {
414   - .name = "usage",
  429 + .name = "usage_in_bytes",
415 430 .private = RES_USAGE,
416 431 .read = mem_cgroup_read,
417 432 },
418 433 {
419   - .name = "limit",
  434 + .name = "limit_in_bytes",
420 435 .private = RES_LIMIT,
421 436 .write = mem_cgroup_write,
422 437 .read = mem_cgroup_read,