Commit 0eea10301708c64a6b793894c156e21ddd15eb64
Committed by
Linus Torvalds
1 parent
66e1707bc3
Exists in
master
and in
7 other branches
Memory controller improve user interface
Change the interface to use bytes instead of pages. Page sizes can vary across platforms and configurations. A new strategy routine has been added to the resource counters infrastructure to format the data as desired. Suggested by David Rientjes, Andrew Morton and Herbert Poetzl. Tested on a UML setup with the config for memory control enabled. [kamezawa.hiroyu@jp.fujitsu.com: possible race fix in res_counter] Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> Signed-off-by: Pavel Emelianov <xemul@openvz.org> Cc: Paul Menage <menage@google.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Kirill Korotaev <dev@sw.ru> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: David Rientjes <rientjes@google.com> Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 4 changed files with 81 additions and 31 deletions Side-by-side Diff
Documentation/controllers/memory.txt
... | ... | @@ -165,12 +165,31 @@ |
165 | 165 | |
166 | 166 | Since now we're in the 0 cgroup, |
167 | 167 | We can alter the memory limit: |
168 | -# echo -n 6000 > /cgroups/0/memory.limit | |
168 | +# echo -n 4M > /cgroups/0/memory.limit_in_bytes | |
169 | 169 | |
170 | +NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, | |
171 | +mega or gigabytes. | |
172 | + | |
173 | +# cat /cgroups/0/memory.limit_in_bytes | |
174 | +4194304 Bytes | |
175 | + | |
176 | +NOTE: The interface has now changed to display the usage in bytes | |
177 | +instead of pages | |
178 | + | |
170 | 179 | We can check the usage: |
171 | -# cat /cgroups/0/memory.usage | |
172 | -25 | |
180 | +# cat /cgroups/0/memory.usage_in_bytes | |
181 | +1216512 Bytes | |
173 | 182 | |
183 | +A successful write to this file does not guarantee a successful set of | |
184 | +this limit to the value written into the file. This can be due to a | |
185 | +number of factors, such as rounding up to page boundaries or the total | |
186 | +availability of memory on the system. The user is required to re-read | |
187 | +this file after a write to guarantee the value committed by the kernel. | |
188 | + | |
189 | +# echo -n 1 > memory.limit_in_bytes | |
190 | +# cat memory.limit_in_bytes | |
191 | +4096 Bytes | |
192 | + | |
174 | 193 | The memory.failcnt field gives the number of times that the cgroup limit was |
175 | 194 | exceeded. |
176 | 195 | |
... | ... | @@ -206,8 +225,8 @@ |
206 | 225 | tasks have migrated away from it. If some pages are still left, after following |
207 | 226 | the steps listed in sections 4.1 and 4.2, check the Swap Cache usage in |
208 | 227 | /proc/meminfo to see if the Swap Cache usage is showing up in the |
209 | -cgroups memory.usage counter. A simple test of swapoff -a and swapon -a | |
210 | -should free any pending Swap Cache usage. | |
228 | +cgroups memory.usage_in_bytes counter. A simple test of swapoff -a and | |
229 | +swapon -a should free any pending Swap Cache usage. | |
211 | 230 | |
212 | 231 | 4.4 Choosing what to account -- Page Cache (unmapped) vs RSS (mapped)? |
213 | 232 |
include/linux/res_counter.h
... | ... | @@ -23,15 +23,15 @@ |
23 | 23 | /* |
24 | 24 | * the current resource consumption level |
25 | 25 | */ |
26 | - unsigned long usage; | |
26 | + unsigned long long usage; | |
27 | 27 | /* |
28 | 28 | * the limit that usage cannot exceed |
29 | 29 | */ |
30 | - unsigned long limit; | |
30 | + unsigned long long limit; | |
31 | 31 | /* |
32 | 32 | * the number of unsuccessful attempts to consume the resource |
33 | 33 | */ |
34 | - unsigned long failcnt; | |
34 | + unsigned long long failcnt; | |
35 | 35 | /* |
36 | 36 | * the lock to protect all of the above. |
37 | 37 | * the routines below consider this to be IRQ-safe |
38 | 38 | |
... | ... | @@ -52,9 +52,11 @@ |
52 | 52 | */ |
53 | 53 | |
54 | 54 | ssize_t res_counter_read(struct res_counter *counter, int member, |
55 | - const char __user *buf, size_t nbytes, loff_t *pos); | |
55 | + const char __user *buf, size_t nbytes, loff_t *pos, | |
56 | + int (*read_strategy)(unsigned long long val, char *s)); | |
56 | 57 | ssize_t res_counter_write(struct res_counter *counter, int member, |
57 | - const char __user *buf, size_t nbytes, loff_t *pos); | |
58 | + const char __user *buf, size_t nbytes, loff_t *pos, | |
59 | + int (*write_strategy)(char *buf, unsigned long long *val)); | |
58 | 60 | |
59 | 61 | /* |
60 | 62 | * the field descriptors. one for each member of res_counter |
kernel/res_counter.c
... | ... | @@ -16,7 +16,7 @@ |
16 | 16 | void res_counter_init(struct res_counter *counter) |
17 | 17 | { |
18 | 18 | spin_lock_init(&counter->lock); |
19 | - counter->limit = (unsigned long)LONG_MAX; | |
19 | + counter->limit = (unsigned long long)LLONG_MAX; | |
20 | 20 | } |
21 | 21 | |
22 | 22 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val) |
... | ... | @@ -59,8 +59,8 @@ |
59 | 59 | } |
60 | 60 | |
61 | 61 | |
62 | -static inline unsigned long *res_counter_member(struct res_counter *counter, | |
63 | - int member) | |
62 | +static inline unsigned long long * | |
63 | +res_counter_member(struct res_counter *counter, int member) | |
64 | 64 | { |
65 | 65 | switch (member) { |
66 | 66 | case RES_USAGE: |
67 | 67 | |
68 | 68 | |
69 | 69 | |
70 | 70 | |
... | ... | @@ -76,24 +76,30 @@ |
76 | 76 | } |
77 | 77 | |
78 | 78 | ssize_t res_counter_read(struct res_counter *counter, int member, |
79 | - const char __user *userbuf, size_t nbytes, loff_t *pos) | |
79 | + const char __user *userbuf, size_t nbytes, loff_t *pos, | |
80 | + int (*read_strategy)(unsigned long long val, char *st_buf)) | |
80 | 81 | { |
81 | - unsigned long *val; | |
82 | + unsigned long long *val; | |
82 | 83 | char buf[64], *s; |
83 | 84 | |
84 | 85 | s = buf; |
85 | 86 | val = res_counter_member(counter, member); |
86 | - s += sprintf(s, "%lu\n", *val); | |
87 | + if (read_strategy) | |
88 | + s += read_strategy(*val, s); | |
89 | + else | |
90 | + s += sprintf(s, "%llu\n", *val); | |
87 | 91 | return simple_read_from_buffer((void __user *)userbuf, nbytes, |
88 | 92 | pos, buf, s - buf); |
89 | 93 | } |
90 | 94 | |
91 | 95 | ssize_t res_counter_write(struct res_counter *counter, int member, |
92 | - const char __user *userbuf, size_t nbytes, loff_t *pos) | |
96 | + const char __user *userbuf, size_t nbytes, loff_t *pos, | |
97 | + int (*write_strategy)(char *st_buf, unsigned long long *val)) | |
93 | 98 | { |
94 | 99 | int ret; |
95 | 100 | char *buf, *end; |
96 | - unsigned long tmp, *val; | |
101 | + unsigned long flags; | |
102 | + unsigned long long tmp, *val; | |
97 | 103 | |
98 | 104 | buf = kmalloc(nbytes + 1, GFP_KERNEL); |
99 | 105 | ret = -ENOMEM; |
100 | 106 | |
101 | 107 | |
... | ... | @@ -106,12 +112,20 @@ |
106 | 112 | goto out_free; |
107 | 113 | |
108 | 114 | ret = -EINVAL; |
109 | - tmp = simple_strtoul(buf, &end, 10); | |
110 | - if (*end != '\0') | |
111 | - goto out_free; | |
112 | 115 | |
116 | + if (write_strategy) { | |
117 | + if (write_strategy(buf, &tmp)) { | |
118 | + goto out_free; | |
119 | + } | |
120 | + } else { | |
121 | + tmp = simple_strtoull(buf, &end, 10); | |
122 | + if (*end != '\0') | |
123 | + goto out_free; | |
124 | + } | |
125 | + spin_lock_irqsave(&counter->lock, flags); | |
113 | 126 | val = res_counter_member(counter, member); |
114 | 127 | *val = tmp; |
128 | + spin_unlock_irqrestore(&counter->lock, flags); | |
115 | 129 | ret = nbytes; |
116 | 130 | out_free: |
117 | 131 | kfree(buf); |
mm/memcontrol.c
... | ... | @@ -302,7 +302,7 @@ |
302 | 302 | * If we created the page_cgroup, we should free it on exceeding |
303 | 303 | * the cgroup limit. |
304 | 304 | */ |
305 | - while (res_counter_charge(&mem->res, 1)) { | |
305 | + while (res_counter_charge(&mem->res, PAGE_SIZE)) { | |
306 | 306 | if (try_to_free_mem_cgroup_pages(mem)) |
307 | 307 | continue; |
308 | 308 | |
... | ... | @@ -341,7 +341,7 @@ |
341 | 341 | kfree(pc); |
342 | 342 | pc = race_pc; |
343 | 343 | atomic_inc(&pc->ref_cnt); |
344 | - res_counter_uncharge(&mem->res, 1); | |
344 | + res_counter_uncharge(&mem->res, PAGE_SIZE); | |
345 | 345 | css_put(&mem->css); |
346 | 346 | goto done; |
347 | 347 | } |
... | ... | @@ -384,7 +384,7 @@ |
384 | 384 | css_put(&mem->css); |
385 | 385 | page_assign_page_cgroup(page, NULL); |
386 | 386 | unlock_page_cgroup(page); |
387 | - res_counter_uncharge(&mem->res, 1); | |
387 | + res_counter_uncharge(&mem->res, PAGE_SIZE); | |
388 | 388 | |
389 | 389 | spin_lock_irqsave(&mem->lru_lock, flags); |
390 | 390 | list_del_init(&pc->lru); |
391 | 391 | |
392 | 392 | |
... | ... | @@ -393,12 +393,26 @@ |
393 | 393 | } |
394 | 394 | } |
395 | 395 | |
396 | -static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |
397 | - struct file *file, char __user *userbuf, size_t nbytes, | |
398 | - loff_t *ppos) | |
396 | +int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) | |
399 | 397 | { |
398 | + *tmp = memparse(buf, &buf); | |
399 | + if (*buf != '\0') | |
400 | + return -EINVAL; | |
401 | + | |
402 | + /* | |
403 | + * Round up the value to the closest page size | |
404 | + */ | |
405 | + *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT; | |
406 | + return 0; | |
407 | +} | |
408 | + | |
409 | +static ssize_t mem_cgroup_read(struct cgroup *cont, | |
410 | + struct cftype *cft, struct file *file, | |
411 | + char __user *userbuf, size_t nbytes, loff_t *ppos) | |
412 | +{ | |
400 | 413 | return res_counter_read(&mem_cgroup_from_cont(cont)->res, |
401 | - cft->private, userbuf, nbytes, ppos); | |
414 | + cft->private, userbuf, nbytes, ppos, | |
415 | + NULL); | |
402 | 416 | } |
403 | 417 | |
404 | 418 | static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, |
405 | 419 | |
406 | 420 | |
... | ... | @@ -406,17 +420,18 @@ |
406 | 420 | size_t nbytes, loff_t *ppos) |
407 | 421 | { |
408 | 422 | return res_counter_write(&mem_cgroup_from_cont(cont)->res, |
409 | - cft->private, userbuf, nbytes, ppos); | |
423 | + cft->private, userbuf, nbytes, ppos, | |
424 | + mem_cgroup_write_strategy); | |
410 | 425 | } |
411 | 426 | |
412 | 427 | static struct cftype mem_cgroup_files[] = { |
413 | 428 | { |
414 | - .name = "usage", | |
429 | + .name = "usage_in_bytes", | |
415 | 430 | .private = RES_USAGE, |
416 | 431 | .read = mem_cgroup_read, |
417 | 432 | }, |
418 | 433 | { |
419 | - .name = "limit", | |
434 | + .name = "limit_in_bytes", | |
420 | 435 | .private = RES_LIMIT, |
421 | 436 | .write = mem_cgroup_write, |
422 | 437 | .read = mem_cgroup_read, |