Commit 2e72b6347c9459e6cff5634ddc815485bae6985f
Committed by
Linus Torvalds
1 parent
378ce724bc
Exists in
master
and in
7 other branches
memcg: implement memory thresholds
It allows registering multiple memory and memsw thresholds and getting notifications when they are crossed. To register a threshold, an application needs to: - create an eventfd; - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to cgroup.event_control. Application will be notified through eventfd when memory usage crosses threshold in any direction. It's applicable for root and non-root cgroup. It uses stats to track memory usage, similar to soft limits. It checks if we need to send event to userspace on every 100 page in/out. I guess it's good compromise between performance and accuracy of thresholds. [akpm@linux-foundation.org: coding-style fixes] [nishimura@mxp.nes.nec.co.jp: fix documentation merge issue] Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Pavel Emelyanov <xemul@openvz.org> Cc: Dan Malek <dan@embeddedalley.com> Cc: Vladislav Buzov <vbuzov@embeddedalley.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Alexander Shishkin <virtuoso@slind.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 2 changed files with 327 additions and 1 deletions Side-by-side Diff
Documentation/cgroups/memory.txt
... | ... | @@ -468,7 +468,24 @@ |
468 | 468 | - All of moving charge operations are done under cgroup_mutex. It's not good |
469 | 469 | behavior to hold the mutex too long, so we may need some trick. |
470 | 470 | |
471 | -9. TODO | |
471 | +9. Memory thresholds | |
472 | + | |
473 | +Memory controller implements memory thresholds using cgroups notification | 
474 | +API (see cgroups.txt). It allows registering multiple memory and memsw | 
475 | +thresholds and getting notifications when a threshold is crossed. | 
476 | + | |
477 | +To register a threshold, an application needs to: | 
478 | + - create an eventfd using eventfd(2); | |
479 | + - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; | |
480 | + - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to | |
481 | + cgroup.event_control. | |
482 | + | |
483 | +Application will be notified through eventfd when memory usage crosses | |
484 | +threshold in any direction. | |
485 | + | |
486 | +It's applicable for root and non-root cgroup. | |
487 | + | |
488 | +10. TODO | |
472 | 489 | |
473 | 490 | 1. Add support for accounting huge pages (as a separate controller) |
474 | 491 | 2. Make per-cgroup scanner reclaim not-shared pages first |
mm/memcontrol.c
... | ... | @@ -6,6 +6,10 @@ |
6 | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | 8 | * |
9 | + * Memory thresholds | |
10 | + * Copyright (C) 2009 Nokia Corporation | |
11 | + * Author: Kirill A. Shutemov | |
12 | + * | |
9 | 13 | * This program is free software; you can redistribute it and/or modify |
10 | 14 | * it under the terms of the GNU General Public License as published by |
11 | 15 | * the Free Software Foundation; either version 2 of the License, or |
... | ... | @@ -35,6 +39,8 @@ |
35 | 39 | #include <linux/swap.h> |
36 | 40 | #include <linux/swapops.h> |
37 | 41 | #include <linux/spinlock.h> |
42 | +#include <linux/eventfd.h> | |
43 | +#include <linux/sort.h> | |
38 | 44 | #include <linux/fs.h> |
39 | 45 | #include <linux/seq_file.h> |
40 | 46 | #include <linux/vmalloc.h> |
... | ... | @@ -58,6 +64,7 @@ |
58 | 64 | #endif |
59 | 65 | |
60 | 66 | #define SOFTLIMIT_EVENTS_THRESH (1000) |
67 | +#define THRESHOLDS_EVENTS_THRESH (100) | |
61 | 68 | |
62 | 69 | /* |
63 | 70 | * Statistics for memory cgroup. |
... | ... | @@ -74,6 +81,8 @@ |
74 | 81 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
75 | 82 | MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out. |
76 | 83 | used by soft limit implementation */ |
84 | + MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out. | |
85 | + used by threshold implementation */ | |
77 | 86 | |
78 | 87 | MEM_CGROUP_STAT_NSTATS, |
79 | 88 | }; |
... | ... | @@ -177,6 +186,23 @@ |
177 | 186 | |
178 | 187 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
179 | 188 | |
189 | +struct mem_cgroup_threshold { | |
190 | + struct eventfd_ctx *eventfd; | |
191 | + u64 threshold; | |
192 | +}; | |
193 | + | |
194 | +struct mem_cgroup_threshold_ary { | |
195 | + /* An array index points to threshold just below usage. */ | |
196 | + atomic_t current_threshold; | |
197 | + /* Size of entries[] */ | |
198 | + unsigned int size; | |
199 | + /* Array of thresholds */ | |
200 | + struct mem_cgroup_threshold entries[0]; | |
201 | +}; | |
202 | + | |
203 | +static bool mem_cgroup_threshold_check(struct mem_cgroup *mem); | |
204 | +static void mem_cgroup_threshold(struct mem_cgroup *mem); | |
205 | + | |
180 | 206 | /* |
181 | 207 | * The memory controller data structure. The memory controller controls both |
182 | 208 | * page cache and RSS per cgroup. We would eventually like to provide |
... | ... | @@ -228,6 +254,15 @@ |
228 | 254 | /* set when res.limit == memsw.limit */ |
229 | 255 | bool memsw_is_minimum; |
230 | 256 | |
257 | + /* protect arrays of thresholds */ | |
258 | + struct mutex thresholds_lock; | |
259 | + | |
260 | + /* thresholds for memory usage. RCU-protected */ | |
261 | + struct mem_cgroup_threshold_ary *thresholds; | |
262 | + | |
263 | + /* thresholds for mem+swap usage. RCU-protected */ | |
264 | + struct mem_cgroup_threshold_ary *memsw_thresholds; | |
265 | + | |
231 | 266 | /* |
232 | 267 | * Should we move charges of a task when a task is moved into this |
233 | 268 | * mem_cgroup ? And what type of charges should we move ? |
... | ... | @@ -549,6 +584,8 @@ |
549 | 584 | __mem_cgroup_stat_add_safe(cpustat, |
550 | 585 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
551 | 586 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1); |
587 | + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1); | |
588 | + | |
552 | 589 | put_cpu(); |
553 | 590 | } |
554 | 591 | |
... | ... | @@ -1576,6 +1613,8 @@ |
1576 | 1613 | if (page && mem_cgroup_soft_limit_check(mem)) |
1577 | 1614 | mem_cgroup_update_tree(mem, page); |
1578 | 1615 | done: |
1616 | + if (mem_cgroup_threshold_check(mem)) | |
1617 | + mem_cgroup_threshold(mem); | |
1579 | 1618 | return 0; |
1580 | 1619 | nomem: |
1581 | 1620 | css_put(&mem->css); |
... | ... | @@ -2148,6 +2187,8 @@ |
2148 | 2187 | |
2149 | 2188 | if (mem_cgroup_soft_limit_check(mem)) |
2150 | 2189 | mem_cgroup_update_tree(mem, page); |
2190 | + if (mem_cgroup_threshold_check(mem)) | |
2191 | + mem_cgroup_threshold(mem); | |
2151 | 2192 | /* at swapout, this memcg will be accessed to record to swap */ |
2152 | 2193 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2153 | 2194 | css_put(&mem->css); |
2154 | 2195 | |
2155 | 2196 | |
... | ... | @@ -3232,12 +3273,277 @@ |
3232 | 3273 | return 0; |
3233 | 3274 | } |
3234 | 3275 | |
3276 | +static bool mem_cgroup_threshold_check(struct mem_cgroup *mem) | |
3277 | +{ | |
3278 | + bool ret = false; | |
3279 | + int cpu; | |
3280 | + s64 val; | |
3281 | + struct mem_cgroup_stat_cpu *cpustat; | |
3235 | 3282 | |
3283 | + cpu = get_cpu(); | |
3284 | + cpustat = &mem->stat.cpustat[cpu]; | |
3285 | + val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS); | |
3286 | + if (unlikely(val < 0)) { | |
3287 | + __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, | |
3288 | + THRESHOLDS_EVENTS_THRESH); | |
3289 | + ret = true; | |
3290 | + } | |
3291 | + put_cpu(); | |
3292 | + return ret; | |
3293 | +} | |
3294 | + | |
3295 | +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |
3296 | +{ | |
3297 | + struct mem_cgroup_threshold_ary *t; | |
3298 | + u64 usage; | |
3299 | + int i; | |
3300 | + | |
3301 | + rcu_read_lock(); | |
3302 | + if (!swap) | |
3303 | + t = rcu_dereference(memcg->thresholds); | |
3304 | + else | |
3305 | + t = rcu_dereference(memcg->memsw_thresholds); | |
3306 | + | |
3307 | + if (!t) | |
3308 | + goto unlock; | |
3309 | + | |
3310 | + usage = mem_cgroup_usage(memcg, swap); | |
3311 | + | |
3312 | + /* | |
3313 | + * current_threshold points to threshold just below usage. | |
3314 | + * If it's not true, a threshold was crossed after last | |
3315 | + * call of __mem_cgroup_threshold(). | |
3316 | + */ | |
3317 | + i = atomic_read(&t->current_threshold); | |
3318 | + | |
3319 | + /* | |
3320 | + * Iterate backward over array of thresholds starting from | |
3321 | + * current_threshold and check if a threshold is crossed. | |
3322 | + * If none of thresholds below usage is crossed, we read | |
3323 | + * only one element of the array here. | |
3324 | + */ | |
3325 | + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | |
3326 | + eventfd_signal(t->entries[i].eventfd, 1); | |
3327 | + | |
3328 | + /* i = current_threshold + 1 */ | |
3329 | + i++; | |
3330 | + | |
3331 | + /* | |
3332 | + * Iterate forward over array of thresholds starting from | |
3333 | + * current_threshold+1 and check if a threshold is crossed. | |
3334 | + * If none of thresholds above usage is crossed, we read | |
3335 | + * only one element of the array here. | |
3336 | + */ | |
3337 | + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | |
3338 | + eventfd_signal(t->entries[i].eventfd, 1); | |
3339 | + | |
3340 | + /* Update current_threshold */ | |
3341 | + atomic_set(&t->current_threshold, i - 1); | |
3342 | +unlock: | |
3343 | + rcu_read_unlock(); | |
3344 | +} | |
3345 | + | |
3346 | +static void mem_cgroup_threshold(struct mem_cgroup *memcg) | |
3347 | +{ | |
3348 | + __mem_cgroup_threshold(memcg, false); | |
3349 | + if (do_swap_account) | |
3350 | + __mem_cgroup_threshold(memcg, true); | |
3351 | +} | |
3352 | + | |
3353 | +static int compare_thresholds(const void *a, const void *b) | |
3354 | +{ | |
3355 | + const struct mem_cgroup_threshold *_a = a; | |
3356 | + const struct mem_cgroup_threshold *_b = b; | |
3357 | + | |
3358 | + return _a->threshold - _b->threshold; | |
3359 | +} | |
3360 | + | |
3361 | +static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | |
3362 | + struct eventfd_ctx *eventfd, const char *args) | |
3363 | +{ | |
3364 | + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | |
3365 | + struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | |
3366 | + int type = MEMFILE_TYPE(cft->private); | |
3367 | + u64 threshold, usage; | |
3368 | + int size; | |
3369 | + int i, ret; | |
3370 | + | |
3371 | + ret = res_counter_memparse_write_strategy(args, &threshold); | |
3372 | + if (ret) | |
3373 | + return ret; | |
3374 | + | |
3375 | + mutex_lock(&memcg->thresholds_lock); | |
3376 | + if (type == _MEM) | |
3377 | + thresholds = memcg->thresholds; | |
3378 | + else if (type == _MEMSWAP) | |
3379 | + thresholds = memcg->memsw_thresholds; | |
3380 | + else | |
3381 | + BUG(); | |
3382 | + | |
3383 | + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | |
3384 | + | |
3385 | + /* Check if a threshold crossed before adding a new one */ | |
3386 | + if (thresholds) | |
3387 | + __mem_cgroup_threshold(memcg, type == _MEMSWAP); | |
3388 | + | |
3389 | + if (thresholds) | |
3390 | + size = thresholds->size + 1; | |
3391 | + else | |
3392 | + size = 1; | |
3393 | + | |
3394 | + /* Allocate memory for new array of thresholds */ | |
3395 | + thresholds_new = kmalloc(sizeof(*thresholds_new) + | |
3396 | + size * sizeof(struct mem_cgroup_threshold), | |
3397 | + GFP_KERNEL); | |
3398 | + if (!thresholds_new) { | |
3399 | + ret = -ENOMEM; | |
3400 | + goto unlock; | |
3401 | + } | |
3402 | + thresholds_new->size = size; | |
3403 | + | |
3404 | + /* Copy thresholds (if any) to new array */ | |
3405 | + if (thresholds) | |
3406 | + memcpy(thresholds_new->entries, thresholds->entries, | |
3407 | + thresholds->size * | |
3408 | + sizeof(struct mem_cgroup_threshold)); | |
3409 | + /* Add new threshold */ | |
3410 | + thresholds_new->entries[size - 1].eventfd = eventfd; | |
3411 | + thresholds_new->entries[size - 1].threshold = threshold; | |
3412 | + | |
3413 | + /* Sort thresholds. Registering of new threshold isn't time-critical */ | |
3414 | + sort(thresholds_new->entries, size, | |
3415 | + sizeof(struct mem_cgroup_threshold), | |
3416 | + compare_thresholds, NULL); | |
3417 | + | |
3418 | + /* Find current threshold */ | |
3419 | + atomic_set(&thresholds_new->current_threshold, -1); | |
3420 | + for (i = 0; i < size; i++) { | |
3421 | + if (thresholds_new->entries[i].threshold < usage) { | |
3422 | + /* | |
3423 | + * thresholds_new->current_threshold will not be used | |
3424 | + * until rcu_assign_pointer(), so it's safe to increment | |
3425 | + * it here. | |
3426 | + */ | |
3427 | + atomic_inc(&thresholds_new->current_threshold); | |
3428 | + } | |
3429 | + } | |
3430 | + | |
3431 | + /* | |
3432 | + * We need to increment refcnt to be sure that all thresholds | |
3433 | + * will be unregistered before calling __mem_cgroup_free() | |
3434 | + */ | |
3435 | + mem_cgroup_get(memcg); | |
3436 | + | |
3437 | + if (type == _MEM) | |
3438 | + rcu_assign_pointer(memcg->thresholds, thresholds_new); | |
3439 | + else | |
3440 | + rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | |
3441 | + | |
3442 | + /* To be sure that nobody uses thresholds before freeing it */ | |
3443 | + synchronize_rcu(); | |
3444 | + | |
3445 | + kfree(thresholds); | |
3446 | +unlock: | |
3447 | + mutex_unlock(&memcg->thresholds_lock); | |
3448 | + | |
3449 | + return ret; | |
3450 | +} | |
3451 | + | |
3452 | +static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | |
3453 | + struct eventfd_ctx *eventfd) | |
3454 | +{ | |
3455 | + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | |
3456 | + struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | |
3457 | + int type = MEMFILE_TYPE(cft->private); | |
3458 | + u64 usage; | |
3459 | + int size = 0; | |
3460 | + int i, j, ret; | |
3461 | + | |
3462 | + mutex_lock(&memcg->thresholds_lock); | |
3463 | + if (type == _MEM) | |
3464 | + thresholds = memcg->thresholds; | |
3465 | + else if (type == _MEMSWAP) | |
3466 | + thresholds = memcg->memsw_thresholds; | |
3467 | + else | |
3468 | + BUG(); | |
3469 | + | |
3470 | + /* | |
3471 | + * Something went wrong if we trying to unregister a threshold | |
3472 | + * if we don't have thresholds | |
3473 | + */ | |
3474 | + BUG_ON(!thresholds); | |
3475 | + | |
3476 | + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | |
3477 | + | |
3478 | + /* Check if a threshold crossed before removing */ | |
3479 | + __mem_cgroup_threshold(memcg, type == _MEMSWAP); | |
3480 | + | |
3481 | + /* Calculate new number of threshold */ | |
3482 | + for (i = 0; i < thresholds->size; i++) { | |
3483 | + if (thresholds->entries[i].eventfd != eventfd) | |
3484 | + size++; | |
3485 | + } | |
3486 | + | |
3487 | + /* Set thresholds array to NULL if we don't have thresholds */ | |
3488 | + if (!size) { | |
3489 | + thresholds_new = NULL; | |
3490 | + goto assign; | |
3491 | + } | |
3492 | + | |
3493 | + /* Allocate memory for new array of thresholds */ | |
3494 | + thresholds_new = kmalloc(sizeof(*thresholds_new) + | |
3495 | + size * sizeof(struct mem_cgroup_threshold), | |
3496 | + GFP_KERNEL); | |
3497 | + if (!thresholds_new) { | |
3498 | + ret = -ENOMEM; | |
3499 | + goto unlock; | |
3500 | + } | |
3501 | + thresholds_new->size = size; | |
3502 | + | |
3503 | + /* Copy thresholds and find current threshold */ | |
3504 | + atomic_set(&thresholds_new->current_threshold, -1); | |
3505 | + for (i = 0, j = 0; i < thresholds->size; i++) { | |
3506 | + if (thresholds->entries[i].eventfd == eventfd) | |
3507 | + continue; | |
3508 | + | |
3509 | + thresholds_new->entries[j] = thresholds->entries[i]; | |
3510 | + if (thresholds_new->entries[j].threshold < usage) { | |
3511 | + /* | |
3512 | + * thresholds_new->current_threshold will not be used | |
3513 | + * until rcu_assign_pointer(), so it's safe to increment | |
3514 | + * it here. | |
3515 | + */ | |
3516 | + atomic_inc(&thresholds_new->current_threshold); | |
3517 | + } | |
3518 | + j++; | |
3519 | + } | |
3520 | + | |
3521 | +assign: | |
3522 | + if (type == _MEM) | |
3523 | + rcu_assign_pointer(memcg->thresholds, thresholds_new); | |
3524 | + else | |
3525 | + rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | |
3526 | + | |
3527 | + /* To be sure that nobody uses thresholds before freeing it */ | |
3528 | + synchronize_rcu(); | |
3529 | + | |
3530 | + for (i = 0; i < thresholds->size - size; i++) | |
3531 | + mem_cgroup_put(memcg); | |
3532 | + | |
3533 | + kfree(thresholds); | |
3534 | +unlock: | |
3535 | + mutex_unlock(&memcg->thresholds_lock); | |
3536 | + | |
3537 | + return ret; | |
3538 | +} | |
3539 | + | |
3236 | 3540 | static struct cftype mem_cgroup_files[] = { |
3237 | 3541 | { |
3238 | 3542 | .name = "usage_in_bytes", |
3239 | 3543 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
3240 | 3544 | .read_u64 = mem_cgroup_read, |
3545 | + .register_event = mem_cgroup_register_event, | |
3546 | + .unregister_event = mem_cgroup_unregister_event, | |
3241 | 3547 | }, |
3242 | 3548 | { |
3243 | 3549 | .name = "max_usage_in_bytes", |
... | ... | @@ -3294,6 +3600,8 @@ |
3294 | 3600 | .name = "memsw.usage_in_bytes", |
3295 | 3601 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
3296 | 3602 | .read_u64 = mem_cgroup_read, |
3603 | + .register_event = mem_cgroup_register_event, | |
3604 | + .unregister_event = mem_cgroup_unregister_event, | |
3297 | 3605 | }, |
3298 | 3606 | { |
3299 | 3607 | .name = "memsw.max_usage_in_bytes", |
... | ... | @@ -3538,6 +3846,7 @@ |
3538 | 3846 | mem->swappiness = get_swappiness(parent); |
3539 | 3847 | atomic_set(&mem->refcnt, 1); |
3540 | 3848 | mem->move_charge_at_immigrate = 0; |
3849 | + mutex_init(&mem->thresholds_lock); | |
3541 | 3850 | return &mem->css; |
3542 | 3851 | free_out: |
3543 | 3852 | __mem_cgroup_free(mem); |