Commit 2e72b6347c9459e6cff5634ddc815485bae6985f

Authored by Kirill A. Shutemov
Committed by Linus Torvalds
1 parent 378ce724bc

memcg: implement memory thresholds

It allows registering multiple memory and memsw thresholds and getting
notifications when they are crossed.

To register a threshold, an application needs to:
- create an eventfd;
- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
- write a string like "<event_fd> <memory.usage_in_bytes> <threshold>" to
  cgroup.event_control.

The application will be notified through the eventfd when memory usage
crosses the threshold in either direction.

It's applicable to both root and non-root cgroups.
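
For illustration, a minimal user-space sketch of the registration sequence
above (the /cgroup/A mount point, the 64M threshold value, and the error
handling are assumptions for the example, not part of the patch):

/*
 * Minimal sketch, assuming the memory cgroup hierarchy is mounted at
 * /cgroup and the target group is /cgroup/A.
 */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
	char buf[64];
	uint64_t hits;

	/* 1. Create an eventfd to receive notifications on. */
	int efd = eventfd(0, 0);

	/* 2. Open the usage file the threshold applies to. */
	int ufd = open("/cgroup/A/memory.usage_in_bytes", O_RDONLY);

	/* 3. Open cgroup.event_control to register the threshold. */
	int cfd = open("/cgroup/A/cgroup.event_control", O_WRONLY);

	if (efd < 0 || ufd < 0 || cfd < 0) {
		perror("setup");
		return 1;
	}

	/* Register: "<event_fd> <usage_fd> <threshold in bytes>". */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd, 64ULL << 20);
	if (write(cfd, buf, strlen(buf)) < 0) {
		perror("write cgroup.event_control");
		return 1;
	}

	/* 4. Block until usage crosses the threshold in either direction. */
	if (read(efd, &hits, sizeof(hits)) == sizeof(hits))
		printf("threshold crossed %llu time(s)\n",
		       (unsigned long long)hits);
	return 0;
}

The second field written to cgroup.event_control is the file descriptor of
the opened usage file, which tells the kernel whether the threshold applies
to memory or mem+swap usage.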

It uses stats to track memory usage, similar to soft limits. It checks
whether we need to send an event to userspace on every 100th page in/out. I
guess that's a good compromise between performance and accuracy of the
thresholds.

[akpm@linux-foundation.org: coding-style fixes]
[nishimura@mxp.nes.nec.co.jp: fix documentation merge issue]
Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Dan Malek <dan@embeddedalley.com>
Cc: Vladislav Buzov <vbuzov@embeddedalley.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Alexander Shishkin <virtuoso@slind.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 327 additions and 1 deletion

Documentation/cgroups/memory.txt
... ... @@ -468,7 +468,24 @@
468 468 - All of moving charge operations are done under cgroup_mutex. It's not good
469 469 behavior to hold the mutex too long, so we may need some trick.
470 470  
471   -9. TODO
  471 +9. Memory thresholds
  472 +
  473 +Memory controller implements memory thresholds using the cgroups
  474 +notification API (see cgroups.txt). It allows registering multiple memory
  475 +and memsw thresholds and getting notifications when they are crossed.
  476 +
  477 +To register a threshold, an application needs to:
  478 + - create an eventfd using eventfd(2);
  479 + - open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
  480 + - write a string like "<event_fd> <memory.usage_in_bytes> <threshold>" to
  481 + cgroup.event_control.
  482 +
  483 +The application will be notified through the eventfd when memory usage
  484 +crosses the threshold in either direction.
  485 +
  486 +It's applicable to both root and non-root cgroups.
  487 +
  488 +10. TODO
472 489  
473 490 1. Add support for accounting huge pages (as a separate controller)
474 491 2. Make per-cgroup scanner reclaim not-shared pages first
mm/memcontrol.c
... ... @@ -6,6 +6,10 @@
6 6 * Copyright 2007 OpenVZ SWsoft Inc
7 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 8 *
  9 + * Memory thresholds
  10 + * Copyright (C) 2009 Nokia Corporation
  11 + * Author: Kirill A. Shutemov
  12 + *
9 13 * This program is free software; you can redistribute it and/or modify
10 14 * it under the terms of the GNU General Public License as published by
11 15 * the Free Software Foundation; either version 2 of the License, or
... ... @@ -35,6 +39,8 @@
35 39 #include <linux/swap.h>
36 40 #include <linux/swapops.h>
37 41 #include <linux/spinlock.h>
  42 +#include <linux/eventfd.h>
  43 +#include <linux/sort.h>
38 44 #include <linux/fs.h>
39 45 #include <linux/seq_file.h>
40 46 #include <linux/vmalloc.h>
... ... @@ -58,6 +64,7 @@
58 64 #endif
59 65  
60 66 #define SOFTLIMIT_EVENTS_THRESH (1000)
  67 +#define THRESHOLDS_EVENTS_THRESH (100)
61 68  
62 69 /*
63 70 * Statistics for memory cgroup.
... ... @@ -74,6 +81,8 @@
74 81 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
75 82 MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
76 83 used by soft limit implementation */
  84 + MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
  85 + used by threshold implementation */
77 86  
78 87 MEM_CGROUP_STAT_NSTATS,
79 88 };
... ... @@ -177,6 +186,23 @@
177 186  
178 187 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
179 188  
  189 +struct mem_cgroup_threshold {
  190 + struct eventfd_ctx *eventfd;
  191 + u64 threshold;
  192 +};
  193 +
  194 +struct mem_cgroup_threshold_ary {
  195 + /* An array index points to threshold just below usage. */
  196 + atomic_t current_threshold;
  197 + /* Size of entries[] */
  198 + unsigned int size;
  199 + /* Array of thresholds */
  200 + struct mem_cgroup_threshold entries[0];
  201 +};
  202 +
  203 +static bool mem_cgroup_threshold_check(struct mem_cgroup *mem);
  204 +static void mem_cgroup_threshold(struct mem_cgroup *mem);
  205 +
180 206 /*
181 207 * The memory controller data structure. The memory controller controls both
182 208 * page cache and RSS per cgroup. We would eventually like to provide
... ... @@ -228,6 +254,15 @@
228 254 /* set when res.limit == memsw.limit */
229 255 bool memsw_is_minimum;
230 256  
  257 + /* protect arrays of thresholds */
  258 + struct mutex thresholds_lock;
  259 +
  260 + /* thresholds for memory usage. RCU-protected */
  261 + struct mem_cgroup_threshold_ary *thresholds;
  262 +
  263 + /* thresholds for mem+swap usage. RCU-protected */
  264 + struct mem_cgroup_threshold_ary *memsw_thresholds;
  265 +
231 266 /*
232 267 * Should we move charges of a task when a task is moved into this
233 268 * mem_cgroup ? And what type of charges should we move ?
... ... @@ -549,6 +584,8 @@
549 584 __mem_cgroup_stat_add_safe(cpustat,
550 585 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
551 586 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
  587 + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
  588 +
552 589 put_cpu();
553 590 }
554 591  
... ... @@ -1576,6 +1613,8 @@
1576 1613 if (page && mem_cgroup_soft_limit_check(mem))
1577 1614 mem_cgroup_update_tree(mem, page);
1578 1615 done:
  1616 + if (mem_cgroup_threshold_check(mem))
  1617 + mem_cgroup_threshold(mem);
1579 1618 return 0;
1580 1619 nomem:
1581 1620 css_put(&mem->css);
... ... @@ -2148,6 +2187,8 @@
2148 2187  
2149 2188 if (mem_cgroup_soft_limit_check(mem))
2150 2189 mem_cgroup_update_tree(mem, page);
  2190 + if (mem_cgroup_threshold_check(mem))
  2191 + mem_cgroup_threshold(mem);
2151 2192 /* at swapout, this memcg will be accessed to record to swap */
2152 2193 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2153 2194 css_put(&mem->css);
2154 2195  
2155 2196  
... ... @@ -3232,12 +3273,277 @@
3232 3273 return 0;
3233 3274 }
3234 3275  
  3276 +static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
  3277 +{
  3278 + bool ret = false;
  3279 + int cpu;
  3280 + s64 val;
  3281 + struct mem_cgroup_stat_cpu *cpustat;
3235 3282  
  3283 + cpu = get_cpu();
  3284 + cpustat = &mem->stat.cpustat[cpu];
  3285 + val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
  3286 + if (unlikely(val < 0)) {
  3287 + __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
  3288 + THRESHOLDS_EVENTS_THRESH);
  3289 + ret = true;
  3290 + }
  3291 + put_cpu();
  3292 + return ret;
  3293 +}
  3294 +
  3295 +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
  3296 +{
  3297 + struct mem_cgroup_threshold_ary *t;
  3298 + u64 usage;
  3299 + int i;
  3300 +
  3301 + rcu_read_lock();
  3302 + if (!swap)
  3303 + t = rcu_dereference(memcg->thresholds);
  3304 + else
  3305 + t = rcu_dereference(memcg->memsw_thresholds);
  3306 +
  3307 + if (!t)
  3308 + goto unlock;
  3309 +
  3310 + usage = mem_cgroup_usage(memcg, swap);
  3311 +
  3312 + /*
  3313 + * current_threshold points to threshold just below usage.
  3314 + * If it's not true, a threshold was crossed after last
  3315 + * call of __mem_cgroup_threshold().
  3316 + */
  3317 + i = atomic_read(&t->current_threshold);
  3318 +
  3319 + /*
  3320 + * Iterate backward over array of thresholds starting from
  3321 + * current_threshold and check if a threshold is crossed.
  3322 + * If none of thresholds below usage is crossed, we read
  3323 + * only one element of the array here.
  3324 + */
  3325 + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
  3326 + eventfd_signal(t->entries[i].eventfd, 1);
  3327 +
  3328 + /* i = current_threshold + 1 */
  3329 + i++;
  3330 +
  3331 + /*
  3332 + * Iterate forward over array of thresholds starting from
  3333 + * current_threshold+1 and check if a threshold is crossed.
  3334 + * If none of thresholds above usage is crossed, we read
  3335 + * only one element of the array here.
  3336 + */
  3337 + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
  3338 + eventfd_signal(t->entries[i].eventfd, 1);
  3339 +
  3340 + /* Update current_threshold */
  3341 + atomic_set(&t->current_threshold, i - 1);
  3342 +unlock:
  3343 + rcu_read_unlock();
  3344 +}
  3345 +
  3346 +static void mem_cgroup_threshold(struct mem_cgroup *memcg)
  3347 +{
  3348 + __mem_cgroup_threshold(memcg, false);
  3349 + if (do_swap_account)
  3350 + __mem_cgroup_threshold(memcg, true);
  3351 +}
  3352 +
  3353 +static int compare_thresholds(const void *a, const void *b)
  3354 +{
  3355 + const struct mem_cgroup_threshold *_a = a;
  3356 + const struct mem_cgroup_threshold *_b = b;
  3357 +
  3358 + return _a->threshold - _b->threshold;
  3359 +}
  3360 +
  3361 +static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
  3362 + struct eventfd_ctx *eventfd, const char *args)
  3363 +{
  3364 + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
  3365 + struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
  3366 + int type = MEMFILE_TYPE(cft->private);
  3367 + u64 threshold, usage;
  3368 + int size;
  3369 + int i, ret;
  3370 +
  3371 + ret = res_counter_memparse_write_strategy(args, &threshold);
  3372 + if (ret)
  3373 + return ret;
  3374 +
  3375 + mutex_lock(&memcg->thresholds_lock);
  3376 + if (type == _MEM)
  3377 + thresholds = memcg->thresholds;
  3378 + else if (type == _MEMSWAP)
  3379 + thresholds = memcg->memsw_thresholds;
  3380 + else
  3381 + BUG();
  3382 +
  3383 + usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
  3384 +
  3385 + /* Check if a threshold crossed before adding a new one */
  3386 + if (thresholds)
  3387 + __mem_cgroup_threshold(memcg, type == _MEMSWAP);
  3388 +
  3389 + if (thresholds)
  3390 + size = thresholds->size + 1;
  3391 + else
  3392 + size = 1;
  3393 +
  3394 + /* Allocate memory for new array of thresholds */
  3395 + thresholds_new = kmalloc(sizeof(*thresholds_new) +
  3396 + size * sizeof(struct mem_cgroup_threshold),
  3397 + GFP_KERNEL);
  3398 + if (!thresholds_new) {
  3399 + ret = -ENOMEM;
  3400 + goto unlock;
  3401 + }
  3402 + thresholds_new->size = size;
  3403 +
  3404 + /* Copy thresholds (if any) to new array */
  3405 + if (thresholds)
  3406 + memcpy(thresholds_new->entries, thresholds->entries,
  3407 + thresholds->size *
  3408 + sizeof(struct mem_cgroup_threshold));
  3409 + /* Add new threshold */
  3410 + thresholds_new->entries[size - 1].eventfd = eventfd;
  3411 + thresholds_new->entries[size - 1].threshold = threshold;
  3412 +
  3413 + /* Sort thresholds. Registering of new threshold isn't time-critical */
  3414 + sort(thresholds_new->entries, size,
  3415 + sizeof(struct mem_cgroup_threshold),
  3416 + compare_thresholds, NULL);
  3417 +
  3418 + /* Find current threshold */
  3419 + atomic_set(&thresholds_new->current_threshold, -1);
  3420 + for (i = 0; i < size; i++) {
  3421 + if (thresholds_new->entries[i].threshold < usage) {
  3422 + /*
  3423 + * thresholds_new->current_threshold will not be used
  3424 + * until rcu_assign_pointer(), so it's safe to increment
  3425 + * it here.
  3426 + */
  3427 + atomic_inc(&thresholds_new->current_threshold);
  3428 + }
  3429 + }
  3430 +
  3431 + /*
  3432 + * We need to increment refcnt to be sure that all thresholds
  3433 + * will be unregistered before calling __mem_cgroup_free()
  3434 + */
  3435 + mem_cgroup_get(memcg);
  3436 +
  3437 + if (type == _MEM)
  3438 + rcu_assign_pointer(memcg->thresholds, thresholds_new);
  3439 + else
  3440 + rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
  3441 +
  3442 + /* To be sure that nobody uses thresholds before freeing it */
  3443 + synchronize_rcu();
  3444 +
  3445 + kfree(thresholds);
  3446 +unlock:
  3447 + mutex_unlock(&memcg->thresholds_lock);
  3448 +
  3449 + return ret;
  3450 +}
  3451 +
  3452 +static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
  3453 + struct eventfd_ctx *eventfd)
  3454 +{
  3455 + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
  3456 + struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
  3457 + int type = MEMFILE_TYPE(cft->private);
  3458 + u64 usage;
  3459 + int size = 0;
  3460 + int i, j, ret;
  3461 +
  3462 + mutex_lock(&memcg->thresholds_lock);
  3463 + if (type == _MEM)
  3464 + thresholds = memcg->thresholds;
  3465 + else if (type == _MEMSWAP)
  3466 + thresholds = memcg->memsw_thresholds;
  3467 + else
  3468 + BUG();
  3469 +
  3470 + /*
  3471 + * Something went wrong if we're trying to unregister a threshold
  3472 + * when we don't have thresholds
  3473 + */
  3474 + BUG_ON(!thresholds);
  3475 +
  3476 + usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
  3477 +
  3478 + /* Check if a threshold crossed before removing */
  3479 + __mem_cgroup_threshold(memcg, type == _MEMSWAP);
  3480 +
  3481 + /* Calculate the new number of thresholds */
  3482 + for (i = 0; i < thresholds->size; i++) {
  3483 + if (thresholds->entries[i].eventfd != eventfd)
  3484 + size++;
  3485 + }
  3486 +
  3487 + /* Set thresholds array to NULL if we don't have thresholds */
  3488 + if (!size) {
  3489 + thresholds_new = NULL;
  3490 + goto assign;
  3491 + }
  3492 +
  3493 + /* Allocate memory for new array of thresholds */
  3494 + thresholds_new = kmalloc(sizeof(*thresholds_new) +
  3495 + size * sizeof(struct mem_cgroup_threshold),
  3496 + GFP_KERNEL);
  3497 + if (!thresholds_new) {
  3498 + ret = -ENOMEM;
  3499 + goto unlock;
  3500 + }
  3501 + thresholds_new->size = size;
  3502 +
  3503 + /* Copy thresholds and find current threshold */
  3504 + atomic_set(&thresholds_new->current_threshold, -1);
  3505 + for (i = 0, j = 0; i < thresholds->size; i++) {
  3506 + if (thresholds->entries[i].eventfd == eventfd)
  3507 + continue;
  3508 +
  3509 + thresholds_new->entries[j] = thresholds->entries[i];
  3510 + if (thresholds_new->entries[j].threshold < usage) {
  3511 + /*
  3512 + * thresholds_new->current_threshold will not be used
  3513 + * until rcu_assign_pointer(), so it's safe to increment
  3514 + * it here.
  3515 + */
  3516 + atomic_inc(&thresholds_new->current_threshold);
  3517 + }
  3518 + j++;
  3519 + }
  3520 +
  3521 +assign:
  3522 + if (type == _MEM)
  3523 + rcu_assign_pointer(memcg->thresholds, thresholds_new);
  3524 + else
  3525 + rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
  3526 +
  3527 + /* To be sure that nobody uses thresholds before freeing it */
  3528 + synchronize_rcu();
  3529 +
  3530 + for (i = 0; i < thresholds->size - size; i++)
  3531 + mem_cgroup_put(memcg);
  3532 +
  3533 + kfree(thresholds);
  3534 +unlock:
  3535 + mutex_unlock(&memcg->thresholds_lock);
  3536 +
  3537 + return ret;
  3538 +}
  3539 +
3236 3540 static struct cftype mem_cgroup_files[] = {
3237 3541 {
3238 3542 .name = "usage_in_bytes",
3239 3543 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3240 3544 .read_u64 = mem_cgroup_read,
  3545 + .register_event = mem_cgroup_register_event,
  3546 + .unregister_event = mem_cgroup_unregister_event,
3241 3547 },
3242 3548 {
3243 3549 .name = "max_usage_in_bytes",
... ... @@ -3294,6 +3600,8 @@
3294 3600 .name = "memsw.usage_in_bytes",
3295 3601 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3296 3602 .read_u64 = mem_cgroup_read,
  3603 + .register_event = mem_cgroup_register_event,
  3604 + .unregister_event = mem_cgroup_unregister_event,
3297 3605 },
3298 3606 {
3299 3607 .name = "memsw.max_usage_in_bytes",
... ... @@ -3538,6 +3846,7 @@
3538 3846 mem->swappiness = get_swappiness(parent);
3539 3847 atomic_set(&mem->refcnt, 1);
3540 3848 mem->move_charge_at_immigrate = 0;
  3849 + mutex_init(&mem->thresholds_lock);
3541 3850 return &mem->css;
3542 3851 free_out:
3543 3852 __mem_cgroup_free(mem);