Commit 9490ff275606da012d5b373342a49610ad61cb81

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent dc98df5a1b

memcg: oom notifier

Considering containers or other resource management software in userland,
event notification of OOM in memcg should be implemented.  Now that memcg has
a "threshold" notifier which uses eventfd, we can make use of it for oom
notification.

This patch adds an oom notification eventfd callback for memcg.  The usage is
very similar to the threshold notifier, but the control file is
memory.oom_control and no arguments other than the eventfd are required.

	% cgroup_event_notifier /cgroup/A/memory.oom_control dummy
	(About cgroup_event_notifier, see Documentation/cgroups/)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 111 additions and 9 deletions Side-by-side Diff

Documentation/cgroups/memory.txt
... ... @@ -184,6 +184,9 @@
184 184  
185 185 Note2: When panic_on_oom is set to "2", the whole system will panic.
186 186  
  187 +When an oom event notifier is registered, the event will be delivered.
  188 +(See oom_control section)
  189 +
187 190 2. Locking
188 191  
189 192 The memory controller uses the following hierarchy
... ... @@ -488,7 +491,22 @@
488 491  
489 492 It's applicable for root and non-root cgroup.
490 493  
491   -10. TODO
  494 +10. OOM Control
  495 +
  496 +Memory controller implements an oom notifier using the cgroup notification
  497 +API (see cgroups.txt). It allows multiple oom notification deliveries to be
  498 +registered; each one gets a notification when oom happens.
  499 +
  500 +To register a notifier, an application needs to:
  501 + - create an eventfd using eventfd(2)
  502 + - open memory.oom_control file
  503 + - write string like "<event_fd> <fd of memory.oom_control>" to cgroup.event_control
  504 +
  505 +The application will be notified through eventfd when oom happens.
  506 +OOM notification doesn't work for root cgroup.
  507 +
  508 +
  509 +11. TODO
492 510  
493 511 1. Add support for accounting huge pages (as a separate controller)
494 512 2. Make per-cgroup scanner reclaim not-shared pages first
... ... @@ -149,6 +149,7 @@
149 149 u64 threshold;
150 150 };
151 151  
  152 +/* For threshold */
152 153 struct mem_cgroup_threshold_ary {
153 154 /* An array index points to threshold just below usage. */
154 155 atomic_t current_threshold;
155 156  
... ... @@ -157,8 +158,14 @@
157 158 /* Array of thresholds */
158 159 struct mem_cgroup_threshold entries[0];
159 160 };
  161 +/* for OOM */
  162 +struct mem_cgroup_eventfd_list {
  163 + struct list_head list;
  164 + struct eventfd_ctx *eventfd;
  165 +};
160 166  
161 167 static void mem_cgroup_threshold(struct mem_cgroup *mem);
  168 +static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
162 169  
163 170 /*
164 171 * The memory controller data structure. The memory controller controls both
... ... @@ -220,6 +227,9 @@
220 227 /* thresholds for mem+swap usage. RCU-protected */
221 228 struct mem_cgroup_threshold_ary *memsw_thresholds;
222 229  
  230 + /* For oom notifier event fd */
  231 + struct list_head oom_notify;
  232 +
223 233 /*
224 234 * Should we move charges of a task when a task is moved into this
225 235 * mem_cgroup ? And what type of charges should we move ?
226 236  
... ... @@ -282,9 +292,12 @@
282 292 /* for encoding cft->private value on file */
283 293 #define _MEM (0)
284 294 #define _MEMSWAP (1)
  295 +#define _OOM_TYPE (2)
285 296 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
286 297 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
287 298 #define MEMFILE_ATTR(val) ((val) & 0xffff)
  299 +/* Used for OOM notifier */
  300 +#define OOM_CONTROL (0)
288 301  
289 302 /*
290 303 * Reclaim flags for mem_cgroup_hierarchical_reclaim
... ... @@ -1353,6 +1366,8 @@
1353 1366 */
1354 1367 if (!locked)
1355 1368 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
  1369 + else
  1370 + mem_cgroup_oom_notify(mem);
1356 1371 mutex_unlock(&memcg_oom_mutex);
1357 1372  
1358 1373 if (locked)
1359 1374  
... ... @@ -3398,9 +3413,23 @@
3398 3413 return _a->threshold - _b->threshold;
3399 3414 }
3400 3415  
3401   -static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
3402   - struct eventfd_ctx *eventfd, const char *args)
  3416 +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3403 3417 {
  3418 + struct mem_cgroup_eventfd_list *ev;
  3419 +
  3420 + list_for_each_entry(ev, &mem->oom_notify, list)
  3421 + eventfd_signal(ev->eventfd, 1);
  3422 + return 0;
  3423 +}
  3424 +
  3425 +static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
  3426 +{
  3427 + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
  3428 +}
  3429 +
  3430 +static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
  3431 + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
  3432 +{
3404 3433 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3405 3434 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3406 3435 int type = MEMFILE_TYPE(cft->private);
... ... @@ -3483,8 +3512,8 @@
3483 3512 return ret;
3484 3513 }
3485 3514  
3486   -static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
3487   - struct eventfd_ctx *eventfd)
  3515 +static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
  3516 + struct cftype *cft, struct eventfd_ctx *eventfd)
3488 3517 {
3489 3518 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3490 3519 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3491 3520  
... ... @@ -3568,13 +3597,61 @@
3568 3597 return ret;
3569 3598 }
3570 3599  
  3600 +static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
  3601 + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
  3602 +{
  3603 + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
  3604 + struct mem_cgroup_eventfd_list *event;
  3605 + int type = MEMFILE_TYPE(cft->private);
  3606 +
  3607 + BUG_ON(type != _OOM_TYPE);
  3608 + event = kmalloc(sizeof(*event), GFP_KERNEL);
  3609 + if (!event)
  3610 + return -ENOMEM;
  3611 +
  3612 + mutex_lock(&memcg_oom_mutex);
  3613 +
  3614 + event->eventfd = eventfd;
  3615 + list_add(&event->list, &memcg->oom_notify);
  3616 +
  3617 + /* already in OOM ? */
  3618 + if (atomic_read(&memcg->oom_lock))
  3619 + eventfd_signal(eventfd, 1);
  3620 + mutex_unlock(&memcg_oom_mutex);
  3621 +
  3622 + return 0;
  3623 +}
  3624 +
  3625 +static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
  3626 + struct cftype *cft, struct eventfd_ctx *eventfd)
  3627 +{
  3628 + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
  3629 + struct mem_cgroup_eventfd_list *ev, *tmp;
  3630 + int type = MEMFILE_TYPE(cft->private);
  3631 +
  3632 + BUG_ON(type != _OOM_TYPE);
  3633 +
  3634 + mutex_lock(&memcg_oom_mutex);
  3635 +
  3636 + list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
  3637 + if (ev->eventfd == eventfd) {
  3638 + list_del(&ev->list);
  3639 + kfree(ev);
  3640 + }
  3641 + }
  3642 +
  3643 + mutex_unlock(&memcg_oom_mutex);
  3644 +
  3645 + return 0;
  3646 +}
  3647 +
3571 3648 static struct cftype mem_cgroup_files[] = {
3572 3649 {
3573 3650 .name = "usage_in_bytes",
3574 3651 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3575 3652 .read_u64 = mem_cgroup_read,
3576   - .register_event = mem_cgroup_register_event,
3577   - .unregister_event = mem_cgroup_unregister_event,
  3653 + .register_event = mem_cgroup_usage_register_event,
  3654 + .unregister_event = mem_cgroup_usage_unregister_event,
3578 3655 },
3579 3656 {
3580 3657 .name = "max_usage_in_bytes",
... ... @@ -3623,6 +3700,12 @@
3623 3700 .read_u64 = mem_cgroup_move_charge_read,
3624 3701 .write_u64 = mem_cgroup_move_charge_write,
3625 3702 },
  3703 + {
  3704 + .name = "oom_control",
  3705 + .register_event = mem_cgroup_oom_register_event,
  3706 + .unregister_event = mem_cgroup_oom_unregister_event,
  3707 + .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
  3708 + },
3626 3709 };
3627 3710  
3628 3711 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
... ... @@ -3631,8 +3714,8 @@
3631 3714 .name = "memsw.usage_in_bytes",
3632 3715 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3633 3716 .read_u64 = mem_cgroup_read,
3634   - .register_event = mem_cgroup_register_event,
3635   - .unregister_event = mem_cgroup_unregister_event,
  3717 + .register_event = mem_cgroup_usage_register_event,
  3718 + .unregister_event = mem_cgroup_usage_unregister_event,
3636 3719 },
3637 3720 {
3638 3721 .name = "memsw.max_usage_in_bytes",
... ... @@ -3878,6 +3961,7 @@
3878 3961 }
3879 3962 mem->last_scanned_child = 0;
3880 3963 spin_lock_init(&mem->reclaim_param_lock);
  3964 + INIT_LIST_HEAD(&mem->oom_notify);
3881 3965  
3882 3966 if (parent)
3883 3967 mem->swappiness = get_swappiness(parent);