Commit cc8475822f8a4b17e9b76e7fadb6b9a341860422
Committed by
Linus Torvalds
1 parent
417eead304
Exists in
master
and in
4 other branches
memory cgroup enhancements: force_empty interface for dropping all account in empty cgroup
This patch adds an interface "memory.force_empty". Any write to this file will drop all charges in this cgroup if there is no task under. %echo 1 > /....../memory.force_empty will drop all charges of memory cgroup if cgroup's tasks is empty. This is useful to invoke rmdir() against memory cgroup successfully. Tested and worked well on x86_64/fake-NUMA system. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Pavel Emelianov <xemul@openvz.org> Cc: Paul Menage <menage@google.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Kirill Korotaev <dev@sw.ru> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: David Rientjes <rientjes@google.com> Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 103 additions and 7 deletions Side-by-side Diff
mm/memcontrol.c
... | ... | @@ -471,6 +471,7 @@ |
471 | 471 | page = pc->page; |
472 | 472 | /* |
473 | 473 | * get page->cgroup and clear it under lock. |
474 | + * force_empty can drop page->cgroup without checking refcnt. | |
474 | 475 | */ |
475 | 476 | if (clear_page_cgroup(page, pc) == pc) { |
476 | 477 | mem = pc->mem_cgroup; |
... | ... | @@ -480,13 +481,6 @@ |
480 | 481 | list_del_init(&pc->lru); |
481 | 482 | spin_unlock_irqrestore(&mem->lru_lock, flags); |
482 | 483 | kfree(pc); |
483 | - } else { | |
484 | - /* | |
485 | - * Note:This will be removed when force-empty patch is | |
486 | - * applied. just show warning here. | |
487 | - */ | |
488 | - printk(KERN_ERR "Race in mem_cgroup_uncharge() ?"); | |
489 | - dump_stack(); | |
490 | 484 | } |
491 | 485 | } |
492 | 486 | } |
... | ... | @@ -534,6 +528,76 @@ |
534 | 528 | return; |
535 | 529 | } |
536 | 530 | |
531 | +/* | |
532 | + * This routine traverse page_cgroup in given list and drop them all. | |
533 | + * This routine ignores page_cgroup->ref_cnt. | |
534 | + * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | |
535 | + */ | |
536 | +#define FORCE_UNCHARGE_BATCH (128) | |
537 | +static void | |
538 | +mem_cgroup_force_empty_list(struct mem_cgroup *mem, struct list_head *list) | |
539 | +{ | |
540 | + struct page_cgroup *pc; | |
541 | + struct page *page; | |
542 | + int count; | |
543 | + unsigned long flags; | |
544 | + | |
545 | +retry: | |
546 | + count = FORCE_UNCHARGE_BATCH; | |
547 | + spin_lock_irqsave(&mem->lru_lock, flags); | |
548 | + | |
549 | + while (--count && !list_empty(list)) { | |
550 | + pc = list_entry(list->prev, struct page_cgroup, lru); | |
551 | + page = pc->page; | |
552 | + /* Avoid race with charge */ | |
553 | + atomic_set(&pc->ref_cnt, 0); | |
554 | + if (clear_page_cgroup(page, pc) == pc) { | |
555 | + css_put(&mem->css); | |
556 | + res_counter_uncharge(&mem->res, PAGE_SIZE); | |
557 | + list_del_init(&pc->lru); | |
558 | + kfree(pc); | |
559 | + } else /* being uncharged ? ...do relax */ | |
560 | + break; | |
561 | + } | |
562 | + spin_unlock_irqrestore(&mem->lru_lock, flags); | |
563 | + if (!list_empty(list)) { | |
564 | + cond_resched(); | |
565 | + goto retry; | |
566 | + } | |
567 | + return; | |
568 | +} | |
569 | + | |
570 | +/* | |
571 | + * make mem_cgroup's charge to be 0 if there is no task. | |
572 | + * This enables deleting this mem_cgroup. | |
573 | + */ | |
574 | + | |
575 | +int mem_cgroup_force_empty(struct mem_cgroup *mem) | |
576 | +{ | |
577 | + int ret = -EBUSY; | |
578 | + css_get(&mem->css); | |
579 | + /* | |
580 | + * page reclaim code (kswapd etc..) will move pages between | |
581 | + * active_list <-> inactive_list while we don't take a lock. | 
582 | + * So, we have to do loop here until all lists are empty. | |
583 | + */ | |
584 | + while (!(list_empty(&mem->active_list) && | |
585 | + list_empty(&mem->inactive_list))) { | |
586 | + if (atomic_read(&mem->css.cgroup->count) > 0) | |
587 | + goto out; | |
588 | + /* drop all page_cgroup in active_list */ | |
589 | + mem_cgroup_force_empty_list(mem, &mem->active_list); | |
590 | + /* drop all page_cgroup in inactive_list */ | |
591 | + mem_cgroup_force_empty_list(mem, &mem->inactive_list); | |
592 | + } | |
593 | + ret = 0; | |
594 | +out: | |
595 | + css_put(&mem->css); | |
596 | + return ret; | |
597 | +} | |
598 | + | |
599 | + | |
600 | + | |
537 | 601 | int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) |
538 | 602 | { |
539 | 603 | *tmp = memparse(buf, &buf); |
... | ... | @@ -619,6 +683,33 @@ |
619 | 683 | ppos, buf, s - buf); |
620 | 684 | } |
621 | 685 | |
686 | + | |
687 | +static ssize_t mem_force_empty_write(struct cgroup *cont, | |
688 | + struct cftype *cft, struct file *file, | |
689 | + const char __user *userbuf, | |
690 | + size_t nbytes, loff_t *ppos) | |
691 | +{ | |
692 | + struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | |
693 | + int ret; | |
694 | + ret = mem_cgroup_force_empty(mem); | |
695 | + if (!ret) | |
696 | + ret = nbytes; | |
697 | + return ret; | |
698 | +} | |
699 | + | |
700 | +/* | |
701 | + * Note: This should be removed if cgroup supports write-only file. | |
702 | + */ | |
703 | + | |
704 | +static ssize_t mem_force_empty_read(struct cgroup *cont, | |
705 | + struct cftype *cft, | |
706 | + struct file *file, char __user *userbuf, | |
707 | + size_t nbytes, loff_t *ppos) | |
708 | +{ | |
709 | + return -EINVAL; | |
710 | +} | |
711 | + | |
712 | + | |
622 | 713 | static struct cftype mem_cgroup_files[] = { |
623 | 714 | { |
624 | 715 | .name = "usage_in_bytes", |
... | ... | @@ -640,6 +731,11 @@ |
640 | 731 | .name = "control_type", |
641 | 732 | .write = mem_control_type_write, |
642 | 733 | .read = mem_control_type_read, |
734 | + }, | |
735 | + { | |
736 | + .name = "force_empty", | |
737 | + .write = mem_force_empty_write, | |
738 | + .read = mem_force_empty_read, | |
643 | 739 | }, |
644 | 740 | }; |
645 | 741 |