Commit 956db3ca0606e78456786ef19fd4dc7a5151a6e1

Authored by Cliff Wickman
Committed by Linus Torvalds
1 parent 31a7df01fd

hotplug cpu: move tasks in empty cpusets to parent

This patch corrects a situation that occurs when one disables all the cpus in
a cpuset.

Currently, the disabled (cpu-less) cpuset inherits the cpus of its parent,
which is incorrect because it may then overlap its cpu-exclusive sibling.

Tasks of an empty cpuset should be moved to the parent of their current
cpuset, or, if that parent also has no cpus, to its parent, and so on up
the hierarchy.
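
A condensed sketch of that walk, using the names added to kernel/cpuset.c
below (see remove_tasks_in_empty_cpuset() in the diff); top_cpuset always
has online cpus, so the loop is guaranteed to terminate:

	static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
	{
		struct cpuset *parent = cs->parent;

		/* walk up until an ancestor that still has cpus is found */
		while (cpus_empty(parent->cpus_allowed))
			parent = parent->parent;

		/* reattach every task of 'cs' to that ancestor */
		move_member_tasks_to_cpuset(cs, parent);
	}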

The empty cpuset should then be released (if it is flagged notify_on_release).

This depends on the cgroup_scan_tasks() function (proposed by David
Rientjes) to iterate through all tasks in the cpu-less cpuset; we
deliberately avoid a walk of the global tasklist.
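
Roughly, the scan is set up as in move_member_tasks_to_cpuset() in the
kernel/cpuset.c hunk below: a cgroup_scanner whose test_task is NULL
selects every task in the source cgroup, and whose process_task callback
reattaches each one to the destination cgroup:

	struct cpuset_hotplug_scanner scan;

	scan.scan.cg           = from->css.cgroup;   /* source cgroup */
	scan.scan.test_task    = NULL;               /* select all tasks */
	scan.scan.process_task = cpuset_do_move_task;
	scan.scan.heap         = NULL;
	scan.to                = to->css.cgroup;     /* destination cgroup */

	if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
		printk(KERN_ERR "cgroup_scan_tasks failed\n");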

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 145 additions and 45 deletions

include/linux/cgroup.h
... ... @@ -318,6 +318,7 @@
318 318 struct cgroup_iter *it);
319 319 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
320 320 int cgroup_scan_tasks(struct cgroup_scanner *scan);
  321 +int cgroup_attach_task(struct cgroup *, struct task_struct *);
321 322  
322 323 #else /* !CONFIG_CGROUPS */
323 324  

kernel/cgroup.c
... ... @@ -489,7 +489,7 @@
489 489 * Any task can increment and decrement the count field without lock.
490 490 * So in general, code holding cgroup_mutex can't rely on the count
491 491 * field not changing. However, if the count goes to zero, then only
492   - * attach_task() can increment it again. Because a count of zero
  492 + * cgroup_attach_task() can increment it again. Because a count of zero
493 493 * means that no tasks are currently attached, therefore there is no
494 494 * way a task attached to that cgroup can fork (the other way to
495 495 * increment the count). So code holding cgroup_mutex can safely
... ... @@ -520,17 +520,17 @@
520 520 * The task_lock() exception
521 521 *
522 522 * The need for this exception arises from the action of
523   - * attach_task(), which overwrites one tasks cgroup pointer with
  523 + * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
524 524 * another. It does so using cgroup_mutexe, however there are
525 525 * several performance critical places that need to reference
526 526 * task->cgroup without the expense of grabbing a system global
527 527 * mutex. Therefore except as noted below, when dereferencing or, as
528   - * in attach_task(), modifying a task'ss cgroup pointer we use
  528 + * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
529 529 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
530 530 * the task_struct routinely used for such matters.
531 531 *
532 532 * P.S. One more locking exception. RCU is used to guard the
533   - * update of a tasks cgroup pointer by attach_task()
  533 + * update of a tasks cgroup pointer by cgroup_attach_task()
534 534 */
535 535  
536 536 /**
... ... @@ -1194,7 +1194,7 @@
1194 1194 * Call holding cgroup_mutex. May take task_lock of
1195 1195 * the task 'pid' during call.
1196 1196 */
1197   -static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
  1197 +int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1198 1198 {
1199 1199 int retval = 0;
1200 1200 struct cgroup_subsys *ss;
... ... @@ -1287,7 +1287,7 @@
1287 1287 get_task_struct(tsk);
1288 1288 }
1289 1289  
1290   - ret = attach_task(cgrp, tsk);
  1290 + ret = cgroup_attach_task(cgrp, tsk);
1291 1291 put_task_struct(tsk);
1292 1292 return ret;
1293 1293 }
... ... @@ -2514,7 +2514,7 @@
2514 2514 * - Used for /proc/<pid>/cgroup.
2515 2515 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
2516 2516 * doesn't really matter if tsk->cgroup changes after we read it,
2517   - * and we take cgroup_mutex, keeping attach_task() from changing it
  2517 + * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
2518 2518 * anyway. No need to check that tsk->cgroup != NULL, thanks to
2519 2519 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
2520 2520 * cgroup to top_cgroup.
... ... @@ -2625,7 +2625,7 @@
2625 2625 * A pointer to the shared css_set was automatically copied in
2626 2626 * fork.c by dup_task_struct(). However, we ignore that copy, since
2627 2627 * it was not made under the protection of RCU or cgroup_mutex, so
2628   - * might no longer be a valid cgroup pointer. attach_task() might
  2628 + * might no longer be a valid cgroup pointer. cgroup_attach_task() might
2629 2629 * have already changed current->cgroups, allowing the previously
2630 2630 * referenced cgroup group to be removed and freed.
2631 2631 *
... ... @@ -2704,8 +2704,8 @@
2704 2704 * attach us to a different cgroup, decrementing the count on
2705 2705 * the first cgroup that we never incremented. But in this case,
2706 2706 * top_cgroup isn't going away, and either task has PF_EXITING set,
2707   - * which wards off any attach_task() attempts, or task is a failed
2708   - * fork, never visible to attach_task.
  2707 + * which wards off any cgroup_attach_task() attempts, or task is a failed
  2708 + * fork, never visible to cgroup_attach_task.
2709 2709 *
2710 2710 */
2711 2711 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
... ... @@ -2845,7 +2845,7 @@
2845 2845 }
2846 2846  
2847 2847 /* All seems fine. Finish by moving the task into the new cgroup */
2848   - ret = attach_task(child, tsk);
  2848 + ret = cgroup_attach_task(child, tsk);
2849 2849 mutex_unlock(&cgroup_mutex);
2850 2850  
2851 2851 out_release:

kernel/cpuset.c
... ... @@ -56,6 +56,8 @@
56 56 #include <asm/atomic.h>
57 57 #include <linux/mutex.h>
58 58 #include <linux/kfifo.h>
  59 +#include <linux/workqueue.h>
  60 +#include <linux/cgroup.h>
59 61  
60 62 /*
61 63 * Tracks how many cpusets are currently defined in system.
... ... @@ -96,6 +98,9 @@
96 98  
97 99 /* partition number for rebuild_sched_domains() */
98 100 int pn;
  101 +
  102 + /* used for walking a cpuset heirarchy */
  103 + struct list_head stack_list;
99 104 };
100 105  
101 106 /* Retrieve the cpuset for a cgroup */
... ... @@ -111,8 +116,11 @@
111 116 return container_of(task_subsys_state(task, cpuset_subsys_id),
112 117 struct cpuset, css);
113 118 }
  119 +struct cpuset_hotplug_scanner {
  120 + struct cgroup_scanner scan;
  121 + struct cgroup *to;
  122 +};
114 123  
115   -
116 124 /* bits in struct cpuset flags field */
117 125 typedef enum {
118 126 CS_CPU_EXCLUSIVE,
... ... @@ -1687,54 +1695,147 @@
1687 1695 return 0;
1688 1696 }
1689 1697  
  1698 +/**
  1699 + * cpuset_do_move_task - move a given task to another cpuset
  1700 + * @tsk: pointer to task_struct the task to move
  1701 + * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
  1702 + *
  1703 + * Called by cgroup_scan_tasks() for each task in a cgroup.
  1704 + * Return nonzero to stop the walk through the tasks.
  1705 + */
  1706 +void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
  1707 +{
  1708 + struct cpuset_hotplug_scanner *chsp;
  1709 +
  1710 + chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
  1711 + cgroup_attach_task(chsp->to, tsk);
  1712 +}
  1713 +
  1714 +/**
  1715 + * move_member_tasks_to_cpuset - move tasks from one cpuset to another
  1716 + * @from: cpuset in which the tasks currently reside
  1717 + * @to: cpuset to which the tasks will be moved
  1718 + *
  1719 + * Called with manage_sem held
  1720 + * callback_mutex must not be held, as attach_task() will take it.
  1721 + *
  1722 + * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
  1723 + * calling callback functions for each.
  1724 + */
  1725 +static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
  1726 +{
  1727 + struct cpuset_hotplug_scanner scan;
  1728 +
  1729 + scan.scan.cg = from->css.cgroup;
  1730 + scan.scan.test_task = NULL; /* select all tasks in cgroup */
  1731 + scan.scan.process_task = cpuset_do_move_task;
  1732 + scan.scan.heap = NULL;
  1733 + scan.to = to->css.cgroup;
  1734 +
  1735 + if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
  1736 + printk(KERN_ERR "move_member_tasks_to_cpuset: "
  1737 + "cgroup_scan_tasks failed\n");
  1738 +}
  1739 +
1690 1740 /*
1691 1741 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
1692 1742 * or memory nodes, we need to walk over the cpuset hierarchy,
1693 1743 * removing that CPU or node from all cpusets. If this removes the
1694   - * last CPU or node from a cpuset, then the guarantee_online_cpus()
1695   - * or guarantee_online_mems() code will use that emptied cpusets
1696   - * parent online CPUs or nodes. Cpusets that were already empty of
1697   - * CPUs or nodes are left empty.
  1744 + * last CPU or node from a cpuset, then move the tasks in the empty
  1745 + * cpuset to its next-highest non-empty parent.
1698 1746 *
1699   - * This routine is intentionally inefficient in a couple of regards.
1700   - * It will check all cpusets in a subtree even if the top cpuset of
1701   - * the subtree has no offline CPUs or nodes. It checks both CPUs and
1702   - * nodes, even though the caller could have been coded to know that
1703   - * only one of CPUs or nodes needed to be checked on a given call.
1704   - * This was done to minimize text size rather than cpu cycles.
  1747 + * The parent cpuset has some superset of the 'mems' nodes that the
  1748 + * newly empty cpuset held, so no migration of memory is necessary.
1705 1749 *
1706   - * Call with both manage_mutex and callback_mutex held.
1707   - *
1708   - * Recursive, on depth of cpuset subtree.
  1750 + * Called with both manage_sem and callback_sem held
1709 1751 */
  1752 +static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  1753 +{
  1754 + struct cpuset *parent;
1710 1755  
1711   -static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
  1756 + /* the cgroup's css_sets list is in use if there are tasks
  1757 + in the cpuset; the list is empty if there are none;
  1758 + the cs->css.refcnt seems always 0 */
  1759 + if (list_empty(&cs->css.cgroup->css_sets))
  1760 + return;
  1761 +
  1762 + /*
  1763 + * Find its next-highest non-empty parent, (top cpuset
  1764 + * has online cpus, so can't be empty).
  1765 + */
  1766 + parent = cs->parent;
  1767 + while (cpus_empty(parent->cpus_allowed)) {
  1768 + /*
  1769 + * this empty cpuset should now be considered to
  1770 + * have been used, and therefore eligible for
  1771 + * release when empty (if it is notify_on_release)
  1772 + */
  1773 + parent = parent->parent;
  1774 + }
  1775 +
  1776 + move_member_tasks_to_cpuset(cs, parent);
  1777 +}
  1778 +
  1779 +/*
  1780 + * Walk the specified cpuset subtree and look for empty cpusets.
  1781 + * The tasks of such cpuset must be moved to a parent cpuset.
  1782 + *
  1783 + * Note that such a notify_on_release cpuset must have had, at some time,
  1784 + * member tasks or cpuset descendants and cpus and memory, before it can
  1785 + * be a candidate for release.
  1786 + *
  1787 + * Called with manage_mutex held. We take callback_mutex to modify
  1788 + * cpus_allowed and mems_allowed.
  1789 + *
  1790 + * This walk processes the tree from top to bottom, completing one layer
  1791 + * before dropping down to the next. It always processes a node before
  1792 + * any of its children.
  1793 + *
  1794 + * For now, since we lack memory hot unplug, we'll never see a cpuset
  1795 + * that has tasks along with an empty 'mems'. But if we did see such
  1796 + * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
  1797 + */
  1798 +static void scan_for_empty_cpusets(const struct cpuset *root)
1712 1799 {
  1800 + struct cpuset *cp; /* scans cpusets being updated */
  1801 + struct cpuset *child; /* scans child cpusets of cp */
  1802 + struct list_head queue;
1713 1803 struct cgroup *cont;
1714   - struct cpuset *c;
1715 1804  
1716   - /* Each of our child cpusets mems must be online */
1717   - list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
1718   - c = cgroup_cs(cont);
1719   - guarantee_online_cpus_mems_in_subtree(c);
1720   - if (!cpus_empty(c->cpus_allowed))
1721   - guarantee_online_cpus(c, &c->cpus_allowed);
1722   - if (!nodes_empty(c->mems_allowed))
1723   - guarantee_online_mems(c, &c->mems_allowed);
  1805 + INIT_LIST_HEAD(&queue);
  1806 +
  1807 + list_add_tail((struct list_head *)&root->stack_list, &queue);
  1808 +
  1809 + mutex_lock(&callback_mutex);
  1810 + while (!list_empty(&queue)) {
  1811 + cp = container_of(queue.next, struct cpuset, stack_list);
  1812 + list_del(queue.next);
  1813 + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
  1814 + child = cgroup_cs(cont);
  1815 + list_add_tail(&child->stack_list, &queue);
  1816 + }
  1817 + cont = cp->css.cgroup;
  1818 + /* Remove offline cpus and mems from this cpuset. */
  1819 + cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
  1820 + nodes_and(cp->mems_allowed, cp->mems_allowed,
  1821 + node_states[N_HIGH_MEMORY]);
  1822 + if ((cpus_empty(cp->cpus_allowed) ||
  1823 + nodes_empty(cp->mems_allowed))) {
  1824 + /* Move tasks from the empty cpuset to a parent */
  1825 + mutex_unlock(&callback_mutex);
  1826 + remove_tasks_in_empty_cpuset(cp);
  1827 + mutex_lock(&callback_mutex);
  1828 + }
1724 1829 }
  1830 + mutex_unlock(&callback_mutex);
  1831 + return;
1725 1832 }
1726 1833  
1727 1834 /*
1728 1835 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1729 1836 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1730   - * track what's online after any CPU or memory node hotplug or unplug
1731   - * event.
  1837 + * track what's online after any CPU or memory node hotplug or unplug event.
1732 1838 *
1733   - * To ensure that we don't remove a CPU or node from the top cpuset
1734   - * that is currently in use by a child cpuset (which would violate
1735   - * the rule that cpusets must be subsets of their parent), we first
1736   - * call the recursive routine guarantee_online_cpus_mems_in_subtree().
1737   - *
1738 1839 * Since there are two callers of this routine, one for CPU hotplug
1739 1840 * events and one for memory node hotplug events, we could have coded
1740 1841 * two separate routines here. We code it as a single common routine
... ... @@ -1744,13 +1845,11 @@
1744 1845 static void common_cpu_mem_hotplug_unplug(void)
1745 1846 {
1746 1847 cgroup_lock();
1747   - mutex_lock(&callback_mutex);
1748 1848  
1749   - guarantee_online_cpus_mems_in_subtree(&top_cpuset);
1750 1849 top_cpuset.cpus_allowed = cpu_online_map;
1751 1850 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
  1851 + scan_for_empty_cpusets(&top_cpuset);
1752 1852  
1753   - mutex_unlock(&callback_mutex);
1754 1853 cgroup_unlock();
1755 1854 }
1756 1855