Commit 31a7df01fd0cd786f60873a921aecafac148c290
Committed by: Linus Torvalds
1 parent: dfc05c259e
Exists in: master and in 4 other branches
cgroups: mechanism to process each task in a cgroup
Provide cgroup_scan_tasks(), which iterates through every task in a cgroup, calling a test function and a process function for each. The process function is called without holding css_set_lock.

The idea is David Rientjes', predicting that such a function will make it much easier in the future to extend things that require access to each task in a cgroup without holding the lock.

[akpm@linux-foundation.org: cleanup]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
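A minimal sketch of how a controller might drive the new interface, using only the struct cgroup_scanner fields and the cgroup_scan_tasks() prototype added in this commit; the callback and wrapper names are hypothetical:

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Hypothetical callbacks -- names and filtering policy are illustrative only. */
static int my_test(struct task_struct *p, struct cgroup_scanner *scan)
{
	/*
	 * May be called with css_set_lock held, possibly more than once
	 * per task, so keep it cheap.
	 */
	return !(p->flags & PF_EXITING);
}

static void my_process(struct task_struct *p, struct cgroup_scanner *scan)
{
	/* Called without css_set_lock held; heavier per-task work goes here. */
}

static int my_update_cgroup(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cg           = cgrp,
		.test_task    = my_test,	/* NULL would select every task */
		.process_task = my_process,
		.heap         = NULL,		/* let the scanner allocate a temporary heap */
	};

	/* Returns non-zero only if the temporary heap cannot be allocated. */
	return cgroup_scan_tasks(&scan);
}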
Showing 2 changed files with 200 additions and 12 deletions (side-by-side diff)
include/linux/cgroup.h
... | ... | @@ -14,6 +14,7 @@ |
14 | 14 | #include <linux/nodemask.h> |
15 | 15 | #include <linux/rcupdate.h> |
16 | 16 | #include <linux/cgroupstats.h> |
17 | +#include <linux/prio_heap.h> | |
17 | 18 | |
18 | 19 | #ifdef CONFIG_CGROUPS |
19 | 20 | |
... | ... | @@ -207,6 +208,14 @@ |
207 | 208 | int (*release) (struct inode *inode, struct file *file); |
208 | 209 | }; |
209 | 210 | |
211 | +struct cgroup_scanner { | |
212 | + struct cgroup *cg; | |
213 | + int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); | |
214 | + void (*process_task)(struct task_struct *p, | |
215 | + struct cgroup_scanner *scan); | |
216 | + struct ptr_heap *heap; | |
217 | +}; | |
218 | + | |
210 | 219 | /* Add a new file to the given cgroup directory. Should only be |
211 | 220 | * called by subsystems from within a populate() method */ |
212 | 221 | int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys, |
213 | 222 | |
... | ... | @@ -299,11 +308,16 @@ |
299 | 308 | * returns NULL or until you want to end the iteration |
300 | 309 | * |
301 | 310 | * 3) call cgroup_iter_end() to destroy the iterator. |
311 | + * | |
312 | + * Or, call cgroup_scan_tasks() to iterate through every task in a cgroup. | |
313 | + * - cgroup_scan_tasks() holds the css_set_lock when calling the test_task() | |
314 | + * callback, but not while calling the process_task() callback. | |
302 | 315 | */ |
303 | 316 | void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it); |
304 | 317 | struct task_struct *cgroup_iter_next(struct cgroup *cont, |
305 | 318 | struct cgroup_iter *it); |
306 | 319 | void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it); |
320 | +int cgroup_scan_tasks(struct cgroup_scanner *scan); | |
307 | 321 | |
308 | 322 | #else /* !CONFIG_CGROUPS */ |
309 | 323 |
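The comment block above spells out the three-step iterator protocol; a minimal sketch of it, with a hypothetical helper name, assuming only the cgroup_iter_* declarations shown in this hunk:

/* Hypothetical example: count the tasks in a cgroup with the iterator API. */
static int my_count_tasks(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *p;
	int count = 0;

	cgroup_iter_start(cgrp, &it);		/* takes css_set_lock */
	while ((p = cgroup_iter_next(cgrp, &it)))
		count++;			/* keep the loop body cheap: the lock is held */
	cgroup_iter_end(cgrp, &it);		/* drops css_set_lock */

	return count;
}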
kernel/cgroup.c
... | ... | @@ -1695,6 +1695,29 @@ |
1695 | 1695 | it->task = cg->tasks.next; |
1696 | 1696 | } |
1697 | 1697 | |
1698 | +/* | |
1699 | + * To reduce the fork() overhead for systems that are not actually | |
1700 | + * using their cgroups capability, we don't maintain the lists running | |
1701 | + * through each css_set to its tasks until we see the list actually | |
1702 | + * used - in other words after the first call to cgroup_iter_start(). | |
1703 | + * | |
1704 | + * The tasklist_lock is not held here, as do_each_thread() and | |
1705 | + * while_each_thread() are protected by RCU. | |
1706 | + */ | |
1707 | +void cgroup_enable_task_cg_lists(void) | |
1708 | +{ | |
1709 | + struct task_struct *p, *g; | |
1710 | + write_lock(&css_set_lock); | |
1711 | + use_task_css_set_links = 1; | |
1712 | + do_each_thread(g, p) { | |
1713 | + task_lock(p); | |
1714 | + if (list_empty(&p->cg_list)) | |
1715 | + list_add(&p->cg_list, &p->cgroups->tasks); | |
1716 | + task_unlock(p); | |
1717 | + } while_each_thread(g, p); | |
1718 | + write_unlock(&css_set_lock); | |
1719 | +} | |
1720 | + | |
1698 | 1721 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
1699 | 1722 | { |
1700 | 1723 | /* |
... | ... | @@ -1702,18 +1725,9 @@ |
1702 | 1725 | * we need to enable the list linking each css_set to its |
1703 | 1726 | * tasks, and fix up all existing tasks. |
1704 | 1727 | */ |
1705 | - if (!use_task_css_set_links) { | |
1706 | - struct task_struct *p, *g; | |
1707 | - write_lock(&css_set_lock); | |
1708 | - use_task_css_set_links = 1; | |
1709 | - do_each_thread(g, p) { | |
1710 | - task_lock(p); | |
1711 | - if (list_empty(&p->cg_list)) | |
1712 | - list_add(&p->cg_list, &p->cgroups->tasks); | |
1713 | - task_unlock(p); | |
1714 | - } while_each_thread(g, p); | |
1715 | - write_unlock(&css_set_lock); | |
1716 | - } | |
1728 | + if (!use_task_css_set_links) | |
1729 | + cgroup_enable_task_cg_lists(); | |
1730 | + | |
1717 | 1731 | read_lock(&css_set_lock); |
1718 | 1732 | it->cg_link = &cgrp->css_sets; |
1719 | 1733 | cgroup_advance_iter(cgrp, it); |
... | ... | @@ -1744,6 +1758,166 @@ |
1744 | 1758 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) |
1745 | 1759 | { |
1746 | 1760 | read_unlock(&css_set_lock); |
1761 | +} | |
1762 | + | |
1763 | +static inline int started_after_time(struct task_struct *t1, | |
1764 | + struct timespec *time, | |
1765 | + struct task_struct *t2) | |
1766 | +{ | |
1767 | + int start_diff = timespec_compare(&t1->start_time, time); | |
1768 | + if (start_diff > 0) { | |
1769 | + return 1; | |
1770 | + } else if (start_diff < 0) { | |
1771 | + return 0; | |
1772 | + } else { | |
1773 | + /* | |
1774 | + * Arbitrarily, if two processes started at the same | |
1775 | + * time, we'll say that the lower pointer value | |
1776 | + * started first. Note that t2 may have exited by now | |
1777 | + * so this may not be a valid pointer any longer, but | |
1778 | + * that's fine - it still serves to distinguish | |
1779 | + * between two tasks started (effectively) simultaneously. | |
1780 | + */ | |
1781 | + return t1 > t2; | |
1782 | + } | |
1783 | +} | |
1784 | + | |
1785 | +/* | |
1786 | + * This function is a callback from heap_insert() and is used to order | |
1787 | + * the heap. | |
1788 | + * In this case we order the heap in descending task start time. | |
1789 | + */ | |
1790 | +static inline int started_after(void *p1, void *p2) | |
1791 | +{ | |
1792 | + struct task_struct *t1 = p1; | |
1793 | + struct task_struct *t2 = p2; | |
1794 | + return started_after_time(t1, &t2->start_time, t2); | |
1795 | +} | |
1796 | + | |
1797 | +/** | |
1798 | + * cgroup_scan_tasks - iterate through all the tasks in a cgroup | |
1799 | + * @scan: struct cgroup_scanner containing arguments for the scan | |
1800 | + * | |
1801 | + * Arguments include pointers to callback functions test_task() and | |
1802 | + * process_task(). | |
1803 | + * Iterate through all the tasks in a cgroup, calling test_task() for each, | |
1804 | + * and if it returns true, call process_task() for it also. | |
1805 | + * The test_task pointer may be NULL, meaning always true (select all tasks). | |
1806 | + * Effectively duplicates cgroup_iter_{start,next,end}() | |
1807 | + * but does not lock css_set_lock for the call to process_task(). | |
1808 | + * The struct cgroup_scanner may be embedded in any structure of the caller's | |
1809 | + * creation. | |
1810 | + * It is guaranteed that process_task() will act on every task that | |
1811 | + * is a member of the cgroup for the duration of this call. This | |
1812 | + * function may or may not call process_task() for tasks that exit | |
1813 | + * or move to a different cgroup during the call, or are forked or | |
1814 | + * move into the cgroup during the call. | |
1815 | + * | |
1816 | + * Note that test_task() may be called with locks held, and may in some | |
1817 | + * situations be called multiple times for the same task, so it should | |
1818 | + * be cheap. | |
1819 | + * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been | |
1820 | + * pre-allocated and will be used for heap operations (and its "gt" member will | |
1821 | + * be overwritten), else a temporary heap will be used (allocation of which | |
1822 | + * may cause this function to fail). | |
1823 | + */ | |
1824 | +int cgroup_scan_tasks(struct cgroup_scanner *scan) | |
1825 | +{ | |
1826 | + int retval, i; | |
1827 | + struct cgroup_iter it; | |
1828 | + struct task_struct *p, *dropped; | |
1829 | + /* Never dereference latest_task, since it's not refcounted */ | |
1830 | + struct task_struct *latest_task = NULL; | |
1831 | + struct ptr_heap tmp_heap; | |
1832 | + struct ptr_heap *heap; | |
1833 | + struct timespec latest_time = { 0, 0 }; | |
1834 | + | |
1835 | + if (scan->heap) { | |
1836 | + /* The caller supplied our heap and pre-allocated its memory */ | |
1837 | + heap = scan->heap; | |
1838 | + heap->gt = &started_after; | |
1839 | + } else { | |
1840 | + /* We need to allocate our own heap memory */ | |
1841 | + heap = &tmp_heap; | |
1842 | + retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); | |
1843 | + if (retval) | |
1844 | + /* cannot allocate the heap */ | |
1845 | + return retval; | |
1846 | + } | |
1847 | + | |
1848 | + again: | |
1849 | + /* | |
1850 | + * Scan tasks in the cgroup, using the scanner's "test_task" callback | |
1851 | + * to determine which are of interest, and using the scanner's | |
1852 | + * "process_task" callback to process any of them that need an update. | |
1853 | + * Since we don't want to hold any locks during the task updates, | |
1854 | + * gather tasks to be processed in a heap structure. | |
1855 | + * The heap is sorted by descending task start time. | |
1856 | + * If the statically-sized heap fills up, we overflow tasks that | |
1857 | + * started later, and in future iterations only consider tasks that | |
1858 | + * started after the latest task in the previous pass. This | |
1859 | + * guarantees forward progress and that we don't miss any tasks. | |
1860 | + */ | |
1861 | + heap->size = 0; | |
1862 | + cgroup_iter_start(scan->cg, &it); | |
1863 | + while ((p = cgroup_iter_next(scan->cg, &it))) { | |
1864 | + /* | |
1865 | + * Only affect tasks that qualify per the caller's callback, | |
1866 | + * if he provided one | |
1867 | + */ | |
1868 | + if (scan->test_task && !scan->test_task(p, scan)) | |
1869 | + continue; | |
1870 | + /* | |
1871 | + * Only process tasks that started after the last task | |
1872 | + * we processed | |
1873 | + */ | |
1874 | + if (!started_after_time(p, &latest_time, latest_task)) | |
1875 | + continue; | |
1876 | + dropped = heap_insert(heap, p); | |
1877 | + if (dropped == NULL) { | |
1878 | + /* | |
1879 | + * The new task was inserted; the heap wasn't | |
1880 | + * previously full | |
1881 | + */ | |
1882 | + get_task_struct(p); | |
1883 | + } else if (dropped != p) { | |
1884 | + /* | |
1885 | + * The new task was inserted, and pushed out a | |
1886 | + * different task | |
1887 | + */ | |
1888 | + get_task_struct(p); | |
1889 | + put_task_struct(dropped); | |
1890 | + } | |
1891 | + /* | |
1892 | + * Else the new task was newer than anything already in | |
1893 | + * the heap and wasn't inserted | |
1894 | + */ | |
1895 | + } | |
1896 | + cgroup_iter_end(scan->cg, &it); | |
1897 | + | |
1898 | + if (heap->size) { | |
1899 | + for (i = 0; i < heap->size; i++) { | |
1900 | + struct task_struct *p = heap->ptrs[i]; | |
1901 | + if (i == 0) { | |
1902 | + latest_time = p->start_time; | |
1903 | + latest_task = p; | |
1904 | + } | |
1905 | + /* Process the task per the caller's callback */ | |
1906 | + scan->process_task(p, scan); | |
1907 | + put_task_struct(p); | |
1908 | + } | |
1909 | + /* | |
1910 | + * If we had to process any tasks at all, scan again | |
1911 | + * in case some of them were in the middle of forking | |
1912 | + * children that didn't get processed. | |
1913 | + * Not the most efficient way to do it, but it avoids | |
1914 | + * having to take callback_mutex in the fork path | |
1915 | + */ | |
1916 | + goto again; | |
1917 | + } | |
1918 | + if (heap == &tmp_heap) | |
1919 | + heap_free(&tmp_heap); | |
1920 | + return 0; | |
1747 | 1921 | } |
1748 | 1922 | |
1749 | 1923 | /* |
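The cgroup_scan_tasks() comment above also lets the caller pass a pre-allocated heap so repeated scans avoid an allocation per call. A rough sketch of that variant, reusing the hypothetical my_process() callback from the first example and the heap_init()/heap_free() helpers from linux/prio_heap.h that the implementation itself calls:

/* Hypothetical example: reuse a caller-supplied heap for the scan. */
static int my_scan_with_heap(struct cgroup *cgrp)
{
	struct ptr_heap heap;
	struct cgroup_scanner scan = {
		.cg           = cgrp,
		.test_task    = NULL,		/* NULL selects every task */
		.process_task = my_process,
		.heap         = &heap,
	};
	int retval;

	/*
	 * The comparator passed here does not matter: cgroup_scan_tasks()
	 * overwrites heap->gt with its own started_after() ordering.
	 */
	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	retval = cgroup_scan_tasks(&scan);
	heap_free(&heap);
	return retval;
}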