Commit 31a7df01fd0cd786f60873a921aecafac148c290

Authored by Cliff Wickman
Committed by Linus Torvalds
1 parent dfc05c259e

cgroups: mechanism to process each task in a cgroup

Provide cgroup_scan_tasks(), which iterates through every task in a cgroup,
calling a test function and a process function for each.  The process
function is called without holding the css_set_lock (see the usage sketch
below).

The idea is David Rientjes': such a function should make it much easier in
the future to extend things that require access to each task in a cgroup
without holding the lock.

[akpm@linux-foundation.org: cleanup]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
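
A minimal usage sketch, assuming a hypothetical subsystem that wants to
visit every task in a cgroup.  The names my_test(), my_process() and
my_scan_cgroup() are illustrative only and not part of this commit:

	#include <linux/kernel.h>
	#include <linux/sched.h>
	#include <linux/cgroup.h>

	/* Cheap filter: runs under css_set_lock, possibly more than
	 * once per task, so keep it trivial. */
	static int my_test(struct task_struct *p, struct cgroup_scanner *scan)
	{
		return !(p->flags & PF_EXITING);
	}

	/* Heavier work goes here: runs without css_set_lock held. */
	static void my_process(struct task_struct *p,
			       struct cgroup_scanner *scan)
	{
		printk(KERN_DEBUG "scanned task %d\n", p->pid);
	}

	static int my_scan_cgroup(struct cgroup *cgrp)
	{
		struct cgroup_scanner scan = {
			.cg		= cgrp,
			.test_task	= my_test,	/* NULL selects all tasks */
			.process_task	= my_process,
			.heap		= NULL,		/* use a temporary heap */
		};

		return cgroup_scan_tasks(&scan);
	}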

Showing 2 changed files with 200 additions and 12 deletions

include/linux/cgroup.h
... ... @@ -14,6 +14,7 @@
14 14 #include <linux/nodemask.h>
15 15 #include <linux/rcupdate.h>
16 16 #include <linux/cgroupstats.h>
  17 +#include <linux/prio_heap.h>
17 18  
18 19 #ifdef CONFIG_CGROUPS
19 20  
... ... @@ -207,6 +208,14 @@
207 208 int (*release) (struct inode *inode, struct file *file);
208 209 };
209 210  
  211 +struct cgroup_scanner {
  212 + struct cgroup *cg;
  213 + int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
  214 + void (*process_task)(struct task_struct *p,
  215 + struct cgroup_scanner *scan);
  216 + struct ptr_heap *heap;
  217 +};
  218 +
210 219 /* Add a new file to the given cgroup directory. Should only be
211 220 * called by subsystems from within a populate() method */
212 221 int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
213 222  
... ... @@ -299,11 +308,16 @@
299 308 * returns NULL or until you want to end the iteration
300 309 *
301 310 * 3) call cgroup_iter_end() to destroy the iterator.
  311 + *
  312 + * Or, call cgroup_scan_tasks() to iterate through every task in a cgroup.
  313 + * - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
  314 + * callback, but not while calling the process_task() callback.
302 315 */
303 316 void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
304 317 struct task_struct *cgroup_iter_next(struct cgroup *cont,
305 318 struct cgroup_iter *it);
306 319 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
  320 +int cgroup_scan_tasks(struct cgroup_scanner *scan);
307 321  
308 322 #else /* !CONFIG_CGROUPS */
309 323  
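
The heap member of struct cgroup_scanner above lets a caller avoid the
allocation inside cgroup_scan_tasks() entirely: pre-allocate a ptr_heap once
and point scan->heap at it, and the scan itself can no longer fail with
-ENOMEM.  Its "gt" comparator is overwritten by cgroup_scan_tasks(), so the
one passed to heap_init() is a placeholder.  A hedged sketch, reusing the
hypothetical my_process() from the example above:

	#include <linux/prio_heap.h>

	static struct ptr_heap my_heap;	/* illustrative; outlives the scans */

	static int my_scan_init(void)
	{
		/* Same capacity the temporary heap would otherwise use;
		 * the comparator is set later by cgroup_scan_tasks(). */
		return heap_init(&my_heap, PAGE_SIZE, GFP_KERNEL, NULL);
	}

	static int my_scan_cgroup_prealloc(struct cgroup *cgrp)
	{
		struct cgroup_scanner scan = {
			.cg		= cgrp,
			.test_task	= NULL,		/* select every task */
			.process_task	= my_process,
			.heap		= &my_heap,	/* no allocation in the scan */
		};

		return cgroup_scan_tasks(&scan);
	}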

kernel/cgroup.c
... ... @@ -1695,6 +1695,29 @@
1695 1695 it->task = cg->tasks.next;
1696 1696 }
1697 1697  
  1698 +/*
  1699 + * To reduce the fork() overhead for systems that are not actually
  1700 + * using their cgroups capability, we don't maintain the lists running
  1701 + * through each css_set to its tasks until we see the list actually
  1702 + * used - in other words after the first call to cgroup_iter_start().
  1703 + *
  1704 + * The tasklist_lock is not held here, as do_each_thread() and
  1705 + * while_each_thread() are protected by RCU.
  1706 + */
  1707 +void cgroup_enable_task_cg_lists(void)
  1708 +{
  1709 + struct task_struct *p, *g;
  1710 + write_lock(&css_set_lock);
  1711 + use_task_css_set_links = 1;
  1712 + do_each_thread(g, p) {
  1713 + task_lock(p);
  1714 + if (list_empty(&p->cg_list))
  1715 + list_add(&p->cg_list, &p->cgroups->tasks);
  1716 + task_unlock(p);
  1717 + } while_each_thread(g, p);
  1718 + write_unlock(&css_set_lock);
  1719 +}
  1720 +
1698 1721 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
1699 1722 {
1700 1723 /*
... ... @@ -1702,18 +1725,9 @@
1702 1725 * we need to enable the list linking each css_set to its
1703 1726 * tasks, and fix up all existing tasks.
1704 1727 */
1705   - if (!use_task_css_set_links) {
1706   - struct task_struct *p, *g;
1707   - write_lock(&css_set_lock);
1708   - use_task_css_set_links = 1;
1709   - do_each_thread(g, p) {
1710   - task_lock(p);
1711   - if (list_empty(&p->cg_list))
1712   - list_add(&p->cg_list, &p->cgroups->tasks);
1713   - task_unlock(p);
1714   - } while_each_thread(g, p);
1715   - write_unlock(&css_set_lock);
1716   - }
  1728 + if (!use_task_css_set_links)
  1729 + cgroup_enable_task_cg_lists();
  1730 +
1717 1731 read_lock(&css_set_lock);
1718 1732 it->cg_link = &cgrp->css_sets;
1719 1733 cgroup_advance_iter(cgrp, it);
... ... @@ -1744,6 +1758,166 @@
1744 1758 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
1745 1759 {
1746 1760 read_unlock(&css_set_lock);
  1761 +}
  1762 +
  1763 +static inline int started_after_time(struct task_struct *t1,
  1764 + struct timespec *time,
  1765 + struct task_struct *t2)
  1766 +{
  1767 + int start_diff = timespec_compare(&t1->start_time, time);
  1768 + if (start_diff > 0) {
  1769 + return 1;
  1770 + } else if (start_diff < 0) {
  1771 + return 0;
  1772 + } else {
  1773 + /*
  1774 + * Arbitrarily, if two processes started at the same
  1775 + * time, we'll say that the lower pointer value
  1776 + * started first. Note that t2 may have exited by now
  1777 + * so this may not be a valid pointer any longer, but
  1778 + * that's fine - it still serves to distinguish
  1779 + * between two tasks started (effectively) simultaneously.
  1780 + */
  1781 + return t1 > t2;
  1782 + }
  1783 +}
  1784 +
  1785 +/*
  1786 + * This function is a callback from heap_insert() and is used to order
  1787 + * the heap.
  1788 + * In this case we order the heap in descending task start time.
  1789 + */
  1790 +static inline int started_after(void *p1, void *p2)
  1791 +{
  1792 + struct task_struct *t1 = p1;
  1793 + struct task_struct *t2 = p2;
  1794 + return started_after_time(t1, &t2->start_time, t2);
  1795 +}
  1796 +
  1797 +/**
  1798 + * cgroup_scan_tasks - iterate through all the tasks in a cgroup
  1799 + * @scan: struct cgroup_scanner containing arguments for the scan
  1800 + *
  1801 + * Arguments include pointers to callback functions test_task() and
  1802 + * process_task().
  1803 + * Iterate through all the tasks in a cgroup, calling test_task() for each,
  1804 + * and if it returns true, call process_task() for it also.
  1805 + * The test_task pointer may be NULL, meaning always true (select all tasks).
  1806 + * Effectively duplicates cgroup_iter_{start,next,end}()
  1807 + * but does not lock css_set_lock for the call to process_task().
  1808 + * The struct cgroup_scanner may be embedded in any structure of the caller's
  1809 + * creation.
  1810 + * It is guaranteed that process_task() will act on every task that
  1811 + * is a member of the cgroup for the duration of this call. This
  1812 + * function may or may not call process_task() for tasks that exit
  1813 + * or move to a different cgroup during the call, or are forked or
  1814 + * move into the cgroup during the call.
  1815 + *
  1816 + * Note that test_task() may be called with locks held, and may in some
  1817 + * situations be called multiple times for the same task, so it should
  1818 + * be cheap.
  1819 + * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
  1820 + * pre-allocated and will be used for heap operations (and its "gt" member will
  1821 + * be overwritten), else a temporary heap will be used (allocation of which
  1822 + * may cause this function to fail).
  1823 + */
  1824 +int cgroup_scan_tasks(struct cgroup_scanner *scan)
  1825 +{
  1826 + int retval, i;
  1827 + struct cgroup_iter it;
  1828 + struct task_struct *p, *dropped;
  1829 + /* Never dereference latest_task, since it's not refcounted */
  1830 + struct task_struct *latest_task = NULL;
  1831 + struct ptr_heap tmp_heap;
  1832 + struct ptr_heap *heap;
  1833 + struct timespec latest_time = { 0, 0 };
  1834 +
  1835 + if (scan->heap) {
  1836 + /* The caller supplied our heap and pre-allocated its memory */
  1837 + heap = scan->heap;
  1838 + heap->gt = &started_after;
  1839 + } else {
  1840 + /* We need to allocate our own heap memory */
  1841 + heap = &tmp_heap;
  1842 + retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
  1843 + if (retval)
  1844 + /* cannot allocate the heap */
  1845 + return retval;
  1846 + }
  1847 +
  1848 + again:
  1849 + /*
  1850 + * Scan tasks in the cgroup, using the scanner's "test_task" callback
  1851 + * to determine which are of interest, and using the scanner's
  1852 + * "process_task" callback to process any of them that need an update.
  1853 + * Since we don't want to hold any locks during the task updates,
  1854 + * gather tasks to be processed in a heap structure.
  1855 + * The heap is sorted by descending task start time.
  1856 + * If the statically-sized heap fills up, we overflow tasks that
  1857 + * started later, and in future iterations only consider tasks that
  1858 + * started after the latest task in the previous pass. This
  1859 + * guarantees forward progress and that we don't miss any tasks.
  1860 + */
  1861 + heap->size = 0;
  1862 + cgroup_iter_start(scan->cg, &it);
  1863 + while ((p = cgroup_iter_next(scan->cg, &it))) {
  1864 + /*
  1865 + * Only affect tasks that qualify per the caller's callback,
  1866 + * if one was provided
  1867 + */
  1868 + if (scan->test_task && !scan->test_task(p, scan))
  1869 + continue;
  1870 + /*
  1871 + * Only process tasks that started after the last task
  1872 + * we processed
  1873 + */
  1874 + if (!started_after_time(p, &latest_time, latest_task))
  1875 + continue;
  1876 + dropped = heap_insert(heap, p);
  1877 + if (dropped == NULL) {
  1878 + /*
  1879 + * The new task was inserted; the heap wasn't
  1880 + * previously full
  1881 + */
  1882 + get_task_struct(p);
  1883 + } else if (dropped != p) {
  1884 + /*
  1885 + * The new task was inserted, and pushed out a
  1886 + * different task
  1887 + */
  1888 + get_task_struct(p);
  1889 + put_task_struct(dropped);
  1890 + }
  1891 + /*
  1892 + * Else the new task was newer than anything already in
  1893 + * the heap and wasn't inserted
  1894 + */
  1895 + }
  1896 + cgroup_iter_end(scan->cg, &it);
  1897 +
  1898 + if (heap->size) {
  1899 + for (i = 0; i < heap->size; i++) {
  1900 + struct task_struct *p = heap->ptrs[i];
  1901 + if (i == 0) {
  1902 + latest_time = p->start_time;
  1903 + latest_task = p;
  1904 + }
  1905 + /* Process the task per the caller's callback */
  1906 + scan->process_task(p, scan);
  1907 + put_task_struct(p);
  1908 + }
  1909 + /*
  1910 + * If we had to process any tasks at all, scan again
  1911 + * in case some of them were in the middle of forking
  1912 + * children that didn't get processed.
  1913 + * Not the most efficient way to do it, but it avoids
  1914 + * having to take callback_mutex in the fork path
  1915 + */
  1916 + goto again;
  1917 + }
  1918 + if (heap == &tmp_heap)
  1919 + heap_free(&tmp_heap);
  1920 + return 0;
1747 1921 }
1748 1922  
1749 1923 /*