Commit 31a7df01fd0cd786f60873a921aecafac148c290
Committed by: Linus Torvalds
1 parent: dfc05c259e
Exists in: master and in 4 other branches
cgroups: mechanism to process each task in a cgroup
Provide cgroup_scan_tasks(), which iterates through every task in a cgroup, calling a test function and a process function for each. The process function is called without holding css_set_lock.

The idea is David Rientjes', predicting that such a function will make it much easier in the future to extend things that require access to each task in a cgroup without holding the lock.

[akpm@linux-foundation.org: cleanup]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
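A minimal sketch of how a controller might drive the new interface, using only the struct cgroup_scanner fields and the cgroup_scan_tasks() prototype added in this commit; the callback and wrapper names are hypothetical:

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Hypothetical callbacks -- names and filtering policy are illustrative only. */
static int my_test(struct task_struct *p, struct cgroup_scanner *scan)
{
	/*
	 * May be called with css_set_lock held, possibly more than once
	 * per task, so keep it cheap.
	 */
	return !(p->flags & PF_EXITING);
}

static void my_process(struct task_struct *p, struct cgroup_scanner *scan)
{
	/* Called without css_set_lock held; heavier per-task work goes here. */
}

static int my_update_cgroup(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cg           = cgrp,
		.test_task    = my_test,	/* NULL would select every task */
		.process_task = my_process,
		.heap         = NULL,		/* let the scanner allocate a temporary heap */
	};

	/* Returns non-zero only if the temporary heap cannot be allocated. */
	return cgroup_scan_tasks(&scan);
}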
Showing 2 changed files with 200 additions and 12 deletions (side-by-side diff)
include/linux/cgroup.h
... | ... | @@ -14,6 +14,7 @@ |
14 | 14 | #include <linux/nodemask.h> |
15 | 15 | #include <linux/rcupdate.h> |
16 | 16 | #include <linux/cgroupstats.h> |
17 | +#include <linux/prio_heap.h> | |
17 | 18 | |
18 | 19 | #ifdef CONFIG_CGROUPS |
19 | 20 | |
... | ... | @@ -207,6 +208,14 @@ |
207 | 208 | int (*release) (struct inode *inode, struct file *file); |
208 | 209 | }; |
209 | 210 | |
211 | +struct cgroup_scanner { | |
212 | + struct cgroup *cg; | |
213 | + int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); | |
214 | + void (*process_task)(struct task_struct *p, | |
215 | + struct cgroup_scanner *scan); | |
216 | + struct ptr_heap *heap; | |
217 | +}; | |
218 | + | |
210 | 219 | /* Add a new file to the given cgroup directory. Should only be |
211 | 220 | * called by subsystems from within a populate() method */ |
212 | 221 | int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys, |
213 | 222 | |
... | ... | @@ -299,11 +308,16 @@ |
299 | 308 | * returns NULL or until you want to end the iteration |
300 | 309 | * |
301 | 310 | * 3) call cgroup_iter_end() to destroy the iterator. |
311 | + * | |
312 | + * Or, call cgroup_scan_tasks() to iterate through every task in a cgroup. | |
313 | + * - cgroup_scan_tasks() holds the css_set_lock when calling the test_task() | |
314 | + * callback, but not while calling the process_task() callback. | |
302 | 315 | */ |
303 | 316 | void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it); |
304 | 317 | struct task_struct *cgroup_iter_next(struct cgroup *cont, |
305 | 318 | struct cgroup_iter *it); |
306 | 319 | void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it); |
320 | +int cgroup_scan_tasks(struct cgroup_scanner *scan); | |
307 | 321 | |
308 | 322 | #else /* !CONFIG_CGROUPS */ |
309 | 323 |
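The comment block above spells out the three-step iterator protocol; a minimal sketch of it, with a hypothetical helper name, assuming only the cgroup_iter_* declarations shown in this hunk:

/* Hypothetical example: count the tasks in a cgroup with the iterator API. */
static int my_count_tasks(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *p;
	int count = 0;

	cgroup_iter_start(cgrp, &it);		/* takes css_set_lock */
	while ((p = cgroup_iter_next(cgrp, &it)))
		count++;			/* keep the loop body cheap: the lock is held */
	cgroup_iter_end(cgrp, &it);		/* drops css_set_lock */

	return count;
}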
kernel/cgroup.c
... | ... | @@ -1695,6 +1695,29 @@ |
1695 | 1695 | it->task = cg->tasks.next; |
1696 | 1696 | } |
1697 | 1697 | |
1698 | +/* | |
1699 | + * To reduce the fork() overhead for systems that are not actually | |
1700 | + * using their cgroups capability, we don't maintain the lists running | |
1701 | + * through each css_set to its tasks until we see the list actually | |
1702 | + * used - in other words after the first call to cgroup_iter_start(). | |
1703 | + * | |
1704 | + * The tasklist_lock is not held here, as do_each_thread() and | |
1705 | + * while_each_thread() are protected by RCU. | |
1706 | + */ | |
1707 | +void cgroup_enable_task_cg_lists(void) | |
1708 | +{ | |
1709 | + struct task_struct *p, *g; | |
1710 | + write_lock(&css_set_lock); | |
1711 | + use_task_css_set_links = 1; | |
1712 | + do_each_thread(g, p) { | |
1713 | + task_lock(p); | |
1714 | + if (list_empty(&p->cg_list)) | |
1715 | + list_add(&p->cg_list, &p->cgroups->tasks); | |
1716 | + task_unlock(p); | |
1717 | + } while_each_thread(g, p); | |
1718 | + write_unlock(&css_set_lock); | |
1719 | +} | |
1720 | + | |
1698 | 1721 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
1699 | 1722 | { |
1700 | 1723 | /* |
... | ... | @@ -1702,18 +1725,9 @@ |
1702 | 1725 | * we need to enable the list linking each css_set to its |
1703 | 1726 | * tasks, and fix up all existing tasks. |
1704 | 1727 | */ |
1705 | - if (!use_task_css_set_links) { | |
1706 | - struct task_struct *p, *g; | |
1707 | - write_lock(&css_set_lock); | |
1708 | - use_task_css_set_links = 1; | |
1709 | - do_each_thread(g, p) { | |
1710 | - task_lock(p); | |
1711 | - if (list_empty(&p->cg_list)) | |
1712 | - list_add(&p->cg_list, &p->cgroups->tasks); | |
1713 | - task_unlock(p); | |
1714 | - } while_each_thread(g, p); | |
1715 | - write_unlock(&css_set_lock); | |
1716 | - } | |
1728 | + if (!use_task_css_set_links) | |
1729 | + cgroup_enable_task_cg_lists(); | |
1730 | + | |
1717 | 1731 | read_lock(&css_set_lock); |
1718 | 1732 | it->cg_link = &cgrp->css_sets; |
1719 | 1733 | cgroup_advance_iter(cgrp, it); |
... | ... | @@ -1744,6 +1758,166 @@ |
1744 | 1758 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) |
1745 | 1759 | { |
1746 | 1760 | read_unlock(&css_set_lock); |
1761 | +} | |
1762 | + | |
1763 | +static inline int started_after_time(struct task_struct *t1, | |
1764 | + struct timespec *time, | |
1765 | + struct task_struct *t2) | |
1766 | +{ | |
1767 | + int start_diff = timespec_compare(&t1->start_time, time); | |
1768 | + if (start_diff > 0) { | |
1769 | + return 1; | |
1770 | + } else if (start_diff < 0) { | |
1771 | + return 0; | |
1772 | + } else { | |
1773 | + /* | |
1774 | + * Arbitrarily, if two processes started at the same | |
1775 | + * time, we'll say that the lower pointer value | |
1776 | + * started first. Note that t2 may have exited by now | |
1777 | + * so this may not be a valid pointer any longer, but | |
1778 | + * that's fine - it still serves to distinguish | |
1779 | + * between two tasks started (effectively) simultaneously. | |
1780 | + */ | |
1781 | + return t1 > t2; | |
1782 | + } | |
1783 | +} | |
1784 | + | |
1785 | +/* | |
1786 | + * This function is a callback from heap_insert() and is used to order | |
1787 | + * the heap. | |
1788 | + * In this case we order the heap in descending task start time. | |
1789 | + */ | |
1790 | +static inline int started_after(void *p1, void *p2) | |
1791 | +{ | |
1792 | + struct task_struct *t1 = p1; | |
1793 | + struct task_struct *t2 = p2; | |
1794 | + return started_after_time(t1, &t2->start_time, t2); | |
1795 | +} | |
1796 | + | |
1797 | +/** | |
1798 | + * cgroup_scan_tasks - iterate through all the tasks in a cgroup | |
1799 | + * @scan: struct cgroup_scanner containing arguments for the scan | |
1800 | + * | |
1801 | + * Arguments include pointers to callback functions test_task() and | |
1802 | + * process_task(). | |
1803 | + * Iterate through all the tasks in a cgroup, calling test_task() for each, | |
1804 | + * and if it returns true, call process_task() for it also. | |
1805 | + * The test_task pointer may be NULL, meaning always true (select all tasks). | |
1806 | + * Effectively duplicates cgroup_iter_{start,next,end}() | |
1807 | + * but does not lock css_set_lock for the call to process_task(). | |
1808 | + * The struct cgroup_scanner may be embedded in any structure of the caller's | |
1809 | + * creation. | |
1810 | + * It is guaranteed that process_task() will act on every task that | |
1811 | + * is a member of the cgroup for the duration of this call. This | |
1812 | + * function may or may not call process_task() for tasks that exit | |
1813 | + * or move to a different cgroup during the call, or are forked or | |
1814 | + * move into the cgroup during the call. | |
1815 | + * | |
1816 | + * Note that test_task() may be called with locks held, and may in some | |
1817 | + * situations be called multiple times for the same task, so it should | |
1818 | + * be cheap. | |
1819 | + * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been | |
1820 | + * pre-allocated and will be used for heap operations (and its "gt" member will | |
1821 | + * be overwritten), else a temporary heap will be used (allocation of which | |
1822 | + * may cause this function to fail). | |
1823 | + */ | |
1824 | +int cgroup_scan_tasks(struct cgroup_scanner *scan) | |
1825 | +{ | |
1826 | + int retval, i; | |
1827 | + struct cgroup_iter it; | |
1828 | + struct task_struct *p, *dropped; | |
1829 | + /* Never dereference latest_task, since it's not refcounted */ | |
1830 | + struct task_struct *latest_task = NULL; | |
1831 | + struct ptr_heap tmp_heap; | |
1832 | + struct ptr_heap *heap; | |
1833 | + struct timespec latest_time = { 0, 0 }; | |
1834 | + | |
1835 | + if (scan->heap) { | |
1836 | + /* The caller supplied our heap and pre-allocated its memory */ | |
1837 | + heap = scan->heap; | |
1838 | + heap->gt = &started_after; | |
1839 | + } else { | |
1840 | + /* We need to allocate our own heap memory */ | |
1841 | + heap = &tmp_heap; | |
1842 | + retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); | |
1843 | + if (retval) | |
1844 | + /* cannot allocate the heap */ | |
1845 | + return retval; | |
1846 | + } | |
1847 | + | |
1848 | + again: | |
1849 | + /* | |
1850 | + * Scan tasks in the cgroup, using the scanner's "test_task" callback | |
1851 | + * to determine which are of interest, and using the scanner's | |
1852 | + * "process_task" callback to process any of them that need an update. | |
1853 | + * Since we don't want to hold any locks during the task updates, | |
1854 | + * gather tasks to be processed in a heap structure. | |
1855 | + * The heap is sorted by descending task start time. | |
1856 | + * If the statically-sized heap fills up, we overflow tasks that | |
1857 | + * started later, and in future iterations only consider tasks that | |
1858 | + * started after the latest task in the previous pass. This | |
1859 | + * guarantees forward progress and that we don't miss any tasks. | |
1860 | + */ | |
1861 | + heap->size = 0; | |
1862 | + cgroup_iter_start(scan->cg, &it); | |
1863 | + while ((p = cgroup_iter_next(scan->cg, &it))) { | |
1864 | + /* | |
1865 | + * Only affect tasks that qualify per the caller's callback, | |
1866 | + * if he provided one | |
1867 | + */ | |
1868 | + if (scan->test_task && !scan->test_task(p, scan)) | |
1869 | + continue; | |
1870 | + /* | |
1871 | + * Only process tasks that started after the last task | |
1872 | + * we processed | |
1873 | + */ | |
1874 | + if (!started_after_time(p, &latest_time, latest_task)) | |
1875 | + continue; | |
1876 | + dropped = heap_insert(heap, p); | |
1877 | + if (dropped == NULL) { | |
1878 | + /* | |
1879 | + * The new task was inserted; the heap wasn't | |
1880 | + * previously full | |
1881 | + */ | |
1882 | + get_task_struct(p); | |
1883 | + } else if (dropped != p) { | |
1884 | + /* | |
1885 | + * The new task was inserted, and pushed out a | |
1886 | + * different task | |
1887 | + */ | |
1888 | + get_task_struct(p); | |
1889 | + put_task_struct(dropped); | |
1890 | + } | |
1891 | + /* | |
1892 | + * Else the new task was newer than anything already in | |
1893 | + * the heap and wasn't inserted | |
1894 | + */ | |
1895 | + } | |
1896 | + cgroup_iter_end(scan->cg, &it); | |
1897 | + | |
1898 | + if (heap->size) { | |
1899 | + for (i = 0; i < heap->size; i++) { | |
1900 | + struct task_struct *p = heap->ptrs[i]; | |
1901 | + if (i == 0) { | |
1902 | + latest_time = p->start_time; | |
1903 | + latest_task = p; | |
1904 | + } | |
1905 | + /* Process the task per the caller's callback */ | |
1906 | + scan->process_task(p, scan); | |
1907 | + put_task_struct(p); | |
1908 | + } | |
1909 | + /* | |
1910 | + * If we had to process any tasks at all, scan again | |
1911 | + * in case some of them were in the middle of forking | |
1912 | + * children that didn't get processed. | |
1913 | + * Not the most efficient way to do it, but it avoids | |
1914 | + * having to take callback_mutex in the fork path | |
1915 | + */ | |
1916 | + goto again; | |
1917 | + } | |
1918 | + if (heap == &tmp_heap) | |
1919 | + heap_free(&tmp_heap); | |
1920 | + return 0; | |
1747 | 1921 | } |
1748 | 1922 | |
1749 | 1923 | /* |
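The cgroup_scan_tasks() comment above also lets the caller pass a pre-allocated heap so repeated scans avoid an allocation per call. A rough sketch of that variant, reusing the hypothetical my_process() callback from the first example and the heap_init()/heap_free() helpers from linux/prio_heap.h that the implementation itself calls:

/* Hypothetical example: reuse a caller-supplied heap for the scan. */
static int my_scan_with_heap(struct cgroup *cgrp)
{
	struct ptr_heap heap;
	struct cgroup_scanner scan = {
		.cg           = cgrp,
		.test_task    = NULL,		/* NULL selects every task */
		.process_task = my_process,
		.heap         = &heap,
	};
	int retval;

	/*
	 * The comparator passed here does not matter: cgroup_scan_tasks()
	 * overwrites heap->gt with its own started_after() ordering.
	 */
	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	retval = cgroup_scan_tasks(&scan);
	heap_free(&heap);
	return retval;
}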