Commit 3d3f26a7baaa921a0e790b4c72d20f0de91a5d65

Authored by Ingo Molnar
Committed by Linus Torvalds
1 parent 6362e4d4ed

[PATCH] kernel/cpuset.c, mutex conversion

convert cpuset.c's callback_sem and manage_sem to mutexes.
Build and boot tested by Ingo.
Build, boot, unit and stress tested by pj.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 1 changed file with 103 additions and 109 deletions

... ... @@ -53,7 +53,7 @@
53 53  
54 54 #include <asm/uaccess.h>
55 55 #include <asm/atomic.h>
56   -#include <asm/semaphore.h>
  56 +#include <linux/mutex.h>
57 57  
58 58 #define CPUSET_SUPER_MAGIC 0x27e0eb
59 59  
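For context, the header swap above tracks an API change: the old semaphore-as-mutex idiom becomes the dedicated mutex type. A minimal before/after sketch (hypothetical lock name, not code from this patch):

#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);         /* was: static DECLARE_MUTEX(example_sem); */

static void example_critical_section(void)
{
        mutex_lock(&example_mutex);         /* was: down(&example_sem); */
        /* ... critical section ... */
        mutex_unlock(&example_mutex);       /* was: up(&example_sem); */
}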
... ... @@ -168,63 +168,57 @@
168 168 static struct super_block *cpuset_sb;
169 169  
170 170 /*
171   - * We have two global cpuset semaphores below. They can nest.
172   - * It is ok to first take manage_sem, then nest callback_sem. We also
  171 + * We have two global cpuset mutexes below. They can nest.
  172 + * It is ok to first take manage_mutex, then nest callback_mutex. We also
173 173 * require taking task_lock() when dereferencing a tasks cpuset pointer.
174 174 * See "The task_lock() exception", at the end of this comment.
175 175 *
176   - * A task must hold both semaphores to modify cpusets. If a task
177   - * holds manage_sem, then it blocks others wanting that semaphore,
178   - * ensuring that it is the only task able to also acquire callback_sem
  176 + * A task must hold both mutexes to modify cpusets. If a task
  177 + * holds manage_mutex, then it blocks others wanting that mutex,
  178 + * ensuring that it is the only task able to also acquire callback_mutex
179 179 * and be able to modify cpusets. It can perform various checks on
180 180 * the cpuset structure first, knowing nothing will change. It can
181   - * also allocate memory while just holding manage_sem. While it is
  181 + * also allocate memory while just holding manage_mutex. While it is
182 182 * performing these checks, various callback routines can briefly
183   - * acquire callback_sem to query cpusets. Once it is ready to make
184   - * the changes, it takes callback_sem, blocking everyone else.
  183 + * acquire callback_mutex to query cpusets. Once it is ready to make
  184 + * the changes, it takes callback_mutex, blocking everyone else.
185 185 *
186 186 * Calls to the kernel memory allocator can not be made while holding
187   - * callback_sem, as that would risk double tripping on callback_sem
  187 + * callback_mutex, as that would risk double tripping on callback_mutex
188 188 * from one of the callbacks into the cpuset code from within
189 189 * __alloc_pages().
190 190 *
191   - * If a task is only holding callback_sem, then it has read-only
  191 + * If a task is only holding callback_mutex, then it has read-only
192 192 * access to cpusets.
193 193 *
194 194 * The task_struct fields mems_allowed and mems_generation may only
195 195 * be accessed in the context of that task, so require no locks.
196 196 *
197 197 * Any task can increment and decrement the count field without lock.
198   - * So in general, code holding manage_sem or callback_sem can't rely
  198 + * So in general, code holding manage_mutex or callback_mutex can't rely
199 199 * on the count field not changing. However, if the count goes to
200   - * zero, then only attach_task(), which holds both semaphores, can
  200 + * zero, then only attach_task(), which holds both mutexes, can
201 201 * increment it again. Because a count of zero means that no tasks
202 202 * are currently attached, therefore there is no way a task attached
203 203 * to that cpuset can fork (the other way to increment the count).
204   - * So code holding manage_sem or callback_sem can safely assume that
  204 + * So code holding manage_mutex or callback_mutex can safely assume that
205 205 * if the count is zero, it will stay zero. Similarly, if a task
206   - * holds manage_sem or callback_sem on a cpuset with zero count, it
  206 + * holds manage_mutex or callback_mutex on a cpuset with zero count, it
207 207 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
208   - * both of those semaphores.
  208 + * both of those mutexes.
209 209 *
210   - * A possible optimization to improve parallelism would be to make
211   - * callback_sem a R/W semaphore (rwsem), allowing the callback routines
212   - * to proceed in parallel, with read access, until the holder of
213   - * manage_sem needed to take this rwsem for exclusive write access
214   - * and modify some cpusets.
215   - *
216 210 * The cpuset_common_file_write handler for operations that modify
217   - * the cpuset hierarchy holds manage_sem across the entire operation,
  211 + * the cpuset hierarchy holds manage_mutex across the entire operation,
218 212 * single threading all such cpuset modifications across the system.
219 213 *
220   - * The cpuset_common_file_read() handlers only hold callback_sem across
  214 + * The cpuset_common_file_read() handlers only hold callback_mutex across
221 215 * small pieces of code, such as when reading out possibly multi-word
222 216 * cpumasks and nodemasks.
223 217 *
224 218 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
225   - * (usually) take either semaphore. These are the two most performance
  219 + * (usually) take either mutex. These are the two most performance
226 220 * critical pieces of code here. The exception occurs on cpuset_exit(),
227   - * when a task in a notify_on_release cpuset exits. Then manage_sem
  221 + * when a task in a notify_on_release cpuset exits. Then manage_mutex
228 222 * is taken, and if the cpuset count is zero, a usermode call made
229 223 * to /sbin/cpuset_release_agent with the name of the cpuset (path
230 224 * relative to the root of cpuset file system) as the argument.
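The nesting rule described above reduces to the following pattern, which the update_* routines later in this patch all follow (a sketch with a hypothetical helper, not code from the patch):

static void cpuset_modify_sketch(struct cpuset *cs, cpumask_t newmask)
{
        mutex_lock(&manage_mutex);          /* single-threads all modifications */
        /* checks and memory allocation are safe here:
         * callback_mutex is not yet held */
        mutex_lock(&callback_mutex);        /* briefly blocks all readers */
        cs->cpus_allowed = newmask;
        mutex_unlock(&callback_mutex);
        mutex_unlock(&manage_mutex);
}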
... ... @@ -242,9 +236,9 @@
242 236 *
243 237 * The need for this exception arises from the action of attach_task(),
244 238 * which overwrites one tasks cpuset pointer with another. It does
245   - * so using both semaphores, however there are several performance
  239 + * so using both mutexes, however there are several performance
246 240 * critical places that need to reference task->cpuset without the
247   - * expense of grabbing a system global semaphore. Therefore except as
  241 + * expense of grabbing a system global mutex. Therefore except as
248 242 * noted below, when dereferencing or, as in attach_task(), modifying
249 243 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
250 244 * (task->alloc_lock) already in the task_struct routinely used for
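The task_lock() exception amounts to this pattern (sketch, hypothetical helper): a task's cpuset pointer may be read under the per-task spinlock alone, because attach_task() takes the same spinlock before rewriting it:

static int task_in_cpuset(struct task_struct *tsk, const struct cpuset *cs)
{
        int same;

        task_lock(tsk);                     /* spin_lock(&tsk->alloc_lock) */
        same = (tsk->cpuset == cs);         /* pointer stable while lock held */
        task_unlock(tsk);
        return same;
}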
... ... @@ -256,8 +250,8 @@
256 250 * the routine cpuset_update_task_memory_state().
257 251 */
258 252  
259   -static DECLARE_MUTEX(manage_sem);
260   -static DECLARE_MUTEX(callback_sem);
  253 +static DEFINE_MUTEX(manage_mutex);
  254 +static DEFINE_MUTEX(callback_mutex);
261 255  
262 256 /*
263 257 * A couple of forward declarations required, due to cyclic reference loop:
... ... @@ -432,7 +426,7 @@
432 426 }
433 427  
434 428 /*
435   - * Call with manage_sem held. Writes path of cpuset into buf.
  429 + * Call with manage_mutex held. Writes path of cpuset into buf.
436 430 * Returns 0 on success, -errno on error.
437 431 */
438 432  
... ... @@ -484,11 +478,11 @@
484 478 * status of the /sbin/cpuset_release_agent task, so no sense holding
485 479 * our caller up for that.
486 480 *
487   - * When we had only one cpuset semaphore, we had to call this
  481 + * When we had only one cpuset mutex, we had to call this
488 482 * without holding it, to avoid deadlock when call_usermodehelper()
489 483 * allocated memory. With two locks, we could now call this while
490   - * holding manage_sem, but we still don't, so as to minimize
491   - * the time manage_sem is held.
  484 + * holding manage_mutex, but we still don't, so as to minimize
  485 + * the time manage_mutex is held.
492 486 */
493 487  
494 488 static void cpuset_release_agent(const char *pathbuf)
... ... @@ -520,15 +514,15 @@
520 514 * cs is notify_on_release() and now both the user count is zero and
521 515 * the list of children is empty, prepare cpuset path in a kmalloc'd
522 516 * buffer, to be returned via ppathbuf, so that the caller can invoke
523   - * cpuset_release_agent() with it later on, once manage_sem is dropped.
524   - * Call here with manage_sem held.
  517 + * cpuset_release_agent() with it later on, once manage_mutex is dropped.
  518 + * Call here with manage_mutex held.
525 519 *
526 520 * This check_for_release() routine is responsible for kmalloc'ing
527 521 * pathbuf. The above cpuset_release_agent() is responsible for
528 522 * kfree'ing pathbuf. The caller of these routines is responsible
529 523 * for providing a pathbuf pointer, initialized to NULL, then
530   - * calling check_for_release() with manage_sem held and the address
531   - * of the pathbuf pointer, then dropping manage_sem, then calling
  524 + * calling check_for_release() with manage_mutex held and the address
  525 + * of the pathbuf pointer, then dropping manage_mutex, then calling
532 526 * cpuset_release_agent() with pathbuf, as set by check_for_release().
533 527 */
534 528  
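From a caller's point of view, the protocol described above looks like this (sketch with a hypothetical caller; the same sequence appears in cpuset_rmdir() and cpuset_exit() below):

static void example_release_check(struct cpuset *cs)
{
        char *pathbuf = NULL;

        mutex_lock(&manage_mutex);
        check_for_release(cs, &pathbuf);    /* may kmalloc into pathbuf */
        mutex_unlock(&manage_mutex);
        cpuset_release_agent(pathbuf);      /* no-op if NULL; kfrees pathbuf */
}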
... ... @@ -559,7 +553,7 @@
559 553 * One way or another, we guarantee to return some non-empty subset
560 554 * of cpu_online_map.
561 555 *
562   - * Call with callback_sem held.
  556 + * Call with callback_mutex held.
563 557 */
564 558  
565 559 static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
... ... @@ -583,7 +577,7 @@
583 577 * One way or another, we guarantee to return some non-empty subset
584 578 * of node_online_map.
585 579 *
586   - * Call with callback_sem held.
  580 + * Call with callback_mutex held.
587 581 */
588 582  
589 583 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
... ... @@ -608,12 +602,12 @@
608 602 * current->cpuset if a task has its memory placement changed.
609 603 * Do not call this routine if in_interrupt().
610 604 *
611   - * Call without callback_sem or task_lock() held. May be called
612   - * with or without manage_sem held. Doesn't need task_lock to guard
  605 + * Call without callback_mutex or task_lock() held. May be called
  606 + * with or without manage_mutex held. Doesn't need task_lock to guard
613 607 * against another task changing a non-NULL cpuset pointer to NULL,
614 608 * as that is only done by a task on itself, and if the current task
615 609 * is here, it is not simultaneously in the exit code NULL'ing its
616   - * cpuset pointer. This routine also might acquire callback_sem and
  610 + * cpuset pointer. This routine also might acquire callback_mutex and
617 611 * current->mm->mmap_sem during call.
618 612 *
619 613 * Reading current->cpuset->mems_generation doesn't need task_lock
... ... @@ -658,13 +652,13 @@
658 652 }
659 653  
660 654 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
661   - down(&callback_sem);
  655 + mutex_lock(&callback_mutex);
662 656 task_lock(tsk);
663 657 cs = tsk->cpuset; /* Maybe changed when task not locked */
664 658 guarantee_online_mems(cs, &tsk->mems_allowed);
665 659 tsk->cpuset_mems_generation = cs->mems_generation;
666 660 task_unlock(tsk);
667   - up(&callback_sem);
  661 + mutex_unlock(&callback_mutex);
668 662 mpol_rebind_task(tsk, &tsk->mems_allowed);
669 663 }
670 664 }
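The two halves of the generation handshake, side by side (paraphrasing this hunk and update_nodemask() further below, not new code):

/* Writer side (update_nodemask, under callback_mutex): */
cs->mems_allowed = trialcs.mems_allowed;
atomic_inc(&cpuset_mems_generation);
cs->mems_generation = atomic_read(&cpuset_mems_generation);

/* Reader side (any task, lockless fast path): */
if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
        /* stale: take callback_mutex and refresh, as shown above */
}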
... ... @@ -674,7 +668,7 @@
674 668 *
675 669 * One cpuset is a subset of another if all its allowed CPUs and
676 670 * Memory Nodes are a subset of the other, and its exclusive flags
677   - * are only set if the other's are set. Call holding manage_sem.
  671 + * are only set if the other's are set. Call holding manage_mutex.
678 672 */
679 673  
680 674 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
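The hunk ends at the signature; the body, unchanged by this patch, transcribes the comment more or less directly. Roughly (reconstruction from context, verify against the tree):

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
        return  cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
                nodes_subset(p->mems_allowed, q->mems_allowed) &&
                is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
                is_mem_exclusive(p) <= is_mem_exclusive(q);
}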
... ... @@ -692,7 +686,7 @@
692 686 * If we replaced the flag and mask values of the current cpuset
693 687 * (cur) with those values in the trial cpuset (trial), would
694 688 * our various subset and exclusive rules still be valid? Presumes
695   - * manage_sem held.
  689 + * manage_mutex held.
696 690 *
697 691 * 'cur' is the address of an actual, in-use cpuset. Operations
698 692 * such as list traversal that depend on the actual address of the
... ... @@ -746,7 +740,7 @@
746 740 * exclusive child cpusets
747 741 * Build these two partitions by calling partition_sched_domains
748 742 *
749   - * Call with manage_sem held. May nest a call to the
  743 + * Call with manage_mutex held. May nest a call to the
750 744 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
751 745 */
752 746  
... ... @@ -792,7 +786,7 @@
792 786 }
793 787  
794 788 /*
795   - * Call with manage_sem held. May take callback_sem during call.
  789 + * Call with manage_mutex held. May take callback_mutex during call.
796 790 */
797 791  
798 792 static int update_cpumask(struct cpuset *cs, char *buf)
... ... @@ -811,9 +805,9 @@
811 805 if (retval < 0)
812 806 return retval;
813 807 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
814   - down(&callback_sem);
  808 + mutex_lock(&callback_mutex);
815 809 cs->cpus_allowed = trialcs.cpus_allowed;
816   - up(&callback_sem);
  810 + mutex_unlock(&callback_mutex);
817 811 if (is_cpu_exclusive(cs) && !cpus_unchanged)
818 812 update_cpu_domains(cs);
819 813 return 0;
... ... @@ -827,7 +821,7 @@
827 821 * the cpuset is marked 'memory_migrate', migrate the tasks
828 822 * pages to the new memory.
829 823 *
830   - * Call with manage_sem held. May take callback_sem during call.
  824 + * Call with manage_mutex held. May take callback_mutex during call.
831 825 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
832 826 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
833 827 * their mempolicies to the cpusets new mems_allowed.
... ... @@ -862,11 +856,11 @@
862 856 if (retval < 0)
863 857 goto done;
864 858  
865   - down(&callback_sem);
  859 + mutex_lock(&callback_mutex);
866 860 cs->mems_allowed = trialcs.mems_allowed;
867 861 atomic_inc(&cpuset_mems_generation);
868 862 cs->mems_generation = atomic_read(&cpuset_mems_generation);
869   - up(&callback_sem);
  863 + mutex_unlock(&callback_mutex);
870 864  
871 865 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
872 866  
... ... @@ -922,7 +916,7 @@
922 916 * tasklist_lock. Forks can happen again now - the mpol_copy()
923 917 * cpuset_being_rebound check will catch such forks, and rebind
924 918 * their vma mempolicies too. Because we still hold the global
925   - * cpuset manage_sem, we know that no other rebind effort will
  919 + * cpuset manage_mutex, we know that no other rebind effort will
926 920 * be contending for the global variable cpuset_being_rebound.
927 921 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
928 922 * is idempotent. Also migrate pages in each mm to new nodes.
... ... @@ -948,7 +942,7 @@
948 942 }
949 943  
950 944 /*
951   - * Call with manage_sem held.
  945 + * Call with manage_mutex held.
952 946 */
953 947  
954 948 static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
... ... @@ -967,7 +961,7 @@
967 961 * cs: the cpuset to update
968 962 * buf: the buffer where we read the 0 or 1
969 963 *
970   - * Call with manage_sem held.
  964 + * Call with manage_mutex held.
971 965 */
972 966  
973 967 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
... ... @@ -989,12 +983,12 @@
989 983 return err;
990 984 cpu_exclusive_changed =
991 985 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
992   - down(&callback_sem);
  986 + mutex_lock(&callback_mutex);
993 987 if (turning_on)
994 988 set_bit(bit, &cs->flags);
995 989 else
996 990 clear_bit(bit, &cs->flags);
997   - up(&callback_sem);
  991 + mutex_unlock(&callback_mutex);
998 992  
999 993 if (cpu_exclusive_changed)
1000 994 update_cpu_domains(cs);
... ... @@ -1104,7 +1098,7 @@
1104 1098 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1105 1099 * notified on release.
1106 1100 *
1107   - * Call holding manage_sem. May take callback_sem and task_lock of
  1101 + * Call holding manage_mutex. May take callback_mutex and task_lock of
1108 1102 * the task 'pid' during call.
1109 1103 */
1110 1104  
... ... @@ -1144,13 +1138,13 @@
1144 1138 get_task_struct(tsk);
1145 1139 }
1146 1140  
1147   - down(&callback_sem);
  1141 + mutex_lock(&callback_mutex);
1148 1142  
1149 1143 task_lock(tsk);
1150 1144 oldcs = tsk->cpuset;
1151 1145 if (!oldcs) {
1152 1146 task_unlock(tsk);
1153   - up(&callback_sem);
  1147 + mutex_unlock(&callback_mutex);
1154 1148 put_task_struct(tsk);
1155 1149 return -ESRCH;
1156 1150 }
... ... @@ -1164,7 +1158,7 @@
1164 1158 from = oldcs->mems_allowed;
1165 1159 to = cs->mems_allowed;
1166 1160  
1167   - up(&callback_sem);
  1161 + mutex_unlock(&callback_mutex);
1168 1162  
1169 1163 mm = get_task_mm(tsk);
1170 1164 if (mm) {
... ... @@ -1221,7 +1215,7 @@
1221 1215 }
1222 1216 buffer[nbytes] = 0; /* nul-terminate */
1223 1217  
1224   - down(&manage_sem);
  1218 + mutex_lock(&manage_mutex);
1225 1219  
1226 1220 if (is_removed(cs)) {
1227 1221 retval = -ENODEV;
... ... @@ -1264,7 +1258,7 @@
1264 1258 if (retval == 0)
1265 1259 retval = nbytes;
1266 1260 out2:
1267   - up(&manage_sem);
  1261 + mutex_unlock(&manage_mutex);
1268 1262 cpuset_release_agent(pathbuf);
1269 1263 out1:
1270 1264 kfree(buffer);
... ... @@ -1304,9 +1298,9 @@
1304 1298 {
1305 1299 cpumask_t mask;
1306 1300  
1307   - down(&callback_sem);
  1301 + mutex_lock(&callback_mutex);
1308 1302 mask = cs->cpus_allowed;
1309   - up(&callback_sem);
  1303 + mutex_unlock(&callback_mutex);
1310 1304  
1311 1305 return cpulist_scnprintf(page, PAGE_SIZE, mask);
1312 1306 }
... ... @@ -1315,9 +1309,9 @@
1315 1309 {
1316 1310 nodemask_t mask;
1317 1311  
1318   - down(&callback_sem);
  1312 + mutex_lock(&callback_mutex);
1319 1313 mask = cs->mems_allowed;
1320   - up(&callback_sem);
  1314 + mutex_unlock(&callback_mutex);
1321 1315  
1322 1316 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1323 1317 }
... ... @@ -1598,7 +1592,7 @@
1598 1592 * Handle an open on 'tasks' file. Prepare a buffer listing the
1599 1593 * process id's of tasks currently attached to the cpuset being opened.
1600 1594 *
1601   - * Does not require any specific cpuset semaphores, and does not take any.
  1595 + * Does not require any specific cpuset mutexes, and does not take any.
1602 1596 */
1603 1597 static int cpuset_tasks_open(struct inode *unused, struct file *file)
1604 1598 {
... ... @@ -1754,7 +1748,7 @@
1754 1748 * name: name of the new cpuset. Will be strcpy'ed.
1755 1749 * mode: mode to set on new inode
1756 1750 *
1757   - * Must be called with the semaphore on the parent inode held
  1751 + * Must be called with the mutex on the parent inode held
1758 1752 */
1759 1753  
1760 1754 static long cpuset_create(struct cpuset *parent, const char *name, int mode)
... ... @@ -1766,7 +1760,7 @@
1766 1760 if (!cs)
1767 1761 return -ENOMEM;
1768 1762  
1769   - down(&manage_sem);
  1763 + mutex_lock(&manage_mutex);
1770 1764 cpuset_update_task_memory_state();
1771 1765 cs->flags = 0;
1772 1766 if (notify_on_release(parent))
... ... @@ -1782,28 +1776,28 @@
1782 1776  
1783 1777 cs->parent = parent;
1784 1778  
1785   - down(&callback_sem);
  1779 + mutex_lock(&callback_mutex);
1786 1780 list_add(&cs->sibling, &cs->parent->children);
1787 1781 number_of_cpusets++;
1788   - up(&callback_sem);
  1782 + mutex_unlock(&callback_mutex);
1789 1783  
1790 1784 err = cpuset_create_dir(cs, name, mode);
1791 1785 if (err < 0)
1792 1786 goto err;
1793 1787  
1794 1788 /*
1795   - * Release manage_sem before cpuset_populate_dir() because it
  1789 + * Release manage_mutex before cpuset_populate_dir() because it
1796 1790 * will down() this new directory's i_mutex and if we race with
1797 1791 * another mkdir, we might deadlock.
1798 1792 */
1799   - up(&manage_sem);
  1793 + mutex_unlock(&manage_mutex);
1800 1794  
1801 1795 err = cpuset_populate_dir(cs->dentry);
1802 1796 /* If err < 0, we have a half-filled directory - oh well ;) */
1803 1797 return 0;
1804 1798 err:
1805 1799 list_del(&cs->sibling);
1806   - up(&manage_sem);
  1800 + mutex_unlock(&manage_mutex);
1807 1801 kfree(cs);
1808 1802 return err;
1809 1803 }
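The deadlock that the in-code comment avoids is a classic lock-order inversion (hedged illustration with hypothetical sequences, not code from the patch):

/*
 * mkdir(2) on the cpuset fs:            cpuset_create(), had it kept the lock:
 *     mutex_lock(&dir->i_mutex);            mutex_lock(&manage_mutex);
 *     mutex_lock(&manage_mutex);            mutex_lock(&new_dir->i_mutex);
 *
 * Run concurrently, each task can end up holding the lock the other
 * one wants; dropping manage_mutex before cpuset_populate_dir() takes
 * the new directory's i_mutex breaks the cycle.
 */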
... ... @@ -1825,18 +1819,18 @@
1825 1819  
1826 1820 /* the vfs holds both inode->i_mutex already */
1827 1821  
1828   - down(&manage_sem);
  1822 + mutex_lock(&manage_mutex);
1829 1823 cpuset_update_task_memory_state();
1830 1824 if (atomic_read(&cs->count) > 0) {
1831   - up(&manage_sem);
  1825 + mutex_unlock(&manage_mutex);
1832 1826 return -EBUSY;
1833 1827 }
1834 1828 if (!list_empty(&cs->children)) {
1835   - up(&manage_sem);
  1829 + mutex_unlock(&manage_mutex);
1836 1830 return -EBUSY;
1837 1831 }
1838 1832 parent = cs->parent;
1839   - down(&callback_sem);
  1833 + mutex_lock(&callback_mutex);
1840 1834 set_bit(CS_REMOVED, &cs->flags);
1841 1835 if (is_cpu_exclusive(cs))
1842 1836 update_cpu_domains(cs);
... ... @@ -1848,10 +1842,10 @@
1848 1842 cpuset_d_remove_dir(d);
1849 1843 dput(d);
1850 1844 number_of_cpusets--;
1851   - up(&callback_sem);
  1845 + mutex_unlock(&callback_mutex);
1852 1846 if (list_empty(&parent->children))
1853 1847 check_for_release(parent, &pathbuf);
1854   - up(&manage_sem);
  1848 + mutex_unlock(&manage_mutex);
1855 1849 cpuset_release_agent(pathbuf);
1856 1850 return 0;
1857 1851 }
... ... @@ -1960,19 +1954,19 @@
1960 1954 * Description: Detach cpuset from @tsk and release it.
1961 1955 *
1962 1956 * Note that cpusets marked notify_on_release force every task in
1963   - * them to take the global manage_sem semaphore when exiting.
  1957 + * them to take the global manage_mutex mutex when exiting.
1964 1958 * This could impact scaling on very large systems. Be reluctant to
1965 1959 * use notify_on_release cpusets where very high task exit scaling
1966 1960 * is required on large systems.
1967 1961 *
1968 1962 * Don't even think about derefencing 'cs' after the cpuset use count
1969   - * goes to zero, except inside a critical section guarded by manage_sem
1970   - * or callback_sem. Otherwise a zero cpuset use count is a license to
  1963 + * goes to zero, except inside a critical section guarded by manage_mutex
  1964 + * or callback_mutex. Otherwise a zero cpuset use count is a license to
1971 1965 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
1972 1966 *
1973   - * This routine has to take manage_sem, not callback_sem, because
1974   - * it is holding that semaphore while calling check_for_release(),
1975   - * which calls kmalloc(), so can't be called holding callback__sem().
  1967 + * This routine has to take manage_mutex, not callback_mutex, because
  1968 + * it is holding that mutex while calling check_for_release(),
  1969 + * which calls kmalloc(), so can't be called holding callback_mutex().
1976 1970 *
1977 1971 * We don't need to task_lock() this reference to tsk->cpuset,
1978 1972 * because tsk is already marked PF_EXITING, so attach_task() won't
... ... @@ -2022,10 +2016,10 @@
2022 2016 if (notify_on_release(cs)) {
2023 2017 char *pathbuf = NULL;
2024 2018  
2025   - down(&manage_sem);
  2019 + mutex_lock(&manage_mutex);
2026 2020 if (atomic_dec_and_test(&cs->count))
2027 2021 check_for_release(cs, &pathbuf);
2028   - up(&manage_sem);
  2022 + mutex_unlock(&manage_mutex);
2029 2023 cpuset_release_agent(pathbuf);
2030 2024 } else {
2031 2025 atomic_dec(&cs->count);
... ... @@ -2046,11 +2040,11 @@
2046 2040 {
2047 2041 cpumask_t mask;
2048 2042  
2049   - down(&callback_sem);
  2043 + mutex_lock(&callback_mutex);
2050 2044 task_lock(tsk);
2051 2045 guarantee_online_cpus(tsk->cpuset, &mask);
2052 2046 task_unlock(tsk);
2053   - up(&callback_sem);
  2047 + mutex_unlock(&callback_mutex);
2054 2048  
2055 2049 return mask;
2056 2050 }
... ... @@ -2074,11 +2068,11 @@
2074 2068 {
2075 2069 nodemask_t mask;
2076 2070  
2077   - down(&callback_sem);
  2071 + mutex_lock(&callback_mutex);
2078 2072 task_lock(tsk);
2079 2073 guarantee_online_mems(tsk->cpuset, &mask);
2080 2074 task_unlock(tsk);
2081   - up(&callback_sem);
  2075 + mutex_unlock(&callback_mutex);
2082 2076  
2083 2077 return mask;
2084 2078 }
... ... @@ -2104,7 +2098,7 @@
2104 2098  
2105 2099 /*
2106 2100 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
2107   - * ancestor to the specified cpuset. Call holding callback_sem.
  2101 + * ancestor to the specified cpuset. Call holding callback_mutex.
2108 2102 * If no ancestor is mem_exclusive (an unusual configuration), then
2109 2103 * returns the root cpuset.
2110 2104 */
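The body below this comment (outside the hunk) is a simple walk up the parent chain. Roughly (reconstruction from context, verify against the tree):

static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
{
        while (!is_mem_exclusive(cs) && cs->parent)
                cs = cs->parent;
        return cs;
}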
... ... @@ -2131,12 +2125,12 @@
2131 2125 * GFP_KERNEL allocations are not so marked, so can escape to the
2132 2126 * nearest mem_exclusive ancestor cpuset.
2133 2127 *
2134   - * Scanning up parent cpusets requires callback_sem. The __alloc_pages()
  2128 + * Scanning up parent cpusets requires callback_mutex. The __alloc_pages()
2135 2129 * routine only calls here with __GFP_HARDWALL bit _not_ set if
2136 2130 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
2137 2131 * mems_allowed came up empty on the first pass over the zonelist.
2138 2132 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
2139   - * short of memory, might require taking the callback_sem semaphore.
  2133 + * short of memory, might require taking the callback_mutex mutex.
2140 2134 *
2141 2135 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
2142 2136 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
... ... @@ -2171,31 +2165,31 @@
2171 2165 return 1;
2172 2166  
2173 2167 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2174   - down(&callback_sem);
  2168 + mutex_lock(&callback_mutex);
2175 2169  
2176 2170 task_lock(current);
2177 2171 cs = nearest_exclusive_ancestor(current->cpuset);
2178 2172 task_unlock(current);
2179 2173  
2180 2174 allowed = node_isset(node, cs->mems_allowed);
2181   - up(&callback_sem);
  2175 + mutex_unlock(&callback_mutex);
2182 2176 return allowed;
2183 2177 }
2184 2178  
2185 2179 /**
2186 2180 * cpuset_lock - lock out any changes to cpuset structures
2187 2181 *
2188   - * The out of memory (oom) code needs to lock down cpusets
  2182 + * The out of memory (oom) code needs to mutex_lock cpusets
2189 2183 * from being changed while it scans the tasklist looking for a
2190   - * task in an overlapping cpuset. Expose callback_sem via this
  2184 + * task in an overlapping cpuset. Expose callback_mutex via this
2191 2185 * cpuset_lock() routine, so the oom code can lock it, before
2192 2186 * locking the task list. The tasklist_lock is a spinlock, so
2193   - * must be taken inside callback_sem.
  2187 + * must be taken inside callback_mutex.
2194 2188 */
2195 2189  
2196 2190 void cpuset_lock(void)
2197 2191 {
2198   - down(&callback_sem);
  2192 + mutex_lock(&callback_mutex);
2199 2193 }
2200 2194  
2201 2195 /**
... ... @@ -2206,7 +2200,7 @@
2206 2200  
2207 2201 void cpuset_unlock(void)
2208 2202 {
2209   - up(&callback_sem);
  2203 + mutex_unlock(&callback_mutex);
2210 2204 }
2211 2205  
2212 2206 /**
... ... @@ -2218,7 +2212,7 @@
2218 2212 * determine if task @p's memory usage might impact the memory
2219 2213 * available to the current task.
2220 2214 *
2221   - * Call while holding callback_sem.
  2215 + * Call while holding callback_mutex.
2222 2216 **/
2223 2217  
2224 2218 int cpuset_excl_nodes_overlap(const struct task_struct *p)
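From the oom code's side, the documented order (callback_mutex outside tasklist_lock) would look like this (sketch of a hypothetical caller, not code from the patch):

static void example_oom_scan(void)
{
        struct task_struct *g, *p;

        cpuset_lock();                      /* mutex_lock(&callback_mutex) */
        read_lock(&tasklist_lock);          /* spinlock nests inside the mutex */
        do_each_thread(g, p) {
                /* e.g. cpuset_excl_nodes_overlap(p) */
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
        cpuset_unlock();
}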
... ... @@ -2289,7 +2283,7 @@
2289 2283 * - Used for /proc/<pid>/cpuset.
2290 2284 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2291 2285 * doesn't really matter if tsk->cpuset changes after we read it,
2292   - * and we take manage_sem, keeping attach_task() from changing it
  2286 + * and we take manage_mutex, keeping attach_task() from changing it
2293 2287 * anyway.
2294 2288 */
2295 2289  
... ... @@ -2305,7 +2299,7 @@
2305 2299 return -ENOMEM;
2306 2300  
2307 2301 tsk = m->private;
2308   - down(&manage_sem);
  2302 + mutex_lock(&manage_mutex);
2309 2303 cs = tsk->cpuset;
2310 2304 if (!cs) {
2311 2305 retval = -EINVAL;
... ... @@ -2318,7 +2312,7 @@
2318 2312 seq_puts(m, buf);
2319 2313 seq_putc(m, '\n');
2320 2314 out:
2321   - up(&manage_sem);
  2315 + mutex_unlock(&manage_mutex);
2322 2316 kfree(buf);
2323 2317 return retval;
2324 2318 }