Commit 13fbca4c6ecd96ec1a1cfa2e4f2ce191fe928a5e
Committed by: Linus Torvalds
Parent: b3eaa9fc5c
Exists in: ti-lsk-linux-4.1.y and 12 other branches
futex: Always cleanup owner tid in unlock_pi
If the owner died bit is set at futex_unlock_pi, we currently do not
clean up the user space futex. So the owner TID of the current owner
(the unlocker) persists. That's observable inconsistent state,
especially when the ownership of the pi state got transferred.

Clean it up unconditionally.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Will Drewry <wad@chromium.org>
Cc: Darren Hart <dvhart@linux.intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 18 additions and 22 deletions
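For context on what "cleanup owner tid" means at the user space level: for PI futexes the 32-bit futex word holds the owner's TID in the low bits (FUTEX_TID_MASK) plus the FUTEX_WAITERS and FUTEX_OWNER_DIED state bits, all defined in <linux/futex.h>. A minimal user-space sketch of decoding that word follows; the TID value 1234 is hypothetical, chosen only for illustration:

    /* Decode a PI futex word the way user space observes it. */
    #include <stdio.h>
    #include <stdint.h>
    #include <linux/futex.h>  /* FUTEX_TID_MASK, FUTEX_WAITERS, FUTEX_OWNER_DIED */

    static void decode_pi_futex(uint32_t uval)
    {
            printf("owner TID=%u waiters=%d owner_died=%d\n",
                   uval & FUTEX_TID_MASK,
                   !!(uval & FUTEX_WAITERS),
                   !!(uval & FUTEX_OWNER_DIED));
    }

    int main(void)
    {
            /* Before this fix: an unlock with the owner-died bit set could
             * leave the unlocker's TID (1234 here, hypothetical) in place. */
            decode_pi_futex(FUTEX_OWNER_DIED | 1234);
            /* After the fix: the TID part is cleared unconditionally. */
            decode_pi_futex(FUTEX_OWNER_DIED);
            return 0;
    }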
kernel/futex.c
1 | /* | 1 | /* |
2 | * Fast Userspace Mutexes (which I call "Futexes!"). | 2 | * Fast Userspace Mutexes (which I call "Futexes!"). |
3 | * (C) Rusty Russell, IBM 2002 | 3 | * (C) Rusty Russell, IBM 2002 |
4 | * | 4 | * |
5 | * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar | 5 | * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar |
6 | * (C) Copyright 2003 Red Hat Inc, All Rights Reserved | 6 | * (C) Copyright 2003 Red Hat Inc, All Rights Reserved |
7 | * | 7 | * |
8 | * Removed page pinning, fix privately mapped COW pages and other cleanups | 8 | * Removed page pinning, fix privately mapped COW pages and other cleanups |
9 | * (C) Copyright 2003, 2004 Jamie Lokier | 9 | * (C) Copyright 2003, 2004 Jamie Lokier |
10 | * | 10 | * |
11 | * Robust futex support started by Ingo Molnar | 11 | * Robust futex support started by Ingo Molnar |
12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved | 12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved |
13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. | 13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. |
14 | * | 14 | * |
15 | * PI-futex support started by Ingo Molnar and Thomas Gleixner | 15 | * PI-futex support started by Ingo Molnar and Thomas Gleixner |
16 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | 16 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
17 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | 17 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> |
18 | * | 18 | * |
19 | * PRIVATE futexes by Eric Dumazet | 19 | * PRIVATE futexes by Eric Dumazet |
20 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> | 20 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> |
21 | * | 21 | * |
22 | * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> | 22 | * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> |
23 | * Copyright (C) IBM Corporation, 2009 | 23 | * Copyright (C) IBM Corporation, 2009 |
24 | * Thanks to Thomas Gleixner for conceptual design and careful reviews. | 24 | * Thanks to Thomas Gleixner for conceptual design and careful reviews. |
25 | * | 25 | * |
26 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 26 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
27 | * enough at me, Linus for the original (flawed) idea, Matthew | 27 | * enough at me, Linus for the original (flawed) idea, Matthew |
28 | * Kirkwood for proof-of-concept implementation. | 28 | * Kirkwood for proof-of-concept implementation. |
29 | * | 29 | * |
30 | * "The futexes are also cursed." | 30 | * "The futexes are also cursed." |
31 | * "But they come in a choice of three flavours!" | 31 | * "But they come in a choice of three flavours!" |
32 | * | 32 | * |
33 | * This program is free software; you can redistribute it and/or modify | 33 | * This program is free software; you can redistribute it and/or modify |
34 | * it under the terms of the GNU General Public License as published by | 34 | * it under the terms of the GNU General Public License as published by |
35 | * the Free Software Foundation; either version 2 of the License, or | 35 | * the Free Software Foundation; either version 2 of the License, or |
36 | * (at your option) any later version. | 36 | * (at your option) any later version. |
37 | * | 37 | * |
38 | * This program is distributed in the hope that it will be useful, | 38 | * This program is distributed in the hope that it will be useful, |
39 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 39 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
40 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 40 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
41 | * GNU General Public License for more details. | 41 | * GNU General Public License for more details. |
42 | * | 42 | * |
43 | * You should have received a copy of the GNU General Public License | 43 | * You should have received a copy of the GNU General Public License |
44 | * along with this program; if not, write to the Free Software | 44 | * along with this program; if not, write to the Free Software |
45 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 45 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
46 | */ | 46 | */ |
47 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
48 | #include <linux/poll.h> | 48 | #include <linux/poll.h> |
49 | #include <linux/fs.h> | 49 | #include <linux/fs.h> |
50 | #include <linux/file.h> | 50 | #include <linux/file.h> |
51 | #include <linux/jhash.h> | 51 | #include <linux/jhash.h> |
52 | #include <linux/init.h> | 52 | #include <linux/init.h> |
53 | #include <linux/futex.h> | 53 | #include <linux/futex.h> |
54 | #include <linux/mount.h> | 54 | #include <linux/mount.h> |
55 | #include <linux/pagemap.h> | 55 | #include <linux/pagemap.h> |
56 | #include <linux/syscalls.h> | 56 | #include <linux/syscalls.h> |
57 | #include <linux/signal.h> | 57 | #include <linux/signal.h> |
58 | #include <linux/export.h> | 58 | #include <linux/export.h> |
59 | #include <linux/magic.h> | 59 | #include <linux/magic.h> |
60 | #include <linux/pid.h> | 60 | #include <linux/pid.h> |
61 | #include <linux/nsproxy.h> | 61 | #include <linux/nsproxy.h> |
62 | #include <linux/ptrace.h> | 62 | #include <linux/ptrace.h> |
63 | #include <linux/sched/rt.h> | 63 | #include <linux/sched/rt.h> |
64 | #include <linux/hugetlb.h> | 64 | #include <linux/hugetlb.h> |
65 | #include <linux/freezer.h> | 65 | #include <linux/freezer.h> |
66 | #include <linux/bootmem.h> | 66 | #include <linux/bootmem.h> |
67 | 67 | ||
68 | #include <asm/futex.h> | 68 | #include <asm/futex.h> |
69 | 69 | ||
70 | #include "locking/rtmutex_common.h" | 70 | #include "locking/rtmutex_common.h" |
71 | 71 | ||
72 | /* | 72 | /* |
73 | * READ this before attempting to hack on futexes! | 73 | * READ this before attempting to hack on futexes! |
74 | * | 74 | * |
75 | * Basic futex operation and ordering guarantees | 75 | * Basic futex operation and ordering guarantees |
76 | * ============================================= | 76 | * ============================================= |
77 | * | 77 | * |
78 | * The waiter reads the futex value in user space and calls | 78 | * The waiter reads the futex value in user space and calls |
79 | * futex_wait(). This function computes the hash bucket and acquires | 79 | * futex_wait(). This function computes the hash bucket and acquires |
80 | * the hash bucket lock. After that it reads the futex user space value | 80 | * the hash bucket lock. After that it reads the futex user space value |
81 | * again and verifies that the data has not changed. If it has not changed | 81 | * again and verifies that the data has not changed. If it has not changed |
82 | * it enqueues itself into the hash bucket, releases the hash bucket lock | 82 | * it enqueues itself into the hash bucket, releases the hash bucket lock |
83 | * and schedules. | 83 | * and schedules. |
84 | * | 84 | * |
85 | * The waker side modifies the user space value of the futex and calls | 85 | * The waker side modifies the user space value of the futex and calls |
86 | * futex_wake(). This function computes the hash bucket and acquires the | 86 | * futex_wake(). This function computes the hash bucket and acquires the |
87 | * hash bucket lock. Then it looks for waiters on that futex in the hash | 87 | * hash bucket lock. Then it looks for waiters on that futex in the hash |
88 | * bucket and wakes them. | 88 | * bucket and wakes them. |
89 | * | 89 | * |
90 | * In futex wake up scenarios where no tasks are blocked on a futex, taking | 90 | * In futex wake up scenarios where no tasks are blocked on a futex, taking |
91 | * the hb spinlock can be avoided and simply return. In order for this | 91 | * the hb spinlock can be avoided and simply return. In order for this |
92 | * optimization to work, ordering guarantees must exist so that the waiter | 92 | * optimization to work, ordering guarantees must exist so that the waiter |
93 | * being added to the list is acknowledged when the list is concurrently being | 93 | * being added to the list is acknowledged when the list is concurrently being |
94 | * checked by the waker, avoiding scenarios like the following: | 94 | * checked by the waker, avoiding scenarios like the following: |
95 | * | 95 | * |
96 | * CPU 0 CPU 1 | 96 | * CPU 0 CPU 1 |
97 | * val = *futex; | 97 | * val = *futex; |
98 | * sys_futex(WAIT, futex, val); | 98 | * sys_futex(WAIT, futex, val); |
99 | * futex_wait(futex, val); | 99 | * futex_wait(futex, val); |
100 | * uval = *futex; | 100 | * uval = *futex; |
101 | * *futex = newval; | 101 | * *futex = newval; |
102 | * sys_futex(WAKE, futex); | 102 | * sys_futex(WAKE, futex); |
103 | * futex_wake(futex); | 103 | * futex_wake(futex); |
104 | * if (queue_empty()) | 104 | * if (queue_empty()) |
105 | * return; | 105 | * return; |
106 | * if (uval == val) | 106 | * if (uval == val) |
107 | * lock(hash_bucket(futex)); | 107 | * lock(hash_bucket(futex)); |
108 | * queue(); | 108 | * queue(); |
109 | * unlock(hash_bucket(futex)); | 109 | * unlock(hash_bucket(futex)); |
110 | * schedule(); | 110 | * schedule(); |
111 | * | 111 | * |
112 | * This would cause the waiter on CPU 0 to wait forever because it | 112 | * This would cause the waiter on CPU 0 to wait forever because it |
113 | * missed the transition of the user space value from val to newval | 113 | * missed the transition of the user space value from val to newval |
114 | * and the waker did not find the waiter in the hash bucket queue. | 114 | * and the waker did not find the waiter in the hash bucket queue. |
115 | * | 115 | * |
116 | * The correct serialization ensures that a waiter either observes | 116 | * The correct serialization ensures that a waiter either observes |
117 | * the changed user space value before blocking or is woken by a | 117 | * the changed user space value before blocking or is woken by a |
118 | * concurrent waker: | 118 | * concurrent waker: |
119 | * | 119 | * |
120 | * CPU 0 CPU 1 | 120 | * CPU 0 CPU 1 |
121 | * val = *futex; | 121 | * val = *futex; |
122 | * sys_futex(WAIT, futex, val); | 122 | * sys_futex(WAIT, futex, val); |
123 | * futex_wait(futex, val); | 123 | * futex_wait(futex, val); |
124 | * | 124 | * |
125 | * waiters++; (a) | 125 | * waiters++; (a) |
126 | * mb(); (A) <-- paired with -. | 126 | * mb(); (A) <-- paired with -. |
127 | * | | 127 | * | |
128 | * lock(hash_bucket(futex)); | | 128 | * lock(hash_bucket(futex)); | |
129 | * | | 129 | * | |
130 | * uval = *futex; | | 130 | * uval = *futex; | |
131 | * | *futex = newval; | 131 | * | *futex = newval; |
132 | * | sys_futex(WAKE, futex); | 132 | * | sys_futex(WAKE, futex); |
133 | * | futex_wake(futex); | 133 | * | futex_wake(futex); |
134 | * | | 134 | * | |
135 | * `-------> mb(); (B) | 135 | * `-------> mb(); (B) |
136 | * if (uval == val) | 136 | * if (uval == val) |
137 | * queue(); | 137 | * queue(); |
138 | * unlock(hash_bucket(futex)); | 138 | * unlock(hash_bucket(futex)); |
139 | * schedule(); if (waiters) | 139 | * schedule(); if (waiters) |
140 | * lock(hash_bucket(futex)); | 140 | * lock(hash_bucket(futex)); |
141 | * else wake_waiters(futex); | 141 | * else wake_waiters(futex); |
142 | * waiters--; (b) unlock(hash_bucket(futex)); | 142 | * waiters--; (b) unlock(hash_bucket(futex)); |
143 | * | 143 | * |
144 | * Where (A) orders the waiters increment and the futex value read through | 144 | * Where (A) orders the waiters increment and the futex value read through |
145 | * atomic operations (see hb_waiters_inc) and where (B) orders the write | 145 | * atomic operations (see hb_waiters_inc) and where (B) orders the write |
146 | * to futex and the waiters read -- this is done by the barriers in | 146 | * to futex and the waiters read -- this is done by the barriers in |
147 | * get_futex_key_refs(), through either ihold or atomic_inc, depending on the | 147 | * get_futex_key_refs(), through either ihold or atomic_inc, depending on the |
148 | * futex type. | 148 | * futex type. |
149 | * | 149 | * |
150 | * This yields the following case (where X:=waiters, Y:=futex): | 150 | * This yields the following case (where X:=waiters, Y:=futex): |
151 | * | 151 | * |
152 | * X = Y = 0 | 152 | * X = Y = 0 |
153 | * | 153 | * |
154 | * w[X]=1 w[Y]=1 | 154 | * w[X]=1 w[Y]=1 |
155 | * MB MB | 155 | * MB MB |
156 | * r[Y]=y r[X]=x | 156 | * r[Y]=y r[X]=x |
157 | * | 157 | * |
158 | * Which guarantees that x==0 && y==0 is impossible; which translates back into | 158 | * Which guarantees that x==0 && y==0 is impossible; which translates back into |
159 | * the guarantee that we cannot both miss the futex variable change and the | 159 | * the guarantee that we cannot both miss the futex variable change and the |
160 | * enqueue. | 160 | * enqueue. |
161 | * | 161 | * |
162 | * Note that a new waiter is accounted for in (a) even when it is possible that | 162 | * Note that a new waiter is accounted for in (a) even when it is possible that |
163 | * the wait call can return error, in which case we backtrack from it in (b). | 163 | * the wait call can return error, in which case we backtrack from it in (b). |
164 | * Refer to the comment in queue_lock(). | 164 | * Refer to the comment in queue_lock(). |
165 | * | 165 | * |
166 | * Similarly, in order to account for waiters being requeued on another | 166 | * Similarly, in order to account for waiters being requeued on another |
167 | * address we always increment the waiters for the destination bucket before | 167 | * address we always increment the waiters for the destination bucket before |
168 | * acquiring the lock. It then decrements them again after releasing it - | 168 | * acquiring the lock. It then decrements them again after releasing it - |
169 | * the code that actually moves the futex(es) between hash buckets (requeue_futex) | 169 | * the code that actually moves the futex(es) between hash buckets (requeue_futex) |
170 | * will do the additional required waiter count housekeeping. This is done for | 170 | * will do the additional required waiter count housekeeping. This is done for |
171 | * double_lock_hb() and double_unlock_hb(), respectively. | 171 | * double_lock_hb() and double_unlock_hb(), respectively. |
172 | */ | 172 | */ |
173 | 173 | ||
174 | #ifndef CONFIG_HAVE_FUTEX_CMPXCHG | 174 | #ifndef CONFIG_HAVE_FUTEX_CMPXCHG |
175 | int __read_mostly futex_cmpxchg_enabled; | 175 | int __read_mostly futex_cmpxchg_enabled; |
176 | #endif | 176 | #endif |
177 | 177 | ||
178 | /* | 178 | /* |
179 | * Futex flags used to encode options to functions and preserve them across | 179 | * Futex flags used to encode options to functions and preserve them across |
180 | * restarts. | 180 | * restarts. |
181 | */ | 181 | */ |
182 | #define FLAGS_SHARED 0x01 | 182 | #define FLAGS_SHARED 0x01 |
183 | #define FLAGS_CLOCKRT 0x02 | 183 | #define FLAGS_CLOCKRT 0x02 |
184 | #define FLAGS_HAS_TIMEOUT 0x04 | 184 | #define FLAGS_HAS_TIMEOUT 0x04 |
185 | 185 | ||
186 | /* | 186 | /* |
187 | * Priority Inheritance state: | 187 | * Priority Inheritance state: |
188 | */ | 188 | */ |
189 | struct futex_pi_state { | 189 | struct futex_pi_state { |
190 | /* | 190 | /* |
191 | * list of 'owned' pi_state instances - these have to be | 191 | * list of 'owned' pi_state instances - these have to be |
192 | * cleaned up in do_exit() if the task exits prematurely: | 192 | * cleaned up in do_exit() if the task exits prematurely: |
193 | */ | 193 | */ |
194 | struct list_head list; | 194 | struct list_head list; |
195 | 195 | ||
196 | /* | 196 | /* |
197 | * The PI object: | 197 | * The PI object: |
198 | */ | 198 | */ |
199 | struct rt_mutex pi_mutex; | 199 | struct rt_mutex pi_mutex; |
200 | 200 | ||
201 | struct task_struct *owner; | 201 | struct task_struct *owner; |
202 | atomic_t refcount; | 202 | atomic_t refcount; |
203 | 203 | ||
204 | union futex_key key; | 204 | union futex_key key; |
205 | }; | 205 | }; |
206 | 206 | ||
207 | /** | 207 | /** |
208 | * struct futex_q - The hashed futex queue entry, one per waiting task | 208 | * struct futex_q - The hashed futex queue entry, one per waiting task |
209 | * @list: priority-sorted list of tasks waiting on this futex | 209 | * @list: priority-sorted list of tasks waiting on this futex |
210 | * @task: the task waiting on the futex | 210 | * @task: the task waiting on the futex |
211 | * @lock_ptr: the hash bucket lock | 211 | * @lock_ptr: the hash bucket lock |
212 | * @key: the key the futex is hashed on | 212 | * @key: the key the futex is hashed on |
213 | * @pi_state: optional priority inheritance state | 213 | * @pi_state: optional priority inheritance state |
214 | * @rt_waiter: rt_waiter storage for use with requeue_pi | 214 | * @rt_waiter: rt_waiter storage for use with requeue_pi |
215 | * @requeue_pi_key: the requeue_pi target futex key | 215 | * @requeue_pi_key: the requeue_pi target futex key |
216 | * @bitset: bitset for the optional bitmasked wakeup | 216 | * @bitset: bitset for the optional bitmasked wakeup |
217 | * | 217 | * |
218 | * We use this hashed waitqueue, instead of a normal wait_queue_t, so | 218 | * We use this hashed waitqueue, instead of a normal wait_queue_t, so |
219 | * we can wake only the relevant ones (hashed queues may be shared). | 219 | * we can wake only the relevant ones (hashed queues may be shared). |
220 | * | 220 | * |
221 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. | 221 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. |
222 | * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. | 222 | * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. |
223 | * The order of wakeup is always to make the first condition true, then | 223 | * The order of wakeup is always to make the first condition true, then |
224 | * the second. | 224 | * the second. |
225 | * | 225 | * |
226 | * PI futexes are typically woken before they are removed from the hash list via | 226 | * PI futexes are typically woken before they are removed from the hash list via |
227 | * the rt_mutex code. See unqueue_me_pi(). | 227 | * the rt_mutex code. See unqueue_me_pi(). |
228 | */ | 228 | */ |
229 | struct futex_q { | 229 | struct futex_q { |
230 | struct plist_node list; | 230 | struct plist_node list; |
231 | 231 | ||
232 | struct task_struct *task; | 232 | struct task_struct *task; |
233 | spinlock_t *lock_ptr; | 233 | spinlock_t *lock_ptr; |
234 | union futex_key key; | 234 | union futex_key key; |
235 | struct futex_pi_state *pi_state; | 235 | struct futex_pi_state *pi_state; |
236 | struct rt_mutex_waiter *rt_waiter; | 236 | struct rt_mutex_waiter *rt_waiter; |
237 | union futex_key *requeue_pi_key; | 237 | union futex_key *requeue_pi_key; |
238 | u32 bitset; | 238 | u32 bitset; |
239 | }; | 239 | }; |
240 | 240 | ||
241 | static const struct futex_q futex_q_init = { | 241 | static const struct futex_q futex_q_init = { |
242 | /* list gets initialized in queue_me()*/ | 242 | /* list gets initialized in queue_me()*/ |
243 | .key = FUTEX_KEY_INIT, | 243 | .key = FUTEX_KEY_INIT, |
244 | .bitset = FUTEX_BITSET_MATCH_ANY | 244 | .bitset = FUTEX_BITSET_MATCH_ANY |
245 | }; | 245 | }; |
246 | 246 | ||
247 | /* | 247 | /* |
248 | * Hash buckets are shared by all the futex_keys that hash to the same | 248 | * Hash buckets are shared by all the futex_keys that hash to the same |
249 | * location. Each key may have multiple futex_q structures, one for each task | 249 | * location. Each key may have multiple futex_q structures, one for each task |
250 | * waiting on a futex. | 250 | * waiting on a futex. |
251 | */ | 251 | */ |
252 | struct futex_hash_bucket { | 252 | struct futex_hash_bucket { |
253 | atomic_t waiters; | 253 | atomic_t waiters; |
254 | spinlock_t lock; | 254 | spinlock_t lock; |
255 | struct plist_head chain; | 255 | struct plist_head chain; |
256 | } ____cacheline_aligned_in_smp; | 256 | } ____cacheline_aligned_in_smp; |
257 | 257 | ||
258 | static unsigned long __read_mostly futex_hashsize; | 258 | static unsigned long __read_mostly futex_hashsize; |
259 | 259 | ||
260 | static struct futex_hash_bucket *futex_queues; | 260 | static struct futex_hash_bucket *futex_queues; |
261 | 261 | ||
262 | static inline void futex_get_mm(union futex_key *key) | 262 | static inline void futex_get_mm(union futex_key *key) |
263 | { | 263 | { |
264 | atomic_inc(&key->private.mm->mm_count); | 264 | atomic_inc(&key->private.mm->mm_count); |
265 | /* | 265 | /* |
266 | * Ensure futex_get_mm() implies a full barrier such that | 266 | * Ensure futex_get_mm() implies a full barrier such that |
267 | * get_futex_key() implies a full barrier. This is relied upon | 267 | * get_futex_key() implies a full barrier. This is relied upon |
268 | * as full barrier (B), see the ordering comment above. | 268 | * as full barrier (B), see the ordering comment above. |
269 | */ | 269 | */ |
270 | smp_mb__after_atomic_inc(); | 270 | smp_mb__after_atomic_inc(); |
271 | } | 271 | } |
272 | 272 | ||
273 | /* | 273 | /* |
274 | * Reflects a new waiter being added to the waitqueue. | 274 | * Reflects a new waiter being added to the waitqueue. |
275 | */ | 275 | */ |
276 | static inline void hb_waiters_inc(struct futex_hash_bucket *hb) | 276 | static inline void hb_waiters_inc(struct futex_hash_bucket *hb) |
277 | { | 277 | { |
278 | #ifdef CONFIG_SMP | 278 | #ifdef CONFIG_SMP |
279 | atomic_inc(&hb->waiters); | 279 | atomic_inc(&hb->waiters); |
280 | /* | 280 | /* |
281 | * Full barrier (A), see the ordering comment above. | 281 | * Full barrier (A), see the ordering comment above. |
282 | */ | 282 | */ |
283 | smp_mb__after_atomic_inc(); | 283 | smp_mb__after_atomic_inc(); |
284 | #endif | 284 | #endif |
285 | } | 285 | } |
286 | 286 | ||
287 | /* | 287 | /* |
288 | * Reflects a waiter being removed from the waitqueue by wakeup | 288 | * Reflects a waiter being removed from the waitqueue by wakeup |
289 | * paths. | 289 | * paths. |
290 | */ | 290 | */ |
291 | static inline void hb_waiters_dec(struct futex_hash_bucket *hb) | 291 | static inline void hb_waiters_dec(struct futex_hash_bucket *hb) |
292 | { | 292 | { |
293 | #ifdef CONFIG_SMP | 293 | #ifdef CONFIG_SMP |
294 | atomic_dec(&hb->waiters); | 294 | atomic_dec(&hb->waiters); |
295 | #endif | 295 | #endif |
296 | } | 296 | } |
297 | 297 | ||
298 | static inline int hb_waiters_pending(struct futex_hash_bucket *hb) | 298 | static inline int hb_waiters_pending(struct futex_hash_bucket *hb) |
299 | { | 299 | { |
300 | #ifdef CONFIG_SMP | 300 | #ifdef CONFIG_SMP |
301 | return atomic_read(&hb->waiters); | 301 | return atomic_read(&hb->waiters); |
302 | #else | 302 | #else |
303 | return 1; | 303 | return 1; |
304 | #endif | 304 | #endif |
305 | } | 305 | } |
306 | 306 | ||
307 | /* | 307 | /* |
308 | * We hash on the keys returned from get_futex_key (see below). | 308 | * We hash on the keys returned from get_futex_key (see below). |
309 | */ | 309 | */ |
310 | static struct futex_hash_bucket *hash_futex(union futex_key *key) | 310 | static struct futex_hash_bucket *hash_futex(union futex_key *key) |
311 | { | 311 | { |
312 | u32 hash = jhash2((u32*)&key->both.word, | 312 | u32 hash = jhash2((u32*)&key->both.word, |
313 | (sizeof(key->both.word)+sizeof(key->both.ptr))/4, | 313 | (sizeof(key->both.word)+sizeof(key->both.ptr))/4, |
314 | key->both.offset); | 314 | key->both.offset); |
315 | return &futex_queues[hash & (futex_hashsize - 1)]; | 315 | return &futex_queues[hash & (futex_hashsize - 1)]; |
316 | } | 316 | } |
317 | 317 | ||
318 | /* | 318 | /* |
319 | * Return 1 if two futex_keys are equal, 0 otherwise. | 319 | * Return 1 if two futex_keys are equal, 0 otherwise. |
320 | */ | 320 | */ |
321 | static inline int match_futex(union futex_key *key1, union futex_key *key2) | 321 | static inline int match_futex(union futex_key *key1, union futex_key *key2) |
322 | { | 322 | { |
323 | return (key1 && key2 | 323 | return (key1 && key2 |
324 | && key1->both.word == key2->both.word | 324 | && key1->both.word == key2->both.word |
325 | && key1->both.ptr == key2->both.ptr | 325 | && key1->both.ptr == key2->both.ptr |
326 | && key1->both.offset == key2->both.offset); | 326 | && key1->both.offset == key2->both.offset); |
327 | } | 327 | } |
328 | 328 | ||
329 | /* | 329 | /* |
330 | * Take a reference to the resource addressed by a key. | 330 | * Take a reference to the resource addressed by a key. |
331 | * Can be called while holding spinlocks. | 331 | * Can be called while holding spinlocks. |
332 | * | 332 | * |
333 | */ | 333 | */ |
334 | static void get_futex_key_refs(union futex_key *key) | 334 | static void get_futex_key_refs(union futex_key *key) |
335 | { | 335 | { |
336 | if (!key->both.ptr) | 336 | if (!key->both.ptr) |
337 | return; | 337 | return; |
338 | 338 | ||
339 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { | 339 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
340 | case FUT_OFF_INODE: | 340 | case FUT_OFF_INODE: |
341 | ihold(key->shared.inode); /* implies MB (B) */ | 341 | ihold(key->shared.inode); /* implies MB (B) */ |
342 | break; | 342 | break; |
343 | case FUT_OFF_MMSHARED: | 343 | case FUT_OFF_MMSHARED: |
344 | futex_get_mm(key); /* implies MB (B) */ | 344 | futex_get_mm(key); /* implies MB (B) */ |
345 | break; | 345 | break; |
346 | } | 346 | } |
347 | } | 347 | } |
348 | 348 | ||
349 | /* | 349 | /* |
350 | * Drop a reference to the resource addressed by a key. | 350 | * Drop a reference to the resource addressed by a key. |
351 | * The hash bucket spinlock must not be held. | 351 | * The hash bucket spinlock must not be held. |
352 | */ | 352 | */ |
353 | static void drop_futex_key_refs(union futex_key *key) | 353 | static void drop_futex_key_refs(union futex_key *key) |
354 | { | 354 | { |
355 | if (!key->both.ptr) { | 355 | if (!key->both.ptr) { |
356 | /* If we're here then we tried to put a key we failed to get */ | 356 | /* If we're here then we tried to put a key we failed to get */ |
357 | WARN_ON_ONCE(1); | 357 | WARN_ON_ONCE(1); |
358 | return; | 358 | return; |
359 | } | 359 | } |
360 | 360 | ||
361 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { | 361 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
362 | case FUT_OFF_INODE: | 362 | case FUT_OFF_INODE: |
363 | iput(key->shared.inode); | 363 | iput(key->shared.inode); |
364 | break; | 364 | break; |
365 | case FUT_OFF_MMSHARED: | 365 | case FUT_OFF_MMSHARED: |
366 | mmdrop(key->private.mm); | 366 | mmdrop(key->private.mm); |
367 | break; | 367 | break; |
368 | } | 368 | } |
369 | } | 369 | } |
370 | 370 | ||
371 | /** | 371 | /** |
372 | * get_futex_key() - Get parameters which are the keys for a futex | 372 | * get_futex_key() - Get parameters which are the keys for a futex |
373 | * @uaddr: virtual address of the futex | 373 | * @uaddr: virtual address of the futex |
374 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED | 374 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED |
375 | * @key: address where result is stored. | 375 | * @key: address where result is stored. |
376 | * @rw: mapping needs to be read/write (values: VERIFY_READ, | 376 | * @rw: mapping needs to be read/write (values: VERIFY_READ, |
377 | * VERIFY_WRITE) | 377 | * VERIFY_WRITE) |
378 | * | 378 | * |
379 | * Return: a negative error code or 0 | 379 | * Return: a negative error code or 0 |
380 | * | 380 | * |
381 | * The key words are stored in *key on success. | 381 | * The key words are stored in *key on success. |
382 | * | 382 | * |
383 | * For shared mappings, it's (page->index, file_inode(vma->vm_file), | 383 | * For shared mappings, it's (page->index, file_inode(vma->vm_file), |
384 | * offset_within_page). For private mappings, it's (uaddr, current->mm). | 384 | * offset_within_page). For private mappings, it's (uaddr, current->mm). |
385 | * We can usually work out the index without swapping in the page. | 385 | * We can usually work out the index without swapping in the page. |
386 | * | 386 | * |
387 | * lock_page() might sleep, the caller should not hold a spinlock. | 387 | * lock_page() might sleep, the caller should not hold a spinlock. |
388 | */ | 388 | */ |
389 | static int | 389 | static int |
390 | get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) | 390 | get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) |
391 | { | 391 | { |
392 | unsigned long address = (unsigned long)uaddr; | 392 | unsigned long address = (unsigned long)uaddr; |
393 | struct mm_struct *mm = current->mm; | 393 | struct mm_struct *mm = current->mm; |
394 | struct page *page, *page_head; | 394 | struct page *page, *page_head; |
395 | int err, ro = 0; | 395 | int err, ro = 0; |
396 | 396 | ||
397 | /* | 397 | /* |
398 | * The futex address must be "naturally" aligned. | 398 | * The futex address must be "naturally" aligned. |
399 | */ | 399 | */ |
400 | key->both.offset = address % PAGE_SIZE; | 400 | key->both.offset = address % PAGE_SIZE; |
401 | if (unlikely((address % sizeof(u32)) != 0)) | 401 | if (unlikely((address % sizeof(u32)) != 0)) |
402 | return -EINVAL; | 402 | return -EINVAL; |
403 | address -= key->both.offset; | 403 | address -= key->both.offset; |
404 | 404 | ||
405 | if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) | 405 | if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) |
406 | return -EFAULT; | 406 | return -EFAULT; |
407 | 407 | ||
408 | /* | 408 | /* |
409 | * PROCESS_PRIVATE futexes are fast. | 409 | * PROCESS_PRIVATE futexes are fast. |
410 | * As the mm cannot disappear under us and the 'key' only needs | 410 | * As the mm cannot disappear under us and the 'key' only needs |
411 | * virtual address, we dont even have to find the underlying vma. | 411 | * virtual address, we dont even have to find the underlying vma. |
412 | * Note : We do have to check 'uaddr' is a valid user address, | 412 | * Note : We do have to check 'uaddr' is a valid user address, |
413 | * but access_ok() should be faster than find_vma() | 413 | * but access_ok() should be faster than find_vma() |
414 | */ | 414 | */ |
415 | if (!fshared) { | 415 | if (!fshared) { |
416 | key->private.mm = mm; | 416 | key->private.mm = mm; |
417 | key->private.address = address; | 417 | key->private.address = address; |
418 | get_futex_key_refs(key); /* implies MB (B) */ | 418 | get_futex_key_refs(key); /* implies MB (B) */ |
419 | return 0; | 419 | return 0; |
420 | } | 420 | } |
421 | 421 | ||
422 | again: | 422 | again: |
423 | err = get_user_pages_fast(address, 1, 1, &page); | 423 | err = get_user_pages_fast(address, 1, 1, &page); |
424 | /* | 424 | /* |
425 | * If write access is not required (eg. FUTEX_WAIT), try | 425 | * If write access is not required (eg. FUTEX_WAIT), try |
426 | * and get read-only access. | 426 | * and get read-only access. |
427 | */ | 427 | */ |
428 | if (err == -EFAULT && rw == VERIFY_READ) { | 428 | if (err == -EFAULT && rw == VERIFY_READ) { |
429 | err = get_user_pages_fast(address, 1, 0, &page); | 429 | err = get_user_pages_fast(address, 1, 0, &page); |
430 | ro = 1; | 430 | ro = 1; |
431 | } | 431 | } |
432 | if (err < 0) | 432 | if (err < 0) |
433 | return err; | 433 | return err; |
434 | else | 434 | else |
435 | err = 0; | 435 | err = 0; |
436 | 436 | ||
437 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 437 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
438 | page_head = page; | 438 | page_head = page; |
439 | if (unlikely(PageTail(page))) { | 439 | if (unlikely(PageTail(page))) { |
440 | put_page(page); | 440 | put_page(page); |
441 | /* serialize against __split_huge_page_splitting() */ | 441 | /* serialize against __split_huge_page_splitting() */ |
442 | local_irq_disable(); | 442 | local_irq_disable(); |
443 | if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { | 443 | if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { |
444 | page_head = compound_head(page); | 444 | page_head = compound_head(page); |
445 | /* | 445 | /* |
446 | * page_head is valid pointer but we must pin | 446 | * page_head is valid pointer but we must pin |
447 | * it before taking the PG_lock and/or | 447 | * it before taking the PG_lock and/or |
448 | * PG_compound_lock. The moment we re-enable | 448 | * PG_compound_lock. The moment we re-enable |
449 | * irqs __split_huge_page_splitting() can | 449 | * irqs __split_huge_page_splitting() can |
450 | * return and the head page can be freed from | 450 | * return and the head page can be freed from |
451 | * under us. We can't take the PG_lock and/or | 451 | * under us. We can't take the PG_lock and/or |
452 | * PG_compound_lock on a page that could be | 452 | * PG_compound_lock on a page that could be |
453 | * freed from under us. | 453 | * freed from under us. |
454 | */ | 454 | */ |
455 | if (page != page_head) { | 455 | if (page != page_head) { |
456 | get_page(page_head); | 456 | get_page(page_head); |
457 | put_page(page); | 457 | put_page(page); |
458 | } | 458 | } |
459 | local_irq_enable(); | 459 | local_irq_enable(); |
460 | } else { | 460 | } else { |
461 | local_irq_enable(); | 461 | local_irq_enable(); |
462 | goto again; | 462 | goto again; |
463 | } | 463 | } |
464 | } | 464 | } |
465 | #else | 465 | #else |
466 | page_head = compound_head(page); | 466 | page_head = compound_head(page); |
467 | if (page != page_head) { | 467 | if (page != page_head) { |
468 | get_page(page_head); | 468 | get_page(page_head); |
469 | put_page(page); | 469 | put_page(page); |
470 | } | 470 | } |
471 | #endif | 471 | #endif |
472 | 472 | ||
473 | lock_page(page_head); | 473 | lock_page(page_head); |
474 | 474 | ||
475 | /* | 475 | /* |
476 | * If page_head->mapping is NULL, then it cannot be a PageAnon | 476 | * If page_head->mapping is NULL, then it cannot be a PageAnon |
477 | * page; but it might be the ZERO_PAGE or in the gate area or | 477 | * page; but it might be the ZERO_PAGE or in the gate area or |
478 | * in a special mapping (all cases which we are happy to fail); | 478 | * in a special mapping (all cases which we are happy to fail); |
479 | * or it may have been a good file page when get_user_pages_fast | 479 | * or it may have been a good file page when get_user_pages_fast |
480 | * found it, but truncated or holepunched or subjected to | 480 | * found it, but truncated or holepunched or subjected to |
481 | * invalidate_complete_page2 before we got the page lock (also | 481 | * invalidate_complete_page2 before we got the page lock (also |
482 | * cases which we are happy to fail). And we hold a reference, | 482 | * cases which we are happy to fail). And we hold a reference, |
483 | * so refcount care in invalidate_complete_page's remove_mapping | 483 | * so refcount care in invalidate_complete_page's remove_mapping |
484 | * prevents drop_caches from setting mapping to NULL beneath us. | 484 | * prevents drop_caches from setting mapping to NULL beneath us. |
485 | * | 485 | * |
486 | * The case we do have to guard against is when memory pressure made | 486 | * The case we do have to guard against is when memory pressure made |
487 | * shmem_writepage move it from filecache to swapcache beneath us: | 487 | * shmem_writepage move it from filecache to swapcache beneath us: |
488 | * an unlikely race, but we do need to retry for page_head->mapping. | 488 | * an unlikely race, but we do need to retry for page_head->mapping. |
489 | */ | 489 | */ |
490 | if (!page_head->mapping) { | 490 | if (!page_head->mapping) { |
491 | int shmem_swizzled = PageSwapCache(page_head); | 491 | int shmem_swizzled = PageSwapCache(page_head); |
492 | unlock_page(page_head); | 492 | unlock_page(page_head); |
493 | put_page(page_head); | 493 | put_page(page_head); |
494 | if (shmem_swizzled) | 494 | if (shmem_swizzled) |
495 | goto again; | 495 | goto again; |
496 | return -EFAULT; | 496 | return -EFAULT; |
497 | } | 497 | } |
498 | 498 | ||
499 | /* | 499 | /* |
500 | * Private mappings are handled in a simple way. | 500 | * Private mappings are handled in a simple way. |
501 | * | 501 | * |
502 | * NOTE: When userspace waits on a MAP_SHARED mapping, even if | 502 | * NOTE: When userspace waits on a MAP_SHARED mapping, even if |
503 | * it's a read-only handle, it's expected that futexes attach to | 503 | * it's a read-only handle, it's expected that futexes attach to |
504 | * the object not the particular process. | 504 | * the object not the particular process. |
505 | */ | 505 | */ |
506 | if (PageAnon(page_head)) { | 506 | if (PageAnon(page_head)) { |
507 | /* | 507 | /* |
508 | * A RO anonymous page will never change and thus doesn't make | 508 | * A RO anonymous page will never change and thus doesn't make |
509 | * sense for futex operations. | 509 | * sense for futex operations. |
510 | */ | 510 | */ |
511 | if (ro) { | 511 | if (ro) { |
512 | err = -EFAULT; | 512 | err = -EFAULT; |
513 | goto out; | 513 | goto out; |
514 | } | 514 | } |
515 | 515 | ||
516 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ | 516 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ |
517 | key->private.mm = mm; | 517 | key->private.mm = mm; |
518 | key->private.address = address; | 518 | key->private.address = address; |
519 | } else { | 519 | } else { |
520 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ | 520 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
521 | key->shared.inode = page_head->mapping->host; | 521 | key->shared.inode = page_head->mapping->host; |
522 | key->shared.pgoff = basepage_index(page); | 522 | key->shared.pgoff = basepage_index(page); |
523 | } | 523 | } |
524 | 524 | ||
525 | get_futex_key_refs(key); /* implies MB (B) */ | 525 | get_futex_key_refs(key); /* implies MB (B) */ |
526 | 526 | ||
527 | out: | 527 | out: |
528 | unlock_page(page_head); | 528 | unlock_page(page_head); |
529 | put_page(page_head); | 529 | put_page(page_head); |
530 | return err; | 530 | return err; |
531 | } | 531 | } |
532 | 532 | ||
533 | static inline void put_futex_key(union futex_key *key) | 533 | static inline void put_futex_key(union futex_key *key) |
534 | { | 534 | { |
535 | drop_futex_key_refs(key); | 535 | drop_futex_key_refs(key); |
536 | } | 536 | } |
537 | 537 | ||
538 | /** | 538 | /** |
539 | * fault_in_user_writeable() - Fault in user address and verify RW access | 539 | * fault_in_user_writeable() - Fault in user address and verify RW access |
540 | * @uaddr: pointer to faulting user space address | 540 | * @uaddr: pointer to faulting user space address |
541 | * | 541 | * |
542 | * Slow path to fixup the fault we just took in the atomic write | 542 | * Slow path to fixup the fault we just took in the atomic write |
543 | * access to @uaddr. | 543 | * access to @uaddr. |
544 | * | 544 | * |
545 | * We have no generic implementation of a non-destructive write to the | 545 | * We have no generic implementation of a non-destructive write to the |
546 | * user address. We know that we faulted in the atomic pagefault | 546 | * user address. We know that we faulted in the atomic pagefault |
547 | * disabled section so we can as well avoid the #PF overhead by | 547 | * disabled section so we can as well avoid the #PF overhead by |
548 | * calling get_user_pages() right away. | 548 | * calling get_user_pages() right away. |
549 | */ | 549 | */ |
550 | static int fault_in_user_writeable(u32 __user *uaddr) | 550 | static int fault_in_user_writeable(u32 __user *uaddr) |
551 | { | 551 | { |
552 | struct mm_struct *mm = current->mm; | 552 | struct mm_struct *mm = current->mm; |
553 | int ret; | 553 | int ret; |
554 | 554 | ||
555 | down_read(&mm->mmap_sem); | 555 | down_read(&mm->mmap_sem); |
556 | ret = fixup_user_fault(current, mm, (unsigned long)uaddr, | 556 | ret = fixup_user_fault(current, mm, (unsigned long)uaddr, |
557 | FAULT_FLAG_WRITE); | 557 | FAULT_FLAG_WRITE); |
558 | up_read(&mm->mmap_sem); | 558 | up_read(&mm->mmap_sem); |
559 | 559 | ||
560 | return ret < 0 ? ret : 0; | 560 | return ret < 0 ? ret : 0; |
561 | } | 561 | } |
562 | 562 | ||
563 | /** | 563 | /** |
564 | * futex_top_waiter() - Return the highest priority waiter on a futex | 564 | * futex_top_waiter() - Return the highest priority waiter on a futex |
565 | * @hb: the hash bucket the futex_q's reside in | 565 | * @hb: the hash bucket the futex_q's reside in |
566 | * @key: the futex key (to distinguish it from other futex futex_q's) | 566 | * @key: the futex key (to distinguish it from other futex futex_q's) |
567 | * | 567 | * |
568 | * Must be called with the hb lock held. | 568 | * Must be called with the hb lock held. |
569 | */ | 569 | */ |
570 | static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, | 570 | static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, |
571 | union futex_key *key) | 571 | union futex_key *key) |
572 | { | 572 | { |
573 | struct futex_q *this; | 573 | struct futex_q *this; |
574 | 574 | ||
575 | plist_for_each_entry(this, &hb->chain, list) { | 575 | plist_for_each_entry(this, &hb->chain, list) { |
576 | if (match_futex(&this->key, key)) | 576 | if (match_futex(&this->key, key)) |
577 | return this; | 577 | return this; |
578 | } | 578 | } |
579 | return NULL; | 579 | return NULL; |
580 | } | 580 | } |
581 | 581 | ||
582 | static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, | 582 | static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, |
583 | u32 uval, u32 newval) | 583 | u32 uval, u32 newval) |
584 | { | 584 | { |
585 | int ret; | 585 | int ret; |
586 | 586 | ||
587 | pagefault_disable(); | 587 | pagefault_disable(); |
588 | ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); | 588 | ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); |
589 | pagefault_enable(); | 589 | pagefault_enable(); |
590 | 590 | ||
591 | return ret; | 591 | return ret; |
592 | } | 592 | } |
593 | 593 | ||
594 | static int get_futex_value_locked(u32 *dest, u32 __user *from) | 594 | static int get_futex_value_locked(u32 *dest, u32 __user *from) |
595 | { | 595 | { |
596 | int ret; | 596 | int ret; |
597 | 597 | ||
598 | pagefault_disable(); | 598 | pagefault_disable(); |
599 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); | 599 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); |
600 | pagefault_enable(); | 600 | pagefault_enable(); |
601 | 601 | ||
602 | return ret ? -EFAULT : 0; | 602 | return ret ? -EFAULT : 0; |
603 | } | 603 | } |
604 | 604 | ||
605 | 605 | ||
606 | /* | 606 | /* |
607 | * PI code: | 607 | * PI code: |
608 | */ | 608 | */ |
609 | static int refill_pi_state_cache(void) | 609 | static int refill_pi_state_cache(void) |
610 | { | 610 | { |
611 | struct futex_pi_state *pi_state; | 611 | struct futex_pi_state *pi_state; |
612 | 612 | ||
613 | if (likely(current->pi_state_cache)) | 613 | if (likely(current->pi_state_cache)) |
614 | return 0; | 614 | return 0; |
615 | 615 | ||
616 | pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); | 616 | pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); |
617 | 617 | ||
618 | if (!pi_state) | 618 | if (!pi_state) |
619 | return -ENOMEM; | 619 | return -ENOMEM; |
620 | 620 | ||
621 | INIT_LIST_HEAD(&pi_state->list); | 621 | INIT_LIST_HEAD(&pi_state->list); |
622 | /* pi_mutex gets initialized later */ | 622 | /* pi_mutex gets initialized later */ |
623 | pi_state->owner = NULL; | 623 | pi_state->owner = NULL; |
624 | atomic_set(&pi_state->refcount, 1); | 624 | atomic_set(&pi_state->refcount, 1); |
625 | pi_state->key = FUTEX_KEY_INIT; | 625 | pi_state->key = FUTEX_KEY_INIT; |
626 | 626 | ||
627 | current->pi_state_cache = pi_state; | 627 | current->pi_state_cache = pi_state; |
628 | 628 | ||
629 | return 0; | 629 | return 0; |
630 | } | 630 | } |
631 | 631 | ||
632 | static struct futex_pi_state * alloc_pi_state(void) | 632 | static struct futex_pi_state * alloc_pi_state(void) |
633 | { | 633 | { |
634 | struct futex_pi_state *pi_state = current->pi_state_cache; | 634 | struct futex_pi_state *pi_state = current->pi_state_cache; |
635 | 635 | ||
636 | WARN_ON(!pi_state); | 636 | WARN_ON(!pi_state); |
637 | current->pi_state_cache = NULL; | 637 | current->pi_state_cache = NULL; |
638 | 638 | ||
639 | return pi_state; | 639 | return pi_state; |
640 | } | 640 | } |
641 | 641 | ||
642 | static void free_pi_state(struct futex_pi_state *pi_state) | 642 | static void free_pi_state(struct futex_pi_state *pi_state) |
643 | { | 643 | { |
644 | if (!atomic_dec_and_test(&pi_state->refcount)) | 644 | if (!atomic_dec_and_test(&pi_state->refcount)) |
645 | return; | 645 | return; |
646 | 646 | ||
647 | /* | 647 | /* |
648 | * If pi_state->owner is NULL, the owner is most probably dying | 648 | * If pi_state->owner is NULL, the owner is most probably dying |
649 | * and has cleaned up the pi_state already | 649 | * and has cleaned up the pi_state already |
650 | */ | 650 | */ |
651 | if (pi_state->owner) { | 651 | if (pi_state->owner) { |
652 | raw_spin_lock_irq(&pi_state->owner->pi_lock); | 652 | raw_spin_lock_irq(&pi_state->owner->pi_lock); |
653 | list_del_init(&pi_state->list); | 653 | list_del_init(&pi_state->list); |
654 | raw_spin_unlock_irq(&pi_state->owner->pi_lock); | 654 | raw_spin_unlock_irq(&pi_state->owner->pi_lock); |
655 | 655 | ||
656 | rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); | 656 | rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); |
657 | } | 657 | } |
658 | 658 | ||
659 | if (current->pi_state_cache) | 659 | if (current->pi_state_cache) |
660 | kfree(pi_state); | 660 | kfree(pi_state); |
661 | else { | 661 | else { |
662 | /* | 662 | /* |
663 | * pi_state->list is already empty. | 663 | * pi_state->list is already empty. |
664 | * clear pi_state->owner. | 664 | * clear pi_state->owner. |
665 | * refcount is at 0 - put it back to 1. | 665 | * refcount is at 0 - put it back to 1. |
666 | */ | 666 | */ |
667 | pi_state->owner = NULL; | 667 | pi_state->owner = NULL; |
668 | atomic_set(&pi_state->refcount, 1); | 668 | atomic_set(&pi_state->refcount, 1); |
669 | current->pi_state_cache = pi_state; | 669 | current->pi_state_cache = pi_state; |
670 | } | 670 | } |
671 | } | 671 | } |
672 | 672 | ||
673 | /* | 673 | /* |
674 | * Look up the task based on what TID userspace gave us. | 674 | * Look up the task based on what TID userspace gave us. |
675 | * We dont trust it. | 675 | * We dont trust it. |
676 | */ | 676 | */ |
677 | static struct task_struct * futex_find_get_task(pid_t pid) | 677 | static struct task_struct * futex_find_get_task(pid_t pid) |
678 | { | 678 | { |
679 | struct task_struct *p; | 679 | struct task_struct *p; |
680 | 680 | ||
681 | rcu_read_lock(); | 681 | rcu_read_lock(); |
682 | p = find_task_by_vpid(pid); | 682 | p = find_task_by_vpid(pid); |
683 | if (p) | 683 | if (p) |
684 | get_task_struct(p); | 684 | get_task_struct(p); |
685 | 685 | ||
686 | rcu_read_unlock(); | 686 | rcu_read_unlock(); |
687 | 687 | ||
688 | return p; | 688 | return p; |
689 | } | 689 | } |
690 | 690 | ||
691 | /* | 691 | /* |
692 | * This task is holding PI mutexes at exit time => bad. | 692 | * This task is holding PI mutexes at exit time => bad. |
693 | * Kernel cleans up PI-state, but userspace is likely hosed. | 693 | * Kernel cleans up PI-state, but userspace is likely hosed. |
694 | * (Robust-futex cleanup is separate and might save the day for userspace.) | 694 | * (Robust-futex cleanup is separate and might save the day for userspace.) |
695 | */ | 695 | */ |
696 | void exit_pi_state_list(struct task_struct *curr) | 696 | void exit_pi_state_list(struct task_struct *curr) |
697 | { | 697 | { |
698 | struct list_head *next, *head = &curr->pi_state_list; | 698 | struct list_head *next, *head = &curr->pi_state_list; |
699 | struct futex_pi_state *pi_state; | 699 | struct futex_pi_state *pi_state; |
700 | struct futex_hash_bucket *hb; | 700 | struct futex_hash_bucket *hb; |
701 | union futex_key key = FUTEX_KEY_INIT; | 701 | union futex_key key = FUTEX_KEY_INIT; |
702 | 702 | ||
703 | if (!futex_cmpxchg_enabled) | 703 | if (!futex_cmpxchg_enabled) |
704 | return; | 704 | return; |
705 | /* | 705 | /* |
706 | * We are a ZOMBIE and nobody can enqueue itself on | 706 | * We are a ZOMBIE and nobody can enqueue itself on |
707 | * pi_state_list anymore, but we have to be careful | 707 | * pi_state_list anymore, but we have to be careful |
708 | * versus waiters unqueueing themselves: | 708 | * versus waiters unqueueing themselves: |
709 | */ | 709 | */ |
710 | raw_spin_lock_irq(&curr->pi_lock); | 710 | raw_spin_lock_irq(&curr->pi_lock); |
711 | while (!list_empty(head)) { | 711 | while (!list_empty(head)) { |
712 | 712 | ||
713 | next = head->next; | 713 | next = head->next; |
714 | pi_state = list_entry(next, struct futex_pi_state, list); | 714 | pi_state = list_entry(next, struct futex_pi_state, list); |
715 | key = pi_state->key; | 715 | key = pi_state->key; |
716 | hb = hash_futex(&key); | 716 | hb = hash_futex(&key); |
717 | raw_spin_unlock_irq(&curr->pi_lock); | 717 | raw_spin_unlock_irq(&curr->pi_lock); |
718 | 718 | ||
719 | spin_lock(&hb->lock); | 719 | spin_lock(&hb->lock); |
720 | 720 | ||
721 | raw_spin_lock_irq(&curr->pi_lock); | 721 | raw_spin_lock_irq(&curr->pi_lock); |
722 | /* | 722 | /* |
723 | * We dropped the pi-lock, so re-check whether this | 723 | * We dropped the pi-lock, so re-check whether this |
724 | * task still owns the PI-state: | 724 | * task still owns the PI-state: |
725 | */ | 725 | */ |
726 | if (head->next != next) { | 726 | if (head->next != next) { |
727 | spin_unlock(&hb->lock); | 727 | spin_unlock(&hb->lock); |
728 | continue; | 728 | continue; |
729 | } | 729 | } |
730 | 730 | ||
731 | WARN_ON(pi_state->owner != curr); | 731 | WARN_ON(pi_state->owner != curr); |
732 | WARN_ON(list_empty(&pi_state->list)); | 732 | WARN_ON(list_empty(&pi_state->list)); |
733 | list_del_init(&pi_state->list); | 733 | list_del_init(&pi_state->list); |
734 | pi_state->owner = NULL; | 734 | pi_state->owner = NULL; |
735 | raw_spin_unlock_irq(&curr->pi_lock); | 735 | raw_spin_unlock_irq(&curr->pi_lock); |
736 | 736 | ||
737 | rt_mutex_unlock(&pi_state->pi_mutex); | 737 | rt_mutex_unlock(&pi_state->pi_mutex); |
738 | 738 | ||
739 | spin_unlock(&hb->lock); | 739 | spin_unlock(&hb->lock); |
740 | 740 | ||
741 | raw_spin_lock_irq(&curr->pi_lock); | 741 | raw_spin_lock_irq(&curr->pi_lock); |
742 | } | 742 | } |
743 | raw_spin_unlock_irq(&curr->pi_lock); | 743 | raw_spin_unlock_irq(&curr->pi_lock); |
744 | } | 744 | } |
745 | 745 | ||
746 | static int | 746 | static int |
747 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | 747 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, |
748 | union futex_key *key, struct futex_pi_state **ps, | 748 | union futex_key *key, struct futex_pi_state **ps, |
749 | struct task_struct *task) | 749 | struct task_struct *task) |
750 | { | 750 | { |
751 | struct futex_pi_state *pi_state = NULL; | 751 | struct futex_pi_state *pi_state = NULL; |
752 | struct futex_q *this, *next; | 752 | struct futex_q *this, *next; |
753 | struct task_struct *p; | 753 | struct task_struct *p; |
754 | pid_t pid = uval & FUTEX_TID_MASK; | 754 | pid_t pid = uval & FUTEX_TID_MASK; |
755 | 755 | ||
756 | plist_for_each_entry_safe(this, next, &hb->chain, list) { | 756 | plist_for_each_entry_safe(this, next, &hb->chain, list) { |
757 | if (match_futex(&this->key, key)) { | 757 | if (match_futex(&this->key, key)) { |
758 | /* | 758 | /* |
759 | * Another waiter already exists - bump up | 759 | * Another waiter already exists - bump up |
760 | * the refcount and return its pi_state: | 760 | * the refcount and return its pi_state: |
761 | */ | 761 | */ |
762 | pi_state = this->pi_state; | 762 | pi_state = this->pi_state; |
763 | /* | 763 | /* |
764 | * Userspace might have messed up non-PI and PI futexes | 764 | * Userspace might have messed up non-PI and PI futexes |
765 | */ | 765 | */ |
766 | if (unlikely(!pi_state)) | 766 | if (unlikely(!pi_state)) |
767 | return -EINVAL; | 767 | return -EINVAL; |
768 | 768 | ||
769 | WARN_ON(!atomic_read(&pi_state->refcount)); | 769 | WARN_ON(!atomic_read(&pi_state->refcount)); |
770 | 770 | ||
771 | /* | 771 | /* |
772 | * When pi_state->owner is NULL then the owner died | 772 | * When pi_state->owner is NULL then the owner died |
773 | * and another waiter is on the fly. pi_state->owner | 773 | * and another waiter is on the fly. pi_state->owner |
774 | * is fixed up by the task which acquires | 774 | * is fixed up by the task which acquires |
775 | * pi_state->rt_mutex. | 775 | * pi_state->rt_mutex. |
776 | * | 776 | * |
777 | * We do not check for pid == 0 which can happen when | 777 | * We do not check for pid == 0 which can happen when |
778 | * the owner died and robust_list_exit() cleared the | 778 | * the owner died and robust_list_exit() cleared the |
779 | * TID. | 779 | * TID. |
780 | */ | 780 | */ |
781 | if (pid && pi_state->owner) { | 781 | if (pid && pi_state->owner) { |
782 | /* | 782 | /* |
783 | * Bail out if user space manipulated the | 783 | * Bail out if user space manipulated the |
784 | * futex value. | 784 | * futex value. |
785 | */ | 785 | */ |
786 | if (pid != task_pid_vnr(pi_state->owner)) | 786 | if (pid != task_pid_vnr(pi_state->owner)) |
787 | return -EINVAL; | 787 | return -EINVAL; |
788 | } | 788 | } |
789 | 789 | ||
790 | /* | 790 | /* |
791 | * Protect against a corrupted uval. If uval | 791 | * Protect against a corrupted uval. If uval |
792 | * is 0x80000000 then pid is 0 and the waiter | 792 | * is 0x80000000 then pid is 0 and the waiter |
793 | * bit is set. So the deadlock check in the | 793 | * bit is set. So the deadlock check in the |
794 | * calling code has failed and we did not fall | 794 | * calling code has failed and we did not fall |
795 | * into the check above due to !pid. | 795 | * into the check above due to !pid. |
796 | */ | 796 | */ |
797 | if (task && pi_state->owner == task) | 797 | if (task && pi_state->owner == task) |
798 | return -EDEADLK; | 798 | return -EDEADLK; |
799 | 799 | ||
800 | atomic_inc(&pi_state->refcount); | 800 | atomic_inc(&pi_state->refcount); |
801 | *ps = pi_state; | 801 | *ps = pi_state; |
802 | 802 | ||
803 | return 0; | 803 | return 0; |
804 | } | 804 | } |
805 | } | 805 | } |
806 | 806 | ||
807 | /* | 807 | /* |
808 | * We are the first waiter - try to look up the real owner and attach | 808 | * We are the first waiter - try to look up the real owner and attach |
809 | * the new pi_state to it, but bail out when TID = 0 | 809 | * the new pi_state to it, but bail out when TID = 0 |
810 | */ | 810 | */ |
811 | if (!pid) | 811 | if (!pid) |
812 | return -ESRCH; | 812 | return -ESRCH; |
813 | p = futex_find_get_task(pid); | 813 | p = futex_find_get_task(pid); |
814 | if (!p) | 814 | if (!p) |
815 | return -ESRCH; | 815 | return -ESRCH; |
816 | 816 | ||
817 | if (!p->mm) { | 817 | if (!p->mm) { |
818 | put_task_struct(p); | 818 | put_task_struct(p); |
819 | return -EPERM; | 819 | return -EPERM; |
820 | } | 820 | } |
821 | 821 | ||
822 | /* | 822 | /* |
823 | * We need to look at the task state flags to figure out, | 823 | * We need to look at the task state flags to figure out, |
824 | * whether the task is exiting. To protect against the do_exit | 824 | * whether the task is exiting. To protect against the do_exit |
825 | * change of the task flags, we do this protected by | 825 | * change of the task flags, we do this protected by |
826 | * p->pi_lock: | 826 | * p->pi_lock: |
827 | */ | 827 | */ |
828 | raw_spin_lock_irq(&p->pi_lock); | 828 | raw_spin_lock_irq(&p->pi_lock); |
829 | if (unlikely(p->flags & PF_EXITING)) { | 829 | if (unlikely(p->flags & PF_EXITING)) { |
830 | /* | 830 | /* |
831 | * The task is on the way out. When PF_EXITPIDONE is | 831 | * The task is on the way out. When PF_EXITPIDONE is |
832 | * set, we know that the task has finished the | 832 | * set, we know that the task has finished the |
833 | * cleanup: | 833 | * cleanup: |
834 | */ | 834 | */ |
835 | int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; | 835 | int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; |
836 | 836 | ||
837 | raw_spin_unlock_irq(&p->pi_lock); | 837 | raw_spin_unlock_irq(&p->pi_lock); |
838 | put_task_struct(p); | 838 | put_task_struct(p); |
839 | return ret; | 839 | return ret; |
840 | } | 840 | } |
841 | 841 | ||
842 | pi_state = alloc_pi_state(); | 842 | pi_state = alloc_pi_state(); |
843 | 843 | ||
844 | /* | 844 | /* |
845 | * Initialize the pi_mutex in locked state and make 'p' | 845 | * Initialize the pi_mutex in locked state and make 'p' |
846 | * the owner of it: | 846 | * the owner of it: |
847 | */ | 847 | */ |
848 | rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); | 848 | rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); |
849 | 849 | ||
850 | /* Store the key for possible exit cleanups: */ | 850 | /* Store the key for possible exit cleanups: */ |
851 | pi_state->key = *key; | 851 | pi_state->key = *key; |
852 | 852 | ||
853 | WARN_ON(!list_empty(&pi_state->list)); | 853 | WARN_ON(!list_empty(&pi_state->list)); |
854 | list_add(&pi_state->list, &p->pi_state_list); | 854 | list_add(&pi_state->list, &p->pi_state_list); |
855 | pi_state->owner = p; | 855 | pi_state->owner = p; |
856 | raw_spin_unlock_irq(&p->pi_lock); | 856 | raw_spin_unlock_irq(&p->pi_lock); |
857 | 857 | ||
858 | put_task_struct(p); | 858 | put_task_struct(p); |
859 | 859 | ||
860 | *ps = pi_state; | 860 | *ps = pi_state; |
861 | 861 | ||
862 | return 0; | 862 | return 0; |
863 | } | 863 | } |
864 | 864 | ||
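Editor's note: every check in the pi_state lookup above parses the same 32-bit futex word. A minimal user-space sketch (an editorial addition, not part of this patch) decoding that word with the uapi constants from <linux/futex.h>:

#include <stdint.h>
#include <stdio.h>
#include <linux/futex.h>	/* FUTEX_TID_MASK, FUTEX_WAITERS, FUTEX_OWNER_DIED */

static void decode_futex_word(uint32_t uval)
{
	/* Low 30 bits: TID of the user space owner, 0 when unlocked. */
	printf("owner tid : %u\n", uval & FUTEX_TID_MASK);
	/* Bit 31: waiters are queued, unlock must go through the kernel. */
	printf("waiters   : %d\n", !!(uval & FUTEX_WAITERS));
	/* Bit 30: the previous owner died without unlocking. */
	printf("owner died: %d\n", !!(uval & FUTEX_OWNER_DIED));
}
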
865 | /** | 865 | /** |
866 | * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex | 866 | * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex |
867 | * @uaddr: the pi futex user address | 867 | * @uaddr: the pi futex user address |
868 | * @hb: the pi futex hash bucket | 868 | * @hb: the pi futex hash bucket |
869 | * @key: the futex key associated with uaddr and hb | 869 | * @key: the futex key associated with uaddr and hb |
870 | * @ps: the pi_state pointer where we store the result of the | 870 | * @ps: the pi_state pointer where we store the result of the |
871 | * lookup | 871 | * lookup |
872 | * @task: the task to perform the atomic lock work for. This will | 872 | * @task: the task to perform the atomic lock work for. This will |
873 | * be "current" except in the case of requeue pi. | 873 | * be "current" except in the case of requeue pi. |
874 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | 874 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) |
875 | * | 875 | * |
876 | * Return: | 876 | * Return: |
877 | * 0 - ready to wait; | 877 | * 0 - ready to wait; |
878 | * 1 - acquired the lock; | 878 | * 1 - acquired the lock; |
879 | * <0 - error | 879 | * <0 - error |
880 | * | 880 | * |
881 | * The hb->lock and futex_key refs shall be held by the caller. | 881 | * The hb->lock and futex_key refs shall be held by the caller. |
882 | */ | 882 | */ |
883 | static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | 883 | static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, |
884 | union futex_key *key, | 884 | union futex_key *key, |
885 | struct futex_pi_state **ps, | 885 | struct futex_pi_state **ps, |
886 | struct task_struct *task, int set_waiters) | 886 | struct task_struct *task, int set_waiters) |
887 | { | 887 | { |
888 | int lock_taken, ret, force_take = 0; | 888 | int lock_taken, ret, force_take = 0; |
889 | u32 uval, newval, curval, vpid = task_pid_vnr(task); | 889 | u32 uval, newval, curval, vpid = task_pid_vnr(task); |
890 | 890 | ||
891 | retry: | 891 | retry: |
892 | ret = lock_taken = 0; | 892 | ret = lock_taken = 0; |
893 | 893 | ||
894 | /* | 894 | /* |
895 | * To avoid races, we attempt to take the lock here again | 895 | * To avoid races, we attempt to take the lock here again |
896 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | 896 | * (by doing a 0 -> TID atomic cmpxchg), while holding all |
897 | * the locks. It will most likely not succeed. | 897 | * the locks. It will most likely not succeed. |
898 | */ | 898 | */ |
899 | newval = vpid; | 899 | newval = vpid; |
900 | if (set_waiters) | 900 | if (set_waiters) |
901 | newval |= FUTEX_WAITERS; | 901 | newval |= FUTEX_WAITERS; |
902 | 902 | ||
903 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) | 903 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) |
904 | return -EFAULT; | 904 | return -EFAULT; |
905 | 905 | ||
906 | /* | 906 | /* |
907 | * Detect deadlocks. | 907 | * Detect deadlocks. |
908 | */ | 908 | */ |
909 | if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) | 909 | if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) |
910 | return -EDEADLK; | 910 | return -EDEADLK; |
911 | 911 | ||
912 | /* | 912 | /* |
913 | * Surprise - we got the lock, but we do not trust user space at all. | 913 | * Surprise - we got the lock, but we do not trust user space at all. |
914 | */ | 914 | */ |
915 | if (unlikely(!curval)) { | 915 | if (unlikely(!curval)) { |
916 | /* | 916 | /* |
917 | * We verify whether there is kernel state for this | 917 | * We verify whether there is kernel state for this |
918 | * futex. If not, we can safely assume that the 0 -> | 918 | * futex. If not, we can safely assume that the 0 -> |
919 | * TID transition is correct. If state exists, we do | 919 | * TID transition is correct. If state exists, we do |
920 | * not bother to fix up the user space state as it was | 920 | * not bother to fix up the user space state as it was |
921 | * corrupted already. | 921 | * corrupted already. |
922 | */ | 922 | */ |
923 | return futex_top_waiter(hb, key) ? -EINVAL : 1; | 923 | return futex_top_waiter(hb, key) ? -EINVAL : 1; |
924 | } | 924 | } |
925 | 925 | ||
926 | uval = curval; | 926 | uval = curval; |
927 | 927 | ||
928 | /* | 928 | /* |
929 | * Set the FUTEX_WAITERS flag, so the owner will know it has someone | 929 | * Set the FUTEX_WAITERS flag, so the owner will know it has someone |
930 | * to wake at the next unlock. | 930 | * to wake at the next unlock. |
931 | */ | 931 | */ |
932 | newval = curval | FUTEX_WAITERS; | 932 | newval = curval | FUTEX_WAITERS; |
933 | 933 | ||
934 | /* | 934 | /* |
935 | * Should we force take the futex? See below. | 935 | * Should we force take the futex? See below. |
936 | */ | 936 | */ |
937 | if (unlikely(force_take)) { | 937 | if (unlikely(force_take)) { |
938 | /* | 938 | /* |
939 | * Keep the OWNER_DIED and the WAITERS bit and set the | 939 | * Keep the OWNER_DIED and the WAITERS bit and set the |
940 | * new TID value. | 940 | * new TID value. |
941 | */ | 941 | */ |
942 | newval = (curval & ~FUTEX_TID_MASK) | vpid; | 942 | newval = (curval & ~FUTEX_TID_MASK) | vpid; |
943 | force_take = 0; | 943 | force_take = 0; |
944 | lock_taken = 1; | 944 | lock_taken = 1; |
945 | } | 945 | } |
946 | 946 | ||
947 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) | 947 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) |
948 | return -EFAULT; | 948 | return -EFAULT; |
949 | if (unlikely(curval != uval)) | 949 | if (unlikely(curval != uval)) |
950 | goto retry; | 950 | goto retry; |
951 | 951 | ||
952 | /* | 952 | /* |
953 | * We took the lock due to forced takeover. | 953 | * We took the lock due to forced takeover. |
954 | */ | 954 | */ |
955 | if (unlikely(lock_taken)) | 955 | if (unlikely(lock_taken)) |
956 | return 1; | 956 | return 1; |
957 | 957 | ||
958 | /* | 958 | /* |
959 | * We don't have the lock. Look up the PI state (or create it if | 959 | * We don't have the lock. Look up the PI state (or create it if |
960 | * we are the first waiter): | 960 | * we are the first waiter): |
961 | */ | 961 | */ |
962 | ret = lookup_pi_state(uval, hb, key, ps, task); | 962 | ret = lookup_pi_state(uval, hb, key, ps, task); |
963 | 963 | ||
964 | if (unlikely(ret)) { | 964 | if (unlikely(ret)) { |
965 | switch (ret) { | 965 | switch (ret) { |
966 | case -ESRCH: | 966 | case -ESRCH: |
967 | /* | 967 | /* |
968 | * We failed to find an owner for this | 968 | * We failed to find an owner for this |
969 | * futex. So we have no pi_state to block | 969 | * futex. So we have no pi_state to block |
970 | * on. This can happen in two cases: | 970 | * on. This can happen in two cases: |
971 | * | 971 | * |
972 | * 1) The owner died | 972 | * 1) The owner died |
973 | * 2) A stale FUTEX_WAITERS bit | 973 | * 2) A stale FUTEX_WAITERS bit |
974 | * | 974 | * |
975 | * Re-read the futex value. | 975 | * Re-read the futex value. |
976 | */ | 976 | */ |
977 | if (get_futex_value_locked(&curval, uaddr)) | 977 | if (get_futex_value_locked(&curval, uaddr)) |
978 | return -EFAULT; | 978 | return -EFAULT; |
979 | 979 | ||
980 | /* | 980 | /* |
981 | * If the owner died or we have a stale | 981 | * If the owner died or we have a stale |
982 | * WAITERS bit the owner TID in the user space | 982 | * WAITERS bit the owner TID in the user space |
983 | * futex is 0. | 983 | * futex is 0. |
984 | */ | 984 | */ |
985 | if (!(curval & FUTEX_TID_MASK)) { | 985 | if (!(curval & FUTEX_TID_MASK)) { |
986 | force_take = 1; | 986 | force_take = 1; |
987 | goto retry; | 987 | goto retry; |
988 | } | 988 | } |
989 | default: | 989 | default: |
990 | break; | 990 | break; |
991 | } | 991 | } |
992 | } | 992 | } |
993 | 993 | ||
994 | return ret; | 994 | return ret; |
995 | } | 995 | } |
996 | 996 | ||
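Editor's sketch: the 0 -> TID cmpxchg that futex_lock_pi_atomic() retries under the hash bucket lock is the same transition user space attempts first. A hedged user-space counterpart (hypothetical helper, not from this patch; error handling omitted):

#define _GNU_SOURCE
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static void pi_lock(uint32_t *uaddr)
{
	uint32_t zero = 0;
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* Uncontended fast path: 0 -> TID without entering the kernel. */
	if (__atomic_compare_exchange_n(uaddr, &zero, tid, 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;

	/* Contended: the kernel queues us, boosts the owner, and writes
	 * our TID (plus status bits) into *uaddr once we own the lock. */
	syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}
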
997 | /** | 997 | /** |
998 | * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket | 998 | * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket |
999 | * @q: The futex_q to unqueue | 999 | * @q: The futex_q to unqueue |
1000 | * | 1000 | * |
1001 | * The q->lock_ptr must not be NULL and must be held by the caller. | 1001 | * The q->lock_ptr must not be NULL and must be held by the caller. |
1002 | */ | 1002 | */ |
1003 | static void __unqueue_futex(struct futex_q *q) | 1003 | static void __unqueue_futex(struct futex_q *q) |
1004 | { | 1004 | { |
1005 | struct futex_hash_bucket *hb; | 1005 | struct futex_hash_bucket *hb; |
1006 | 1006 | ||
1007 | if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) | 1007 | if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) |
1008 | || WARN_ON(plist_node_empty(&q->list))) | 1008 | || WARN_ON(plist_node_empty(&q->list))) |
1009 | return; | 1009 | return; |
1010 | 1010 | ||
1011 | hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); | 1011 | hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); |
1012 | plist_del(&q->list, &hb->chain); | 1012 | plist_del(&q->list, &hb->chain); |
1013 | hb_waiters_dec(hb); | 1013 | hb_waiters_dec(hb); |
1014 | } | 1014 | } |
1015 | 1015 | ||
1016 | /* | 1016 | /* |
1017 | * The hash bucket lock must be held when this is called. | 1017 | * The hash bucket lock must be held when this is called. |
1018 | * Afterwards, the futex_q must not be accessed. | 1018 | * Afterwards, the futex_q must not be accessed. |
1019 | */ | 1019 | */ |
1020 | static void wake_futex(struct futex_q *q) | 1020 | static void wake_futex(struct futex_q *q) |
1021 | { | 1021 | { |
1022 | struct task_struct *p = q->task; | 1022 | struct task_struct *p = q->task; |
1023 | 1023 | ||
1024 | if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) | 1024 | if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) |
1025 | return; | 1025 | return; |
1026 | 1026 | ||
1027 | /* | 1027 | /* |
1028 | * We set q->lock_ptr = NULL _before_ we wake up the task. If | 1028 | * We set q->lock_ptr = NULL _before_ we wake up the task. If |
1029 | * a non-futex wakeup happens on another CPU, then the task | 1029 | * a non-futex wakeup happens on another CPU, then the task |
1030 | * might exit and p would dereference a non-existing task | 1030 | * might exit and p would dereference a non-existing task |
1031 | * struct. Prevent this by holding a reference on p across the | 1031 | * struct. Prevent this by holding a reference on p across the |
1032 | * wake up. | 1032 | * wake up. |
1033 | */ | 1033 | */ |
1034 | get_task_struct(p); | 1034 | get_task_struct(p); |
1035 | 1035 | ||
1036 | __unqueue_futex(q); | 1036 | __unqueue_futex(q); |
1037 | /* | 1037 | /* |
1038 | * The waiting task can free the futex_q as soon as | 1038 | * The waiting task can free the futex_q as soon as |
1039 | * q->lock_ptr = NULL is written, without taking any locks. A | 1039 | * q->lock_ptr = NULL is written, without taking any locks. A |
1040 | * memory barrier is required here to prevent the following | 1040 | * memory barrier is required here to prevent the following |
1041 | * store to lock_ptr from getting ahead of the plist_del. | 1041 | * store to lock_ptr from getting ahead of the plist_del. |
1042 | */ | 1042 | */ |
1043 | smp_wmb(); | 1043 | smp_wmb(); |
1044 | q->lock_ptr = NULL; | 1044 | q->lock_ptr = NULL; |
1045 | 1045 | ||
1046 | wake_up_state(p, TASK_NORMAL); | 1046 | wake_up_state(p, TASK_NORMAL); |
1047 | put_task_struct(p); | 1047 | put_task_struct(p); |
1048 | } | 1048 | } |
1049 | 1049 | ||
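Editor's note: the barrier/NULL handshake above is consumed on the waiter side. Condensed from the unqueue_me() retry loop further down in this file (comments are the editor's):

	spinlock_t *lock_ptr;
retry:
	lock_ptr = q->lock_ptr;		/* NULL: wake_futex() already ran */
	barrier();
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between the read above and the
		 * spin_lock(); recheck under the lock and start over if
		 * we raced with a wakeup or a requeue.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		/* Still queued: safe to __unqueue_futex(q) under the lock. */
	}
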
1050 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | 1050 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) |
1051 | { | 1051 | { |
1052 | struct task_struct *new_owner; | 1052 | struct task_struct *new_owner; |
1053 | struct futex_pi_state *pi_state = this->pi_state; | 1053 | struct futex_pi_state *pi_state = this->pi_state; |
1054 | u32 uninitialized_var(curval), newval; | 1054 | u32 uninitialized_var(curval), newval; |
1055 | int ret = 0; | ||
1055 | 1056 | ||
1056 | if (!pi_state) | 1057 | if (!pi_state) |
1057 | return -EINVAL; | 1058 | return -EINVAL; |
1058 | 1059 | ||
1059 | /* | 1060 | /* |
1060 | * If current does not own the pi_state then the futex is | 1061 | * If current does not own the pi_state then the futex is |
1061 | * inconsistent and user space fiddled with the futex value. | 1062 | * inconsistent and user space fiddled with the futex value. |
1062 | */ | 1063 | */ |
1063 | if (pi_state->owner != current) | 1064 | if (pi_state->owner != current) |
1064 | return -EINVAL; | 1065 | return -EINVAL; |
1065 | 1066 | ||
1066 | raw_spin_lock(&pi_state->pi_mutex.wait_lock); | 1067 | raw_spin_lock(&pi_state->pi_mutex.wait_lock); |
1067 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | 1068 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); |
1068 | 1069 | ||
1069 | /* | 1070 | /* |
1070 | * It is possible that the next waiter (the one that brought | 1071 | * It is possible that the next waiter (the one that brought |
1071 | * this owner to the kernel) timed out and is no longer | 1072 | * this owner to the kernel) timed out and is no longer |
1072 | * waiting on the lock. | 1073 | * waiting on the lock. |
1073 | */ | 1074 | */ |
1074 | if (!new_owner) | 1075 | if (!new_owner) |
1075 | new_owner = this->task; | 1076 | new_owner = this->task; |
1076 | 1077 | ||
1077 | /* | 1078 | /* |
1078 | * We pass it to the next owner. (The WAITERS bit is always | 1079 | * We pass it to the next owner. The WAITERS bit is always |
1079 | * kept enabled while there is PI state around. We must also | 1080 | * kept enabled while there is PI state around. We clean up the |
1080 | * preserve the owner died bit.) | 1081 | * owner died bit because we are the owner. |
1081 | */ | 1082 | */ |
1082 | if (!(uval & FUTEX_OWNER_DIED)) { | 1083 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); |
1083 | int ret = 0; | ||
1084 | 1084 | ||
1085 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); | 1085 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
1086 | 1086 | ret = -EFAULT; | |
1087 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) | 1087 | else if (curval != uval) |
1088 | ret = -EFAULT; | 1088 | ret = -EINVAL; |
1089 | else if (curval != uval) | 1089 | if (ret) { |
1090 | ret = -EINVAL; | 1090 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); |
1091 | if (ret) { | 1091 | return ret; |
1092 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); | ||
1093 | return ret; | ||
1094 | } | ||
1095 | } | 1092 | } |
1096 | 1093 | ||
1097 | raw_spin_lock_irq(&pi_state->owner->pi_lock); | 1094 | raw_spin_lock_irq(&pi_state->owner->pi_lock); |
1098 | WARN_ON(list_empty(&pi_state->list)); | 1095 | WARN_ON(list_empty(&pi_state->list)); |
1099 | list_del_init(&pi_state->list); | 1096 | list_del_init(&pi_state->list); |
1100 | raw_spin_unlock_irq(&pi_state->owner->pi_lock); | 1097 | raw_spin_unlock_irq(&pi_state->owner->pi_lock); |
1101 | 1098 | ||
1102 | raw_spin_lock_irq(&new_owner->pi_lock); | 1099 | raw_spin_lock_irq(&new_owner->pi_lock); |
1103 | WARN_ON(!list_empty(&pi_state->list)); | 1100 | WARN_ON(!list_empty(&pi_state->list)); |
1104 | list_add(&pi_state->list, &new_owner->pi_state_list); | 1101 | list_add(&pi_state->list, &new_owner->pi_state_list); |
1105 | pi_state->owner = new_owner; | 1102 | pi_state->owner = new_owner; |
1106 | raw_spin_unlock_irq(&new_owner->pi_lock); | 1103 | raw_spin_unlock_irq(&new_owner->pi_lock); |
1107 | 1104 | ||
1108 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); | 1105 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); |
1109 | rt_mutex_unlock(&pi_state->pi_mutex); | 1106 | rt_mutex_unlock(&pi_state->pi_mutex); |
1110 | 1107 | ||
1111 | return 0; | 1108 | return 0; |
1112 | } | 1109 | } |
1113 | 1110 | ||
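Editor's worked example of what this change fixes, with illustrative TIDs (not taken from the patch):

/*
 * uval on entry to wake_futex_pi():
 *
 *   FUTEX_OWNER_DIED | FUTEX_WAITERS | 1234
 *   (1234 == TID of current, the unlocking task)
 *
 * Before this patch the cmpxchg was skipped whenever FUTEX_OWNER_DIED
 * was set, so user space kept seeing owner TID 1234 even after
 * pi_state->owner had been handed to the woken waiter - the observable
 * inconsistent state the changelog describes.
 *
 * With the patch the store is unconditional:
 *
 *   newval = FUTEX_WAITERS | task_pid_vnr(new_owner);   (e.g. 5678)
 *
 * so the user space word and the kernel pi_state agree again.
 */
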
1114 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | 1111 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) |
1115 | { | 1112 | { |
1116 | u32 uninitialized_var(oldval); | 1113 | u32 uninitialized_var(oldval); |
1117 | 1114 | ||
1118 | /* | 1115 | /* |
1119 | * There is no waiter, so we unlock the futex. The owner died | 1116 | * There is no waiter, so we unlock the futex. The owner died |
1120 | * bit need not be preserved here. We are the owner: | 1117 | * bit need not be preserved here. We are the owner: |
1121 | */ | 1118 | */ |
1122 | if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) | 1119 | if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) |
1123 | return -EFAULT; | 1120 | return -EFAULT; |
1124 | if (oldval != uval) | 1121 | if (oldval != uval) |
1125 | return -EAGAIN; | 1122 | return -EAGAIN; |
1126 | 1123 | ||
1127 | return 0; | 1124 | return 0; |
1128 | } | 1125 | } |
1129 | 1126 | ||
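Editor's sketch of the matching user-space unlock fast path (hypothetical helper mirroring the pi_lock() sketch above; same includes assumed):

static void pi_unlock(uint32_t *uaddr)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* TID -> 0 succeeds only while no waiter/owner-died bits are set. */
	if (__atomic_compare_exchange_n(uaddr, &tid, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;

	/* Bits are set: let the kernel hand the lock to the top waiter. */
	syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}
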
1130 | /* | 1127 | /* |
1131 | * Express the locking dependencies for lockdep: | 1128 | * Express the locking dependencies for lockdep: |
1132 | */ | 1129 | */ |
1133 | static inline void | 1130 | static inline void |
1134 | double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) | 1131 | double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) |
1135 | { | 1132 | { |
1136 | if (hb1 <= hb2) { | 1133 | if (hb1 <= hb2) { |
1137 | spin_lock(&hb1->lock); | 1134 | spin_lock(&hb1->lock); |
1138 | if (hb1 < hb2) | 1135 | if (hb1 < hb2) |
1139 | spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); | 1136 | spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); |
1140 | } else { /* hb1 > hb2 */ | 1137 | } else { /* hb1 > hb2 */ |
1141 | spin_lock(&hb2->lock); | 1138 | spin_lock(&hb2->lock); |
1142 | spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); | 1139 | spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); |
1143 | } | 1140 | } |
1144 | } | 1141 | } |
1145 | 1142 | ||
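Editor's note: a short worked example of the ordering trick above.

/*
 * Locking in address order prevents the classic ABBA deadlock.
 * Whatever order callers pass the buckets in, both CPUs take the
 * lower-addressed lock first (assuming hb_a < hb_b):
 *
 *   CPU0: double_lock_hb(hb_a, hb_b);  - locks hb_a->lock, then hb_b->lock
 *   CPU1: double_lock_hb(hb_b, hb_a);  - also locks hb_a->lock first
 *
 * The hb1 == hb2 case takes the single lock exactly once.
 */
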
1146 | static inline void | 1143 | static inline void |
1147 | double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) | 1144 | double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) |
1148 | { | 1145 | { |
1149 | spin_unlock(&hb1->lock); | 1146 | spin_unlock(&hb1->lock); |
1150 | if (hb1 != hb2) | 1147 | if (hb1 != hb2) |
1151 | spin_unlock(&hb2->lock); | 1148 | spin_unlock(&hb2->lock); |
1152 | } | 1149 | } |
1153 | 1150 | ||
1154 | /* | 1151 | /* |
1155 | * Wake up waiters matching bitset queued on this futex (uaddr). | 1152 | * Wake up waiters matching bitset queued on this futex (uaddr). |
1156 | */ | 1153 | */ |
1157 | static int | 1154 | static int |
1158 | futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | 1155 | futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) |
1159 | { | 1156 | { |
1160 | struct futex_hash_bucket *hb; | 1157 | struct futex_hash_bucket *hb; |
1161 | struct futex_q *this, *next; | 1158 | struct futex_q *this, *next; |
1162 | union futex_key key = FUTEX_KEY_INIT; | 1159 | union futex_key key = FUTEX_KEY_INIT; |
1163 | int ret; | 1160 | int ret; |
1164 | 1161 | ||
1165 | if (!bitset) | 1162 | if (!bitset) |
1166 | return -EINVAL; | 1163 | return -EINVAL; |
1167 | 1164 | ||
1168 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); | 1165 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); |
1169 | if (unlikely(ret != 0)) | 1166 | if (unlikely(ret != 0)) |
1170 | goto out; | 1167 | goto out; |
1171 | 1168 | ||
1172 | hb = hash_futex(&key); | 1169 | hb = hash_futex(&key); |
1173 | 1170 | ||
1174 | /* Make sure we really have tasks to wake up */ | 1171 | /* Make sure we really have tasks to wake up */ |
1175 | if (!hb_waiters_pending(hb)) | 1172 | if (!hb_waiters_pending(hb)) |
1176 | goto out_put_key; | 1173 | goto out_put_key; |
1177 | 1174 | ||
1178 | spin_lock(&hb->lock); | 1175 | spin_lock(&hb->lock); |
1179 | 1176 | ||
1180 | plist_for_each_entry_safe(this, next, &hb->chain, list) { | 1177 | plist_for_each_entry_safe(this, next, &hb->chain, list) { |
1181 | if (match_futex(&this->key, &key)) { | 1178 | if (match_futex(&this->key, &key)) { |
1182 | if (this->pi_state || this->rt_waiter) { | 1179 | if (this->pi_state || this->rt_waiter) { |
1183 | ret = -EINVAL; | 1180 | ret = -EINVAL; |
1184 | break; | 1181 | break; |
1185 | } | 1182 | } |
1186 | 1183 | ||
1187 | /* Check if one of the bits is set in both bitsets */ | 1184 | /* Check if one of the bits is set in both bitsets */ |
1188 | if (!(this->bitset & bitset)) | 1185 | if (!(this->bitset & bitset)) |
1189 | continue; | 1186 | continue; |
1190 | 1187 | ||
1191 | wake_futex(this); | 1188 | wake_futex(this); |
1192 | if (++ret >= nr_wake) | 1189 | if (++ret >= nr_wake) |
1193 | break; | 1190 | break; |
1194 | } | 1191 | } |
1195 | } | 1192 | } |
1196 | 1193 | ||
1197 | spin_unlock(&hb->lock); | 1194 | spin_unlock(&hb->lock); |
1198 | out_put_key: | 1195 | out_put_key: |
1199 | put_futex_key(&key); | 1196 | put_futex_key(&key); |
1200 | out: | 1197 | out: |
1201 | return ret; | 1198 | return ret; |
1202 | } | 1199 | } |
1203 | 1200 | ||
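Editor's sketch of how user space reaches futex_wake(): FUTEX_WAKE wakes with bitset = FUTEX_BITSET_MATCH_ANY, while FUTEX_WAKE_BITSET passes the caller's bitset in val3 (hypothetical helpers; same includes as the earlier sketches plus <limits.h>):

static long wake_one(uint32_t *uaddr)
{
	/* Wake at most one waiter, regardless of its wait bitset. */
	return syscall(SYS_futex, uaddr, FUTEX_WAKE, 1, NULL, NULL, 0);
}

static long wake_group(uint32_t *uaddr, uint32_t bitset)
{
	/* Wake only waiters whose FUTEX_WAIT_BITSET bits intersect ours. */
	return syscall(SYS_futex, uaddr, FUTEX_WAKE_BITSET, INT_MAX,
		       NULL, NULL, bitset);
}
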
1204 | /* | 1201 | /* |
1205 | * Wake up all waiters hashed on the physical page that is mapped | 1202 | * Wake up all waiters hashed on the physical page that is mapped |
1206 | * to this virtual address: | 1203 | * to this virtual address: |
1207 | */ | 1204 | */ |
1208 | static int | 1205 | static int |
1209 | futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, | 1206 | futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, |
1210 | int nr_wake, int nr_wake2, int op) | 1207 | int nr_wake, int nr_wake2, int op) |
1211 | { | 1208 | { |
1212 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 1209 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
1213 | struct futex_hash_bucket *hb1, *hb2; | 1210 | struct futex_hash_bucket *hb1, *hb2; |
1214 | struct futex_q *this, *next; | 1211 | struct futex_q *this, *next; |
1215 | int ret, op_ret; | 1212 | int ret, op_ret; |
1216 | 1213 | ||
1217 | retry: | 1214 | retry: |
1218 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); | 1215 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); |
1219 | if (unlikely(ret != 0)) | 1216 | if (unlikely(ret != 0)) |
1220 | goto out; | 1217 | goto out; |
1221 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); | 1218 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); |
1222 | if (unlikely(ret != 0)) | 1219 | if (unlikely(ret != 0)) |
1223 | goto out_put_key1; | 1220 | goto out_put_key1; |
1224 | 1221 | ||
1225 | hb1 = hash_futex(&key1); | 1222 | hb1 = hash_futex(&key1); |
1226 | hb2 = hash_futex(&key2); | 1223 | hb2 = hash_futex(&key2); |
1227 | 1224 | ||
1228 | retry_private: | 1225 | retry_private: |
1229 | double_lock_hb(hb1, hb2); | 1226 | double_lock_hb(hb1, hb2); |
1230 | op_ret = futex_atomic_op_inuser(op, uaddr2); | 1227 | op_ret = futex_atomic_op_inuser(op, uaddr2); |
1231 | if (unlikely(op_ret < 0)) { | 1228 | if (unlikely(op_ret < 0)) { |
1232 | 1229 | ||
1233 | double_unlock_hb(hb1, hb2); | 1230 | double_unlock_hb(hb1, hb2); |
1234 | 1231 | ||
1235 | #ifndef CONFIG_MMU | 1232 | #ifndef CONFIG_MMU |
1236 | /* | 1233 | /* |
1237 | * we don't get EFAULT from MMU faults if we don't have an MMU, | 1234 | * we don't get EFAULT from MMU faults if we don't have an MMU, |
1238 | * but we might get them from range checking | 1235 | * but we might get them from range checking |
1239 | */ | 1236 | */ |
1240 | ret = op_ret; | 1237 | ret = op_ret; |
1241 | goto out_put_keys; | 1238 | goto out_put_keys; |
1242 | #endif | 1239 | #endif |
1243 | 1240 | ||
1244 | if (unlikely(op_ret != -EFAULT)) { | 1241 | if (unlikely(op_ret != -EFAULT)) { |
1245 | ret = op_ret; | 1242 | ret = op_ret; |
1246 | goto out_put_keys; | 1243 | goto out_put_keys; |
1247 | } | 1244 | } |
1248 | 1245 | ||
1249 | ret = fault_in_user_writeable(uaddr2); | 1246 | ret = fault_in_user_writeable(uaddr2); |
1250 | if (ret) | 1247 | if (ret) |
1251 | goto out_put_keys; | 1248 | goto out_put_keys; |
1252 | 1249 | ||
1253 | if (!(flags & FLAGS_SHARED)) | 1250 | if (!(flags & FLAGS_SHARED)) |
1254 | goto retry_private; | 1251 | goto retry_private; |
1255 | 1252 | ||
1256 | put_futex_key(&key2); | 1253 | put_futex_key(&key2); |
1257 | put_futex_key(&key1); | 1254 | put_futex_key(&key1); |
1258 | goto retry; | 1255 | goto retry; |
1259 | } | 1256 | } |
1260 | 1257 | ||
1261 | plist_for_each_entry_safe(this, next, &hb1->chain, list) { | 1258 | plist_for_each_entry_safe(this, next, &hb1->chain, list) { |
1262 | if (match_futex(&this->key, &key1)) { | 1259 | if (match_futex(&this->key, &key1)) { |
1263 | if (this->pi_state || this->rt_waiter) { | 1260 | if (this->pi_state || this->rt_waiter) { |
1264 | ret = -EINVAL; | 1261 | ret = -EINVAL; |
1265 | goto out_unlock; | 1262 | goto out_unlock; |
1266 | } | 1263 | } |
1267 | wake_futex(this); | 1264 | wake_futex(this); |
1268 | if (++ret >= nr_wake) | 1265 | if (++ret >= nr_wake) |
1269 | break; | 1266 | break; |
1270 | } | 1267 | } |
1271 | } | 1268 | } |
1272 | 1269 | ||
1273 | if (op_ret > 0) { | 1270 | if (op_ret > 0) { |
1274 | op_ret = 0; | 1271 | op_ret = 0; |
1275 | plist_for_each_entry_safe(this, next, &hb2->chain, list) { | 1272 | plist_for_each_entry_safe(this, next, &hb2->chain, list) { |
1276 | if (match_futex(&this->key, &key2)) { | 1273 | if (match_futex(&this->key, &key2)) { |
1277 | if (this->pi_state || this->rt_waiter) { | 1274 | if (this->pi_state || this->rt_waiter) { |
1278 | ret = -EINVAL; | 1275 | ret = -EINVAL; |
1279 | goto out_unlock; | 1276 | goto out_unlock; |
1280 | } | 1277 | } |
1281 | wake_futex(this); | 1278 | wake_futex(this); |
1282 | if (++op_ret >= nr_wake2) | 1279 | if (++op_ret >= nr_wake2) |
1283 | break; | 1280 | break; |
1284 | } | 1281 | } |
1285 | } | 1282 | } |
1286 | ret += op_ret; | 1283 | ret += op_ret; |
1287 | } | 1284 | } |
1288 | 1285 | ||
1289 | out_unlock: | 1286 | out_unlock: |
1290 | double_unlock_hb(hb1, hb2); | 1287 | double_unlock_hb(hb1, hb2); |
1291 | out_put_keys: | 1288 | out_put_keys: |
1292 | put_futex_key(&key2); | 1289 | put_futex_key(&key2); |
1293 | out_put_key1: | 1290 | out_put_key1: |
1294 | put_futex_key(&key1); | 1291 | put_futex_key(&key1); |
1295 | out: | 1292 | out: |
1296 | return ret; | 1293 | return ret; |
1297 | } | 1294 | } |
1298 | 1295 | ||
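Editor's sketch of a typical FUTEX_WAKE_OP call (the glibc-style wake-two-words pattern; uaddr1 and uaddr2 are assumed to be declared by the caller). The op is built with the FUTEX_OP() encoding from <linux/futex.h>; note that nr_wake2 travels in the timeout slot:

/* Set *uaddr2 = 0, wake one waiter on uaddr1, and wake one waiter on
 * uaddr2 as well if the old value of *uaddr2 was greater than 1. */
syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, 1,
	(void *)(unsigned long)1,	/* nr_wake2, in the timeout slot */
	uaddr2,
	FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 1));
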
1299 | /** | 1296 | /** |
1300 | * requeue_futex() - Requeue a futex_q from one hb to another | 1297 | * requeue_futex() - Requeue a futex_q from one hb to another |
1301 | * @q: the futex_q to requeue | 1298 | * @q: the futex_q to requeue |
1302 | * @hb1: the source hash_bucket | 1299 | * @hb1: the source hash_bucket |
1303 | * @hb2: the target hash_bucket | 1300 | * @hb2: the target hash_bucket |
1304 | * @key2: the new key for the requeued futex_q | 1301 | * @key2: the new key for the requeued futex_q |
1305 | */ | 1302 | */ |
1306 | static inline | 1303 | static inline |
1307 | void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, | 1304 | void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, |
1308 | struct futex_hash_bucket *hb2, union futex_key *key2) | 1305 | struct futex_hash_bucket *hb2, union futex_key *key2) |
1309 | { | 1306 | { |
1310 | 1307 | ||
1311 | /* | 1308 | /* |
1312 | * If key1 and key2 hash to the same bucket, no need to | 1309 | * If key1 and key2 hash to the same bucket, no need to |
1313 | * requeue. | 1310 | * requeue. |
1314 | */ | 1311 | */ |
1315 | if (likely(&hb1->chain != &hb2->chain)) { | 1312 | if (likely(&hb1->chain != &hb2->chain)) { |
1316 | plist_del(&q->list, &hb1->chain); | 1313 | plist_del(&q->list, &hb1->chain); |
1317 | hb_waiters_dec(hb1); | 1314 | hb_waiters_dec(hb1); |
1318 | plist_add(&q->list, &hb2->chain); | 1315 | plist_add(&q->list, &hb2->chain); |
1319 | hb_waiters_inc(hb2); | 1316 | hb_waiters_inc(hb2); |
1320 | q->lock_ptr = &hb2->lock; | 1317 | q->lock_ptr = &hb2->lock; |
1321 | } | 1318 | } |
1322 | get_futex_key_refs(key2); | 1319 | get_futex_key_refs(key2); |
1323 | q->key = *key2; | 1320 | q->key = *key2; |
1324 | } | 1321 | } |
1325 | 1322 | ||
1326 | /** | 1323 | /** |
1327 | * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue | 1324 | * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue |
1328 | * @q: the futex_q | 1325 | * @q: the futex_q |
1329 | * @key: the key of the requeue target futex | 1326 | * @key: the key of the requeue target futex |
1330 | * @hb: the hash_bucket of the requeue target futex | 1327 | * @hb: the hash_bucket of the requeue target futex |
1331 | * | 1328 | * |
1332 | * During futex_requeue, with requeue_pi=1, it is possible to acquire the | 1329 | * During futex_requeue, with requeue_pi=1, it is possible to acquire the |
1333 | * target futex if it is uncontended or via a lock steal. Set the futex_q key | 1330 | * target futex if it is uncontended or via a lock steal. Set the futex_q key |
1334 | * to the requeue target futex so the waiter can detect the wakeup on the right | 1331 | * to the requeue target futex so the waiter can detect the wakeup on the right |
1335 | * futex, but remove it from the hb and NULL the rt_waiter so it can detect | 1332 | * futex, but remove it from the hb and NULL the rt_waiter so it can detect |
1336 | * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock | 1333 | * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock |
1337 | * to protect access to the pi_state to fixup the owner later. Must be called | 1334 | * to protect access to the pi_state to fixup the owner later. Must be called |
1338 | * with both q->lock_ptr and hb->lock held. | 1335 | * with both q->lock_ptr and hb->lock held. |
1339 | */ | 1336 | */ |
1340 | static inline | 1337 | static inline |
1341 | void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | 1338 | void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, |
1342 | struct futex_hash_bucket *hb) | 1339 | struct futex_hash_bucket *hb) |
1343 | { | 1340 | { |
1344 | get_futex_key_refs(key); | 1341 | get_futex_key_refs(key); |
1345 | q->key = *key; | 1342 | q->key = *key; |
1346 | 1343 | ||
1347 | __unqueue_futex(q); | 1344 | __unqueue_futex(q); |
1348 | 1345 | ||
1349 | WARN_ON(!q->rt_waiter); | 1346 | WARN_ON(!q->rt_waiter); |
1350 | q->rt_waiter = NULL; | 1347 | q->rt_waiter = NULL; |
1351 | 1348 | ||
1352 | q->lock_ptr = &hb->lock; | 1349 | q->lock_ptr = &hb->lock; |
1353 | 1350 | ||
1354 | wake_up_state(q->task, TASK_NORMAL); | 1351 | wake_up_state(q->task, TASK_NORMAL); |
1355 | } | 1352 | } |
1356 | 1353 | ||
1357 | /** | 1354 | /** |
1358 | * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter | 1355 | * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter |
1359 | * @pifutex: the user address of the to futex | 1356 | * @pifutex: the user address of the to futex |
1360 | * @hb1: the from futex hash bucket, must be locked by the caller | 1357 | * @hb1: the from futex hash bucket, must be locked by the caller |
1361 | * @hb2: the to futex hash bucket, must be locked by the caller | 1358 | * @hb2: the to futex hash bucket, must be locked by the caller |
1362 | * @key1: the from futex key | 1359 | * @key1: the from futex key |
1363 | * @key2: the to futex key | 1360 | * @key2: the to futex key |
1364 | * @ps: address to store the pi_state pointer | 1361 | * @ps: address to store the pi_state pointer |
1365 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | 1362 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) |
1366 | * | 1363 | * |
1367 | * Try to get the lock on behalf of the top waiter if we can do it atomically. | 1364 | * Try to get the lock on behalf of the top waiter if we can do it atomically. |
1368 | * Wake the top waiter if we succeed. If the caller specified set_waiters, | 1365 | * Wake the top waiter if we succeed. If the caller specified set_waiters, |
1369 | * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. | 1366 | * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. |
1370 | * hb1 and hb2 must be held by the caller. | 1367 | * hb1 and hb2 must be held by the caller. |
1371 | * | 1368 | * |
1372 | * Return: | 1369 | * Return: |
1373 | * 0 - failed to acquire the lock atomically; | 1370 | * 0 - failed to acquire the lock atomically; |
1374 | * >0 - acquired the lock, return value is vpid of the top_waiter | 1371 | * >0 - acquired the lock, return value is vpid of the top_waiter |
1375 | * <0 - error | 1372 | * <0 - error |
1376 | */ | 1373 | */ |
1377 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | 1374 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, |
1378 | struct futex_hash_bucket *hb1, | 1375 | struct futex_hash_bucket *hb1, |
1379 | struct futex_hash_bucket *hb2, | 1376 | struct futex_hash_bucket *hb2, |
1380 | union futex_key *key1, union futex_key *key2, | 1377 | union futex_key *key1, union futex_key *key2, |
1381 | struct futex_pi_state **ps, int set_waiters) | 1378 | struct futex_pi_state **ps, int set_waiters) |
1382 | { | 1379 | { |
1383 | struct futex_q *top_waiter = NULL; | 1380 | struct futex_q *top_waiter = NULL; |
1384 | u32 curval; | 1381 | u32 curval; |
1385 | int ret, vpid; | 1382 | int ret, vpid; |
1386 | 1383 | ||
1387 | if (get_futex_value_locked(&curval, pifutex)) | 1384 | if (get_futex_value_locked(&curval, pifutex)) |
1388 | return -EFAULT; | 1385 | return -EFAULT; |
1389 | 1386 | ||
1390 | /* | 1387 | /* |
1391 | * Find the top_waiter and determine if there are additional waiters. | 1388 | * Find the top_waiter and determine if there are additional waiters. |
1392 | * If the caller intends to requeue more than 1 waiter to pifutex, | 1389 | * If the caller intends to requeue more than 1 waiter to pifutex, |
1393 | * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, | 1390 | * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, |
1394 | * as we have means to handle the possible fault. If not, don't set | 1391 | * as we have means to handle the possible fault. If not, don't set |
1395 | * the bit unnecessarily as it will force the subsequent unlock to enter | 1392 | * the bit unnecessarily as it will force the subsequent unlock to enter |
1396 | * the kernel. | 1393 | * the kernel. |
1397 | */ | 1394 | */ |
1398 | top_waiter = futex_top_waiter(hb1, key1); | 1395 | top_waiter = futex_top_waiter(hb1, key1); |
1399 | 1396 | ||
1400 | /* There are no waiters, nothing for us to do. */ | 1397 | /* There are no waiters, nothing for us to do. */ |
1401 | if (!top_waiter) | 1398 | if (!top_waiter) |
1402 | return 0; | 1399 | return 0; |
1403 | 1400 | ||
1404 | /* Ensure we requeue to the expected futex. */ | 1401 | /* Ensure we requeue to the expected futex. */ |
1405 | if (!match_futex(top_waiter->requeue_pi_key, key2)) | 1402 | if (!match_futex(top_waiter->requeue_pi_key, key2)) |
1406 | return -EINVAL; | 1403 | return -EINVAL; |
1407 | 1404 | ||
1408 | /* | 1405 | /* |
1409 | * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in | 1406 | * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in |
1410 | * the contended case or if set_waiters is 1. The pi_state is returned | 1407 | * the contended case or if set_waiters is 1. The pi_state is returned |
1411 | * in ps in contended cases. | 1408 | * in ps in contended cases. |
1412 | */ | 1409 | */ |
1413 | vpid = task_pid_vnr(top_waiter->task); | 1410 | vpid = task_pid_vnr(top_waiter->task); |
1414 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, | 1411 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, |
1415 | set_waiters); | 1412 | set_waiters); |
1416 | if (ret == 1) { | 1413 | if (ret == 1) { |
1417 | requeue_pi_wake_futex(top_waiter, key2, hb2); | 1414 | requeue_pi_wake_futex(top_waiter, key2, hb2); |
1418 | return vpid; | 1415 | return vpid; |
1419 | } | 1416 | } |
1420 | return ret; | 1417 | return ret; |
1421 | } | 1418 | } |
1422 | 1419 | ||
1423 | /** | 1420 | /** |
1424 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | 1421 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 |
1425 | * @uaddr1: source futex user address | 1422 | * @uaddr1: source futex user address |
1426 | * @flags: futex flags (FLAGS_SHARED, etc.) | 1423 | * @flags: futex flags (FLAGS_SHARED, etc.) |
1427 | * @uaddr2: target futex user address | 1424 | * @uaddr2: target futex user address |
1428 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) | 1425 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) |
1429 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) | 1426 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) |
1430 | * @cmpval: @uaddr1 expected value (or %NULL) | 1427 | * @cmpval: @uaddr1 expected value (or %NULL) |
1431 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a | 1428 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a |
1432 | * pi futex (pi to pi requeue is not supported) | 1429 | * pi futex (pi to pi requeue is not supported) |
1433 | * | 1430 | * |
1434 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | 1431 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire |
1435 | * uaddr2 atomically on behalf of the top waiter. | 1432 | * uaddr2 atomically on behalf of the top waiter. |
1436 | * | 1433 | * |
1437 | * Return: | 1434 | * Return: |
1438 | * >=0 - on success, the number of tasks requeued or woken; | 1435 | * >=0 - on success, the number of tasks requeued or woken; |
1439 | * <0 - on error | 1436 | * <0 - on error |
1440 | */ | 1437 | */ |
1441 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | 1438 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, |
1442 | u32 __user *uaddr2, int nr_wake, int nr_requeue, | 1439 | u32 __user *uaddr2, int nr_wake, int nr_requeue, |
1443 | u32 *cmpval, int requeue_pi) | 1440 | u32 *cmpval, int requeue_pi) |
1444 | { | 1441 | { |
1445 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 1442 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
1446 | int drop_count = 0, task_count = 0, ret; | 1443 | int drop_count = 0, task_count = 0, ret; |
1447 | struct futex_pi_state *pi_state = NULL; | 1444 | struct futex_pi_state *pi_state = NULL; |
1448 | struct futex_hash_bucket *hb1, *hb2; | 1445 | struct futex_hash_bucket *hb1, *hb2; |
1449 | struct futex_q *this, *next; | 1446 | struct futex_q *this, *next; |
1450 | 1447 | ||
1451 | if (requeue_pi) { | 1448 | if (requeue_pi) { |
1452 | /* | 1449 | /* |
1453 | * Requeue PI only works on two distinct uaddrs. This | 1450 | * Requeue PI only works on two distinct uaddrs. This |
1454 | * check is only valid for private futexes. See below. | 1451 | * check is only valid for private futexes. See below. |
1455 | */ | 1452 | */ |
1456 | if (uaddr1 == uaddr2) | 1453 | if (uaddr1 == uaddr2) |
1457 | return -EINVAL; | 1454 | return -EINVAL; |
1458 | 1455 | ||
1459 | /* | 1456 | /* |
1460 | * requeue_pi requires a pi_state, try to allocate it now | 1457 | * requeue_pi requires a pi_state, try to allocate it now |
1461 | * without any locks in case it fails. | 1458 | * without any locks in case it fails. |
1462 | */ | 1459 | */ |
1463 | if (refill_pi_state_cache()) | 1460 | if (refill_pi_state_cache()) |
1464 | return -ENOMEM; | 1461 | return -ENOMEM; |
1465 | /* | 1462 | /* |
1466 | * requeue_pi must wake as many tasks as it can, up to nr_wake | 1463 | * requeue_pi must wake as many tasks as it can, up to nr_wake |
1467 | * + nr_requeue, since it acquires the rt_mutex prior to | 1464 | * + nr_requeue, since it acquires the rt_mutex prior to |
1468 | * returning to userspace, so as to not leave the rt_mutex with | 1465 | * returning to userspace, so as to not leave the rt_mutex with |
1469 | * waiters and no owner. However, second and third wake-ups | 1466 | * waiters and no owner. However, second and third wake-ups |
1470 | * cannot be predicted as they involve race conditions with the | 1467 | * cannot be predicted as they involve race conditions with the |
1471 | * first wake and a fault while looking up the pi_state. Both | 1468 | * first wake and a fault while looking up the pi_state. Both |
1472 | * pthread_cond_signal() and pthread_cond_broadcast() should | 1469 | * pthread_cond_signal() and pthread_cond_broadcast() should |
1473 | * use nr_wake=1. | 1470 | * use nr_wake=1. |
1474 | */ | 1471 | */ |
1475 | if (nr_wake != 1) | 1472 | if (nr_wake != 1) |
1476 | return -EINVAL; | 1473 | return -EINVAL; |
1477 | } | 1474 | } |
1478 | 1475 | ||
1479 | retry: | 1476 | retry: |
1480 | if (pi_state != NULL) { | 1477 | if (pi_state != NULL) { |
1481 | /* | 1478 | /* |
1482 | * We will have to look up the pi_state again, so free this one | 1479 | * We will have to look up the pi_state again, so free this one |
1483 | * to keep the accounting correct. | 1480 | * to keep the accounting correct. |
1484 | */ | 1481 | */ |
1485 | free_pi_state(pi_state); | 1482 | free_pi_state(pi_state); |
1486 | pi_state = NULL; | 1483 | pi_state = NULL; |
1487 | } | 1484 | } |
1488 | 1485 | ||
1489 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); | 1486 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); |
1490 | if (unlikely(ret != 0)) | 1487 | if (unlikely(ret != 0)) |
1491 | goto out; | 1488 | goto out; |
1492 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, | 1489 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, |
1493 | requeue_pi ? VERIFY_WRITE : VERIFY_READ); | 1490 | requeue_pi ? VERIFY_WRITE : VERIFY_READ); |
1494 | if (unlikely(ret != 0)) | 1491 | if (unlikely(ret != 0)) |
1495 | goto out_put_key1; | 1492 | goto out_put_key1; |
1496 | 1493 | ||
1497 | /* | 1494 | /* |
1498 | * The check above, which compares uaddrs, is not sufficient for | 1495 | * The check above, which compares uaddrs, is not sufficient for |
1499 | * shared futexes. We need to compare the keys: | 1496 | * shared futexes. We need to compare the keys: |
1500 | */ | 1497 | */ |
1501 | if (requeue_pi && match_futex(&key1, &key2)) { | 1498 | if (requeue_pi && match_futex(&key1, &key2)) { |
1502 | ret = -EINVAL; | 1499 | ret = -EINVAL; |
1503 | goto out_put_keys; | 1500 | goto out_put_keys; |
1504 | } | 1501 | } |
1505 | 1502 | ||
1506 | hb1 = hash_futex(&key1); | 1503 | hb1 = hash_futex(&key1); |
1507 | hb2 = hash_futex(&key2); | 1504 | hb2 = hash_futex(&key2); |
1508 | 1505 | ||
1509 | retry_private: | 1506 | retry_private: |
1510 | hb_waiters_inc(hb2); | 1507 | hb_waiters_inc(hb2); |
1511 | double_lock_hb(hb1, hb2); | 1508 | double_lock_hb(hb1, hb2); |
1512 | 1509 | ||
1513 | if (likely(cmpval != NULL)) { | 1510 | if (likely(cmpval != NULL)) { |
1514 | u32 curval; | 1511 | u32 curval; |
1515 | 1512 | ||
1516 | ret = get_futex_value_locked(&curval, uaddr1); | 1513 | ret = get_futex_value_locked(&curval, uaddr1); |
1517 | 1514 | ||
1518 | if (unlikely(ret)) { | 1515 | if (unlikely(ret)) { |
1519 | double_unlock_hb(hb1, hb2); | 1516 | double_unlock_hb(hb1, hb2); |
1520 | hb_waiters_dec(hb2); | 1517 | hb_waiters_dec(hb2); |
1521 | 1518 | ||
1522 | ret = get_user(curval, uaddr1); | 1519 | ret = get_user(curval, uaddr1); |
1523 | if (ret) | 1520 | if (ret) |
1524 | goto out_put_keys; | 1521 | goto out_put_keys; |
1525 | 1522 | ||
1526 | if (!(flags & FLAGS_SHARED)) | 1523 | if (!(flags & FLAGS_SHARED)) |
1527 | goto retry_private; | 1524 | goto retry_private; |
1528 | 1525 | ||
1529 | put_futex_key(&key2); | 1526 | put_futex_key(&key2); |
1530 | put_futex_key(&key1); | 1527 | put_futex_key(&key1); |
1531 | goto retry; | 1528 | goto retry; |
1532 | } | 1529 | } |
1533 | if (curval != *cmpval) { | 1530 | if (curval != *cmpval) { |
1534 | ret = -EAGAIN; | 1531 | ret = -EAGAIN; |
1535 | goto out_unlock; | 1532 | goto out_unlock; |
1536 | } | 1533 | } |
1537 | } | 1534 | } |
1538 | 1535 | ||
1539 | if (requeue_pi && (task_count - nr_wake < nr_requeue)) { | 1536 | if (requeue_pi && (task_count - nr_wake < nr_requeue)) { |
1540 | /* | 1537 | /* |
1541 | * Attempt to acquire uaddr2 and wake the top waiter. If we | 1538 | * Attempt to acquire uaddr2 and wake the top waiter. If we |
1542 | * intend to requeue waiters, force setting the FUTEX_WAITERS | 1539 | * intend to requeue waiters, force setting the FUTEX_WAITERS |
1543 | * bit. We force this here where we are able to easily handle | 1540 | * bit. We force this here where we are able to easily handle |
1544 | * faults rather than in the requeue loop below. | 1541 | * faults rather than in the requeue loop below. |
1545 | */ | 1542 | */ |
1546 | ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, | 1543 | ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, |
1547 | &key2, &pi_state, nr_requeue); | 1544 | &key2, &pi_state, nr_requeue); |
1548 | 1545 | ||
1549 | /* | 1546 | /* |
1550 | * At this point the top_waiter has either taken uaddr2 or is | 1547 | * At this point the top_waiter has either taken uaddr2 or is |
1551 | * waiting on it. If the former, then the pi_state will not | 1548 | * waiting on it. If the former, then the pi_state will not |
1552 | * exist yet, look it up one more time to ensure we have a | 1549 | * exist yet, look it up one more time to ensure we have a |
1553 | * reference to it. If the lock was taken, ret contains the | 1550 | * reference to it. If the lock was taken, ret contains the |
1554 | * vpid of the top waiter task. | 1551 | * vpid of the top waiter task. |
1555 | */ | 1552 | */ |
1556 | if (ret > 0) { | 1553 | if (ret > 0) { |
1557 | WARN_ON(pi_state); | 1554 | WARN_ON(pi_state); |
1558 | drop_count++; | 1555 | drop_count++; |
1559 | task_count++; | 1556 | task_count++; |
1560 | /* | 1557 | /* |
1561 | * If we acquired the lock, then the user | 1558 | * If we acquired the lock, then the user |
1562 | * space value of uaddr2 should be vpid. It | 1559 | * space value of uaddr2 should be vpid. It |
1563 | * cannot be changed by the top waiter as it | 1560 | * cannot be changed by the top waiter as it |
1564 | * is blocked on hb2 lock if it tries to do | 1561 | * is blocked on hb2 lock if it tries to do |
1565 | * so. If something fiddled with it behind our | 1562 | * so. If something fiddled with it behind our |
1566 | * back the pi state lookup might unearth | 1563 | * back the pi state lookup might unearth |
1567 | * it. So we rather use the known value than | 1564 | * it. So we rather use the known value than |
1568 | * rereading and handing potential crap to | 1565 | * rereading and handing potential crap to |
1569 | * lookup_pi_state. | 1566 | * lookup_pi_state. |
1570 | */ | 1567 | */ |
1571 | ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL); | 1568 | ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL); |
1572 | } | 1569 | } |
1573 | 1570 | ||
1574 | switch (ret) { | 1571 | switch (ret) { |
1575 | case 0: | 1572 | case 0: |
1576 | break; | 1573 | break; |
1577 | case -EFAULT: | 1574 | case -EFAULT: |
1578 | double_unlock_hb(hb1, hb2); | 1575 | double_unlock_hb(hb1, hb2); |
1579 | hb_waiters_dec(hb2); | 1576 | hb_waiters_dec(hb2); |
1580 | put_futex_key(&key2); | 1577 | put_futex_key(&key2); |
1581 | put_futex_key(&key1); | 1578 | put_futex_key(&key1); |
1582 | ret = fault_in_user_writeable(uaddr2); | 1579 | ret = fault_in_user_writeable(uaddr2); |
1583 | if (!ret) | 1580 | if (!ret) |
1584 | goto retry; | 1581 | goto retry; |
1585 | goto out; | 1582 | goto out; |
1586 | case -EAGAIN: | 1583 | case -EAGAIN: |
1587 | /* The owner was exiting, try again. */ | 1584 | /* The owner was exiting, try again. */ |
1588 | double_unlock_hb(hb1, hb2); | 1585 | double_unlock_hb(hb1, hb2); |
1589 | hb_waiters_dec(hb2); | 1586 | hb_waiters_dec(hb2); |
1590 | put_futex_key(&key2); | 1587 | put_futex_key(&key2); |
1591 | put_futex_key(&key1); | 1588 | put_futex_key(&key1); |
1592 | cond_resched(); | 1589 | cond_resched(); |
1593 | goto retry; | 1590 | goto retry; |
1594 | default: | 1591 | default: |
1595 | goto out_unlock; | 1592 | goto out_unlock; |
1596 | } | 1593 | } |
1597 | } | 1594 | } |
1598 | 1595 | ||
1599 | plist_for_each_entry_safe(this, next, &hb1->chain, list) { | 1596 | plist_for_each_entry_safe(this, next, &hb1->chain, list) { |
1600 | if (task_count - nr_wake >= nr_requeue) | 1597 | if (task_count - nr_wake >= nr_requeue) |
1601 | break; | 1598 | break; |
1602 | 1599 | ||
1603 | if (!match_futex(&this->key, &key1)) | 1600 | if (!match_futex(&this->key, &key1)) |
1604 | continue; | 1601 | continue; |
1605 | 1602 | ||
1606 | /* | 1603 | /* |
1607 | * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always | 1604 | * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always |
1608 | * be paired with each other and no other futex ops. | 1605 | * be paired with each other and no other futex ops. |
1609 | * | 1606 | * |
1610 | * We should never be requeueing a futex_q with a pi_state, | 1607 | * We should never be requeueing a futex_q with a pi_state, |
1611 | * which is awaiting a futex_unlock_pi(). | 1608 | * which is awaiting a futex_unlock_pi(). |
1612 | */ | 1609 | */ |
1613 | if ((requeue_pi && !this->rt_waiter) || | 1610 | if ((requeue_pi && !this->rt_waiter) || |
1614 | (!requeue_pi && this->rt_waiter) || | 1611 | (!requeue_pi && this->rt_waiter) || |
1615 | this->pi_state) { | 1612 | this->pi_state) { |
1616 | ret = -EINVAL; | 1613 | ret = -EINVAL; |
1617 | break; | 1614 | break; |
1618 | } | 1615 | } |
1619 | 1616 | ||
1620 | /* | 1617 | /* |
1621 | * Wake nr_wake waiters. For requeue_pi, if we acquired the | 1618 | * Wake nr_wake waiters. For requeue_pi, if we acquired the |
1622 | * lock, we already woke the top_waiter. If not, it will be | 1619 | * lock, we already woke the top_waiter. If not, it will be |
1623 | * woken by futex_unlock_pi(). | 1620 | * woken by futex_unlock_pi(). |
1624 | */ | 1621 | */ |
1625 | if (++task_count <= nr_wake && !requeue_pi) { | 1622 | if (++task_count <= nr_wake && !requeue_pi) { |
1626 | wake_futex(this); | 1623 | wake_futex(this); |
1627 | continue; | 1624 | continue; |
1628 | } | 1625 | } |
1629 | 1626 | ||
1630 | /* Ensure we requeue to the expected futex for requeue_pi. */ | 1627 | /* Ensure we requeue to the expected futex for requeue_pi. */ |
1631 | if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { | 1628 | if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { |
1632 | ret = -EINVAL; | 1629 | ret = -EINVAL; |
1633 | break; | 1630 | break; |
1634 | } | 1631 | } |
1635 | 1632 | ||
1636 | /* | 1633 | /* |
1637 | * Requeue nr_requeue waiters and possibly one more in the case | 1634 | * Requeue nr_requeue waiters and possibly one more in the case |
1638 | * of requeue_pi if we couldn't acquire the lock atomically. | 1635 | * of requeue_pi if we couldn't acquire the lock atomically. |
1639 | */ | 1636 | */ |
1640 | if (requeue_pi) { | 1637 | if (requeue_pi) { |
1641 | /* Prepare the waiter to take the rt_mutex. */ | 1638 | /* Prepare the waiter to take the rt_mutex. */ |
1642 | atomic_inc(&pi_state->refcount); | 1639 | atomic_inc(&pi_state->refcount); |
1643 | this->pi_state = pi_state; | 1640 | this->pi_state = pi_state; |
1644 | ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, | 1641 | ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, |
1645 | this->rt_waiter, | 1642 | this->rt_waiter, |
1646 | this->task, 1); | 1643 | this->task, 1); |
1647 | if (ret == 1) { | 1644 | if (ret == 1) { |
1648 | /* We got the lock. */ | 1645 | /* We got the lock. */ |
1649 | requeue_pi_wake_futex(this, &key2, hb2); | 1646 | requeue_pi_wake_futex(this, &key2, hb2); |
1650 | drop_count++; | 1647 | drop_count++; |
1651 | continue; | 1648 | continue; |
1652 | } else if (ret) { | 1649 | } else if (ret) { |
1653 | /* -EDEADLK */ | 1650 | /* -EDEADLK */ |
1654 | this->pi_state = NULL; | 1651 | this->pi_state = NULL; |
1655 | free_pi_state(pi_state); | 1652 | free_pi_state(pi_state); |
1656 | goto out_unlock; | 1653 | goto out_unlock; |
1657 | } | 1654 | } |
1658 | } | 1655 | } |
1659 | requeue_futex(this, hb1, hb2, &key2); | 1656 | requeue_futex(this, hb1, hb2, &key2); |
1660 | drop_count++; | 1657 | drop_count++; |
1661 | } | 1658 | } |
1662 | 1659 | ||
1663 | out_unlock: | 1660 | out_unlock: |
1664 | double_unlock_hb(hb1, hb2); | 1661 | double_unlock_hb(hb1, hb2); |
1665 | hb_waiters_dec(hb2); | 1662 | hb_waiters_dec(hb2); |
1666 | 1663 | ||
1667 | /* | 1664 | /* |
1668 | * drop_futex_key_refs() must be called outside the spinlocks. During | 1665 | * drop_futex_key_refs() must be called outside the spinlocks. During |
1669 | * the requeue we moved futex_q's from the hash bucket at key1 to the | 1666 | * the requeue we moved futex_q's from the hash bucket at key1 to the |
1670 | * one at key2 and updated their key pointer. We no longer need to | 1667 | * one at key2 and updated their key pointer. We no longer need to |
1671 | * hold the references to key1. | 1668 | * hold the references to key1. |
1672 | */ | 1669 | */ |
1673 | while (--drop_count >= 0) | 1670 | while (--drop_count >= 0) |
1674 | drop_futex_key_refs(&key1); | 1671 | drop_futex_key_refs(&key1); |
1675 | 1672 | ||
1676 | out_put_keys: | 1673 | out_put_keys: |
1677 | put_futex_key(&key2); | 1674 | put_futex_key(&key2); |
1678 | out_put_key1: | 1675 | out_put_key1: |
1679 | put_futex_key(&key1); | 1676 | put_futex_key(&key1); |
1680 | out: | 1677 | out: |
1681 | if (pi_state != NULL) | 1678 | if (pi_state != NULL) |
1682 | free_pi_state(pi_state); | 1679 | free_pi_state(pi_state); |
1683 | return ret ? ret : task_count; | 1680 | return ret ? ret : task_count; |
1684 | } | 1681 | } |
1685 | 1682 | ||
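Editor's sketch of the classic non-PI consumer of this path: a condvar-style broadcast that wakes one waiter and requeues the rest onto the mutex word instead of thundering them all (word layout illustrative, not glibc's; same includes as the earlier sketches):

static long cond_broadcast(uint32_t *cond_seq, uint32_t *mutex_word)
{
	/* cmpval: the kernel bails out with -EAGAIN if the sequence
	 * word changed under us, as in the cmpval check above. */
	uint32_t expected = *cond_seq;

	return syscall(SYS_futex, cond_seq, FUTEX_CMP_REQUEUE,
		       1,				/* nr_wake */
		       (void *)(unsigned long)INT_MAX,	/* nr_requeue, in the timeout slot */
		       mutex_word, expected);
}
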
1686 | /* The key must be already stored in q->key. */ | 1683 | /* The key must be already stored in q->key. */ |
1687 | static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | 1684 | static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) |
1688 | __acquires(&hb->lock) | 1685 | __acquires(&hb->lock) |
1689 | { | 1686 | { |
1690 | struct futex_hash_bucket *hb; | 1687 | struct futex_hash_bucket *hb; |
1691 | 1688 | ||
1692 | hb = hash_futex(&q->key); | 1689 | hb = hash_futex(&q->key); |
1693 | 1690 | ||
1694 | /* | 1691 | /* |
1695 | * Increment the counter before taking the lock so that | 1692 | * Increment the counter before taking the lock so that |
1696 | * a potential waker won't miss a task about to sleep that is | 1693 | * a potential waker won't miss a task about to sleep that is |
1697 | * waiting for the spinlock. This is safe as all queue_lock() | 1694 | * waiting for the spinlock. This is safe as all queue_lock() |
1698 | * users end up calling queue_me(). Similarly, for housekeeping, | 1695 | * users end up calling queue_me(). Similarly, for housekeeping, |
1699 | * decrement the counter at queue_unlock() when some error has | 1696 | * decrement the counter at queue_unlock() when some error has |
1700 | * occurred and we don't end up adding the task to the list. | 1697 | * occurred and we don't end up adding the task to the list. |
1701 | */ | 1698 | */ |
1702 | hb_waiters_inc(hb); | 1699 | hb_waiters_inc(hb); |
1703 | 1700 | ||
1704 | q->lock_ptr = &hb->lock; | 1701 | q->lock_ptr = &hb->lock; |
1705 | 1702 | ||
1706 | spin_lock(&hb->lock); /* implies MB (A) */ | 1703 | spin_lock(&hb->lock); /* implies MB (A) */ |
1707 | return hb; | 1704 | return hb; |
1708 | } | 1705 | } |
1709 | 1706 | ||
static inline void
queue_unlock(struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	spin_unlock(&hb->lock);
	hb_waiters_dec(hb);
}

/**
 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
 * @q:	The futex_q to enqueue
 * @hb:	The destination hash bucket
 *
 * The hb->lock must be held by the caller, and is released here. A call to
 * queue_me() is typically paired with exactly one call to unqueue_me(). The
 * exceptions involve the PI related operations, which may use unqueue_me_pi()
 * or nothing if the unqueue is done as part of the wake process and the unqueue
 * state is implicit in the state of the woken task (see futex_wait_requeue_pi()
 * for an example).
 */
static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	int prio;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
	q->task = current;
	spin_unlock(&hb->lock);
}

/**
 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
 * be paired with exactly one earlier call to queue_me().
 *
 * Return:
 *  1 - if the futex_q was still queued (and we removed and unqueued it);
 *  0 - if the futex_q was already removed by the waking thread
 */
static int unqueue_me(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	lock_ptr = q->lock_ptr;
	barrier();
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock. This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock(). It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock(). It cannot,
		 * however, change back to the original value. Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		__unqueue_futex(q);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	drop_futex_key_refs(&q->key);
	return ret;
}

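The lock_ptr dance above is a general revalidation pattern: read a lock pointer that another thread may retarget, take the lock, then confirm the pointer still matches before relying on it. A rough userspace analogue in C11 (a sketch only; the struct and helper names are invented for illustration, and unlike q->lock_ptr a pointer here could in principle be retargeted back):

#include <pthread.h>
#include <stdatomic.h>

struct obj {
	pthread_mutex_t *_Atomic lock_ptr;	/* may be retargeted concurrently */
};

/* Acquire whichever mutex currently protects *o, revalidating after the
 * acquisition just like unqueue_me() does with q->lock_ptr. */
static pthread_mutex_t *lock_obj(struct obj *o)
{
	pthread_mutex_t *lp;

	for (;;) {
		lp = atomic_load(&o->lock_ptr);
		if (!lp)
			return NULL;		/* nothing to lock anymore */
		pthread_mutex_lock(lp);
		if (lp == atomic_load(&o->lock_ptr))
			return lp;		/* still the right lock */
		pthread_mutex_unlock(lp);	/* lost a race; retry */
	}
}
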
/*
 * PI futexes can not be requeued and must remove themselves from the
 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
 * and dropped here.
 */
static void unqueue_me_pi(struct futex_q *q)
	__releases(q->lock_ptr)
{
	__unqueue_futex(q);

	BUG_ON(!q->pi_state);
	free_pi_state(q->pi_state);
	q->pi_state = NULL;

	spin_unlock(q->lock_ptr);
}

/*
 * Fixup the pi_state owner with the new owner.
 *
 * Must be called with hash bucket lock held and mm->sem held for non
 * private futexes.
 */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *newowner)
{
	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner = pi_state->owner;
	u32 uval, uninitialized_var(curval), newval;
	int ret;

	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	/*
	 * We are here either because we stole the rtmutex from the
	 * previous highest priority waiter or we are the highest priority
	 * waiter but failed to get the rtmutex the first time.
	 * We have to replace the newowner TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for COW.
	 *
	 * Modifying pi_state _before_ the user space value would
	 * leave the pi_state in an inconsistent state when we fault
	 * here, because we need to drop the hash bucket lock to
	 * handle the fault. This might be observed in the PID check
	 * in lookup_pi_state.
	 */
retry:
	if (get_futex_value_locked(&uval, uaddr))
		goto handle_fault;

	while (1) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
			goto handle_fault;
		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	if (pi_state->owner != NULL) {
		raw_spin_lock_irq(&pi_state->owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
	}

	pi_state->owner = newowner;

	raw_spin_lock_irq(&newowner->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &newowner->pi_state_list);
	raw_spin_unlock_irq(&newowner->pi_lock);
	return 0;

	/*
	 * To handle the page fault we need to drop the hash bucket
	 * lock here. That gives the other task (either the highest priority
	 * waiter itself or the task which stole the rtmutex) the
	 * chance to try the fixup of the pi_state. So once we are
	 * back from handling the fault we need to check the pi_state
	 * after reacquiring the hash bucket lock and before trying to
	 * do another fixup. When the fixup has been done already we
	 * simply return.
	 */
handle_fault:
	spin_unlock(q->lock_ptr);

	ret = fault_in_user_writeable(uaddr);

	spin_lock(q->lock_ptr);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return 0;

	if (ret)
		return ret;

	goto retry;
}

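The cmpxchg loop above is careful to preserve the owner died bit while swapping in the new TID plus the waiters bit. A self-contained demo of that 32-bit futex word encoding (the three constants are the futex UAPI values from <linux/futex.h>; the program itself is illustrative only):

#include <stdint.h>
#include <stdio.h>

#define FUTEX_WAITERS		0x80000000
#define FUTEX_OWNER_DIED	0x40000000
#define FUTEX_TID_MASK		0x3fffffff

int main(void)
{
	uint32_t uval = 1234 | FUTEX_OWNER_DIED;   /* dead owner, TID 1234 */
	uint32_t newtid = 5678 | FUTEX_WAITERS;    /* new owner TID + waiters bit */

	/* Mirrors the newval computation in the loop above: the owner
	 * died bit is carried over, everything else is replaced. */
	uint32_t newval = (uval & FUTEX_OWNER_DIED) | newtid;

	printf("tid=%u waiters=%d owner_died=%d\n",
	       newval & FUTEX_TID_MASK,
	       !!(newval & FUTEX_WAITERS),
	       !!(newval & FUTEX_OWNER_DIED));
	return 0;
}
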
static long futex_wait_restart(struct restart_block *restart);

/**
 * fixup_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to clean up
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  1 - success, lock taken;
 *  0 - success, lock not taken;
 * <0 - on error (-EFAULT)
 */
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	struct task_struct *owner;
	int ret = 0;

	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 */
		if (q->pi_state->owner != current)
			ret = fixup_pi_state_owner(uaddr, q, current);
		goto out;
	}

	/*
	 * Catch the rare case, where the lock was released when we were on the
	 * way back before we locked the hash bucket.
	 */
	if (q->pi_state->owner == current) {
		/*
		 * Try to get the rt_mutex now. This might fail as some other
		 * task acquired the rt_mutex after we removed ourselves from
		 * the rt_mutex waiters list.
		 */
		if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
			locked = 1;
			goto out;
		}

		/*
		 * pi_state is incorrect, some other task did a lock steal and
		 * we returned due to timeout or signal without taking the
		 * rt_mutex. Too late.
		 */
		raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
		if (!owner)
			owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
		raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
		ret = fixup_pi_state_owner(uaddr, q, owner);
		goto out;
	}

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex.
	 */
	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
				"pi-state %p\n", ret,
				q->pi_state->pi_mutex.owner,
				q->pi_state->owner);

out:
	return ret ? ret : locked;
}

/**
 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
 * @hb:		the futex hash bucket, must be locked by the caller
 * @q:		the futex_q to queue up on
 * @timeout:	the prepared hrtimer_sleeper, or null for no timeout
 */
static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
				struct hrtimer_sleeper *timeout)
{
	/*
	 * The task state is guaranteed to be set before another task can
	 * wake it. set_current_state() is implemented using set_mb() and
	 * queue_me() calls spin_unlock() upon completion, both serializing
	 * access to the hash list and forcing another memory barrier.
	 */
	set_current_state(TASK_INTERRUPTIBLE);
	queue_me(q, hb);

	/* Arm the timer */
	if (timeout) {
		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
		if (!hrtimer_active(&timeout->timer))
			timeout->task = NULL;
	}

	/*
	 * If we have been removed from the hash list, then another task
	 * has tried to wake us, and we can skip the call to schedule().
	 */
	if (likely(!plist_node_empty(&q->list))) {
		/*
		 * If the timer has already expired, current will already be
		 * flagged for rescheduling. Only call schedule if there
		 * is no timeout, or if it has yet to expire.
		 */
		if (!timeout || timeout->task)
			freezable_schedule();
	}
	__set_current_state(TASK_RUNNING);
}

/**
 * futex_wait_setup() - Prepare to wait on a futex
 * @uaddr:	the futex userspace address
 * @val:	the expected value
 * @flags:	futex flags (FLAGS_SHARED, etc.)
 * @q:		the associated futex_q
 * @hb:		storage for hash_bucket pointer to be returned to caller
 *
 * Setup the futex_q and locate the hash_bucket. Get the futex value and
 * compare it with the expected value. Handle atomic faults internally.
 * Return with the hb lock held and a q.key reference on success, and unlocked
 * with no q.key reference on failure.
 *
 * Return:
 *  0 - uaddr contains val and hb has been locked;
 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
 */
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
			    struct futex_q *q, struct futex_hash_bucket **hb)
{
	u32 uval;
	int ret;

	/*
	 * Access the page AFTER the hash-bucket is locked.
	 * Order is important:
	 *
	 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
	 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
	 *
	 * The basic logical guarantee of a futex is that it blocks ONLY
	 * if cond(var) is known to be true at the time of blocking, for
	 * any cond. If we locked the hash-bucket after testing *uaddr, that
	 * would open a race condition where we could block indefinitely with
	 * cond(var) false, which would violate the guarantee.
	 *
	 * On the other hand, we insert q and release the hash-bucket only
	 * after testing *uaddr. This guarantees that futex_wait() will NOT
	 * absorb a wakeup if *uaddr does not match the desired values
	 * while the syscall executes.
	 */
retry:
	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
	if (unlikely(ret != 0))
		return ret;

retry_private:
	*hb = queue_lock(q);

	ret = get_futex_value_locked(&uval, uaddr);

	if (ret) {
		queue_unlock(*hb);

		ret = get_user(uval, uaddr);
		if (ret)
			goto out;

		if (!(flags & FLAGS_SHARED))
			goto retry_private;

		put_futex_key(&q->key);
		goto retry;
	}

	if (uval != val) {
		queue_unlock(*hb);
		ret = -EWOULDBLOCK;
	}

out:
	if (ret)
		put_futex_key(&q->key);
	return ret;
}

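The ordering contract spelled out in the comment above is exactly what a userspace futex user relies on. A minimal sketch of that waiter/waker pattern against the raw futex(2) syscall (the variable and helper names are invented for illustration):

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static atomic_int var;		/* the futex word */

/* Waiter: val = var; if (cond(val)) futex_wait(&var, val);
 * FUTEX_WAIT blocks only while *uaddr still equals val, which is the
 * guarantee futex_wait_setup() preserves by reading under the hb lock. */
static void wait_while_equal(int val)
{
	while (atomic_load(&var) == val)
		syscall(SYS_futex, &var, FUTEX_WAIT, val, NULL, NULL, 0);
}

/* Waker: if (cond(var)) { var = new; futex_wake(&var); } */
static void store_and_wake(int new)
{
	atomic_store(&var, new);
	syscall(SYS_futex, &var, FUTEX_WAKE, 1, NULL, NULL, 0);
}
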
static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
		      ktime_t *abs_time, u32 bitset)
{
	struct hrtimer_sleeper timeout, *to = NULL;
	struct restart_block *restart;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int ret;

	if (!bitset)
		return -EINVAL;
	q.bitset = bitset;

	if (abs_time) {
		to = &timeout;

		hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
				      CLOCK_REALTIME : CLOCK_MONOTONIC,
				      HRTIMER_MODE_ABS);
		hrtimer_init_sleeper(to, current);
		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
					     current->timer_slack_ns);
	}

retry:
	/*
	 * Prepare to wait on uaddr. On success, holds hb lock and increments
	 * q.key refs.
	 */
	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
	if (ret)
		goto out;

	/* queue_me and wait for wakeup, timeout, or a signal. */
	futex_wait_queue_me(hb, &q, to);

	/* If we were woken (and unqueued), we succeeded, whatever. */
	ret = 0;
	/* unqueue_me() drops q.key ref */
	if (!unqueue_me(&q))
		goto out;
	ret = -ETIMEDOUT;
	if (to && !to->task)
		goto out;

	/*
	 * We expect signal_pending(current), but we might be the
	 * victim of a spurious wakeup as well.
	 */
	if (!signal_pending(current))
		goto retry;

	ret = -ERESTARTSYS;
	if (!abs_time)
		goto out;

	restart = &current_thread_info()->restart_block;
	restart->fn = futex_wait_restart;
	restart->futex.uaddr = uaddr;
	restart->futex.val = val;
	restart->futex.time = abs_time->tv64;
	restart->futex.bitset = bitset;
	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;

	ret = -ERESTART_RESTARTBLOCK;

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret;
}

static long futex_wait_restart(struct restart_block *restart)
{
	u32 __user *uaddr = restart->futex.uaddr;
	ktime_t t, *tp = NULL;

	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
		t.tv64 = restart->futex.time;
		tp = &t;
	}
	restart->fn = do_no_restart_syscall;

	return (long)futex_wait(uaddr, restart->futex.flags,
				restart->futex.val, tp, restart->futex.bitset);
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block, it does PI, etc. (Due to
 * races the kernel might see a 0 value of the futex too.)
 */
static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
			 ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to = NULL;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (refill_pi_state_cache())
		return -ENOMEM;

	if (time) {
		to = &timeout;
		hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
				      HRTIMER_MODE_ABS);
		hrtimer_init_sleeper(to, current);
		hrtimer_set_expires(&to->timer, *time);
	}

retry:
	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = queue_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
	if (unlikely(ret)) {
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EAGAIN:
			/*
			 * Task is exiting and we just wait for the
			 * exit to complete.
			 */
			queue_unlock(hb);
			put_futex_key(&q.key);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	queue_me(&q, hb);

	WARN_ON(!q.pi_state);
	/*
	 * Block on the PI mutex:
	 */
	if (!trylock)
		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
	else {
		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
	}

	spin_lock(q.lock_ptr);
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_owner(uaddr, &q, !ret);
	/*
	 * If fixup_owner() returned an error, propagate that. If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	/*
	 * If fixup_owner() faulted and was unable to handle the fault, unlock
	 * it and return the fault to userspace.
	 */
	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
		rt_mutex_unlock(&q.pi_state->pi_mutex);

	/* Unqueue and drop the lock */
	unqueue_me_pi(&q);

	goto out_put_key;

out_unlock_put_key:
	queue_unlock(hb);

out_put_key:
	put_futex_key(&q.key);
out:
	if (to)
		destroy_hrtimer_on_stack(&to->timer);
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	queue_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out_put_key;

	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	put_futex_key(&q.key);
	goto retry;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	struct futex_hash_bucket *hb;
	struct futex_q *this, *next;
	union futex_key key = FUTEX_KEY_INIT;
	u32 uval, vpid = task_pid_vnr(current);
	int ret;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
	if (unlikely(ret != 0))
		goto out;

	hb = hash_futex(&key);
	spin_lock(&hb->lock);

	/*
	 * To avoid races, try to do the TID -> 0 atomic transition
	 * again. If it succeeds then we can return without waking
-	 * anyone else up:
+	 * anyone else up. We only try this if neither the waiters nor
+	 * the owner died bit are set.
	 */
-	if (!(uval & FUTEX_OWNER_DIED) &&
+	if (!(uval & ~FUTEX_TID_MASK) &&
	    cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
		goto pi_faulted;
	/*
	 * Rare case: we managed to release the lock atomically,
	 * no need to wake anyone else up:
	 */
	if (unlikely(uval == vpid))
		goto out_unlock;

	/*
	 * Ok, other tasks may need to be woken up - check waiters
	 * and do the wakeup if necessary:
	 */
	plist_for_each_entry_safe(this, next, &hb->chain, list) {
		if (!match_futex (&this->key, &key))
			continue;
		ret = wake_futex_pi(uaddr, uval, this);
		/*
		 * The atomic access to the futex value
		 * generated a pagefault, so retry the
		 * user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		goto out_unlock;
	}
	/*
	 * No waiters - kernel unlocks the futex:
	 */
-	if (!(uval & FUTEX_OWNER_DIED)) {
-		ret = unlock_futex_pi(uaddr, uval);
-		if (ret == -EFAULT)
-			goto pi_faulted;
-	}
+	ret = unlock_futex_pi(uaddr, uval);
+	if (ret == -EFAULT)
+		goto pi_faulted;

out_unlock:
	spin_unlock(&hb->lock);
	put_futex_key(&key);

out:
	return ret;

pi_faulted:
	spin_unlock(&hb->lock);
	put_futex_key(&key);

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}

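For context, the TID -> 0 transition mentioned above is attempted in userspace first; only when it fails does control reach futex_unlock_pi(). A minimal sketch of the userspace fast paths, following the PI-futex protocol described in futex(2) (lock_word and the helper names are invented for illustration):

#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static _Atomic uint32_t lock_word;	/* 0 means unlocked */

static void pi_lock(void)
{
	uint32_t expected = 0;
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: 0 -> TID. On failure the kernel slow path
	 * (futex_lock_pi above) queues us and handles the PI boosting. */
	if (!atomic_compare_exchange_strong(&lock_word, &expected, tid))
		syscall(SYS_futex, &lock_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(void)
{
	uint32_t expected = syscall(SYS_gettid);

	/* Fast path: TID -> 0. If FUTEX_WAITERS or FUTEX_OWNER_DIED is
	 * set, the word differs from the bare TID, the cmpxchg fails and
	 * the kernel does the handover in futex_unlock_pi above. */
	if (!atomic_compare_exchange_strong(&lock_word, &expected, 0))
		syscall(SYS_futex, &lock_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}

Note how the two hunks above fit together: the in-kernel retry of the fast path now refuses whenever any bit outside FUTEX_TID_MASK is set, and the no-waiters path rewrites the user space word even when the owner died bit is set, so the unlocker's TID no longer lingers there.
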
/**
 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
 * @hb:		the hash_bucket the futex_q was originally enqueued on
 * @q:		the futex_q woken while waiting to be requeued
 * @key2:	the futex_key of the requeue target futex
 * @timeout:	the timeout associated with the wait (NULL if none)
 *
 * Detect if the task was woken on the initial futex as opposed to the requeue
 * target futex. If so, determine if it was a timeout or a signal that caused
 * the wakeup and return the appropriate error code to the caller. Must be
 * called with the hb lock held.
 *
 * Return:
 *  0 = no early wakeup detected;
 * <0 = -ETIMEDOUT or -ERESTARTNOINTR
 */
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
				   struct futex_q *q, union futex_key *key2,
				   struct hrtimer_sleeper *timeout)
{
	int ret = 0;

	/*
	 * With the hb lock held, we avoid races while we process the wakeup.
	 * We only need to hold hb (and not hb2) to ensure atomicity as the
	 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
	 * It can't be requeued from uaddr2 to something else since we don't
	 * support a PI aware source futex for requeue.
	 */
	if (!match_futex(&q->key, key2)) {
		WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
		/*
		 * We were woken prior to requeue by a timeout or a signal.
		 * Unqueue the futex_q and determine which it was.
		 */
		plist_del(&q->list, &hb->chain);
		hb_waiters_dec(hb);

		/* Handle spurious wakeups gracefully */
		ret = -EWOULDBLOCK;
		if (timeout && !timeout->task)
			ret = -ETIMEDOUT;
		else if (signal_pending(current))
			ret = -ERESTARTNOINTR;
	}
	return ret;
}

/**
 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
 * @uaddr:	the futex we initially wait on (non-pi)
 * @flags:	futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
 *		the same type, no requeueing from private to shared, etc.
 * @val:	the expected value of uaddr
 * @abs_time:	absolute timeout
 * @bitset:	32 bit wakeup bitset set by userspace, defaults to all
 * @uaddr2:	the pi futex we will take prior to returning to user-space
 *
 * The caller will wait on uaddr and will be requeued by futex_requeue() to
 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
 * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
 * without one, the pi logic would not know which task to boost/deboost, if
 * there was a need to.
 *
 * We call schedule in futex_wait_queue_me() when we enqueue and return there
 * via the following--
 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
 * 2) wakeup on uaddr2 after a requeue
 * 3) signal
 * 4) timeout
 *
 * If 3, cleanup and return -ERESTARTNOINTR.
 *
 * If 2, we may then block on trying to take the rt_mutex and return via:
 * 5) successful lock
 * 6) signal
 * 7) timeout
 * 8) other lock acquisition failure
 *
 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
 *
 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
 *
 * Return:
 *  0 - On success;
 * <0 - On error
 */
static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
				 u32 val, ktime_t *abs_time, u32 bitset,
				 u32 __user *uaddr2)
{
	struct hrtimer_sleeper timeout, *to = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct rt_mutex *pi_mutex = NULL;
	struct futex_hash_bucket *hb;
	union futex_key key2 = FUTEX_KEY_INIT;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (uaddr == uaddr2)
		return -EINVAL;

	if (!bitset)
		return -EINVAL;

	if (abs_time) {
		to = &timeout;
		hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
				      CLOCK_REALTIME : CLOCK_MONOTONIC,
				      HRTIMER_MODE_ABS);
		hrtimer_init_sleeper(to, current);
		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
					     current->timer_slack_ns);
	}

	/*
	 * The waiter is allocated on our stack, manipulated by the requeue
	 * code while we sleep on uaddr.
	 */
	debug_rt_mutex_init_waiter(&rt_waiter);
	RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
	RB_CLEAR_NODE(&rt_waiter.tree_entry);
	rt_waiter.task = NULL;

	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
	if (unlikely(ret != 0))
		goto out;

	q.bitset = bitset;
	q.rt_waiter = &rt_waiter;
	q.requeue_pi_key = &key2;

	/*
	 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
	 * count.
	 */
	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
	if (ret)
		goto out_key2;

	/*
	 * The check above which compares uaddrs is not sufficient for
	 * shared futexes. We need to compare the keys:
	 */
	if (match_futex(&q.key, &key2)) {
		ret = -EINVAL;
		goto out_put_keys;
	}

	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
	futex_wait_queue_me(hb, &q, to);

	spin_lock(&hb->lock);
	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
	spin_unlock(&hb->lock);
	if (ret)
		goto out_put_keys;

	/*
	 * In order for us to be here, we know our q.key == key2, and since
	 * we took the hb->lock above, we also know that futex_requeue() has
	 * completed and we no longer have to concern ourselves with a wakeup
	 * race with the atomic proxy lock acquisition by the requeue code. The
2575 | * futex_requeue dropped our key1 reference and incremented our key2 | 2571 | * futex_requeue dropped our key1 reference and incremented our key2 |
2576 | * reference count. | 2572 | * reference count. |
2577 | */ | 2573 | */ |
2578 | 2574 | ||
2579 | /* Check if the requeue code acquired the second futex for us. */ | 2575 | /* Check if the requeue code acquired the second futex for us. */ |
2580 | if (!q.rt_waiter) { | 2576 | if (!q.rt_waiter) { |
2581 | /* | 2577 | /* |
2582 | * Got the lock. We might not be the anticipated owner if we | 2578 | * Got the lock. We might not be the anticipated owner if we |
2583 | * did a lock-steal - fix up the PI-state in that case. | 2579 | * did a lock-steal - fix up the PI-state in that case. |
2584 | */ | 2580 | */ |
2585 | if (q.pi_state && (q.pi_state->owner != current)) { | 2581 | if (q.pi_state && (q.pi_state->owner != current)) { |
2586 | spin_lock(q.lock_ptr); | 2582 | spin_lock(q.lock_ptr); |
2587 | ret = fixup_pi_state_owner(uaddr2, &q, current); | 2583 | ret = fixup_pi_state_owner(uaddr2, &q, current); |
2588 | spin_unlock(q.lock_ptr); | 2584 | spin_unlock(q.lock_ptr); |
2589 | } | 2585 | } |
2590 | } else { | 2586 | } else { |
2591 | /* | 2587 | /* |
2592 | * We have been woken up by futex_unlock_pi(), a timeout, or a | 2588 | * We have been woken up by futex_unlock_pi(), a timeout, or a |
2593 | * signal. futex_unlock_pi() will not destroy the lock_ptr nor | 2589 | * signal. futex_unlock_pi() will not destroy the lock_ptr nor |
2594 | * the pi_state. | 2590 | * the pi_state. |
2595 | */ | 2591 | */ |
2596 | WARN_ON(!q.pi_state); | 2592 | WARN_ON(!q.pi_state); |
2597 | pi_mutex = &q.pi_state->pi_mutex; | 2593 | pi_mutex = &q.pi_state->pi_mutex; |
2598 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); | 2594 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); |
2599 | debug_rt_mutex_free_waiter(&rt_waiter); | 2595 | debug_rt_mutex_free_waiter(&rt_waiter); |
2600 | 2596 | ||
2601 | spin_lock(q.lock_ptr); | 2597 | spin_lock(q.lock_ptr); |
2602 | /* | 2598 | /* |
2603 | * Fixup the pi_state owner and possibly acquire the lock if we | 2599 | * Fixup the pi_state owner and possibly acquire the lock if we |
2604 | * haven't already. | 2600 | * haven't already. |
2605 | */ | 2601 | */ |
2606 | res = fixup_owner(uaddr2, &q, !ret); | 2602 | res = fixup_owner(uaddr2, &q, !ret); |
2607 | /* | 2603 | /* |
2608 | * If fixup_owner() returned an error, propagate that. If it | 2604 | * If fixup_owner() returned an error, propagate that. If it |
2609 | * acquired the lock, clear -ETIMEDOUT or -EINTR. | 2605 | * acquired the lock, clear -ETIMEDOUT or -EINTR. |
2610 | */ | 2606 | */ |
2611 | if (res) | 2607 | if (res) |
2612 | ret = (res < 0) ? res : 0; | 2608 | ret = (res < 0) ? res : 0; |
2613 | 2609 | ||
2614 | /* Unqueue and drop the lock. */ | 2610 | /* Unqueue and drop the lock. */ |
2615 | unqueue_me_pi(&q); | 2611 | unqueue_me_pi(&q); |
2616 | } | 2612 | } |
2617 | 2613 | ||
2618 | /* | 2614 | /* |
2619 | * If fixup_pi_state_owner() faulted and was unable to handle the | 2615 | * If fixup_pi_state_owner() faulted and was unable to handle the |
2620 | * fault, unlock the rt_mutex and return the fault to userspace. | 2616 | * fault, unlock the rt_mutex and return the fault to userspace. |
2621 | */ | 2617 | */ |
2622 | if (ret == -EFAULT) { | 2618 | if (ret == -EFAULT) { |
2623 | if (pi_mutex && rt_mutex_owner(pi_mutex) == current) | 2619 | if (pi_mutex && rt_mutex_owner(pi_mutex) == current) |
2624 | rt_mutex_unlock(pi_mutex); | 2620 | rt_mutex_unlock(pi_mutex); |
2625 | } else if (ret == -EINTR) { | 2621 | } else if (ret == -EINTR) { |
2626 | /* | 2622 | /* |
2627 | * We've already been requeued, but cannot restart by calling | 2623 | * We've already been requeued, but cannot restart by calling |
2628 | * futex_lock_pi() directly. We could restart this syscall, but | 2624 | * futex_lock_pi() directly. We could restart this syscall, but |
2629 | * it would detect that the user space "val" changed and return | 2625 | * it would detect that the user space "val" changed and return |
2630 | * -EWOULDBLOCK. Save the overhead of the restart and return | 2626 | * -EWOULDBLOCK. Save the overhead of the restart and return |
2631 | * -EWOULDBLOCK directly. | 2627 | * -EWOULDBLOCK directly. |
2632 | */ | 2628 | */ |
2633 | ret = -EWOULDBLOCK; | 2629 | ret = -EWOULDBLOCK; |
2634 | } | 2630 | } |
2635 | 2631 | ||
2636 | out_put_keys: | 2632 | out_put_keys: |
2637 | put_futex_key(&q.key); | 2633 | put_futex_key(&q.key); |
2638 | out_key2: | 2634 | out_key2: |
2639 | put_futex_key(&key2); | 2635 | put_futex_key(&key2); |
2640 | 2636 | ||
2641 | out: | 2637 | out: |
2642 | if (to) { | 2638 | if (to) { |
2643 | hrtimer_cancel(&to->timer); | 2639 | hrtimer_cancel(&to->timer); |
2644 | destroy_hrtimer_on_stack(&to->timer); | 2640 | destroy_hrtimer_on_stack(&to->timer); |
2645 | } | 2641 | } |
2646 | return ret; | 2642 | return ret; |
2647 | } | 2643 | } |
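For orientation, the userspace side of the pattern this function serves looks roughly like the hedged sketch below (illustrative names, not glibc's condvar; error handling and the retry protocol are elided). The waiter parks on the condition word and names the PI mutex word up front; the signaler moves it across with FUTEX_CMP_REQUEUE_PI.

    /*
     * Hedged sketch of a condvar-style caller of FUTEX_WAIT_REQUEUE_PI;
     * all names are illustrative and error handling is elided.
     */
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>
    #include <limits.h>

    static long cond_wait_pi(uint32_t *cond, uint32_t *mutex, uint32_t seen)
    {
            /* Fails with EAGAIN if *cond no longer equals seen. On success
             * we return owning the PI lock at *mutex (paths 1 and 5 above). */
            return syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI, seen,
                           NULL /* or an absolute timeout */, mutex, 0);
    }

    static long cond_signal_pi(uint32_t *cond, uint32_t *mutex, uint32_t cur)
    {
            /* Requeue-PI requires nr_wake == 1; the timeout slot carries
             * nr_requeue: 0 for a signal, INT_MAX for a broadcast. Fails
             * with EAGAIN if *cond no longer equals cur. */
            return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI, 1,
                           (void *)0L /* nr_requeue */, mutex, cur);
    }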
2648 | 2644 | ||
2649 | /* | 2645 | /* |
2650 | * Support for robust futexes: the kernel cleans up held futexes at | 2646 | * Support for robust futexes: the kernel cleans up held futexes at |
2651 | * thread exit time. | 2647 | * thread exit time. |
2652 | * | 2648 | * |
2653 | * Implementation: user-space maintains a per-thread list of locks it | 2649 | * Implementation: user-space maintains a per-thread list of locks it |
2654 | * is holding. Upon do_exit(), the kernel carefully walks this list, | 2650 | * is holding. Upon do_exit(), the kernel carefully walks this list, |
2655 | * and marks all locks that are owned by this thread with the | 2651 | * and marks all locks that are owned by this thread with the |
2656 | * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is | 2652 | * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is |
2657 | * always manipulated with the lock held, so the list is private and | 2653 | * always manipulated with the lock held, so the list is private and |
2658 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' | 2654 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' |
2659 | * field, to allow the kernel to clean up if the thread dies after | 2655 | * field, to allow the kernel to clean up if the thread dies after |
2660 | * acquiring the lock, but just before it could have added itself to | 2656 | * acquiring the lock, but just before it could have added itself to |
2661 | * the list. There can only be one such pending lock. | 2657 | * the list. There can only be one such pending lock. |
2662 | */ | 2658 | */ |
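To make the protocol above concrete, here is a hedged userspace sketch of the registration side (the robust_mutex layout and names are illustrative, not glibc's; memory-ordering subtleties are elided):

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative lock layout: the futex word sits at a fixed offset from
     * the list node, which is what head.futex_offset tells the kernel. */
    struct robust_mutex {
            struct robust_list list;
            uint32_t futex;                  /* owner TID when held */
    };

    static __thread struct robust_list_head head;

    static void robust_thread_init(void)
    {
            head.list.next = &head.list;     /* empty circular list */
            head.futex_offset = offsetof(struct robust_mutex, futex);
            head.list_op_pending = NULL;
            syscall(SYS_set_robust_list, &head, sizeof(head));
    }

    static int robust_trylock(struct robust_mutex *m, uint32_t tid)
    {
            uint32_t expected = 0;

            head.list_op_pending = &m->list; /* covers death in this window */
            if (!__atomic_compare_exchange_n(&m->futex, &expected, tid, 0,
                                             __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
                    head.list_op_pending = NULL;
                    return -1;               /* contended; real code would wait */
            }
            /* Safe to link in: the list is only touched with the lock held. */
            m->list.next = head.list.next;
            head.list.next = &m->list;
            head.list_op_pending = NULL;
            return 0;
    }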
2663 | 2659 | ||
2664 | /** | 2660 | /** |
2665 | * sys_set_robust_list() - Set the robust-futex list head of a task | 2661 | * sys_set_robust_list() - Set the robust-futex list head of a task |
2666 | * @head: pointer to the list-head | 2662 | * @head: pointer to the list-head |
2667 | * @len: length of the list-head, as userspace expects | 2663 | * @len: length of the list-head, as userspace expects |
2668 | */ | 2664 | */ |
2669 | SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, | 2665 | SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, |
2670 | size_t, len) | 2666 | size_t, len) |
2671 | { | 2667 | { |
2672 | if (!futex_cmpxchg_enabled) | 2668 | if (!futex_cmpxchg_enabled) |
2673 | return -ENOSYS; | 2669 | return -ENOSYS; |
2674 | /* | 2670 | /* |
2675 | * The kernel knows only one size for now: | 2671 | * The kernel knows only one size for now: |
2676 | */ | 2672 | */ |
2677 | if (unlikely(len != sizeof(*head))) | 2673 | if (unlikely(len != sizeof(*head))) |
2678 | return -EINVAL; | 2674 | return -EINVAL; |
2679 | 2675 | ||
2680 | current->robust_list = head; | 2676 | current->robust_list = head; |
2681 | 2677 | ||
2682 | return 0; | 2678 | return 0; |
2683 | } | 2679 | } |
2684 | 2680 | ||
2685 | /** | 2681 | /** |
2686 | * sys_get_robust_list() - Get the robust-futex list head of a task | 2682 | * sys_get_robust_list() - Get the robust-futex list head of a task |
2687 | * @pid: pid of the process [zero for current task] | 2683 | * @pid: pid of the process [zero for current task] |
2688 | * @head_ptr: pointer to a list-head pointer, the kernel fills it in | 2684 | * @head_ptr: pointer to a list-head pointer, the kernel fills it in |
2689 | * @len_ptr: pointer to a length field, the kernel fills in the header size | 2685 | * @len_ptr: pointer to a length field, the kernel fills in the header size |
2690 | */ | 2686 | */ |
2691 | SYSCALL_DEFINE3(get_robust_list, int, pid, | 2687 | SYSCALL_DEFINE3(get_robust_list, int, pid, |
2692 | struct robust_list_head __user * __user *, head_ptr, | 2688 | struct robust_list_head __user * __user *, head_ptr, |
2693 | size_t __user *, len_ptr) | 2689 | size_t __user *, len_ptr) |
2694 | { | 2690 | { |
2695 | struct robust_list_head __user *head; | 2691 | struct robust_list_head __user *head; |
2696 | unsigned long ret; | 2692 | unsigned long ret; |
2697 | struct task_struct *p; | 2693 | struct task_struct *p; |
2698 | 2694 | ||
2699 | if (!futex_cmpxchg_enabled) | 2695 | if (!futex_cmpxchg_enabled) |
2700 | return -ENOSYS; | 2696 | return -ENOSYS; |
2701 | 2697 | ||
2702 | rcu_read_lock(); | 2698 | rcu_read_lock(); |
2703 | 2699 | ||
2704 | ret = -ESRCH; | 2700 | ret = -ESRCH; |
2705 | if (!pid) | 2701 | if (!pid) |
2706 | p = current; | 2702 | p = current; |
2707 | else { | 2703 | else { |
2708 | p = find_task_by_vpid(pid); | 2704 | p = find_task_by_vpid(pid); |
2709 | if (!p) | 2705 | if (!p) |
2710 | goto err_unlock; | 2706 | goto err_unlock; |
2711 | } | 2707 | } |
2712 | 2708 | ||
2713 | ret = -EPERM; | 2709 | ret = -EPERM; |
2714 | if (!ptrace_may_access(p, PTRACE_MODE_READ)) | 2710 | if (!ptrace_may_access(p, PTRACE_MODE_READ)) |
2715 | goto err_unlock; | 2711 | goto err_unlock; |
2716 | 2712 | ||
2717 | head = p->robust_list; | 2713 | head = p->robust_list; |
2718 | rcu_read_unlock(); | 2714 | rcu_read_unlock(); |
2719 | 2715 | ||
2720 | if (put_user(sizeof(*head), len_ptr)) | 2716 | if (put_user(sizeof(*head), len_ptr)) |
2721 | return -EFAULT; | 2717 | return -EFAULT; |
2722 | return put_user(head, head_ptr); | 2718 | return put_user(head, head_ptr); |
2723 | 2719 | ||
2724 | err_unlock: | 2720 | err_unlock: |
2725 | rcu_read_unlock(); | 2721 | rcu_read_unlock(); |
2726 | 2722 | ||
2727 | return ret; | 2723 | return ret; |
2728 | } | 2724 | } |
2729 | 2725 | ||
2730 | /* | 2726 | /* |
2731 | * Process a futex-list entry, check whether it's owned by the | 2727 | * Process a futex-list entry, check whether it's owned by the |
2732 | * dying task, and do notification if so: | 2728 | * dying task, and do notification if so: |
2733 | */ | 2729 | */ |
2734 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) | 2730 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) |
2735 | { | 2731 | { |
2736 | u32 uval, uninitialized_var(nval), mval; | 2732 | u32 uval, uninitialized_var(nval), mval; |
2737 | 2733 | ||
2738 | retry: | 2734 | retry: |
2739 | if (get_user(uval, uaddr)) | 2735 | if (get_user(uval, uaddr)) |
2740 | return -1; | 2736 | return -1; |
2741 | 2737 | ||
2742 | if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { | 2738 | if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { |
2743 | /* | 2739 | /* |
2744 | * Ok, this dying thread is truly holding a futex | 2740 | * Ok, this dying thread is truly holding a futex |
2745 | * of interest. Set the OWNER_DIED bit atomically | 2741 | * of interest. Set the OWNER_DIED bit atomically |
2746 | * via cmpxchg, and if the value had FUTEX_WAITERS | 2742 | * via cmpxchg, and if the value had FUTEX_WAITERS |
2747 | * set, wake up a waiter (if any). (We have to do a | 2743 | * set, wake up a waiter (if any). (We have to do a |
2748 | * futex_wake() even if OWNER_DIED is already set - | 2744 | * futex_wake() even if OWNER_DIED is already set - |
2749 | * to handle the rare but possible case of recursive | 2745 | * to handle the rare but possible case of recursive |
2750 | * thread-death.) The rest of the cleanup is done in | 2746 | * thread-death.) The rest of the cleanup is done in |
2751 | * userspace. | 2747 | * userspace. |
2752 | */ | 2748 | */ |
2753 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; | 2749 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; |
2754 | /* | 2750 | /* |
2755 | * We are not holding a lock here, but we want to have | 2751 | * We are not holding a lock here, but we want to have |
2756 | * the pagefault_disable/enable() protection because | 2752 | * the pagefault_disable/enable() protection because |
2757 | * we want to handle the fault gracefully. If the | 2753 | * we want to handle the fault gracefully. If the |
2758 | * access fails we try to fault in the futex with R/W | 2754 | * access fails we try to fault in the futex with R/W |
2759 | * verification via get_user_pages. get_user() above | 2755 | * verification via get_user_pages. get_user() above |
2760 | * does not guarantee R/W access. If that fails we | 2756 | * does not guarantee R/W access. If that fails we |
2761 | * give up and leave the futex locked. | 2757 | * give up and leave the futex locked. |
2762 | */ | 2758 | */ |
2763 | if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { | 2759 | if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { |
2764 | if (fault_in_user_writeable(uaddr)) | 2760 | if (fault_in_user_writeable(uaddr)) |
2765 | return -1; | 2761 | return -1; |
2766 | goto retry; | 2762 | goto retry; |
2767 | } | 2763 | } |
2768 | if (nval != uval) | 2764 | if (nval != uval) |
2769 | goto retry; | 2765 | goto retry; |
2770 | 2766 | ||
2771 | /* | 2767 | /* |
2772 | * Wake robust non-PI futexes here. The wakeup of | 2768 | * Wake robust non-PI futexes here. The wakeup of |
2773 | * PI futexes happens in exit_pi_state(): | 2769 | * PI futexes happens in exit_pi_state(): |
2774 | */ | 2770 | */ |
2775 | if (!pi && (uval & FUTEX_WAITERS)) | 2771 | if (!pi && (uval & FUTEX_WAITERS)) |
2776 | futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); | 2772 | futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); |
2777 | } | 2773 | } |
2778 | return 0; | 2774 | return 0; |
2779 | } | 2775 | } |
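The mval written above is exactly what the next acquirer reads back. A hedged sketch of that userspace view (pthread-style EOWNERDEAD recovery; names are illustrative, and kernel arbitration for queued waiters is elided):

    #include <linux/futex.h>
    #include <errno.h>
    #include <stdint.h>

    static int robust_lock_fastpath(uint32_t *futex_word, uint32_t tid)
    {
            uint32_t expected = 0;

            /* Normal fast path: 0 -> tid. */
            if (__atomic_compare_exchange_n(futex_word, &expected, tid, 0,
                                            __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
                    return 0;

            /* Dead owner: TID bits cleared, FUTEX_OWNER_DIED set and
             * FUTEX_WAITERS preserved - the mval above. Take the lock over,
             * keep OWNER_DIED until the caller marks the state consistent,
             * and report EOWNERDEAD so it knows to repair shared state. */
            if ((expected & FUTEX_TID_MASK) == 0 &&
                (expected & FUTEX_OWNER_DIED)) {
                    uint32_t mine = tid | FUTEX_OWNER_DIED |
                                    (expected & FUTEX_WAITERS);
                    if (__atomic_compare_exchange_n(futex_word, &expected, mine,
                                                    0, __ATOMIC_ACQUIRE,
                                                    __ATOMIC_RELAXED))
                            return EOWNERDEAD;
            }
            return EBUSY;                    /* live owner: futex-wait instead */
    }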
2780 | 2776 | ||
2781 | /* | 2777 | /* |
2782 | * Fetch a robust-list pointer. Bit 0 signals PI futexes: | 2778 | * Fetch a robust-list pointer. Bit 0 signals PI futexes: |
2783 | */ | 2779 | */ |
2784 | static inline int fetch_robust_entry(struct robust_list __user **entry, | 2780 | static inline int fetch_robust_entry(struct robust_list __user **entry, |
2785 | struct robust_list __user * __user *head, | 2781 | struct robust_list __user * __user *head, |
2786 | unsigned int *pi) | 2782 | unsigned int *pi) |
2787 | { | 2783 | { |
2788 | unsigned long uentry; | 2784 | unsigned long uentry; |
2789 | 2785 | ||
2790 | if (get_user(uentry, (unsigned long __user *)head)) | 2786 | if (get_user(uentry, (unsigned long __user *)head)) |
2791 | return -EFAULT; | 2787 | return -EFAULT; |
2792 | 2788 | ||
2793 | *entry = (void __user *)(uentry & ~1UL); | 2789 | *entry = (void __user *)(uentry & ~1UL); |
2794 | *pi = uentry & 1; | 2790 | *pi = uentry & 1; |
2795 | 2791 | ||
2796 | return 0; | 2792 | return 0; |
2797 | } | 2793 | } |
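The bit stripped here is set by userspace when it stores the entry; a hedged sketch of the encoding side (word-aligned nodes leave bit 0 free):

    #include <linux/futex.h>
    #include <stdint.h>

    static struct robust_list *robust_tag(struct robust_list *entry, int is_pi)
    {
            /* Bit 0 of a stored pointer marks a PI futex; fetch_robust_entry()
             * above masks it off and reports it separately. */
            return (struct robust_list *)((uintptr_t)entry | (is_pi ? 1UL : 0UL));
    }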
2798 | 2794 | ||
2799 | /* | 2795 | /* |
2800 | * Walk curr->robust_list (very carefully, it's a userspace list!) | 2796 | * Walk curr->robust_list (very carefully, it's a userspace list!) |
2801 | * and mark any locks found there dead, and notify any waiters. | 2797 | * and mark any locks found there dead, and notify any waiters. |
2802 | * | 2798 | * |
2803 | * We silently return on any sign of list-walking problem. | 2799 | * We silently return on any sign of list-walking problem. |
2804 | */ | 2800 | */ |
2805 | void exit_robust_list(struct task_struct *curr) | 2801 | void exit_robust_list(struct task_struct *curr) |
2806 | { | 2802 | { |
2807 | struct robust_list_head __user *head = curr->robust_list; | 2803 | struct robust_list_head __user *head = curr->robust_list; |
2808 | struct robust_list __user *entry, *next_entry, *pending; | 2804 | struct robust_list __user *entry, *next_entry, *pending; |
2809 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; | 2805 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; |
2810 | unsigned int uninitialized_var(next_pi); | 2806 | unsigned int uninitialized_var(next_pi); |
2811 | unsigned long futex_offset; | 2807 | unsigned long futex_offset; |
2812 | int rc; | 2808 | int rc; |
2813 | 2809 | ||
2814 | if (!futex_cmpxchg_enabled) | 2810 | if (!futex_cmpxchg_enabled) |
2815 | return; | 2811 | return; |
2816 | 2812 | ||
2817 | /* | 2813 | /* |
2818 | * Fetch the list head (which was registered earlier, via | 2814 | * Fetch the list head (which was registered earlier, via |
2819 | * sys_set_robust_list()): | 2815 | * sys_set_robust_list()): |
2820 | */ | 2816 | */ |
2821 | if (fetch_robust_entry(&entry, &head->list.next, &pi)) | 2817 | if (fetch_robust_entry(&entry, &head->list.next, &pi)) |
2822 | return; | 2818 | return; |
2823 | /* | 2819 | /* |
2824 | * Fetch the relative futex offset: | 2820 | * Fetch the relative futex offset: |
2825 | */ | 2821 | */ |
2826 | if (get_user(futex_offset, &head->futex_offset)) | 2822 | if (get_user(futex_offset, &head->futex_offset)) |
2827 | return; | 2823 | return; |
2828 | /* | 2824 | /* |
2829 | * Fetch any possibly pending lock-add first, and handle it | 2825 | * Fetch any possibly pending lock-add first, and handle it |
2830 | * if it exists: | 2826 | * if it exists: |
2831 | */ | 2827 | */ |
2832 | if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) | 2828 | if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) |
2833 | return; | 2829 | return; |
2834 | 2830 | ||
2835 | next_entry = NULL; /* avoid warning with gcc */ | 2831 | next_entry = NULL; /* avoid warning with gcc */ |
2836 | while (entry != &head->list) { | 2832 | while (entry != &head->list) { |
2837 | /* | 2833 | /* |
2838 | * Fetch the next entry in the list before calling | 2834 | * Fetch the next entry in the list before calling |
2839 | * handle_futex_death: | 2835 | * handle_futex_death: |
2840 | */ | 2836 | */ |
2841 | rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); | 2837 | rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); |
2842 | /* | 2838 | /* |
2843 | * A pending lock might already be on the list, so | 2839 | * A pending lock might already be on the list, so |
2844 | * don't process it twice: | 2840 | * don't process it twice: |
2845 | */ | 2841 | */ |
2846 | if (entry != pending) | 2842 | if (entry != pending) |
2847 | if (handle_futex_death((void __user *)entry + futex_offset, | 2843 | if (handle_futex_death((void __user *)entry + futex_offset, |
2848 | curr, pi)) | 2844 | curr, pi)) |
2849 | return; | 2845 | return; |
2850 | if (rc) | 2846 | if (rc) |
2851 | return; | 2847 | return; |
2852 | entry = next_entry; | 2848 | entry = next_entry; |
2853 | pi = next_pi; | 2849 | pi = next_pi; |
2854 | /* | 2850 | /* |
2855 | * Avoid excessively long or circular lists: | 2851 | * Avoid excessively long or circular lists: |
2856 | */ | 2852 | */ |
2857 | if (!--limit) | 2853 | if (!--limit) |
2858 | break; | 2854 | break; |
2859 | 2855 | ||
2860 | cond_resched(); | 2856 | cond_resched(); |
2861 | } | 2857 | } |
2862 | 2858 | ||
2863 | if (pending) | 2859 | if (pending) |
2864 | handle_futex_death((void __user *)pending + futex_offset, | 2860 | handle_futex_death((void __user *)pending + futex_offset, |
2865 | curr, pip); | 2861 | curr, pip); |
2866 | } | 2862 | } |
2867 | 2863 | ||
2868 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | 2864 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, |
2869 | u32 __user *uaddr2, u32 val2, u32 val3) | 2865 | u32 __user *uaddr2, u32 val2, u32 val3) |
2870 | { | 2866 | { |
2871 | int cmd = op & FUTEX_CMD_MASK; | 2867 | int cmd = op & FUTEX_CMD_MASK; |
2872 | unsigned int flags = 0; | 2868 | unsigned int flags = 0; |
2873 | 2869 | ||
2874 | if (!(op & FUTEX_PRIVATE_FLAG)) | 2870 | if (!(op & FUTEX_PRIVATE_FLAG)) |
2875 | flags |= FLAGS_SHARED; | 2871 | flags |= FLAGS_SHARED; |
2876 | 2872 | ||
2877 | if (op & FUTEX_CLOCK_REALTIME) { | 2873 | if (op & FUTEX_CLOCK_REALTIME) { |
2878 | flags |= FLAGS_CLOCKRT; | 2874 | flags |= FLAGS_CLOCKRT; |
2879 | if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) | 2875 | if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) |
2880 | return -ENOSYS; | 2876 | return -ENOSYS; |
2881 | } | 2877 | } |
2882 | 2878 | ||
2883 | switch (cmd) { | 2879 | switch (cmd) { |
2884 | case FUTEX_LOCK_PI: | 2880 | case FUTEX_LOCK_PI: |
2885 | case FUTEX_UNLOCK_PI: | 2881 | case FUTEX_UNLOCK_PI: |
2886 | case FUTEX_TRYLOCK_PI: | 2882 | case FUTEX_TRYLOCK_PI: |
2887 | case FUTEX_WAIT_REQUEUE_PI: | 2883 | case FUTEX_WAIT_REQUEUE_PI: |
2888 | case FUTEX_CMP_REQUEUE_PI: | 2884 | case FUTEX_CMP_REQUEUE_PI: |
2889 | if (!futex_cmpxchg_enabled) | 2885 | if (!futex_cmpxchg_enabled) |
2890 | return -ENOSYS; | 2886 | return -ENOSYS; |
2891 | } | 2887 | } |
2892 | 2888 | ||
2893 | switch (cmd) { | 2889 | switch (cmd) { |
2894 | case FUTEX_WAIT: | 2890 | case FUTEX_WAIT: |
2895 | val3 = FUTEX_BITSET_MATCH_ANY; | 2891 | val3 = FUTEX_BITSET_MATCH_ANY; |
2896 | case FUTEX_WAIT_BITSET: | 2892 | case FUTEX_WAIT_BITSET: |
2897 | return futex_wait(uaddr, flags, val, timeout, val3); | 2893 | return futex_wait(uaddr, flags, val, timeout, val3); |
2898 | case FUTEX_WAKE: | 2894 | case FUTEX_WAKE: |
2899 | val3 = FUTEX_BITSET_MATCH_ANY; | 2895 | val3 = FUTEX_BITSET_MATCH_ANY; |
2900 | case FUTEX_WAKE_BITSET: | 2896 | case FUTEX_WAKE_BITSET: |
2901 | return futex_wake(uaddr, flags, val, val3); | 2897 | return futex_wake(uaddr, flags, val, val3); |
2902 | case FUTEX_REQUEUE: | 2898 | case FUTEX_REQUEUE: |
2903 | return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); | 2899 | return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); |
2904 | case FUTEX_CMP_REQUEUE: | 2900 | case FUTEX_CMP_REQUEUE: |
2905 | return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); | 2901 | return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); |
2906 | case FUTEX_WAKE_OP: | 2902 | case FUTEX_WAKE_OP: |
2907 | return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); | 2903 | return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); |
2908 | case FUTEX_LOCK_PI: | 2904 | case FUTEX_LOCK_PI: |
2909 | return futex_lock_pi(uaddr, flags, val, timeout, 0); | 2905 | return futex_lock_pi(uaddr, flags, val, timeout, 0); |
2910 | case FUTEX_UNLOCK_PI: | 2906 | case FUTEX_UNLOCK_PI: |
2911 | return futex_unlock_pi(uaddr, flags); | 2907 | return futex_unlock_pi(uaddr, flags); |
2912 | case FUTEX_TRYLOCK_PI: | 2908 | case FUTEX_TRYLOCK_PI: |
2913 | return futex_lock_pi(uaddr, flags, 0, timeout, 1); | 2909 | return futex_lock_pi(uaddr, flags, 0, timeout, 1); |
2914 | case FUTEX_WAIT_REQUEUE_PI: | 2910 | case FUTEX_WAIT_REQUEUE_PI: |
2915 | val3 = FUTEX_BITSET_MATCH_ANY; | 2911 | val3 = FUTEX_BITSET_MATCH_ANY; |
2916 | return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, | 2912 | return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, |
2917 | uaddr2); | 2913 | uaddr2); |
2918 | case FUTEX_CMP_REQUEUE_PI: | 2914 | case FUTEX_CMP_REQUEUE_PI: |
2919 | return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); | 2915 | return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); |
2920 | } | 2916 | } |
2921 | return -ENOSYS; | 2917 | return -ENOSYS; |
2922 | } | 2918 | } |
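glibc provides no futex() wrapper, so userspace reaches this multiplexer through syscall(2). Note the fourth argument: a timeout pointer for the wait commands, but a plain count (val2) smuggled through the same slot for the requeue and wake-op commands, exactly as the syscall entry below decodes it. A hedged sketch:

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>
    #include <time.h>

    static long futex(uint32_t *uaddr, int op, uint32_t val,
                      const struct timespec *timeout_or_val2,
                      uint32_t *uaddr2, uint32_t val3)
    {
            return syscall(SYS_futex, uaddr, op, val, timeout_or_val2,
                           uaddr2, val3);
    }

    static long requeue_waiters(uint32_t *from, uint32_t *to, uint32_t v)
    {
            /* Wake one waiter, move up to 100 more from from to to; the
             * timeout slot carries nr_requeue, and v guards *from. */
            return futex(from, FUTEX_CMP_REQUEUE, 1,
                         (const struct timespec *)(uintptr_t)100, to, v);
    }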
2923 | 2919 | ||
2924 | 2920 | ||
2925 | SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | 2921 | SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, |
2926 | struct timespec __user *, utime, u32 __user *, uaddr2, | 2922 | struct timespec __user *, utime, u32 __user *, uaddr2, |
2927 | u32, val3) | 2923 | u32, val3) |
2928 | { | 2924 | { |
2929 | struct timespec ts; | 2925 | struct timespec ts; |
2930 | ktime_t t, *tp = NULL; | 2926 | ktime_t t, *tp = NULL; |
2931 | u32 val2 = 0; | 2927 | u32 val2 = 0; |
2932 | int cmd = op & FUTEX_CMD_MASK; | 2928 | int cmd = op & FUTEX_CMD_MASK; |
2933 | 2929 | ||
2934 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || | 2930 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || |
2935 | cmd == FUTEX_WAIT_BITSET || | 2931 | cmd == FUTEX_WAIT_BITSET || |
2936 | cmd == FUTEX_WAIT_REQUEUE_PI)) { | 2932 | cmd == FUTEX_WAIT_REQUEUE_PI)) { |
2937 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) | 2933 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) |
2938 | return -EFAULT; | 2934 | return -EFAULT; |
2939 | if (!timespec_valid(&ts)) | 2935 | if (!timespec_valid(&ts)) |
2940 | return -EINVAL; | 2936 | return -EINVAL; |
2941 | 2937 | ||
2942 | t = timespec_to_ktime(ts); | 2938 | t = timespec_to_ktime(ts); |
2943 | if (cmd == FUTEX_WAIT) | 2939 | if (cmd == FUTEX_WAIT) |
2944 | t = ktime_add_safe(ktime_get(), t); | 2940 | t = ktime_add_safe(ktime_get(), t); |
2945 | tp = &t; | 2941 | tp = &t; |
2946 | } | 2942 | } |
2947 | /* | 2943 | /* |
2948 | * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. | 2944 | * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. |
2949 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. | 2945 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. |
2950 | */ | 2946 | */ |
2951 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || | 2947 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || |
2952 | cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) | 2948 | cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) |
2953 | val2 = (u32) (unsigned long) utime; | 2949 | val2 = (u32) (unsigned long) utime; |
2954 | 2950 | ||
2955 | return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); | 2951 | return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); |
2956 | } | 2952 | } |
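One subtlety worth a sketch (reusing the hedged futex() wrapper above): FUTEX_WAIT is the only wait command whose timeout is relative, converted to absolute here via ktime_add_safe(); FUTEX_WAIT_BITSET and the PI waits take absolute times.

    #include <errno.h>
    #include <stdint.h>
    #include <time.h>

    static int wait_up_to_one_second(uint32_t *word, uint32_t expected)
    {
            struct timespec rel = { .tv_sec = 1, .tv_nsec = 0 };

            if (futex(word, FUTEX_WAIT, expected, &rel, NULL, 0) == 0)
                    return 0;        /* woken (possibly spuriously: re-check) */
            return errno;            /* ETIMEDOUT, EAGAIN or EINTR */
    }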
2957 | 2953 | ||
2958 | static void __init futex_detect_cmpxchg(void) | 2954 | static void __init futex_detect_cmpxchg(void) |
2959 | { | 2955 | { |
2960 | #ifndef CONFIG_HAVE_FUTEX_CMPXCHG | 2956 | #ifndef CONFIG_HAVE_FUTEX_CMPXCHG |
2961 | u32 curval; | 2957 | u32 curval; |
2962 | 2958 | ||
2963 | /* | 2959 | /* |
2964 | * This will fail and we want it. Some arch implementations do | 2960 | * This will fail and we want it. Some arch implementations do |
2965 | * runtime detection of the futex_atomic_cmpxchg_inatomic() | 2961 | * runtime detection of the futex_atomic_cmpxchg_inatomic() |
2966 | * functionality. We want to know that before we call in any | 2962 | * functionality. We want to know that before we call in any |
2967 | * of the complex code paths. Also we want to prevent | 2963 | * of the complex code paths. Also we want to prevent |
2968 | * registration of robust lists in that case. NULL is | 2964 | * registration of robust lists in that case. NULL is |
2969 | * guaranteed to fault and we get -EFAULT on functional | 2965 | * guaranteed to fault and we get -EFAULT on functional |
2970 | * implementation, the non-functional ones will return | 2966 | * implementation, the non-functional ones will return |
2971 | * -ENOSYS. | 2967 | * -ENOSYS. |
2972 | */ | 2968 | */ |
2973 | if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) | 2969 | if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) |
2974 | futex_cmpxchg_enabled = 1; | 2970 | futex_cmpxchg_enabled = 1; |
2975 | #endif | 2971 | #endif |
2976 | } | 2972 | } |
2977 | 2973 | ||
2978 | static int __init futex_init(void) | 2974 | static int __init futex_init(void) |
2979 | { | 2975 | { |
2980 | unsigned int futex_shift; | 2976 | unsigned int futex_shift; |
2981 | unsigned long i; | 2977 | unsigned long i; |
2982 | 2978 | ||
2983 | #if CONFIG_BASE_SMALL | 2979 | #if CONFIG_BASE_SMALL |
2984 | futex_hashsize = 16; | 2980 | futex_hashsize = 16; |
2985 | #else | 2981 | #else |
2986 | futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); | 2982 | futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); |
2987 | #endif | 2983 | #endif |
2988 | 2984 | ||
2989 | futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), | 2985 | futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), |
2990 | futex_hashsize, 0, | 2986 | futex_hashsize, 0, |
2991 | futex_hashsize < 256 ? HASH_SMALL : 0, | 2987 | futex_hashsize < 256 ? HASH_SMALL : 0, |
2992 | &futex_shift, NULL, | 2988 | &futex_shift, NULL, |
2993 | futex_hashsize, futex_hashsize); | 2989 | futex_hashsize, futex_hashsize); |
2994 | futex_hashsize = 1UL << futex_shift; | 2990 | futex_hashsize = 1UL << futex_shift; |
2995 | 2991 | ||
2996 | futex_detect_cmpxchg(); | 2992 | futex_detect_cmpxchg(); |
2997 | 2993 | ||
2998 | for (i = 0; i < futex_hashsize; i++) { | 2994 | for (i = 0; i < futex_hashsize; i++) { |
2999 | atomic_set(&futex_queues[i].waiters, 0); | 2995 | atomic_set(&futex_queues[i].waiters, 0); |
3000 | plist_head_init(&futex_queues[i].chain); | 2996 | plist_head_init(&futex_queues[i].chain); |
3001 | spin_lock_init(&futex_queues[i].lock); | 2997 | spin_lock_init(&futex_queues[i].lock); |
3002 | } | 2998 | } |
3003 | 2999 | ||
3004 | return 0; | 3000 | return 0; |
3005 | } | 3001 | } |
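As a worked example of the sizing above: with 16 possible CPUs the default table is roundup_pow_of_two(256 * 16) = 4096 buckets (the product is already a power of two), while CONFIG_BASE_SMALL builds stay at 16. alloc_large_system_hash() reports back the shift it actually used, and futex_hashsize is recomputed from it as 1UL << futex_shift.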
-
mentioned in commit 19040c