Commit 13fbca4c6ecd96ec1a1cfa2e4f2ce191fe928a5e

Authored by Thomas Gleixner
Committed by Linus Torvalds
1 parent b3eaa9fc5c

futex: Always cleanup owner tid in unlock_pi

If the owner died bit is set at futex_unlock_pi, we currently do not
clean up the user space futex.  So the owner TID of the current owner
(the unlocker) persists.  That is observable inconsistent state,
especially when ownership of the pi state has been transferred.

Clean it up unconditionally.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Will Drewry <wad@chromium.org>
Cc: Darren Hart <dvhart@linux.intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 18 additions and 22 deletions
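For orientation before the context listing below, here is a minimal sketch of the invariant the patch enforces. This is not the committed hunk (the changed lines in the unlock path fall below the context shown here), and the helper name unlock_clear_tid() is made up for illustration; it only uses cmpxchg_futex_value_locked() from this file and the FUTEX_* bits from the uapi futex header.

/*
 * Illustrative sketch only, not part of kernel/futex.c: on unlock, clear
 * the owner TID in the user space word unconditionally, keeping only the
 * state bits (FUTEX_OWNER_DIED / FUTEX_WAITERS), instead of skipping the
 * cleanup when the owner died bit is set.
 */
static int unlock_clear_tid(u32 __user *uaddr, u32 uval)
{
	u32 newval = uval & ~FUTEX_TID_MASK;	/* drop the unlocker's TID, keep state bits */
	u32 curval;

	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
		return -EFAULT;		/* fault on the user space word */
	if (curval != uval)
		return -EAGAIN;		/* word changed under us, caller would retry */
	return 0;
}

The point is simply that the TID cleanup happens regardless of whether FUTEX_OWNER_DIED is set, so user space never observes the unlocker's TID on an unlocked futex.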

1 /* 1 /*
2 * Fast Userspace Mutexes (which I call "Futexes!"). 2 * Fast Userspace Mutexes (which I call "Futexes!").
3 * (C) Rusty Russell, IBM 2002 3 * (C) Rusty Russell, IBM 2002
4 * 4 *
5 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar 5 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
6 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved 6 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
7 * 7 *
8 * Removed page pinning, fix privately mapped COW pages and other cleanups 8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier 9 * (C) Copyright 2003, 2004 Jamie Lokier
10 * 10 *
11 * Robust futex support started by Ingo Molnar 11 * Robust futex support started by Ingo Molnar
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 * 14 *
15 * PI-futex support started by Ingo Molnar and Thomas Gleixner 15 * PI-futex support started by Ingo Molnar and Thomas Gleixner
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 * 18 *
19 * PRIVATE futexes by Eric Dumazet 19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 * 21 *
22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> 22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23 * Copyright (C) IBM Corporation, 2009 23 * Copyright (C) IBM Corporation, 2009
24 * Thanks to Thomas Gleixner for conceptual design and careful reviews. 24 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
25 * 25 *
26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
27 * enough at me, Linus for the original (flawed) idea, Matthew 27 * enough at me, Linus for the original (flawed) idea, Matthew
28 * Kirkwood for proof-of-concept implementation. 28 * Kirkwood for proof-of-concept implementation.
29 * 29 *
30 * "The futexes are also cursed." 30 * "The futexes are also cursed."
31 * "But they come in a choice of three flavours!" 31 * "But they come in a choice of three flavours!"
32 * 32 *
33 * This program is free software; you can redistribute it and/or modify 33 * This program is free software; you can redistribute it and/or modify
34 * it under the terms of the GNU General Public License as published by 34 * it under the terms of the GNU General Public License as published by
35 * the Free Software Foundation; either version 2 of the License, or 35 * the Free Software Foundation; either version 2 of the License, or
36 * (at your option) any later version. 36 * (at your option) any later version.
37 * 37 *
38 * This program is distributed in the hope that it will be useful, 38 * This program is distributed in the hope that it will be useful,
39 * but WITHOUT ANY WARRANTY; without even the implied warranty of 39 * but WITHOUT ANY WARRANTY; without even the implied warranty of
40 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 40 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
41 * GNU General Public License for more details. 41 * GNU General Public License for more details.
42 * 42 *
43 * You should have received a copy of the GNU General Public License 43 * You should have received a copy of the GNU General Public License
44 * along with this program; if not, write to the Free Software 44 * along with this program; if not, write to the Free Software
45 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 45 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
46 */ 46 */
47 #include <linux/slab.h> 47 #include <linux/slab.h>
48 #include <linux/poll.h> 48 #include <linux/poll.h>
49 #include <linux/fs.h> 49 #include <linux/fs.h>
50 #include <linux/file.h> 50 #include <linux/file.h>
51 #include <linux/jhash.h> 51 #include <linux/jhash.h>
52 #include <linux/init.h> 52 #include <linux/init.h>
53 #include <linux/futex.h> 53 #include <linux/futex.h>
54 #include <linux/mount.h> 54 #include <linux/mount.h>
55 #include <linux/pagemap.h> 55 #include <linux/pagemap.h>
56 #include <linux/syscalls.h> 56 #include <linux/syscalls.h>
57 #include <linux/signal.h> 57 #include <linux/signal.h>
58 #include <linux/export.h> 58 #include <linux/export.h>
59 #include <linux/magic.h> 59 #include <linux/magic.h>
60 #include <linux/pid.h> 60 #include <linux/pid.h>
61 #include <linux/nsproxy.h> 61 #include <linux/nsproxy.h>
62 #include <linux/ptrace.h> 62 #include <linux/ptrace.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 #include <linux/hugetlb.h> 64 #include <linux/hugetlb.h>
65 #include <linux/freezer.h> 65 #include <linux/freezer.h>
66 #include <linux/bootmem.h> 66 #include <linux/bootmem.h>
67 67
68 #include <asm/futex.h> 68 #include <asm/futex.h>
69 69
70 #include "locking/rtmutex_common.h" 70 #include "locking/rtmutex_common.h"
71 71
72 /* 72 /*
73 * READ this before attempting to hack on futexes! 73 * READ this before attempting to hack on futexes!
74 * 74 *
75 * Basic futex operation and ordering guarantees 75 * Basic futex operation and ordering guarantees
76 * ============================================= 76 * =============================================
77 * 77 *
78 * The waiter reads the futex value in user space and calls 78 * The waiter reads the futex value in user space and calls
79 * futex_wait(). This function computes the hash bucket and acquires 79 * futex_wait(). This function computes the hash bucket and acquires
80 * the hash bucket lock. After that it reads the futex user space value 80 * the hash bucket lock. After that it reads the futex user space value
81 * again and verifies that the data has not changed. If it has not changed 81 * again and verifies that the data has not changed. If it has not changed
82 * it enqueues itself into the hash bucket, releases the hash bucket lock 82 * it enqueues itself into the hash bucket, releases the hash bucket lock
83 * and schedules. 83 * and schedules.
84 * 84 *
85 * The waker side modifies the user space value of the futex and calls 85 * The waker side modifies the user space value of the futex and calls
86 * futex_wake(). This function computes the hash bucket and acquires the 86 * futex_wake(). This function computes the hash bucket and acquires the
87 * hash bucket lock. Then it looks for waiters on that futex in the hash 87 * hash bucket lock. Then it looks for waiters on that futex in the hash
88 * bucket and wakes them. 88 * bucket and wakes them.
89 * 89 *
90 * In futex wake up scenarios where no tasks are blocked on a futex, taking 90 * In futex wake up scenarios where no tasks are blocked on a futex, taking
91 * the hb spinlock can be avoided and simply return. In order for this 91 * the hb spinlock can be avoided and simply return. In order for this
92 * optimization to work, ordering guarantees must exist so that the waiter 92 * optimization to work, ordering guarantees must exist so that the waiter
93 * being added to the list is acknowledged when the list is concurrently being 93 * being added to the list is acknowledged when the list is concurrently being
94 * checked by the waker, avoiding scenarios like the following: 94 * checked by the waker, avoiding scenarios like the following:
95 * 95 *
96 * CPU 0 CPU 1 96 * CPU 0 CPU 1
97 * val = *futex; 97 * val = *futex;
98 * sys_futex(WAIT, futex, val); 98 * sys_futex(WAIT, futex, val);
99 * futex_wait(futex, val); 99 * futex_wait(futex, val);
100 * uval = *futex; 100 * uval = *futex;
101 * *futex = newval; 101 * *futex = newval;
102 * sys_futex(WAKE, futex); 102 * sys_futex(WAKE, futex);
103 * futex_wake(futex); 103 * futex_wake(futex);
104 * if (queue_empty()) 104 * if (queue_empty())
105 * return; 105 * return;
106 * if (uval == val) 106 * if (uval == val)
107 * lock(hash_bucket(futex)); 107 * lock(hash_bucket(futex));
108 * queue(); 108 * queue();
109 * unlock(hash_bucket(futex)); 109 * unlock(hash_bucket(futex));
110 * schedule(); 110 * schedule();
111 * 111 *
112 * This would cause the waiter on CPU 0 to wait forever because it 112 * This would cause the waiter on CPU 0 to wait forever because it
113 * missed the transition of the user space value from val to newval 113 * missed the transition of the user space value from val to newval
114 * and the waker did not find the waiter in the hash bucket queue. 114 * and the waker did not find the waiter in the hash bucket queue.
115 * 115 *
116 * The correct serialization ensures that a waiter either observes 116 * The correct serialization ensures that a waiter either observes
117 * the changed user space value before blocking or is woken by a 117 * the changed user space value before blocking or is woken by a
118 * concurrent waker: 118 * concurrent waker:
119 * 119 *
120 * CPU 0 CPU 1 120 * CPU 0 CPU 1
121 * val = *futex; 121 * val = *futex;
122 * sys_futex(WAIT, futex, val); 122 * sys_futex(WAIT, futex, val);
123 * futex_wait(futex, val); 123 * futex_wait(futex, val);
124 * 124 *
125 * waiters++; (a) 125 * waiters++; (a)
126 * mb(); (A) <-- paired with -. 126 * mb(); (A) <-- paired with -.
127 * | 127 * |
128 * lock(hash_bucket(futex)); | 128 * lock(hash_bucket(futex)); |
129 * | 129 * |
130 * uval = *futex; | 130 * uval = *futex; |
131 * | *futex = newval; 131 * | *futex = newval;
132 * | sys_futex(WAKE, futex); 132 * | sys_futex(WAKE, futex);
133 * | futex_wake(futex); 133 * | futex_wake(futex);
134 * | 134 * |
135 * `-------> mb(); (B) 135 * `-------> mb(); (B)
136 * if (uval == val) 136 * if (uval == val)
137 * queue(); 137 * queue();
138 * unlock(hash_bucket(futex)); 138 * unlock(hash_bucket(futex));
139 * schedule(); if (waiters) 139 * schedule(); if (waiters)
140 * lock(hash_bucket(futex)); 140 * lock(hash_bucket(futex));
141 * else wake_waiters(futex); 141 * else wake_waiters(futex);
142 * waiters--; (b) unlock(hash_bucket(futex)); 142 * waiters--; (b) unlock(hash_bucket(futex));
143 * 143 *
144 * Where (A) orders the waiters increment and the futex value read through 144 * Where (A) orders the waiters increment and the futex value read through
145 * atomic operations (see hb_waiters_inc) and where (B) orders the write 145 * atomic operations (see hb_waiters_inc) and where (B) orders the write
146 * to futex and the waiters read -- this is done by the barriers in 146 * to futex and the waiters read -- this is done by the barriers in
147 * get_futex_key_refs(), through either ihold or atomic_inc, depending on the 147 * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
148 * futex type. 148 * futex type.
149 * 149 *
150 * This yields the following case (where X:=waiters, Y:=futex): 150 * This yields the following case (where X:=waiters, Y:=futex):
151 * 151 *
152 * X = Y = 0 152 * X = Y = 0
153 * 153 *
154 * w[X]=1 w[Y]=1 154 * w[X]=1 w[Y]=1
155 * MB MB 155 * MB MB
156 * r[Y]=y r[X]=x 156 * r[Y]=y r[X]=x
157 * 157 *
158 * Which guarantees that x==0 && y==0 is impossible; which translates back into 158 * Which guarantees that x==0 && y==0 is impossible; which translates back into
159 * the guarantee that we cannot both miss the futex variable change and the 159 * the guarantee that we cannot both miss the futex variable change and the
160 * enqueue. 160 * enqueue.
161 * 161 *
162 * Note that a new waiter is accounted for in (a) even when it is possible that 162 * Note that a new waiter is accounted for in (a) even when it is possible that
163 * the wait call can return error, in which case we backtrack from it in (b). 163 * the wait call can return error, in which case we backtrack from it in (b).
164 * Refer to the comment in queue_lock(). 164 * Refer to the comment in queue_lock().
165 * 165 *
166 * Similarly, in order to account for waiters being requeued on another 166 * Similarly, in order to account for waiters being requeued on another
167 * address we always increment the waiters for the destination bucket before 167 * address we always increment the waiters for the destination bucket before
168 * acquiring the lock. It then decrements them again after releasing it - 168 * acquiring the lock. It then decrements them again after releasing it -
169 * the code that actually moves the futex(es) between hash buckets (requeue_futex) 169 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
170 * will do the additional required waiter count housekeeping. This is done for 170 * will do the additional required waiter count housekeeping. This is done for
171 * double_lock_hb() and double_unlock_hb(), respectively. 171 * double_lock_hb() and double_unlock_hb(), respectively.
172 */ 172 */
173 173
174 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG 174 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
175 int __read_mostly futex_cmpxchg_enabled; 175 int __read_mostly futex_cmpxchg_enabled;
176 #endif 176 #endif
177 177
178 /* 178 /*
179 * Futex flags used to encode options to functions and preserve them across 179 * Futex flags used to encode options to functions and preserve them across
180 * restarts. 180 * restarts.
181 */ 181 */
182 #define FLAGS_SHARED 0x01 182 #define FLAGS_SHARED 0x01
183 #define FLAGS_CLOCKRT 0x02 183 #define FLAGS_CLOCKRT 0x02
184 #define FLAGS_HAS_TIMEOUT 0x04 184 #define FLAGS_HAS_TIMEOUT 0x04
185 185
186 /* 186 /*
187 * Priority Inheritance state: 187 * Priority Inheritance state:
188 */ 188 */
189 struct futex_pi_state { 189 struct futex_pi_state {
190 /* 190 /*
191 * list of 'owned' pi_state instances - these have to be 191 * list of 'owned' pi_state instances - these have to be
192 * cleaned up in do_exit() if the task exits prematurely: 192 * cleaned up in do_exit() if the task exits prematurely:
193 */ 193 */
194 struct list_head list; 194 struct list_head list;
195 195
196 /* 196 /*
197 * The PI object: 197 * The PI object:
198 */ 198 */
199 struct rt_mutex pi_mutex; 199 struct rt_mutex pi_mutex;
200 200
201 struct task_struct *owner; 201 struct task_struct *owner;
202 atomic_t refcount; 202 atomic_t refcount;
203 203
204 union futex_key key; 204 union futex_key key;
205 }; 205 };
206 206
207 /** 207 /**
208 * struct futex_q - The hashed futex queue entry, one per waiting task 208 * struct futex_q - The hashed futex queue entry, one per waiting task
209 * @list: priority-sorted list of tasks waiting on this futex 209 * @list: priority-sorted list of tasks waiting on this futex
210 * @task: the task waiting on the futex 210 * @task: the task waiting on the futex
211 * @lock_ptr: the hash bucket lock 211 * @lock_ptr: the hash bucket lock
212 * @key: the key the futex is hashed on 212 * @key: the key the futex is hashed on
213 * @pi_state: optional priority inheritance state 213 * @pi_state: optional priority inheritance state
214 * @rt_waiter: rt_waiter storage for use with requeue_pi 214 * @rt_waiter: rt_waiter storage for use with requeue_pi
215 * @requeue_pi_key: the requeue_pi target futex key 215 * @requeue_pi_key: the requeue_pi target futex key
216 * @bitset: bitset for the optional bitmasked wakeup 216 * @bitset: bitset for the optional bitmasked wakeup
217 * 217 *
218 * We use this hashed waitqueue, instead of a normal wait_queue_t, so 218 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
219 * we can wake only the relevant ones (hashed queues may be shared). 219 * we can wake only the relevant ones (hashed queues may be shared).
220 * 220 *
221 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 221 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
222 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 222 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
223 * The order of wakeup is always to make the first condition true, then 223 * The order of wakeup is always to make the first condition true, then
224 * the second. 224 * the second.
225 * 225 *
226 * PI futexes are typically woken before they are removed from the hash list via 226 * PI futexes are typically woken before they are removed from the hash list via
227 * the rt_mutex code. See unqueue_me_pi(). 227 * the rt_mutex code. See unqueue_me_pi().
228 */ 228 */
229 struct futex_q { 229 struct futex_q {
230 struct plist_node list; 230 struct plist_node list;
231 231
232 struct task_struct *task; 232 struct task_struct *task;
233 spinlock_t *lock_ptr; 233 spinlock_t *lock_ptr;
234 union futex_key key; 234 union futex_key key;
235 struct futex_pi_state *pi_state; 235 struct futex_pi_state *pi_state;
236 struct rt_mutex_waiter *rt_waiter; 236 struct rt_mutex_waiter *rt_waiter;
237 union futex_key *requeue_pi_key; 237 union futex_key *requeue_pi_key;
238 u32 bitset; 238 u32 bitset;
239 }; 239 };
240 240
241 static const struct futex_q futex_q_init = { 241 static const struct futex_q futex_q_init = {
242 /* list gets initialized in queue_me()*/ 242 /* list gets initialized in queue_me()*/
243 .key = FUTEX_KEY_INIT, 243 .key = FUTEX_KEY_INIT,
244 .bitset = FUTEX_BITSET_MATCH_ANY 244 .bitset = FUTEX_BITSET_MATCH_ANY
245 }; 245 };
246 246
247 /* 247 /*
248 * Hash buckets are shared by all the futex_keys that hash to the same 248 * Hash buckets are shared by all the futex_keys that hash to the same
249 * location. Each key may have multiple futex_q structures, one for each task 249 * location. Each key may have multiple futex_q structures, one for each task
250 * waiting on a futex. 250 * waiting on a futex.
251 */ 251 */
252 struct futex_hash_bucket { 252 struct futex_hash_bucket {
253 atomic_t waiters; 253 atomic_t waiters;
254 spinlock_t lock; 254 spinlock_t lock;
255 struct plist_head chain; 255 struct plist_head chain;
256 } ____cacheline_aligned_in_smp; 256 } ____cacheline_aligned_in_smp;
257 257
258 static unsigned long __read_mostly futex_hashsize; 258 static unsigned long __read_mostly futex_hashsize;
259 259
260 static struct futex_hash_bucket *futex_queues; 260 static struct futex_hash_bucket *futex_queues;
261 261
262 static inline void futex_get_mm(union futex_key *key) 262 static inline void futex_get_mm(union futex_key *key)
263 { 263 {
264 atomic_inc(&key->private.mm->mm_count); 264 atomic_inc(&key->private.mm->mm_count);
265 /* 265 /*
266 * Ensure futex_get_mm() implies a full barrier such that 266 * Ensure futex_get_mm() implies a full barrier such that
267 * get_futex_key() implies a full barrier. This is relied upon 267 * get_futex_key() implies a full barrier. This is relied upon
268 * as full barrier (B), see the ordering comment above. 268 * as full barrier (B), see the ordering comment above.
269 */ 269 */
270 smp_mb__after_atomic_inc(); 270 smp_mb__after_atomic_inc();
271 } 271 }
272 272
273 /* 273 /*
274 * Reflects a new waiter being added to the waitqueue. 274 * Reflects a new waiter being added to the waitqueue.
275 */ 275 */
276 static inline void hb_waiters_inc(struct futex_hash_bucket *hb) 276 static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
277 { 277 {
278 #ifdef CONFIG_SMP 278 #ifdef CONFIG_SMP
279 atomic_inc(&hb->waiters); 279 atomic_inc(&hb->waiters);
280 /* 280 /*
281 * Full barrier (A), see the ordering comment above. 281 * Full barrier (A), see the ordering comment above.
282 */ 282 */
283 smp_mb__after_atomic_inc(); 283 smp_mb__after_atomic_inc();
284 #endif 284 #endif
285 } 285 }
286 286
287 /* 287 /*
288 * Reflects a waiter being removed from the waitqueue by wakeup 288 * Reflects a waiter being removed from the waitqueue by wakeup
289 * paths. 289 * paths.
290 */ 290 */
291 static inline void hb_waiters_dec(struct futex_hash_bucket *hb) 291 static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
292 { 292 {
293 #ifdef CONFIG_SMP 293 #ifdef CONFIG_SMP
294 atomic_dec(&hb->waiters); 294 atomic_dec(&hb->waiters);
295 #endif 295 #endif
296 } 296 }
297 297
298 static inline int hb_waiters_pending(struct futex_hash_bucket *hb) 298 static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
299 { 299 {
300 #ifdef CONFIG_SMP 300 #ifdef CONFIG_SMP
301 return atomic_read(&hb->waiters); 301 return atomic_read(&hb->waiters);
302 #else 302 #else
303 return 1; 303 return 1;
304 #endif 304 #endif
305 } 305 }
306 306
307 /* 307 /*
308 * We hash on the keys returned from get_futex_key (see below). 308 * We hash on the keys returned from get_futex_key (see below).
309 */ 309 */
310 static struct futex_hash_bucket *hash_futex(union futex_key *key) 310 static struct futex_hash_bucket *hash_futex(union futex_key *key)
311 { 311 {
312 u32 hash = jhash2((u32*)&key->both.word, 312 u32 hash = jhash2((u32*)&key->both.word,
313 (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 313 (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
314 key->both.offset); 314 key->both.offset);
315 return &futex_queues[hash & (futex_hashsize - 1)]; 315 return &futex_queues[hash & (futex_hashsize - 1)];
316 } 316 }
317 317
318 /* 318 /*
319 * Return 1 if two futex_keys are equal, 0 otherwise. 319 * Return 1 if two futex_keys are equal, 0 otherwise.
320 */ 320 */
321 static inline int match_futex(union futex_key *key1, union futex_key *key2) 321 static inline int match_futex(union futex_key *key1, union futex_key *key2)
322 { 322 {
323 return (key1 && key2 323 return (key1 && key2
324 && key1->both.word == key2->both.word 324 && key1->both.word == key2->both.word
325 && key1->both.ptr == key2->both.ptr 325 && key1->both.ptr == key2->both.ptr
326 && key1->both.offset == key2->both.offset); 326 && key1->both.offset == key2->both.offset);
327 } 327 }
328 328
329 /* 329 /*
330 * Take a reference to the resource addressed by a key. 330 * Take a reference to the resource addressed by a key.
331 * Can be called while holding spinlocks. 331 * Can be called while holding spinlocks.
332 * 332 *
333 */ 333 */
334 static void get_futex_key_refs(union futex_key *key) 334 static void get_futex_key_refs(union futex_key *key)
335 { 335 {
336 if (!key->both.ptr) 336 if (!key->both.ptr)
337 return; 337 return;
338 338
339 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 339 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
340 case FUT_OFF_INODE: 340 case FUT_OFF_INODE:
341 ihold(key->shared.inode); /* implies MB (B) */ 341 ihold(key->shared.inode); /* implies MB (B) */
342 break; 342 break;
343 case FUT_OFF_MMSHARED: 343 case FUT_OFF_MMSHARED:
344 futex_get_mm(key); /* implies MB (B) */ 344 futex_get_mm(key); /* implies MB (B) */
345 break; 345 break;
346 } 346 }
347 } 347 }
348 348
349 /* 349 /*
350 * Drop a reference to the resource addressed by a key. 350 * Drop a reference to the resource addressed by a key.
351 * The hash bucket spinlock must not be held. 351 * The hash bucket spinlock must not be held.
352 */ 352 */
353 static void drop_futex_key_refs(union futex_key *key) 353 static void drop_futex_key_refs(union futex_key *key)
354 { 354 {
355 if (!key->both.ptr) { 355 if (!key->both.ptr) {
356 /* If we're here then we tried to put a key we failed to get */ 356 /* If we're here then we tried to put a key we failed to get */
357 WARN_ON_ONCE(1); 357 WARN_ON_ONCE(1);
358 return; 358 return;
359 } 359 }
360 360
361 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 361 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
362 case FUT_OFF_INODE: 362 case FUT_OFF_INODE:
363 iput(key->shared.inode); 363 iput(key->shared.inode);
364 break; 364 break;
365 case FUT_OFF_MMSHARED: 365 case FUT_OFF_MMSHARED:
366 mmdrop(key->private.mm); 366 mmdrop(key->private.mm);
367 break; 367 break;
368 } 368 }
369 } 369 }
370 370
371 /** 371 /**
372 * get_futex_key() - Get parameters which are the keys for a futex 372 * get_futex_key() - Get parameters which are the keys for a futex
373 * @uaddr: virtual address of the futex 373 * @uaddr: virtual address of the futex
374 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 374 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
375 * @key: address where result is stored. 375 * @key: address where result is stored.
376 * @rw: mapping needs to be read/write (values: VERIFY_READ, 376 * @rw: mapping needs to be read/write (values: VERIFY_READ,
377 * VERIFY_WRITE) 377 * VERIFY_WRITE)
378 * 378 *
379 * Return: a negative error code or 0 379 * Return: a negative error code or 0
380 * 380 *
381 * The key words are stored in *key on success. 381 * The key words are stored in *key on success.
382 * 382 *
383 * For shared mappings, it's (page->index, file_inode(vma->vm_file), 383 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
384 * offset_within_page). For private mappings, it's (uaddr, current->mm). 384 * offset_within_page). For private mappings, it's (uaddr, current->mm).
385 * We can usually work out the index without swapping in the page. 385 * We can usually work out the index without swapping in the page.
386 * 386 *
387 * lock_page() might sleep, the caller should not hold a spinlock. 387 * lock_page() might sleep, the caller should not hold a spinlock.
388 */ 388 */
389 static int 389 static int
390 get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 390 get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
391 { 391 {
392 unsigned long address = (unsigned long)uaddr; 392 unsigned long address = (unsigned long)uaddr;
393 struct mm_struct *mm = current->mm; 393 struct mm_struct *mm = current->mm;
394 struct page *page, *page_head; 394 struct page *page, *page_head;
395 int err, ro = 0; 395 int err, ro = 0;
396 396
397 /* 397 /*
398 * The futex address must be "naturally" aligned. 398 * The futex address must be "naturally" aligned.
399 */ 399 */
400 key->both.offset = address % PAGE_SIZE; 400 key->both.offset = address % PAGE_SIZE;
401 if (unlikely((address % sizeof(u32)) != 0)) 401 if (unlikely((address % sizeof(u32)) != 0))
402 return -EINVAL; 402 return -EINVAL;
403 address -= key->both.offset; 403 address -= key->both.offset;
404 404
405 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 405 if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
406 return -EFAULT; 406 return -EFAULT;
407 407
408 /* 408 /*
409 * PROCESS_PRIVATE futexes are fast. 409 * PROCESS_PRIVATE futexes are fast.
410 * As the mm cannot disappear under us and the 'key' only needs 410 * As the mm cannot disappear under us and the 'key' only needs
411 * virtual address, we dont even have to find the underlying vma. 411 * virtual address, we dont even have to find the underlying vma.
412 * Note : We do have to check 'uaddr' is a valid user address, 412 * Note : We do have to check 'uaddr' is a valid user address,
413 * but access_ok() should be faster than find_vma() 413 * but access_ok() should be faster than find_vma()
414 */ 414 */
415 if (!fshared) { 415 if (!fshared) {
416 key->private.mm = mm; 416 key->private.mm = mm;
417 key->private.address = address; 417 key->private.address = address;
418 get_futex_key_refs(key); /* implies MB (B) */ 418 get_futex_key_refs(key); /* implies MB (B) */
419 return 0; 419 return 0;
420 } 420 }
421 421
422 again: 422 again:
423 err = get_user_pages_fast(address, 1, 1, &page); 423 err = get_user_pages_fast(address, 1, 1, &page);
424 /* 424 /*
425 * If write access is not required (eg. FUTEX_WAIT), try 425 * If write access is not required (eg. FUTEX_WAIT), try
426 * and get read-only access. 426 * and get read-only access.
427 */ 427 */
428 if (err == -EFAULT && rw == VERIFY_READ) { 428 if (err == -EFAULT && rw == VERIFY_READ) {
429 err = get_user_pages_fast(address, 1, 0, &page); 429 err = get_user_pages_fast(address, 1, 0, &page);
430 ro = 1; 430 ro = 1;
431 } 431 }
432 if (err < 0) 432 if (err < 0)
433 return err; 433 return err;
434 else 434 else
435 err = 0; 435 err = 0;
436 436
437 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 437 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
438 page_head = page; 438 page_head = page;
439 if (unlikely(PageTail(page))) { 439 if (unlikely(PageTail(page))) {
440 put_page(page); 440 put_page(page);
441 /* serialize against __split_huge_page_splitting() */ 441 /* serialize against __split_huge_page_splitting() */
442 local_irq_disable(); 442 local_irq_disable();
443 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { 443 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
444 page_head = compound_head(page); 444 page_head = compound_head(page);
445 /* 445 /*
446 * page_head is valid pointer but we must pin 446 * page_head is valid pointer but we must pin
447 * it before taking the PG_lock and/or 447 * it before taking the PG_lock and/or
448 * PG_compound_lock. The moment we re-enable 448 * PG_compound_lock. The moment we re-enable
449 * irqs __split_huge_page_splitting() can 449 * irqs __split_huge_page_splitting() can
450 * return and the head page can be freed from 450 * return and the head page can be freed from
451 * under us. We can't take the PG_lock and/or 451 * under us. We can't take the PG_lock and/or
452 * PG_compound_lock on a page that could be 452 * PG_compound_lock on a page that could be
453 * freed from under us. 453 * freed from under us.
454 */ 454 */
455 if (page != page_head) { 455 if (page != page_head) {
456 get_page(page_head); 456 get_page(page_head);
457 put_page(page); 457 put_page(page);
458 } 458 }
459 local_irq_enable(); 459 local_irq_enable();
460 } else { 460 } else {
461 local_irq_enable(); 461 local_irq_enable();
462 goto again; 462 goto again;
463 } 463 }
464 } 464 }
465 #else 465 #else
466 page_head = compound_head(page); 466 page_head = compound_head(page);
467 if (page != page_head) { 467 if (page != page_head) {
468 get_page(page_head); 468 get_page(page_head);
469 put_page(page); 469 put_page(page);
470 } 470 }
471 #endif 471 #endif
472 472
473 lock_page(page_head); 473 lock_page(page_head);
474 474
475 /* 475 /*
476 * If page_head->mapping is NULL, then it cannot be a PageAnon 476 * If page_head->mapping is NULL, then it cannot be a PageAnon
477 * page; but it might be the ZERO_PAGE or in the gate area or 477 * page; but it might be the ZERO_PAGE or in the gate area or
478 * in a special mapping (all cases which we are happy to fail); 478 * in a special mapping (all cases which we are happy to fail);
479 * or it may have been a good file page when get_user_pages_fast 479 * or it may have been a good file page when get_user_pages_fast
480 * found it, but truncated or holepunched or subjected to 480 * found it, but truncated or holepunched or subjected to
481 * invalidate_complete_page2 before we got the page lock (also 481 * invalidate_complete_page2 before we got the page lock (also
482 * cases which we are happy to fail). And we hold a reference, 482 * cases which we are happy to fail). And we hold a reference,
483 * so refcount care in invalidate_complete_page's remove_mapping 483 * so refcount care in invalidate_complete_page's remove_mapping
484 * prevents drop_caches from setting mapping to NULL beneath us. 484 * prevents drop_caches from setting mapping to NULL beneath us.
485 * 485 *
486 * The case we do have to guard against is when memory pressure made 486 * The case we do have to guard against is when memory pressure made
487 * shmem_writepage move it from filecache to swapcache beneath us: 487 * shmem_writepage move it from filecache to swapcache beneath us:
488 * an unlikely race, but we do need to retry for page_head->mapping. 488 * an unlikely race, but we do need to retry for page_head->mapping.
489 */ 489 */
490 if (!page_head->mapping) { 490 if (!page_head->mapping) {
491 int shmem_swizzled = PageSwapCache(page_head); 491 int shmem_swizzled = PageSwapCache(page_head);
492 unlock_page(page_head); 492 unlock_page(page_head);
493 put_page(page_head); 493 put_page(page_head);
494 if (shmem_swizzled) 494 if (shmem_swizzled)
495 goto again; 495 goto again;
496 return -EFAULT; 496 return -EFAULT;
497 } 497 }
498 498
499 /* 499 /*
500 * Private mappings are handled in a simple way. 500 * Private mappings are handled in a simple way.
501 * 501 *
502 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 502 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
503 * it's a read-only handle, it's expected that futexes attach to 503 * it's a read-only handle, it's expected that futexes attach to
504 * the object not the particular process. 504 * the object not the particular process.
505 */ 505 */
506 if (PageAnon(page_head)) { 506 if (PageAnon(page_head)) {
507 /* 507 /*
508 * A RO anonymous page will never change and thus doesn't make 508 * A RO anonymous page will never change and thus doesn't make
509 * sense for futex operations. 509 * sense for futex operations.
510 */ 510 */
511 if (ro) { 511 if (ro) {
512 err = -EFAULT; 512 err = -EFAULT;
513 goto out; 513 goto out;
514 } 514 }
515 515
516 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 516 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
517 key->private.mm = mm; 517 key->private.mm = mm;
518 key->private.address = address; 518 key->private.address = address;
519 } else { 519 } else {
520 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 520 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
521 key->shared.inode = page_head->mapping->host; 521 key->shared.inode = page_head->mapping->host;
522 key->shared.pgoff = basepage_index(page); 522 key->shared.pgoff = basepage_index(page);
523 } 523 }
524 524
525 get_futex_key_refs(key); /* implies MB (B) */ 525 get_futex_key_refs(key); /* implies MB (B) */
526 526
527 out: 527 out:
528 unlock_page(page_head); 528 unlock_page(page_head);
529 put_page(page_head); 529 put_page(page_head);
530 return err; 530 return err;
531 } 531 }
532 532
533 static inline void put_futex_key(union futex_key *key) 533 static inline void put_futex_key(union futex_key *key)
534 { 534 {
535 drop_futex_key_refs(key); 535 drop_futex_key_refs(key);
536 } 536 }
537 537
538 /** 538 /**
539 * fault_in_user_writeable() - Fault in user address and verify RW access 539 * fault_in_user_writeable() - Fault in user address and verify RW access
540 * @uaddr: pointer to faulting user space address 540 * @uaddr: pointer to faulting user space address
541 * 541 *
542 * Slow path to fixup the fault we just took in the atomic write 542 * Slow path to fixup the fault we just took in the atomic write
543 * access to @uaddr. 543 * access to @uaddr.
544 * 544 *
545 * We have no generic implementation of a non-destructive write to the 545 * We have no generic implementation of a non-destructive write to the
546 * user address. We know that we faulted in the atomic pagefault 546 * user address. We know that we faulted in the atomic pagefault
547 * disabled section so we can as well avoid the #PF overhead by 547 * disabled section so we can as well avoid the #PF overhead by
548 * calling get_user_pages() right away. 548 * calling get_user_pages() right away.
549 */ 549 */
550 static int fault_in_user_writeable(u32 __user *uaddr) 550 static int fault_in_user_writeable(u32 __user *uaddr)
551 { 551 {
552 struct mm_struct *mm = current->mm; 552 struct mm_struct *mm = current->mm;
553 int ret; 553 int ret;
554 554
555 down_read(&mm->mmap_sem); 555 down_read(&mm->mmap_sem);
556 ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 556 ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
557 FAULT_FLAG_WRITE); 557 FAULT_FLAG_WRITE);
558 up_read(&mm->mmap_sem); 558 up_read(&mm->mmap_sem);
559 559
560 return ret < 0 ? ret : 0; 560 return ret < 0 ? ret : 0;
561 } 561 }
562 562
563 /** 563 /**
564 * futex_top_waiter() - Return the highest priority waiter on a futex 564 * futex_top_waiter() - Return the highest priority waiter on a futex
565 * @hb: the hash bucket the futex_q's reside in 565 * @hb: the hash bucket the futex_q's reside in
566 * @key: the futex key (to distinguish it from other futex futex_q's) 566 * @key: the futex key (to distinguish it from other futex futex_q's)
567 * 567 *
568 * Must be called with the hb lock held. 568 * Must be called with the hb lock held.
569 */ 569 */
570 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, 570 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
571 union futex_key *key) 571 union futex_key *key)
572 { 572 {
573 struct futex_q *this; 573 struct futex_q *this;
574 574
575 plist_for_each_entry(this, &hb->chain, list) { 575 plist_for_each_entry(this, &hb->chain, list) {
576 if (match_futex(&this->key, key)) 576 if (match_futex(&this->key, key))
577 return this; 577 return this;
578 } 578 }
579 return NULL; 579 return NULL;
580 } 580 }
581 581
582 static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, 582 static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
583 u32 uval, u32 newval) 583 u32 uval, u32 newval)
584 { 584 {
585 int ret; 585 int ret;
586 586
587 pagefault_disable(); 587 pagefault_disable();
588 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); 588 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
589 pagefault_enable(); 589 pagefault_enable();
590 590
591 return ret; 591 return ret;
592 } 592 }
593 593
594 static int get_futex_value_locked(u32 *dest, u32 __user *from) 594 static int get_futex_value_locked(u32 *dest, u32 __user *from)
595 { 595 {
596 int ret; 596 int ret;
597 597
598 pagefault_disable(); 598 pagefault_disable();
599 ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); 599 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
600 pagefault_enable(); 600 pagefault_enable();
601 601
602 return ret ? -EFAULT : 0; 602 return ret ? -EFAULT : 0;
603 } 603 }
604 604
605 605
606 /* 606 /*
607 * PI code: 607 * PI code:
608 */ 608 */
609 static int refill_pi_state_cache(void) 609 static int refill_pi_state_cache(void)
610 { 610 {
611 struct futex_pi_state *pi_state; 611 struct futex_pi_state *pi_state;
612 612
613 if (likely(current->pi_state_cache)) 613 if (likely(current->pi_state_cache))
614 return 0; 614 return 0;
615 615
616 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); 616 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
617 617
618 if (!pi_state) 618 if (!pi_state)
619 return -ENOMEM; 619 return -ENOMEM;
620 620
621 INIT_LIST_HEAD(&pi_state->list); 621 INIT_LIST_HEAD(&pi_state->list);
622 /* pi_mutex gets initialized later */ 622 /* pi_mutex gets initialized later */
623 pi_state->owner = NULL; 623 pi_state->owner = NULL;
624 atomic_set(&pi_state->refcount, 1); 624 atomic_set(&pi_state->refcount, 1);
625 pi_state->key = FUTEX_KEY_INIT; 625 pi_state->key = FUTEX_KEY_INIT;
626 626
627 current->pi_state_cache = pi_state; 627 current->pi_state_cache = pi_state;
628 628
629 return 0; 629 return 0;
630 } 630 }
631 631
632 static struct futex_pi_state * alloc_pi_state(void) 632 static struct futex_pi_state * alloc_pi_state(void)
633 { 633 {
634 struct futex_pi_state *pi_state = current->pi_state_cache; 634 struct futex_pi_state *pi_state = current->pi_state_cache;
635 635
636 WARN_ON(!pi_state); 636 WARN_ON(!pi_state);
637 current->pi_state_cache = NULL; 637 current->pi_state_cache = NULL;
638 638
639 return pi_state; 639 return pi_state;
640 } 640 }
641 641
642 static void free_pi_state(struct futex_pi_state *pi_state) 642 static void free_pi_state(struct futex_pi_state *pi_state)
643 { 643 {
644 if (!atomic_dec_and_test(&pi_state->refcount)) 644 if (!atomic_dec_and_test(&pi_state->refcount))
645 return; 645 return;
646 646
647 /* 647 /*
648 * If pi_state->owner is NULL, the owner is most probably dying 648 * If pi_state->owner is NULL, the owner is most probably dying
649 * and has cleaned up the pi_state already 649 * and has cleaned up the pi_state already
650 */ 650 */
651 if (pi_state->owner) { 651 if (pi_state->owner) {
652 raw_spin_lock_irq(&pi_state->owner->pi_lock); 652 raw_spin_lock_irq(&pi_state->owner->pi_lock);
653 list_del_init(&pi_state->list); 653 list_del_init(&pi_state->list);
654 raw_spin_unlock_irq(&pi_state->owner->pi_lock); 654 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
655 655
656 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 656 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
657 } 657 }
658 658
659 if (current->pi_state_cache) 659 if (current->pi_state_cache)
660 kfree(pi_state); 660 kfree(pi_state);
661 else { 661 else {
662 /* 662 /*
663 * pi_state->list is already empty. 663 * pi_state->list is already empty.
664 * clear pi_state->owner. 664 * clear pi_state->owner.
665 * refcount is at 0 - put it back to 1. 665 * refcount is at 0 - put it back to 1.
666 */ 666 */
667 pi_state->owner = NULL; 667 pi_state->owner = NULL;
668 atomic_set(&pi_state->refcount, 1); 668 atomic_set(&pi_state->refcount, 1);
669 current->pi_state_cache = pi_state; 669 current->pi_state_cache = pi_state;
670 } 670 }
671 } 671 }
672 672
673 /* 673 /*
674 * Look up the task based on what TID userspace gave us. 674 * Look up the task based on what TID userspace gave us.
675 * We dont trust it. 675 * We dont trust it.
676 */ 676 */
677 static struct task_struct * futex_find_get_task(pid_t pid) 677 static struct task_struct * futex_find_get_task(pid_t pid)
678 { 678 {
679 struct task_struct *p; 679 struct task_struct *p;
680 680
681 rcu_read_lock(); 681 rcu_read_lock();
682 p = find_task_by_vpid(pid); 682 p = find_task_by_vpid(pid);
683 if (p) 683 if (p)
684 get_task_struct(p); 684 get_task_struct(p);
685 685
686 rcu_read_unlock(); 686 rcu_read_unlock();
687 687
688 return p; 688 return p;
689 } 689 }
690 690
691 /* 691 /*
692 * This task is holding PI mutexes at exit time => bad. 692 * This task is holding PI mutexes at exit time => bad.
693 * Kernel cleans up PI-state, but userspace is likely hosed. 693 * Kernel cleans up PI-state, but userspace is likely hosed.
694 * (Robust-futex cleanup is separate and might save the day for userspace.) 694 * (Robust-futex cleanup is separate and might save the day for userspace.)
695 */ 695 */
696 void exit_pi_state_list(struct task_struct *curr) 696 void exit_pi_state_list(struct task_struct *curr)
697 { 697 {
698 struct list_head *next, *head = &curr->pi_state_list; 698 struct list_head *next, *head = &curr->pi_state_list;
699 struct futex_pi_state *pi_state; 699 struct futex_pi_state *pi_state;
700 struct futex_hash_bucket *hb; 700 struct futex_hash_bucket *hb;
701 union futex_key key = FUTEX_KEY_INIT; 701 union futex_key key = FUTEX_KEY_INIT;
702 702
703 if (!futex_cmpxchg_enabled) 703 if (!futex_cmpxchg_enabled)
704 return; 704 return;
705 /* 705 /*
706 * We are a ZOMBIE and nobody can enqueue itself on 706 * We are a ZOMBIE and nobody can enqueue itself on
707 * pi_state_list anymore, but we have to be careful 707 * pi_state_list anymore, but we have to be careful
708 * versus waiters unqueueing themselves: 708 * versus waiters unqueueing themselves:
709 */ 709 */
710 raw_spin_lock_irq(&curr->pi_lock); 710 raw_spin_lock_irq(&curr->pi_lock);
711 while (!list_empty(head)) { 711 while (!list_empty(head)) {
712 712
713 next = head->next; 713 next = head->next;
714 pi_state = list_entry(next, struct futex_pi_state, list); 714 pi_state = list_entry(next, struct futex_pi_state, list);
715 key = pi_state->key; 715 key = pi_state->key;
716 hb = hash_futex(&key); 716 hb = hash_futex(&key);
717 raw_spin_unlock_irq(&curr->pi_lock); 717 raw_spin_unlock_irq(&curr->pi_lock);
718 718
719 spin_lock(&hb->lock); 719 spin_lock(&hb->lock);
720 720
721 raw_spin_lock_irq(&curr->pi_lock); 721 raw_spin_lock_irq(&curr->pi_lock);
722 /* 722 /*
723 * We dropped the pi-lock, so re-check whether this 723 * We dropped the pi-lock, so re-check whether this
724 * task still owns the PI-state: 724 * task still owns the PI-state:
725 */ 725 */
726 if (head->next != next) { 726 if (head->next != next) {
727 spin_unlock(&hb->lock); 727 spin_unlock(&hb->lock);
728 continue; 728 continue;
729 } 729 }
730 730
731 WARN_ON(pi_state->owner != curr); 731 WARN_ON(pi_state->owner != curr);
732 WARN_ON(list_empty(&pi_state->list)); 732 WARN_ON(list_empty(&pi_state->list));
733 list_del_init(&pi_state->list); 733 list_del_init(&pi_state->list);
734 pi_state->owner = NULL; 734 pi_state->owner = NULL;
735 raw_spin_unlock_irq(&curr->pi_lock); 735 raw_spin_unlock_irq(&curr->pi_lock);
736 736
737 rt_mutex_unlock(&pi_state->pi_mutex); 737 rt_mutex_unlock(&pi_state->pi_mutex);
738 738
739 spin_unlock(&hb->lock); 739 spin_unlock(&hb->lock);
740 740
741 raw_spin_lock_irq(&curr->pi_lock); 741 raw_spin_lock_irq(&curr->pi_lock);
742 } 742 }
743 raw_spin_unlock_irq(&curr->pi_lock); 743 raw_spin_unlock_irq(&curr->pi_lock);
744 } 744 }
745 745
746 static int 746 static int
747 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 747 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
748 union futex_key *key, struct futex_pi_state **ps, 748 union futex_key *key, struct futex_pi_state **ps,
749 struct task_struct *task) 749 struct task_struct *task)
750 { 750 {
751 struct futex_pi_state *pi_state = NULL; 751 struct futex_pi_state *pi_state = NULL;
752 struct futex_q *this, *next; 752 struct futex_q *this, *next;
753 struct task_struct *p; 753 struct task_struct *p;
754 pid_t pid = uval & FUTEX_TID_MASK; 754 pid_t pid = uval & FUTEX_TID_MASK;
755 755
756 plist_for_each_entry_safe(this, next, &hb->chain, list) { 756 plist_for_each_entry_safe(this, next, &hb->chain, list) {
757 if (match_futex(&this->key, key)) { 757 if (match_futex(&this->key, key)) {
758 /* 758 /*
759 * Another waiter already exists - bump up 759 * Another waiter already exists - bump up
760 * the refcount and return its pi_state: 760 * the refcount and return its pi_state:
761 */ 761 */
762 pi_state = this->pi_state; 762 pi_state = this->pi_state;
763 /* 763 /*
764 * Userspace might have messed up non-PI and PI futexes 764 * Userspace might have messed up non-PI and PI futexes
765 */ 765 */
766 if (unlikely(!pi_state)) 766 if (unlikely(!pi_state))
767 return -EINVAL; 767 return -EINVAL;
768 768
769 WARN_ON(!atomic_read(&pi_state->refcount)); 769 WARN_ON(!atomic_read(&pi_state->refcount));
770 770
771 /* 771 /*
772 * When pi_state->owner is NULL then the owner died 772 * When pi_state->owner is NULL then the owner died
773 * and another waiter is on the fly. pi_state->owner 773 * and another waiter is on the fly. pi_state->owner
774 * is fixed up by the task which acquires 774 * is fixed up by the task which acquires
775 * pi_state->rt_mutex. 775 * pi_state->rt_mutex.
776 * 776 *
777 * We do not check for pid == 0 which can happen when 777 * We do not check for pid == 0 which can happen when
778 * the owner died and robust_list_exit() cleared the 778 * the owner died and robust_list_exit() cleared the
779 * TID. 779 * TID.
780 */ 780 */
781 if (pid && pi_state->owner) { 781 if (pid && pi_state->owner) {
782 /* 782 /*
783 * Bail out if user space manipulated the 783 * Bail out if user space manipulated the
784 * futex value. 784 * futex value.
785 */ 785 */
786 if (pid != task_pid_vnr(pi_state->owner)) 786 if (pid != task_pid_vnr(pi_state->owner))
787 return -EINVAL; 787 return -EINVAL;
788 } 788 }
789 789
790 /* 790 /*
791 * Protect against a corrupted uval. If uval 791 * Protect against a corrupted uval. If uval
792 * is 0x80000000 then pid is 0 and the waiter 792 * is 0x80000000 then pid is 0 and the waiter
793 * bit is set. So the deadlock check in the 793 * bit is set. So the deadlock check in the
794 * calling code has failed and we did not fall 794 * calling code has failed and we did not fall
795 * into the check above due to !pid. 795 * into the check above due to !pid.
796 */ 796 */
797 if (task && pi_state->owner == task) 797 if (task && pi_state->owner == task)
798 return -EDEADLK; 798 return -EDEADLK;
799 799
800 atomic_inc(&pi_state->refcount); 800 atomic_inc(&pi_state->refcount);
801 *ps = pi_state; 801 *ps = pi_state;
802 802
803 return 0; 803 return 0;
804 } 804 }
805 } 805 }
806 806
807 /* 807 /*
808 * We are the first waiter - try to look up the real owner and attach 808 * We are the first waiter - try to look up the real owner and attach
809 * the new pi_state to it, but bail out when TID = 0 809 * the new pi_state to it, but bail out when TID = 0
810 */ 810 */
811 if (!pid) 811 if (!pid)
812 return -ESRCH; 812 return -ESRCH;
813 p = futex_find_get_task(pid); 813 p = futex_find_get_task(pid);
814 if (!p) 814 if (!p)
815 return -ESRCH; 815 return -ESRCH;
816 816
817 if (!p->mm) { 817 if (!p->mm) {
818 put_task_struct(p); 818 put_task_struct(p);
819 return -EPERM; 819 return -EPERM;
820 } 820 }
821 821
822 /* 822 /*
823 * We need to look at the task state flags to figure out, 823 * We need to look at the task state flags to figure out,
824 * whether the task is exiting. To protect against the do_exit 824 * whether the task is exiting. To protect against the do_exit
825 * change of the task flags, we do this protected by 825 * change of the task flags, we do this protected by
826 * p->pi_lock: 826 * p->pi_lock:
827 */ 827 */
828 raw_spin_lock_irq(&p->pi_lock); 828 raw_spin_lock_irq(&p->pi_lock);
829 if (unlikely(p->flags & PF_EXITING)) { 829 if (unlikely(p->flags & PF_EXITING)) {
830 /* 830 /*
831 * The task is on the way out. When PF_EXITPIDONE is 831 * The task is on the way out. When PF_EXITPIDONE is
832 * set, we know that the task has finished the 832 * set, we know that the task has finished the
833 * cleanup: 833 * cleanup:
834 */ 834 */
835 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 835 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
836 836
837 raw_spin_unlock_irq(&p->pi_lock); 837 raw_spin_unlock_irq(&p->pi_lock);
838 put_task_struct(p); 838 put_task_struct(p);
839 return ret; 839 return ret;
840 } 840 }
841 841
842 pi_state = alloc_pi_state(); 842 pi_state = alloc_pi_state();
843 843
844 /* 844 /*
845 * Initialize the pi_mutex in locked state and make 'p' 845 * Initialize the pi_mutex in locked state and make 'p'
846 * the owner of it: 846 * the owner of it:
847 */ 847 */
848 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 848 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
849 849
850 /* Store the key for possible exit cleanups: */ 850 /* Store the key for possible exit cleanups: */
851 pi_state->key = *key; 851 pi_state->key = *key;
852 852
853 WARN_ON(!list_empty(&pi_state->list)); 853 WARN_ON(!list_empty(&pi_state->list));
854 list_add(&pi_state->list, &p->pi_state_list); 854 list_add(&pi_state->list, &p->pi_state_list);
855 pi_state->owner = p; 855 pi_state->owner = p;
856 raw_spin_unlock_irq(&p->pi_lock); 856 raw_spin_unlock_irq(&p->pi_lock);
857 857
858 put_task_struct(p); 858 put_task_struct(p);
859 859
860 *ps = pi_state; 860 *ps = pi_state;
861 861
862 return 0; 862 return 0;
863 } 863 }
864 864
865 /** 865 /**
866 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex 866 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
867 * @uaddr: the pi futex user address 867 * @uaddr: the pi futex user address
868 * @hb: the pi futex hash bucket 868 * @hb: the pi futex hash bucket
869 * @key: the futex key associated with uaddr and hb 869 * @key: the futex key associated with uaddr and hb
870 * @ps: the pi_state pointer where we store the result of the 870 * @ps: the pi_state pointer where we store the result of the
871 * lookup 871 * lookup
872 * @task: the task to perform the atomic lock work for. This will 872 * @task: the task to perform the atomic lock work for. This will
873 * be "current" except in the case of requeue pi. 873 * be "current" except in the case of requeue pi.
874 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 874 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
875 * 875 *
876 * Return: 876 * Return:
877 * 0 - ready to wait; 877 * 0 - ready to wait;
878 * 1 - acquired the lock; 878 * 1 - acquired the lock;
879 * <0 - error 879 * <0 - error
880 * 880 *
881 * The hb->lock and futex_key refs shall be held by the caller. 881 * The hb->lock and futex_key refs shall be held by the caller.
882 */ 882 */
883 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, 883 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
884 union futex_key *key, 884 union futex_key *key,
885 struct futex_pi_state **ps, 885 struct futex_pi_state **ps,
886 struct task_struct *task, int set_waiters) 886 struct task_struct *task, int set_waiters)
887 { 887 {
888 int lock_taken, ret, force_take = 0; 888 int lock_taken, ret, force_take = 0;
889 u32 uval, newval, curval, vpid = task_pid_vnr(task); 889 u32 uval, newval, curval, vpid = task_pid_vnr(task);
890 890
891 retry: 891 retry:
892 ret = lock_taken = 0; 892 ret = lock_taken = 0;
893 893
894 /* 894 /*
895 * To avoid races, we attempt to take the lock here again 895 * To avoid races, we attempt to take the lock here again
896 * (by doing a 0 -> TID atomic cmpxchg), while holding all 896 * (by doing a 0 -> TID atomic cmpxchg), while holding all
897 * the locks. It will most likely not succeed. 897 * the locks. It will most likely not succeed.
898 */ 898 */
899 newval = vpid; 899 newval = vpid;
900 if (set_waiters) 900 if (set_waiters)
901 newval |= FUTEX_WAITERS; 901 newval |= FUTEX_WAITERS;
902 902
903 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) 903 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
904 return -EFAULT; 904 return -EFAULT;
905 905
906 /* 906 /*
907 * Detect deadlocks. 907 * Detect deadlocks.
908 */ 908 */
909 if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) 909 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
910 return -EDEADLK; 910 return -EDEADLK;
911 911
912 /* 912 /*
913 * Surprise - we got the lock, but we do not trust user space at all. 913 * Surprise - we got the lock, but we do not trust user space at all.
914 */ 914 */
915 if (unlikely(!curval)) { 915 if (unlikely(!curval)) {
916 /* 916 /*
917 * We verify whether there is kernel state for this 917 * We verify whether there is kernel state for this
918 * futex. If not, we can safely assume, that the 0 -> 918 * futex. If not, we can safely assume, that the 0 ->
919 * TID transition is correct. If state exists, we do 919 * TID transition is correct. If state exists, we do
920 * not bother to fixup the user space state as it was 920 * not bother to fixup the user space state as it was
921 * corrupted already. 921 * corrupted already.
922 */ 922 */
923 return futex_top_waiter(hb, key) ? -EINVAL : 1; 923 return futex_top_waiter(hb, key) ? -EINVAL : 1;
924 } 924 }
925 925
926 uval = curval; 926 uval = curval;
927 927
928 /* 928 /*
929 * Set the FUTEX_WAITERS flag, so the owner will know it has someone 929 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
930 * to wake at the next unlock. 930 * to wake at the next unlock.
931 */ 931 */
932 newval = curval | FUTEX_WAITERS; 932 newval = curval | FUTEX_WAITERS;
933 933
934 /* 934 /*
935 * Should we force take the futex? See below. 935 * Should we force take the futex? See below.
936 */ 936 */
937 if (unlikely(force_take)) { 937 if (unlikely(force_take)) {
938 /* 938 /*
939 * Keep the OWNER_DIED and the WAITERS bit and set the 939 * Keep the OWNER_DIED and the WAITERS bit and set the
940 * new TID value. 940 * new TID value.
941 */ 941 */
942 newval = (curval & ~FUTEX_TID_MASK) | vpid; 942 newval = (curval & ~FUTEX_TID_MASK) | vpid;
943 force_take = 0; 943 force_take = 0;
944 lock_taken = 1; 944 lock_taken = 1;
945 } 945 }
946 946
947 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 947 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
948 return -EFAULT; 948 return -EFAULT;
949 if (unlikely(curval != uval)) 949 if (unlikely(curval != uval))
950 goto retry; 950 goto retry;
951 951
952 /* 952 /*
953 * We took the lock due to forced take over. 953 * We took the lock due to forced take over.
954 */ 954 */
955 if (unlikely(lock_taken)) 955 if (unlikely(lock_taken))
956 return 1; 956 return 1;
957 957
958 /* 958 /*
959 * We don't have the lock. Look up the PI state (or create it if 959 * We don't have the lock. Look up the PI state (or create it if
960 * we are the first waiter): 960 * we are the first waiter):
961 */ 961 */
962 ret = lookup_pi_state(uval, hb, key, ps, task); 962 ret = lookup_pi_state(uval, hb, key, ps, task);
963 963
964 if (unlikely(ret)) { 964 if (unlikely(ret)) {
965 switch (ret) { 965 switch (ret) {
966 case -ESRCH: 966 case -ESRCH:
967 /* 967 /*
968 * We failed to find an owner for this 968 * We failed to find an owner for this
969 * futex. So we have no pi_state to block 969 * futex. So we have no pi_state to block
970 * on. This can happen in two cases: 970 * on. This can happen in two cases:
971 * 971 *
972 * 1) The owner died 972 * 1) The owner died
973 * 2) A stale FUTEX_WAITERS bit 973 * 2) A stale FUTEX_WAITERS bit
974 * 974 *
975 * Re-read the futex value. 975 * Re-read the futex value.
976 */ 976 */
977 if (get_futex_value_locked(&curval, uaddr)) 977 if (get_futex_value_locked(&curval, uaddr))
978 return -EFAULT; 978 return -EFAULT;
979 979
980 /* 980 /*
981 * If the owner died or we have a stale 981 * If the owner died or we have a stale
982 * WAITERS bit the owner TID in the user space 982 * WAITERS bit the owner TID in the user space
983 * futex is 0. 983 * futex is 0.
984 */ 984 */
985 if (!(curval & FUTEX_TID_MASK)) { 985 if (!(curval & FUTEX_TID_MASK)) {
986 force_take = 1; 986 force_take = 1;
987 goto retry; 987 goto retry;
988 } 988 }
989 default: 989 default:
990 break; 990 break;
991 } 991 }
992 } 992 }
993 993
994 return ret; 994 return ret;
995 } 995 }
996 996
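The 0 -> TID cmpxchg retried above mirrors the user-space fast path of a PI futex. The following stand-alone sketch (illustrative only, not glibc code; pi_lock and futex_word are made-up names) shows how the lock is normally taken without a syscall, with FUTEX_LOCK_PI used only on contention:

#define _GNU_SOURCE
#include <linux/futex.h>        /* FUTEX_LOCK_PI */
#include <sys/syscall.h>        /* SYS_futex, SYS_gettid */
#include <unistd.h>
#include <stdint.h>

static void pi_lock(uint32_t *futex_word)
{
        uint32_t tid = (uint32_t)syscall(SYS_gettid);

        /* Uncontended: 0 -> TID entirely in user space, no syscall. */
        if (__sync_val_compare_and_swap(futex_word, 0, tid) == 0)
                return;

        /* Contended: the kernel sets FUTEX_WAITERS, queues us and boosts
         * the owner through the rt_mutex. */
        syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}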
997 /** 997 /**
998 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket 998 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
999 * @q: The futex_q to unqueue 999 * @q: The futex_q to unqueue
1000 * 1000 *
1001 * The q->lock_ptr must not be NULL and must be held by the caller. 1001 * The q->lock_ptr must not be NULL and must be held by the caller.
1002 */ 1002 */
1003 static void __unqueue_futex(struct futex_q *q) 1003 static void __unqueue_futex(struct futex_q *q)
1004 { 1004 {
1005 struct futex_hash_bucket *hb; 1005 struct futex_hash_bucket *hb;
1006 1006
1007 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) 1007 if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
1008 || WARN_ON(plist_node_empty(&q->list))) 1008 || WARN_ON(plist_node_empty(&q->list)))
1009 return; 1009 return;
1010 1010
1011 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); 1011 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
1012 plist_del(&q->list, &hb->chain); 1012 plist_del(&q->list, &hb->chain);
1013 hb_waiters_dec(hb); 1013 hb_waiters_dec(hb);
1014 } 1014 }
1015 1015
1016 /* 1016 /*
1017 * The hash bucket lock must be held when this is called. 1017 * The hash bucket lock must be held when this is called.
1018 * Afterwards, the futex_q must not be accessed. 1018 * Afterwards, the futex_q must not be accessed.
1019 */ 1019 */
1020 static void wake_futex(struct futex_q *q) 1020 static void wake_futex(struct futex_q *q)
1021 { 1021 {
1022 struct task_struct *p = q->task; 1022 struct task_struct *p = q->task;
1023 1023
1024 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) 1024 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
1025 return; 1025 return;
1026 1026
1027 /* 1027 /*
1028 * We set q->lock_ptr = NULL _before_ we wake up the task. If 1028 * We set q->lock_ptr = NULL _before_ we wake up the task. If
1029 * a non-futex wake up happens on another CPU then the task 1029 * a non-futex wake up happens on another CPU then the task
1030 * might exit and p would dereference a non-existing task 1030 * might exit and p would dereference a non-existing task
1031 * struct. Prevent this by holding a reference on p across the 1031 * struct. Prevent this by holding a reference on p across the
1032 * wake up. 1032 * wake up.
1033 */ 1033 */
1034 get_task_struct(p); 1034 get_task_struct(p);
1035 1035
1036 __unqueue_futex(q); 1036 __unqueue_futex(q);
1037 /* 1037 /*
1038 * The waiting task can free the futex_q as soon as 1038 * The waiting task can free the futex_q as soon as
1039 * q->lock_ptr = NULL is written, without taking any locks. A 1039 * q->lock_ptr = NULL is written, without taking any locks. A
1040 * memory barrier is required here to prevent the following 1040 * memory barrier is required here to prevent the following
1041 * store to lock_ptr from getting ahead of the plist_del. 1041 * store to lock_ptr from getting ahead of the plist_del.
1042 */ 1042 */
1043 smp_wmb(); 1043 smp_wmb();
1044 q->lock_ptr = NULL; 1044 q->lock_ptr = NULL;
1045 1045
1046 wake_up_state(p, TASK_NORMAL); 1046 wake_up_state(p, TASK_NORMAL);
1047 put_task_struct(p); 1047 put_task_struct(p);
1048 } 1048 }
1049 1049
1050 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 1050 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1051 { 1051 {
1052 struct task_struct *new_owner; 1052 struct task_struct *new_owner;
1053 struct futex_pi_state *pi_state = this->pi_state; 1053 struct futex_pi_state *pi_state = this->pi_state;
1054 u32 uninitialized_var(curval), newval; 1054 u32 uninitialized_var(curval), newval;
1055 int ret = 0;
1055 1056
1056 if (!pi_state) 1057 if (!pi_state)
1057 return -EINVAL; 1058 return -EINVAL;
1058 1059
1059 /* 1060 /*
1060 * If current does not own the pi_state then the futex is 1061 * If current does not own the pi_state then the futex is
1061 * inconsistent and user space fiddled with the futex value. 1062 * inconsistent and user space fiddled with the futex value.
1062 */ 1063 */
1063 if (pi_state->owner != current) 1064 if (pi_state->owner != current)
1064 return -EINVAL; 1065 return -EINVAL;
1065 1066
1066 raw_spin_lock(&pi_state->pi_mutex.wait_lock); 1067 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
1067 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 1068 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1068 1069
1069 /* 1070 /*
1070 * It is possible that the next waiter (the one that brought 1071 * It is possible that the next waiter (the one that brought
1071 * this owner to the kernel) timed out and is no longer 1072 * this owner to the kernel) timed out and is no longer
1072 * waiting on the lock. 1073 * waiting on the lock.
1073 */ 1074 */
1074 if (!new_owner) 1075 if (!new_owner)
1075 new_owner = this->task; 1076 new_owner = this->task;
1076 1077
1077 /* 1078 /*
1078 * We pass it to the next owner. (The WAITERS bit is always 1079 * We pass it to the next owner. The WAITERS bit is always
1079 * kept enabled while there is PI state around. We must also 1080 * kept enabled while there is PI state around. We cleanup the
1080 * preserve the owner died bit.) 1081 * owner died bit, because we are the owner.
1081 */ 1082 */
1082 if (!(uval & FUTEX_OWNER_DIED)) { 1083 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1083 int ret = 0;
1084 1084
1085 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1085 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1086 1086 ret = -EFAULT;
1087 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 1087 else if (curval != uval)
1088 ret = -EFAULT; 1088 ret = -EINVAL;
1089 else if (curval != uval) 1089 if (ret) {
1090 ret = -EINVAL; 1090 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1091 if (ret) { 1091 return ret;
1092 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1093 return ret;
1094 }
1095 } 1092 }
1096 1093
1097 raw_spin_lock_irq(&pi_state->owner->pi_lock); 1094 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1098 WARN_ON(list_empty(&pi_state->list)); 1095 WARN_ON(list_empty(&pi_state->list));
1099 list_del_init(&pi_state->list); 1096 list_del_init(&pi_state->list);
1100 raw_spin_unlock_irq(&pi_state->owner->pi_lock); 1097 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1101 1098
1102 raw_spin_lock_irq(&new_owner->pi_lock); 1099 raw_spin_lock_irq(&new_owner->pi_lock);
1103 WARN_ON(!list_empty(&pi_state->list)); 1100 WARN_ON(!list_empty(&pi_state->list));
1104 list_add(&pi_state->list, &new_owner->pi_state_list); 1101 list_add(&pi_state->list, &new_owner->pi_state_list);
1105 pi_state->owner = new_owner; 1102 pi_state->owner = new_owner;
1106 raw_spin_unlock_irq(&new_owner->pi_lock); 1103 raw_spin_unlock_irq(&new_owner->pi_lock);
1107 1104
1108 raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 1105 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1109 rt_mutex_unlock(&pi_state->pi_mutex); 1106 rt_mutex_unlock(&pi_state->pi_mutex);
1110 1107
1111 return 0; 1108 return 0;
1112 } 1109 }
1113 1110
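This hunk is the core of the patch: the unlocker now always stores FUTEX_WAITERS plus the new owner's TID, so a stale FUTEX_OWNER_DIED bit (and the unlocker's own TID) no longer lingers in the user space word after ownership has been handed over. A small self-contained illustration of the resulting value, using the uapi constants (pi_handoff_value is just an illustrative helper, standing in for the newval computation above):

#include <linux/futex.h>        /* FUTEX_WAITERS, FUTEX_TID_MASK */
#include <stdint.h>
#include <assert.h>

/* The word written for the next owner: WAITERS stays set while kernel
 * pi_state exists, OWNER_DIED is cleared unconditionally. */
static uint32_t pi_handoff_value(uint32_t new_owner_tid)
{
        return FUTEX_WAITERS | (new_owner_tid & FUTEX_TID_MASK);
}

int main(void)
{
        /* Previously an OWNER_DIED futex kept the old value; now the word
         * always names the new owner. */
        assert(pi_handoff_value(1234) == (FUTEX_WAITERS | 1234));
        return 0;
}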
1114 static int unlock_futex_pi(u32 __user *uaddr, u32 uval) 1111 static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
1115 { 1112 {
1116 u32 uninitialized_var(oldval); 1113 u32 uninitialized_var(oldval);
1117 1114
1118 /* 1115 /*
1119 * There is no waiter, so we unlock the futex. The owner died 1116 * There is no waiter, so we unlock the futex. The owner died
1120 * bit need not be preserved here. We are the owner: 1117 * bit need not be preserved here. We are the owner:
1121 */ 1118 */
1122 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) 1119 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
1123 return -EFAULT; 1120 return -EFAULT;
1124 if (oldval != uval) 1121 if (oldval != uval)
1125 return -EAGAIN; 1122 return -EAGAIN;
1126 1123
1127 return 0; 1124 return 0;
1128 } 1125 }
1129 1126
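unlock_futex_pi() handles the no-waiter case of a release that still reached the kernel. For orientation, a minimal sketch of the matching user-space unlock fast path (again illustrative, not glibc; pi_unlock is a made-up name):

#define _GNU_SOURCE
#include <linux/futex.h>        /* FUTEX_UNLOCK_PI */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

static void pi_unlock(uint32_t *futex_word)
{
        uint32_t tid = (uint32_t)syscall(SYS_gettid);

        /* Uncontended: TID -> 0, nothing is queued in the kernel. */
        if (__sync_val_compare_and_swap(futex_word, tid, 0) == tid)
                return;

        /* FUTEX_WAITERS (or OWNER_DIED) is set: let the kernel hand the
         * lock to the top waiter via wake_futex_pi(). */
        syscall(SYS_futex, futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}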
1130 /* 1127 /*
1131 * Express the locking dependencies for lockdep: 1128 * Express the locking dependencies for lockdep:
1132 */ 1129 */
1133 static inline void 1130 static inline void
1134 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 1131 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1135 { 1132 {
1136 if (hb1 <= hb2) { 1133 if (hb1 <= hb2) {
1137 spin_lock(&hb1->lock); 1134 spin_lock(&hb1->lock);
1138 if (hb1 < hb2) 1135 if (hb1 < hb2)
1139 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); 1136 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1140 } else { /* hb1 > hb2 */ 1137 } else { /* hb1 > hb2 */
1141 spin_lock(&hb2->lock); 1138 spin_lock(&hb2->lock);
1142 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); 1139 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1143 } 1140 }
1144 } 1141 }
1145 1142
1146 static inline void 1143 static inline void
1147 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 1144 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1148 { 1145 {
1149 spin_unlock(&hb1->lock); 1146 spin_unlock(&hb1->lock);
1150 if (hb1 != hb2) 1147 if (hb1 != hb2)
1151 spin_unlock(&hb2->lock); 1148 spin_unlock(&hb2->lock);
1152 } 1149 }
1153 1150
1154 /* 1151 /*
1155 * Wake up waiters matching bitset queued on this futex (uaddr). 1152 * Wake up waiters matching bitset queued on this futex (uaddr).
1156 */ 1153 */
1157 static int 1154 static int
1158 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) 1155 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1159 { 1156 {
1160 struct futex_hash_bucket *hb; 1157 struct futex_hash_bucket *hb;
1161 struct futex_q *this, *next; 1158 struct futex_q *this, *next;
1162 union futex_key key = FUTEX_KEY_INIT; 1159 union futex_key key = FUTEX_KEY_INIT;
1163 int ret; 1160 int ret;
1164 1161
1165 if (!bitset) 1162 if (!bitset)
1166 return -EINVAL; 1163 return -EINVAL;
1167 1164
1168 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); 1165 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
1169 if (unlikely(ret != 0)) 1166 if (unlikely(ret != 0))
1170 goto out; 1167 goto out;
1171 1168
1172 hb = hash_futex(&key); 1169 hb = hash_futex(&key);
1173 1170
1174 /* Make sure we really have tasks to wake up */ 1171 /* Make sure we really have tasks to wake up */
1175 if (!hb_waiters_pending(hb)) 1172 if (!hb_waiters_pending(hb))
1176 goto out_put_key; 1173 goto out_put_key;
1177 1174
1178 spin_lock(&hb->lock); 1175 spin_lock(&hb->lock);
1179 1176
1180 plist_for_each_entry_safe(this, next, &hb->chain, list) { 1177 plist_for_each_entry_safe(this, next, &hb->chain, list) {
1181 if (match_futex (&this->key, &key)) { 1178 if (match_futex (&this->key, &key)) {
1182 if (this->pi_state || this->rt_waiter) { 1179 if (this->pi_state || this->rt_waiter) {
1183 ret = -EINVAL; 1180 ret = -EINVAL;
1184 break; 1181 break;
1185 } 1182 }
1186 1183
1187 /* Check if one of the bits is set in both bitsets */ 1184 /* Check if one of the bits is set in both bitsets */
1188 if (!(this->bitset & bitset)) 1185 if (!(this->bitset & bitset))
1189 continue; 1186 continue;
1190 1187
1191 wake_futex(this); 1188 wake_futex(this);
1192 if (++ret >= nr_wake) 1189 if (++ret >= nr_wake)
1193 break; 1190 break;
1194 } 1191 }
1195 } 1192 }
1196 1193
1197 spin_unlock(&hb->lock); 1194 spin_unlock(&hb->lock);
1198 out_put_key: 1195 out_put_key:
1199 put_futex_key(&key); 1196 put_futex_key(&key);
1200 out: 1197 out:
1201 return ret; 1198 return ret;
1202 } 1199 }
1203 1200
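The bitset intersection test above is what makes FUTEX_WAKE_BITSET selective: only waiters whose wait bitset shares at least one bit with the waker's bitset are considered. A hedged user-space sketch (wake_channel and channel_bits are illustrative names):

#define _GNU_SOURCE
#include <linux/futex.h>        /* FUTEX_WAKE_BITSET */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

/* Wake up to nr waiters on futex_word whose bitset intersects channel_bits;
 * waiters with a disjoint bitset are skipped, like the continue above. */
static long wake_channel(uint32_t *futex_word, int nr, uint32_t channel_bits)
{
        return syscall(SYS_futex, futex_word, FUTEX_WAKE_BITSET,
                       nr, NULL, NULL, channel_bits);
}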
1204 /* 1201 /*
1205 * Wake up all waiters hashed on the physical page that is mapped 1202 * Wake up all waiters hashed on the physical page that is mapped
1206 * to this virtual address: 1203 * to this virtual address:
1207 */ 1204 */
1208 static int 1205 static int
1209 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, 1206 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1210 int nr_wake, int nr_wake2, int op) 1207 int nr_wake, int nr_wake2, int op)
1211 { 1208 {
1212 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1209 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1213 struct futex_hash_bucket *hb1, *hb2; 1210 struct futex_hash_bucket *hb1, *hb2;
1214 struct futex_q *this, *next; 1211 struct futex_q *this, *next;
1215 int ret, op_ret; 1212 int ret, op_ret;
1216 1213
1217 retry: 1214 retry:
1218 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1215 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1219 if (unlikely(ret != 0)) 1216 if (unlikely(ret != 0))
1220 goto out; 1217 goto out;
1221 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 1218 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1222 if (unlikely(ret != 0)) 1219 if (unlikely(ret != 0))
1223 goto out_put_key1; 1220 goto out_put_key1;
1224 1221
1225 hb1 = hash_futex(&key1); 1222 hb1 = hash_futex(&key1);
1226 hb2 = hash_futex(&key2); 1223 hb2 = hash_futex(&key2);
1227 1224
1228 retry_private: 1225 retry_private:
1229 double_lock_hb(hb1, hb2); 1226 double_lock_hb(hb1, hb2);
1230 op_ret = futex_atomic_op_inuser(op, uaddr2); 1227 op_ret = futex_atomic_op_inuser(op, uaddr2);
1231 if (unlikely(op_ret < 0)) { 1228 if (unlikely(op_ret < 0)) {
1232 1229
1233 double_unlock_hb(hb1, hb2); 1230 double_unlock_hb(hb1, hb2);
1234 1231
1235 #ifndef CONFIG_MMU 1232 #ifndef CONFIG_MMU
1236 /* 1233 /*
1237 * we don't get EFAULT from MMU faults if we don't have an MMU, 1234 * we don't get EFAULT from MMU faults if we don't have an MMU,
1238 * but we might get them from range checking 1235 * but we might get them from range checking
1239 */ 1236 */
1240 ret = op_ret; 1237 ret = op_ret;
1241 goto out_put_keys; 1238 goto out_put_keys;
1242 #endif 1239 #endif
1243 1240
1244 if (unlikely(op_ret != -EFAULT)) { 1241 if (unlikely(op_ret != -EFAULT)) {
1245 ret = op_ret; 1242 ret = op_ret;
1246 goto out_put_keys; 1243 goto out_put_keys;
1247 } 1244 }
1248 1245
1249 ret = fault_in_user_writeable(uaddr2); 1246 ret = fault_in_user_writeable(uaddr2);
1250 if (ret) 1247 if (ret)
1251 goto out_put_keys; 1248 goto out_put_keys;
1252 1249
1253 if (!(flags & FLAGS_SHARED)) 1250 if (!(flags & FLAGS_SHARED))
1254 goto retry_private; 1251 goto retry_private;
1255 1252
1256 put_futex_key(&key2); 1253 put_futex_key(&key2);
1257 put_futex_key(&key1); 1254 put_futex_key(&key1);
1258 goto retry; 1255 goto retry;
1259 } 1256 }
1260 1257
1261 plist_for_each_entry_safe(this, next, &hb1->chain, list) { 1258 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1262 if (match_futex (&this->key, &key1)) { 1259 if (match_futex (&this->key, &key1)) {
1263 if (this->pi_state || this->rt_waiter) { 1260 if (this->pi_state || this->rt_waiter) {
1264 ret = -EINVAL; 1261 ret = -EINVAL;
1265 goto out_unlock; 1262 goto out_unlock;
1266 } 1263 }
1267 wake_futex(this); 1264 wake_futex(this);
1268 if (++ret >= nr_wake) 1265 if (++ret >= nr_wake)
1269 break; 1266 break;
1270 } 1267 }
1271 } 1268 }
1272 1269
1273 if (op_ret > 0) { 1270 if (op_ret > 0) {
1274 op_ret = 0; 1271 op_ret = 0;
1275 plist_for_each_entry_safe(this, next, &hb2->chain, list) { 1272 plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1276 if (match_futex (&this->key, &key2)) { 1273 if (match_futex (&this->key, &key2)) {
1277 if (this->pi_state || this->rt_waiter) { 1274 if (this->pi_state || this->rt_waiter) {
1278 ret = -EINVAL; 1275 ret = -EINVAL;
1279 goto out_unlock; 1276 goto out_unlock;
1280 } 1277 }
1281 wake_futex(this); 1278 wake_futex(this);
1282 if (++op_ret >= nr_wake2) 1279 if (++op_ret >= nr_wake2)
1283 break; 1280 break;
1284 } 1281 }
1285 } 1282 }
1286 ret += op_ret; 1283 ret += op_ret;
1287 } 1284 }
1288 1285
1289 out_unlock: 1286 out_unlock:
1290 double_unlock_hb(hb1, hb2); 1287 double_unlock_hb(hb1, hb2);
1291 out_put_keys: 1288 out_put_keys:
1292 put_futex_key(&key2); 1289 put_futex_key(&key2);
1293 out_put_key1: 1290 out_put_key1:
1294 put_futex_key(&key1); 1291 put_futex_key(&key1);
1295 out: 1292 out:
1296 return ret; 1293 return ret;
1297 } 1294 }
1298 1295
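The two wake loops above correspond to a single FUTEX_WAKE_OP call from user space, which atomically modifies the second futex word and conditionally wakes on it as well. A minimal sketch, assuming the desired operation is "set *word2 to 1 and wake it if it was 0" (names are placeholders):

#define _GNU_SOURCE
#include <linux/futex.h>        /* FUTEX_WAKE_OP, FUTEX_OP() */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

static long wake_op_example(uint32_t *word1, uint32_t *word2)
{
        /* oldval = *word2; *word2 = 1; wake 1 waiter on word1;
         * if (oldval == 0) also wake 1 waiter on word2. */
        uint32_t op = FUTEX_OP(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0);

        /* nr_wake2 travels in the timeout slot of the syscall. */
        return syscall(SYS_futex, word1, FUTEX_WAKE_OP, 1,
                       (void *)(unsigned long)1, word2, op);
}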
1299 /** 1296 /**
1300 * requeue_futex() - Requeue a futex_q from one hb to another 1297 * requeue_futex() - Requeue a futex_q from one hb to another
1301 * @q: the futex_q to requeue 1298 * @q: the futex_q to requeue
1302 * @hb1: the source hash_bucket 1299 * @hb1: the source hash_bucket
1303 * @hb2: the target hash_bucket 1300 * @hb2: the target hash_bucket
1304 * @key2: the new key for the requeued futex_q 1301 * @key2: the new key for the requeued futex_q
1305 */ 1302 */
1306 static inline 1303 static inline
1307 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, 1304 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1308 struct futex_hash_bucket *hb2, union futex_key *key2) 1305 struct futex_hash_bucket *hb2, union futex_key *key2)
1309 { 1306 {
1310 1307
1311 /* 1308 /*
1312 * If key1 and key2 hash to the same bucket, no need to 1309 * If key1 and key2 hash to the same bucket, no need to
1313 * requeue. 1310 * requeue.
1314 */ 1311 */
1315 if (likely(&hb1->chain != &hb2->chain)) { 1312 if (likely(&hb1->chain != &hb2->chain)) {
1316 plist_del(&q->list, &hb1->chain); 1313 plist_del(&q->list, &hb1->chain);
1317 hb_waiters_dec(hb1); 1314 hb_waiters_dec(hb1);
1318 plist_add(&q->list, &hb2->chain); 1315 plist_add(&q->list, &hb2->chain);
1319 hb_waiters_inc(hb2); 1316 hb_waiters_inc(hb2);
1320 q->lock_ptr = &hb2->lock; 1317 q->lock_ptr = &hb2->lock;
1321 } 1318 }
1322 get_futex_key_refs(key2); 1319 get_futex_key_refs(key2);
1323 q->key = *key2; 1320 q->key = *key2;
1324 } 1321 }
1325 1322
1326 /** 1323 /**
1327 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1324 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1328 * @q: the futex_q 1325 * @q: the futex_q
1329 * @key: the key of the requeue target futex 1326 * @key: the key of the requeue target futex
1330 * @hb: the hash_bucket of the requeue target futex 1327 * @hb: the hash_bucket of the requeue target futex
1331 * 1328 *
1332 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1329 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1333 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1330 * target futex if it is uncontended or via a lock steal. Set the futex_q key
1334 * to the requeue target futex so the waiter can detect the wakeup on the right 1331 * to the requeue target futex so the waiter can detect the wakeup on the right
1335 * futex, but remove it from the hb and NULL the rt_waiter so it can detect 1332 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1336 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock 1333 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1337 * to protect access to the pi_state to fixup the owner later. Must be called 1334 * to protect access to the pi_state to fixup the owner later. Must be called
1338 * with both q->lock_ptr and hb->lock held. 1335 * with both q->lock_ptr and hb->lock held.
1339 */ 1336 */
1340 static inline 1337 static inline
1341 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, 1338 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1342 struct futex_hash_bucket *hb) 1339 struct futex_hash_bucket *hb)
1343 { 1340 {
1344 get_futex_key_refs(key); 1341 get_futex_key_refs(key);
1345 q->key = *key; 1342 q->key = *key;
1346 1343
1347 __unqueue_futex(q); 1344 __unqueue_futex(q);
1348 1345
1349 WARN_ON(!q->rt_waiter); 1346 WARN_ON(!q->rt_waiter);
1350 q->rt_waiter = NULL; 1347 q->rt_waiter = NULL;
1351 1348
1352 q->lock_ptr = &hb->lock; 1349 q->lock_ptr = &hb->lock;
1353 1350
1354 wake_up_state(q->task, TASK_NORMAL); 1351 wake_up_state(q->task, TASK_NORMAL);
1355 } 1352 }
1356 1353
1357 /** 1354 /**
1358 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter 1355 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1359 * @pifutex: the user address of the to futex 1356 * @pifutex: the user address of the to futex
1360 * @hb1: the from futex hash bucket, must be locked by the caller 1357 * @hb1: the from futex hash bucket, must be locked by the caller
1361 * @hb2: the to futex hash bucket, must be locked by the caller 1358 * @hb2: the to futex hash bucket, must be locked by the caller
1362 * @key1: the from futex key 1359 * @key1: the from futex key
1363 * @key2: the to futex key 1360 * @key2: the to futex key
1364 * @ps: address to store the pi_state pointer 1361 * @ps: address to store the pi_state pointer
1365 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 1362 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1366 * 1363 *
1367 * Try and get the lock on behalf of the top waiter if we can do it atomically. 1364 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1368 * Wake the top waiter if we succeed. If the caller specified set_waiters, 1365 * Wake the top waiter if we succeed. If the caller specified set_waiters,
1369 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. 1366 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1370 * hb1 and hb2 must be held by the caller. 1367 * hb1 and hb2 must be held by the caller.
1371 * 1368 *
1372 * Return: 1369 * Return:
1373 * 0 - failed to acquire the lock atomically; 1370 * 0 - failed to acquire the lock atomically;
1374 * >0 - acquired the lock, return value is vpid of the top_waiter 1371 * >0 - acquired the lock, return value is vpid of the top_waiter
1375 * <0 - error 1372 * <0 - error
1376 */ 1373 */
1377 static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1374 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1378 struct futex_hash_bucket *hb1, 1375 struct futex_hash_bucket *hb1,
1379 struct futex_hash_bucket *hb2, 1376 struct futex_hash_bucket *hb2,
1380 union futex_key *key1, union futex_key *key2, 1377 union futex_key *key1, union futex_key *key2,
1381 struct futex_pi_state **ps, int set_waiters) 1378 struct futex_pi_state **ps, int set_waiters)
1382 { 1379 {
1383 struct futex_q *top_waiter = NULL; 1380 struct futex_q *top_waiter = NULL;
1384 u32 curval; 1381 u32 curval;
1385 int ret, vpid; 1382 int ret, vpid;
1386 1383
1387 if (get_futex_value_locked(&curval, pifutex)) 1384 if (get_futex_value_locked(&curval, pifutex))
1388 return -EFAULT; 1385 return -EFAULT;
1389 1386
1390 /* 1387 /*
1391 * Find the top_waiter and determine if there are additional waiters. 1388 * Find the top_waiter and determine if there are additional waiters.
1392 * If the caller intends to requeue more than 1 waiter to pifutex, 1389 * If the caller intends to requeue more than 1 waiter to pifutex,
1393 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, 1390 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1394 * as we have means to handle the possible fault. If not, don't set 1391 * as we have means to handle the possible fault. If not, don't set
1395 * the bit unnecessarily as it will force the subsequent unlock to enter 1392 * the bit unnecessarily as it will force the subsequent unlock to enter
1396 * the kernel. 1393 * the kernel.
1397 */ 1394 */
1398 top_waiter = futex_top_waiter(hb1, key1); 1395 top_waiter = futex_top_waiter(hb1, key1);
1399 1396
1400 /* There are no waiters, nothing for us to do. */ 1397 /* There are no waiters, nothing for us to do. */
1401 if (!top_waiter) 1398 if (!top_waiter)
1402 return 0; 1399 return 0;
1403 1400
1404 /* Ensure we requeue to the expected futex. */ 1401 /* Ensure we requeue to the expected futex. */
1405 if (!match_futex(top_waiter->requeue_pi_key, key2)) 1402 if (!match_futex(top_waiter->requeue_pi_key, key2))
1406 return -EINVAL; 1403 return -EINVAL;
1407 1404
1408 /* 1405 /*
1409 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in 1406 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1410 * the contended case or if set_waiters is 1. The pi_state is returned 1407 * the contended case or if set_waiters is 1. The pi_state is returned
1411 * in ps in contended cases. 1408 * in ps in contended cases.
1412 */ 1409 */
1413 vpid = task_pid_vnr(top_waiter->task); 1410 vpid = task_pid_vnr(top_waiter->task);
1414 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1411 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1415 set_waiters); 1412 set_waiters);
1416 if (ret == 1) { 1413 if (ret == 1) {
1417 requeue_pi_wake_futex(top_waiter, key2, hb2); 1414 requeue_pi_wake_futex(top_waiter, key2, hb2);
1418 return vpid; 1415 return vpid;
1419 } 1416 }
1420 return ret; 1417 return ret;
1421 } 1418 }
1422 1419
1423 /** 1420 /**
1424 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1421 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1425 * @uaddr1: source futex user address 1422 * @uaddr1: source futex user address
1426 * @flags: futex flags (FLAGS_SHARED, etc.) 1423 * @flags: futex flags (FLAGS_SHARED, etc.)
1427 * @uaddr2: target futex user address 1424 * @uaddr2: target futex user address
1428 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1425 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1429 * @nr_requeue: number of waiters to requeue (0-INT_MAX) 1426 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1430 * @cmpval: @uaddr1 expected value (or %NULL) 1427 * @cmpval: @uaddr1 expected value (or %NULL)
1431 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a 1428 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1432 * pi futex (pi to pi requeue is not supported) 1429 * pi futex (pi to pi requeue is not supported)
1433 * 1430 *
1434 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1431 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1435 * uaddr2 atomically on behalf of the top waiter. 1432 * uaddr2 atomically on behalf of the top waiter.
1436 * 1433 *
1437 * Return: 1434 * Return:
1438 * >=0 - on success, the number of tasks requeued or woken; 1435 * >=0 - on success, the number of tasks requeued or woken;
1439 * <0 - on error 1436 * <0 - on error
1440 */ 1437 */
1441 static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1438 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1442 u32 __user *uaddr2, int nr_wake, int nr_requeue, 1439 u32 __user *uaddr2, int nr_wake, int nr_requeue,
1443 u32 *cmpval, int requeue_pi) 1440 u32 *cmpval, int requeue_pi)
1444 { 1441 {
1445 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1442 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1446 int drop_count = 0, task_count = 0, ret; 1443 int drop_count = 0, task_count = 0, ret;
1447 struct futex_pi_state *pi_state = NULL; 1444 struct futex_pi_state *pi_state = NULL;
1448 struct futex_hash_bucket *hb1, *hb2; 1445 struct futex_hash_bucket *hb1, *hb2;
1449 struct futex_q *this, *next; 1446 struct futex_q *this, *next;
1450 1447
1451 if (requeue_pi) { 1448 if (requeue_pi) {
1452 /* 1449 /*
1453 * Requeue PI only works on two distinct uaddrs. This 1450 * Requeue PI only works on two distinct uaddrs. This
1454 * check is only valid for private futexes. See below. 1451 * check is only valid for private futexes. See below.
1455 */ 1452 */
1456 if (uaddr1 == uaddr2) 1453 if (uaddr1 == uaddr2)
1457 return -EINVAL; 1454 return -EINVAL;
1458 1455
1459 /* 1456 /*
1460 * requeue_pi requires a pi_state, try to allocate it now 1457 * requeue_pi requires a pi_state, try to allocate it now
1461 * without any locks in case it fails. 1458 * without any locks in case it fails.
1462 */ 1459 */
1463 if (refill_pi_state_cache()) 1460 if (refill_pi_state_cache())
1464 return -ENOMEM; 1461 return -ENOMEM;
1465 /* 1462 /*
1466 * requeue_pi must wake as many tasks as it can, up to nr_wake 1463 * requeue_pi must wake as many tasks as it can, up to nr_wake
1467 * + nr_requeue, since it acquires the rt_mutex prior to 1464 * + nr_requeue, since it acquires the rt_mutex prior to
1468 * returning to userspace, so as to not leave the rt_mutex with 1465 * returning to userspace, so as to not leave the rt_mutex with
1469 * waiters and no owner. However, second and third wake-ups 1466 * waiters and no owner. However, second and third wake-ups
1470 * cannot be predicted as they involve race conditions with the 1467 * cannot be predicted as they involve race conditions with the
1471 * first wake and a fault while looking up the pi_state. Both 1468 * first wake and a fault while looking up the pi_state. Both
1472 * pthread_cond_signal() and pthread_cond_broadcast() should 1469 * pthread_cond_signal() and pthread_cond_broadcast() should
1473 * use nr_wake=1. 1470 * use nr_wake=1.
1474 */ 1471 */
1475 if (nr_wake != 1) 1472 if (nr_wake != 1)
1476 return -EINVAL; 1473 return -EINVAL;
1477 } 1474 }
1478 1475
1479 retry: 1476 retry:
1480 if (pi_state != NULL) { 1477 if (pi_state != NULL) {
1481 /* 1478 /*
1482 * We will have to lookup the pi_state again, so free this one 1479 * We will have to lookup the pi_state again, so free this one
1483 * to keep the accounting correct. 1480 * to keep the accounting correct.
1484 */ 1481 */
1485 free_pi_state(pi_state); 1482 free_pi_state(pi_state);
1486 pi_state = NULL; 1483 pi_state = NULL;
1487 } 1484 }
1488 1485
1489 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1486 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1490 if (unlikely(ret != 0)) 1487 if (unlikely(ret != 0))
1491 goto out; 1488 goto out;
1492 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, 1489 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1493 requeue_pi ? VERIFY_WRITE : VERIFY_READ); 1490 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1494 if (unlikely(ret != 0)) 1491 if (unlikely(ret != 0))
1495 goto out_put_key1; 1492 goto out_put_key1;
1496 1493
1497 /* 1494 /*
1498 * The check above which compares uaddrs is not sufficient for 1495 * The check above which compares uaddrs is not sufficient for
1499 * shared futexes. We need to compare the keys: 1496 * shared futexes. We need to compare the keys:
1500 */ 1497 */
1501 if (requeue_pi && match_futex(&key1, &key2)) { 1498 if (requeue_pi && match_futex(&key1, &key2)) {
1502 ret = -EINVAL; 1499 ret = -EINVAL;
1503 goto out_put_keys; 1500 goto out_put_keys;
1504 } 1501 }
1505 1502
1506 hb1 = hash_futex(&key1); 1503 hb1 = hash_futex(&key1);
1507 hb2 = hash_futex(&key2); 1504 hb2 = hash_futex(&key2);
1508 1505
1509 retry_private: 1506 retry_private:
1510 hb_waiters_inc(hb2); 1507 hb_waiters_inc(hb2);
1511 double_lock_hb(hb1, hb2); 1508 double_lock_hb(hb1, hb2);
1512 1509
1513 if (likely(cmpval != NULL)) { 1510 if (likely(cmpval != NULL)) {
1514 u32 curval; 1511 u32 curval;
1515 1512
1516 ret = get_futex_value_locked(&curval, uaddr1); 1513 ret = get_futex_value_locked(&curval, uaddr1);
1517 1514
1518 if (unlikely(ret)) { 1515 if (unlikely(ret)) {
1519 double_unlock_hb(hb1, hb2); 1516 double_unlock_hb(hb1, hb2);
1520 hb_waiters_dec(hb2); 1517 hb_waiters_dec(hb2);
1521 1518
1522 ret = get_user(curval, uaddr1); 1519 ret = get_user(curval, uaddr1);
1523 if (ret) 1520 if (ret)
1524 goto out_put_keys; 1521 goto out_put_keys;
1525 1522
1526 if (!(flags & FLAGS_SHARED)) 1523 if (!(flags & FLAGS_SHARED))
1527 goto retry_private; 1524 goto retry_private;
1528 1525
1529 put_futex_key(&key2); 1526 put_futex_key(&key2);
1530 put_futex_key(&key1); 1527 put_futex_key(&key1);
1531 goto retry; 1528 goto retry;
1532 } 1529 }
1533 if (curval != *cmpval) { 1530 if (curval != *cmpval) {
1534 ret = -EAGAIN; 1531 ret = -EAGAIN;
1535 goto out_unlock; 1532 goto out_unlock;
1536 } 1533 }
1537 } 1534 }
1538 1535
1539 if (requeue_pi && (task_count - nr_wake < nr_requeue)) { 1536 if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1540 /* 1537 /*
1541 * Attempt to acquire uaddr2 and wake the top waiter. If we 1538 * Attempt to acquire uaddr2 and wake the top waiter. If we
1542 * intend to requeue waiters, force setting the FUTEX_WAITERS 1539 * intend to requeue waiters, force setting the FUTEX_WAITERS
1543 * bit. We force this here where we are able to easily handle 1540 * bit. We force this here where we are able to easily handle
1544 * faults rather than in the requeue loop below. 1541 * faults rather than in the requeue loop below.
1545 */ 1542 */
1546 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, 1543 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1547 &key2, &pi_state, nr_requeue); 1544 &key2, &pi_state, nr_requeue);
1548 1545
1549 /* 1546 /*
1550 * At this point the top_waiter has either taken uaddr2 or is 1547 * At this point the top_waiter has either taken uaddr2 or is
1551 * waiting on it. If the former, then the pi_state will not 1548 * waiting on it. If the former, then the pi_state will not
1552 * exist yet, look it up one more time to ensure we have a 1549 * exist yet, look it up one more time to ensure we have a
1553 * reference to it. If the lock was taken, ret contains the 1550 * reference to it. If the lock was taken, ret contains the
1554 * vpid of the top waiter task. 1551 * vpid of the top waiter task.
1555 */ 1552 */
1556 if (ret > 0) { 1553 if (ret > 0) {
1557 WARN_ON(pi_state); 1554 WARN_ON(pi_state);
1558 drop_count++; 1555 drop_count++;
1559 task_count++; 1556 task_count++;
1560 /* 1557 /*
1561 * If we acquired the lock, then the user 1558 * If we acquired the lock, then the user
1562 * space value of uaddr2 should be vpid. It 1559 * space value of uaddr2 should be vpid. It
1563 * cannot be changed by the top waiter as it 1560 * cannot be changed by the top waiter as it
1564 * is blocked on hb2 lock if it tries to do 1561 * is blocked on hb2 lock if it tries to do
1565 * so. If something fiddled with it behind our 1562 * so. If something fiddled with it behind our
1566 * back the pi state lookup might unearth 1563 * back the pi state lookup might unearth
1567 * it. So we rather use the known value than 1564 * it. So we rather use the known value than
1568 * rereading and handing potential crap to 1565 * rereading and handing potential crap to
1569 * lookup_pi_state. 1566 * lookup_pi_state.
1570 */ 1567 */
1571 ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL); 1568 ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL);
1572 } 1569 }
1573 1570
1574 switch (ret) { 1571 switch (ret) {
1575 case 0: 1572 case 0:
1576 break; 1573 break;
1577 case -EFAULT: 1574 case -EFAULT:
1578 double_unlock_hb(hb1, hb2); 1575 double_unlock_hb(hb1, hb2);
1579 hb_waiters_dec(hb2); 1576 hb_waiters_dec(hb2);
1580 put_futex_key(&key2); 1577 put_futex_key(&key2);
1581 put_futex_key(&key1); 1578 put_futex_key(&key1);
1582 ret = fault_in_user_writeable(uaddr2); 1579 ret = fault_in_user_writeable(uaddr2);
1583 if (!ret) 1580 if (!ret)
1584 goto retry; 1581 goto retry;
1585 goto out; 1582 goto out;
1586 case -EAGAIN: 1583 case -EAGAIN:
1587 /* The owner was exiting, try again. */ 1584 /* The owner was exiting, try again. */
1588 double_unlock_hb(hb1, hb2); 1585 double_unlock_hb(hb1, hb2);
1589 hb_waiters_dec(hb2); 1586 hb_waiters_dec(hb2);
1590 put_futex_key(&key2); 1587 put_futex_key(&key2);
1591 put_futex_key(&key1); 1588 put_futex_key(&key1);
1592 cond_resched(); 1589 cond_resched();
1593 goto retry; 1590 goto retry;
1594 default: 1591 default:
1595 goto out_unlock; 1592 goto out_unlock;
1596 } 1593 }
1597 } 1594 }
1598 1595
1599 plist_for_each_entry_safe(this, next, &hb1->chain, list) { 1596 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1600 if (task_count - nr_wake >= nr_requeue) 1597 if (task_count - nr_wake >= nr_requeue)
1601 break; 1598 break;
1602 1599
1603 if (!match_futex(&this->key, &key1)) 1600 if (!match_futex(&this->key, &key1))
1604 continue; 1601 continue;
1605 1602
1606 /* 1603 /*
1607 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 1604 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1608 * be paired with each other and no other futex ops. 1605 * be paired with each other and no other futex ops.
1609 * 1606 *
1610 * We should never be requeueing a futex_q with a pi_state, 1607 * We should never be requeueing a futex_q with a pi_state,
1611 * which is awaiting a futex_unlock_pi(). 1608 * which is awaiting a futex_unlock_pi().
1612 */ 1609 */
1613 if ((requeue_pi && !this->rt_waiter) || 1610 if ((requeue_pi && !this->rt_waiter) ||
1614 (!requeue_pi && this->rt_waiter) || 1611 (!requeue_pi && this->rt_waiter) ||
1615 this->pi_state) { 1612 this->pi_state) {
1616 ret = -EINVAL; 1613 ret = -EINVAL;
1617 break; 1614 break;
1618 } 1615 }
1619 1616
1620 /* 1617 /*
1621 * Wake nr_wake waiters. For requeue_pi, if we acquired the 1618 * Wake nr_wake waiters. For requeue_pi, if we acquired the
1622 * lock, we already woke the top_waiter. If not, it will be 1619 * lock, we already woke the top_waiter. If not, it will be
1623 * woken by futex_unlock_pi(). 1620 * woken by futex_unlock_pi().
1624 */ 1621 */
1625 if (++task_count <= nr_wake && !requeue_pi) { 1622 if (++task_count <= nr_wake && !requeue_pi) {
1626 wake_futex(this); 1623 wake_futex(this);
1627 continue; 1624 continue;
1628 } 1625 }
1629 1626
1630 /* Ensure we requeue to the expected futex for requeue_pi. */ 1627 /* Ensure we requeue to the expected futex for requeue_pi. */
1631 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { 1628 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1632 ret = -EINVAL; 1629 ret = -EINVAL;
1633 break; 1630 break;
1634 } 1631 }
1635 1632
1636 /* 1633 /*
1637 * Requeue nr_requeue waiters and possibly one more in the case 1634 * Requeue nr_requeue waiters and possibly one more in the case
1638 * of requeue_pi if we couldn't acquire the lock atomically. 1635 * of requeue_pi if we couldn't acquire the lock atomically.
1639 */ 1636 */
1640 if (requeue_pi) { 1637 if (requeue_pi) {
1641 /* Prepare the waiter to take the rt_mutex. */ 1638 /* Prepare the waiter to take the rt_mutex. */
1642 atomic_inc(&pi_state->refcount); 1639 atomic_inc(&pi_state->refcount);
1643 this->pi_state = pi_state; 1640 this->pi_state = pi_state;
1644 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 1641 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1645 this->rt_waiter, 1642 this->rt_waiter,
1646 this->task, 1); 1643 this->task, 1);
1647 if (ret == 1) { 1644 if (ret == 1) {
1648 /* We got the lock. */ 1645 /* We got the lock. */
1649 requeue_pi_wake_futex(this, &key2, hb2); 1646 requeue_pi_wake_futex(this, &key2, hb2);
1650 drop_count++; 1647 drop_count++;
1651 continue; 1648 continue;
1652 } else if (ret) { 1649 } else if (ret) {
1653 /* -EDEADLK */ 1650 /* -EDEADLK */
1654 this->pi_state = NULL; 1651 this->pi_state = NULL;
1655 free_pi_state(pi_state); 1652 free_pi_state(pi_state);
1656 goto out_unlock; 1653 goto out_unlock;
1657 } 1654 }
1658 } 1655 }
1659 requeue_futex(this, hb1, hb2, &key2); 1656 requeue_futex(this, hb1, hb2, &key2);
1660 drop_count++; 1657 drop_count++;
1661 } 1658 }
1662 1659
1663 out_unlock: 1660 out_unlock:
1664 double_unlock_hb(hb1, hb2); 1661 double_unlock_hb(hb1, hb2);
1665 hb_waiters_dec(hb2); 1662 hb_waiters_dec(hb2);
1666 1663
1667 /* 1664 /*
1668 * drop_futex_key_refs() must be called outside the spinlocks. During 1665 * drop_futex_key_refs() must be called outside the spinlocks. During
1669 * the requeue we moved futex_q's from the hash bucket at key1 to the 1666 * the requeue we moved futex_q's from the hash bucket at key1 to the
1670 * one at key2 and updated their key pointer. We no longer need to 1667 * one at key2 and updated their key pointer. We no longer need to
1671 * hold the references to key1. 1668 * hold the references to key1.
1672 */ 1669 */
1673 while (--drop_count >= 0) 1670 while (--drop_count >= 0)
1674 drop_futex_key_refs(&key1); 1671 drop_futex_key_refs(&key1);
1675 1672
1676 out_put_keys: 1673 out_put_keys:
1677 put_futex_key(&key2); 1674 put_futex_key(&key2);
1678 out_put_key1: 1675 out_put_key1:
1679 put_futex_key(&key1); 1676 put_futex_key(&key1);
1680 out: 1677 out:
1681 if (pi_state != NULL) 1678 if (pi_state != NULL)
1682 free_pi_state(pi_state); 1679 free_pi_state(pi_state);
1683 return ret ? ret : task_count; 1680 return ret ? ret : task_count;
1684 } 1681 }
1685 1682
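futex_requeue() with requeue_pi=1 is the backend for PI-aware condition variables; per the nr_wake check near the top of the function, a broadcast wakes exactly one waiter and requeues the rest onto the PI mutex word. A hedged sketch of the corresponding user-space call (cond_word, mutex_word and expected_cond_val are placeholders for condvar internals):

#define _GNU_SOURCE
#include <linux/futex.h>        /* FUTEX_CMP_REQUEUE_PI */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <limits.h>

static long cond_broadcast_requeue_pi(uint32_t *cond_word,
                                      uint32_t *mutex_word,
                                      uint32_t expected_cond_val)
{
        /* nr_wake must be 1; nr_requeue rides in the timeout slot;
         * val3 is the expected value of *cond_word (cmpval). */
        return syscall(SYS_futex, cond_word, FUTEX_CMP_REQUEUE_PI, 1,
                       (void *)(unsigned long)INT_MAX, mutex_word,
                       expected_cond_val);
}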
1686 /* The key must be already stored in q->key. */ 1683 /* The key must be already stored in q->key. */
1687 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 1684 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1688 __acquires(&hb->lock) 1685 __acquires(&hb->lock)
1689 { 1686 {
1690 struct futex_hash_bucket *hb; 1687 struct futex_hash_bucket *hb;
1691 1688
1692 hb = hash_futex(&q->key); 1689 hb = hash_futex(&q->key);
1693 1690
1694 /* 1691 /*
1695 * Increment the counter before taking the lock so that 1692 * Increment the counter before taking the lock so that
1696 * a potential waker won't miss a to-be-slept task that is 1693 * a potential waker won't miss a to-be-slept task that is
1697 * waiting for the spinlock. This is safe as all queue_lock() 1694 * waiting for the spinlock. This is safe as all queue_lock()
1698 * users end up calling queue_me(). Similarly, for housekeeping, 1695 * users end up calling queue_me(). Similarly, for housekeeping,
1699 * decrement the counter at queue_unlock() when some error has 1696 * decrement the counter at queue_unlock() when some error has
1700 * occurred and we don't end up adding the task to the list. 1697 * occurred and we don't end up adding the task to the list.
1701 */ 1698 */
1702 hb_waiters_inc(hb); 1699 hb_waiters_inc(hb);
1703 1700
1704 q->lock_ptr = &hb->lock; 1701 q->lock_ptr = &hb->lock;
1705 1702
1706 spin_lock(&hb->lock); /* implies MB (A) */ 1703 spin_lock(&hb->lock); /* implies MB (A) */
1707 return hb; 1704 return hb;
1708 } 1705 }
1709 1706
1710 static inline void 1707 static inline void
1711 queue_unlock(struct futex_hash_bucket *hb) 1708 queue_unlock(struct futex_hash_bucket *hb)
1712 __releases(&hb->lock) 1709 __releases(&hb->lock)
1713 { 1710 {
1714 spin_unlock(&hb->lock); 1711 spin_unlock(&hb->lock);
1715 hb_waiters_dec(hb); 1712 hb_waiters_dec(hb);
1716 } 1713 }
1717 1714
1718 /** 1715 /**
1719 * queue_me() - Enqueue the futex_q on the futex_hash_bucket 1716 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1720 * @q: The futex_q to enqueue 1717 * @q: The futex_q to enqueue
1721 * @hb: The destination hash bucket 1718 * @hb: The destination hash bucket
1722 * 1719 *
1723 * The hb->lock must be held by the caller, and is released here. A call to 1720 * The hb->lock must be held by the caller, and is released here. A call to
1724 * queue_me() is typically paired with exactly one call to unqueue_me(). The 1721 * queue_me() is typically paired with exactly one call to unqueue_me(). The
1725 * exceptions involve the PI related operations, which may use unqueue_me_pi() 1722 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1726 * or nothing if the unqueue is done as part of the wake process and the unqueue 1723 * or nothing if the unqueue is done as part of the wake process and the unqueue
1727 * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for 1724 * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
1728 * an example). 1725 * an example).
1729 */ 1726 */
1730 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1727 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1731 __releases(&hb->lock) 1728 __releases(&hb->lock)
1732 { 1729 {
1733 int prio; 1730 int prio;
1734 1731
1735 /* 1732 /*
1736 * The priority used to register this element is 1733 * The priority used to register this element is
1737 * - either the real thread-priority for the real-time threads 1734 * - either the real thread-priority for the real-time threads
1738 * (i.e. threads with a priority lower than MAX_RT_PRIO) 1735 * (i.e. threads with a priority lower than MAX_RT_PRIO)
1739 * - or MAX_RT_PRIO for non-RT threads. 1736 * - or MAX_RT_PRIO for non-RT threads.
1740 * Thus, all RT-threads are woken first in priority order, and 1737 * Thus, all RT-threads are woken first in priority order, and
1741 * the others are woken last, in FIFO order. 1738 * the others are woken last, in FIFO order.
1742 */ 1739 */
1743 prio = min(current->normal_prio, MAX_RT_PRIO); 1740 prio = min(current->normal_prio, MAX_RT_PRIO);
1744 1741
1745 plist_node_init(&q->list, prio); 1742 plist_node_init(&q->list, prio);
1746 plist_add(&q->list, &hb->chain); 1743 plist_add(&q->list, &hb->chain);
1747 q->task = current; 1744 q->task = current;
1748 spin_unlock(&hb->lock); 1745 spin_unlock(&hb->lock);
1749 } 1746 }
1750 1747
1751 /** 1748 /**
1752 * unqueue_me() - Remove the futex_q from its futex_hash_bucket 1749 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1753 * @q: The futex_q to unqueue 1750 * @q: The futex_q to unqueue
1754 * 1751 *
1755 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must 1752 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1756 * be paired with exactly one earlier call to queue_me(). 1753 * be paired with exactly one earlier call to queue_me().
1757 * 1754 *
1758 * Return: 1755 * Return:
1759 * 1 - if the futex_q was still queued (and we unqueued it); 1756 * 1 - if the futex_q was still queued (and we unqueued it);
1760 * 0 - if the futex_q was already removed by the waking thread 1757 * 0 - if the futex_q was already removed by the waking thread
1761 */ 1758 */
1762 static int unqueue_me(struct futex_q *q) 1759 static int unqueue_me(struct futex_q *q)
1763 { 1760 {
1764 spinlock_t *lock_ptr; 1761 spinlock_t *lock_ptr;
1765 int ret = 0; 1762 int ret = 0;
1766 1763
1767 /* In the common case we don't take the spinlock, which is nice. */ 1764 /* In the common case we don't take the spinlock, which is nice. */
1768 retry: 1765 retry:
1769 lock_ptr = q->lock_ptr; 1766 lock_ptr = q->lock_ptr;
1770 barrier(); 1767 barrier();
1771 if (lock_ptr != NULL) { 1768 if (lock_ptr != NULL) {
1772 spin_lock(lock_ptr); 1769 spin_lock(lock_ptr);
1773 /* 1770 /*
1774 * q->lock_ptr can change between reading it and 1771 * q->lock_ptr can change between reading it and
1775 * spin_lock(), causing us to take the wrong lock. This 1772 * spin_lock(), causing us to take the wrong lock. This
1776 * corrects the race condition. 1773 * corrects the race condition.
1777 * 1774 *
1778 * Reasoning goes like this: if we have the wrong lock, 1775 * Reasoning goes like this: if we have the wrong lock,
1779 * q->lock_ptr must have changed (maybe several times) 1776 * q->lock_ptr must have changed (maybe several times)
1780 * between reading it and the spin_lock(). It can 1777 * between reading it and the spin_lock(). It can
1781 * change again after the spin_lock() but only if it was 1778 * change again after the spin_lock() but only if it was
1782 * already changed before the spin_lock(). It cannot, 1779 * already changed before the spin_lock(). It cannot,
1783 * however, change back to the original value. Therefore 1780 * however, change back to the original value. Therefore
1784 * we can detect whether we acquired the correct lock. 1781 * we can detect whether we acquired the correct lock.
1785 */ 1782 */
1786 if (unlikely(lock_ptr != q->lock_ptr)) { 1783 if (unlikely(lock_ptr != q->lock_ptr)) {
1787 spin_unlock(lock_ptr); 1784 spin_unlock(lock_ptr);
1788 goto retry; 1785 goto retry;
1789 } 1786 }
1790 __unqueue_futex(q); 1787 __unqueue_futex(q);
1791 1788
1792 BUG_ON(q->pi_state); 1789 BUG_ON(q->pi_state);
1793 1790
1794 spin_unlock(lock_ptr); 1791 spin_unlock(lock_ptr);
1795 ret = 1; 1792 ret = 1;
1796 } 1793 }
1797 1794
1798 drop_futex_key_refs(&q->key); 1795 drop_futex_key_refs(&q->key);
1799 return ret; 1796 return ret;
1800 } 1797 }
1801 1798
1802 /* 1799 /*
1803 * PI futexes cannot be requeued and must remove themselves from the 1800 * PI futexes cannot be requeued and must remove themselves from the
1804 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry 1801 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1805 * and dropped here. 1802 * and dropped here.
1806 */ 1803 */
1807 static void unqueue_me_pi(struct futex_q *q) 1804 static void unqueue_me_pi(struct futex_q *q)
1808 __releases(q->lock_ptr) 1805 __releases(q->lock_ptr)
1809 { 1806 {
1810 __unqueue_futex(q); 1807 __unqueue_futex(q);
1811 1808
1812 BUG_ON(!q->pi_state); 1809 BUG_ON(!q->pi_state);
1813 free_pi_state(q->pi_state); 1810 free_pi_state(q->pi_state);
1814 q->pi_state = NULL; 1811 q->pi_state = NULL;
1815 1812
1816 spin_unlock(q->lock_ptr); 1813 spin_unlock(q->lock_ptr);
1817 } 1814 }
1818 1815
1819 /* 1816 /*
1820 * Fixup the pi_state owner with the new owner. 1817 * Fixup the pi_state owner with the new owner.
1821 * 1818 *
1822 * Must be called with hash bucket lock held and mm->sem held for non 1819 * Must be called with hash bucket lock held and mm->sem held for non
1823 * private futexes. 1820 * private futexes.
1824 */ 1821 */
1825 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1822 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1826 struct task_struct *newowner) 1823 struct task_struct *newowner)
1827 { 1824 {
1828 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1825 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1829 struct futex_pi_state *pi_state = q->pi_state; 1826 struct futex_pi_state *pi_state = q->pi_state;
1830 struct task_struct *oldowner = pi_state->owner; 1827 struct task_struct *oldowner = pi_state->owner;
1831 u32 uval, uninitialized_var(curval), newval; 1828 u32 uval, uninitialized_var(curval), newval;
1832 int ret; 1829 int ret;
1833 1830
1834 /* Owner died? */ 1831 /* Owner died? */
1835 if (!pi_state->owner) 1832 if (!pi_state->owner)
1836 newtid |= FUTEX_OWNER_DIED; 1833 newtid |= FUTEX_OWNER_DIED;
1837 1834
1838 /* 1835 /*
1839 * We are here either because we stole the rtmutex from the 1836 * We are here either because we stole the rtmutex from the
1840 * previous highest priority waiter or we are the highest priority 1837 * previous highest priority waiter or we are the highest priority
1841 * waiter but failed to get the rtmutex the first time. 1838 * waiter but failed to get the rtmutex the first time.
1842 * We have to replace the newowner TID in the user space variable. 1839 * We have to replace the newowner TID in the user space variable.
1843 * This must be atomic as we have to preserve the owner died bit here. 1840 * This must be atomic as we have to preserve the owner died bit here.
1844 * 1841 *
1845 * Note: We write the user space value _before_ changing the pi_state 1842 * Note: We write the user space value _before_ changing the pi_state
1846 * because we can fault here. Imagine swapped out pages or a fork 1843 * because we can fault here. Imagine swapped out pages or a fork
1847 * that marked all the anonymous memory readonly for cow. 1844 * that marked all the anonymous memory readonly for cow.
1848 * 1845 *
1849 * Modifying pi_state _before_ the user space value would 1846 * Modifying pi_state _before_ the user space value would
1850 * leave the pi_state in an inconsistent state when we fault 1847 * leave the pi_state in an inconsistent state when we fault
1851 * here, because we need to drop the hash bucket lock to 1848 * here, because we need to drop the hash bucket lock to
1852 * handle the fault. This might be observed in the PID check 1849 * handle the fault. This might be observed in the PID check
1853 * in lookup_pi_state. 1850 * in lookup_pi_state.
1854 */ 1851 */
1855 retry: 1852 retry:
1856 if (get_futex_value_locked(&uval, uaddr)) 1853 if (get_futex_value_locked(&uval, uaddr))
1857 goto handle_fault; 1854 goto handle_fault;
1858 1855
1859 while (1) { 1856 while (1) {
1860 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1857 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1861 1858
1862 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 1859 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1863 goto handle_fault; 1860 goto handle_fault;
1864 if (curval == uval) 1861 if (curval == uval)
1865 break; 1862 break;
1866 uval = curval; 1863 uval = curval;
1867 } 1864 }
1868 1865
1869 /* 1866 /*
1870 * We fixed up user space. Now we need to fix the pi_state 1867 * We fixed up user space. Now we need to fix the pi_state
1871 * itself. 1868 * itself.
1872 */ 1869 */
1873 if (pi_state->owner != NULL) { 1870 if (pi_state->owner != NULL) {
1874 raw_spin_lock_irq(&pi_state->owner->pi_lock); 1871 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1875 WARN_ON(list_empty(&pi_state->list)); 1872 WARN_ON(list_empty(&pi_state->list));
1876 list_del_init(&pi_state->list); 1873 list_del_init(&pi_state->list);
1877 raw_spin_unlock_irq(&pi_state->owner->pi_lock); 1874 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1878 } 1875 }
1879 1876
1880 pi_state->owner = newowner; 1877 pi_state->owner = newowner;
1881 1878
1882 raw_spin_lock_irq(&newowner->pi_lock); 1879 raw_spin_lock_irq(&newowner->pi_lock);
1883 WARN_ON(!list_empty(&pi_state->list)); 1880 WARN_ON(!list_empty(&pi_state->list));
1884 list_add(&pi_state->list, &newowner->pi_state_list); 1881 list_add(&pi_state->list, &newowner->pi_state_list);
1885 raw_spin_unlock_irq(&newowner->pi_lock); 1882 raw_spin_unlock_irq(&newowner->pi_lock);
1886 return 0; 1883 return 0;
1887 1884
1888 /* 1885 /*
1889 * To handle the page fault we need to drop the hash bucket 1886 * To handle the page fault we need to drop the hash bucket
1890 * lock here. That gives the other task (either the highest priority 1887 * lock here. That gives the other task (either the highest priority
1891 * waiter itself or the task which stole the rtmutex) the 1888 * waiter itself or the task which stole the rtmutex) the
1892 * chance to try the fixup of the pi_state. So once we are 1889 * chance to try the fixup of the pi_state. So once we are
1893 * back from handling the fault we need to check the pi_state 1890 * back from handling the fault we need to check the pi_state
1894 * after reacquiring the hash bucket lock and before trying to 1891 * after reacquiring the hash bucket lock and before trying to
1895 * do another fixup. When the fixup has been done already we 1892 * do another fixup. When the fixup has been done already we
1896 * simply return. 1893 * simply return.
1897 */ 1894 */
1898 handle_fault: 1895 handle_fault:
1899 spin_unlock(q->lock_ptr); 1896 spin_unlock(q->lock_ptr);
1900 1897
1901 ret = fault_in_user_writeable(uaddr); 1898 ret = fault_in_user_writeable(uaddr);
1902 1899
1903 spin_lock(q->lock_ptr); 1900 spin_lock(q->lock_ptr);
1904 1901
1905 /* 1902 /*
1906 * Check if someone else fixed it for us: 1903 * Check if someone else fixed it for us:
1907 */ 1904 */
1908 if (pi_state->owner != oldowner) 1905 if (pi_state->owner != oldowner)
1909 return 0; 1906 return 0;
1910 1907
1911 if (ret) 1908 if (ret)
1912 return ret; 1909 return ret;
1913 1910
1914 goto retry; 1911 goto retry;
1915 } 1912 }
1916 1913
1917 static long futex_wait_restart(struct restart_block *restart); 1914 static long futex_wait_restart(struct restart_block *restart);
1918 1915
1919 /** 1916 /**
1920 * fixup_owner() - Post lock pi_state and corner case management 1917 * fixup_owner() - Post lock pi_state and corner case management
1921 * @uaddr: user address of the futex 1918 * @uaddr: user address of the futex
1922 * @q: futex_q (contains pi_state and access to the rt_mutex) 1919 * @q: futex_q (contains pi_state and access to the rt_mutex)
1923 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 1920 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1924 * 1921 *
1925 * After attempting to lock an rt_mutex, this function is called to cleanup 1922 * After attempting to lock an rt_mutex, this function is called to cleanup
1926 * the pi_state owner as well as handle race conditions that may allow us to 1923 * the pi_state owner as well as handle race conditions that may allow us to
1927 * acquire the lock. Must be called with the hb lock held. 1924 * acquire the lock. Must be called with the hb lock held.
1928 * 1925 *
1929 * Return: 1926 * Return:
1930 * 1 - success, lock taken; 1927 * 1 - success, lock taken;
1931 * 0 - success, lock not taken; 1928 * 0 - success, lock not taken;
1932 * <0 - on error (-EFAULT) 1929 * <0 - on error (-EFAULT)
1933 */ 1930 */
1934 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 1931 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1935 { 1932 {
1936 struct task_struct *owner; 1933 struct task_struct *owner;
1937 int ret = 0; 1934 int ret = 0;
1938 1935
1939 if (locked) { 1936 if (locked) {
1940 /* 1937 /*
1941 * Got the lock. We might not be the anticipated owner if we 1938 * Got the lock. We might not be the anticipated owner if we
1942 * did a lock-steal - fix up the PI-state in that case: 1939 * did a lock-steal - fix up the PI-state in that case:
1943 */ 1940 */
1944 if (q->pi_state->owner != current) 1941 if (q->pi_state->owner != current)
1945 ret = fixup_pi_state_owner(uaddr, q, current); 1942 ret = fixup_pi_state_owner(uaddr, q, current);
1946 goto out; 1943 goto out;
1947 } 1944 }
1948 1945
1949 /* 1946 /*
1950 * Catch the rare case, where the lock was released when we were on the 1947 * Catch the rare case, where the lock was released when we were on the
1951 * way back before we locked the hash bucket. 1948 * way back before we locked the hash bucket.
1952 */ 1949 */
1953 if (q->pi_state->owner == current) { 1950 if (q->pi_state->owner == current) {
1954 /* 1951 /*
1955 * Try to get the rt_mutex now. This might fail as some other 1952 * Try to get the rt_mutex now. This might fail as some other
1956 * task acquired the rt_mutex after we removed ourselves from the 1953 * task acquired the rt_mutex after we removed ourselves from the
1957 * rt_mutex waiters list. 1954 * rt_mutex waiters list.
1958 */ 1955 */
1959 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { 1956 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1960 locked = 1; 1957 locked = 1;
1961 goto out; 1958 goto out;
1962 } 1959 }
1963 1960
1964 /* 1961 /*
1965 * pi_state is incorrect, some other task did a lock steal and 1962 * pi_state is incorrect, some other task did a lock steal and
1966 * we returned due to timeout or signal without taking the 1963 * we returned due to timeout or signal without taking the
1967 * rt_mutex. Too late. 1964 * rt_mutex. Too late.
1968 */ 1965 */
1969 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); 1966 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1970 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1967 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1971 if (!owner) 1968 if (!owner)
1972 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); 1969 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1973 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); 1970 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1974 ret = fixup_pi_state_owner(uaddr, q, owner); 1971 ret = fixup_pi_state_owner(uaddr, q, owner);
1975 goto out; 1972 goto out;
1976 } 1973 }
1977 1974
1978 /* 1975 /*
1979 * Paranoia check. If we did not take the lock, then we should not be 1976 * Paranoia check. If we did not take the lock, then we should not be
1980 * the owner of the rt_mutex. 1977 * the owner of the rt_mutex.
1981 */ 1978 */
1982 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 1979 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1983 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 1980 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1984 "pi-state %p\n", ret, 1981 "pi-state %p\n", ret,
1985 q->pi_state->pi_mutex.owner, 1982 q->pi_state->pi_mutex.owner,
1986 q->pi_state->owner); 1983 q->pi_state->owner);
1987 1984
1988 out: 1985 out:
1989 return ret ? ret : locked; 1986 return ret ? ret : locked;
1990 } 1987 }
1991 1988
1992 /** 1989 /**
1993 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal 1990 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1994 * @hb: the futex hash bucket, must be locked by the caller 1991 * @hb: the futex hash bucket, must be locked by the caller
1995 * @q: the futex_q to queue up on 1992 * @q: the futex_q to queue up on
1996 * @timeout: the prepared hrtimer_sleeper, or null for no timeout 1993 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
1997 */ 1994 */
1998 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 1995 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1999 struct hrtimer_sleeper *timeout) 1996 struct hrtimer_sleeper *timeout)
2000 { 1997 {
2001 /* 1998 /*
2002 * The task state is guaranteed to be set before another task can 1999 * The task state is guaranteed to be set before another task can
2003 * wake it. set_current_state() is implemented using set_mb() and 2000 * wake it. set_current_state() is implemented using set_mb() and
2004 * queue_me() calls spin_unlock() upon completion, both serializing 2001 * queue_me() calls spin_unlock() upon completion, both serializing
2005 * access to the hash list and forcing another memory barrier. 2002 * access to the hash list and forcing another memory barrier.
2006 */ 2003 */
2007 set_current_state(TASK_INTERRUPTIBLE); 2004 set_current_state(TASK_INTERRUPTIBLE);
2008 queue_me(q, hb); 2005 queue_me(q, hb);
2009 2006
2010 /* Arm the timer */ 2007 /* Arm the timer */
2011 if (timeout) { 2008 if (timeout) {
2012 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); 2009 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
2013 if (!hrtimer_active(&timeout->timer)) 2010 if (!hrtimer_active(&timeout->timer))
2014 timeout->task = NULL; 2011 timeout->task = NULL;
2015 } 2012 }
2016 2013
2017 /* 2014 /*
2018 * If we have been removed from the hash list, then another task 2015 * If we have been removed from the hash list, then another task
2019 * has tried to wake us, and we can skip the call to schedule(). 2016 * has tried to wake us, and we can skip the call to schedule().
2020 */ 2017 */
2021 if (likely(!plist_node_empty(&q->list))) { 2018 if (likely(!plist_node_empty(&q->list))) {
2022 /* 2019 /*
2023 * If the timer has already expired, current will already be 2020 * If the timer has already expired, current will already be
2024 * flagged for rescheduling. Only call schedule if there 2021 * flagged for rescheduling. Only call schedule if there
2025 * is no timeout, or if it has yet to expire. 2022 * is no timeout, or if it has yet to expire.
2026 */ 2023 */
2027 if (!timeout || timeout->task) 2024 if (!timeout || timeout->task)
2028 freezable_schedule(); 2025 freezable_schedule();
2029 } 2026 }
2030 __set_current_state(TASK_RUNNING); 2027 __set_current_state(TASK_RUNNING);
2031 } 2028 }
2032 2029
2033 /** 2030 /**
2034 * futex_wait_setup() - Prepare to wait on a futex 2031 * futex_wait_setup() - Prepare to wait on a futex
2035 * @uaddr: the futex userspace address 2032 * @uaddr: the futex userspace address
2036 * @val: the expected value 2033 * @val: the expected value
2037 * @flags: futex flags (FLAGS_SHARED, etc.) 2034 * @flags: futex flags (FLAGS_SHARED, etc.)
2038 * @q: the associated futex_q 2035 * @q: the associated futex_q
2039 * @hb: storage for hash_bucket pointer to be returned to caller 2036 * @hb: storage for hash_bucket pointer to be returned to caller
2040 * 2037 *
2041 * Set up the futex_q and locate the hash_bucket. Get the futex value and 2038 * Set up the futex_q and locate the hash_bucket. Get the futex value and
2042 * compare it with the expected value. Handle atomic faults internally. 2039 * compare it with the expected value. Handle atomic faults internally.
2043 * Return with the hb lock held and a q.key reference on success, and unlocked 2040 * Return with the hb lock held and a q.key reference on success, and unlocked
2044 * with no q.key reference on failure. 2041 * with no q.key reference on failure.
2045 * 2042 *
2046 * Return: 2043 * Return:
2047 * 0 - uaddr contains val and hb has been locked; 2044 * 0 - uaddr contains val and hb has been locked;
2048 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 2045 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2049 */ 2046 */
2050 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 2047 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2051 struct futex_q *q, struct futex_hash_bucket **hb) 2048 struct futex_q *q, struct futex_hash_bucket **hb)
2052 { 2049 {
2053 u32 uval; 2050 u32 uval;
2054 int ret; 2051 int ret;
2055 2052
2056 /* 2053 /*
2057 * Access the page AFTER the hash-bucket is locked. 2054 * Access the page AFTER the hash-bucket is locked.
2058 * Order is important: 2055 * Order is important:
2059 * 2056 *
2060 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); 2057 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
2061 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } 2058 * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
2062 * 2059 *
2063 * The basic logical guarantee of a futex is that it blocks ONLY 2060 * The basic logical guarantee of a futex is that it blocks ONLY
2064 * if cond(var) is known to be true at the time of blocking, for 2061 * if cond(var) is known to be true at the time of blocking, for
2065 * any cond. If we locked the hash-bucket after testing *uaddr, that 2062 * any cond. If we locked the hash-bucket after testing *uaddr, that
2066 * would open a race condition where we could block indefinitely with 2063 * would open a race condition where we could block indefinitely with
2067 * cond(var) false, which would violate the guarantee. 2064 * cond(var) false, which would violate the guarantee.
2068 * 2065 *
2069 * On the other hand, we insert q and release the hash-bucket only 2066 * On the other hand, we insert q and release the hash-bucket only
2070 * after testing *uaddr. This guarantees that futex_wait() will NOT 2067 * after testing *uaddr. This guarantees that futex_wait() will NOT
2071 * absorb a wakeup if *uaddr does not match the desired values 2068 * absorb a wakeup if *uaddr does not match the desired values
2072 * while the syscall executes. 2069 * while the syscall executes.
2073 */ 2070 */
2074 retry: 2071 retry:
2075 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); 2072 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
2076 if (unlikely(ret != 0)) 2073 if (unlikely(ret != 0))
2077 return ret; 2074 return ret;
2078 2075
2079 retry_private: 2076 retry_private:
2080 *hb = queue_lock(q); 2077 *hb = queue_lock(q);
2081 2078
2082 ret = get_futex_value_locked(&uval, uaddr); 2079 ret = get_futex_value_locked(&uval, uaddr);
2083 2080
2084 if (ret) { 2081 if (ret) {
2085 queue_unlock(*hb); 2082 queue_unlock(*hb);
2086 2083
2087 ret = get_user(uval, uaddr); 2084 ret = get_user(uval, uaddr);
2088 if (ret) 2085 if (ret)
2089 goto out; 2086 goto out;
2090 2087
2091 if (!(flags & FLAGS_SHARED)) 2088 if (!(flags & FLAGS_SHARED))
2092 goto retry_private; 2089 goto retry_private;
2093 2090
2094 put_futex_key(&q->key); 2091 put_futex_key(&q->key);
2095 goto retry; 2092 goto retry;
2096 } 2093 }
2097 2094
2098 if (uval != val) { 2095 if (uval != val) {
2099 queue_unlock(*hb); 2096 queue_unlock(*hb);
2100 ret = -EWOULDBLOCK; 2097 ret = -EWOULDBLOCK;
2101 } 2098 }
2102 2099
2103 out: 2100 out:
2104 if (ret) 2101 if (ret)
2105 put_futex_key(&q->key); 2102 put_futex_key(&q->key);
2106 return ret; 2103 return ret;
2107 } 2104 }
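/*
 * An illustrative userspace sketch (not part of this file) of the
 * waiter/waker pattern that the ordering comment inside
 * futex_wait_setup() above relies on.  The futex() wrapper and the
 * variable/function names are assumptions made for the example only;
 * they are not kernel or libc interfaces.
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

static atomic_int futex_word;		/* 32-bit futex variable in userspace */

static long futex(atomic_int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

/* Waiter: val = var; if (cond(val)) futex_wait(&var, val); */
static void wait_for_flag(void)
{
	while (atomic_load(&futex_word) == 0)
		/* The kernel re-reads *uaddr under the hb lock and only
		 * blocks if it still equals the expected value (0 here). */
		futex(&futex_word, FUTEX_WAIT, 0);
}

/* Waker: if (cond(var)) { var = new; futex_wake(&var); } */
static void set_flag(void)
{
	atomic_store(&futex_word, 1);
	futex(&futex_word, FUTEX_WAKE, 1);
}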
2108 2105
2109 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, 2106 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2110 ktime_t *abs_time, u32 bitset) 2107 ktime_t *abs_time, u32 bitset)
2111 { 2108 {
2112 struct hrtimer_sleeper timeout, *to = NULL; 2109 struct hrtimer_sleeper timeout, *to = NULL;
2113 struct restart_block *restart; 2110 struct restart_block *restart;
2114 struct futex_hash_bucket *hb; 2111 struct futex_hash_bucket *hb;
2115 struct futex_q q = futex_q_init; 2112 struct futex_q q = futex_q_init;
2116 int ret; 2113 int ret;
2117 2114
2118 if (!bitset) 2115 if (!bitset)
2119 return -EINVAL; 2116 return -EINVAL;
2120 q.bitset = bitset; 2117 q.bitset = bitset;
2121 2118
2122 if (abs_time) { 2119 if (abs_time) {
2123 to = &timeout; 2120 to = &timeout;
2124 2121
2125 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 2122 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2126 CLOCK_REALTIME : CLOCK_MONOTONIC, 2123 CLOCK_REALTIME : CLOCK_MONOTONIC,
2127 HRTIMER_MODE_ABS); 2124 HRTIMER_MODE_ABS);
2128 hrtimer_init_sleeper(to, current); 2125 hrtimer_init_sleeper(to, current);
2129 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2126 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2130 current->timer_slack_ns); 2127 current->timer_slack_ns);
2131 } 2128 }
2132 2129
2133 retry: 2130 retry:
2134 /* 2131 /*
2135 * Prepare to wait on uaddr. On success, holds hb lock and increments 2132 * Prepare to wait on uaddr. On success, holds hb lock and increments
2136 * q.key refs. 2133 * q.key refs.
2137 */ 2134 */
2138 ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 2135 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2139 if (ret) 2136 if (ret)
2140 goto out; 2137 goto out;
2141 2138
2142 /* queue_me and wait for wakeup, timeout, or a signal. */ 2139 /* queue_me and wait for wakeup, timeout, or a signal. */
2143 futex_wait_queue_me(hb, &q, to); 2140 futex_wait_queue_me(hb, &q, to);
2144 2141
2145 /* If we were woken (and unqueued), we succeeded, whatever. */ 2142 /* If we were woken (and unqueued), we succeeded, whatever. */
2146 ret = 0; 2143 ret = 0;
2147 /* unqueue_me() drops q.key ref */ 2144 /* unqueue_me() drops q.key ref */
2148 if (!unqueue_me(&q)) 2145 if (!unqueue_me(&q))
2149 goto out; 2146 goto out;
2150 ret = -ETIMEDOUT; 2147 ret = -ETIMEDOUT;
2151 if (to && !to->task) 2148 if (to && !to->task)
2152 goto out; 2149 goto out;
2153 2150
2154 /* 2151 /*
2155 * We expect signal_pending(current), but we might be the 2152 * We expect signal_pending(current), but we might be the
2156 * victim of a spurious wakeup as well. 2153 * victim of a spurious wakeup as well.
2157 */ 2154 */
2158 if (!signal_pending(current)) 2155 if (!signal_pending(current))
2159 goto retry; 2156 goto retry;
2160 2157
2161 ret = -ERESTARTSYS; 2158 ret = -ERESTARTSYS;
2162 if (!abs_time) 2159 if (!abs_time)
2163 goto out; 2160 goto out;
2164 2161
2165 restart = &current_thread_info()->restart_block; 2162 restart = &current_thread_info()->restart_block;
2166 restart->fn = futex_wait_restart; 2163 restart->fn = futex_wait_restart;
2167 restart->futex.uaddr = uaddr; 2164 restart->futex.uaddr = uaddr;
2168 restart->futex.val = val; 2165 restart->futex.val = val;
2169 restart->futex.time = abs_time->tv64; 2166 restart->futex.time = abs_time->tv64;
2170 restart->futex.bitset = bitset; 2167 restart->futex.bitset = bitset;
2171 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; 2168 restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2172 2169
2173 ret = -ERESTART_RESTARTBLOCK; 2170 ret = -ERESTART_RESTARTBLOCK;
2174 2171
2175 out: 2172 out:
2176 if (to) { 2173 if (to) {
2177 hrtimer_cancel(&to->timer); 2174 hrtimer_cancel(&to->timer);
2178 destroy_hrtimer_on_stack(&to->timer); 2175 destroy_hrtimer_on_stack(&to->timer);
2179 } 2176 }
2180 return ret; 2177 return ret;
2181 } 2178 }
2182 2179
2183 2180
2184 static long futex_wait_restart(struct restart_block *restart) 2181 static long futex_wait_restart(struct restart_block *restart)
2185 { 2182 {
2186 u32 __user *uaddr = restart->futex.uaddr; 2183 u32 __user *uaddr = restart->futex.uaddr;
2187 ktime_t t, *tp = NULL; 2184 ktime_t t, *tp = NULL;
2188 2185
2189 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 2186 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2190 t.tv64 = restart->futex.time; 2187 t.tv64 = restart->futex.time;
2191 tp = &t; 2188 tp = &t;
2192 } 2189 }
2193 restart->fn = do_no_restart_syscall; 2190 restart->fn = do_no_restart_syscall;
2194 2191
2195 return (long)futex_wait(uaddr, restart->futex.flags, 2192 return (long)futex_wait(uaddr, restart->futex.flags,
2196 restart->futex.val, tp, restart->futex.bitset); 2193 restart->futex.val, tp, restart->futex.bitset);
2197 } 2194 }
2198 2195
2199 2196
2200 /* 2197 /*
2201 * Userspace tried a 0 -> TID atomic transition of the futex value 2198 * Userspace tried a 0 -> TID atomic transition of the futex value
2202 * and failed. The kernel side here does the whole locking operation: 2199 * and failed. The kernel side here does the whole locking operation:
2203 * if there are waiters then it will block, it does PI, etc. (Due to 2200 * if there are waiters then it will block, it does PI, etc. (Due to
2204 * races the kernel might see a 0 value of the futex too.) 2201 * races the kernel might see a 0 value of the futex too.)
2205 */ 2202 */
2206 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, 2203 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
2207 ktime_t *time, int trylock) 2204 ktime_t *time, int trylock)
2208 { 2205 {
2209 struct hrtimer_sleeper timeout, *to = NULL; 2206 struct hrtimer_sleeper timeout, *to = NULL;
2210 struct futex_hash_bucket *hb; 2207 struct futex_hash_bucket *hb;
2211 struct futex_q q = futex_q_init; 2208 struct futex_q q = futex_q_init;
2212 int res, ret; 2209 int res, ret;
2213 2210
2214 if (refill_pi_state_cache()) 2211 if (refill_pi_state_cache())
2215 return -ENOMEM; 2212 return -ENOMEM;
2216 2213
2217 if (time) { 2214 if (time) {
2218 to = &timeout; 2215 to = &timeout;
2219 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, 2216 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2220 HRTIMER_MODE_ABS); 2217 HRTIMER_MODE_ABS);
2221 hrtimer_init_sleeper(to, current); 2218 hrtimer_init_sleeper(to, current);
2222 hrtimer_set_expires(&to->timer, *time); 2219 hrtimer_set_expires(&to->timer, *time);
2223 } 2220 }
2224 2221
2225 retry: 2222 retry:
2226 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); 2223 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
2227 if (unlikely(ret != 0)) 2224 if (unlikely(ret != 0))
2228 goto out; 2225 goto out;
2229 2226
2230 retry_private: 2227 retry_private:
2231 hb = queue_lock(&q); 2228 hb = queue_lock(&q);
2232 2229
2233 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); 2230 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2234 if (unlikely(ret)) { 2231 if (unlikely(ret)) {
2235 switch (ret) { 2232 switch (ret) {
2236 case 1: 2233 case 1:
2237 /* We got the lock. */ 2234 /* We got the lock. */
2238 ret = 0; 2235 ret = 0;
2239 goto out_unlock_put_key; 2236 goto out_unlock_put_key;
2240 case -EFAULT: 2237 case -EFAULT:
2241 goto uaddr_faulted; 2238 goto uaddr_faulted;
2242 case -EAGAIN: 2239 case -EAGAIN:
2243 /* 2240 /*
2244 * Task is exiting and we just wait for the 2241 * Task is exiting and we just wait for the
2245 * exit to complete. 2242 * exit to complete.
2246 */ 2243 */
2247 queue_unlock(hb); 2244 queue_unlock(hb);
2248 put_futex_key(&q.key); 2245 put_futex_key(&q.key);
2249 cond_resched(); 2246 cond_resched();
2250 goto retry; 2247 goto retry;
2251 default: 2248 default:
2252 goto out_unlock_put_key; 2249 goto out_unlock_put_key;
2253 } 2250 }
2254 } 2251 }
2255 2252
2256 /* 2253 /*
2257 * Only actually queue now that the atomic ops are done: 2254 * Only actually queue now that the atomic ops are done:
2258 */ 2255 */
2259 queue_me(&q, hb); 2256 queue_me(&q, hb);
2260 2257
2261 WARN_ON(!q.pi_state); 2258 WARN_ON(!q.pi_state);
2262 /* 2259 /*
2263 * Block on the PI mutex: 2260 * Block on the PI mutex:
2264 */ 2261 */
2265 if (!trylock) 2262 if (!trylock)
2266 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); 2263 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
2267 else { 2264 else {
2268 ret = rt_mutex_trylock(&q.pi_state->pi_mutex); 2265 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2269 /* Fixup the trylock return value: */ 2266 /* Fixup the trylock return value: */
2270 ret = ret ? 0 : -EWOULDBLOCK; 2267 ret = ret ? 0 : -EWOULDBLOCK;
2271 } 2268 }
2272 2269
2273 spin_lock(q.lock_ptr); 2270 spin_lock(q.lock_ptr);
2274 /* 2271 /*
2275 * Fixup the pi_state owner and possibly acquire the lock if we 2272 * Fixup the pi_state owner and possibly acquire the lock if we
2276 * haven't already. 2273 * haven't already.
2277 */ 2274 */
2278 res = fixup_owner(uaddr, &q, !ret); 2275 res = fixup_owner(uaddr, &q, !ret);
2279 /* 2276 /*
2280 * If fixup_owner() returned an error, propagate that. If it acquired 2277 * If fixup_owner() returned an error, propagate that. If it acquired
2281 * the lock, clear our -ETIMEDOUT or -EINTR. 2278 * the lock, clear our -ETIMEDOUT or -EINTR.
2282 */ 2279 */
2283 if (res) 2280 if (res)
2284 ret = (res < 0) ? res : 0; 2281 ret = (res < 0) ? res : 0;
2285 2282
2286 /* 2283 /*
2287 * If fixup_owner() faulted and was unable to handle the fault, unlock 2284 * If fixup_owner() faulted and was unable to handle the fault, unlock
2288 * it and return the fault to userspace. 2285 * it and return the fault to userspace.
2289 */ 2286 */
2290 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) 2287 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2291 rt_mutex_unlock(&q.pi_state->pi_mutex); 2288 rt_mutex_unlock(&q.pi_state->pi_mutex);
2292 2289
2293 /* Unqueue and drop the lock */ 2290 /* Unqueue and drop the lock */
2294 unqueue_me_pi(&q); 2291 unqueue_me_pi(&q);
2295 2292
2296 goto out_put_key; 2293 goto out_put_key;
2297 2294
2298 out_unlock_put_key: 2295 out_unlock_put_key:
2299 queue_unlock(hb); 2296 queue_unlock(hb);
2300 2297
2301 out_put_key: 2298 out_put_key:
2302 put_futex_key(&q.key); 2299 put_futex_key(&q.key);
2303 out: 2300 out:
2304 if (to) 2301 if (to)
2305 destroy_hrtimer_on_stack(&to->timer); 2302 destroy_hrtimer_on_stack(&to->timer);
2306 return ret != -EINTR ? ret : -ERESTARTNOINTR; 2303 return ret != -EINTR ? ret : -ERESTARTNOINTR;
2307 2304
2308 uaddr_faulted: 2305 uaddr_faulted:
2309 queue_unlock(hb); 2306 queue_unlock(hb);
2310 2307
2311 ret = fault_in_user_writeable(uaddr); 2308 ret = fault_in_user_writeable(uaddr);
2312 if (ret) 2309 if (ret)
2313 goto out_put_key; 2310 goto out_put_key;
2314 2311
2315 if (!(flags & FLAGS_SHARED)) 2312 if (!(flags & FLAGS_SHARED))
2316 goto retry_private; 2313 goto retry_private;
2317 2314
2318 put_futex_key(&q.key); 2315 put_futex_key(&q.key);
2319 goto retry; 2316 goto retry;
2320 } 2317 }
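/*
 * An illustrative userspace sketch (not part of this file) of the
 * 0 -> TID fast path that the comment above futex_lock_pi() refers to.
 * The helper names are assumptions made for the example; real programs
 * normally get this behaviour via PTHREAD_PRIO_INHERIT pthread mutexes.
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

static void pi_lock(uint32_t *uaddr)
{
	uint32_t expected = 0;
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* Fast path: atomically install our TID if the futex is free. */
	if (__atomic_compare_exchange_n(uaddr, &expected, tid, 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;

	/* Contended (or raced back to 0): the kernel queues us, handles the
	 * PI boosting and writes our TID into *uaddr before returning. */
	syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}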
2321 2318
2322 /* 2319 /*
2323 * Userspace attempted a TID -> 0 atomic transition, and failed. 2320 * Userspace attempted a TID -> 0 atomic transition, and failed.
2324 * This is the in-kernel slowpath: we look up the PI state (if any), 2321 * This is the in-kernel slowpath: we look up the PI state (if any),
2325 * and do the rt-mutex unlock. 2322 * and do the rt-mutex unlock.
2326 */ 2323 */
2327 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) 2324 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2328 { 2325 {
2329 struct futex_hash_bucket *hb; 2326 struct futex_hash_bucket *hb;
2330 struct futex_q *this, *next; 2327 struct futex_q *this, *next;
2331 union futex_key key = FUTEX_KEY_INIT; 2328 union futex_key key = FUTEX_KEY_INIT;
2332 u32 uval, vpid = task_pid_vnr(current); 2329 u32 uval, vpid = task_pid_vnr(current);
2333 int ret; 2330 int ret;
2334 2331
2335 retry: 2332 retry:
2336 if (get_user(uval, uaddr)) 2333 if (get_user(uval, uaddr))
2337 return -EFAULT; 2334 return -EFAULT;
2338 /* 2335 /*
2339 * We release only a lock we actually own: 2336 * We release only a lock we actually own:
2340 */ 2337 */
2341 if ((uval & FUTEX_TID_MASK) != vpid) 2338 if ((uval & FUTEX_TID_MASK) != vpid)
2342 return -EPERM; 2339 return -EPERM;
2343 2340
2344 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); 2341 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2345 if (unlikely(ret != 0)) 2342 if (unlikely(ret != 0))
2346 goto out; 2343 goto out;
2347 2344
2348 hb = hash_futex(&key); 2345 hb = hash_futex(&key);
2349 spin_lock(&hb->lock); 2346 spin_lock(&hb->lock);
2350 2347
2351 /* 2348 /*
2352 * To avoid races, try to do the TID -> 0 atomic transition 2349 * To avoid races, try to do the TID -> 0 atomic transition
2353 * again. If it succeeds then we can return without waking 2350 * again. If it succeeds then we can return without waking
2354 * anyone else up: 2351 * anyone else up. We only try this if neither the waiters nor
2352 * the owner died bit are set.
2355 */ 2353 */
2356 if (!(uval & FUTEX_OWNER_DIED) && 2354 if (!(uval & ~FUTEX_TID_MASK) &&
2357 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) 2355 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2358 goto pi_faulted; 2356 goto pi_faulted;
2359 /* 2357 /*
2360 * Rare case: we managed to release the lock atomically, 2358 * Rare case: we managed to release the lock atomically,
2361 * no need to wake anyone else up: 2359 * no need to wake anyone else up:
2362 */ 2360 */
2363 if (unlikely(uval == vpid)) 2361 if (unlikely(uval == vpid))
2364 goto out_unlock; 2362 goto out_unlock;
2365 2363
2366 /* 2364 /*
2367 * Ok, other tasks may need to be woken up - check waiters 2365 * Ok, other tasks may need to be woken up - check waiters
2368 * and do the wakeup if necessary: 2366 * and do the wakeup if necessary:
2369 */ 2367 */
2370 plist_for_each_entry_safe(this, next, &hb->chain, list) { 2368 plist_for_each_entry_safe(this, next, &hb->chain, list) {
2371 if (!match_futex (&this->key, &key)) 2369 if (!match_futex (&this->key, &key))
2372 continue; 2370 continue;
2373 ret = wake_futex_pi(uaddr, uval, this); 2371 ret = wake_futex_pi(uaddr, uval, this);
2374 /* 2372 /*
2375 * The atomic access to the futex value 2373 * The atomic access to the futex value
2376 * generated a pagefault, so retry the 2374 * generated a pagefault, so retry the
2377 * user-access and the wakeup: 2375 * user-access and the wakeup:
2378 */ 2376 */
2379 if (ret == -EFAULT) 2377 if (ret == -EFAULT)
2380 goto pi_faulted; 2378 goto pi_faulted;
2381 goto out_unlock; 2379 goto out_unlock;
2382 } 2380 }
2383 /* 2381 /*
2384 * No waiters - kernel unlocks the futex: 2382 * No waiters - kernel unlocks the futex:
2385 */ 2383 */
2386 if (!(uval & FUTEX_OWNER_DIED)) { 2384 ret = unlock_futex_pi(uaddr, uval);
2387 ret = unlock_futex_pi(uaddr, uval); 2385 if (ret == -EFAULT)
2388 if (ret == -EFAULT) 2386 goto pi_faulted;
2389 goto pi_faulted;
2390 }
2391 2387
2392 out_unlock: 2388 out_unlock:
2393 spin_unlock(&hb->lock); 2389 spin_unlock(&hb->lock);
2394 put_futex_key(&key); 2390 put_futex_key(&key);
2395 2391
2396 out: 2392 out:
2397 return ret; 2393 return ret;
2398 2394
2399 pi_faulted: 2395 pi_faulted:
2400 spin_unlock(&hb->lock); 2396 spin_unlock(&hb->lock);
2401 put_futex_key(&key); 2397 put_futex_key(&key);
2402 2398
2403 ret = fault_in_user_writeable(uaddr); 2399 ret = fault_in_user_writeable(uaddr);
2404 if (!ret) 2400 if (!ret)
2405 goto retry; 2401 goto retry;
2406 2402
2407 return ret; 2403 return ret;
2408 } 2404 }
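/*
 * An illustrative userspace sketch (not part of this file) of the
 * TID -> 0 transition that futex_unlock_pi() above is the slowpath for.
 * When FUTEX_WAITERS or FUTEX_OWNER_DIED is set in the futex word the
 * cmpxchg fails and the syscall is taken instead.  The helper name is
 * an assumption made for the example.
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

static void pi_unlock(uint32_t *uaddr)
{
	uint32_t expected = (uint32_t)syscall(SYS_gettid);

	/* Fast path: only succeeds if we own the lock and neither the
	 * waiters nor the owner-died bit is set. */
	if (__atomic_compare_exchange_n(uaddr, &expected, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;

	/* Slowpath: the kernel hands the lock to the top waiter (or just
	 * clears the word if there is none) and wakes it. */
	syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}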
2409 2405
2410 /** 2406 /**
2411 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex 2407 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2412 * @hb: the hash_bucket futex_q was originally enqueued on 2408 * @hb: the hash_bucket futex_q was originally enqueued on
2413 * @q: the futex_q woken while waiting to be requeued 2409 * @q: the futex_q woken while waiting to be requeued
2414 * @key2: the futex_key of the requeue target futex 2410 * @key2: the futex_key of the requeue target futex
2415 * @timeout: the timeout associated with the wait (NULL if none) 2411 * @timeout: the timeout associated with the wait (NULL if none)
2416 * 2412 *
2417 * Detect if the task was woken on the initial futex as opposed to the requeue 2413 * Detect if the task was woken on the initial futex as opposed to the requeue
2418 * target futex. If so, determine if it was a timeout or a signal that caused 2414 * target futex. If so, determine if it was a timeout or a signal that caused
2419 * the wakeup and return the appropriate error code to the caller. Must be 2415 * the wakeup and return the appropriate error code to the caller. Must be
2420 * called with the hb lock held. 2416 * called with the hb lock held.
2421 * 2417 *
2422 * Return: 2418 * Return:
2423 * 0 = no early wakeup detected; 2419 * 0 = no early wakeup detected;
2424 * <0 = -ETIMEDOUT or -ERESTARTNOINTR 2420 * <0 = -ETIMEDOUT or -ERESTARTNOINTR
2425 */ 2421 */
2426 static inline 2422 static inline
2427 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2423 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2428 struct futex_q *q, union futex_key *key2, 2424 struct futex_q *q, union futex_key *key2,
2429 struct hrtimer_sleeper *timeout) 2425 struct hrtimer_sleeper *timeout)
2430 { 2426 {
2431 int ret = 0; 2427 int ret = 0;
2432 2428
2433 /* 2429 /*
2434 * With the hb lock held, we avoid races while we process the wakeup. 2430 * With the hb lock held, we avoid races while we process the wakeup.
2435 * We only need to hold hb (and not hb2) to ensure atomicity as the 2431 * We only need to hold hb (and not hb2) to ensure atomicity as the
2436 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. 2432 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2437 * It can't be requeued from uaddr2 to something else since we don't 2433 * It can't be requeued from uaddr2 to something else since we don't
2438 * support a PI aware source futex for requeue. 2434 * support a PI aware source futex for requeue.
2439 */ 2435 */
2440 if (!match_futex(&q->key, key2)) { 2436 if (!match_futex(&q->key, key2)) {
2441 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); 2437 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2442 /* 2438 /*
2443 * We were woken prior to requeue by a timeout or a signal. 2439 * We were woken prior to requeue by a timeout or a signal.
2444 * Unqueue the futex_q and determine which it was. 2440 * Unqueue the futex_q and determine which it was.
2445 */ 2441 */
2446 plist_del(&q->list, &hb->chain); 2442 plist_del(&q->list, &hb->chain);
2447 hb_waiters_dec(hb); 2443 hb_waiters_dec(hb);
2448 2444
2449 /* Handle spurious wakeups gracefully */ 2445 /* Handle spurious wakeups gracefully */
2450 ret = -EWOULDBLOCK; 2446 ret = -EWOULDBLOCK;
2451 if (timeout && !timeout->task) 2447 if (timeout && !timeout->task)
2452 ret = -ETIMEDOUT; 2448 ret = -ETIMEDOUT;
2453 else if (signal_pending(current)) 2449 else if (signal_pending(current))
2454 ret = -ERESTARTNOINTR; 2450 ret = -ERESTARTNOINTR;
2455 } 2451 }
2456 return ret; 2452 return ret;
2457 } 2453 }
2458 2454
2459 /** 2455 /**
2460 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2456 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2461 * @uaddr: the futex we initially wait on (non-pi) 2457 * @uaddr: the futex we initially wait on (non-pi)
2462 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be 2458 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2463 * the same type, no requeueing from private to shared, etc. 2459 * the same type, no requeueing from private to shared, etc.
2464 * @val: the expected value of uaddr 2460 * @val: the expected value of uaddr
2465 * @abs_time: absolute timeout 2461 * @abs_time: absolute timeout
2466 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2462 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2467 * @uaddr2: the pi futex we will take prior to returning to user-space 2463 * @uaddr2: the pi futex we will take prior to returning to user-space
2468 * 2464 *
2469 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2465 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2470 * uaddr2, which must be PI aware and distinct from uaddr. Normal wakeup will wake 2466 * uaddr2, which must be PI aware and distinct from uaddr. Normal wakeup will wake
2471 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to 2467 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2472 * userspace. This ensures the rt_mutex maintains an owner when it has waiters; 2468 * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
2473 * without one, the pi logic would not know which task to boost/deboost, if 2469 * without one, the pi logic would not know which task to boost/deboost, if
2474 * there was a need to. 2470 * there was a need to.
2475 * 2471 *
2476 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2472 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2477 * via the following-- 2473 * via the following--
2478 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2474 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2479 * 2) wakeup on uaddr2 after a requeue 2475 * 2) wakeup on uaddr2 after a requeue
2480 * 3) signal 2476 * 3) signal
2481 * 4) timeout 2477 * 4) timeout
2482 * 2478 *
2483 * If 3, cleanup and return -ERESTARTNOINTR. 2479 * If 3, cleanup and return -ERESTARTNOINTR.
2484 * 2480 *
2485 * If 2, we may then block on trying to take the rt_mutex and return via: 2481 * If 2, we may then block on trying to take the rt_mutex and return via:
2486 * 5) successful lock 2482 * 5) successful lock
2487 * 6) signal 2483 * 6) signal
2488 * 7) timeout 2484 * 7) timeout
2489 * 8) other lock acquisition failure 2485 * 8) other lock acquisition failure
2490 * 2486 *
2491 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). 2487 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2492 * 2488 *
2493 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2489 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2494 * 2490 *
2495 * Return: 2491 * Return:
2496 * 0 - On success; 2492 * 0 - On success;
2497 * <0 - On error 2493 * <0 - On error
2498 */ 2494 */
2499 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2495 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2500 u32 val, ktime_t *abs_time, u32 bitset, 2496 u32 val, ktime_t *abs_time, u32 bitset,
2501 u32 __user *uaddr2) 2497 u32 __user *uaddr2)
2502 { 2498 {
2503 struct hrtimer_sleeper timeout, *to = NULL; 2499 struct hrtimer_sleeper timeout, *to = NULL;
2504 struct rt_mutex_waiter rt_waiter; 2500 struct rt_mutex_waiter rt_waiter;
2505 struct rt_mutex *pi_mutex = NULL; 2501 struct rt_mutex *pi_mutex = NULL;
2506 struct futex_hash_bucket *hb; 2502 struct futex_hash_bucket *hb;
2507 union futex_key key2 = FUTEX_KEY_INIT; 2503 union futex_key key2 = FUTEX_KEY_INIT;
2508 struct futex_q q = futex_q_init; 2504 struct futex_q q = futex_q_init;
2509 int res, ret; 2505 int res, ret;
2510 2506
2511 if (uaddr == uaddr2) 2507 if (uaddr == uaddr2)
2512 return -EINVAL; 2508 return -EINVAL;
2513 2509
2514 if (!bitset) 2510 if (!bitset)
2515 return -EINVAL; 2511 return -EINVAL;
2516 2512
2517 if (abs_time) { 2513 if (abs_time) {
2518 to = &timeout; 2514 to = &timeout;
2519 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 2515 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2520 CLOCK_REALTIME : CLOCK_MONOTONIC, 2516 CLOCK_REALTIME : CLOCK_MONOTONIC,
2521 HRTIMER_MODE_ABS); 2517 HRTIMER_MODE_ABS);
2522 hrtimer_init_sleeper(to, current); 2518 hrtimer_init_sleeper(to, current);
2523 hrtimer_set_expires_range_ns(&to->timer, *abs_time, 2519 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2524 current->timer_slack_ns); 2520 current->timer_slack_ns);
2525 } 2521 }
2526 2522
2527 /* 2523 /*
2528 * The waiter is allocated on our stack, manipulated by the requeue 2524 * The waiter is allocated on our stack, manipulated by the requeue
2529 * code while we sleep on uaddr. 2525 * code while we sleep on uaddr.
2530 */ 2526 */
2531 debug_rt_mutex_init_waiter(&rt_waiter); 2527 debug_rt_mutex_init_waiter(&rt_waiter);
2532 RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); 2528 RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2533 RB_CLEAR_NODE(&rt_waiter.tree_entry); 2529 RB_CLEAR_NODE(&rt_waiter.tree_entry);
2534 rt_waiter.task = NULL; 2530 rt_waiter.task = NULL;
2535 2531
2536 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 2532 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2537 if (unlikely(ret != 0)) 2533 if (unlikely(ret != 0))
2538 goto out; 2534 goto out;
2539 2535
2540 q.bitset = bitset; 2536 q.bitset = bitset;
2541 q.rt_waiter = &rt_waiter; 2537 q.rt_waiter = &rt_waiter;
2542 q.requeue_pi_key = &key2; 2538 q.requeue_pi_key = &key2;
2543 2539
2544 /* 2540 /*
2545 * Prepare to wait on uaddr. On success, increments q.key (key1) ref 2541 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2546 * count. 2542 * count.
2547 */ 2543 */
2548 ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 2544 ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2549 if (ret) 2545 if (ret)
2550 goto out_key2; 2546 goto out_key2;
2551 2547
2552 /* 2548 /*
2553 * The check above which compares uaddrs is not sufficient for 2549 * The check above which compares uaddrs is not sufficient for
2554 * shared futexes. We need to compare the keys: 2550 * shared futexes. We need to compare the keys:
2555 */ 2551 */
2556 if (match_futex(&q.key, &key2)) { 2552 if (match_futex(&q.key, &key2)) {
2557 ret = -EINVAL; 2553 ret = -EINVAL;
2558 goto out_put_keys; 2554 goto out_put_keys;
2559 } 2555 }
2560 2556
2561 /* Queue the futex_q, drop the hb lock, wait for wakeup. */ 2557 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2562 futex_wait_queue_me(hb, &q, to); 2558 futex_wait_queue_me(hb, &q, to);
2563 2559
2564 spin_lock(&hb->lock); 2560 spin_lock(&hb->lock);
2565 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); 2561 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2566 spin_unlock(&hb->lock); 2562 spin_unlock(&hb->lock);
2567 if (ret) 2563 if (ret)
2568 goto out_put_keys; 2564 goto out_put_keys;
2569 2565
2570 /* 2566 /*
2571 * In order for us to be here, we know our q.key == key2, and since 2567 * In order for us to be here, we know our q.key == key2, and since
2572 * we took the hb->lock above, we also know that futex_requeue() has 2568 * we took the hb->lock above, we also know that futex_requeue() has
2573 * completed and we no longer have to concern ourselves with a wakeup 2569 * completed and we no longer have to concern ourselves with a wakeup
2574 * race with the atomic proxy lock acquisition by the requeue code. The 2570 * race with the atomic proxy lock acquisition by the requeue code. The
2575 * futex_requeue dropped our key1 reference and incremented our key2 2571 * futex_requeue dropped our key1 reference and incremented our key2
2576 * reference count. 2572 * reference count.
2577 */ 2573 */
2578 2574
2579 /* Check if the requeue code acquired the second futex for us. */ 2575 /* Check if the requeue code acquired the second futex for us. */
2580 if (!q.rt_waiter) { 2576 if (!q.rt_waiter) {
2581 /* 2577 /*
2582 * Got the lock. We might not be the anticipated owner if we 2578 * Got the lock. We might not be the anticipated owner if we
2583 * did a lock-steal - fix up the PI-state in that case. 2579 * did a lock-steal - fix up the PI-state in that case.
2584 */ 2580 */
2585 if (q.pi_state && (q.pi_state->owner != current)) { 2581 if (q.pi_state && (q.pi_state->owner != current)) {
2586 spin_lock(q.lock_ptr); 2582 spin_lock(q.lock_ptr);
2587 ret = fixup_pi_state_owner(uaddr2, &q, current); 2583 ret = fixup_pi_state_owner(uaddr2, &q, current);
2588 spin_unlock(q.lock_ptr); 2584 spin_unlock(q.lock_ptr);
2589 } 2585 }
2590 } else { 2586 } else {
2591 /* 2587 /*
2592 * We have been woken up by futex_unlock_pi(), a timeout, or a 2588 * We have been woken up by futex_unlock_pi(), a timeout, or a
2593 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 2589 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2594 * the pi_state. 2590 * the pi_state.
2595 */ 2591 */
2596 WARN_ON(!q.pi_state); 2592 WARN_ON(!q.pi_state);
2597 pi_mutex = &q.pi_state->pi_mutex; 2593 pi_mutex = &q.pi_state->pi_mutex;
2598 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); 2594 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2599 debug_rt_mutex_free_waiter(&rt_waiter); 2595 debug_rt_mutex_free_waiter(&rt_waiter);
2600 2596
2601 spin_lock(q.lock_ptr); 2597 spin_lock(q.lock_ptr);
2602 /* 2598 /*
2603 * Fixup the pi_state owner and possibly acquire the lock if we 2599 * Fixup the pi_state owner and possibly acquire the lock if we
2604 * haven't already. 2600 * haven't already.
2605 */ 2601 */
2606 res = fixup_owner(uaddr2, &q, !ret); 2602 res = fixup_owner(uaddr2, &q, !ret);
2607 /* 2603 /*
2608 * If fixup_owner() returned an error, propagate that. If it 2604 * If fixup_owner() returned an error, propagate that. If it
2609 * acquired the lock, clear -ETIMEDOUT or -EINTR. 2605 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2610 */ 2606 */
2611 if (res) 2607 if (res)
2612 ret = (res < 0) ? res : 0; 2608 ret = (res < 0) ? res : 0;
2613 2609
2614 /* Unqueue and drop the lock. */ 2610 /* Unqueue and drop the lock. */
2615 unqueue_me_pi(&q); 2611 unqueue_me_pi(&q);
2616 } 2612 }
2617 2613
2618 /* 2614 /*
2619 * If fixup_pi_state_owner() faulted and was unable to handle the 2615 * If fixup_pi_state_owner() faulted and was unable to handle the
2620 * fault, unlock the rt_mutex and return the fault to userspace. 2616 * fault, unlock the rt_mutex and return the fault to userspace.
2621 */ 2617 */
2622 if (ret == -EFAULT) { 2618 if (ret == -EFAULT) {
2623 if (pi_mutex && rt_mutex_owner(pi_mutex) == current) 2619 if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2624 rt_mutex_unlock(pi_mutex); 2620 rt_mutex_unlock(pi_mutex);
2625 } else if (ret == -EINTR) { 2621 } else if (ret == -EINTR) {
2626 /* 2622 /*
2627 * We've already been requeued, but cannot restart by calling 2623 * We've already been requeued, but cannot restart by calling
2628 * futex_lock_pi() directly. We could restart this syscall, but 2624 * futex_lock_pi() directly. We could restart this syscall, but
2629 * it would detect that the user space "val" changed and return 2625 * it would detect that the user space "val" changed and return
2630 * -EWOULDBLOCK. Save the overhead of the restart and return 2626 * -EWOULDBLOCK. Save the overhead of the restart and return
2631 * -EWOULDBLOCK directly. 2627 * -EWOULDBLOCK directly.
2632 */ 2628 */
2633 ret = -EWOULDBLOCK; 2629 ret = -EWOULDBLOCK;
2634 } 2630 }
2635 2631
2636 out_put_keys: 2632 out_put_keys:
2637 put_futex_key(&q.key); 2633 put_futex_key(&q.key);
2638 out_key2: 2634 out_key2:
2639 put_futex_key(&key2); 2635 put_futex_key(&key2);
2640 2636
2641 out: 2637 out:
2642 if (to) { 2638 if (to) {
2643 hrtimer_cancel(&to->timer); 2639 hrtimer_cancel(&to->timer);
2644 destroy_hrtimer_on_stack(&to->timer); 2640 destroy_hrtimer_on_stack(&to->timer);
2645 } 2641 }
2646 return ret; 2642 return ret;
2647 } 2643 }
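/*
 * An illustrative userspace sketch (not part of this file) of the
 * condvar-style protocol whose wait side futex_wait_requeue_pi() above
 * implements: waiters block on a non-PI futex (cond) and are moved by
 * the signaller onto a PI futex (mutex) with FUTEX_CMP_REQUEUE_PI.
 * Names and the raw syscall usage are assumptions made for the example;
 * glibc uses this operation pair for PI-aware condition variables.
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

static uint32_t cond;	/* plain futex word, bumped on every signal    */
static uint32_t mutex;	/* PI futex word: 0 or owner TID (+ flag bits) */

static void cond_wait(uint32_t seen)
{
	/* Block on &cond; the kernel requeues us onto &mutex and acquires
	 * it on our behalf before this returns (cases 1/2/5 above). */
	syscall(SYS_futex, &cond, FUTEX_WAIT_REQUEUE_PI, seen,
		NULL /* no timeout */, &mutex, 0);
}

static void cond_signal(uint32_t seen)
{
	/* Wake/transfer at most one waiter; the 4th argument slot carries
	 * the additional requeue count (0 here) for this operation. */
	syscall(SYS_futex, &cond, FUTEX_CMP_REQUEUE_PI, 1,
		NULL /* nr_requeue == 0 */, &mutex, seen);
}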
2648 2644
2649 /* 2645 /*
2650 * Support for robust futexes: the kernel cleans up held futexes at 2646 * Support for robust futexes: the kernel cleans up held futexes at
2651 * thread exit time. 2647 * thread exit time.
2652 * 2648 *
2653 * Implementation: user-space maintains a per-thread list of locks it 2649 * Implementation: user-space maintains a per-thread list of locks it
2654 * is holding. Upon do_exit(), the kernel carefully walks this list, 2650 * is holding. Upon do_exit(), the kernel carefully walks this list,
2655 * and marks all locks that are owned by this thread with the 2651 * and marks all locks that are owned by this thread with the
2656 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is 2652 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2657 * always manipulated with the lock held, so the list is private and 2653 * always manipulated with the lock held, so the list is private and
2658 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 2654 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2659 * field, to allow the kernel to clean up if the thread dies after 2655 * field, to allow the kernel to clean up if the thread dies after
2660 * acquiring the lock, but just before it could have added itself to 2656 * acquiring the lock, but just before it could have added itself to
2661 * the list. There can only be one such pending lock. 2657 * the list. There can only be one such pending lock.
2662 */ 2658 */
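/*
 * An illustrative userspace sketch (not part of this file) of the
 * per-thread registration described above.  glibc performs the
 * equivalent automatically for PTHREAD_MUTEX_ROBUST mutexes; the
 * structure layout comes from <linux/futex.h>, while the helper and
 * variable names are assumptions made for the example.
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stddef.h>
#include <stdint.h>

struct robust_lock {
	struct robust_list list;	/* linkage walked by exit_robust_list() */
	uint32_t futex;			/* owner TID word, at futex_offset below */
};

static struct robust_list_head robust_head = {
	.list		 = { &robust_head.list },	/* empty circular list */
	.futex_offset	 = offsetof(struct robust_lock, futex),
	.list_op_pending = NULL,
};

static void robust_init(void)
{
	/* Each thread tells the kernel where its private list lives. */
	syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
}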
2663 2659
2664 /** 2660 /**
2665 * sys_set_robust_list() - Set the robust-futex list head of a task 2661 * sys_set_robust_list() - Set the robust-futex list head of a task
2666 * @head: pointer to the list-head 2662 * @head: pointer to the list-head
2667 * @len: length of the list-head, as userspace expects 2663 * @len: length of the list-head, as userspace expects
2668 */ 2664 */
2669 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 2665 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2670 size_t, len) 2666 size_t, len)
2671 { 2667 {
2672 if (!futex_cmpxchg_enabled) 2668 if (!futex_cmpxchg_enabled)
2673 return -ENOSYS; 2669 return -ENOSYS;
2674 /* 2670 /*
2675 * The kernel knows only one size for now: 2671 * The kernel knows only one size for now:
2676 */ 2672 */
2677 if (unlikely(len != sizeof(*head))) 2673 if (unlikely(len != sizeof(*head)))
2678 return -EINVAL; 2674 return -EINVAL;
2679 2675
2680 current->robust_list = head; 2676 current->robust_list = head;
2681 2677
2682 return 0; 2678 return 0;
2683 } 2679 }
2684 2680
2685 /** 2681 /**
2686 * sys_get_robust_list() - Get the robust-futex list head of a task 2682 * sys_get_robust_list() - Get the robust-futex list head of a task
2687 * @pid: pid of the process [zero for current task] 2683 * @pid: pid of the process [zero for current task]
2688 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 2684 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
2689 * @len_ptr: pointer to a length field, the kernel fills in the header size 2685 * @len_ptr: pointer to a length field, the kernel fills in the header size
2690 */ 2686 */
2691 SYSCALL_DEFINE3(get_robust_list, int, pid, 2687 SYSCALL_DEFINE3(get_robust_list, int, pid,
2692 struct robust_list_head __user * __user *, head_ptr, 2688 struct robust_list_head __user * __user *, head_ptr,
2693 size_t __user *, len_ptr) 2689 size_t __user *, len_ptr)
2694 { 2690 {
2695 struct robust_list_head __user *head; 2691 struct robust_list_head __user *head;
2696 unsigned long ret; 2692 unsigned long ret;
2697 struct task_struct *p; 2693 struct task_struct *p;
2698 2694
2699 if (!futex_cmpxchg_enabled) 2695 if (!futex_cmpxchg_enabled)
2700 return -ENOSYS; 2696 return -ENOSYS;
2701 2697
2702 rcu_read_lock(); 2698 rcu_read_lock();
2703 2699
2704 ret = -ESRCH; 2700 ret = -ESRCH;
2705 if (!pid) 2701 if (!pid)
2706 p = current; 2702 p = current;
2707 else { 2703 else {
2708 p = find_task_by_vpid(pid); 2704 p = find_task_by_vpid(pid);
2709 if (!p) 2705 if (!p)
2710 goto err_unlock; 2706 goto err_unlock;
2711 } 2707 }
2712 2708
2713 ret = -EPERM; 2709 ret = -EPERM;
2714 if (!ptrace_may_access(p, PTRACE_MODE_READ)) 2710 if (!ptrace_may_access(p, PTRACE_MODE_READ))
2715 goto err_unlock; 2711 goto err_unlock;
2716 2712
2717 head = p->robust_list; 2713 head = p->robust_list;
2718 rcu_read_unlock(); 2714 rcu_read_unlock();
2719 2715
2720 if (put_user(sizeof(*head), len_ptr)) 2716 if (put_user(sizeof(*head), len_ptr))
2721 return -EFAULT; 2717 return -EFAULT;
2722 return put_user(head, head_ptr); 2718 return put_user(head, head_ptr);
2723 2719
2724 err_unlock: 2720 err_unlock:
2725 rcu_read_unlock(); 2721 rcu_read_unlock();
2726 2722
2727 return ret; 2723 return ret;
2728 } 2724 }
2729 2725
2730 /* 2726 /*
2731 * Process a futex-list entry, check whether it's owned by the 2727 * Process a futex-list entry, check whether it's owned by the
2732 * dying task, and do notification if so: 2728 * dying task, and do notification if so:
2733 */ 2729 */
2734 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 2730 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2735 { 2731 {
2736 u32 uval, uninitialized_var(nval), mval; 2732 u32 uval, uninitialized_var(nval), mval;
2737 2733
2738 retry: 2734 retry:
2739 if (get_user(uval, uaddr)) 2735 if (get_user(uval, uaddr))
2740 return -1; 2736 return -1;
2741 2737
2742 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { 2738 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
2743 /* 2739 /*
2744 * Ok, this dying thread is truly holding a futex 2740 * Ok, this dying thread is truly holding a futex
2745 * of interest. Set the OWNER_DIED bit atomically 2741 * of interest. Set the OWNER_DIED bit atomically
2746 * via cmpxchg, and if the value had FUTEX_WAITERS 2742 * via cmpxchg, and if the value had FUTEX_WAITERS
2747 * set, wake up a waiter (if any). (We have to do a 2743 * set, wake up a waiter (if any). (We have to do a
2748 * futex_wake() even if OWNER_DIED is already set - 2744 * futex_wake() even if OWNER_DIED is already set -
2749 * to handle the rare but possible case of recursive 2745 * to handle the rare but possible case of recursive
2750 * thread-death.) The rest of the cleanup is done in 2746 * thread-death.) The rest of the cleanup is done in
2751 * userspace. 2747 * userspace.
2752 */ 2748 */
2753 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2749 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2754 /* 2750 /*
2755 * We are not holding a lock here, but we want to have 2751 * We are not holding a lock here, but we want to have
2756 * the pagefault_disable/enable() protection because 2752 * the pagefault_disable/enable() protection because
2757 * we want to handle the fault gracefully. If the 2753 * we want to handle the fault gracefully. If the
2758 * access fails we try to fault in the futex with R/W 2754 * access fails we try to fault in the futex with R/W
2759 * verification via get_user_pages. get_user() above 2755 * verification via get_user_pages. get_user() above
2760 * does not guarantee R/W access. If that fails we 2756 * does not guarantee R/W access. If that fails we
2761 * give up and leave the futex locked. 2757 * give up and leave the futex locked.
2762 */ 2758 */
2763 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { 2759 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2764 if (fault_in_user_writeable(uaddr)) 2760 if (fault_in_user_writeable(uaddr))
2765 return -1; 2761 return -1;
2766 goto retry; 2762 goto retry;
2767 } 2763 }
2768 if (nval != uval) 2764 if (nval != uval)
2769 goto retry; 2765 goto retry;
2770 2766
2771 /* 2767 /*
2772 * Wake robust non-PI futexes here. The wakeup of 2768 * Wake robust non-PI futexes here. The wakeup of
2773 * PI futexes happens in exit_pi_state_list(): 2769 * PI futexes happens in exit_pi_state_list():
2774 */ 2770 */
2775 if (!pi && (uval & FUTEX_WAITERS)) 2771 if (!pi && (uval & FUTEX_WAITERS))
2776 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); 2772 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
2777 } 2773 }
2778 return 0; 2774 return 0;
2779 } 2775 }
2780 2776
2781 /* 2777 /*
2782 * Fetch a robust-list pointer. Bit 0 signals PI futexes: 2778 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
2783 */ 2779 */
2784 static inline int fetch_robust_entry(struct robust_list __user **entry, 2780 static inline int fetch_robust_entry(struct robust_list __user **entry,
2785 struct robust_list __user * __user *head, 2781 struct robust_list __user * __user *head,
2786 unsigned int *pi) 2782 unsigned int *pi)
2787 { 2783 {
2788 unsigned long uentry; 2784 unsigned long uentry;
2789 2785
2790 if (get_user(uentry, (unsigned long __user *)head)) 2786 if (get_user(uentry, (unsigned long __user *)head))
2791 return -EFAULT; 2787 return -EFAULT;
2792 2788
2793 *entry = (void __user *)(uentry & ~1UL); 2789 *entry = (void __user *)(uentry & ~1UL);
2794 *pi = uentry & 1; 2790 *pi = uentry & 1;
2795 2791
2796 return 0; 2792 return 0;
2797 } 2793 }
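/*
 * An illustrative userspace counterpart (not part of this file) to the
 * bit-0 encoding that fetch_robust_entry() above decodes: when linking
 * a PI lock, userspace tags the entry pointer it publishes.  The helper
 * name is an assumption made for the example.
 */
#include <linux/futex.h>
#include <stdint.h>

static inline struct robust_list *tag_pi_entry(struct robust_list *entry)
{
	/* Bit 0 set means "this entry is a PI futex"; the kernel masks it
	 * off again when it walks the list. */
	return (struct robust_list *)((uintptr_t)entry | 1UL);
}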

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
void exit_robust_list(struct task_struct *curr)
{
        struct robust_list_head __user *head = curr->robust_list;
        struct robust_list __user *entry, *next_entry, *pending;
        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
        unsigned int uninitialized_var(next_pi);
        unsigned long futex_offset;
        int rc;

        if (!futex_cmpxchg_enabled)
                return;

        /*
         * Fetch the list head (which was registered earlier, via
         * sys_set_robust_list()):
         */
        if (fetch_robust_entry(&entry, &head->list.next, &pi))
                return;
        /*
         * Fetch the relative futex offset:
         */
        if (get_user(futex_offset, &head->futex_offset))
                return;
        /*
         * Fetch any possibly pending lock-add first, and handle it
         * if it exists:
         */
        if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
                return;

        next_entry = NULL;      /* avoid warning with gcc */
        while (entry != &head->list) {
                /*
                 * Fetch the next entry in the list before calling
                 * handle_futex_death:
                 */
                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
                /*
                 * A pending lock might already be on the list, so
                 * don't process it twice:
                 */
                if (entry != pending)
                        if (handle_futex_death((void __user *)entry + futex_offset,
                                               curr, pi))
                                return;
                if (rc)
                        return;
                entry = next_entry;
                pi = next_pi;
                /*
                 * Avoid excessively long or circular lists:
                 */
                if (!--limit)
                        break;

                cond_resched();
        }

        if (pending)
                handle_futex_death((void __user *)pending + futex_offset,
                                   curr, pip);
}
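
exit_robust_list() only has work to do if the thread registered a list head
earlier via sys_set_robust_list(). A hedged sketch of that registration from
user space, assuming a program that manages its own robust list rather than
relying on glibc's per-thread registration; my_robust_mutex and its fields are
illustrative names:

#include <linux/futex.h>
#include <stddef.h>
#include <sys/syscall.h>
#include <unistd.h>

struct my_robust_mutex {                        /* hypothetical lock layout */
        struct robust_list list;                /* links held locks per thread */
        unsigned int futex_word;                /* owner TID lives here */
};

static struct robust_list_head head = {
        .list            = { &head.list },      /* empty circular list */
        /* offset from a list entry to its futex word, as used above */
        .futex_offset    = offsetof(struct my_robust_mutex, futex_word) -
                           offsetof(struct my_robust_mutex, list),
        .list_op_pending = NULL,
};

/* One registration per thread; the kernel walks the list at thread exit. */
static int register_robust_list(void)
{
        return syscall(SYS_set_robust_list, &head, sizeof(head));
}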

long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                u32 __user *uaddr2, u32 val2, u32 val3)
{
        int cmd = op & FUTEX_CMD_MASK;
        unsigned int flags = 0;

        if (!(op & FUTEX_PRIVATE_FLAG))
                flags |= FLAGS_SHARED;

        if (op & FUTEX_CLOCK_REALTIME) {
                flags |= FLAGS_CLOCKRT;
                if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
                        return -ENOSYS;
        }

        switch (cmd) {
        case FUTEX_LOCK_PI:
        case FUTEX_UNLOCK_PI:
        case FUTEX_TRYLOCK_PI:
        case FUTEX_WAIT_REQUEUE_PI:
        case FUTEX_CMP_REQUEUE_PI:
                if (!futex_cmpxchg_enabled)
                        return -ENOSYS;
        }

        switch (cmd) {
        case FUTEX_WAIT:
                val3 = FUTEX_BITSET_MATCH_ANY;
        case FUTEX_WAIT_BITSET:
                return futex_wait(uaddr, flags, val, timeout, val3);
        case FUTEX_WAKE:
                val3 = FUTEX_BITSET_MATCH_ANY;
        case FUTEX_WAKE_BITSET:
                return futex_wake(uaddr, flags, val, val3);
        case FUTEX_REQUEUE:
                return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
        case FUTEX_CMP_REQUEUE:
                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
        case FUTEX_WAKE_OP:
                return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
        case FUTEX_LOCK_PI:
                return futex_lock_pi(uaddr, flags, val, timeout, 0);
        case FUTEX_UNLOCK_PI:
                return futex_unlock_pi(uaddr, flags);
        case FUTEX_TRYLOCK_PI:
                return futex_lock_pi(uaddr, flags, 0, timeout, 1);
        case FUTEX_WAIT_REQUEUE_PI:
                val3 = FUTEX_BITSET_MATCH_ANY;
                return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
                                             uaddr2);
        case FUTEX_CMP_REQUEUE_PI:
                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
        }
        return -ENOSYS;
}
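
do_futex() is reached from the futex syscall below; user space sees the same
command multiplexing through the raw syscall, since glibc provides no wrapper.
A small sketch of the common private wait/wake pair (user_futex_wait() and
user_futex_wake() are illustrative names):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sleep only while *uaddr still holds 'expected'; fails with EAGAIN otherwise. */
static long user_futex_wait(unsigned int *uaddr, unsigned int expected)
{
        return syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, expected,
                       NULL, NULL, 0);
}

/* Wake up to 'nr' waiters queued on uaddr; returns how many were woken. */
static long user_futex_wake(unsigned int *uaddr, int nr)
{
        return syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, nr,
                       NULL, NULL, 0);
}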


SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
                struct timespec __user *, utime, u32 __user *, uaddr2,
                u32, val3)
{
        struct timespec ts;
        ktime_t t, *tp = NULL;
        u32 val2 = 0;
        int cmd = op & FUTEX_CMD_MASK;

        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
                      cmd == FUTEX_WAIT_BITSET ||
                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
                        return -EFAULT;
                if (!timespec_valid(&ts))
                        return -EINVAL;

                t = timespec_to_ktime(ts);
                if (cmd == FUTEX_WAIT)
                        t = ktime_add_safe(ktime_get(), t);
                tp = &t;
        }
        /*
         * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
         */
        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
            cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
                val2 = (u32) (unsigned long) utime;

        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
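
Note the asymmetry in the timeout handling above: plain FUTEX_WAIT passes a
relative timeout, which is converted to an absolute expiry with
ktime_add_safe(ktime_get(), t), while FUTEX_WAIT_BITSET and
FUTEX_WAIT_REQUEUE_PI hand the timespec through as an absolute time. A hedged
sketch of the relative case (wait_with_timeout() is an illustrative helper):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Wait on *uaddr for at most 500ms; for FUTEX_WAIT the kernel treats the
 * timespec as relative and adds it to the current time. Returns -1 with
 * errno == ETIMEDOUT once the timeout expires. */
static long wait_with_timeout(unsigned int *uaddr, unsigned int expected)
{
        struct timespec rel = { .tv_sec = 0, .tv_nsec = 500 * 1000 * 1000 };

        return syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, expected,
                       &rel, NULL, 0);
}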

static void __init futex_detect_cmpxchg(void)
{
#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
        u32 curval;

        /*
         * This will fail and we want it. Some arch implementations do
         * runtime detection of the futex_atomic_cmpxchg_inatomic()
         * functionality. We want to know that before we call in any
         * of the complex code paths. Also we want to prevent
         * registration of robust lists in that case. NULL is
         * guaranteed to fault and we get -EFAULT on functional
         * implementation, the non-functional ones will return
         * -ENOSYS.
         */
        if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
                futex_cmpxchg_enabled = 1;
#endif
}
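
If the probe above comes back -ENOSYS, futex_cmpxchg_enabled stays 0 and every
PI, requeue-PI and robust-list operation is refused with -ENOSYS (see the
command switch in do_futex() and the early return in exit_robust_list()). One
way user space might detect that and fall back to plain futexes; this probe is
only a sketch, not an established idiom:

#include <errno.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Returns 0 when the kernel lacks futex cmpxchg support, 1 otherwise. */
static int pi_futexes_supported(void)
{
        unsigned int word = 0;          /* unlocked: no owner TID stored */

        /* On a supporting kernel this fails with EPERM (we do not own the
         * lock); only a kernel without cmpxchg support reports ENOSYS. */
        if (syscall(SYS_futex, &word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0) == -1 &&
            errno == ENOSYS)
                return 0;
        return 1;
}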

static int __init futex_init(void)
{
        unsigned int futex_shift;
        unsigned long i;

#if CONFIG_BASE_SMALL
        futex_hashsize = 16;
#else
        futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

        futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
                                               futex_hashsize, 0,
                                               futex_hashsize < 256 ? HASH_SMALL : 0,
                                               &futex_shift, NULL,
                                               futex_hashsize, futex_hashsize);
        futex_hashsize = 1UL << futex_shift;

        futex_detect_cmpxchg();

        for (i = 0; i < futex_hashsize; i++) {
                atomic_set(&futex_queues[i].waiters, 0);
                plist_head_init(&futex_queues[i].chain);
                spin_lock_init(&futex_queues[i].lock);
        }

        return 0;
}
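
As a worked example of the sizing above (taking the non-CONFIG_BASE_SMALL
branch): with 6 possible CPUs the initial request is 256 * 6 = 1536 buckets,
roundup_pow_of_two() raises that to 2048, alloc_large_system_hash() reports
futex_shift = 11, and the final futex_hashsize becomes 1UL << 11 = 2048
buckets, each with its own waiter counter, plist and spinlock.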