Blame view

kernel/futex.c 49.7 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
  /*
   *  Fast Userspace Mutexes (which I call "Futexes!").
   *  (C) Rusty Russell, IBM 2002
   *
   *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   *
   *  Removed page pinning, fix privately mapped COW pages and other cleanups
   *  (C) Copyright 2003, 2004 Jamie Lokier
   *
0771dfefc   Ingo Molnar   [PATCH] lightweig...
11
12
13
14
   *  Robust futex support started by Ingo Molnar
   *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
   *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
   *
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
15
16
17
18
   *  PI-futex support started by Ingo Molnar and Thomas Gleixner
   *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
   *
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
19
20
21
   *  PRIVATE futexes by Eric Dumazet
   *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
   *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
   *  enough at me, Linus for the original (flawed) idea, Matthew
   *  Kirkwood for proof-of-concept implementation.
   *
   *  "The futexes are also cursed."
   *  "But they come in a choice of three flavours!"
   *
   *  This program is free software; you can redistribute it and/or modify
   *  it under the terms of the GNU General Public License as published by
   *  the Free Software Foundation; either version 2 of the License, or
   *  (at your option) any later version.
   *
   *  This program is distributed in the hope that it will be useful,
   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   *  GNU General Public License for more details.
   *
   *  You should have received a copy of the GNU General Public License
   *  along with this program; if not, write to the Free Software
   *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   */
  #include <linux/slab.h>
  #include <linux/poll.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/jhash.h>
  #include <linux/init.h>
  #include <linux/futex.h>
  #include <linux/mount.h>
  #include <linux/pagemap.h>
  #include <linux/syscalls.h>
7ed20e1ad   Jesper Juhl   [PATCH] convert t...
53
  #include <linux/signal.h>
9adef58b1   Rusty Russell   futex: get_futex_...
54
  #include <linux/module.h>
fd5eea421   Andrey Mirkin   change inotifyfs ...
55
  #include <linux/magic.h>
b488893a3   Pavel Emelyanov   pid namespaces: c...
56
57
  #include <linux/pid.h>
  #include <linux/nsproxy.h>
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
58
  #include <asm/futex.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
59

c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
60
  #include "rtmutex_common.h"
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
61
62
63
  #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
  
  /*
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
   * Priority Inheritance state:
   */
  struct futex_pi_state {
  	/*
  	 * list of 'owned' pi_state instances - these have to be
  	 * cleaned up in do_exit() if the task exits prematurely:
  	 */
  	struct list_head list;
  
  	/*
  	 * The PI object:
  	 */
  	struct rt_mutex pi_mutex;
  
  	/* Task this state is attached to; NULL while detached or cached: */
  	struct task_struct *owner;
  	/* Reference count; free_pi_state() releases/recycles at zero: */
  	atomic_t refcount;
  
  	/* Copy of the futex key, used by exit cleanup to find the bucket: */
  	union futex_key key;
  };
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
85
86
87
88
   * We use this hashed waitqueue instead of a normal wait_queue_t, so
   * we can wake only the relevant ones (hashed queues may be shared).
   *
   * A futex_q has a woken state, just like tasks have TASK_RUNNING.
ec92d0829   Pierre Peiffer   futex priority ba...
89
   * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
90
91
92
93
   * The order of wakeup is always to make the first condition true, then
   * wake up q->waiters, then make the second condition true.
   */
  struct futex_q {
ec92d0829   Pierre Peiffer   futex priority ba...
94
  	struct plist_node list;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
95
  	wait_queue_head_t waiters;
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
96
  	/* Which hash list lock to use: */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
97
  	spinlock_t *lock_ptr;
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
98
  	/* Key which the futex is hashed on: */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
99
  	union futex_key key;
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
100
  	/* For fd, sigio sent using these: */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
101
102
  	int fd;
  	struct file *filp;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
103
104
105
106
  
  	/* Optional priority inheritance state: */
  	struct futex_pi_state *pi_state;
  	struct task_struct *task;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
107
108
109
110
111
112
  };
  
  /*
   * Split the global futex_lock into every hash list lock.
   */
  struct futex_hash_bucket {
ec92d0829   Pierre Peiffer   futex priority ba...
113
114
  	spinlock_t lock;
  	struct plist_head chain;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
115
116
117
118
119
120
121
122
  };
  
  /* Global table of hash buckets; hash_futex() picks the entry for a key: */
  static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
  
  /* Futex-fs vfsmount entry: */
  static struct vfsmount *futex_mnt;
  
  /*
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
   * Take mm->mmap_sem, when futex is shared
   */
  static inline void futex_lock_mm(struct rw_semaphore *fshared)
  {
  	/* A NULL semaphore means there is nothing to take. */
  	if (!fshared)
  		return;
  
  	down_read(fshared);
  }
  
  /*
   * Release mm->mmap_sem, when the futex is shared
   */
  static inline void futex_unlock_mm(struct rw_semaphore *fshared)
  {
  	/* A NULL semaphore means nothing was taken by futex_lock_mm(). */
  	if (!fshared)
  		return;
  
  	up_read(fshared);
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
   * We hash on the keys returned from get_futex_key (see below).
   */
  static struct futex_hash_bucket *hash_futex(union futex_key *key)
  {
  	u32 hash = jhash2((u32*)&key->both.word,
  			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
  			  key->both.offset);
  	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
  }
  
  /*
   * Return 1 if two futex_keys are equal, 0 otherwise.
   */
  static inline int match_futex(union futex_key *key1, union futex_key *key2)
  {
  	/* Any differing component means the keys are not equal. */
  	if (key1->both.word != key2->both.word)
  		return 0;
  	if (key1->both.ptr != key2->both.ptr)
  		return 0;
  
  	return key1->both.offset == key2->both.offset;
  }
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
160
161
162
163
164
165
166
167
168
  /**
   * get_futex_key - Get parameters which are the keys for a futex.
   * @uaddr: virtual address of the futex
   * @shared: NULL for a PROCESS_PRIVATE futex,
   *	&current->mm->mmap_sem for a PROCESS_SHARED futex
   * @key: address where result is stored.
   *
   * Returns a negative error code or 0
   * The key words are stored in *key on success.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
169
   *
f3a43f3f6   Josef "Jeff" Sipek   [PATCH] kernel: c...
170
   * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
171
172
173
   * offset_within_page).  For private mappings, it's (uaddr, current->mm).
   * We can usually work out the index without swapping in the page.
   *
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
174
175
176
   * fshared is NULL for PROCESS_PRIVATE futexes
   * For other futexes, it points to &current->mm->mmap_sem and
   * caller must have taken the reader lock. but NOT any spinlocks.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
177
   */
fad23fc78   Adrian Bunk   kernel/futex.c: m...
178
179
  static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
  			 union futex_key *key)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
180
  {
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
181
  	unsigned long address = (unsigned long)uaddr;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
182
183
184
185
186
187
188
189
  	struct mm_struct *mm = current->mm;
  	struct vm_area_struct *vma;
  	struct page *page;
  	int err;
  
  	/*
  	 * The futex address must be "naturally" aligned.
  	 */
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
190
  	key->both.offset = address % PAGE_SIZE;
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
191
  	if (unlikely((address % sizeof(u32)) != 0))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
192
  		return -EINVAL;
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
193
  	address -= key->both.offset;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
194
195
  
  	/*
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
196
197
198
199
200
201
202
203
204
205
206
207
208
209
  	 * PROCESS_PRIVATE futexes are fast.
  	 * As the mm cannot disappear under us and the 'key' only needs
  	 * virtual address, we dont even have to find the underlying vma.
  	 * Note : We do have to check 'uaddr' is a valid user address,
  	 *        but access_ok() should be faster than find_vma()
  	 */
  	if (!fshared) {
  		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
  			return -EFAULT;
  		key->private.mm = mm;
  		key->private.address = address;
  		return 0;
  	}
  	/*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
210
211
212
  	 * The futex is hashed differently depending on whether
  	 * it's in a shared or private mapping.  So check vma first.
  	 */
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
213
  	vma = find_extend_vma(mm, address);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
  	if (unlikely(!vma))
  		return -EFAULT;
  
  	/*
  	 * Permissions.
  	 */
  	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
  		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
  
  	/*
  	 * Private mappings are handled in a simple way.
  	 *
  	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
  	 * it's a read-only handle, it's expected that futexes attach to
  	 * the object not the particular process.  Therefore we use
  	 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
  	 * mappings of _writable_ handles.
  	 */
  	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
233
  		key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
234
  		key->private.mm = mm;
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
235
  		key->private.address = address;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
236
237
238
239
240
241
  		return 0;
  	}
  
  	/*
  	 * Linear file mappings are also simple.
  	 */
f3a43f3f6   Josef "Jeff" Sipek   [PATCH] kernel: c...
242
  	key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
243
  	key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
244
  	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
245
  		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
246
247
248
249
250
251
252
253
254
255
  				     + vma->vm_pgoff);
  		return 0;
  	}
  
  	/*
  	 * We could walk the page table to read the non-linear
  	 * pte, and get the page index without fetching the page
  	 * from swap.  But that's a lot of code to duplicate here
  	 * for a rare case, so we simply fetch the page.
  	 */
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
256
  	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
257
258
259
260
261
262
263
264
265
266
267
268
269
  	if (err >= 0) {
  		key->shared.pgoff =
  			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  		put_page(page);
  		return 0;
  	}
  	return err;
  }
  
  /*
   * Take a reference to the resource addressed by a key.
   * Can be called while holding spinlocks.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
270
   */
fad23fc78   Adrian Bunk   kernel/futex.c: m...
271
  static void get_futex_key_refs(union futex_key *key)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
272
  {
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
273
274
275
276
  	if (key->both.ptr == 0)
  		return;
  	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
  		case FUT_OFF_INODE:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
277
  			atomic_inc(&key->shared.inode->i_count);
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
278
279
  			break;
  		case FUT_OFF_MMSHARED:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
280
  			atomic_inc(&key->private.mm->mm_count);
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
281
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
282
283
284
285
286
287
288
  	}
  }
  
  /*
   * Drop a reference to the resource addressed by a key.
   * The hash bucket spinlock must not be held.
   */
fad23fc78   Adrian Bunk   kernel/futex.c: m...
289
  static void drop_futex_key_refs(union futex_key *key)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
290
  {
c80544dc0   Stephen Hemminger   sparse pointer us...
291
  	if (!key->both.ptr)
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
292
293
294
  		return;
  	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
  		case FUT_OFF_INODE:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
295
  			iput(key->shared.inode);
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
296
297
  			break;
  		case FUT_OFF_MMSHARED:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
298
  			mmdrop(key->private.mm);
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
299
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
300
301
  	}
  }
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
302
303
304
305
306
307
308
309
310
311
312
313
  /*
   * Compare-and-exchange on the user space futex word: the in-atomic
   * cmpxchg helper is bracketed by pagefault_disable()/pagefault_enable()
   * and its result is handed back unchanged.
   */
  static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
  {
  	u32 observed;
  
  	pagefault_disable();
  	observed = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
  	pagefault_enable();
  
  	return observed;
  }
  
  /*
   * Read the u32 at user address @from into *@dest, with the in-atomic
   * copy bracketed by pagefault_disable()/pagefault_enable().
   *
   * Returns 0 on success, -EFAULT when the copy did not complete.
   */
  static int get_futex_value_locked(u32 *dest, u32 __user *from)
  {
  	int ret;
  
  	pagefault_disable();
  	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
  	pagefault_enable();
  
  	return ret ? -EFAULT : 0;
  }
  
  /*
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
324
325
   * Fault handling.
   * if fshared is non NULL, current->mm->mmap_sem is already held
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
326
   */
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
327
328
  static int futex_handle_fault(unsigned long address,
  			      struct rw_semaphore *fshared, int attempt)
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
329
330
331
  {
  	struct vm_area_struct * vma;
  	struct mm_struct *mm = current->mm;
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
332
  	int ret = -EFAULT;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
333

34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
334
335
  	if (attempt > 2)
  		return ret;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
336

34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
337
338
339
340
341
  	if (!fshared)
  		down_read(&mm->mmap_sem);
  	vma = find_vma(mm, address);
  	if (vma && address >= vma->vm_start &&
  	    (vma->vm_flags & VM_WRITE)) {
83c54070e   Nick Piggin   mm: fault feedbac...
342
343
344
345
346
347
348
349
350
  		int fault;
  		fault = handle_mm_fault(mm, vma, address, 1);
  		if (unlikely((fault & VM_FAULT_ERROR))) {
  #if 0
  			/* XXX: let's do this when we verify it is OK */
  			if (ret & VM_FAULT_OOM)
  				ret = -ENOMEM;
  #endif
  		} else {
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
351
  			ret = 0;
83c54070e   Nick Piggin   mm: fault feedbac...
352
353
354
355
  			if (fault & VM_FAULT_MAJOR)
  				current->maj_flt++;
  			else
  				current->min_flt++;
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
356
  		}
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
357
  	}
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
358
359
360
  	if (!fshared)
  		up_read(&mm->mmap_sem);
  	return ret;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
361
362
363
364
365
366
367
368
369
370
371
  }
  
  /*
   * PI code:
   */
  static int refill_pi_state_cache(void)
  {
  	struct futex_pi_state *pi_state;
  
  	if (likely(current->pi_state_cache))
  		return 0;
4668edc33   Burman Yan   [PATCH] kernel co...
372
  	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
373
374
375
  
  	if (!pi_state)
  		return -ENOMEM;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
  	INIT_LIST_HEAD(&pi_state->list);
  	/* pi_mutex gets initialized later */
  	pi_state->owner = NULL;
  	atomic_set(&pi_state->refcount, 1);
  
  	current->pi_state_cache = pi_state;
  
  	return 0;
  }
  
  /*
   * Hand out the pi_state cached on the current task and empty the cache.
   * Warns when the cache was not populated; the (NULL) value is returned
   * regardless.
   */
  static struct futex_pi_state * alloc_pi_state(void)
  {
  	struct futex_pi_state *cached;
  
  	cached = current->pi_state_cache;
  	WARN_ON(!cached);
  
  	current->pi_state_cache = NULL;
  	return cached;
  }
  
  /*
   * Drop one reference on @pi_state.  When the last reference goes away
   * the state is detached from its owner (if any) and then either freed
   * or parked in current->pi_state_cache for reuse.
   */
  static void free_pi_state(struct futex_pi_state *pi_state)
  {
  	/* Somebody else still references it. */
  	if (!atomic_dec_and_test(&pi_state->refcount))
  		return;
  
  	/*
  	 * A NULL owner most probably means the owner is dying and has
  	 * already cleaned up the pi_state itself.
  	 */
  	if (pi_state->owner) {
  		spin_lock_irq(&pi_state->owner->pi_lock);
  		list_del_init(&pi_state->list);
  		spin_unlock_irq(&pi_state->owner->pi_lock);
  
  		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
  	}
  
  	/* The per-task cache is already occupied: really free this one. */
  	if (current->pi_state_cache) {
  		kfree(pi_state);
  		return;
  	}
  
  	/*
  	 * Recycle it: the list is already empty, so only the owner has to
  	 * be cleared and the refcount put back from 0 to 1.
  	 */
  	pi_state->owner = NULL;
  	atomic_set(&pi_state->refcount, 1);
  	current->pi_state_cache = pi_state;
  }
  
  /*
   * Look up the task based on what TID userspace gave us.
   * We dont trust it.
   */
  /*
   * Returns the task for @pid with a reference taken (the caller must
   * put_task_struct() it), or ERR_PTR(-ESRCH) when no such task exists
   * or current's euid matches neither the task's euid nor its uid.
   */
  static struct task_struct * futex_find_get_task(pid_t pid)
  {
  	struct task_struct *p;
  
  	rcu_read_lock();
  	p = find_task_by_vpid(pid);
  	if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
  		p = ERR_PTR(-ESRCH);
  	else
  		get_task_struct(p);
  
  	rcu_read_unlock();
  
  	return p;
  }
  
  /*
   * This task is holding PI mutexes at exit time => bad.
   * Kernel cleans up PI-state, but userspace is likely hosed.
   * (Robust-futex cleanup is separate and might save the day for userspace.)
   */
  void exit_pi_state_list(struct task_struct *curr)
  {
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
452
453
  	struct list_head *next, *head = &curr->pi_state_list;
  	struct futex_pi_state *pi_state;
627371d73   Ingo Molnar   [PATCH] pi-futex:...
454
  	struct futex_hash_bucket *hb;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
455
456
457
458
459
  	union futex_key key;
  
  	/*
  	 * We are a ZOMBIE and nobody can enqueue itself on
  	 * pi_state_list anymore, but we have to be careful
627371d73   Ingo Molnar   [PATCH] pi-futex:...
460
  	 * versus waiters unqueueing themselves:
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
461
462
463
464
465
466
467
  	 */
  	spin_lock_irq(&curr->pi_lock);
  	while (!list_empty(head)) {
  
  		next = head->next;
  		pi_state = list_entry(next, struct futex_pi_state, list);
  		key = pi_state->key;
627371d73   Ingo Molnar   [PATCH] pi-futex:...
468
  		hb = hash_futex(&key);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
469
  		spin_unlock_irq(&curr->pi_lock);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
470
471
472
  		spin_lock(&hb->lock);
  
  		spin_lock_irq(&curr->pi_lock);
627371d73   Ingo Molnar   [PATCH] pi-futex:...
473
474
475
476
  		/*
  		 * We dropped the pi-lock, so re-check whether this
  		 * task still owns the PI-state:
  		 */
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
477
478
479
480
  		if (head->next != next) {
  			spin_unlock(&hb->lock);
  			continue;
  		}
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
481
  		WARN_ON(pi_state->owner != curr);
627371d73   Ingo Molnar   [PATCH] pi-futex:...
482
483
  		WARN_ON(list_empty(&pi_state->list));
  		list_del_init(&pi_state->list);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
484
485
486
487
488
489
490
491
492
493
494
495
496
  		pi_state->owner = NULL;
  		spin_unlock_irq(&curr->pi_lock);
  
  		rt_mutex_unlock(&pi_state->pi_mutex);
  
  		spin_unlock(&hb->lock);
  
  		spin_lock_irq(&curr->pi_lock);
  	}
  	spin_unlock_irq(&curr->pi_lock);
  }
  
  static int
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
497
498
  lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
  		union futex_key *key, struct futex_pi_state **ps)
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
499
500
501
  {
  	struct futex_pi_state *pi_state = NULL;
  	struct futex_q *this, *next;
ec92d0829   Pierre Peiffer   futex priority ba...
502
  	struct plist_head *head;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
503
  	struct task_struct *p;
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
504
  	pid_t pid = uval & FUTEX_TID_MASK;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
505
506
  
  	head = &hb->chain;
ec92d0829   Pierre Peiffer   futex priority ba...
507
  	plist_for_each_entry_safe(this, next, head, list) {
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
508
  		if (match_futex(&this->key, key)) {
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
509
510
511
512
513
  			/*
  			 * Another waiter already exists - bump up
  			 * the refcount and return its pi_state:
  			 */
  			pi_state = this->pi_state;
06a9ec291   Thomas Gleixner   [PATCH] pi-futex:...
514
515
516
517
518
  			/*
  			 * Userspace might have messed up non PI and PI futexes
  			 */
  			if (unlikely(!pi_state))
  				return -EINVAL;
627371d73   Ingo Molnar   [PATCH] pi-futex:...
519
  			WARN_ON(!atomic_read(&pi_state->refcount));
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
520
521
  			WARN_ON(pid && pi_state->owner &&
  				pi_state->owner->pid != pid);
627371d73   Ingo Molnar   [PATCH] pi-futex:...
522

c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
523
  			atomic_inc(&pi_state->refcount);
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
524
  			*ps = pi_state;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
525
526
527
528
529
530
  
  			return 0;
  		}
  	}
  
  	/*
e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
531
  	 * We are the first waiter - try to look up the real owner and attach
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
532
  	 * the new pi_state to it, but bail out when TID = 0
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
533
  	 */
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
534
  	if (!pid)
e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
535
  		return -ESRCH;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
536
  	p = futex_find_get_task(pid);
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
  	if (IS_ERR(p))
  		return PTR_ERR(p);
  
  	/*
  	 * We need to look at the task state flags to figure out,
  	 * whether the task is exiting. To protect against the do_exit
  	 * change of the task flags, we do this protected by
  	 * p->pi_lock:
  	 */
  	spin_lock_irq(&p->pi_lock);
  	if (unlikely(p->flags & PF_EXITING)) {
  		/*
  		 * The task is on the way out. When PF_EXITPIDONE is
  		 * set, we know that the task has finished the
  		 * cleanup:
  		 */
  		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
  
  		spin_unlock_irq(&p->pi_lock);
  		put_task_struct(p);
  		return ret;
  	}
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
559
560
561
562
563
564
565
566
567
568
  
  	pi_state = alloc_pi_state();
  
  	/*
  	 * Initialize the pi_mutex in locked state and make 'p'
  	 * the owner of it:
  	 */
  	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
  
  	/* Store the key for possible exit cleanups: */
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
569
  	pi_state->key = *key;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
570

627371d73   Ingo Molnar   [PATCH] pi-futex:...
571
  	WARN_ON(!list_empty(&pi_state->list));
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
572
573
574
575
576
  	list_add(&pi_state->list, &p->pi_state_list);
  	pi_state->owner = p;
  	spin_unlock_irq(&p->pi_lock);
  
  	put_task_struct(p);
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
577
  	*ps = pi_state;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
578
579
580
581
582
  
  	return 0;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
583
584
585
586
587
   * The hash bucket lock must be held when this is called.
   * Afterwards, the futex_q must not be accessed.
   */
  static void wake_futex(struct futex_q *q)
  {
ec92d0829   Pierre Peiffer   futex priority ba...
588
  	plist_del(&q->list, &q->list.plist);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
589
590
591
592
  	if (q->filp)
  		send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
  	/*
  	 * The lock in wake_up_all() is a crucial memory barrier after the
ec92d0829   Pierre Peiffer   futex priority ba...
593
  	 * plist_del() and also before assigning to q->lock_ptr.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
594
595
596
597
598
  	 */
  	wake_up_all(&q->waiters);
  	/*
  	 * The waiting task can free the futex_q as soon as this is written,
  	 * without taking any locks.  This must come last.
8e31108b9   Andrew Morton   [PATCH] Fix memor...
599
600
601
602
603
  	 *
  	 * A memory barrier is required here to prevent the following store
  	 * to lock_ptr from getting ahead of the wakeup. Clearing the lock
  	 * at the end of wake_up_all() does not prevent this store from
  	 * moving.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
604
  	 */
ccdea2f88   Ralf Baechle   [PATCH] futex: re...
605
  	smp_wmb();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
606
607
  	q->lock_ptr = NULL;
  }
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
608
609
610
611
612
613
614
615
  /*
   * Hand a PI futex over to the next owner.
   *
   * @uaddr: user address of the futex word
   * @uval:  value the caller read from @uaddr
   * @this:  the queued waiter whose pi_state is passed on
   *
   * Returns 0 on success, -EINVAL when @this carries no pi_state or the
   * futex word no longer holds @uval, and -EFAULT when updating the futex
   * word faulted.
   */
  static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
  {
  	struct task_struct *new_owner;
  	struct futex_pi_state *pi_state = this->pi_state;
  	u32 curval, newval;
  
  	if (!pi_state)
  		return -EINVAL;
  
  	spin_lock(&pi_state->pi_mutex.wait_lock);
  	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
  
  	/*
  	 * This happens when we have stolen the lock and the original
  	 * pending owner did not enqueue itself back on the rt_mutex.
  	 * That's not a tragedy. We know that way, that a lock waiter
  	 * is on the fly. We make the futex_q waiter the pending owner.
  	 */
  	if (!new_owner)
  		new_owner = this->task;
  
  	/*
  	 * We pass it to the next owner. (The WAITERS bit is always
  	 * kept enabled while there is PI state around. We must also
  	 * preserve the owner died bit.)
  	 */
  	if (!(uval & FUTEX_OWNER_DIED)) {
  		int ret = 0;
  
  		newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
  
  		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
  
  		/*
  		 * A fault must be reported as -EFAULT.  The value check
  		 * has to be an 'else' branch: the fault marker differs
  		 * from uval as well and would otherwise be turned into
  		 * -EINVAL, hiding the fault from the caller.
  		 */
  		if (curval == -EFAULT)
  			ret = -EFAULT;
  		else if (curval != uval)
  			ret = -EINVAL;
  		if (ret) {
  			spin_unlock(&pi_state->pi_mutex.wait_lock);
  			return ret;
  		}
  	}
  
  	/* Move the pi_state from the old owner's list to the new owner's. */
  	spin_lock_irq(&pi_state->owner->pi_lock);
  	WARN_ON(list_empty(&pi_state->list));
  	list_del_init(&pi_state->list);
  	spin_unlock_irq(&pi_state->owner->pi_lock);
  
  	spin_lock_irq(&new_owner->pi_lock);
  	WARN_ON(!list_empty(&pi_state->list));
  	list_add(&pi_state->list, &new_owner->pi_state_list);
  	pi_state->owner = new_owner;
  	spin_unlock_irq(&new_owner->pi_lock);
  
  	spin_unlock(&pi_state->pi_mutex.wait_lock);
  	rt_mutex_unlock(&pi_state->pi_mutex);
  
  	return 0;
  }
  
  /*
   * Release a PI futex by swapping @uval for 0 in the futex word.
   *
   * Returns 0 on success, -EFAULT when the user access faulted, and
   * -EAGAIN when the futex word no longer held @uval.
   */
  static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
  {
  	u32 oldval;
  
  	/*
  	 * There is no waiter, so we unlock the futex. The owner died
  	 * bit does not have to be preserved here. We are the owner:
  	 */
  	oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
  
  	if (oldval == -EFAULT)
  		return oldval;
  	if (oldval != uval)
  		return -EAGAIN;
  
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
682
  /*
8b8f319fc   Ingo Molnar   [PATCH] lockdep: ...
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
   * Express the locking dependencies for lockdep:
   */
  static inline void
  double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
  {
  	struct futex_hash_bucket *first, *second;
  
  	/* Both keys hash to the same bucket: a single lock is enough. */
  	if (hb1 == hb2) {
  		spin_lock(&hb1->lock);
  		return;
  	}
  
  	/* Always lock the lower-addressed bucket first. */
  	if (hb1 < hb2) {
  		first = hb1;
  		second = hb2;
  	} else {
  		first = hb2;
  		second = hb1;
  	}
  
  	spin_lock(&first->lock);
  	spin_lock_nested(&second->lock, SINGLE_DEPTH_NESTING);
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
699
700
701
   * Wake up all waiters hashed on the physical page that is mapped
   * to this virtual address:
   */
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
702
703
  static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
  		      int nr_wake)
  {
  	struct futex_hash_bucket *hb;
  	struct futex_q *waiter, *tmp;
  	union futex_key key;
  	int ret;

  	futex_lock_mm(fshared);

  	/* Translate the user address into a key; errors are returned as-is. */
  	ret = get_futex_key(uaddr, fshared, &key);
  	if (unlikely(ret != 0))
  		goto out;

  	hb = hash_futex(&key);
  	spin_lock(&hb->lock);

  	/*
  	 * ret is 0 here and doubles as the count of woken waiters, which
  	 * is what gets returned unless a PI waiter is encountered.
  	 */
  	plist_for_each_entry_safe(waiter, tmp, &hb->chain, list) {
  		if (!match_futex(&waiter->key, &key))
  			continue;

  		/* Waiters carrying PI state are rejected with -EINVAL. */
  		if (waiter->pi_state) {
  			ret = -EINVAL;
  			break;
  		}

  		wake_futex(waiter);
  		if (++ret >= nr_wake)
  			break;
  	}

  	spin_unlock(&hb->lock);
  out:
  	futex_unlock_mm(fshared);
  	return ret;
  }
  
  /*
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
737
738
739
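   * Perform the encoded atomic operation on the futex word at uaddr2, then
   * wake up to nr_wake waiters queued on uaddr1 and, when the operation
   * returned a positive result, up to nr_wake2 waiters queued on uaddr2: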
   */
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
740
  static int
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
741
742
  futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
  	      u32 __user *uaddr2,
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
743
  	      int nr_wake, int nr_wake2, int op)
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
744
745
  {
  	union futex_key key1, key2;
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
746
  	struct futex_hash_bucket *hb1, *hb2;
ec92d0829   Pierre Peiffer   futex priority ba...
747
  	struct plist_head *head;
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
748
749
750
751
  	struct futex_q *this, *next;
  	int ret, op_ret, attempt = 0;
  
  retryfull:
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
752
  	futex_lock_mm(fshared);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
753

34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
754
  	ret = get_futex_key(uaddr1, fshared, &key1);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
755
756
  	if (unlikely(ret != 0))
  		goto out;
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
757
  	ret = get_futex_key(uaddr2, fshared, &key2);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
758
759
  	if (unlikely(ret != 0))
  		goto out;
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
760
761
  	hb1 = hash_futex(&key1);
  	hb2 = hash_futex(&key2);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
762
763
  
  retry:
8b8f319fc   Ingo Molnar   [PATCH] lockdep: ...
764
  	double_lock_hb(hb1, hb2);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
765

e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
766
  	op_ret = futex_atomic_op_inuser(op, uaddr2);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
767
  	if (unlikely(op_ret < 0)) {
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
768
  		u32 dummy;
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
769

e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
770
771
772
  		spin_unlock(&hb1->lock);
  		if (hb1 != hb2)
  			spin_unlock(&hb2->lock);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
773

7ee1dd3fe   David Howells   [PATCH] FRV: Make...
774
  #ifndef CONFIG_MMU
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
775
776
777
778
  		/*
  		 * we don't get EFAULT from MMU faults if we don't have an MMU,
  		 * but we might get them from range checking
  		 */
7ee1dd3fe   David Howells   [PATCH] FRV: Make...
779
780
781
  		ret = op_ret;
  		goto out;
  #endif
796f8d9b9   David Gibson   [PATCH] FUTEX_WAK...
782
783
784
785
  		if (unlikely(op_ret != -EFAULT)) {
  			ret = op_ret;
  			goto out;
  		}
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
786
787
  		/*
  		 * futex_atomic_op_inuser needs to both read and write
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
788
789
790
  		 * *(int __user *)uaddr2, but we can't modify it
  		 * non-atomically.  Therefore, if get_user below is not
  		 * enough, we need to handle the fault ourselves, while
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
791
792
  		 * still holding the mmap_sem.
  		 */
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
793
  		if (attempt++) {
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
794
  			ret = futex_handle_fault((unsigned long)uaddr2,
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
795
  						 fshared, attempt);
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
796
  			if (ret)
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
797
  				goto out;
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
798
799
  			goto retry;
  		}
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
800
801
802
803
  		/*
  		 * If we would have faulted, release mmap_sem,
  		 * fault it in and start all over again.
  		 */
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
804
  		futex_unlock_mm(fshared);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
805

e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
806
  		ret = get_user(dummy, uaddr2);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
807
808
809
810
811
  		if (ret)
  			return ret;
  
  		goto retryfull;
  	}
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
812
  	head = &hb1->chain;
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
813

ec92d0829   Pierre Peiffer   futex priority ba...
814
  	plist_for_each_entry_safe(this, next, head, list) {
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
815
816
817
818
819
820
821
822
  		if (match_futex (&this->key, &key1)) {
  			wake_futex(this);
  			if (++ret >= nr_wake)
  				break;
  		}
  	}
  
  	if (op_ret > 0) {
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
823
  		head = &hb2->chain;
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
824
825
  
  		op_ret = 0;
ec92d0829   Pierre Peiffer   futex priority ba...
826
  		plist_for_each_entry_safe(this, next, head, list) {
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
827
828
829
830
831
832
833
834
  			if (match_futex (&this->key, &key2)) {
  				wake_futex(this);
  				if (++op_ret >= nr_wake2)
  					break;
  			}
  		}
  		ret += op_ret;
  	}
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
835
836
837
  	spin_unlock(&hb1->lock);
  	if (hb1 != hb2)
  		spin_unlock(&hb2->lock);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
838
  out:
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
839
  	futex_unlock_mm(fshared);
4732efbeb   Jakub Jelinek   [PATCH] FUTEX_WAK...
840
841
842
843
  	return ret;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
844
845
846
   * Requeue all waiters hashed on one physical page to another
   * physical page.
   */
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
847
848
  static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
  			 u32 __user *uaddr2,
  			 int nr_wake, int nr_requeue, u32 *cmpval)
  {
  	union futex_key key1, key2;
  	struct futex_hash_bucket *hb1, *hb2;
  	struct futex_q *waiter, *tmp;
  	int ret, drop_count = 0;

   retry:
  	futex_lock_mm(fshared);

  	ret = get_futex_key(uaddr1, fshared, &key1);
  	if (unlikely(ret != 0))
  		goto out;
  	ret = get_futex_key(uaddr2, fshared, &key2);
  	if (unlikely(ret != 0))
  		goto out;

  	hb1 = hash_futex(&key1);
  	hb2 = hash_futex(&key2);

  	double_lock_hb(hb1, hb2);

  	/* Optional check that *uaddr1 still holds the caller's value. */
  	if (likely(cmpval != NULL)) {
  		u32 curval;

  		ret = get_futex_value_locked(&curval, uaddr1);
  		if (unlikely(ret)) {
  			spin_unlock(&hb1->lock);
  			if (hb1 != hb2)
  				spin_unlock(&hb2->lock);

  			/*
  			 * The read would have faulted: release mmap_sem,
  			 * fault the word in and start all over again.
  			 */
  			futex_unlock_mm(fshared);

  			ret = get_user(curval, uaddr1);
  			if (ret)
  				return ret;

  			goto retry;
  		}

  		if (curval != *cmpval) {
  			ret = -EAGAIN;
  			goto out_unlock;
  		}
  	}

  	/*
  	 * ret is 0 here and counts every matching waiter handled: the
  	 * first nr_wake are woken, the following ones are requeued.
  	 */
  	plist_for_each_entry_safe(waiter, tmp, &hb1->chain, list) {
  		if (!match_futex(&waiter->key, &key1))
  			continue;

  		if (++ret <= nr_wake) {
  			wake_futex(waiter);
  			continue;
  		}

  		/*
  		 * When key1 and key2 hash to the same bucket the waiter
  		 * can stay on its list; only its key has to change.
  		 */
  		if (likely(hb1 != hb2)) {
  			plist_del(&waiter->list, &hb1->chain);
  			plist_add(&waiter->list, &hb2->chain);
  			waiter->lock_ptr = &hb2->lock;
  #ifdef CONFIG_DEBUG_PI_LIST
  			waiter->list.plist.lock = &hb2->lock;
  #endif
  		}
  		waiter->key = key2;
  		get_futex_key_refs(&key2);
  		drop_count++;

  		if (ret - nr_wake >= nr_requeue)
  			break;
  	}

  out_unlock:
  	spin_unlock(&hb1->lock);
  	if (hb1 != hb2)
  		spin_unlock(&hb2->lock);

  	/*
  	 * One key1 reference is dropped per requeued waiter;
  	 * drop_futex_key_refs() must be called outside the spinlocks.
  	 */
  	for (; drop_count > 0; drop_count--)
  		drop_futex_key_refs(&key1);

  out:
  	futex_unlock_mm(fshared);
  	return ret;
  }
  
  /* The key must be already stored in q->key. */
  static inline struct futex_hash_bucket *
  queue_lock(struct futex_q *q, int fd, struct file *filp)
  {
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
945
  	struct futex_hash_bucket *hb;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
946
947
948
949
950
  
  	q->fd = fd;
  	q->filp = filp;
  
  	init_waitqueue_head(&q->waiters);
9adef58b1   Rusty Russell   futex: get_futex_...
951
  	get_futex_key_refs(&q->key);
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
952
953
  	hb = hash_futex(&q->key);
  	q->lock_ptr = &hb->lock;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
954

e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
955
956
  	spin_lock(&hb->lock);
  	return hb;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
957
  }
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
958
  static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
959
  {
ec92d0829   Pierre Peiffer   futex priority ba...
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
  	int prio;
  
  	/*
  	 * The priority used to register this element is
  	 * - either the real thread-priority for the real-time threads
  	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
  	 * - or MAX_RT_PRIO for non-RT threads.
  	 * Thus, all RT-threads are woken first in priority order, and
  	 * the others are woken last, in FIFO order.
  	 */
  	prio = min(current->normal_prio, MAX_RT_PRIO);
  
  	plist_node_init(&q->list, prio);
  #ifdef CONFIG_DEBUG_PI_LIST
  	q->list.plist.lock = &hb->lock;
  #endif
  	plist_add(&q->list, &hb->chain);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
977
  	q->task = current;
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
978
  	spin_unlock(&hb->lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
979
980
981
  }
  
  static inline void
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
982
  queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
983
  {
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
984
  	spin_unlock(&hb->lock);
9adef58b1   Rusty Russell   futex: get_futex_...
985
  	drop_futex_key_refs(&q->key);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
986
987
988
989
990
991
992
993
994
995
  }
  
  /*
   * queue_me and unqueue_me must be called as a pair, each
   * exactly once.  They are called with the hashed spinlock held.
   */
  
  /* The key must be already stored in q->key. */
  static void queue_me(struct futex_q *q, int fd, struct file *filp)
  {
  	/*
  	 * queue_lock() returns the hash bucket with its lock held;
  	 * __queue_me() enqueues q there and releases that lock again.
  	 */
  	__queue_me(q, queue_lock(q, fd, filp));
  }
  
  /* Return 1 if we were still queued (ie. 0 means we were woken) */
  static int unqueue_me(struct futex_q *q)
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1005
  	spinlock_t *lock_ptr;
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
1006
  	int ret = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1007
1008
1009
1010
  
  	/* In the common case we don't take the spinlock, which is nice. */
   retry:
  	lock_ptr = q->lock_ptr;
e91467ecd   Christian Borntraeger   [PATCH] bug in fu...
1011
  	barrier();
c80544dc0   Stephen Hemminger   sparse pointer us...
1012
  	if (lock_ptr != NULL) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
  		spin_lock(lock_ptr);
  		/*
  		 * q->lock_ptr can change between reading it and
  		 * spin_lock(), causing us to take the wrong lock.  This
  		 * corrects the race condition.
  		 *
  		 * Reasoning goes like this: if we have the wrong lock,
  		 * q->lock_ptr must have changed (maybe several times)
  		 * between reading it and the spin_lock().  It can
  		 * change again after the spin_lock() but only if it was
  		 * already changed before the spin_lock().  It cannot,
  		 * however, change back to the original value.  Therefore
  		 * we can detect whether we acquired the correct lock.
  		 */
  		if (unlikely(lock_ptr != q->lock_ptr)) {
  			spin_unlock(lock_ptr);
  			goto retry;
  		}
ec92d0829   Pierre Peiffer   futex priority ba...
1031
1032
  		WARN_ON(plist_node_empty(&q->list));
  		plist_del(&q->list, &q->list.plist);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1033
1034
  
  		BUG_ON(q->pi_state);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1035
1036
1037
  		spin_unlock(lock_ptr);
  		ret = 1;
  	}
9adef58b1   Rusty Russell   futex: get_futex_...
1038
  	drop_futex_key_refs(&q->key);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1039
1040
  	return ret;
  }
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1041
1042
  /*
   * PI futexes can not be requeued and must remove themselves from the
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1043
1044
   * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
   * and dropped here.
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1045
   */
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1046
  static void unqueue_me_pi(struct futex_q *q)
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1047
  {
ec92d0829   Pierre Peiffer   futex priority ba...
1048
1049
  	WARN_ON(plist_node_empty(&q->list));
  	plist_del(&q->list, &q->list.plist);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1050
1051
1052
1053
  
  	BUG_ON(!q->pi_state);
  	free_pi_state(q->pi_state);
  	q->pi_state = NULL;
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1054
  	spin_unlock(q->lock_ptr);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1055

9adef58b1   Rusty Russell   futex: get_futex_...
1056
  	drop_futex_key_refs(&q->key);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1057
  }
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1058
1059
1060
  /*
   * Fixup the pi_state owner with current.
   *
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1061
1062
   * Must be called with hash bucket lock held and mm->sem held for non
   * private futexes.
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1063
   */
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1064
  static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
  				struct task_struct *curr)
  {
  	struct futex_pi_state *pi_state = q->pi_state;
  	u32 newtid = task_pid_vnr(curr) | FUTEX_WAITERS;
  	u32 uval, curval, newval;
  	int ret;

  	if (pi_state->owner == NULL) {
  		/* No owner recorded: flag the new value as owner-died. */
  		newtid |= FUTEX_OWNER_DIED;
  	} else {
  		/* Unlink the pi_state from the previous owner's list. */
  		spin_lock_irq(&pi_state->owner->pi_lock);
  		WARN_ON(list_empty(&pi_state->list));
  		list_del_init(&pi_state->list);
  		spin_unlock_irq(&pi_state->owner->pi_lock);
  	}

  	/* Record curr as the owner and link the pi_state into its list. */
  	pi_state->owner = curr;

  	spin_lock_irq(&curr->pi_lock);
  	WARN_ON(!list_empty(&pi_state->list));
  	list_add(&pi_state->list, &curr->pi_state_list);
  	spin_unlock_irq(&curr->pi_lock);

  	/*
  	 * We own it, so the pending owner TID in user space has to be
  	 * replaced.  This must be atomic because the owner-died bit
  	 * already present in the futex word has to be preserved.
  	 */
  	ret = get_futex_value_locked(&uval, uaddr);
  	if (ret)
  		return ret;

  	for (;;) {
  		newval = (uval & FUTEX_OWNER_DIED) | newtid;

  		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);

  		if (curval == -EFAULT)
  			return -EFAULT;
  		if (curval == uval)
  			return 0;

  		/* The word changed under us: retry against the new value. */
  		uval = curval;
  	}
  }
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1106
1107
1108
1109
1110
  /*
   * In case we must use restart_block to restart a futex_wait,
   * we encode in the 'arg3' shared capability
   */
  #define ARG3_SHARED  1
72c1bbf30   Nick Piggin   futex: restartabl...
1111
  static long futex_wait_restart(struct restart_block *restart);
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1112

34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1113
1114
  /*
   * Block on the futex at @uaddr as long as it still contains @val.
   *
   * @fshared:  passed to futex_lock_mm()/futex_unlock_mm() and
   *            get_futex_key(); NULL is recorded as "not shared" in the
   *            restart block.
   * @abs_time: optional absolute CLOCK_MONOTONIC expiry, NULL for none.
   *
   * Returns 0 when woken, -EWOULDBLOCK when *uaddr != val, -ETIMEDOUT on
   * timer expiry, -ERESTARTSYS or -ERESTART_RESTARTBLOCK when the sleep
   * ended for another reason, or the error from key lookup / get_user.
   */
  static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
  		      u32 val, ktime_t *abs_time)
  {
  	struct task_struct *curr = current;
  	DECLARE_WAITQUEUE(wait, curr);
  	struct futex_hash_bucket *hb;
  	struct hrtimer_sleeper sleeper;
  	struct restart_block *restart;
  	struct futex_q q;
  	int timed_out = 0;
  	u32 uval;
  	int ret;

  	q.pi_state = NULL;
   retry:
  	futex_lock_mm(fshared);

  	ret = get_futex_key(uaddr, fshared, &q.key);
  	if (unlikely(ret != 0))
  		goto out_release_sem;

  	hb = queue_lock(&q, -1, NULL);

  	/*
  	 * Access the page AFTER the futex is queued.
  	 * Order is important:
  	 *
  	 *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
  	 *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
  	 *
  	 * The basic logical guarantee of a futex is that it blocks ONLY
  	 * if cond(var) is known to be true at the time of blocking, for
  	 * any cond.  Queueing after testing *uaddr would open a race in
  	 * which we could block indefinitely with cond(var) false, which
  	 * would violate that guarantee.
  	 *
  	 * A consequence is that futex_wait() can return zero and absorb
  	 * a wakeup when *uaddr != val on entry to the syscall.  This is
  	 * rare, but normal.
  	 *
  	 * For shared futexes we hold the mmap semaphore, so the mapping
  	 * cannot have changed since it was looked up in get_futex_key.
  	 */
  	ret = get_futex_value_locked(&uval, uaddr);
  	if (unlikely(ret)) {
  		queue_unlock(&q, hb);

  		/*
  		 * The read would have faulted: release mmap_sem, fault the
  		 * word in and start all over again.
  		 */
  		futex_unlock_mm(fshared);

  		ret = get_user(uval, uaddr);
  		if (ret)
  			return ret;

  		goto retry;
  	}

  	if (uval != val) {
  		ret = -EWOULDBLOCK;
  		goto out_unlock_release_sem;
  	}

  	/* Only actually queue if *uaddr contained val.  */
  	__queue_me(&q, hb);

  	/*
  	 * The futex is queued and the data has been checked; mmap_sem
  	 * must not be held while we sleep.
  	 */
  	futex_unlock_mm(fshared);

  	/*
  	 * There might have been scheduling since the queue_me(), as we
  	 * cannot hold a spinlock across the get_user() in case it
  	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
  	 * queueing ourselves into the futex hash.  This code thus has to
  	 * rely on the futex_wake() code removing us from hash when it
  	 * wakes us up.
  	 */

  	/* add_wait_queue is the barrier after __set_current_state. */
  	__set_current_state(TASK_INTERRUPTIBLE);
  	add_wait_queue(&q.waiters, &wait);

  	/*
  	 * !plist_node_empty() is safe here without any lock.
  	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
  	 */
  	if (likely(!plist_node_empty(&q.list))) {
  		if (abs_time) {
  			hrtimer_init(&sleeper.timer, CLOCK_MONOTONIC,
  				     HRTIMER_MODE_ABS);
  			hrtimer_init_sleeper(&sleeper, current);
  			sleeper.timer.expires = *abs_time;

  			hrtimer_start(&sleeper.timer, sleeper.timer.expires,
  				      HRTIMER_MODE_ABS);

  			/*
  			 * The timer could have expired already, in which
  			 * case current would be flagged for rescheduling.
  			 * Don't bother calling schedule.
  			 */
  			if (likely(sleeper.task))
  				schedule();

  			hrtimer_cancel(&sleeper.timer);

  			/* Flag if a timeout occurred */
  			timed_out = (sleeper.task == NULL);
  		} else {
  			schedule();
  		}
  	}
  	__set_current_state(TASK_RUNNING);

  	/*
  	 * NOTE: we don't remove ourselves from the waitqueue because
  	 * we are the only user of it.
  	 */

  	/* If we were woken (and unqueued), we succeeded, whatever. */
  	if (!unqueue_me(&q))
  		return 0;
  	if (timed_out)
  		return -ETIMEDOUT;

  	/*
  	 * We expect signal_pending(current), but another thread may
  	 * have handled it for us already.
  	 */
  	if (!abs_time)
  		return -ERESTARTSYS;

  	/*
  	 * A timed wait is restarted through the restart block.
  	 *
  	 * NOTE(review): arg2 stores the abs_time pointer itself, not the
  	 * time it points to; futex_wait_restart() hands the pointer back
  	 * to this function, which dereferences it above.  Confirm that
  	 * the pointed-to ktime_t is still valid when the restart runs.
  	 */
  	restart = &current_thread_info()->restart_block;
  	restart->fn = futex_wait_restart;
  	restart->arg0 = (unsigned long)uaddr;
  	restart->arg1 = (unsigned long)val;
  	restart->arg2 = (unsigned long)abs_time;
  	restart->arg3 = fshared ? ARG3_SHARED : 0;
  	return -ERESTART_RESTARTBLOCK;

   out_unlock_release_sem:
  	queue_unlock(&q, hb);

   out_release_sem:
  	futex_unlock_mm(fshared);
  	return ret;
  }
72c1bbf30   Nick Piggin   futex: restartabl...
1262
1263
1264
1265
1266
  
  static long futex_wait_restart(struct restart_block *restart)
  {
  	u32 __user *uaddr = (u32 __user *)restart->arg0;
  	u32 val = (u32)restart->arg1;
c19384b5b   Pierre Peiffer   Make futex_wait()...
1267
  	ktime_t *abs_time = (ktime_t *)restart->arg2;
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1268
  	struct rw_semaphore *fshared = NULL;
72c1bbf30   Nick Piggin   futex: restartabl...
1269
1270
  
  	restart->fn = do_no_restart_syscall;
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1271
1272
1273
  	if (restart->arg3 & ARG3_SHARED)
  		fshared = &current->mm->mmap_sem;
  	return (long)futex_wait(uaddr, fshared, val, abs_time);
72c1bbf30   Nick Piggin   futex: restartabl...
1274
  }
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1275
1276
1277
1278
1279
1280
  /*
   * Userspace tried a 0 -> TID atomic transition of the futex value
   * and failed. The kernel side here does the whole locking operation:
   * if there are waiters then it will block, it does PI, etc. (Due to
   * races the kernel might see a 0 value of the futex too.)
   */
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1281
1282
  static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
  			 int detect, ktime_t *time, int trylock)
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1283
  {
c5780e976   Thomas Gleixner   [PATCH] Use the c...
1284
  	struct hrtimer_sleeper timeout, *to = NULL;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1285
1286
1287
1288
  	struct task_struct *curr = current;
  	struct futex_hash_bucket *hb;
  	u32 uval, newval, curval;
  	struct futex_q q;
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1289
  	int ret, lock_taken, ownerdied = 0, attempt = 0;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1290
1291
1292
  
  	if (refill_pi_state_cache())
  		return -ENOMEM;
c19384b5b   Pierre Peiffer   Make futex_wait()...
1293
  	if (time) {
c5780e976   Thomas Gleixner   [PATCH] Use the c...
1294
  		to = &timeout;
c9cb2e3d7   Thomas Gleixner   [PATCH] hrtimers:...
1295
  		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
c5780e976   Thomas Gleixner   [PATCH] Use the c...
1296
  		hrtimer_init_sleeper(to, current);
c19384b5b   Pierre Peiffer   Make futex_wait()...
1297
  		to->timer.expires = *time;
c5780e976   Thomas Gleixner   [PATCH] Use the c...
1298
  	}
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1299
1300
  	q.pi_state = NULL;
   retry:
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1301
  	futex_lock_mm(fshared);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1302

34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1303
  	ret = get_futex_key(uaddr, fshared, &q.key);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1304
1305
  	if (unlikely(ret != 0))
  		goto out_release_sem;
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1306
   retry_unlocked:
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1307
1308
1309
  	hb = queue_lock(&q, -1, NULL);
  
   retry_locked:
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1310
  	ret = lock_taken = 0;
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1311

c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1312
1313
1314
1315
1316
  	/*
  	 * To avoid races, we attempt to take the lock here again
  	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
  	 * the locks. It will most likely not succeed.
  	 */
b488893a3   Pavel Emelyanov   pid namespaces: c...
1317
  	newval = task_pid_vnr(current);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1318

36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1319
  	curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1320
1321
1322
  
  	if (unlikely(curval == -EFAULT))
  		goto uaddr_faulted;
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1323
1324
1325
1326
  	/*
  	 * Detect deadlocks. In case of REQUEUE_PI this is a valid
  	 * situation and we return success to user space.
  	 */
b488893a3   Pavel Emelyanov   pid namespaces: c...
1327
  	if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
bd197234b   Thomas Gleixner   Revert "futex_req...
1328
  		ret = -EDEADLK;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1329
1330
1331
1332
  		goto out_unlock_release_sem;
  	}
  
  	/*
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1333
  	 * Surprise - we got the lock. Just return to userspace:
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1334
1335
1336
1337
1338
  	 */
  	if (unlikely(!curval))
  		goto out_unlock_release_sem;
  
  	uval = curval;
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1339

d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1340
  	/*
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1341
1342
  	 * Set the WAITERS flag, so the owner will know it has someone
  	 * to wake at next unlock
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1343
  	 */
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1344
1345
1346
1347
  	newval = curval | FUTEX_WAITERS;
  
  	/*
  	 * There are two cases, where a futex might have no owner (the
bd197234b   Thomas Gleixner   Revert "futex_req...
1348
1349
1350
  	 * owner TID is 0): OWNER_DIED. We take over the futex in this
  	 * case. We also do an unconditional take over, when the owner
  	 * of the futex died.
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1351
1352
1353
1354
  	 *
  	 * This is safe as we are protected by the hash bucket lock !
  	 */
  	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
bd197234b   Thomas Gleixner   Revert "futex_req...
1355
  		/* Keep the OWNER_DIED bit */
b488893a3   Pavel Emelyanov   pid namespaces: c...
1356
  		newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1357
1358
1359
  		ownerdied = 0;
  		lock_taken = 1;
  	}
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1360

36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1361
  	curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1362
1363
1364
1365
1366
  
  	if (unlikely(curval == -EFAULT))
  		goto uaddr_faulted;
  	if (unlikely(curval != uval))
  		goto retry_locked;
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1367
  	/*
bd197234b   Thomas Gleixner   Revert "futex_req...
1368
  	 * We took the lock due to owner died take over.
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1369
  	 */
bd197234b   Thomas Gleixner   Revert "futex_req...
1370
  	if (unlikely(lock_taken))
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1371
  		goto out_unlock_release_sem;
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1372

c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1373
1374
1375
1376
  	/*
  	 * We dont have the lock. Look up the PI state (or create it if
  	 * we are the first waiter):
  	 */
d0aa7a70b   Pierre Peiffer   futex_requeue_pi ...
1377
  	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1378
1379
  
  	if (unlikely(ret)) {
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1380
  		switch (ret) {
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1381

778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1382
1383
1384
1385
1386
1387
  		case -EAGAIN:
  			/*
  			 * Task is exiting and we just wait for the
  			 * exit to complete.
  			 */
  			queue_unlock(&q, hb);
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1388
  			futex_unlock_mm(fshared);
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1389
1390
  			cond_resched();
  			goto retry;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1391

778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1392
1393
1394
1395
1396
1397
1398
  		case -ESRCH:
  			/*
  			 * No owner found for this futex. Check if the
  			 * OWNER_DIED bit is set to figure out whether
  			 * this is a robust futex or not.
  			 */
  			if (get_futex_value_locked(&curval, uaddr))
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1399
  				goto uaddr_faulted;
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1400
1401
1402
1403
1404
1405
1406
1407
  
  			/*
  			 * We simply start over in case of a robust
  			 * futex. The code above will take the futex
  			 * and return happy.
  			 */
  			if (curval & FUTEX_OWNER_DIED) {
  				ownerdied = 1;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1408
  				goto retry_locked;
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1409
1410
1411
  			}
  		default:
  			goto out_unlock_release_sem;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1412
  		}
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
  	}
  
  	/*
  	 * Only actually queue now that the atomic ops are done:
  	 */
  	__queue_me(&q, hb);
  
  	/*
  	 * Now the futex is queued and we have checked the data, we
  	 * don't want to hold mmap_sem while we sleep.
  	 */
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1424
  	futex_unlock_mm(fshared);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
  
  	WARN_ON(!q.pi_state);
  	/*
  	 * Block on the PI mutex:
  	 */
  	if (!trylock)
  		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
  	else {
  		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
  		/* Fixup the trylock return value: */
  		ret = ret ? 0 : -EWOULDBLOCK;
  	}
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1437
  	futex_lock_mm(fshared);
a99e4e413   Vernon Mauery   [PATCH] pi-futex:...
1438
  	spin_lock(q.lock_ptr);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1439

778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1440
1441
1442
1443
1444
1445
1446
1447
1448
  	if (!ret) {
  		/*
  		 * Got the lock. We might not be the anticipated owner
  		 * if we did a lock-steal - fix up the PI-state in
  		 * that case:
  		 */
  		if (q.pi_state->owner != curr)
  			ret = fixup_pi_state_owner(uaddr, &q, curr);
  	} else {
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1449
1450
  		/*
  		 * Catch the rare case, where the lock was released
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1451
1452
  		 * when we were on the way back before we locked the
  		 * hash bucket.
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1453
  		 */
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
  		if (q.pi_state->owner == curr &&
  		    rt_mutex_trylock(&q.pi_state->pi_mutex)) {
  			ret = 0;
  		} else {
  			/*
  			 * Paranoia check. If we did not take the lock
  			 * in the trylock above, then we should not be
  			 * the owner of the rtmutex, neither the real
  			 * nor the pending one:
  			 */
  			if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
  				printk(KERN_ERR "futex_lock_pi: ret = %d "
  				       "pi-mutex: %p pi-state %p
  ", ret,
  				       q.pi_state->pi_mutex.owner,
  				       q.pi_state->owner);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1470
  		}
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1471
  	}
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1472
1473
  	/* Unqueue and drop the lock */
  	unqueue_me_pi(&q);
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1474
  	futex_unlock_mm(fshared);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1475

c5780e976   Thomas Gleixner   [PATCH] Use the c...
1476
  	return ret != -EINTR ? ret : -ERESTARTNOINTR;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1477
1478
1479
1480
1481
  
   out_unlock_release_sem:
  	queue_unlock(&q, hb);
  
   out_release_sem:
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1482
  	futex_unlock_mm(fshared);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1483
1484
1485
1486
1487
1488
1489
1490
  	return ret;
  
   uaddr_faulted:
  	/*
  	 * We have to r/w  *(int __user *)uaddr, but we can't modify it
  	 * non-atomically.  Therefore, if get_user below is not
  	 * enough, we need to handle the fault ourselves, while
  	 * still holding the mmap_sem.
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1491
1492
  	 *
  	 * ... and hb->lock. :-) --ANK
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1493
  	 */
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1494
  	queue_unlock(&q, hb);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1495
  	if (attempt++) {
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1496
1497
1498
  		ret = futex_handle_fault((unsigned long)uaddr, fshared,
  					 attempt);
  		if (ret)
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1499
1500
  			goto out_release_sem;
  		goto retry_unlocked;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1501
  	}
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1502
  	futex_unlock_mm(fshared);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1503
1504
1505
1506
1507
1508
1509
1510
1511
  
  	ret = get_user(uval, uaddr);
  	if (!ret && (uval != -EFAULT))
  		goto retry;
  
  	return ret;
  }
  
  /*
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1512
1513
1514
1515
   * Userspace attempted a TID -> 0 atomic transition, and failed.
   * This is the in-kernel slowpath: we look up the PI state (if any),
   * and do the rt-mutex unlock.
   */
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1516
  static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1517
1518
1519
1520
  {
  	struct futex_hash_bucket *hb;
  	struct futex_q *this, *next;
  	u32 uval;
ec92d0829   Pierre Peiffer   futex priority ba...
1521
  	struct plist_head *head;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1522
1523
1524
1525
1526
1527
1528
1529
1530
  	union futex_key key;
  	int ret, attempt = 0;
  
  retry:
  	if (get_user(uval, uaddr))
  		return -EFAULT;
  	/*
  	 * We release only a lock we actually own:
  	 */
b488893a3   Pavel Emelyanov   pid namespaces: c...
1531
  	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1532
1533
1534
1535
  		return -EPERM;
  	/*
  	 * First take all the futex related locks:
  	 */
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1536
  	futex_lock_mm(fshared);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1537

34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1538
  	ret = get_futex_key(uaddr, fshared, &key);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1539
1540
1541
1542
  	if (unlikely(ret != 0))
  		goto out;
  
  	hb = hash_futex(&key);
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1543
  retry_unlocked:
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1544
  	spin_lock(&hb->lock);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1545
1546
1547
1548
1549
  	/*
  	 * To avoid races, try to do the TID -> 0 atomic transition
  	 * again. If it succeeds then we can return without waking
  	 * anyone else up:
  	 */
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1550
  	if (!(uval & FUTEX_OWNER_DIED))
b488893a3   Pavel Emelyanov   pid namespaces: c...
1551
  		uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1552

c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1553
1554
1555
1556
1557
1558
1559
  
  	if (unlikely(uval == -EFAULT))
  		goto pi_faulted;
  	/*
  	 * Rare case: we managed to release the lock atomically,
  	 * no need to wake anyone else up:
  	 */
b488893a3   Pavel Emelyanov   pid namespaces: c...
1560
  	if (unlikely(uval == task_pid_vnr(current)))
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1561
1562
1563
1564
1565
1566
1567
  		goto out_unlock;
  
  	/*
  	 * Ok, other tasks may need to be woken up - check waiters
  	 * and do the wakeup if necessary:
  	 */
  	head = &hb->chain;
ec92d0829   Pierre Peiffer   futex priority ba...
1568
  	plist_for_each_entry_safe(this, next, head, list) {
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
  		if (!match_futex (&this->key, &key))
  			continue;
  		ret = wake_futex_pi(uaddr, uval, this);
  		/*
  		 * The atomic access to the futex value
  		 * generated a pagefault, so retry the
  		 * user-access and the wakeup:
  		 */
  		if (ret == -EFAULT)
  			goto pi_faulted;
  		goto out_unlock;
  	}
  	/*
  	 * No waiters - kernel unlocks the futex:
  	 */
e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
1584
1585
1586
1587
1588
  	if (!(uval & FUTEX_OWNER_DIED)) {
  		ret = unlock_futex_pi(uaddr, uval);
  		if (ret == -EFAULT)
  			goto pi_faulted;
  	}
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1589
1590
1591
1592
  
  out_unlock:
  	spin_unlock(&hb->lock);
  out:
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1593
  	futex_unlock_mm(fshared);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1594
1595
1596
1597
1598
1599
1600
1601
1602
  
  	return ret;
  
  pi_faulted:
  	/*
  	 * We have to r/w  *(int __user *)uaddr, but we can't modify it
  	 * non-atomically.  Therefore, if get_user below is not
  	 * enough, we need to handle the fault ourselves, while
  	 * still holding the mmap_sem.
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1603
1604
  	 *
  	 * ... and hb->lock. --ANK
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1605
  	 */
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1606
  	spin_unlock(&hb->lock);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1607
  	if (attempt++) {
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1608
1609
1610
  		ret = futex_handle_fault((unsigned long)uaddr, fshared,
  					 attempt);
  		if (ret)
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1611
  			goto out;
187226f57   John Stultz   futex_unlock_pi()...
1612
  		uval = 0;
778e9a9c3   Alexey Kuznetsov   pi-futex: fix exi...
1613
  		goto retry_unlocked;
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1614
  	}
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1615
  	futex_unlock_mm(fshared);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1616
1617
1618
1619
  
  	ret = get_user(uval, uaddr);
  	if (!ret && (uval != -EFAULT))
  		goto retry;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1620
1621
1622
1623
1624
1625
1626
1627
1628
  	return ret;
  }
  
  /*
   * ->release handler of a futex file: take the futex_q stored in the
   * file's private data off its queue and free it. Always returns 0.
   */
  static int futex_close(struct inode *inode, struct file *filp)
  {
  	struct futex_q *waiter = filp->private_data;

  	unqueue_me(waiter);
  	kfree(waiter);

  	return 0;
  }
  
  /* This is one-shot: once it's gone off you need a new fd */
  static unsigned int futex_poll(struct file *filp,
  			       struct poll_table_struct *wait)
  {
  	struct futex_q *q = filp->private_data;
  	int ret = 0;
  
  	poll_wait(filp, &q->waiters, wait);
  
  	/*
ec92d0829   Pierre Peiffer   futex priority ba...
1643
  	 * plist_node_empty() is safe here without any lock.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1644
1645
  	 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
  	 */
ec92d0829   Pierre Peiffer   futex priority ba...
1646
  	if (plist_node_empty(&q->list))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1647
1648
1649
1650
  		ret = POLLIN | POLLRDNORM;
  
  	return ret;
  }
15ad7cdcf   Helge Deller   [PATCH] struct se...
1651
  static const struct file_operations futex_fops = {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1652
1653
1654
1655
1656
1657
1658
1659
  	.release	= futex_close,
  	.poll		= futex_poll,
  };
  
  /*
   * Signal allows caller to avoid the race which would occur if they
   * set the sigio stuff up afterwards.
   */
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
1660
  static int futex_fd(u32 __user *uaddr, int signal)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1661
1662
1663
1664
  {
  	struct futex_q *q;
  	struct file *filp;
  	int ret, err;
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1665
  	struct rw_semaphore *fshared;
19c6b6ed3   Andrew Morton   [PATCH] schedule ...
1666
1667
1668
1669
  	static unsigned long printk_interval;
  
  	if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
  		printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1670
1671
1672
  		       "will be removed from the kernel in June 2007
  ",
  		       current->comm);
19c6b6ed3   Andrew Morton   [PATCH] schedule ...
1673
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1674
1675
  
  	ret = -EINVAL;
7ed20e1ad   Jesper Juhl   [PATCH] convert t...
1676
  	if (!valid_signal(signal))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
  		goto out;
  
  	ret = get_unused_fd();
  	if (ret < 0)
  		goto out;
  	filp = get_empty_filp();
  	if (!filp) {
  		put_unused_fd(ret);
  		ret = -ENFILE;
  		goto out;
  	}
  	filp->f_op = &futex_fops;
f3a43f3f6   Josef "Jeff" Sipek   [PATCH] kernel: c...
1689
1690
1691
  	filp->f_path.mnt = mntget(futex_mnt);
  	filp->f_path.dentry = dget(futex_mnt->mnt_root);
  	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1692
1693
  
  	if (signal) {
609d7fa95   Eric W. Biederman   [PATCH] file: mod...
1694
  		err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1695
  		if (err < 0) {
39ed3fdee   Pekka Enberg   [PATCH] futex: re...
1696
  			goto error;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1697
1698
1699
1700
1701
1702
  		}
  		filp->f_owner.signum = signal;
  	}
  
  	q = kmalloc(sizeof(*q), GFP_KERNEL);
  	if (!q) {
39ed3fdee   Pekka Enberg   [PATCH] futex: re...
1703
1704
  		err = -ENOMEM;
  		goto error;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1705
  	}
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1706
  	q->pi_state = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1707

34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1708
1709
1710
  	fshared = &current->mm->mmap_sem;
  	down_read(fshared);
  	err = get_futex_key(uaddr, fshared, &q->key);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1711
1712
  
  	if (unlikely(err != 0)) {
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1713
  		up_read(fshared);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1714
  		kfree(q);
39ed3fdee   Pekka Enberg   [PATCH] futex: re...
1715
  		goto error;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1716
1717
1718
1719
1720
1721
1722
1723
1724
  	}
  
  	/*
  	 * queue_me() must be called before releasing mmap_sem, because
  	 * key->shared.inode needs to be referenced while holding it.
  	 */
  	filp->private_data = q;
  
  	queue_me(q, ret, filp);
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1725
  	up_read(fshared);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1726
1727
1728
1729
1730
  
  	/* Now we map fd to filp, so userspace can access it */
  	fd_install(ret, filp);
  out:
  	return ret;
39ed3fdee   Pekka Enberg   [PATCH] futex: re...
1731
1732
1733
1734
1735
  error:
  	put_unused_fd(ret);
  	put_filp(filp);
  	ret = err;
  	goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1736
  }
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1737
1738
1739
1740
1741
1742
1743
  /*
   * Support for robust futexes: the kernel cleans up held futexes at
   * thread exit time.
   *
   * Implementation: user-space maintains a per-thread list of locks it
   * is holding. Upon do_exit(), the kernel carefully walks this list,
   * and marks all locks that are owned by this thread with the
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1744
   * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
   * always manipulated with the lock held, so the list is private and
   * per-thread. Userspace also maintains a per-thread 'list_op_pending'
   * field, to allow the kernel to clean up if the thread dies after
   * acquiring the lock, but just before it could have added itself to
   * the list. There can only be one such pending lock.
   */
  
  /**
   * sys_set_robust_list - set the robust-futex list head of a task
   * @head: pointer to the list-head
   * @len: length of the list-head, as userspace expects
   *
   * Records @head as the robust list of the calling task. Returns 0 on
   * success, -EINVAL if @len is not the size of the list-head structure.
   */
  asmlinkage long
  sys_set_robust_list(struct robust_list_head __user *head,
  		    size_t len)
  {
  	/* Exactly one list-head layout is understood for now. */
  	if (likely(len == sizeof(struct robust_list_head))) {
  		current->robust_list = head;
  		return 0;
  	}

  	return -EINVAL;
  }
  
  /**
   * sys_get_robust_list - get the robust-futex list head of a task
   * @pid: pid of the process [zero for current task]
   * @head_ptr: pointer to a list-head pointer, the kernel fills it in
   * @len_ptr: pointer to a length field, the kernel fills in the header size
   */
  asmlinkage long
ba46df984   Al Viro   [PATCH] __user an...
1779
  sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1780
1781
  		    size_t __user *len_ptr)
  {
ba46df984   Al Viro   [PATCH] __user an...
1782
  	struct robust_list_head __user *head;
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1783
1784
1785
1786
1787
1788
1789
1790
  	unsigned long ret;
  
  	if (!pid)
  		head = current->robust_list;
  	else {
  		struct task_struct *p;
  
  		ret = -ESRCH;
aaa2a97eb   Oleg Nesterov   [PATCH] sys_get_r...
1791
  		rcu_read_lock();
228ebcbe6   Pavel Emelyanov   Uninline find_tas...
1792
  		p = find_task_by_vpid(pid);
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1793
1794
1795
1796
1797
1798
1799
  		if (!p)
  			goto err_unlock;
  		ret = -EPERM;
  		if ((current->euid != p->euid) && (current->euid != p->uid) &&
  				!capable(CAP_SYS_PTRACE))
  			goto err_unlock;
  		head = p->robust_list;
aaa2a97eb   Oleg Nesterov   [PATCH] sys_get_r...
1800
  		rcu_read_unlock();
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1801
1802
1803
1804
1805
1806
1807
  	}
  
  	if (put_user(sizeof(*head), len_ptr))
  		return -EFAULT;
  	return put_user(head, head_ptr);
  
  err_unlock:
aaa2a97eb   Oleg Nesterov   [PATCH] sys_get_r...
1808
  	rcu_read_unlock();
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1809
1810
1811
1812
1813
1814
1815
1816
  
  	return ret;
  }
  
  /*
   * Process a futex-list entry, check whether it's owned by the
   * dying task, and do notification if so:
   */
e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
1817
  int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1818
  {
e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
1819
  	u32 uval, nval, mval;
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1820

8f17d3a50   Ingo Molnar   [PATCH] lightweig...
1821
1822
  retry:
  	if (get_user(uval, uaddr))
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1823
  		return -1;
b488893a3   Pavel Emelyanov   pid namespaces: c...
1824
  	if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
  		/*
  		 * Ok, this dying thread is truly holding a futex
  		 * of interest. Set the OWNER_DIED bit atomically
  		 * via cmpxchg, and if the value had FUTEX_WAITERS
  		 * set, wake up a waiter (if any). (We have to do a
  		 * futex_wake() even if OWNER_DIED is already set -
  		 * to handle the rare but possible case of recursive
  		 * thread-death.) The rest of the cleanup is done in
  		 * userspace.
  		 */
e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
1835
1836
  		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
  		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1837
1838
1839
1840
  		if (nval == -EFAULT)
  			return -1;
  
  		if (nval != uval)
8f17d3a50   Ingo Molnar   [PATCH] lightweig...
1841
  			goto retry;
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1842

e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
1843
1844
1845
1846
  		/*
  		 * Wake robust non-PI futexes here. The wakeup of
  		 * PI futexes happens in exit_pi_state():
  		 */
36cf3b5c3   Thomas Gleixner   FUTEX: Tidy up th...
1847
  		if (!pi && (uval & FUTEX_WAITERS))
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1848
  				futex_wake(uaddr, &curr->mm->mmap_sem, 1);
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1849
1850
1851
1852
1853
  	}
  	return 0;
  }
  
  /*
   * Read one robust-list pointer from userspace.
   *
   * The low bit of the stored value is a flag, not part of the address:
   * it signals a PI futex. On success *entry receives the pointer with
   * that bit cleared and *pi receives the bit itself.
   *
   * Returns 0 on success, -EFAULT if the user memory cannot be read.
   */
  static inline int fetch_robust_entry(struct robust_list __user **entry,
  				     struct robust_list __user * __user *head,
  				     int *pi)
  {
  	unsigned long raw;

  	if (get_user(raw, (unsigned long __user *)head))
  		return -EFAULT;

  	*entry = (void __user *)(raw & ~1UL);
  	*pi = raw & 1;

  	return 0;
  }
  
  /*
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1870
1871
1872
1873
1874
1875
1876
1877
   * Walk curr->robust_list (very carefully, it's a userspace list!)
   * and mark any locks found there dead, and notify any waiters.
   *
   * We silently return on any sign of list-walking problem.
   */
  void exit_robust_list(struct task_struct *curr)
  {
  	struct robust_list_head __user *head = curr->robust_list;
9f96cb1e8   Martin Schwidefsky   robust futex thre...
1878
1879
  	struct robust_list __user *entry, *next_entry, *pending;
  	unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1880
  	unsigned long futex_offset;
9f96cb1e8   Martin Schwidefsky   robust futex thre...
1881
  	int rc;
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1882
1883
1884
1885
1886
  
  	/*
  	 * Fetch the list head (which was registered earlier, via
  	 * sys_set_robust_list()):
  	 */
e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
1887
  	if (fetch_robust_entry(&entry, &head->list.next, &pi))
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
  		return;
  	/*
  	 * Fetch the relative futex offset:
  	 */
  	if (get_user(futex_offset, &head->futex_offset))
  		return;
  	/*
  	 * Fetch any possibly pending lock-add first, and handle it
  	 * if it exists:
  	 */
e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
1898
  	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1899
  		return;
e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
1900

9f96cb1e8   Martin Schwidefsky   robust futex thre...
1901
  	next_entry = NULL;	/* avoid warning with gcc */
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1902
1903
  	while (entry != &head->list) {
  		/*
9f96cb1e8   Martin Schwidefsky   robust futex thre...
1904
1905
1906
1907
1908
  		 * Fetch the next entry in the list before calling
  		 * handle_futex_death:
  		 */
  		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
  		/*
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1909
  		 * A pending lock might already be on the list, so
c87e2837b   Ingo Molnar   [PATCH] pi-futex:...
1910
  		 * don't process it twice:
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1911
1912
  		 */
  		if (entry != pending)
ba46df984   Al Viro   [PATCH] __user an...
1913
  			if (handle_futex_death((void __user *)entry + futex_offset,
e3f2ddeac   Ingo Molnar   [PATCH] pi-futex:...
1914
  						curr, pi))
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1915
  				return;
9f96cb1e8   Martin Schwidefsky   robust futex thre...
1916
  		if (rc)
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1917
  			return;
9f96cb1e8   Martin Schwidefsky   robust futex thre...
1918
1919
  		entry = next_entry;
  		pi = next_pi;
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1920
1921
1922
1923
1924
1925
1926
1927
  		/*
  		 * Avoid excessively long or circular lists:
  		 */
  		if (!--limit)
  			break;
  
  		cond_resched();
  	}
9f96cb1e8   Martin Schwidefsky   robust futex thre...
1928
1929
1930
1931
  
  	if (pending)
  		handle_futex_death((void __user *)pending + futex_offset,
  				   curr, pip);
0771dfefc   Ingo Molnar   [PATCH] lightweig...
1932
  }
c19384b5b   Pierre Peiffer   Make futex_wait()...
1933
  /*
   * Dispatch a futex operation to its implementation.
   *
   * @op carries the command (op & FUTEX_CMD_MASK) plus option flags. When
   * FUTEX_PRIVATE_FLAG is set the handlers are called with a NULL fshared,
   * otherwise with the caller's mmap_sem. The meaning of @val, @val2 and
   * @val3 depends on the command. Returns the handler's result, or -ENOSYS
   * for an unknown command.
   */
  long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
  		u32 __user *uaddr2, u32 val2, u32 val3)
  {
  	struct rw_semaphore *fshared;

  	fshared = (op & FUTEX_PRIVATE_FLAG) ? NULL : &current->mm->mmap_sem;

  	switch (op & FUTEX_CMD_MASK) {
  	case FUTEX_WAIT:
  		return futex_wait(uaddr, fshared, val, timeout);
  	case FUTEX_WAKE:
  		return futex_wake(uaddr, fshared, val);
  	case FUTEX_FD:
  		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
  		return futex_fd(uaddr, val);
  	case FUTEX_REQUEUE:
  		return futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
  	case FUTEX_CMP_REQUEUE:
  		return futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
  	case FUTEX_WAKE_OP:
  		return futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
  	case FUTEX_LOCK_PI:
  		return futex_lock_pi(uaddr, fshared, val, timeout, 0);
  	case FUTEX_UNLOCK_PI:
  		return futex_unlock_pi(uaddr, fshared);
  	case FUTEX_TRYLOCK_PI:
  		return futex_lock_pi(uaddr, fshared, 0, timeout, 1);
  	default:
  		return -ENOSYS;
  	}
  }
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
1977
  asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1978
  			  struct timespec __user *utime, u32 __user *uaddr2,
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
1979
  			  u32 val3)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1980
  {
c19384b5b   Pierre Peiffer   Make futex_wait()...
1981
1982
  	struct timespec ts;
  	ktime_t t, *tp = NULL;
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
1983
  	u32 val2 = 0;
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1984
  	int cmd = op & FUTEX_CMD_MASK;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1985

34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1986
  	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
c19384b5b   Pierre Peiffer   Make futex_wait()...
1987
  		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1988
  			return -EFAULT;
c19384b5b   Pierre Peiffer   Make futex_wait()...
1989
  		if (!timespec_valid(&ts))
9741ef964   Thomas Gleixner   [PATCH] futex: ch...
1990
  			return -EINVAL;
c19384b5b   Pierre Peiffer   Make futex_wait()...
1991
1992
  
  		t = timespec_to_ktime(ts);
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1993
  		if (cmd == FUTEX_WAIT)
c19384b5b   Pierre Peiffer   Make futex_wait()...
1994
1995
  			t = ktime_add(ktime_get(), t);
  		tp = &t;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1996
1997
  	}
  	/*
34f01cc1f   Eric Dumazet   FUTEX: new PRIVAT...
1998
  	 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
f54f09861   Andreas Schwab   futex: pass nr_wa...
1999
  	 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2000
  	 */
f54f09861   Andreas Schwab   futex: pass nr_wa...
2001
2002
  	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
  	    cmd == FUTEX_WAKE_OP)
e2970f2fb   Ingo Molnar   [PATCH] pi-futex:...
2003
  		val2 = (u32) (unsigned long) utime;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2004

c19384b5b   Pierre Peiffer   Make futex_wait()...
2005
  	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2006
  }
454e2398b   David Howells   [PATCH] VFS: Perm...
2007
2008
2009
  static int futexfs_get_sb(struct file_system_type *fs_type,
  			  int flags, const char *dev_name, void *data,
  			  struct vfsmount *mnt)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2010
  {
fd5eea421   Andrey Mirkin   change inotifyfs ...
2011
  	return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
  }
  
  /*
   * Filesystem type descriptor for "futexfs"; registered and mounted
   * from init() below.
   */
  static struct file_system_type futex_fs_type = {
  	.kill_sb	= kill_anon_super,
  	.get_sb		= futexfs_get_sb,
  	.name		= "futexfs",
  };
  
  /*
   * init - boot-time setup of the futex subsystem.
   *
   * Registers the futexfs filesystem type, mounts it into futex_mnt and
   * then initialises the chain and lock of every entry in futex_queues.
   *
   * Returns 0 on success.  If registration fails its error code is
   * returned; if the mount fails the filesystem type is unregistered
   * again and the error encoded in the returned pointer is reported.
   */
  static int __init init(void)
  {
  	int err;
  	int slot;

  	err = register_filesystem(&futex_fs_type);
  	if (err != 0)
  		return err;

  	futex_mnt = kern_mount(&futex_fs_type);
  	if (IS_ERR(futex_mnt)) {
  		/* Undo the registration before reporting the failure. */
  		unregister_filesystem(&futex_fs_type);
  		return PTR_ERR(futex_mnt);
  	}

  	for (slot = 0; slot < ARRAY_SIZE(futex_queues); slot++) {
  		/* The plist head is handed the address of the slot's lock. */
  		plist_head_init(&futex_queues[slot].chain,
  				&futex_queues[slot].lock);
  		spin_lock_init(&futex_queues[slot].lock);
  	}

  	return 0;
  }
  __initcall(init);