Commit c87e2837be82df479a6bae9f155c43516d2feebc

Authored by Ingo Molnar
Committed by Linus Torvalds
1 parent 0cdbee9920

[PATCH] pi-futex: futex_lock_pi/futex_unlock_pi support

This adds the actual pi-futex implementation, based on rt-mutexes.

[dino@in.ibm.com: fix an oops-causing race]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

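For orientation, a minimal userspace sketch of the protocol the new FUTEX_LOCK_PI/FUTEX_UNLOCK_PI ops serve; the helper names and the syscall plumbing are illustrative assumptions, not part of the patch. The uncontended paths are plain atomics on the futex word, and the kernel is entered only when the 0 -> TID (lock) or TID -> 0 (unlock) transition fails:

	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#define FUTEX_LOCK_PI	6	/* opcodes added by this patch */
	#define FUTEX_UNLOCK_PI	7

	static long sys_futex(uint32_t *uaddr, int op, uint32_t val, void *timeout)
	{
		return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
	}

	static void pi_lock(uint32_t *futex, uint32_t tid)
	{
		/* Fast path: 0 -> TID transition, no kernel involvement. */
		if (__sync_val_compare_and_swap(futex, 0, tid) == 0)
			return;
		/* Contended: the kernel queues us on the rt-mutex and does PI. */
		sys_futex(futex, FUTEX_LOCK_PI, 0, NULL);
	}

	static void pi_unlock(uint32_t *futex, uint32_t tid)
	{
		/* Fast path: TID -> 0 succeeds only while no flag bits are set. */
		if (__sync_val_compare_and_swap(futex, tid, 0) == tid)
			return;
		/* FUTEX_WAITERS (or FUTEX_OWNER_DIED) set: kernel hands the lock on. */
		sys_futex(futex, FUTEX_UNLOCK_PI, 0, NULL);
	}

Here tid must be the caller's kernel thread id (gettid()), since the kernel matches it against the TID field of the futex word on unlock.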
Showing 7 changed files with 828 additions and 41 deletions

include/linux/futex.h
... ... @@ -12,6 +12,9 @@
12 12 #define FUTEX_REQUEUE 3
13 13 #define FUTEX_CMP_REQUEUE 4
14 14 #define FUTEX_WAKE_OP 5
  15 +#define FUTEX_LOCK_PI 6
  16 +#define FUTEX_UNLOCK_PI 7
  17 +#define FUTEX_TRYLOCK_PI 8
15 18  
16 19 /*
17 20 * Support for robust futexes: the kernel cleans up held futexes at
18 21  
... ... @@ -97,8 +100,12 @@
97 100  
98 101 #ifdef CONFIG_FUTEX
99 102 extern void exit_robust_list(struct task_struct *curr);
  103 +extern void exit_pi_state_list(struct task_struct *curr);
100 104 #else
101 105 static inline void exit_robust_list(struct task_struct *curr)
  106 +{
  107 +}
  108 +static inline void exit_pi_state_list(struct task_struct *curr)
102 109 {
103 110 }
104 111 #endif
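For reading the hunks below: the PI code relies on the futex-word layout already defined elsewhere in linux/futex.h. The values are repeated here only for reference, assuming the header of this kernel:

	#define FUTEX_WAITERS		0x80000000	/* kernel-side waiters exist; owner must unlock via the kernel */
	#define FUTEX_OWNER_DIED	0x40000000	/* previous owner exited without unlocking */
	#define FUTEX_TID_MASK		0x3fffffff	/* TID of the current owner; 0 means unlocked */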
include/linux/sched.h
... ... @@ -84,6 +84,7 @@
84 84 #include <asm/processor.h>
85 85  
86 86 struct exec_domain;
  87 +struct futex_pi_state;
87 88  
88 89 /*
89 90 * List of flags we want to share for kernel threads,
... ... @@ -915,6 +916,8 @@
915 916 #ifdef CONFIG_COMPAT
916 917 struct compat_robust_list_head __user *compat_robust_list;
917 918 #endif
  919 + struct list_head pi_state_list;
  920 + struct futex_pi_state *pi_state_cache;
918 921  
919 922 atomic_t fs_excl; /* holding fs exclusive resources */
920 923 struct rcu_head rcu;
kernel/exit.c
... ... @@ -926,6 +926,14 @@
926 926 tsk->mempolicy = NULL;
927 927 #endif
928 928 /*
  929 + * This must happen late, after the PID is not
  930 + * hashed anymore:
  931 + */
  932 + if (unlikely(!list_empty(&tsk->pi_state_list)))
  933 + exit_pi_state_list(tsk);
  934 + if (unlikely(current->pi_state_cache))
  935 + kfree(current->pi_state_cache);
  936 + /*
929 937 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
930 938 */
931 939 mutex_debug_check_no_locks_held(tsk);
kernel/fork.c
... ... @@ -1092,6 +1092,9 @@
1092 1092 #ifdef CONFIG_COMPAT
1093 1093 p->compat_robust_list = NULL;
1094 1094 #endif
  1095 + INIT_LIST_HEAD(&p->pi_state_list);
  1096 + p->pi_state_cache = NULL;
  1097 +
1095 1098 /*
1096 1099 * sigaltstack should be cleared when sharing the same VM
1097 1100 */
kernel/futex.c
... ... @@ -12,6 +12,10 @@
12 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 14 *
  15 + * PI-futex support started by Ingo Molnar and Thomas Gleixner
  16 + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  17 + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  18 + *
15 19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
16 20 * enough at me, Linus for the original (flawed) idea, Matthew
17 21 * Kirkwood for proof-of-concept implementation.
... ... @@ -46,6 +50,8 @@
46 50 #include <linux/signal.h>
47 51 #include <asm/futex.h>
48 52  
  53 +#include "rtmutex_common.h"
  54 +
49 55 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
50 56  
51 57 /*
... ... @@ -75,6 +81,27 @@
75 81 };
76 82  
77 83 /*
  84 + * Priority Inheritance state:
  85 + */
  86 +struct futex_pi_state {
  87 + /*
  88 + * list of 'owned' pi_state instances - these have to be
  89 + * cleaned up in do_exit() if the task exits prematurely:
  90 + */
  91 + struct list_head list;
  92 +
  93 + /*
  94 + * The PI object:
  95 + */
  96 + struct rt_mutex pi_mutex;
  97 +
  98 + struct task_struct *owner;
  99 + atomic_t refcount;
  100 +
  101 + union futex_key key;
  102 +};
  103 +
  104 +/*
78 105 * We use this hashed waitqueue instead of a normal wait_queue_t, so
79 106 * we can wake only the relevant ones (hashed queues may be shared).
80 107 *
... ... @@ -96,6 +123,10 @@
96 123 /* For fd, sigio sent using these: */
97 124 int fd;
98 125 struct file *filp;
  126 +
  127 + /* Optional priority inheritance state: */
  128 + struct futex_pi_state *pi_state;
  129 + struct task_struct *task;
99 130 };
100 131  
101 132 /*
... ... @@ -259,6 +290,232 @@
259 290 }
260 291  
261 292 /*
  293 + * Fault handling. Called with current->mm->mmap_sem held.
  294 + */
  295 +static int futex_handle_fault(unsigned long address, int attempt)
  296 +{
  297 + struct vm_area_struct * vma;
  298 + struct mm_struct *mm = current->mm;
  299 +
  300 + if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
  301 + vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
  302 + return -EFAULT;
  303 +
  304 + switch (handle_mm_fault(mm, vma, address, 1)) {
  305 + case VM_FAULT_MINOR:
  306 + current->min_flt++;
  307 + break;
  308 + case VM_FAULT_MAJOR:
  309 + current->maj_flt++;
  310 + break;
  311 + default:
  312 + return -EFAULT;
  313 + }
  314 + return 0;
  315 +}
  316 +
  317 +/*
  318 + * PI code:
  319 + */
  320 +static int refill_pi_state_cache(void)
  321 +{
  322 + struct futex_pi_state *pi_state;
  323 +
  324 + if (likely(current->pi_state_cache))
  325 + return 0;
  326 +
  327 + pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
  328 +
  329 + if (!pi_state)
  330 + return -ENOMEM;
  331 +
  332 + memset(pi_state, 0, sizeof(*pi_state));
  333 + INIT_LIST_HEAD(&pi_state->list);
  334 + /* pi_mutex gets initialized later */
  335 + pi_state->owner = NULL;
  336 + atomic_set(&pi_state->refcount, 1);
  337 +
  338 + current->pi_state_cache = pi_state;
  339 +
  340 + return 0;
  341 +}
  342 +
  343 +static struct futex_pi_state * alloc_pi_state(void)
  344 +{
  345 + struct futex_pi_state *pi_state = current->pi_state_cache;
  346 +
  347 + WARN_ON(!pi_state);
  348 + current->pi_state_cache = NULL;
  349 +
  350 + return pi_state;
  351 +}
  352 +
  353 +static void free_pi_state(struct futex_pi_state *pi_state)
  354 +{
  355 + if (!atomic_dec_and_test(&pi_state->refcount))
  356 + return;
  357 +
  358 + /*
  359 + * If pi_state->owner is NULL, the owner is most probably dying
  360 + * and has cleaned up the pi_state already
  361 + */
  362 + if (pi_state->owner) {
  363 + spin_lock_irq(&pi_state->owner->pi_lock);
  364 + list_del_init(&pi_state->list);
  365 + spin_unlock_irq(&pi_state->owner->pi_lock);
  366 +
  367 + rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
  368 + }
  369 +
  370 + if (current->pi_state_cache)
  371 + kfree(pi_state);
  372 + else {
  373 + /*
  374 + * pi_state->list is already empty.
  375 + * clear pi_state->owner.
  376 + * refcount is at 0 - put it back to 1.
  377 + */
  378 + pi_state->owner = NULL;
  379 + atomic_set(&pi_state->refcount, 1);
  380 + current->pi_state_cache = pi_state;
  381 + }
  382 +}
  383 +
  384 +/*
  385 + * Look up the task based on what TID userspace gave us.
  386 + * We dont trust it.
  387 + */
  388 +static struct task_struct * futex_find_get_task(pid_t pid)
  389 +{
  390 + struct task_struct *p;
  391 +
  392 + read_lock(&tasklist_lock);
  393 + p = find_task_by_pid(pid);
  394 + if (!p)
  395 + goto out_unlock;
  396 + if ((current->euid != p->euid) && (current->euid != p->uid)) {
  397 + p = NULL;
  398 + goto out_unlock;
  399 + }
  400 + if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
  401 + p = NULL;
  402 + goto out_unlock;
  403 + }
  404 + get_task_struct(p);
  405 +out_unlock:
  406 + read_unlock(&tasklist_lock);
  407 +
  408 + return p;
  409 +}
  410 +
  411 +/*
  412 + * This task is holding PI mutexes at exit time => bad.
  413 + * Kernel cleans up PI-state, but userspace is likely hosed.
  414 + * (Robust-futex cleanup is separate and might save the day for userspace.)
  415 + */
  416 +void exit_pi_state_list(struct task_struct *curr)
  417 +{
  418 + struct futex_hash_bucket *hb;
  419 + struct list_head *next, *head = &curr->pi_state_list;
  420 + struct futex_pi_state *pi_state;
  421 + union futex_key key;
  422 +
  423 + /*
  424 + * We are a ZOMBIE and nobody can enqueue itself on
  425 + * pi_state_list anymore, but we have to be careful
  426 + * versus waiters unqueueing themselfs
  427 + */
  428 + spin_lock_irq(&curr->pi_lock);
  429 + while (!list_empty(head)) {
  430 +
  431 + next = head->next;
  432 + pi_state = list_entry(next, struct futex_pi_state, list);
  433 + key = pi_state->key;
  434 + spin_unlock_irq(&curr->pi_lock);
  435 +
  436 + hb = hash_futex(&key);
  437 + spin_lock(&hb->lock);
  438 +
  439 + spin_lock_irq(&curr->pi_lock);
  440 + if (head->next != next) {
  441 + spin_unlock(&hb->lock);
  442 + continue;
  443 + }
  444 +
  445 + list_del_init(&pi_state->list);
  446 +
  447 + WARN_ON(pi_state->owner != curr);
  448 +
  449 + pi_state->owner = NULL;
  450 + spin_unlock_irq(&curr->pi_lock);
  451 +
  452 + rt_mutex_unlock(&pi_state->pi_mutex);
  453 +
  454 + spin_unlock(&hb->lock);
  455 +
  456 + spin_lock_irq(&curr->pi_lock);
  457 + }
  458 + spin_unlock_irq(&curr->pi_lock);
  459 +}
  460 +
  461 +static int
  462 +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
  463 +{
  464 + struct futex_pi_state *pi_state = NULL;
  465 + struct futex_q *this, *next;
  466 + struct list_head *head;
  467 + struct task_struct *p;
  468 + pid_t pid;
  469 +
  470 + head = &hb->chain;
  471 +
  472 + list_for_each_entry_safe(this, next, head, list) {
  473 + if (match_futex (&this->key, &me->key)) {
  474 + /*
  475 + * Another waiter already exists - bump up
  476 + * the refcount and return its pi_state:
  477 + */
  478 + pi_state = this->pi_state;
  479 + atomic_inc(&pi_state->refcount);
  480 + me->pi_state = pi_state;
  481 +
  482 + return 0;
  483 + }
  484 + }
  485 +
  486 + /*
  487 + * We are the first waiter - try to look up the real owner and
  488 + * attach the new pi_state to it:
  489 + */
  490 + pid = uval & FUTEX_TID_MASK;
  491 + p = futex_find_get_task(pid);
  492 + if (!p)
  493 + return -ESRCH;
  494 +
  495 + pi_state = alloc_pi_state();
  496 +
  497 + /*
  498 + * Initialize the pi_mutex in locked state and make 'p'
  499 + * the owner of it:
  500 + */
  501 + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
  502 +
  503 + /* Store the key for possible exit cleanups: */
  504 + pi_state->key = me->key;
  505 +
  506 + spin_lock_irq(&p->pi_lock);
  507 + list_add(&pi_state->list, &p->pi_state_list);
  508 + pi_state->owner = p;
  509 + spin_unlock_irq(&p->pi_lock);
  510 +
  511 + put_task_struct(p);
  512 +
  513 + me->pi_state = pi_state;
  514 +
  515 + return 0;
  516 +}
  517 +
  518 +/*
262 519 * The hash bucket lock must be held when this is called.
263 520 * Afterwards, the futex_q must not be accessed.
264 521 */
... ... @@ -285,6 +542,70 @@
285 542 q->lock_ptr = NULL;
286 543 }
287 544  
  545 +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
  546 +{
  547 + struct task_struct *new_owner;
  548 + struct futex_pi_state *pi_state = this->pi_state;
  549 + u32 curval, newval;
  550 +
  551 + if (!pi_state)
  552 + return -EINVAL;
  553 +
  554 + new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
  555 +
  556 + /*
  557 + * This happens when we have stolen the lock and the original
  558 + * pending owner did not enqueue itself back on the rt_mutex.
  559 + * Thats not a tragedy. We know that way, that a lock waiter
  560 + * is on the fly. We make the futex_q waiter the pending owner.
  561 + */
  562 + if (!new_owner)
  563 + new_owner = this->task;
  564 +
  565 + /*
  566 + * We pass it to the next owner. (The WAITERS bit is always
  567 + * kept enabled while there is PI state around. We must also
  568 + * preserve the owner died bit.)
  569 + */
  570 + newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
  571 +
  572 + inc_preempt_count();
  573 + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
  574 + dec_preempt_count();
  575 +
  576 + if (curval == -EFAULT)
  577 + return -EFAULT;
  578 + if (curval != uval)
  579 + return -EINVAL;
  580 +
  581 + list_del_init(&pi_state->owner->pi_state_list);
  582 + list_add(&pi_state->list, &new_owner->pi_state_list);
  583 + pi_state->owner = new_owner;
  584 + rt_mutex_unlock(&pi_state->pi_mutex);
  585 +
  586 + return 0;
  587 +}
  588 +
  589 +static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
  590 +{
  591 + u32 oldval;
  592 +
  593 + /*
  594 + * There is no waiter, so we unlock the futex. The owner died
  595 + * bit has not to be preserved here. We are the owner:
  596 + */
  597 + inc_preempt_count();
  598 + oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
  599 + dec_preempt_count();
  600 +
  601 + if (oldval == -EFAULT)
  602 + return oldval;
  603 + if (oldval != uval)
  604 + return -EAGAIN;
  605 +
  606 + return 0;
  607 +}
  608 +
288 609 /*
289 610 * Wake up all waiters hashed on the physical page that is mapped
290 611 * to this virtual address:
... ... @@ -309,6 +630,8 @@
309 630  
310 631 list_for_each_entry_safe(this, next, head, list) {
311 632 if (match_futex (&this->key, &key)) {
  633 + if (this->pi_state)
  634 + return -EINVAL;
312 635 wake_futex(this);
313 636 if (++ret >= nr_wake)
314 637 break;
315 638  
... ... @@ -385,27 +708,9 @@
385 708 * still holding the mmap_sem.
386 709 */
387 710 if (attempt++) {
388   - struct vm_area_struct * vma;
389   - struct mm_struct *mm = current->mm;
390   - unsigned long address = (unsigned long)uaddr2;
391   -
392   - ret = -EFAULT;
393   - if (attempt >= 2 ||
394   - !(vma = find_vma(mm, address)) ||
395   - vma->vm_start > address ||
396   - !(vma->vm_flags & VM_WRITE))
  711 + if (futex_handle_fault((unsigned long)uaddr2,
  712 + attempt))
397 713 goto out;
398   -
399   - switch (handle_mm_fault(mm, vma, address, 1)) {
400   - case VM_FAULT_MINOR:
401   - current->min_flt++;
402   - break;
403   - case VM_FAULT_MAJOR:
404   - current->maj_flt++;
405   - break;
406   - default:
407   - goto out;
408   - }
409 714 goto retry;
410 715 }
411 716  
... ... @@ -572,6 +877,7 @@
572 877 static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
573 878 {
574 879 list_add_tail(&q->list, &hb->chain);
  880 + q->task = current;
575 881 spin_unlock(&hb->lock);
576 882 }
577 883  
... ... @@ -626,6 +932,9 @@
626 932 }
627 933 WARN_ON(list_empty(&q->list));
628 934 list_del(&q->list);
  935 +
  936 + BUG_ON(q->pi_state);
  937 +
629 938 spin_unlock(lock_ptr);
630 939 ret = 1;
631 940 }
632 941  
633 942  
634 943  
... ... @@ -634,16 +943,36 @@
634 943 return ret;
635 944 }
636 945  
  946 +/*
  947 + * PI futexes can not be requeued and must remove themself from the
  948 + * hash bucket. The hash bucket lock is held on entry and dropped here.
  949 + */
  950 +static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
  951 +{
  952 + WARN_ON(list_empty(&q->list));
  953 + list_del(&q->list);
  954 +
  955 + BUG_ON(!q->pi_state);
  956 + free_pi_state(q->pi_state);
  957 + q->pi_state = NULL;
  958 +
  959 + spin_unlock(&hb->lock);
  960 +
  961 + drop_key_refs(&q->key);
  962 +}
  963 +
637 964 static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
638 965 {
639   - DECLARE_WAITQUEUE(wait, current);
  966 + struct task_struct *curr = current;
  967 + DECLARE_WAITQUEUE(wait, curr);
640 968 struct futex_hash_bucket *hb;
641 969 struct futex_q q;
642 970 u32 uval;
643 971 int ret;
644 972  
  973 + q.pi_state = NULL;
645 974 retry:
646   - down_read(&current->mm->mmap_sem);
  975 + down_read(&curr->mm->mmap_sem);
647 976  
648 977 ret = get_futex_key(uaddr, &q.key);
649 978 if (unlikely(ret != 0))
... ... @@ -680,7 +1009,7 @@
680 1009 * If we would have faulted, release mmap_sem, fault it in and
681 1010 * start all over again.
682 1011 */
683   - up_read(&current->mm->mmap_sem);
  1012 + up_read(&curr->mm->mmap_sem);
684 1013  
685 1014 ret = get_user(uval, uaddr);
686 1015  
... ... @@ -688,11 +1017,9 @@
688 1017 goto retry;
689 1018 return ret;
690 1019 }
691   - if (uval != val) {
692   - ret = -EWOULDBLOCK;
693   - queue_unlock(&q, hb);
694   - goto out_release_sem;
695   - }
  1020 + ret = -EWOULDBLOCK;
  1021 + if (uval != val)
  1022 + goto out_unlock_release_sem;
696 1023  
697 1024 /* Only actually queue if *uaddr contained val. */
698 1025 __queue_me(&q, hb);
... ... @@ -700,8 +1027,8 @@
700 1027 /*
701 1028 * Now the futex is queued and we have checked the data, we
702 1029 * don't want to hold mmap_sem while we sleep.
703   - */
704   - up_read(&current->mm->mmap_sem);
  1030 + */
  1031 + up_read(&curr->mm->mmap_sem);
705 1032  
706 1033 /*
707 1034 * There might have been scheduling since the queue_me(), as we
708 1035  
709 1036  
710 1037  
... ... @@ -739,9 +1066,416 @@
739 1066 */
740 1067 return -EINTR;
741 1068  
  1069 + out_unlock_release_sem:
  1070 + queue_unlock(&q, hb);
  1071 +
742 1072 out_release_sem:
  1073 + up_read(&curr->mm->mmap_sem);
  1074 + return ret;
  1075 +}
  1076 +
  1077 +/*
  1078 + * Userspace tried a 0 -> TID atomic transition of the futex value
  1079 + * and failed. The kernel side here does the whole locking operation:
  1080 + * if there are waiters then it will block, it does PI, etc. (Due to
  1081 + * races the kernel might see a 0 value of the futex too.)
  1082 + */
  1083 +static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
  1084 + struct hrtimer_sleeper *to)
  1085 +{
  1086 + struct task_struct *curr = current;
  1087 + struct futex_hash_bucket *hb;
  1088 + u32 uval, newval, curval;
  1089 + struct futex_q q;
  1090 + int ret, attempt = 0;
  1091 +
  1092 + if (refill_pi_state_cache())
  1093 + return -ENOMEM;
  1094 +
  1095 + q.pi_state = NULL;
  1096 + retry:
  1097 + down_read(&curr->mm->mmap_sem);
  1098 +
  1099 + ret = get_futex_key(uaddr, &q.key);
  1100 + if (unlikely(ret != 0))
  1101 + goto out_release_sem;
  1102 +
  1103 + hb = queue_lock(&q, -1, NULL);
  1104 +
  1105 + retry_locked:
  1106 + /*
  1107 + * To avoid races, we attempt to take the lock here again
  1108 + * (by doing a 0 -> TID atomic cmpxchg), while holding all
  1109 + * the locks. It will most likely not succeed.
  1110 + */
  1111 + newval = current->pid;
  1112 +
  1113 + inc_preempt_count();
  1114 + curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
  1115 + dec_preempt_count();
  1116 +
  1117 + if (unlikely(curval == -EFAULT))
  1118 + goto uaddr_faulted;
  1119 +
  1120 + /* We own the lock already */
  1121 + if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
  1122 + if (!detect && 0)
  1123 + force_sig(SIGKILL, current);
  1124 + ret = -EDEADLK;
  1125 + goto out_unlock_release_sem;
  1126 + }
  1127 +
  1128 + /*
  1129 + * Surprise - we got the lock. Just return
  1130 + * to userspace:
  1131 + */
  1132 + if (unlikely(!curval))
  1133 + goto out_unlock_release_sem;
  1134 +
  1135 + uval = curval;
  1136 + newval = uval | FUTEX_WAITERS;
  1137 +
  1138 + inc_preempt_count();
  1139 + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
  1140 + dec_preempt_count();
  1141 +
  1142 + if (unlikely(curval == -EFAULT))
  1143 + goto uaddr_faulted;
  1144 + if (unlikely(curval != uval))
  1145 + goto retry_locked;
  1146 +
  1147 + /*
  1148 + * We dont have the lock. Look up the PI state (or create it if
  1149 + * we are the first waiter):
  1150 + */
  1151 + ret = lookup_pi_state(uval, hb, &q);
  1152 +
  1153 + if (unlikely(ret)) {
  1154 + /*
  1155 + * There were no waiters and the owner task lookup
  1156 + * failed. When the OWNER_DIED bit is set, then we
  1157 + * know that this is a robust futex and we actually
  1158 + * take the lock. This is safe as we are protected by
  1159 + * the hash bucket lock. We also set the waiters bit
  1160 + * unconditionally here, to simplify glibc handling of
  1161 + * multiple tasks racing to acquire the lock and
  1162 + * cleanup the problems which were left by the dead
  1163 + * owner.
  1164 + */
  1165 + if (curval & FUTEX_OWNER_DIED) {
  1166 + uval = newval;
  1167 + newval = current->pid |
  1168 + FUTEX_OWNER_DIED | FUTEX_WAITERS;
  1169 +
  1170 + inc_preempt_count();
  1171 + curval = futex_atomic_cmpxchg_inatomic(uaddr,
  1172 + uval, newval);
  1173 + dec_preempt_count();
  1174 +
  1175 + if (unlikely(curval == -EFAULT))
  1176 + goto uaddr_faulted;
  1177 + if (unlikely(curval != uval))
  1178 + goto retry_locked;
  1179 + ret = 0;
  1180 + }
  1181 + goto out_unlock_release_sem;
  1182 + }
  1183 +
  1184 + /*
  1185 + * Only actually queue now that the atomic ops are done:
  1186 + */
  1187 + __queue_me(&q, hb);
  1188 +
  1189 + /*
  1190 + * Now the futex is queued and we have checked the data, we
  1191 + * don't want to hold mmap_sem while we sleep.
  1192 + */
  1193 + up_read(&curr->mm->mmap_sem);
  1194 +
  1195 + WARN_ON(!q.pi_state);
  1196 + /*
  1197 + * Block on the PI mutex:
  1198 + */
  1199 + if (!trylock)
  1200 + ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
  1201 + else {
  1202 + ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
  1203 + /* Fixup the trylock return value: */
  1204 + ret = ret ? 0 : -EWOULDBLOCK;
  1205 + }
  1206 +
  1207 + down_read(&curr->mm->mmap_sem);
  1208 + hb = queue_lock(&q, -1, NULL);
  1209 +
  1210 + /*
  1211 + * Got the lock. We might not be the anticipated owner if we
  1212 + * did a lock-steal - fix up the PI-state in that case.
  1213 + */
  1214 + if (!ret && q.pi_state->owner != curr) {
  1215 + u32 newtid = current->pid | FUTEX_WAITERS;
  1216 +
  1217 + /* Owner died? */
  1218 + if (q.pi_state->owner != NULL) {
  1219 + spin_lock_irq(&q.pi_state->owner->pi_lock);
  1220 + list_del_init(&q.pi_state->list);
  1221 + spin_unlock_irq(&q.pi_state->owner->pi_lock);
  1222 + } else
  1223 + newtid |= FUTEX_OWNER_DIED;
  1224 +
  1225 + q.pi_state->owner = current;
  1226 +
  1227 + spin_lock_irq(&current->pi_lock);
  1228 + list_add(&q.pi_state->list, &current->pi_state_list);
  1229 + spin_unlock_irq(&current->pi_lock);
  1230 +
  1231 + /* Unqueue and drop the lock */
  1232 + unqueue_me_pi(&q, hb);
  1233 + up_read(&curr->mm->mmap_sem);
  1234 + /*
  1235 + * We own it, so we have to replace the pending owner
  1236 + * TID. This must be atomic as we have preserve the
  1237 + * owner died bit here.
  1238 + */
  1239 + ret = get_user(uval, uaddr);
  1240 + while (!ret) {
  1241 + newval = (uval & FUTEX_OWNER_DIED) | newtid;
  1242 + curval = futex_atomic_cmpxchg_inatomic(uaddr,
  1243 + uval, newval);
  1244 + if (curval == -EFAULT)
  1245 + ret = -EFAULT;
  1246 + if (curval == uval)
  1247 + break;
  1248 + uval = curval;
  1249 + }
  1250 + } else {
  1251 + /*
  1252 + * Catch the rare case, where the lock was released
  1253 + * when we were on the way back before we locked
  1254 + * the hash bucket.
  1255 + */
  1256 + if (ret && q.pi_state->owner == curr) {
  1257 + if (rt_mutex_trylock(&q.pi_state->pi_mutex))
  1258 + ret = 0;
  1259 + }
  1260 + /* Unqueue and drop the lock */
  1261 + unqueue_me_pi(&q, hb);
  1262 + up_read(&curr->mm->mmap_sem);
  1263 + }
  1264 +
  1265 + if (!detect && ret == -EDEADLK && 0)
  1266 + force_sig(SIGKILL, current);
  1267 +
  1268 + return ret;
  1269 +
  1270 + out_unlock_release_sem:
  1271 + queue_unlock(&q, hb);
  1272 +
  1273 + out_release_sem:
  1274 + up_read(&curr->mm->mmap_sem);
  1275 + return ret;
  1276 +
  1277 + uaddr_faulted:
  1278 + /*
  1279 + * We have to r/w *(int __user *)uaddr, but we can't modify it
  1280 + * non-atomically. Therefore, if get_user below is not
  1281 + * enough, we need to handle the fault ourselves, while
  1282 + * still holding the mmap_sem.
  1283 + */
  1284 + if (attempt++) {
  1285 + if (futex_handle_fault((unsigned long)uaddr, attempt))
  1286 + goto out_unlock_release_sem;
  1287 +
  1288 + goto retry_locked;
  1289 + }
  1290 +
  1291 + queue_unlock(&q, hb);
  1292 + up_read(&curr->mm->mmap_sem);
  1293 +
  1294 + ret = get_user(uval, uaddr);
  1295 + if (!ret && (uval != -EFAULT))
  1296 + goto retry;
  1297 +
  1298 + return ret;
  1299 +}
  1300 +
  1301 +/*
  1302 + * Restart handler
  1303 + */
  1304 +static long futex_lock_pi_restart(struct restart_block *restart)
  1305 +{
  1306 + struct hrtimer_sleeper timeout, *to = NULL;
  1307 + int ret;
  1308 +
  1309 + restart->fn = do_no_restart_syscall;
  1310 +
  1311 + if (restart->arg2 || restart->arg3) {
  1312 + to = &timeout;
  1313 + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
  1314 + hrtimer_init_sleeper(to, current);
  1315 + to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
  1316 + (u64) restart->arg0;
  1317 + }
  1318 +
  1319 + pr_debug("lock_pi restart: %p, %d (%d)\n",
  1320 + (u32 __user *)restart->arg0, current->pid);
  1321 +
  1322 + ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
  1323 + 0, to);
  1324 +
  1325 + if (ret != -EINTR)
  1326 + return ret;
  1327 +
  1328 + restart->fn = futex_lock_pi_restart;
  1329 +
  1330 + /* The other values are filled in */
  1331 + return -ERESTART_RESTARTBLOCK;
  1332 +}
  1333 +
  1334 +/*
  1335 + * Called from the syscall entry below.
  1336 + */
  1337 +static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
  1338 + long nsec, int trylock)
  1339 +{
  1340 + struct hrtimer_sleeper timeout, *to = NULL;
  1341 + struct restart_block *restart;
  1342 + int ret;
  1343 +
  1344 + if (sec != MAX_SCHEDULE_TIMEOUT) {
  1345 + to = &timeout;
  1346 + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
  1347 + hrtimer_init_sleeper(to, current);
  1348 + to->timer.expires = ktime_set(sec, nsec);
  1349 + }
  1350 +
  1351 + ret = do_futex_lock_pi(uaddr, detect, trylock, to);
  1352 +
  1353 + if (ret != -EINTR)
  1354 + return ret;
  1355 +
  1356 + pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
  1357 +
  1358 + restart = &current_thread_info()->restart_block;
  1359 + restart->fn = futex_lock_pi_restart;
  1360 + restart->arg0 = (unsigned long) uaddr;
  1361 + restart->arg1 = detect;
  1362 + if (to) {
  1363 + restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
  1364 + restart->arg3 = to->timer.expires.tv64 >> 32;
  1365 + } else
  1366 + restart->arg2 = restart->arg3 = 0;
  1367 +
  1368 + return -ERESTART_RESTARTBLOCK;
  1369 +}
  1370 +
  1371 +/*
  1372 + * Userspace attempted a TID -> 0 atomic transition, and failed.
  1373 + * This is the in-kernel slowpath: we look up the PI state (if any),
  1374 + * and do the rt-mutex unlock.
  1375 + */
  1376 +static int futex_unlock_pi(u32 __user *uaddr)
  1377 +{
  1378 + struct futex_hash_bucket *hb;
  1379 + struct futex_q *this, *next;
  1380 + u32 uval;
  1381 + struct list_head *head;
  1382 + union futex_key key;
  1383 + int ret, attempt = 0;
  1384 +
  1385 +retry:
  1386 + if (get_user(uval, uaddr))
  1387 + return -EFAULT;
  1388 + /*
  1389 + * We release only a lock we actually own:
  1390 + */
  1391 + if ((uval & FUTEX_TID_MASK) != current->pid)
  1392 + return -EPERM;
  1393 + /*
  1394 + * First take all the futex related locks:
  1395 + */
  1396 + down_read(&current->mm->mmap_sem);
  1397 +
  1398 + ret = get_futex_key(uaddr, &key);
  1399 + if (unlikely(ret != 0))
  1400 + goto out;
  1401 +
  1402 + hb = hash_futex(&key);
  1403 + spin_lock(&hb->lock);
  1404 +
  1405 +retry_locked:
  1406 + /*
  1407 + * To avoid races, try to do the TID -> 0 atomic transition
  1408 + * again. If it succeeds then we can return without waking
  1409 + * anyone else up:
  1410 + */
  1411 + inc_preempt_count();
  1412 + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
  1413 + dec_preempt_count();
  1414 +
  1415 + if (unlikely(uval == -EFAULT))
  1416 + goto pi_faulted;
  1417 + /*
  1418 + * Rare case: we managed to release the lock atomically,
  1419 + * no need to wake anyone else up:
  1420 + */
  1421 + if (unlikely(uval == current->pid))
  1422 + goto out_unlock;
  1423 +
  1424 + /*
  1425 + * Ok, other tasks may need to be woken up - check waiters
  1426 + * and do the wakeup if necessary:
  1427 + */
  1428 + head = &hb->chain;
  1429 +
  1430 + list_for_each_entry_safe(this, next, head, list) {
  1431 + if (!match_futex (&this->key, &key))
  1432 + continue;
  1433 + ret = wake_futex_pi(uaddr, uval, this);
  1434 + /*
  1435 + * The atomic access to the futex value
  1436 + * generated a pagefault, so retry the
  1437 + * user-access and the wakeup:
  1438 + */
  1439 + if (ret == -EFAULT)
  1440 + goto pi_faulted;
  1441 + goto out_unlock;
  1442 + }
  1443 + /*
  1444 + * No waiters - kernel unlocks the futex:
  1445 + */
  1446 + ret = unlock_futex_pi(uaddr, uval);
  1447 + if (ret == -EFAULT)
  1448 + goto pi_faulted;
  1449 +
  1450 +out_unlock:
  1451 + spin_unlock(&hb->lock);
  1452 +out:
743 1453 up_read(&current->mm->mmap_sem);
  1454 +
744 1455 return ret;
  1456 +
  1457 +pi_faulted:
  1458 + /*
  1459 + * We have to r/w *(int __user *)uaddr, but we can't modify it
  1460 + * non-atomically. Therefore, if get_user below is not
  1461 + * enough, we need to handle the fault ourselves, while
  1462 + * still holding the mmap_sem.
  1463 + */
  1464 + if (attempt++) {
  1465 + if (futex_handle_fault((unsigned long)uaddr, attempt))
  1466 + goto out_unlock;
  1467 +
  1468 + goto retry_locked;
  1469 + }
  1470 +
  1471 + spin_unlock(&hb->lock);
  1472 + up_read(&current->mm->mmap_sem);
  1473 +
  1474 + ret = get_user(uval, uaddr);
  1475 + if (!ret && (uval != -EFAULT))
  1476 + goto retry;
  1477 +
  1478 + return ret;
745 1479 }
746 1480  
747 1481 static int futex_close(struct inode *inode, struct file *filp)
... ... @@ -819,6 +1553,7 @@
819 1553 err = -ENOMEM;
820 1554 goto error;
821 1555 }
  1556 + q->pi_state = NULL;
822 1557  
823 1558 down_read(&current->mm->mmap_sem);
824 1559 err = get_futex_key(uaddr, &q->key);
... ... @@ -856,7 +1591,7 @@
856 1591 * Implementation: user-space maintains a per-thread list of locks it
857 1592 * is holding. Upon do_exit(), the kernel carefully walks this list,
858 1593 * and marks all locks that are owned by this thread with the
859   - * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
  1594 + * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
860 1595 * always manipulated with the lock held, so the list is private and
861 1596 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
862 1597 * field, to allow the kernel to clean up if the thread dies after
... ... @@ -931,7 +1666,7 @@
931 1666 */
932 1667 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
933 1668 {
934   - u32 uval;
  1669 + u32 uval, nval;
935 1670  
936 1671 retry:
937 1672 if (get_user(uval, uaddr))
... ... @@ -948,8 +1683,12 @@
948 1683 * thread-death.) The rest of the cleanup is done in
949 1684 * userspace.
950 1685 */
951   - if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
952   - uval | FUTEX_OWNER_DIED) != uval)
  1686 + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
  1687 + uval | FUTEX_OWNER_DIED);
  1688 + if (nval == -EFAULT)
  1689 + return -1;
  1690 +
  1691 + if (nval != uval)
953 1692 goto retry;
954 1693  
955 1694 if (uval & FUTEX_WAITERS)
... ... @@ -994,7 +1733,7 @@
994 1733 while (entry != &head->list) {
995 1734 /*
996 1735 * A pending lock might already be on the list, so
997   - * dont process it twice:
  1736 + * don't process it twice:
998 1737 */
999 1738 if (entry != pending)
1000 1739 if (handle_futex_death((void *)entry + futex_offset,
... ... @@ -1040,6 +1779,15 @@
1040 1779 case FUTEX_WAKE_OP:
1041 1780 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
1042 1781 break;
  1782 + case FUTEX_LOCK_PI:
  1783 + ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
  1784 + break;
  1785 + case FUTEX_UNLOCK_PI:
  1786 + ret = futex_unlock_pi(uaddr);
  1787 + break;
  1788 + case FUTEX_TRYLOCK_PI:
  1789 + ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
  1790 + break;
1043 1791 default:
1044 1792 ret = -ENOSYS;
1045 1793 }
1046 1794  
1047 1795  
... ... @@ -1055,17 +1803,22 @@
1055 1803 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
1056 1804 u32 val2 = 0;
1057 1805  
1058   - if (utime && (op == FUTEX_WAIT)) {
  1806 + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
1059 1807 if (copy_from_user(&t, utime, sizeof(t)) != 0)
1060 1808 return -EFAULT;
1061 1809 if (!timespec_valid(&t))
1062 1810 return -EINVAL;
1063   - timeout = timespec_to_jiffies(&t) + 1;
  1811 + if (op == FUTEX_WAIT)
  1812 + timeout = timespec_to_jiffies(&t) + 1;
  1813 + else {
  1814 + timeout = t.tv_sec;
  1815 + val2 = t.tv_nsec;
  1816 + }
1064 1817 }
1065 1818 /*
1066 1819 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
1067 1820 */
1068   - if (op >= FUTEX_REQUEUE)
  1821 + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
1069 1822 val2 = (u32) (unsigned long) utime;
1070 1823  
1071 1824 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
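Note the timeout handling in the last hunk above: for FUTEX_LOCK_PI the timespec is not converted to relative jiffies as for FUTEX_WAIT, but split into tv_sec/tv_nsec and armed as an absolute CLOCK_REALTIME hrtimer. A hypothetical timed acquisition, reusing the illustrative futex_word and includes from the sketch near the top:

	struct timespec deadline;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 1;		/* absolute expiry, one second from now */

	if (syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI, 0 /* detect */,
		    &deadline, NULL, 0) < 0)
		perror("FUTEX_LOCK_PI");	/* e.g. ETIMEDOUT or EDEADLK */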
kernel/futex_compat.c
... ... @@ -129,14 +129,19 @@
129 129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 130 int val2 = 0;
131 131  
132   - if (utime && (op == FUTEX_WAIT)) {
  132 + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
133 133 if (get_compat_timespec(&t, utime))
134 134 return -EFAULT;
135 135 if (!timespec_valid(&t))
136 136 return -EINVAL;
137   - timeout = timespec_to_jiffies(&t) + 1;
  137 + if (op == FUTEX_WAIT)
  138 + timeout = timespec_to_jiffies(&t) + 1;
  139 + else {
  140 + timeout = t.tv_sec;
  141 + val2 = t.tv_nsec;
  142 + }
138 143 }
139   - if (op >= FUTEX_REQUEUE)
  144 + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
140 145 val2 = (int) (unsigned long) utime;
141 146  
142 147 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
kernel/rtmutex_common.h
... ... @@ -112,5 +112,13 @@
112 112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113 113 }
114 114  
  115 +/*
  116 + * PI-futex support (proxy locking functions, etc.):
  117 + */
  118 +extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
  119 +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
  120 + struct task_struct *proxy_owner);
  121 +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
  122 + struct task_struct *proxy_owner);
115 123 #endif