Commit 52400ba946759af28442dee6265c5c0180ac7122

Authored by Darren Hart
Committed by Thomas Gleixner
1 parent f801073f87

futex: add requeue_pi functionality

PI futexes and their underlying rt_mutex cannot be left ownerless if
there are pending waiters, as that would break the PI boosting logic,
so the standard requeue commands aren't sufficient.  The new commands
properly manage pi futex ownership by ensuring a futex with waiters
has an owner at all times.  This will allow glibc to properly handle
PI mutexes with pthread condvars.

The approach taken here is to create two new futex op codes:

FUTEX_WAIT_REQUEUE_PI:
Tasks will use this op code to wait on a futex (such as a non-pi waitqueue)
and wake after they have been requeued to a pi futex.  Prior to returning to
userspace, they will acquire this pi futex (and the underlying rt_mutex).

futex_wait_requeue_pi() is the result of a high speed collision between
futex_wait() and futex_lock_pi() (with the first part of futex_lock_pi() being
done by futex_proxy_trylock_atomic() on behalf of the top_waiter).
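
As a rough illustration only (not part of this patch), the waiter side of a
condvar built on this op code might look like the sketch below.  The futex()
wrapper, cond_wait(), cond and pi_mutex are invented names; error handling,
the FUTEX_PRIVATE_FLAG variants, and the sequence/race handling a real condvar
needs are all omitted, and linux/futex.h is assumed to define the new op codes:

  #include <linux/futex.h>      /* FUTEX_WAIT_REQUEUE_PI, FUTEX_UNLOCK_PI */
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <stdint.h>
  #include <time.h>

  static long futex(uint32_t *uaddr, int op, uint32_t val,
                    const struct timespec *timeout,
                    uint32_t *uaddr2, uint32_t val3)
  {
          return syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3);
  }

  /* Wait on the non-pi futex 'cond'; the kernel requeues us to the pi futex
   * 'pi_mutex' and acquires it on our behalf before we return.  'pi_mutex'
   * must be the same uaddr2 the waker passes to FUTEX_CMP_REQUEUE_PI. */
  static int cond_wait(uint32_t *cond, uint32_t *pi_mutex)
  {
          uint32_t seq = *cond;   /* value 'cond' is expected to still hold */

          /* Release the pi mutex; real code typically tries the userspace
           * TID -> 0 cmpxchg fast path before falling back to this call. */
          futex(pi_mutex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);

          /* Block until requeued to and granted 'pi_mutex'.  val3 is ignored
           * here; the kernel uses FUTEX_BITSET_MATCH_ANY for this op. */
          return (int)futex(cond, FUTEX_WAIT_REQUEUE_PI, seq,
                            NULL /* no timeout */, pi_mutex, 0);
  }

On success (0), the caller owns pi_mutex exactly as if it had returned from
FUTEX_LOCK_PI.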

FUTEX_REQUEUE_PI (and FUTEX_CMP_REQUEUE_PI):
This call must be used to wake tasks waiting with FUTEX_WAIT_REQUEUE_PI,
regardless of how many tasks the caller intends to wake or requeue.
pthread_cond_broadcast() should call this with nr_wake=1 and
nr_requeue=INT_MAX.  pthread_cond_signal() should call this with nr_wake=1 and
nr_requeue=0.  The reason is that both callers need to get the benefit of the
futex_proxy_trylock_atomic() routine.  futex_requeue() also enqueues the
top_waiter on the rt_mutex via rt_mutex_start_proxy_lock().
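
On the wake side, a similarly hedged sketch of how pthread_cond_signal() and
pthread_cond_broadcast() might drive FUTEX_CMP_REQUEUE_PI; cond_wake() and its
parameters are again invented for illustration, and the raw futex() wrapper is
the same six-argument one used in the waiter sketch above:

  #include <linux/futex.h>      /* FUTEX_CMP_REQUEUE_PI */
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <stdint.h>
  #include <limits.h>           /* INT_MAX */
  #include <time.h>

  static long futex(uint32_t *uaddr, int op, uint32_t val,
                    const struct timespec *timeout,
                    uint32_t *uaddr2, uint32_t val3)
  {
          return syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3);
  }

  /* Wake one waiter; for broadcast, requeue the remainder onto the pi futex.
   * nr_wake is 1 in both cases so that futex_proxy_trylock_atomic() gets a
   * chance to hand the lock to the top waiter atomically. */
  static int cond_wake(uint32_t *cond, uint32_t *pi_mutex, int broadcast)
  {
          uint32_t cmpval = *cond;        /* val3: expected value of 'cond' */
          long nr_requeue = broadcast ? INT_MAX : 0;

          /* nr_requeue travels in the 'utime' argument slot of the syscall. */
          return (int)futex(cond, FUTEX_CMP_REQUEUE_PI, 1 /* nr_wake */,
                            (const struct timespec *)nr_requeue,
                            pi_mutex, cmpval);
  }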

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Showing 3 changed files with 510 additions and 20 deletions

include/linux/futex.h
... ... @@ -23,6 +23,9 @@
23 23 #define FUTEX_TRYLOCK_PI 8
24 24 #define FUTEX_WAIT_BITSET 9
25 25 #define FUTEX_WAKE_BITSET 10
  26 +#define FUTEX_WAIT_REQUEUE_PI 11
  27 +#define FUTEX_REQUEUE_PI 12
  28 +#define FUTEX_CMP_REQUEUE_PI 13
26 29  
27 30 #define FUTEX_PRIVATE_FLAG 128
28 31 #define FUTEX_CLOCK_REALTIME 256
... ... @@ -38,6 +41,11 @@
38 41 #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
39 42 #define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG)
40 43 #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG)
  44 +#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \
  45 + FUTEX_PRIVATE_FLAG)
  46 +#define FUTEX_REQUEUE_PI_PRIVATE (FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG)
  47 +#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
  48 + FUTEX_PRIVATE_FLAG)
41 49  
42 50 /*
43 51 * Support for robust futexes: the kernel cleans up held futexes at
include/linux/thread_info.h
... ... @@ -21,13 +21,14 @@
21 21 struct {
22 22 unsigned long arg0, arg1, arg2, arg3;
23 23 };
24   - /* For futex_wait */
  24 + /* For futex_wait and futex_wait_requeue_pi */
25 25 struct {
26 26 u32 *uaddr;
27 27 u32 val;
28 28 u32 flags;
29 29 u32 bitset;
30 30 u64 time;
  31 + u32 *uaddr2;
31 32 } futex;
32 33 /* For nanosleep */
33 34 struct {
kernel/futex.c
... ... @@ -19,6 +19,10 @@
19 19 * PRIVATE futexes by Eric Dumazet
20 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 21 *
  22 + * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
  23 + * Copyright (C) IBM Corporation, 2009
  24 + * Thanks to Thomas Gleixner for conceptual design and careful reviews.
  25 + *
22 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
23 27 * enough at me, Linus for the original (flawed) idea, Matthew
24 28 * Kirkwood for proof-of-concept implementation.
... ... @@ -109,6 +113,9 @@
109 113 struct futex_pi_state *pi_state;
110 114 struct task_struct *task;
111 115  
  116 + /* rt_waiter storage for requeue_pi: */
  117 + struct rt_mutex_waiter *rt_waiter;
  118 +
112 119 /* Bitset for the optional bitmasked wakeup */
113 120 u32 bitset;
114 121 };
... ... @@ -827,7 +834,7 @@
827 834  
828 835 plist_for_each_entry_safe(this, next, head, list) {
829 836 if (match_futex (&this->key, &key)) {
830   - if (this->pi_state) {
  837 + if (this->pi_state || this->rt_waiter) {
831 838 ret = -EINVAL;
832 839 break;
833 840 }
... ... @@ -968,20 +975,138 @@
968 975 q->key = *key2;
969 976 }
970 977  
971   -/*
972   - * Requeue all waiters hashed on one physical page to another
973   - * physical page.
  978 +/**
  979 + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
  980 + * @q: the futex_q
  981 + * @key: the key of the requeue target futex
  982 + *
  983 + * During futex_requeue, with requeue_pi=1, it is possible to acquire the
  984 + * target futex if it is uncontended or via a lock steal. Set the futex_q key
  985 + * to the requeue target futex so the waiter can detect the wakeup on the right
  986 + * futex, but remove it from the hb and NULL the rt_waiter so it can detect
  987 + * atomic lock acquisition. Must be called with the q->lock_ptr held.
974 988 */
  989 +static inline
  990 +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
  991 +{
  992 + drop_futex_key_refs(&q->key);
  993 + get_futex_key_refs(key);
  994 + q->key = *key;
  995 +
  996 + WARN_ON(plist_node_empty(&q->list));
  997 + plist_del(&q->list, &q->list.plist);
  998 +
  999 + WARN_ON(!q->rt_waiter);
  1000 + q->rt_waiter = NULL;
  1001 +
  1002 + wake_up(&q->waiter);
  1003 +}
  1004 +
  1005 +/**
  1006 + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
  1007 + * @pifutex: the user address of the to futex
  1008 + * @hb1: the from futex hash bucket, must be locked by the caller
  1009 + * @hb2: the to futex hash bucket, must be locked by the caller
  1010 + * @key1: the from futex key
  1011 + * @key2: the to futex key
  1012 + *
  1013 + * Try and get the lock on behalf of the top waiter if we can do it atomically.
  1014 + * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller.
  1015 + *
  1016 + * Returns:
  1017 + * 0 - failed to acquire the lock atomically
  1018 + * 1 - acquired the lock
  1019 + * <0 - error
  1020 + */
  1021 +static int futex_proxy_trylock_atomic(u32 __user *pifutex,
  1022 + struct futex_hash_bucket *hb1,
  1023 + struct futex_hash_bucket *hb2,
  1024 + union futex_key *key1, union futex_key *key2,
  1025 + struct futex_pi_state **ps)
  1026 +{
  1027 + struct futex_q *top_waiter;
  1028 + u32 curval;
  1029 + int ret;
  1030 +
  1031 + if (get_futex_value_locked(&curval, pifutex))
  1032 + return -EFAULT;
  1033 +
  1034 + top_waiter = futex_top_waiter(hb1, key1);
  1035 +
  1036 + /* There are no waiters, nothing for us to do. */
  1037 + if (!top_waiter)
  1038 + return 0;
  1039 +
  1040 + /*
  1041 + * Either take the lock for top_waiter or set the FUTEX_WAITERS bit.
  1042 + * The pi_state is returned in ps in contended cases.
  1043 + */
  1044 + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task);
  1045 + if (ret == 1)
  1046 + requeue_pi_wake_futex(top_waiter, key2);
  1047 +
  1048 + return ret;
  1049 +}
  1050 +
  1051 +/**
  1052 + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
  1053 + * @uaddr1: source futex user address
  1054 + * @uaddr2: target futex user address
  1055 + * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
  1056 + * @nr_requeue: number of waiters to requeue (0-INT_MAX)
  1057 + * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
  1058 + * pi futex (pi to pi requeue is not supported)
  1059 + *
  1060 + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
  1061 + * uaddr2 atomically on behalf of the top waiter.
  1062 + *
  1063 + * Returns:
  1064 + * >=0 - on success, the number of tasks requeued or woken
  1065 + * <0 - on error
  1066 + */
975 1067 static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
976   - int nr_wake, int nr_requeue, u32 *cmpval)
  1068 + int nr_wake, int nr_requeue, u32 *cmpval,
  1069 + int requeue_pi)
977 1070 {
978 1071 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
  1072 + int drop_count = 0, task_count = 0, ret;
  1073 + struct futex_pi_state *pi_state = NULL;
979 1074 struct futex_hash_bucket *hb1, *hb2;
980 1075 struct plist_head *head1;
981 1076 struct futex_q *this, *next;
982   - int ret, drop_count = 0;
  1077 + u32 curval2;
983 1078  
  1079 + if (requeue_pi) {
  1080 + /*
  1081 + * requeue_pi requires a pi_state, try to allocate it now
  1082 + * without any locks in case it fails.
  1083 + */
  1084 + if (refill_pi_state_cache())
  1085 + return -ENOMEM;
  1086 + /*
  1087 + * requeue_pi must wake as many tasks as it can, up to nr_wake
  1088 + * + nr_requeue, since it acquires the rt_mutex prior to
  1089 + * returning to userspace, so as to not leave the rt_mutex with
  1090 + * waiters and no owner. However, second and third wake-ups
  1091 + * cannot be predicted as they involve race conditions with the
  1092 + * first wake and a fault while looking up the pi_state. Both
  1093 + * pthread_cond_signal() and pthread_cond_broadcast() should
  1094 + * use nr_wake=1.
  1095 + */
  1096 + if (nr_wake != 1)
  1097 + return -EINVAL;
  1098 + }
  1099 +
984 1100 retry:
  1101 + if (pi_state != NULL) {
  1102 + /*
  1103 + * We will have to lookup the pi_state again, so free this one
  1104 + * to keep the accounting correct.
  1105 + */
  1106 + free_pi_state(pi_state);
  1107 + pi_state = NULL;
  1108 + }
  1109 +
985 1110 ret = get_futex_key(uaddr1, fshared, &key1);
986 1111 if (unlikely(ret != 0))
987 1112 goto out;
... ... @@ -1020,19 +1145,94 @@
1020 1145 }
1021 1146 }
1022 1147  
  1148 + if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
  1149 + /* Attempt to acquire uaddr2 and wake the top_waiter. */
  1150 + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
  1151 + &key2, &pi_state);
  1152 +
  1153 + /*
  1154 + * At this point the top_waiter has either taken uaddr2 or is
  1155 + * waiting on it. If the former, then the pi_state will not
  1156 + * exist yet, look it up one more time to ensure we have a
  1157 + * reference to it.
  1158 + */
  1159 + if (ret == 1) {
  1160 + WARN_ON(pi_state);
  1161 + task_count++;
  1162 + ret = get_futex_value_locked(&curval2, uaddr2);
  1163 + if (!ret)
  1164 + ret = lookup_pi_state(curval2, hb2, &key2,
  1165 + &pi_state);
  1166 + }
  1167 +
  1168 + switch (ret) {
  1169 + case 0:
  1170 + break;
  1171 + case -EFAULT:
  1172 + double_unlock_hb(hb1, hb2);
  1173 + put_futex_key(fshared, &key2);
  1174 + put_futex_key(fshared, &key1);
  1175 + ret = get_user(curval2, uaddr2);
  1176 + if (!ret)
  1177 + goto retry;
  1178 + goto out;
  1179 + case -EAGAIN:
  1180 + /* The owner was exiting, try again. */
  1181 + double_unlock_hb(hb1, hb2);
  1182 + put_futex_key(fshared, &key2);
  1183 + put_futex_key(fshared, &key1);
  1184 + cond_resched();
  1185 + goto retry;
  1186 + default:
  1187 + goto out_unlock;
  1188 + }
  1189 + }
  1190 +
1023 1191 head1 = &hb1->chain;
1024 1192 plist_for_each_entry_safe(this, next, head1, list) {
1025   - if (!match_futex (&this->key, &key1))
  1193 + if (task_count - nr_wake >= nr_requeue)
  1194 + break;
  1195 +
  1196 + if (!match_futex(&this->key, &key1))
1026 1197 continue;
1027   - if (++ret <= nr_wake) {
  1198 +
  1199 + WARN_ON(!requeue_pi && this->rt_waiter);
  1200 + WARN_ON(requeue_pi && !this->rt_waiter);
  1201 +
  1202 + /*
  1203 + * Wake nr_wake waiters. For requeue_pi, if we acquired the
  1204 + * lock, we already woke the top_waiter. If not, it will be
  1205 + * woken by futex_unlock_pi().
  1206 + */
  1207 + if (++task_count <= nr_wake && !requeue_pi) {
1028 1208 wake_futex(this);
1029   - } else {
1030   - requeue_futex(this, hb1, hb2, &key2);
1031   - drop_count++;
  1209 + continue;
  1210 + }
1032 1211  
1033   - if (ret - nr_wake >= nr_requeue)
1034   - break;
  1212 + /*
  1213 + * Requeue nr_requeue waiters and possibly one more in the case
  1214 + * of requeue_pi if we couldn't acquire the lock atomically.
  1215 + */
  1216 + if (requeue_pi) {
  1217 + /* Prepare the waiter to take the rt_mutex. */
  1218 + atomic_inc(&pi_state->refcount);
  1219 + this->pi_state = pi_state;
  1220 + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
  1221 + this->rt_waiter,
  1222 + this->task, 1);
  1223 + if (ret == 1) {
  1224 + /* We got the lock. */
  1225 + requeue_pi_wake_futex(this, &key2);
  1226 + continue;
  1227 + } else if (ret) {
  1228 + /* -EDEADLK */
  1229 + this->pi_state = NULL;
  1230 + free_pi_state(pi_state);
  1231 + goto out_unlock;
  1232 + }
1035 1233 }
  1234 + requeue_futex(this, hb1, hb2, &key2);
  1235 + drop_count++;
1036 1236 }
1037 1237  
1038 1238 out_unlock:
... ... @@ -1047,7 +1247,9 @@
1047 1247 out_put_key1:
1048 1248 put_futex_key(fshared, &key1);
1049 1249 out:
1050   - return ret;
  1250 + if (pi_state != NULL)
  1251 + free_pi_state(pi_state);
  1252 + return ret ? ret : task_count;
1051 1253 }
1052 1254  
1053 1255 /* The key must be already stored in q->key. */
... ... @@ -1270,6 +1472,7 @@
1270 1472 #define FLAGS_HAS_TIMEOUT 0x04
1271 1473  
1272 1474 static long futex_wait_restart(struct restart_block *restart);
  1475 +static long futex_lock_pi_restart(struct restart_block *restart);
1273 1476  
1274 1477 /**
1275 1478 * fixup_owner() - Post lock pi_state and corner case management
... ... @@ -1489,6 +1692,7 @@
1489 1692  
1490 1693 q.pi_state = NULL;
1491 1694 q.bitset = bitset;
  1695 + q.rt_waiter = NULL;
1492 1696  
1493 1697 if (abs_time) {
1494 1698 to = &timeout;
... ... @@ -1596,6 +1800,7 @@
1596 1800 }
1597 1801  
1598 1802 q.pi_state = NULL;
  1803 + q.rt_waiter = NULL;
1599 1804 retry:
1600 1805 q.key = FUTEX_KEY_INIT;
1601 1806 ret = get_futex_key(uaddr, fshared, &q.key);
... ... @@ -1701,7 +1906,21 @@
1701 1906 goto retry;
1702 1907 }
1703 1908  
  1909 +static long futex_lock_pi_restart(struct restart_block *restart)
  1910 +{
  1911 + u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
  1912 + ktime_t t, *tp = NULL;
  1913 + int fshared = restart->futex.flags & FLAGS_SHARED;
1704 1914  
  1915 + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
  1916 + t.tv64 = restart->futex.time;
  1917 + tp = &t;
  1918 + }
  1919 + restart->fn = do_no_restart_syscall;
  1920 +
  1921 + return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0);
  1922 +}
  1923 +
1705 1924 /*
1706 1925 * Userspace attempted a TID -> 0 atomic transition, and failed.
1707 1926 * This is the in-kernel slowpath: we look up the PI state (if any),
... ... @@ -1803,6 +2022,253 @@
1803 2022 return ret;
1804 2023 }
1805 2024  
  2025 +/**
  2026 + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
  2027 + * @hb: the hash_bucket futex_q was originally enqueued on
  2028 + * @q: the futex_q woken while waiting to be requeued
  2029 + * @key2: the futex_key of the requeue target futex
  2030 + * @timeout: the timeout associated with the wait (NULL if none)
  2031 + *
  2032 + * Detect if the task was woken on the initial futex as opposed to the requeue
  2033 + * target futex. If so, determine if it was a timeout or a signal that caused
  2034 + * the wakeup and return the appropriate error code to the caller. Must be
  2035 + * called with the hb lock held.
  2036 + *
  2037 + * Returns:
  2038 + * 0 - no early wakeup detected
  2039 + * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?)
  2040 + */
  2041 +static inline
  2042 +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  2043 + struct futex_q *q, union futex_key *key2,
  2044 + struct hrtimer_sleeper *timeout)
  2045 +{
  2046 + int ret = 0;
  2047 +
  2048 + /*
  2049 + * With the hb lock held, we avoid races while we process the wakeup.
  2050 + * We only need to hold hb (and not hb2) to ensure atomicity as the
  2051 + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
  2052 + * It can't be requeued from uaddr2 to something else since we don't
  2053 + * support a PI aware source futex for requeue.
  2054 + */
  2055 + if (!match_futex(&q->key, key2)) {
  2056 + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
  2057 + /*
  2058 + * We were woken prior to requeue by a timeout or a signal.
  2059 + * Unqueue the futex_q and determine which it was.
  2060 + */
  2061 + plist_del(&q->list, &q->list.plist);
  2062 + drop_futex_key_refs(&q->key);
  2063 +
  2064 + if (timeout && !timeout->task)
  2065 + ret = -ETIMEDOUT;
  2066 + else {
  2067 + /*
  2068 + * We expect signal_pending(current), but another
  2069 + * thread may have handled it for us already.
  2070 + */
  2071 + /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if
  2072 + * the user specified SA_RESTART or not? */
  2073 + ret = -ERESTARTSYS;
  2074 + }
  2075 + }
  2076 + return ret;
  2077 +}
  2078 +
  2079 +/**
  2080 + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
  2081 + * @uaddr: the futex we initially wait on (non-pi)
  2082 + * @fshared: whether the futexes are shared (1) or not (0). They must be
  2083 + * the same type, no requeueing from private to shared, etc.
  2084 + * @val: the expected value of uaddr
  2085 + * @abs_time: absolute timeout
  2086 + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
  2087 + * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
  2088 + * @uaddr2: the pi futex we will take prior to returning to user-space
  2089 + *
  2090 + * The caller will wait on uaddr and will be requeued by futex_requeue() to
  2091 + * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
  2092 + * complete the acquisition of the rt_mutex prior to returning to userspace.
  2093 + * This ensures the rt_mutex maintains an owner when it has waiters; without
  2094 + * one, the pi logic wouldn't know which task to boost/deboost, if there was a
  2095 + * need to.
  2096 + *
  2097 + * We call schedule in futex_wait_queue_me() when we enqueue and return there
  2098 + * via the following:
  2099 + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
  2100 + * 2) wakeup on uaddr2 after a requeue and subsequent unlock
  2101 + * 3) signal (before or after requeue)
  2102 + * 4) timeout (before or after requeue)
  2103 + *
  2104 + * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
  2105 + *
  2106 + * If 2, we may then block on trying to take the rt_mutex and return via:
  2107 + * 5) successful lock
  2108 + * 6) signal
  2109 + * 7) timeout
  2110 + * 8) other lock acquisition failure
  2111 + *
  2112 + * If 6, we setup a restart_block with futex_lock_pi() as the function.
  2113 + *
  2114 + * If 4 or 7, we cleanup and return with -ETIMEDOUT.
  2115 + *
  2116 + * Returns:
  2117 + * 0 - On success
  2118 + * <0 - On error
  2119 + */
  2120 +static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
  2121 + u32 val, ktime_t *abs_time, u32 bitset,
  2122 + int clockrt, u32 __user *uaddr2)
  2123 +{
  2124 + struct hrtimer_sleeper timeout, *to = NULL;
  2125 + struct rt_mutex_waiter rt_waiter;
  2126 + struct rt_mutex *pi_mutex = NULL;
  2127 + DECLARE_WAITQUEUE(wait, current);
  2128 + struct restart_block *restart;
  2129 + struct futex_hash_bucket *hb;
  2130 + union futex_key key2;
  2131 + struct futex_q q;
  2132 + int res, ret;
  2133 + u32 uval;
  2134 +
  2135 + if (!bitset)
  2136 + return -EINVAL;
  2137 +
  2138 + if (abs_time) {
  2139 + to = &timeout;
  2140 + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
  2141 + CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
  2142 + hrtimer_init_sleeper(to, current);
  2143 + hrtimer_set_expires_range_ns(&to->timer, *abs_time,
  2144 + current->timer_slack_ns);
  2145 + }
  2146 +
  2147 + /*
  2148 + * The waiter is allocated on our stack, manipulated by the requeue
  2149 + * code while we sleep on uaddr.
  2150 + */
  2151 + debug_rt_mutex_init_waiter(&rt_waiter);
  2152 + rt_waiter.task = NULL;
  2153 +
  2154 + q.pi_state = NULL;
  2155 + q.bitset = bitset;
  2156 + q.rt_waiter = &rt_waiter;
  2157 +
  2158 + key2 = FUTEX_KEY_INIT;
  2159 + ret = get_futex_key(uaddr2, fshared, &key2);
  2160 + if (unlikely(ret != 0))
  2161 + goto out;
  2162 +
  2163 + /* Prepare to wait on uaddr. */
  2164 + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
  2165 + if (ret) {
  2166 + put_futex_key(fshared, &key2);
  2167 + goto out;
  2168 + }
  2169 +
  2170 + /* Queue the futex_q, drop the hb lock, wait for wakeup. */
  2171 + futex_wait_queue_me(hb, &q, to, &wait);
  2172 +
  2173 + spin_lock(&hb->lock);
  2174 + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
  2175 + spin_unlock(&hb->lock);
  2176 + if (ret)
  2177 + goto out_put_keys;
  2178 +
  2179 + /*
  2180 + * In order for us to be here, we know our q.key == key2, and since
  2181 + * we took the hb->lock above, we also know that futex_requeue() has
  2182 + * completed and we no longer have to concern ourselves with a wakeup
  2183 + * race with the atomic proxy lock acquisition by the requeue code.
  2184 + */
  2185 +
  2186 + /* Check if the requeue code acquired the second futex for us. */
  2187 + if (!q.rt_waiter) {
  2188 + /*
  2189 + * Got the lock. We might not be the anticipated owner if we
  2190 + * did a lock-steal - fix up the PI-state in that case.
  2191 + */
  2192 + if (q.pi_state && (q.pi_state->owner != current)) {
  2193 + spin_lock(q.lock_ptr);
  2194 + ret = fixup_pi_state_owner(uaddr2, &q, current,
  2195 + fshared);
  2196 + spin_unlock(q.lock_ptr);
  2197 + }
  2198 + } else {
  2199 + /*
  2200 + * We have been woken up by futex_unlock_pi(), a timeout, or a
  2201 + * signal. futex_unlock_pi() will not destroy the lock_ptr nor
  2202 + * the pi_state.
  2203 + */
  2204 + WARN_ON(!q.pi_state);
  2205 + pi_mutex = &q.pi_state->pi_mutex;
  2206 + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
  2207 + debug_rt_mutex_free_waiter(&rt_waiter);
  2208 +
  2209 + spin_lock(q.lock_ptr);
  2210 + /*
  2211 + * Fixup the pi_state owner and possibly acquire the lock if we
  2212 + * haven't already.
  2213 + */
  2214 + res = fixup_owner(uaddr2, fshared, &q, !ret);
  2215 + /*
  2216 + * If fixup_owner() returned an error, propagate that. If it
  2217 + * acquired the lock, clear our -ETIMEDOUT or -EINTR.
  2218 + */
  2219 + if (res)
  2220 + ret = (res < 0) ? res : 0;
  2221 +
  2222 + /* Unqueue and drop the lock. */
  2223 + unqueue_me_pi(&q);
  2224 + }
  2225 +
  2226 + /*
  2227 + * If fixup_pi_state_owner() faulted and was unable to handle the
  2228 + * fault, unlock the rt_mutex and return the fault to userspace.
  2229 + */
  2230 + if (ret == -EFAULT) {
  2231 + if (rt_mutex_owner(pi_mutex) == current)
  2232 + rt_mutex_unlock(pi_mutex);
  2233 + } else if (ret == -EINTR) {
  2234 + ret = -EFAULT;
  2235 + if (get_user(uval, uaddr2))
  2236 + goto out_put_keys;
  2237 +
  2238 + /*
  2239 + * We've already been requeued, so restart by calling
  2240 + * futex_lock_pi() directly, rather than returning to this
  2241 + * function.
  2242 + */
  2243 + ret = -ERESTART_RESTARTBLOCK;
  2244 + restart = &current_thread_info()->restart_block;
  2245 + restart->fn = futex_lock_pi_restart;
  2246 + restart->futex.uaddr = (u32 *)uaddr2;
  2247 + restart->futex.val = uval;
  2248 + restart->futex.flags = 0;
  2249 + if (abs_time) {
  2250 + restart->futex.flags |= FLAGS_HAS_TIMEOUT;
  2251 + restart->futex.time = abs_time->tv64;
  2252 + }
  2253 +
  2254 + if (fshared)
  2255 + restart->futex.flags |= FLAGS_SHARED;
  2256 + if (clockrt)
  2257 + restart->futex.flags |= FLAGS_CLOCKRT;
  2258 + }
  2259 +
  2260 +out_put_keys:
  2261 + put_futex_key(fshared, &q.key);
  2262 + put_futex_key(fshared, &key2);
  2263 +
  2264 +out:
  2265 + if (to) {
  2266 + hrtimer_cancel(&to->timer);
  2267 + destroy_hrtimer_on_stack(&to->timer);
  2268 + }
  2269 + return ret;
  2270 +}
  2271 +
1806 2272 /*
1807 2273 * Support for robust futexes: the kernel cleans up held futexes at
1808 2274 * thread exit time.
... ... @@ -2025,7 +2491,7 @@
2025 2491 fshared = 1;
2026 2492  
2027 2493 clockrt = op & FUTEX_CLOCK_REALTIME;
2028   - if (clockrt && cmd != FUTEX_WAIT_BITSET)
  2494 + if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2029 2495 return -ENOSYS;
2030 2496  
2031 2497 switch (cmd) {
... ... @@ -2040,10 +2506,11 @@
2040 2506 ret = futex_wake(uaddr, fshared, val, val3);
2041 2507 break;
2042 2508 case FUTEX_REQUEUE:
2043   - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
  2509 + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
2044 2510 break;
2045 2511 case FUTEX_CMP_REQUEUE:
2046   - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
  2512 + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
  2513 + 0);
2047 2514 break;
2048 2515 case FUTEX_WAKE_OP:
2049 2516 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
... ... @@ -2060,6 +2527,18 @@
2060 2527 if (futex_cmpxchg_enabled)
2061 2528 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2062 2529 break;
  2530 + case FUTEX_WAIT_REQUEUE_PI:
  2531 + val3 = FUTEX_BITSET_MATCH_ANY;
  2532 + ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
  2533 + clockrt, uaddr2);
  2534 + break;
  2535 + case FUTEX_REQUEUE_PI:
  2536 + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1);
  2537 + break;
  2538 + case FUTEX_CMP_REQUEUE_PI:
  2539 + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
  2540 + 1);
  2541 + break;
2063 2542 default:
2064 2543 ret = -ENOSYS;
2065 2544 }
... ... @@ -2077,7 +2556,8 @@
2077 2556 int cmd = op & FUTEX_CMD_MASK;
2078 2557  
2079 2558 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2080   - cmd == FUTEX_WAIT_BITSET)) {
  2559 + cmd == FUTEX_WAIT_BITSET ||
  2560 + cmd == FUTEX_WAIT_REQUEUE_PI)) {
2081 2561 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2082 2562 return -EFAULT;
2083 2563 if (!timespec_valid(&ts))
... ... @@ -2089,10 +2569,11 @@
2089 2569 tp = &t;
2090 2570 }
2091 2571 /*
2092   - * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
  2572 + * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
2093 2573 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2094 2574 */
2095 2575 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
  2576 + cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI ||
2096 2577 cmd == FUTEX_WAKE_OP)
2097 2578 val2 = (u32) (unsigned long) utime;
2098 2579