Commit 70bdc6e0644f3535e93bac5c364ca199397e507e

Authored by Michel Lespinasse
Committed by Linus Torvalds
1 parent 345af7bf33

rwsem: lighter active count checks when waking up readers

In __rwsem_do_wake(), we can skip the active count check unless we come
there from up_xxxx().  Also when checking the active count, it is not
actually necessary to increment it; this allows us to get rid of the read
side undo code and simplify the calculation of the final rwsem count
adjustment once we've counted the reader threads to wake.

The basic observation is the following.  When there are waiter threads on
a rwsem and the spinlock is held, other threads can only increment the
active count by trying to grab the rwsem in down_xxxx().  However
down_xxxx() will notice there are waiter threads and take the down_failed
path, blocking to acquire the spinlock on the way there.  Therefore, a
thread observing an active count of zero with waiters queued and the
spinlock held, is protected against other threads acquiring the rwsem
until it wakes the last waiter or releases the spinlock.

Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Mike Waychison <mikew@google.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Ying Han <yinghan@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 32 additions and 25 deletions Side-by-side Diff

... ... @@ -36,6 +36,14 @@
36 36 #define RWSEM_WAITING_FOR_WRITE 0x00000002
37 37 };
38 38  
  39 +/* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and
  40 + * RWSEM_WAKE_READ_OWNED imply that the spinlock must have been kept held
  41 + * since the rwsem value was observed.
  42 + */
  43 +#define RWSEM_WAKE_ANY 0 /* Wake whatever's at head of wait list */
  44 +#define RWSEM_WAKE_NO_ACTIVE 1 /* rwsem was observed with no active thread */
  45 +#define RWSEM_WAKE_READ_OWNED 2 /* rwsem was observed to be read owned */
  46 +
39 47 /*
40 48 * handle the lock release when processes blocked on it that can now run
41 49 * - if we come here from up_xxxx(), then:
... ... @@ -46,8 +54,8 @@
46 54 * - woken process blocks are discarded from the list after having task zeroed
47 55 * - writers are only woken if downgrading is false
48 56 */
49   -static inline struct rw_semaphore *
50   -__rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
  57 +static struct rw_semaphore *
  58 +__rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
51 59 {
52 60 struct rwsem_waiter *waiter;
53 61 struct task_struct *tsk;
... ... @@ -58,7 +66,7 @@
58 66 if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
59 67 goto readers_only;
60 68  
61   - if (downgrading)
  69 + if (wake_type == RWSEM_WAKE_READ_OWNED)
62 70 goto out;
63 71  
64 72 /* There's a writer at the front of the queue - try to grant it the
65 73  
66 74  
... ... @@ -85,19 +93,25 @@
85 93 goto out;
86 94  
87 95 readers_only:
88   - if (downgrading)
89   - goto wake_readers;
90   -
91   - /* if we came through an up_xxxx() call, we only wake someone up
92   - * if we can transition the active part of the count from 0 -> 1 */
93   - try_again_read:
94   - oldcount = rwsem_atomic_update(RWSEM_ACTIVE_BIAS, sem)
95   - - RWSEM_ACTIVE_BIAS;
96   - if (oldcount & RWSEM_ACTIVE_MASK)
  96 + /* If we come here from up_xxxx(), another thread might have reached
  97 + * rwsem_down_failed_common() before we acquired the spinlock and
  98 + * woken up a waiter, making it now active. We prefer to check for
  99 + * this first in order to not spend too much time with the spinlock
  100 + * held if we're not going to be able to wake up readers in the end.
  101 + *
  102 + * Note that we do not need to update the rwsem count: any writer
  103 + * trying to acquire rwsem will run rwsem_down_write_failed() due
  104 + * to the waiting threads and block trying to acquire the spinlock.
  105 + *
  106 + * We use a dummy atomic update in order to acquire the cache line
  107 + * exclusively since we expect to succeed and run the final rwsem
  108 + * count adjustment pretty soon.
  109 + */
  110 + if (wake_type == RWSEM_WAKE_ANY &&
  111 + (rwsem_atomic_update(0, sem) & RWSEM_ACTIVE_MASK))
97 112 /* Someone grabbed the sem already */
98   - goto undo_read;
  113 + goto out;
99 114  
100   - wake_readers:
101 115 /* Grant an infinite number of read locks to the readers at the front
102 116 * of the queue. Note we increment the 'active part' of the count by
103 117 * the number of readers before waking any processes up.
... ... @@ -116,9 +130,6 @@
116 130  
117 131 loop = woken;
118 132 woken *= RWSEM_ACTIVE_BIAS - RWSEM_WAITING_BIAS;
119   - if (!downgrading)
120   - /* we'd already done one increment earlier */
121   - woken -= RWSEM_ACTIVE_BIAS;
122 133  
123 134 rwsem_atomic_add(woken, sem);
124 135  
... ... @@ -145,10 +156,6 @@
145 156 if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) & RWSEM_ACTIVE_MASK)
146 157 goto out;
147 158 goto try_again_write;
148   - undo_read:
149   - if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) & RWSEM_ACTIVE_MASK)
150   - goto out;
151   - goto try_again_read;
152 159 }
153 160  
154 161 /*
155 162  
... ... @@ -170,12 +177,12 @@
170 177  
171 178 list_add_tail(&waiter->list, &sem->wait_list);
172 179  
173   - /* we're now waiting on the lock, but no longer actively read-locking */
  180 + /* we're now waiting on the lock, but no longer actively locking */
174 181 count = rwsem_atomic_update(adjustment, sem);
175 182  
176 183 /* if there are no active locks, wake the front queued process(es) up */
177 184 if (!(count & RWSEM_ACTIVE_MASK))
178   - sem = __rwsem_do_wake(sem, 0);
  185 + sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE);
179 186  
180 187 spin_unlock_irq(&sem->wait_lock);
181 188  
... ... @@ -232,7 +239,7 @@
232 239  
233 240 /* do nothing if list empty */
234 241 if (!list_empty(&sem->wait_list))
235   - sem = __rwsem_do_wake(sem, 0);
  242 + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
236 243  
237 244 spin_unlock_irqrestore(&sem->wait_lock, flags);
238 245  
... ... @@ -252,7 +259,7 @@
252 259  
253 260 /* do nothing if list empty */
254 261 if (!list_empty(&sem->wait_list))
255   - sem = __rwsem_do_wake(sem, 1);
  262 + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
256 263  
257 264 spin_unlock_irqrestore(&sem->wait_lock, flags);
258 265