Commit 3b5d8510b94a95e493e8c4951ffc3d1cf6a6792d

Authored by Linus Torvalds

Merge branch 'core-locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull core locking changes from Ingo Molnar:
 "The biggest change is the rwsem lock-steal improvements, both to the
   assembly-optimized and the spinlock-based variants.

   The other notable change is the cleanup of the seqlock implementation
  to be based on the seqcount infrastructure.

  The rest is assorted smaller debuggability, cleanup and continued -rt
  locking changes."

* 'core-locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  rwsem-spinlock: Implement writer lock-stealing for better scalability
  futex: Revert "futex: Mark get_robust_list as deprecated"
  generic: Use raw local irq variant for generic cmpxchg
  lockdep: Selftest: convert spinlock to raw spinlock
  seqlock: Use seqcount infrastructure
  seqlock: Remove unused functions
  ntp: Make ntp_lock raw
  intel_idle: Convert i7300_idle_lock to raw_spinlock
  locking: Various static lock initializer fixes
  lockdep: Print more info when MAX_LOCK_DEPTH is exceeded
  rwsem: Implement writer lock-stealing for better scalability
  lockdep: Silence warning if CONFIG_LOCKDEP isn't set
  watchdog: Use local_clock for get_timestamp()
  lockdep: Rename print_unlock_inbalance_bug() to print_unlock_imbalance_bug()
  locking/stat: Fix a typo
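
The writer lock-stealing changes below only touch the rwsem slow paths: a task that reaches down_write() while the lock is free may now take it ahead of queued waiters, instead of the lock being handed directly to the waiter at the head of the queue. The caller-visible API and exclusion guarantees stay the same. A minimal caller-side sketch (hypothetical foo_sem and foo_count, kernel-style C):

    #include <linux/rwsem.h>

    static DECLARE_RWSEM(foo_sem);
    static int foo_count;

    static void foo_bump(void)
    {
            /* exclusive ownership; the slow path may now steal a free lock
             * ahead of queued waiters */
            down_write(&foo_sem);
            foo_count++;
            up_write(&foo_sem);
    }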

17 changed files:

Documentation/lockstat.txt
... ... @@ -65,7 +65,7 @@
65 65  
66 66 - CONFIGURATION
67 67  
68   -Lock statistics are enabled via CONFIG_LOCK_STATS.
  68 +Lock statistics are enabled via CONFIG_LOCK_STAT.
69 69  
70 70 - USAGE
71 71  
drivers/char/random.c
... ... @@ -445,7 +445,7 @@
445 445 .poolinfo = &poolinfo_table[0],
446 446 .name = "input",
447 447 .limit = 1,
448   - .lock = __SPIN_LOCK_UNLOCKED(&input_pool.lock),
  448 + .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
449 449 .pool = input_pool_data
450 450 };
451 451  
... ... @@ -454,7 +454,7 @@
454 454 .name = "blocking",
455 455 .limit = 1,
456 456 .pull = &input_pool,
457   - .lock = __SPIN_LOCK_UNLOCKED(&blocking_pool.lock),
  457 + .lock = __SPIN_LOCK_UNLOCKED(blocking_pool.lock),
458 458 .pool = blocking_pool_data
459 459 };
460 460  
... ... @@ -462,7 +462,7 @@
462 462 .poolinfo = &poolinfo_table[1],
463 463 .name = "nonblocking",
464 464 .pull = &input_pool,
465   - .lock = __SPIN_LOCK_UNLOCKED(&nonblocking_pool.lock),
  465 + .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock),
466 466 .pool = nonblocking_pool_data
467 467 };
468 468  
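
These random.c hunks are part of "locking: Various static lock initializer fixes": __SPIN_LOCK_UNLOCKED() (and friends such as __RW_LOCK_UNLOCKED()) expect the lock variable itself, not its address, because the argument ends up stringified as the lockdep class name when lock debugging is enabled. A minimal sketch of the corrected pattern, using a hypothetical my_pool:

    #include <linux/spinlock.h>

    struct my_pool {
            spinlock_t      lock;
            int             count;
    };

    /* pass the member itself; lockdep then names the class "my_pool.lock" */
    static struct my_pool my_pool = {
            .lock = __SPIN_LOCK_UNLOCKED(my_pool.lock),
    };
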
drivers/idle/i7300_idle.c
... ... @@ -75,7 +75,7 @@
75 75  
76 76 static struct pci_dev *fbd_dev;
77 77  
78   -static spinlock_t i7300_idle_lock;
  78 +static raw_spinlock_t i7300_idle_lock;
79 79 static int i7300_idle_active;
80 80  
81 81 static u8 i7300_idle_thrtctl_saved;
... ... @@ -457,7 +457,7 @@
457 457 idle_begin_time = ktime_get();
458 458 }
459 459  
460   - spin_lock_irqsave(&i7300_idle_lock, flags);
  460 + raw_spin_lock_irqsave(&i7300_idle_lock, flags);
461 461 if (val == IDLE_START) {
462 462  
463 463 cpumask_set_cpu(smp_processor_id(), idle_cpumask);
... ... @@ -506,7 +506,7 @@
506 506 }
507 507 }
508 508 end:
509   - spin_unlock_irqrestore(&i7300_idle_lock, flags);
  509 + raw_spin_unlock_irqrestore(&i7300_idle_lock, flags);
510 510 return 0;
511 511 }
512 512  
... ... @@ -548,7 +548,7 @@
548 548  
549 549 static int __init i7300_idle_init(void)
550 550 {
551   - spin_lock_init(&i7300_idle_lock);
  551 + raw_spin_lock_init(&i7300_idle_lock);
552 552 total_us = 0;
553 553  
554 554 if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev, forceload))
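
The i7300_idle conversion above (like the ntp_lock and locking-selftest changes further down) follows the usual -rt pattern: a lock that must remain a real, non-sleeping spinlock even on PREEMPT_RT is declared raw_spinlock_t and used through the raw_spin_* API. A minimal sketch of the pattern, with a hypothetical my_lock/my_state:

    #include <linux/spinlock.h>

    static DEFINE_RAW_SPINLOCK(my_lock);
    static int my_state;

    static void my_set_state(int val)
    {
            unsigned long flags;

            /* raw_spin_lock_irqsave() never turns into a sleeping lock on -rt */
            raw_spin_lock_irqsave(&my_lock, flags);
            my_state = val;
            raw_spin_unlock_irqrestore(&my_lock, flags);
    }
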
drivers/usb/chipidea/debug.c
... ... @@ -222,7 +222,7 @@
222 222 } dbg_data = {
223 223 .idx = 0,
224 224 .tty = 0,
225   - .lck = __RW_LOCK_UNLOCKED(lck)
  225 + .lck = __RW_LOCK_UNLOCKED(dbg_data.lck)
226 226 };
227 227  
228 228 /**
fs/file.c
... ... @@ -516,7 +516,7 @@
516 516 .close_on_exec = init_files.close_on_exec_init,
517 517 .open_fds = init_files.open_fds_init,
518 518 },
519   - .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
  519 + .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
520 520 };
521 521  
522 522 /*
include/asm-generic/cmpxchg-local.h
... ... @@ -21,7 +21,7 @@
21 21 if (size == 8 && sizeof(unsigned long) != 8)
22 22 wrong_size_cmpxchg(ptr);
23 23  
24   - local_irq_save(flags);
  24 + raw_local_irq_save(flags);
25 25 switch (size) {
26 26 case 1: prev = *(u8 *)ptr;
27 27 if (prev == old)
... ... @@ -42,7 +42,7 @@
42 42 default:
43 43 wrong_size_cmpxchg(ptr);
44 44 }
45   - local_irq_restore(flags);
  45 + raw_local_irq_restore(flags);
46 46 return prev;
47 47 }
48 48  
49 49  
... ... @@ -55,11 +55,11 @@
55 55 u64 prev;
56 56 unsigned long flags;
57 57  
58   - local_irq_save(flags);
  58 + raw_local_irq_save(flags);
59 59 prev = *(u64 *)ptr;
60 60 if (prev == old)
61 61 *(u64 *)ptr = new;
62   - local_irq_restore(flags);
  62 + raw_local_irq_restore(flags);
63 63 return prev;
64 64 }
65 65  
include/linux/idr.h
... ... @@ -136,7 +136,7 @@
136 136 struct ida_bitmap *free_bitmap;
137 137 };
138 138  
139   -#define IDA_INIT(name) { .idr = IDR_INIT(name), .free_bitmap = NULL, }
  139 +#define IDA_INIT(name) { .idr = IDR_INIT((name).idr), .free_bitmap = NULL, }
140 140 #define DEFINE_IDA(name) struct ida name = IDA_INIT(name)
141 141  
142 142 int ida_pre_get(struct ida *ida, gfp_t gfp_mask);
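
The IDA_INIT() hunk above is another static initializer fix: IDR_INIT() has to be handed the embedded .idr member so the generated lockdep name matches the lock that is actually embedded there; DEFINE_IDA() users are unaffected. A minimal usage sketch with a hypothetical my_ida:

    #include <linux/gfp.h>
    #include <linux/idr.h>

    static DEFINE_IDA(my_ida);

    static int my_alloc_id(void)
    {
            /* returns a small unique id, or a negative errno */
            return ida_simple_get(&my_ida, 0, 0, GFP_KERNEL);
    }

    static void my_free_id(int id)
    {
            ida_simple_remove(&my_ida, id);
    }
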
include/linux/lockdep.h
... ... @@ -412,7 +412,7 @@
412 412  
413 413 #define lockdep_depth(tsk) (0)
414 414  
415   -#define lockdep_assert_held(l) do { } while (0)
  415 +#define lockdep_assert_held(l) do { (void)(l); } while (0)
416 416  
417 417 #define lockdep_recursing(tsk) (0)
418 418  
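
The stub change above matters when CONFIG_LOCKDEP is not set: evaluating (void)(l) keeps the argument formally "used", so a variable or parameter whose only purpose is the assertion no longer triggers set-but-not-used warnings. A minimal sketch, with a hypothetical foo structure:

    #include <linux/lockdep.h>
    #include <linux/spinlock.h>

    struct foo {
            spinlock_t      lock;
            int             val;
    };

    /* caller must hold foo->lock */
    static void foo_set_val(struct foo *foo, int val)
    {
            /* compiles down to (void)(&foo->lock) without lockdep */
            lockdep_assert_held(&foo->lock);
            foo->val = val;
    }
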
include/linux/seqlock.h
... ... @@ -30,92 +30,12 @@
30 30 #include <linux/preempt.h>
31 31 #include <asm/processor.h>
32 32  
33   -typedef struct {
34   - unsigned sequence;
35   - spinlock_t lock;
36   -} seqlock_t;
37   -
38 33 /*
39   - * These macros triggered gcc-3.x compile-time problems. We think these are
40   - * OK now. Be cautious.
41   - */
42   -#define __SEQLOCK_UNLOCKED(lockname) \
43   - { 0, __SPIN_LOCK_UNLOCKED(lockname) }
44   -
45   -#define seqlock_init(x) \
46   - do { \
47   - (x)->sequence = 0; \
48   - spin_lock_init(&(x)->lock); \
49   - } while (0)
50   -
51   -#define DEFINE_SEQLOCK(x) \
52   - seqlock_t x = __SEQLOCK_UNLOCKED(x)
53   -
54   -/* Lock out other writers and update the count.
55   - * Acts like a normal spin_lock/unlock.
56   - * Don't need preempt_disable() because that is in the spin_lock already.
57   - */
58   -static inline void write_seqlock(seqlock_t *sl)
59   -{
60   - spin_lock(&sl->lock);
61   - ++sl->sequence;
62   - smp_wmb();
63   -}
64   -
65   -static inline void write_sequnlock(seqlock_t *sl)
66   -{
67   - smp_wmb();
68   - sl->sequence++;
69   - spin_unlock(&sl->lock);
70   -}
71   -
72   -static inline int write_tryseqlock(seqlock_t *sl)
73   -{
74   - int ret = spin_trylock(&sl->lock);
75   -
76   - if (ret) {
77   - ++sl->sequence;
78   - smp_wmb();
79   - }
80   - return ret;
81   -}
82   -
83   -/* Start of read calculation -- fetch last complete writer token */
84   -static __always_inline unsigned read_seqbegin(const seqlock_t *sl)
85   -{
86   - unsigned ret;
87   -
88   -repeat:
89   - ret = ACCESS_ONCE(sl->sequence);
90   - if (unlikely(ret & 1)) {
91   - cpu_relax();
92   - goto repeat;
93   - }
94   - smp_rmb();
95   -
96   - return ret;
97   -}
98   -
99   -/*
100   - * Test if reader processed invalid data.
101   - *
102   - * If sequence value changed then writer changed data while in section.
103   - */
104   -static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start)
105   -{
106   - smp_rmb();
107   -
108   - return unlikely(sl->sequence != start);
109   -}
110   -
111   -
112   -/*
113 34 * Version using sequence counter only.
114 35 * This can be used when code has its own mutex protecting the
115 36 * updating starting before the write_seqcountbeqin() and ending
116 37 * after the write_seqcount_end().
117 38 */
118   -
119 39 typedef struct seqcount {
120 40 unsigned sequence;
121 41 } seqcount_t;
... ... @@ -218,7 +138,6 @@
218 138 static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
219 139 {
220 140 smp_rmb();
221   -
222 141 return __read_seqcount_retry(s, start);
223 142 }
224 143  
... ... @@ -252,32 +171,102 @@
252 171 s->sequence+=2;
253 172 }
254 173  
  174 +typedef struct {
  175 + struct seqcount seqcount;
  176 + spinlock_t lock;
  177 +} seqlock_t;
  178 +
255 179 /*
256   - * Possible sw/hw IRQ protected versions of the interfaces.
  180 + * These macros triggered gcc-3.x compile-time problems. We think these are
  181 + * OK now. Be cautious.
257 182 */
258   -#define write_seqlock_irqsave(lock, flags) \
259   - do { local_irq_save(flags); write_seqlock(lock); } while (0)
260   -#define write_seqlock_irq(lock) \
261   - do { local_irq_disable(); write_seqlock(lock); } while (0)
262   -#define write_seqlock_bh(lock) \
263   - do { local_bh_disable(); write_seqlock(lock); } while (0)
  183 +#define __SEQLOCK_UNLOCKED(lockname) \
  184 + { \
  185 + .seqcount = SEQCNT_ZERO, \
  186 + .lock = __SPIN_LOCK_UNLOCKED(lockname) \
  187 + }
264 188  
265   -#define write_sequnlock_irqrestore(lock, flags) \
266   - do { write_sequnlock(lock); local_irq_restore(flags); } while(0)
267   -#define write_sequnlock_irq(lock) \
268   - do { write_sequnlock(lock); local_irq_enable(); } while(0)
269   -#define write_sequnlock_bh(lock) \
270   - do { write_sequnlock(lock); local_bh_enable(); } while(0)
  189 +#define seqlock_init(x) \
  190 + do { \
  191 + seqcount_init(&(x)->seqcount); \
  192 + spin_lock_init(&(x)->lock); \
  193 + } while (0)
271 194  
272   -#define read_seqbegin_irqsave(lock, flags) \
273   - ({ local_irq_save(flags); read_seqbegin(lock); })
  195 +#define DEFINE_SEQLOCK(x) \
  196 + seqlock_t x = __SEQLOCK_UNLOCKED(x)
274 197  
275   -#define read_seqretry_irqrestore(lock, iv, flags) \
276   - ({ \
277   - int ret = read_seqretry(lock, iv); \
278   - local_irq_restore(flags); \
279   - ret; \
280   - })
  198 +/*
  199 + * Read side functions for starting and finalizing a read side section.
  200 + */
  201 +static inline unsigned read_seqbegin(const seqlock_t *sl)
  202 +{
  203 + return read_seqcount_begin(&sl->seqcount);
  204 +}
  205 +
  206 +static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
  207 +{
  208 + return read_seqcount_retry(&sl->seqcount, start);
  209 +}
  210 +
  211 +/*
  212 + * Lock out other writers and update the count.
  213 + * Acts like a normal spin_lock/unlock.
  214 + * Don't need preempt_disable() because that is in the spin_lock already.
  215 + */
  216 +static inline void write_seqlock(seqlock_t *sl)
  217 +{
  218 + spin_lock(&sl->lock);
  219 + write_seqcount_begin(&sl->seqcount);
  220 +}
  221 +
  222 +static inline void write_sequnlock(seqlock_t *sl)
  223 +{
  224 + write_seqcount_end(&sl->seqcount);
  225 + spin_unlock(&sl->lock);
  226 +}
  227 +
  228 +static inline void write_seqlock_bh(seqlock_t *sl)
  229 +{
  230 + spin_lock_bh(&sl->lock);
  231 + write_seqcount_begin(&sl->seqcount);
  232 +}
  233 +
  234 +static inline void write_sequnlock_bh(seqlock_t *sl)
  235 +{
  236 + write_seqcount_end(&sl->seqcount);
  237 + spin_unlock_bh(&sl->lock);
  238 +}
  239 +
  240 +static inline void write_seqlock_irq(seqlock_t *sl)
  241 +{
  242 + spin_lock_irq(&sl->lock);
  243 + write_seqcount_begin(&sl->seqcount);
  244 +}
  245 +
  246 +static inline void write_sequnlock_irq(seqlock_t *sl)
  247 +{
  248 + write_seqcount_end(&sl->seqcount);
  249 + spin_unlock_irq(&sl->lock);
  250 +}
  251 +
  252 +static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
  253 +{
  254 + unsigned long flags;
  255 +
  256 + spin_lock_irqsave(&sl->lock, flags);
  257 + write_seqcount_begin(&sl->seqcount);
  258 + return flags;
  259 +}
  260 +
  261 +#define write_seqlock_irqsave(lock, flags) \
  262 + do { flags = __write_seqlock_irqsave(lock); } while (0)
  263 +
  264 +static inline void
  265 +write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
  266 +{
  267 + write_seqcount_end(&sl->seqcount);
  268 + spin_unlock_irqrestore(&sl->lock, flags);
  269 +}
281 270  
282 271 #endif /* __LINUX_SEQLOCK_H */
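
With the rework above, seqlock_t is simply a seqcount_t paired with a spinlock for writer serialization; the read_seqbegin()/read_seqretry() and write_seqlock()/write_sequnlock() interface is unchanged for existing users. A minimal reader/writer sketch with a hypothetical my_seqlock protecting my_a/my_b:

    #include <linux/seqlock.h>

    static DEFINE_SEQLOCK(my_seqlock);
    static unsigned long my_a, my_b;

    static void my_write(unsigned long a, unsigned long b)
    {
            write_seqlock(&my_seqlock);     /* take the spinlock, bump sequence to odd */
            my_a = a;
            my_b = b;
            write_sequnlock(&my_seqlock);   /* bump sequence to even, drop the spinlock */
    }

    static unsigned long my_read_sum(void)
    {
            unsigned long sum;
            unsigned seq;

            do {
                    seq = read_seqbegin(&my_seqlock);
                    sum = my_a + my_b;
            } while (read_seqretry(&my_seqlock, seq));

            return sum;
    }
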
kernel/futex.c
... ... @@ -2472,8 +2472,6 @@
2472 2472 if (!futex_cmpxchg_enabled)
2473 2473 return -ENOSYS;
2474 2474  
2475   - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2476   -
2477 2475 rcu_read_lock();
2478 2476  
2479 2477 ret = -ESRCH;
kernel/futex_compat.c
... ... @@ -142,8 +142,6 @@
142 142 if (!futex_cmpxchg_enabled)
143 143 return -ENOSYS;
144 144  
145   - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
146   -
147 145 rcu_read_lock();
148 146  
149 147 ret = -ESRCH;
kernel/lockdep.c
... ... @@ -3190,9 +3190,14 @@
3190 3190 #endif
3191 3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
3192 3192 debug_locks_off();
3193   - printk("BUG: MAX_LOCK_DEPTH too low!\n");
  3193 + printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n",
  3194 + curr->lockdep_depth, MAX_LOCK_DEPTH);
3194 3195 printk("turning off the locking correctness validator.\n");
  3196 +
  3197 + lockdep_print_held_locks(current);
  3198 + debug_show_all_locks();
3195 3199 dump_stack();
  3200 +
3196 3201 return 0;
3197 3202 }
3198 3203  
... ... @@ -3203,7 +3208,7 @@
3203 3208 }
3204 3209  
3205 3210 static int
3206   -print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
  3211 +print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3207 3212 unsigned long ip)
3208 3213 {
3209 3214 if (!debug_locks_off())
... ... @@ -3246,7 +3251,7 @@
3246 3251 return 0;
3247 3252  
3248 3253 if (curr->lockdep_depth <= 0)
3249   - return print_unlock_inbalance_bug(curr, lock, ip);
  3254 + return print_unlock_imbalance_bug(curr, lock, ip);
3250 3255  
3251 3256 return 1;
3252 3257 }
... ... @@ -3317,7 +3322,7 @@
3317 3322 goto found_it;
3318 3323 prev_hlock = hlock;
3319 3324 }
3320   - return print_unlock_inbalance_bug(curr, lock, ip);
  3325 + return print_unlock_imbalance_bug(curr, lock, ip);
3321 3326  
3322 3327 found_it:
3323 3328 lockdep_init_map(lock, name, key, 0);
... ... @@ -3384,7 +3389,7 @@
3384 3389 goto found_it;
3385 3390 prev_hlock = hlock;
3386 3391 }
3387   - return print_unlock_inbalance_bug(curr, lock, ip);
  3392 + return print_unlock_imbalance_bug(curr, lock, ip);
3388 3393  
3389 3394 found_it:
3390 3395 if (hlock->instance == lock)
kernel/time/ntp.c
... ... @@ -23,7 +23,7 @@
23 23 * NTP timekeeping variables:
24 24 */
25 25  
26   -DEFINE_SPINLOCK(ntp_lock);
  26 +DEFINE_RAW_SPINLOCK(ntp_lock);
27 27  
28 28  
29 29 /* USER_HZ period (usecs): */
... ... @@ -348,7 +348,7 @@
348 348 {
349 349 unsigned long flags;
350 350  
351   - spin_lock_irqsave(&ntp_lock, flags);
  351 + raw_spin_lock_irqsave(&ntp_lock, flags);
352 352  
353 353 time_adjust = 0; /* stop active adjtime() */
354 354 time_status |= STA_UNSYNC;
... ... @@ -362,7 +362,7 @@
362 362  
363 363 /* Clear PPS state variables */
364 364 pps_clear();
365   - spin_unlock_irqrestore(&ntp_lock, flags);
  365 + raw_spin_unlock_irqrestore(&ntp_lock, flags);
366 366  
367 367 }
368 368  
369 369  
... ... @@ -372,9 +372,9 @@
372 372 unsigned long flags;
373 373 s64 ret;
374 374  
375   - spin_lock_irqsave(&ntp_lock, flags);
  375 + raw_spin_lock_irqsave(&ntp_lock, flags);
376 376 ret = tick_length;
377   - spin_unlock_irqrestore(&ntp_lock, flags);
  377 + raw_spin_unlock_irqrestore(&ntp_lock, flags);
378 378 return ret;
379 379 }
380 380  
... ... @@ -395,7 +395,7 @@
395 395 int leap = 0;
396 396 unsigned long flags;
397 397  
398   - spin_lock_irqsave(&ntp_lock, flags);
  398 + raw_spin_lock_irqsave(&ntp_lock, flags);
399 399  
400 400 /*
401 401 * Leap second processing. If in leap-insert state at the end of the
... ... @@ -479,7 +479,7 @@
479 479 time_adjust = 0;
480 480  
481 481 out:
482   - spin_unlock_irqrestore(&ntp_lock, flags);
  482 + raw_spin_unlock_irqrestore(&ntp_lock, flags);
483 483  
484 484 return leap;
485 485 }
... ... @@ -672,7 +672,7 @@
672 672  
673 673 getnstimeofday(&ts);
674 674  
675   - spin_lock_irq(&ntp_lock);
  675 + raw_spin_lock_irq(&ntp_lock);
676 676  
677 677 if (txc->modes & ADJ_ADJTIME) {
678 678 long save_adjust = time_adjust;
... ... @@ -714,7 +714,7 @@
714 714 /* fill PPS status fields */
715 715 pps_fill_timex(txc);
716 716  
717   - spin_unlock_irq(&ntp_lock);
  717 + raw_spin_unlock_irq(&ntp_lock);
718 718  
719 719 txc->time.tv_sec = ts.tv_sec;
720 720 txc->time.tv_usec = ts.tv_nsec;
... ... @@ -912,7 +912,7 @@
912 912  
913 913 pts_norm = pps_normalize_ts(*phase_ts);
914 914  
915   - spin_lock_irqsave(&ntp_lock, flags);
  915 + raw_spin_lock_irqsave(&ntp_lock, flags);
916 916  
917 917 /* clear the error bits, they will be set again if needed */
918 918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
... ... @@ -925,7 +925,7 @@
925 925 * just start the frequency interval */
926 926 if (unlikely(pps_fbase.tv_sec == 0)) {
927 927 pps_fbase = *raw_ts;
928   - spin_unlock_irqrestore(&ntp_lock, flags);
  928 + raw_spin_unlock_irqrestore(&ntp_lock, flags);
929 929 return;
930 930 }
931 931  
... ... @@ -940,7 +940,7 @@
940 940 time_status |= STA_PPSJITTER;
941 941 /* restart the frequency calibration interval */
942 942 pps_fbase = *raw_ts;
943   - spin_unlock_irqrestore(&ntp_lock, flags);
  943 + raw_spin_unlock_irqrestore(&ntp_lock, flags);
944 944 pr_err("hardpps: PPSJITTER: bad pulse\n");
945 945 return;
946 946 }
... ... @@ -957,7 +957,7 @@
957 957  
958 958 hardpps_update_phase(pts_norm.nsec);
959 959  
960   - spin_unlock_irqrestore(&ntp_lock, flags);
  960 + raw_spin_unlock_irqrestore(&ntp_lock, flags);
961 961 }
962 962 EXPORT_SYMBOL(hardpps);
963 963  
kernel/watchdog.c
... ... @@ -113,9 +113,9 @@
113 113 * resolution, and we don't need to waste time with a big divide when
114 114 * 2^30ns == 1.074s.
115 115 */
116   -static unsigned long get_timestamp(int this_cpu)
  116 +static unsigned long get_timestamp(void)
117 117 {
118   - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
  118 + return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
119 119 }
120 120  
121 121 static void set_sample_period(void)
... ... @@ -133,9 +133,7 @@
133 133 /* Commands for resetting the watchdog */
134 134 static void __touch_watchdog(void)
135 135 {
136   - int this_cpu = smp_processor_id();
137   -
138   - __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
  136 + __this_cpu_write(watchdog_touch_ts, get_timestamp());
139 137 }
140 138  
141 139 void touch_softlockup_watchdog(void)
... ... @@ -196,7 +194,7 @@
196 194  
197 195 static int is_softlockup(unsigned long touch_ts)
198 196 {
199   - unsigned long now = get_timestamp(smp_processor_id());
  197 + unsigned long now = get_timestamp();
200 198  
201 199 /* Warn about unreasonable delays: */
202 200 if (time_after(now, touch_ts + get_softlockup_thresh()))
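
get_timestamp() above drops its CPU argument by switching from cpu_clock(cpu) to local_clock(); the >> 30 keeps the cheap nanoseconds-to-seconds approximation (2^30 ns ~= 1.074 s) and avoids a 64-bit divide in the watchdog path. A one-liner sketch of the same conversion:

    #include <linux/sched.h>        /* local_clock() */

    static unsigned long my_timestamp_secs(void)
    {
            return local_clock() >> 30;     /* roughly nanoseconds / 10^9 */
    }
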
lib/locking-selftest.c
... ... @@ -47,10 +47,10 @@
47 47 * Normal standalone locks, for the circular and irq-context
48 48 * dependency tests:
49 49 */
50   -static DEFINE_SPINLOCK(lock_A);
51   -static DEFINE_SPINLOCK(lock_B);
52   -static DEFINE_SPINLOCK(lock_C);
53   -static DEFINE_SPINLOCK(lock_D);
  50 +static DEFINE_RAW_SPINLOCK(lock_A);
  51 +static DEFINE_RAW_SPINLOCK(lock_B);
  52 +static DEFINE_RAW_SPINLOCK(lock_C);
  53 +static DEFINE_RAW_SPINLOCK(lock_D);
54 54  
55 55 static DEFINE_RWLOCK(rwlock_A);
56 56 static DEFINE_RWLOCK(rwlock_B);
... ... @@ -73,12 +73,12 @@
73 73 * but X* and Y* are different classes. We do this so that
74 74 * we do not trigger a real lockup:
75 75 */
76   -static DEFINE_SPINLOCK(lock_X1);
77   -static DEFINE_SPINLOCK(lock_X2);
78   -static DEFINE_SPINLOCK(lock_Y1);
79   -static DEFINE_SPINLOCK(lock_Y2);
80   -static DEFINE_SPINLOCK(lock_Z1);
81   -static DEFINE_SPINLOCK(lock_Z2);
  76 +static DEFINE_RAW_SPINLOCK(lock_X1);
  77 +static DEFINE_RAW_SPINLOCK(lock_X2);
  78 +static DEFINE_RAW_SPINLOCK(lock_Y1);
  79 +static DEFINE_RAW_SPINLOCK(lock_Y2);
  80 +static DEFINE_RAW_SPINLOCK(lock_Z1);
  81 +static DEFINE_RAW_SPINLOCK(lock_Z2);
82 82  
83 83 static DEFINE_RWLOCK(rwlock_X1);
84 84 static DEFINE_RWLOCK(rwlock_X2);
85 85  
... ... @@ -107,10 +107,10 @@
107 107 */
108 108 #define INIT_CLASS_FUNC(class) \
109 109 static noinline void \
110   -init_class_##class(spinlock_t *lock, rwlock_t *rwlock, struct mutex *mutex, \
111   - struct rw_semaphore *rwsem) \
  110 +init_class_##class(raw_spinlock_t *lock, rwlock_t *rwlock, \
  111 + struct mutex *mutex, struct rw_semaphore *rwsem)\
112 112 { \
113   - spin_lock_init(lock); \
  113 + raw_spin_lock_init(lock); \
114 114 rwlock_init(rwlock); \
115 115 mutex_init(mutex); \
116 116 init_rwsem(rwsem); \
117 117  
... ... @@ -168,10 +168,10 @@
168 168 * Shortcuts for lock/unlock API variants, to keep
169 169 * the testcases compact:
170 170 */
171   -#define L(x) spin_lock(&lock_##x)
172   -#define U(x) spin_unlock(&lock_##x)
  171 +#define L(x) raw_spin_lock(&lock_##x)
  172 +#define U(x) raw_spin_unlock(&lock_##x)
173 173 #define LU(x) L(x); U(x)
174   -#define SI(x) spin_lock_init(&lock_##x)
  174 +#define SI(x) raw_spin_lock_init(&lock_##x)
175 175  
176 176 #define WL(x) write_lock(&rwlock_##x)
177 177 #define WU(x) write_unlock(&rwlock_##x)
... ... @@ -911,7 +911,7 @@
911 911  
912 912 #define I2(x) \
913 913 do { \
914   - spin_lock_init(&lock_##x); \
  914 + raw_spin_lock_init(&lock_##x); \
915 915 rwlock_init(&rwlock_##x); \
916 916 mutex_init(&mutex_##x); \
917 917 init_rwsem(&rwsem_##x); \
lib/rwsem-spinlock.c
... ... @@ -73,20 +73,13 @@
73 73 goto dont_wake_writers;
74 74 }
75 75  
76   - /* if we are allowed to wake writers try to grant a single write lock
77   - * if there's a writer at the front of the queue
78   - * - we leave the 'waiting count' incremented to signify potential
79   - * contention
  76 + /*
  77 + * as we support write lock stealing, we can't set sem->activity
  78 + * to -1 here to indicate we get the lock. Instead, we wake it up
  79 + * to let it go get it again.
80 80 */
81 81 if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
82   - sem->activity = -1;
83   - list_del(&waiter->list);
84   - tsk = waiter->task;
85   - /* Don't touch waiter after ->task has been NULLed */
86   - smp_mb();
87   - waiter->task = NULL;
88   - wake_up_process(tsk);
89   - put_task_struct(tsk);
  82 + wake_up_process(waiter->task);
90 83 goto out;
91 84 }
92 85  
... ... @@ -121,18 +114,10 @@
121 114 __rwsem_wake_one_writer(struct rw_semaphore *sem)
122 115 {
123 116 struct rwsem_waiter *waiter;
124   - struct task_struct *tsk;
125 117  
126   - sem->activity = -1;
127   -
128 118 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
129   - list_del(&waiter->list);
  119 + wake_up_process(waiter->task);
130 120  
131   - tsk = waiter->task;
132   - smp_mb();
133   - waiter->task = NULL;
134   - wake_up_process(tsk);
135   - put_task_struct(tsk);
136 121 return sem;
137 122 }
138 123  
... ... @@ -204,7 +189,6 @@
204 189  
205 190 /*
206 191 * get a write lock on the semaphore
207   - * - we increment the waiting count anyway to indicate an exclusive lock
208 192 */
209 193 void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
210 194 {
... ... @@ -214,37 +198,32 @@
214 198  
215 199 raw_spin_lock_irqsave(&sem->wait_lock, flags);
216 200  
217   - if (sem->activity == 0 && list_empty(&sem->wait_list)) {
218   - /* granted */
219   - sem->activity = -1;
220   - raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
221   - goto out;
222   - }
223   -
224   - tsk = current;
225   - set_task_state(tsk, TASK_UNINTERRUPTIBLE);
226   -
227 201 /* set up my own style of waitqueue */
  202 + tsk = current;
228 203 waiter.task = tsk;
229 204 waiter.flags = RWSEM_WAITING_FOR_WRITE;
230   - get_task_struct(tsk);
231   -
232 205 list_add_tail(&waiter.list, &sem->wait_list);
233 206  
234   - /* we don't need to touch the semaphore struct anymore */
235   - raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
236   -
237   - /* wait to be given the lock */
  207 + /* wait for someone to release the lock */
238 208 for (;;) {
239   - if (!waiter.task)
  209 + /*
  210 + * That is the key to support write lock stealing: allows the
  211 + * task already on CPU to get the lock soon rather than put
  212 + * itself into sleep and waiting for system woke it or someone
  213 + * else in the head of the wait list up.
  214 + */
  215 + if (sem->activity == 0)
240 216 break;
241   - schedule();
242 217 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
  218 + raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
  219 + schedule();
  220 + raw_spin_lock_irqsave(&sem->wait_lock, flags);
243 221 }
  222 + /* got the lock */
  223 + sem->activity = -1;
  224 + list_del(&waiter.list);
244 225  
245   - tsk->state = TASK_RUNNING;
246   - out:
247   - ;
  226 + raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
248 227 }
249 228  
250 229 void __sched __down_write(struct rw_semaphore *sem)
... ... @@ -262,8 +241,8 @@
262 241  
263 242 raw_spin_lock_irqsave(&sem->wait_lock, flags);
264 243  
265   - if (sem->activity == 0 && list_empty(&sem->wait_list)) {
266   - /* granted */
  244 + if (sem->activity == 0) {
  245 + /* got the lock */
267 246 sem->activity = -1;
268 247 ret = 1;
269 248 }
lib/rwsem.c
... ... @@ -2,6 +2,8 @@
2 2 *
3 3 * Written by David Howells (dhowells@redhat.com).
4 4 * Derived from arch/i386/kernel/semaphore.c
  5 + *
  6 + * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
5 7 */
6 8 #include <linux/rwsem.h>
7 9 #include <linux/sched.h>
... ... @@ -60,7 +62,7 @@
60 62 struct rwsem_waiter *waiter;
61 63 struct task_struct *tsk;
62 64 struct list_head *next;
63   - signed long oldcount, woken, loop, adjustment;
  65 + signed long woken, loop, adjustment;
64 66  
65 67 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
66 68 if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
... ... @@ -72,30 +74,8 @@
72 74 */
73 75 goto out;
74 76  
75   - /* There's a writer at the front of the queue - try to grant it the
76   - * write lock. However, we only wake this writer if we can transition
77   - * the active part of the count from 0 -> 1
78   - */
79   - adjustment = RWSEM_ACTIVE_WRITE_BIAS;
80   - if (waiter->list.next == &sem->wait_list)
81   - adjustment -= RWSEM_WAITING_BIAS;
82   -
83   - try_again_write:
84   - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
85   - if (oldcount & RWSEM_ACTIVE_MASK)
86   - /* Someone grabbed the sem already */
87   - goto undo_write;
88   -
89   - /* We must be careful not to touch 'waiter' after we set ->task = NULL.
90   - * It is an allocated on the waiter's stack and may become invalid at
91   - * any time after that point (due to a wakeup from another source).
92   - */
93   - list_del(&waiter->list);
94   - tsk = waiter->task;
95   - smp_mb();
96   - waiter->task = NULL;
97   - wake_up_process(tsk);
98   - put_task_struct(tsk);
  77 + /* Wake up the writing waiter and let the task grab the sem: */
  78 + wake_up_process(waiter->task);
99 79 goto out;
100 80  
101 81 readers_only:
102 82  
103 83  
... ... @@ -157,12 +137,40 @@
157 137  
158 138 out:
159 139 return sem;
  140 +}
160 141  
161   - /* undo the change to the active count, but check for a transition
162   - * 1->0 */
163   - undo_write:
  142 +/* Try to get write sem, caller holds sem->wait_lock: */
  143 +static int try_get_writer_sem(struct rw_semaphore *sem,
  144 + struct rwsem_waiter *waiter)
  145 +{
  146 + struct rwsem_waiter *fwaiter;
  147 + long oldcount, adjustment;
  148 +
  149 + /* only steal when first waiter is writing */
  150 + fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
  151 + if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE))
  152 + return 0;
  153 +
  154 + adjustment = RWSEM_ACTIVE_WRITE_BIAS;
  155 + /* Only one waiter in the queue: */
  156 + if (fwaiter == waiter && waiter->list.next == &sem->wait_list)
  157 + adjustment -= RWSEM_WAITING_BIAS;
  158 +
  159 +try_again_write:
  160 + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
  161 + if (!(oldcount & RWSEM_ACTIVE_MASK)) {
  162 + /* No active lock: */
  163 + struct task_struct *tsk = waiter->task;
  164 +
  165 + list_del(&waiter->list);
  166 + smp_mb();
  167 + put_task_struct(tsk);
  168 + tsk->state = TASK_RUNNING;
  169 + return 1;
  170 + }
  171 + /* some one grabbed the sem already */
164 172 if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK)
165   - goto out;
  173 + return 0;
166 174 goto try_again_write;
167 175 }
168 176  
... ... @@ -210,6 +218,15 @@
210 218 for (;;) {
211 219 if (!waiter.task)
212 220 break;
  221 +
  222 + raw_spin_lock_irq(&sem->wait_lock);
  223 + /* Try to get the writer sem, may steal from the head writer: */
  224 + if (flags == RWSEM_WAITING_FOR_WRITE)
  225 + if (try_get_writer_sem(sem, &waiter)) {
  226 + raw_spin_unlock_irq(&sem->wait_lock);
  227 + return sem;
  228 + }
  229 + raw_spin_unlock_irq(&sem->wait_lock);
213 230 schedule();
214 231 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
215 232 }