Commit dcc2e51a1fc91c2a631ed3ea5afed22c6aa13728

Authored by Raghavendra K T
Committed by Greg Kroah-Hartman
1 parent b6888abd99

x86/spinlocks/paravirt: Fix memory corruption on unlock

commit d6abfdb2022368d8c6c4be3f11a06656601a6cc2 upstream.

The paravirt spinlock code clears the slowpath flag after doing the unlock.
As explained by Linus, it currently does:

                prev = *lock;
                add_smp(&lock->tickets.head, TICKET_LOCK_INC);

                /* add_smp() is a full mb() */

                if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
                        __ticket_unlock_slowpath(lock, prev);

which is *exactly* the kind of thing you cannot do with spinlocks,
because after you've done the "add_smp()" and released the spinlock
for the fast-path, you can't access the spinlock any more: a fast-path
locker might come in and then release the whole data structure.

Linus suggested that we should not do any writes to the lock after unlock(),
and that we can move the slowpath clearing to the fastpath lock.

So this patch implements the fix with:

 1. Move the slowpath flag to the head ticket (Oleg):
    Unlocked locks don't care about the slowpath flag; therefore we can keep
    it set after the last unlock and clear it again on the first (try)lock.
    This removes the write after unlock. Note that keeping the slowpath flag
    set can result in unnecessary kicks.
    By moving the slowpath flag from the tail to the head ticket we also avoid
    the need to access both the head and tail tickets on unlock.

 2. Use xadd to avoid the read/write after unlock that checks whether an
    unlock_kick is needed (Linus):
    We further avoid the need for a read-after-release by using xadd: the
    previous head value returned by the xadd includes the slowpath flag and
    indicates whether we need to do PV kicking of suspended spinners. On
    modern chips xadd isn't (much) more expensive than an add plus a load.
    A condensed userspace sketch of the resulting scheme follows this list.
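
A condensed userspace model of the resulting scheme (illustrative sketch only,
not the kernel code; ticket_t, struct ticketlock, tickets_equal() and unlock()
are hypothetical stand-ins for __ticket_t, arch_spinlock_t, __tickets_equal()
and arch_spin_unlock(), and the GCC __atomic builtin stands in for the
kernel's xadd):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint16_t ticket_t;
    #define TICKET_SLOWPATH_FLAG ((ticket_t)1)   /* bit 0 of the head ticket */
    #define TICKET_LOCK_INC      ((ticket_t)2)   /* tickets advance in steps of 2 */

    struct ticketlock {
            ticket_t head;
            ticket_t tail;
    };

    /* compare two tickets while ignoring the slowpath flag bit */
    static int tickets_equal(ticket_t one, ticket_t two)
    {
            return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
    }

    static void unlock(struct ticketlock *lock)
    {
            /*
             * Release the lock and fetch the previous head in one atomic op;
             * the kernel's xadd is a full barrier, modelled here as seq_cst.
             * Nothing touches the lock after this add: the kick decision uses
             * only the returned value.
             */
            ticket_t prev = __atomic_fetch_add(&lock->head, TICKET_LOCK_INC,
                                               __ATOMIC_SEQ_CST);

            if (prev & TICKET_SLOWPATH_FLAG) {
                    /* a waiter went to sleep: kick the owner of the next ticket;
                       the stale flag left in head is cleared by the next locker */
                    prev &= ~TICKET_SLOWPATH_FLAG;
                    printf("kick waiter holding ticket %d\n", prev + TICKET_LOCK_INC);
            }
    }

    int main(void)
    {
            /* owner holds ticket 4, one sleeping waiter holds ticket 6 (flag set) */
            struct ticketlock lock = { .head = 4 | TICKET_SLOWPATH_FLAG, .tail = 8 };

            unlock(&lock);  /* prints: kick waiter holding ticket 6 */

            /* the kicked waiter is now the owner, so the lock is still held */
            printf("locked: %d\n", !tickets_equal(lock.head, lock.tail));
            return 0;
    }

The model keeps the two properties the patch is after: the flag lives in the
head ticket, so unlock never needs to look at the tail, and the single
fetch-and-add both releases the lock and returns everything needed to decide
on a kick, so nothing reads or writes the lock after it has been released.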

Result:
 setup: 16-core Sandy Bridge (32 CPUs with HT), 8GB, 16-vCPU guest
 benchmark  overcommit  %improve
 kernbench  1x          -0.13
 kernbench  2x           0.02
 dbench     1x          -1.77
 dbench     2x          -0.63

[Jeremy: Hinted missing TICKET_LOCK_INC for kick]
[Oleg: Moved slowpath flag to head, ticket_equals idea]
[PeterZ: Added detailed changelog]

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Jones <drjones@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Fernando Luis Vázquez Cao <fernando_b1@lab.ntt.co.jp>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Waiman Long <Waiman.Long@hp.com>
Cc: a.ryabinin@samsung.com
Cc: dave@stgolabs.net
Cc: hpa@zytor.com
Cc: jasowang@redhat.com
Cc: jeremy@goop.org
Cc: paul.gortmaker@windriver.com
Cc: riel@redhat.com
Cc: tglx@linutronix.de
Cc: waiman.long@hp.com
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/20150215173043.GA7471@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

3 files changed, 64 insertions(+), 56 deletions(-)

arch/x86/include/asm/spinlock.h
@@ -46,7 +46,7 @@
 
 static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
 {
-        set_bit(0, (volatile unsigned long *)&lock->tickets.tail);
+        set_bit(0, (volatile unsigned long *)&lock->tickets.head);
 }
 
 #else /* !CONFIG_PARAVIRT_SPINLOCKS */
@@ -60,10 +60,30 @@
 }
 
 #endif /* CONFIG_PARAVIRT_SPINLOCKS */
+static inline int __tickets_equal(__ticket_t one, __ticket_t two)
+{
+        return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
+}
 
+static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock,
+                                                     __ticket_t head)
+{
+        if (head & TICKET_SLOWPATH_FLAG) {
+                arch_spinlock_t old, new;
+
+                old.tickets.head = head;
+                new.tickets.head = head & ~TICKET_SLOWPATH_FLAG;
+                old.tickets.tail = new.tickets.head + TICKET_LOCK_INC;
+                new.tickets.tail = old.tickets.tail;
+
+                /* try to clear slowpath flag when there are no contenders */
+                cmpxchg(&lock->head_tail, old.head_tail, new.head_tail);
+        }
+}
+
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
-        return lock.tickets.head == lock.tickets.tail;
+        return __tickets_equal(lock.tickets.head, lock.tickets.tail);
 }
 
 /*
@@ -87,18 +107,21 @@
         if (likely(inc.head == inc.tail))
                 goto out;
 
-        inc.tail &= ~TICKET_SLOWPATH_FLAG;
         for (;;) {
                 unsigned count = SPIN_THRESHOLD;
 
                 do {
-                        if (READ_ONCE(lock->tickets.head) == inc.tail)
-                                goto out;
+                        inc.head = READ_ONCE(lock->tickets.head);
+                        if (__tickets_equal(inc.head, inc.tail))
+                                goto clear_slowpath;
                         cpu_relax();
                 } while (--count);
                 __ticket_lock_spinning(lock, inc.tail);
         }
-out:    barrier(); /* make sure nothing creeps before the lock is taken */
+clear_slowpath:
+        __ticket_check_and_clear_slowpath(lock, inc.head);
+out:
+        barrier(); /* make sure nothing creeps before the lock is taken */
 }
 
 static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
@@ -106,56 +129,30 @@
         arch_spinlock_t old, new;
 
         old.tickets = READ_ONCE(lock->tickets);
-        if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
+        if (!__tickets_equal(old.tickets.head, old.tickets.tail))
                 return 0;
 
         new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
+        new.head_tail &= ~TICKET_SLOWPATH_FLAG;
 
         /* cmpxchg is a full barrier, so nothing can move before it */
         return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
 }
 
-static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock,
-                                            arch_spinlock_t old)
-{
-        arch_spinlock_t new;
-
-        BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
-
-        /* Perform the unlock on the "before" copy */
-        old.tickets.head += TICKET_LOCK_INC;
-
-        /* Clear the slowpath flag */
-        new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT);
-
-        /*
-         * If the lock is uncontended, clear the flag - use cmpxchg in
-         * case it changes behind our back though.
-         */
-        if (new.tickets.head != new.tickets.tail ||
-            cmpxchg(&lock->head_tail, old.head_tail,
-                    new.head_tail) != old.head_tail) {
-                /*
-                 * Lock still has someone queued for it, so wake up an
-                 * appropriate waiter.
-                 */
-                __ticket_unlock_kick(lock, old.tickets.head);
-        }
-}
-
 static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
         if (TICKET_SLOWPATH_FLAG &&
-            static_key_false(&paravirt_ticketlocks_enabled)) {
-                arch_spinlock_t prev;
+                static_key_false(&paravirt_ticketlocks_enabled)) {
+                __ticket_t head;
 
-                prev = *lock;
-                add_smp(&lock->tickets.head, TICKET_LOCK_INC);
+                BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
 
-                /* add_smp() is a full mb() */
+                head = xadd(&lock->tickets.head, TICKET_LOCK_INC);
 
-                if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
-                        __ticket_unlock_slowpath(lock, prev);
+                if (unlikely(head & TICKET_SLOWPATH_FLAG)) {
+                        head &= ~TICKET_SLOWPATH_FLAG;
+                        __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC));
+                }
         } else
                 __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
 }
@@ -164,14 +161,15 @@
 {
         struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-        return tmp.tail != tmp.head;
+        return !__tickets_equal(tmp.tail, tmp.head);
 }
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
         struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-        return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
+        tmp.head &= ~TICKET_SLOWPATH_FLAG;
+        return (tmp.tail - tmp.head) > TICKET_LOCK_INC;
 }
 #define arch_spin_is_contended arch_spin_is_contended
 
@@ -183,16 +181,16 @@
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-        __ticket_t head = ACCESS_ONCE(lock->tickets.head);
+        __ticket_t head = READ_ONCE(lock->tickets.head);
 
         for (;;) {
-                struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+                struct __raw_tickets tmp = READ_ONCE(lock->tickets);
                 /*
                  * We need to check "unlocked" in a loop, tmp.head == head
                  * can be false positive because of overflow.
                  */
-                if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) ||
-                    tmp.head != head)
+                if (__tickets_equal(tmp.head, tmp.tail) ||
+                    !__tickets_equal(tmp.head, head))
                         break;
 
                 cpu_relax();
arch/x86/kernel/kvm.c
@@ -609,7 +609,7 @@
         u8 ret;
         u8 old;
 
-        old = ACCESS_ONCE(zero_stats);
+        old = READ_ONCE(zero_stats);
         if (unlikely(old)) {
                 ret = cmpxchg(&zero_stats, old, 0);
                 /* This ensures only one fellow resets the stat */
@@ -727,6 +727,7 @@
         int cpu;
         u64 start;
         unsigned long flags;
+        __ticket_t head;
 
         if (in_nmi())
                 return;
@@ -768,11 +769,15 @@
          */
         __ticket_enter_slowpath(lock);
 
+        /* make sure enter_slowpath, which is atomic does not cross the read */
+        smp_mb__after_atomic();
+
         /*
          * check again make sure it didn't become free while
          * we weren't looking.
          */
-        if (ACCESS_ONCE(lock->tickets.head) == want) {
+        head = READ_ONCE(lock->tickets.head);
+        if (__tickets_equal(head, want)) {
                 add_stats(TAKEN_SLOW_PICKUP, 1);
                 goto out;
         }
@@ -803,8 +808,8 @@
         add_stats(RELEASED_SLOW, 1);
         for_each_cpu(cpu, &waiting_cpus) {
                 const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
-                if (ACCESS_ONCE(w->lock) == lock &&
-                    ACCESS_ONCE(w->want) == ticket) {
+                if (READ_ONCE(w->lock) == lock &&
+                    READ_ONCE(w->want) == ticket) {
                         add_stats(RELEASED_SLOW_KICKED, 1);
                         kvm_kick_cpu(cpu);
                         break;
arch/x86/xen/spinlock.c
@@ -41,7 +41,7 @@
 static inline void check_zero(void)
 {
         u8 ret;
-        u8 old = ACCESS_ONCE(zero_stats);
+        u8 old = READ_ONCE(zero_stats);
         if (unlikely(old)) {
                 ret = cmpxchg(&zero_stats, old, 0);
                 /* This ensures only one fellow resets the stat */
@@ -112,6 +112,7 @@
         struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting);
         int cpu = smp_processor_id();
         u64 start;
+        __ticket_t head;
         unsigned long flags;
 
         /* If kicker interrupts not initialized yet, just spin */
@@ -159,11 +160,15 @@
          */
         __ticket_enter_slowpath(lock);
 
+        /* make sure enter_slowpath, which is atomic does not cross the read */
+        smp_mb__after_atomic();
+
         /*
          * check again make sure it didn't become free while
         * we weren't looking
          */
-        if (ACCESS_ONCE(lock->tickets.head) == want) {
+        head = READ_ONCE(lock->tickets.head);
+        if (__tickets_equal(head, want)) {
                 add_stats(TAKEN_SLOW_PICKUP, 1);
                 goto out;
         }
@@ -204,8 +209,8 @@
         const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
 
         /* Make sure we read lock before want */
-        if (ACCESS_ONCE(w->lock) == lock &&
-            ACCESS_ONCE(w->want) == next) {
+        if (READ_ONCE(w->lock) == lock &&
+            READ_ONCE(w->want) == next) {
                 add_stats(RELEASED_SLOW_KICKED, 1);
                 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
                 break;