Commit 60815cf2e05057db5b78e398d9734c493560b11e

Authored by Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/borntraeger/linux

Pull ACCESS_ONCE cleanup preparation from Christian Borntraeger:
 "kernel: Provide READ_ONCE and ASSIGN_ONCE

  As discussed on LKML (http://marc.info/?i=54611D86.4040306%40de.ibm.com),
  ACCESS_ONCE might fail with specific compilers for non-scalar
  accesses.

  Here is a set of patches to tackle that problem.

  The first patch introduces READ_ONCE and ASSIGN_ONCE.  If the data
  structure is larger than the machine word size, memcpy is used and a
  warning is emitted.  The next patches fix up several in-tree users of
  ACCESS_ONCE on non-scalar types.

  This does not yet contain a patch that forces ACCESS_ONCE to work only
  on scalar types.  This is targeted for the next merge window, as
  linux-next already contains new offenders regarding ACCESS_ONCE vs.
  non-scalar types"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/borntraeger/linux:
  s390/kvm: REPLACE barrier fixup with READ_ONCE
  arm/spinlock: Replace ACCESS_ONCE with READ_ONCE
  arm64/spinlock: Replace ACCESS_ONCE READ_ONCE
  mips/gup: Replace ACCESS_ONCE with READ_ONCE
  x86/gup: Replace ACCESS_ONCE with READ_ONCE
  x86/spinlock: Replace ACCESS_ONCE with READ_ONCE
  mm: replace ACCESS_ONCE with READ_ONCE or barriers
  kernel: Provide READ_ONCE and ASSIGN_ONCE
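
For readers following along, the failure mode being addressed can be sketched as
follows.  This is an illustrative, self-contained example, not code from this
merge; the arch_spinlock_t layout and the function below are simplified
stand-ins, and only the ACCESS_ONCE() definition matches the historical one in
include/linux/compiler.h:

    /* Historical definition from include/linux/compiler.h */
    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

    /* Simplified stand-in for a two-halfword ticket lock */
    typedef struct { unsigned short owner, next; } arch_spinlock_t;

    static int arch_spin_is_contended(arch_spinlock_t *lock)
    {
            /*
             * Because *lock is not a scalar, affected GCC releases may drop
             * the volatile qualifier from the cast above, so this load is no
             * longer guaranteed to happen exactly once (it may be refetched
             * or torn).  READ_ONCE(*lock) avoids this by copying the value
             * out through a volatile access of the matching scalar width.
             */
            arch_spinlock_t val = ACCESS_ONCE(*lock);

            return (val.next - val.owner) > 1;
    }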

Showing 10 changed files

arch/arm/include/asm/spinlock.h
... ... @@ -120,12 +120,12 @@
120 120  
121 121 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
122 122 {
123   - return !arch_spin_value_unlocked(ACCESS_ONCE(*lock));
  123 + return !arch_spin_value_unlocked(READ_ONCE(*lock));
124 124 }
125 125  
126 126 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
127 127 {
128   - struct __raw_tickets tickets = ACCESS_ONCE(lock->tickets);
  128 + struct __raw_tickets tickets = READ_ONCE(lock->tickets);
129 129 return (tickets.next - tickets.owner) > 1;
130 130 }
131 131 #define arch_spin_is_contended arch_spin_is_contended
arch/arm64/include/asm/spinlock.h
... ... @@ -99,12 +99,12 @@
99 99  
100 100 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
101 101 {
102   - return !arch_spin_value_unlocked(ACCESS_ONCE(*lock));
  102 + return !arch_spin_value_unlocked(READ_ONCE(*lock));
103 103 }
104 104  
105 105 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
106 106 {
107   - arch_spinlock_t lockval = ACCESS_ONCE(*lock);
  107 + arch_spinlock_t lockval = READ_ONCE(*lock);
108 108 return (lockval.next - lockval.owner) > 1;
109 109 }
110 110 #define arch_spin_is_contended arch_spin_is_contended
arch/mips/mm/gup.c
... ... @@ -30,7 +30,7 @@
30 30  
31 31 return pte;
32 32 #else
33   - return ACCESS_ONCE(*ptep);
  33 + return READ_ONCE(*ptep);
34 34 #endif
35 35 }
36 36  
arch/s390/kvm/gaccess.c
... ... @@ -227,12 +227,10 @@
227 227 goto out;
228 228 ic = &vcpu->kvm->arch.sca->ipte_control;
229 229 do {
230   - old = *ic;
231   - barrier();
  230 + old = READ_ONCE(*ic);
232 231 while (old.k) {
233 232 cond_resched();
234   - old = *ic;
235   - barrier();
  233 + old = READ_ONCE(*ic);
236 234 }
237 235 new = old;
238 236 new.k = 1;
... ... @@ -251,8 +249,7 @@
251 249 goto out;
252 250 ic = &vcpu->kvm->arch.sca->ipte_control;
253 251 do {
254   - old = *ic;
255   - barrier();
  252 + old = READ_ONCE(*ic);
256 253 new = old;
257 254 new.k = 0;
258 255 } while (cmpxchg(&ic->val, old.val, new.val) != old.val);
259 256  
... ... @@ -267,12 +264,10 @@
267 264  
268 265 ic = &vcpu->kvm->arch.sca->ipte_control;
269 266 do {
270   - old = *ic;
271   - barrier();
  267 + old = READ_ONCE(*ic);
272 268 while (old.kg) {
273 269 cond_resched();
274   - old = *ic;
275   - barrier();
  270 + old = READ_ONCE(*ic);
276 271 }
277 272 new = old;
278 273 new.k = 1;
... ... @@ -286,8 +281,7 @@
286 281  
287 282 ic = &vcpu->kvm->arch.sca->ipte_control;
288 283 do {
289   - old = *ic;
290   - barrier();
  284 + old = READ_ONCE(*ic);
291 285 new = old;
292 286 new.kh--;
293 287 if (!new.kh)
arch/x86/include/asm/spinlock.h
... ... @@ -92,7 +92,7 @@
92 92 unsigned count = SPIN_THRESHOLD;
93 93  
94 94 do {
95   - if (ACCESS_ONCE(lock->tickets.head) == inc.tail)
  95 + if (READ_ONCE(lock->tickets.head) == inc.tail)
96 96 goto out;
97 97 cpu_relax();
98 98 } while (--count);
... ... @@ -105,7 +105,7 @@
105 105 {
106 106 arch_spinlock_t old, new;
107 107  
108   - old.tickets = ACCESS_ONCE(lock->tickets);
  108 + old.tickets = READ_ONCE(lock->tickets);
109 109 if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
110 110 return 0;
111 111  
112 112  
... ... @@ -162,14 +162,14 @@
162 162  
163 163 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
164 164 {
165   - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
  165 + struct __raw_tickets tmp = READ_ONCE(lock->tickets);
166 166  
167 167 return tmp.tail != tmp.head;
168 168 }
169 169  
170 170 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
171 171 {
172   - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
  172 + struct __raw_tickets tmp = READ_ONCE(lock->tickets);
173 173  
174 174 return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
175 175 }
arch/x86/mm/gup.c
... ... @@ -15,7 +15,7 @@
15 15 static inline pte_t gup_get_pte(pte_t *ptep)
16 16 {
17 17 #ifndef CONFIG_X86_PAE
18   - return ACCESS_ONCE(*ptep);
  18 + return READ_ONCE(*ptep);
19 19 #else
20 20 /*
21 21 * With get_user_pages_fast, we walk down the pagetables without taking
include/linux/compiler.h
... ... @@ -186,6 +186,80 @@
186 186 # define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__)
187 187 #endif
188 188  
  189 +#include <uapi/linux/types.h>
  190 +
  191 +static __always_inline void data_access_exceeds_word_size(void)
  192 +#ifdef __compiletime_warning
  193 +__compiletime_warning("data access exceeds word size and won't be atomic")
  194 +#endif
  195 +;
  196 +
  197 +static __always_inline void data_access_exceeds_word_size(void)
  198 +{
  199 +}
  200 +
  201 +static __always_inline void __read_once_size(volatile void *p, void *res, int size)
  202 +{
  203 + switch (size) {
  204 + case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
  205 + case 2: *(__u16 *)res = *(volatile __u16 *)p; break;
  206 + case 4: *(__u32 *)res = *(volatile __u32 *)p; break;
  207 +#ifdef CONFIG_64BIT
  208 + case 8: *(__u64 *)res = *(volatile __u64 *)p; break;
  209 +#endif
  210 + default:
  211 + barrier();
  212 + __builtin_memcpy((void *)res, (const void *)p, size);
  213 + data_access_exceeds_word_size();
  214 + barrier();
  215 + }
  216 +}
  217 +
  218 +static __always_inline void __assign_once_size(volatile void *p, void *res, int size)
  219 +{
  220 + switch (size) {
  221 + case 1: *(volatile __u8 *)p = *(__u8 *)res; break;
  222 + case 2: *(volatile __u16 *)p = *(__u16 *)res; break;
  223 + case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
  224 +#ifdef CONFIG_64BIT
  225 + case 8: *(volatile __u64 *)p = *(__u64 *)res; break;
  226 +#endif
  227 + default:
  228 + barrier();
  229 + __builtin_memcpy((void *)p, (const void *)res, size);
  230 + data_access_exceeds_word_size();
  231 + barrier();
  232 + }
  233 +}
  234 +
  235 +/*
  236 + * Prevent the compiler from merging or refetching reads or writes. The
  237 + * compiler is also forbidden from reordering successive instances of
  238 + * READ_ONCE, ASSIGN_ONCE and ACCESS_ONCE (see below), but only when the
  239 + * compiler is aware of some particular ordering. One way to make the
  240 + * compiler aware of ordering is to put the two invocations of READ_ONCE,
  241 + * ASSIGN_ONCE or ACCESS_ONCE() in different C statements.
  242 + *
  243 + * In contrast to ACCESS_ONCE these two macros will also work on aggregate
  244 + * data types like structs or unions. If the size of the accessed data
  245 + * type exceeds the word size of the machine (e.g., 32 bits or 64 bits)
  246 + * READ_ONCE() and ASSIGN_ONCE() will fall back to memcpy and print a
  247 + * compile-time warning.
  248 + *
  249 + * Their two major use cases are: (1) Mediating communication between
  250 + * process-level code and irq/NMI handlers, all running on the same CPU,
  251 + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
  252 + * mutilate accesses that either do not require ordering or that interact
  253 + * with an explicit memory barrier or atomic instruction that provides the
  254 + * required ordering.
  255 + */
  256 +
  257 +#define READ_ONCE(x) \
  258 + ({ typeof(x) __val; __read_once_size(&x, &__val, sizeof(__val)); __val; })
  259 +
  260 +#define ASSIGN_ONCE(val, x) \
  261 + ({ typeof(x) __val; __val = val; __assign_once_size(&x, &__val, sizeof(__val)); __val; })
  262 +
189 263 #endif /* __KERNEL__ */
190 264  
191 265 #endif /* __ASSEMBLY__ */
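
To make the intended usage of the two new macros concrete, here is a small,
hedged sketch on an aggregate type.  The union below is an invented stand-in
(loosely modelled on the s390 ipte_control usage above); only READ_ONCE() and
ASSIGN_ONCE() themselves come from this patch:

    /* Invented example type: fits in one machine word on 64-bit */
    union ctl {
            unsigned long val;
            struct { unsigned char k; } f;
    };

    static union ctl shared_ctl;

    static void set_k_flag(void)
    {
            union ctl old, new;

            /* One volatile load of the whole union; no tearing, no refetch */
            old = READ_ONCE(shared_ctl);

            new = old;
            new.f.k = 1;

            /*
             * One volatile store of the new value (ASSIGN_ONCE(val, x) writes
             * val into x); types larger than a machine word would instead go
             * through memcpy and trigger the compile-time warning above.
             */
            ASSIGN_ONCE(new, shared_ctl);
    }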
mm/gup.c
... ... @@ -968,7 +968,7 @@
968 968  
969 969 pudp = pud_offset(&pgd, addr);
970 970 do {
971   - pud_t pud = ACCESS_ONCE(*pudp);
  971 + pud_t pud = READ_ONCE(*pudp);
972 972  
973 973 next = pud_addr_end(addr, end);
974 974 if (pud_none(pud))
mm/memory.c
... ... @@ -3195,7 +3195,16 @@
3195 3195 pte_t entry;
3196 3196 spinlock_t *ptl;
3197 3197  
3198   - entry = ACCESS_ONCE(*pte);
  3198 + /*
  3199 + * some architectures can have larger ptes than wordsize,
  3200 + * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
  3201 + * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
  3202 + * The code below just needs a consistent view for the ifs and
  3203 + * we later double check anyway with the ptl lock held. So here
  3204 + * a barrier will do.
  3205 + */
  3206 + entry = *pte;
  3207 + barrier();
3199 3208 if (!pte_present(entry)) {
3200 3209 if (pte_none(entry)) {
3201 3210 if (vma->vm_ops) {
mm/rmap.c
... ... @@ -583,7 +583,8 @@
583 583 * without holding anon_vma lock for write. So when looking for a
584 584 * genuine pmde (in which to find pte), test present and !THP together.
585 585 */
586   - pmde = ACCESS_ONCE(*pmd);
  586 + pmde = *pmd;
  587 + barrier();
587 588 if (!pmd_present(pmde) || pmd_trans_huge(pmde))
588 589 pmd = NULL;
589 590 out: