Commit a8ddac7e53e89cb877965097d05adfeb1c91def3
Committed by: Linus Torvalds
1 parent: 5a439c5657
Exists in: master and in 39 other branches
mutex: speed up generic mutex implementations
- atomic operations which both modify the variable and return something imply
  full smp memory barriers before and after the memory operations involved
  (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
  they don't modify the target). See Documentation/atomic_ops.txt.
  So remove extra barriers and branches.

- All architectures support atomic_cmpxchg. This has no relation to
  __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally

This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
to 203 cycles on a ppc970 system.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
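To make the barrier reasoning concrete, here is a minimal userspace sketch of the dec/inc fastpath, built on the GCC/Clang __atomic builtins rather than the kernel's atomic_t API; the demo_* names are invented for illustration and the slowpath stubs do nothing. The point it shows is the one the commit relies on: a fully ordered read-modify-write (the analogue of atomic_dec_return()/atomic_inc_return()) both modifies the counter and returns a value, so no separate smp_mb() is needed on either side of the fastpath.

/* Illustrative userspace analogue of asm-generic/mutex-dec.h (not kernel code). */
#include <stdio.h>

struct demo_mutex {
        int count;      /* 1: unlocked, 0: locked, negative: locked, waiters queued */
};

/* Hypothetical slowpath stand-ins; the real kernel would block or wake waiters here. */
static void demo_lock_slowpath(struct demo_mutex *m)   { (void)m; }
static void demo_unlock_slowpath(struct demo_mutex *m) { (void)m; }

static void demo_lock(struct demo_mutex *m)
{
        /*
         * Fully ordered fetch-and-decrement: like atomic_dec_return(), it
         * modifies the variable and returns the new value, so it already
         * provides the ordering the lock needs - no extra fence.
         */
        if (__atomic_sub_fetch(&m->count, 1, __ATOMIC_SEQ_CST) < 0)
                demo_lock_slowpath(m);
}

static void demo_unlock(struct demo_mutex *m)
{
        /* Likewise, the increment itself orders the critical section. */
        if (__atomic_add_fetch(&m->count, 1, __ATOMIC_SEQ_CST) <= 0)
                demo_unlock_slowpath(m);
}

int main(void)
{
        struct demo_mutex m = { .count = 1 };

        demo_lock(&m);
        printf("locked, count = %d\n", m.count);        /* 0 */
        demo_unlock(&m);
        printf("unlocked, count = %d\n", m.count);      /* 1 */
        return 0;
}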
Showing 2 changed files with 3 additions and 32 deletions
include/asm-generic/mutex-dec.h
@@ -22,8 +22,6 @@
 {
        if (unlikely(atomic_dec_return(count) < 0))
                fail_fn(count);
-       else
-               smp_mb();
 }
 
 /**
@@ -41,10 +39,7 @@
 {
        if (unlikely(atomic_dec_return(count) < 0))
                return fail_fn(count);
-       else {
-               smp_mb();
-               return 0;
-       }
+       return 0;
 }
 
 /**
@@ -63,7 +58,6 @@
 static inline void
 __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-       smp_mb();
        if (unlikely(atomic_inc_return(count) <= 0))
                fail_fn(count);
 }
@@ -88,25 +82,9 @@
 static inline int
 __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
-       /*
-        * We have two variants here. The cmpxchg based one is the best one
-        * because it never induce a false contention state. It is included
-        * here because architectures using the inc/dec algorithms over the
-        * xchg ones are much more likely to support cmpxchg natively.
-        *
-        * If not we fall back to the spinlock based variant - that is
-        * just as efficient (and simpler) as a 'destructive' probing of
-        * the mutex state would be.
-        */
-#ifdef __HAVE_ARCH_CMPXCHG
-       if (likely(atomic_cmpxchg(count, 1, 0) == 1)) {
-               smp_mb();
+       if (likely(atomic_cmpxchg(count, 1, 0) == 1))
                return 1;
-       }
        return 0;
-#else
-       return fail_fn(count);
-#endif
 }
 
 #endif
include/asm-generic/mutex-xchg.h
@@ -27,8 +27,6 @@
 {
        if (unlikely(atomic_xchg(count, 0) != 1))
                fail_fn(count);
-       else
-               smp_mb();
 }
 
 /**
@@ -46,10 +44,7 @@
 {
        if (unlikely(atomic_xchg(count, 0) != 1))
                return fail_fn(count);
-       else {
-               smp_mb();
-               return 0;
-       }
+       return 0;
 }
 
 /**
@@ -67,7 +62,6 @@
 static inline void
 __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-       smp_mb();
        if (unlikely(atomic_xchg(count, 1) != 0))
                fail_fn(count);
 }
@@ -110,7 +104,6 @@
                if (prev < 0)
                        prev = 0;
        }
-       smp_mb();
 
        return prev;
 }
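The trylock touched by this last hunk is the subtlest of the xchg fastpaths: if the exchange swaps out a "contended" value rather than a free lock, it has to put that value back. As a minimal sketch only (userspace, GCC/Clang __atomic builtins, invented demo_* names, not the kernel source), the post-patch behaviour looks roughly like this, with the fully ordered exchange itself providing the ordering that the removed smp_mb() used to supply:

/* Illustrative userspace analogue of the xchg-based trylock (not kernel code). */
#include <stdio.h>

/* 1: unlocked, 0: locked, negative: locked with waiters queued. */
static int demo_count = 1;

/* Returns 1 if the lock was taken, 0 otherwise. */
static int demo_trylock(int *count)
{
        int prev = __atomic_exchange_n(count, 0, __ATOMIC_SEQ_CST);

        if (prev < 0) {
                /*
                 * We swapped out a "contended" marker, not a free lock:
                 * restore it so the waiter bookkeeping stays intact. If
                 * the lock happened to become free meanwhile, we own it.
                 */
                prev = __atomic_exchange_n(count, prev, __ATOMIC_SEQ_CST);
                if (prev < 0)
                        prev = 0;
        }
        /* No trailing barrier: the exchange is already fully ordered. */
        return prev;
}

int main(void)
{
        printf("first trylock:  %d\n", demo_trylock(&demo_count));     /* 1 */
        printf("second trylock: %d\n", demo_trylock(&demo_count));     /* 0 */
        return 0;
}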