Commit 2239aff6ab2b95af1f628eee7a809f21c41605b3

Authored by Nicolas Pitre
Committed by Lennert Buytenhek
1 parent 4c4925c1f4

[ARM] cache align destination pointer when copying memory for some processors

The ARM memory copy functions had a (disabled) provision for aligning
the source pointer before loading registers with data.  It turns out
that aligning the _destination_ pointer is much more useful, as the
read side is already sufficiently helped by the use of preload.

So this changes the definition of the CALGN() macro to target the
destination pointer instead, and turns it on for Feroceon processors,
where the gain is very noticeable.

Signed-off-by: Nicolas Pitre <nico@marvell.com>
Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
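
For illustration, a minimal C sketch of the destination-alignment idea
(not the kernel assembly; the 32-byte line size matches the patch, and
the function name is made up): copy a short head so the destination
reaches a cache line boundary, then do the bulk of the copy with the
destination line-aligned, which is roughly what the CALGN() path in the
diff below enables for the store-multiple loops.

    #include <stdint.h>
    #include <string.h>

    /* Illustrative sketch only: align the *destination* to a 32-byte
     * cache line before the bulk copy.
     */
    static void *copy_dst_aligned(void *dst, const void *src, size_t n)
    {
            uint8_t *d = dst;
            const uint8_t *s = src;
            size_t head = (32 - ((uintptr_t)d & 31)) & 31;  /* bytes to next line */

            if (head && n > head) {         /* only worthwhile for long copies */
                    memcpy(d, s, head);     /* short unaligned head */
                    d += head;
                    s += head;
                    n -= head;
            }
            memcpy(d, s, n);                /* bulk copy; dst now line-aligned */
            return dst;
    }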

Showing 3 changed files with 19 additions and 20 deletions

arch/arm/lib/copy_template.S
... ... @@ -13,14 +13,6 @@
13 13 */
14 14  
15 15 /*
16   - * This can be used to enable code to cacheline align the source pointer.
17   - * Experiments on tested architectures (StrongARM and XScale) didn't show
18   - * this a worthwhile thing to do. That might be different in the future.
19   - */
20   -//#define CALGN(code...) code
21   -#define CALGN(code...)
22   -
23   -/*
24 16 * Theory of operation
25 17 * -------------------
26 18 *
... ... @@ -82,7 +74,7 @@
82 74 stmfd sp!, {r5 - r8}
83 75 blt 5f
84 76  
85   - CALGN( ands ip, r1, #31 )
  77 + CALGN( ands ip, r0, #31 )
86 78 CALGN( rsb r3, ip, #32 )
87 79 CALGN( sbcnes r4, r3, r2 ) @ C is always set here
88 80 CALGN( bcs 2f )
... ... @@ -168,7 +160,7 @@
168 160 subs r2, r2, #28
169 161 blt 14f
170 162  
171   - CALGN( ands ip, r1, #31 )
  163 + CALGN( ands ip, r0, #31 )
172 164 CALGN( rsb ip, ip, #32 )
173 165 CALGN( sbcnes r4, ip, r2 ) @ C is always set here
174 166 CALGN( subcc r2, r2, ip )
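
One reading of the new prologue, as a hypothetical C helper (the helper
name and the exact skip condition are my interpretation of the
sbcnes/bcs pair): ip is the destination's offset within its 32-byte
cache line, r3 is how many bytes are needed to reach the next line
boundary, and the step is skipped when the destination is already
aligned or too few bytes remain.

    /* Hypothetical helper (illustrative only) mirroring the CALGN lines
     * above: how many bytes to copy before the destination (r0) reaches
     * a 32-byte boundary, or 0 when the alignment step is skipped.
     */
    static unsigned long calgn_head(unsigned long dst, unsigned long len)
    {
            unsigned long ip = dst & 31;    /* ands ip, r0, #31 */
            unsigned long head = 32 - ip;   /* rsb  r3, ip, #32 */

            if (ip == 0 || head >= len)     /* sbcnes + bcs 2f: not worth it */
                    return 0;
            return head;    /* e.g. dst = 0x1014: ip = 20, head = 12 bytes */
    }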
arch/arm/lib/memmove.S
... ... @@ -13,14 +13,6 @@
13 13 #include <linux/linkage.h>
14 14 #include <asm/assembler.h>
15 15  
16   -/*
17   - * This can be used to enable code to cacheline align the source pointer.
18   - * Experiments on tested architectures (StrongARM and XScale) didn't show
19   - * this a worthwhile thing to do. That might be different in the future.
20   - */
21   -//#define CALGN(code...) code
22   -#define CALGN(code...)
23   -
24 16 .text
25 17  
26 18 /*
... ... @@ -55,7 +47,7 @@
55 47 stmfd sp!, {r5 - r8}
56 48 blt 5f
57 49  
58   - CALGN( ands ip, r1, #31 )
  50 + CALGN( ands ip, r0, #31 )
59 51 CALGN( sbcnes r4, ip, r2 ) @ C is always set here
60 52 CALGN( bcs 2f )
61 53 CALGN( adr r4, 6f )
... ... @@ -139,7 +131,7 @@
139 131 subs r2, r2, #28
140 132 blt 14f
141 133  
142   - CALGN( ands ip, r1, #31 )
  134 + CALGN( ands ip, r0, #31 )
143 135 CALGN( sbcnes r4, ip, r2 ) @ C is always set here
144 136 CALGN( subcc r2, r2, ip )
145 137 CALGN( bcc 15f )
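
The memmove version copies downwards from the end of the buffers, so
there is no rsb here: the destination pointer has already been advanced
past the end of the region, and its offset within a 32-byte line is
itself the head length. A hypothetical helper under that reading
(illustrative only):

    /* Backwards-copy variant: r0 points just past the end of the
     * destination, so its low five bits are directly the head length.
     */
    static unsigned long calgn_head_backwards(unsigned long dst_end,
                                              unsigned long len)
    {
            unsigned long head = dst_end & 31;      /* ands ip, r0, #31 */

            if (head == 0 || head >= len)           /* sbcnes + bcs 2f */
                    return 0;
            return head;    /* bytes to store downwards before a line boundary */
    }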
include/asm-arm/assembler.h
... ... @@ -56,6 +56,21 @@
56 56 #endif
57 57  
58 58 /*
  59 + * This can be used to enable code to cacheline align the destination
  60 + * pointer when bulk writing to memory. Experiments on StrongARM and
  61 + * XScale didn't show this a worthwhile thing to do when the cache is not
  62 + * set to write-allocate (this would need further testing on XScale when WA
  63 + * is used).
  64 + *
  65 + * On Feroceon there is much to gain however, regardless of cache mode.
  66 + */
  67 +#ifdef CONFIG_CPU_FEROCEON
  68 +#define CALGN(code...) code
  69 +#else
  70 +#define CALGN(code...)
  71 +#endif
  72 +
  73 +/*
59 74 * Enable and disable interrupts
60 75 */
61 76 #if __LINUX_ARM_ARCH__ >= 6
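
The macro gates the alignment instructions at preprocessing time, so
builds without CONFIG_CPU_FEROCEON assemble exactly the code they had
before this patch. A stand-alone toy of the mechanism (not kernel code;
CONFIG_CPU_FEROCEON is defined locally just for the demonstration):

    #include <stdio.h>

    #define CONFIG_CPU_FEROCEON     /* assumption: pretend this is a Feroceon build */

    #ifdef CONFIG_CPU_FEROCEON
    #define CALGN(code...) code     /* keep the wrapped statements */
    #else
    #define CALGN(code...)          /* drop them at preprocessing time */
    #endif

    int main(void)
    {
            int head = 0;

            CALGN(head = 32;)       /* kept only when CONFIG_CPU_FEROCEON is set */
            printf("head = %d\n", head);
            return 0;
    }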