Commit ce61cdc270a5e0dd18057bbf29bd3471abccbda8
1 parent: d7c9661115
Exists in smarc-imx_3.14.28_1.0.0_ga and in 1 other branch
tile: make __write_once a synonym for __read_mostly
This was really only useful for TILE64 when we mapped the kernel data with small pages. Now we use a huge page and we really don't want to map different parts of the kernel data in different ways.

We retain the __write_once name in case we want to bring it back to life at some point in the future.

Note that this change uncovered a latent bug where the "smp_topology" variable happened to always be aligned mod 8 so we could store two "int" values at once, but when we eliminated __write_once it ended up only aligned mod 4. Fix with an explicit annotation.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Showing 4 changed files with 17 additions and 27 deletions
arch/tile/include/asm/cache.h
@@ -49,10 +49,17 @@
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 /*
- * Attribute for data that is kept read/write coherent until the end of
- * initialization, then bumped to read/only incoherent for performance.
+ * Originally we used small TLB pages for kernel data and grouped some
+ * things together as "write once", enforcing the property at the end
+ * of initialization by making those pages read-only and non-coherent.
+ * This allowed better cache utilization since cache inclusion did not
+ * need to be maintained.  However, to do this requires an extra TLB
+ * entry, which on balance is more of a performance hit than the
+ * non-coherence is a performance gain, so we now just make "read
+ * mostly" and "write once" be synonyms.  We keep the attribute
+ * separate in case we change our minds at a future date.
  */
-#define __write_once __attribute__((__section__(".w1data")))
+#define __write_once __read_mostly
 
 #endif /* _ASM_TILE_CACHE_H */
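For illustration, here is a minimal user-space sketch of what the macro above does: a section attribute groups a variable into a dedicated ELF section, and __write_once is now just another name for it. The my_* macro names and the boot_config variable are invented for this sketch; only the .data..read_mostly section name comes from the patch.

#include <stdio.h>

/* Mirror the kernel macros: place data in its own section, and make
 * the "write once" attribute a plain synonym, as the patch does. */
#define my_read_mostly __attribute__((__section__(".data..read_mostly")))
#define my_write_once  my_read_mostly

/* Written once during startup, read frequently afterwards. */
static int boot_config my_write_once = 42;

int main(void)
{
    printf("boot_config = %d\n", boot_config);   /* placed in .data..read_mostly */
    return 0;
}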
arch/tile/kernel/smp.c
@@ -22,7 +22,11 @@
 #include <asm/cacheflush.h>
 #include <asm/homecache.h>
 
-HV_Topology smp_topology __write_once;
+/*
+ * We write to width and height with a single store in head_NN.S,
+ * so make the variable aligned to "long".
+ */
+HV_Topology smp_topology __write_once __aligned(sizeof(long));
 EXPORT_SYMBOL(smp_topology);
 
 #if CHIP_HAS_IPI()
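The latent bug from the commit message can be shown with a small sketch. The struct and function below are illustrative stand-ins (mirroring HV_Topology's adjacent int width/height fields), not the real definitions: a single 8-byte store can only fill both 4-byte fields if the object is aligned mod 8, which the explicit __aligned(sizeof(long)) now guarantees instead of relying on section placement.

#include <stdint.h>

/* Stand-in for HV_Topology: two adjacent 32-bit fields. */
struct topo {
    int32_t width;
    int32_t height;
} __attribute__((aligned(sizeof(long))));   /* the fix: force mod-8 alignment */

static struct topo smp_topo;                /* stand-in for smp_topology */

/* C analogue of the single 64-bit store done in head_NN.S; it only
 * behaves because the object above is 8-byte aligned. */
void set_topology(uint64_t packed_width_height)
{
    *(volatile uint64_t *)&smp_topo = packed_width_height;
}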
arch/tile/kernel/vmlinux.lds.S
@@ -74,20 +74,8 @@
 __init_end = .;
 
 _sdata = .; /* Start of data section */
-
 RO_DATA_SECTION(PAGE_SIZE)
-
- /* initially writeable, then read-only */
- . = ALIGN(PAGE_SIZE);
- __w1data_begin = .;
- .w1data : AT(ADDR(.w1data) - LOAD_OFFSET) {
-   VMLINUX_SYMBOL(__w1data_begin) = .;
-   *(.w1data)
-   VMLINUX_SYMBOL(__w1data_end) = .;
- }
-
 RW_DATA_SECTION(L2_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
-
 _edata = .;
 
 EXCEPTION_TABLE(L2_CACHE_BYTES)
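With the .w1data output section gone, the only boundary symbols left for this region are the generic ones, such as __end_rodata emitted by RO_DATA_SECTION(). As a sketch of the general pattern, assuming the usual asm-generic/sections.h-style declarations, C code sees linker-script symbols as arrays whose addresses mark section boundaries; the helper below is hypothetical.

/* Section-boundary symbols provided by the linker script; only their
 * addresses are meaningful, so they are declared as arrays. */
extern char __end_rodata[];
extern char _sdata[], _edata[];

/* Hypothetical helper: is an address inside the read/write kernel data
 * that follows the read-only section? */
static inline int in_rw_kernel_data(unsigned long address)
{
    return address >= (unsigned long)__end_rodata &&
           address < (unsigned long)_edata;
}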
arch/tile/mm/init.c
@@ -271,21 +271,13 @@
 return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
 
 /*
- * Make the w1data homed like heap to start with, to avoid
- * making it part of the page-striped data area when we're just
- * going to convert it to read-only soon anyway.
- */
-if (address >= (ulong)__w1data_begin && address < (ulong)__w1data_end)
-	return construct_pgprot(PAGE_KERNEL, initial_heap_home());
-
-/*
  * Otherwise we just hand out consecutive cpus.  To avoid
  * requiring this function to hold state, we just walk forward from
  * _sdata by PAGE_SIZE, skipping the readonly and init data, to reach
  * the requested address, while walking cpu home around kdata_mask.
  * This is typically no more than a dozen or so iterations.
  */
-page = (((ulong)__w1data_end) + PAGE_SIZE - 1) & PAGE_MASK;
+page = (((ulong)__end_rodata) + PAGE_SIZE - 1) & PAGE_MASK;
 BUG_ON(address < page || address >= (ulong)_end);
 cpu = cpumask_first(&kdata_mask);
 for (; page < address; page += PAGE_SIZE) {
@@ -980,8 +972,7 @@
 const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
 
 /*
- * Evict the dirty initdata on the boot cpu, evict the w1data
- * wherever it's homed, and evict all the init code everywhere.
+ * Evict the cache on all cores to avoid incoherence.
  * We are guaranteed that no one will touch the init pages any more.
  */
 homecache_evict(&cpu_cacheable_map);
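The page arithmetic in the first init.c hunk above is the standard round-up-to-a-page-boundary idiom. A small standalone sketch, with an assumed 4 KB page size purely for illustration:

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL              /* assumed size for this sketch */
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Round an address up to the next page boundary, as done for
 * __end_rodata before walking the kernel-data pages. */
static inline uintptr_t round_up_to_page(uintptr_t addr)
{
    return (addr + PAGE_SIZE - 1) & PAGE_MASK;
}

int main(void)
{
    assert(round_up_to_page(0x1000) == 0x1000);   /* already aligned */
    assert(round_up_to_page(0x1001) == 0x2000);   /* bumped to next page */
    return 0;
}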