Commit 5fb7dc37dc16fbc8b80d81318a582201ef7e280d

Authored by Fenghua Yu
Committed by Linus Torvalds
1 parent 3d7e33825d

define new percpu interface for shared data

The per cpu data section contains two types of data: one set that is accessed
exclusively by the local cpu, and another set that is per cpu but is also
shared with remote cpus.  In the current kernel, these two sets are not
clearly separated.  As a result, a cacheline can end up shared between the
two sets of data, which leads to unnecessary bouncing of that cacheline
between cpus.

One way to fix the problem is to cacheline align the remotely accessed per
cpu data, both at the beginning and at the end.  Because of the padding at
both ends, this would likely waste some memory, and the interface needed to
achieve it is not clean.
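
For illustration only (not part of this patch; the struct and field names are
made up): aligning both ends of an element can be expressed by putting the
cacheline alignment on the type, which aligns the start of each element and
also pads sizeof() up to a cacheline multiple, so a small element can waste
most of a cacheline:

    /* hypothetical "align both ends" variant */
    struct shared_stats {
            unsigned long hits;     /* a few bytes of payload ... */
    } ____cacheline_aligned_in_smp; /* ... padded to a full cacheline */

    static DEFINE_PER_CPU(struct shared_stats, stats);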

This patch:

Moves the remotely accessed per cpu data (currently marked
____cacheline_aligned_in_smp) into a separate section in which all data
elements are cacheline aligned.  This cleanly separates the local-only data
from the remotely accessed data.
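
As a minimal usage sketch of the new interface (the variable and struct names
below are hypothetical, not taken from this patch): local-only data keeps
using DEFINE_PER_CPU, while remotely accessed data uses the new macro and
lands in the cacheline-aligned .data.percpu.shared_aligned section; the
per_cpu() and __get_cpu_var() accessors are unchanged.

    /* purely local per cpu data stays in .data.percpu */
    static DEFINE_PER_CPU(unsigned long, local_ticks);

    /*
     * Per cpu data that remote cpus also read or write goes into
     * .data.percpu.shared_aligned and is cacheline aligned, so it can
     * never share a cacheline with the local-only data above.
     */
    static DEFINE_PER_CPU_SHARED_ALIGNED(struct shared_stats, remote_stats);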

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: <linux-arch@vger.kernel.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 26 changed files with 84 additions and 53 deletions

arch/alpha/kernel/vmlinux.lds.S
... ... @@ -69,10 +69,7 @@
69 69 . = ALIGN(8);
70 70 SECURITY_INIT
71 71  
72   - . = ALIGN(8192);
73   - __per_cpu_start = .;
74   - .data.percpu : { *(.data.percpu) }
75   - __per_cpu_end = .;
  72 + PERCPU(8192)
76 73  
77 74 . = ALIGN(2*8192);
78 75 __init_end = .;
arch/arm/kernel/vmlinux.lds.S
... ... @@ -66,6 +66,7 @@
66 66 . = ALIGN(4096);
67 67 __per_cpu_start = .;
68 68 *(.data.percpu)
  69 + *(.data.percpu.shared_aligned)
69 70 __per_cpu_end = .;
70 71 #ifndef CONFIG_XIP_KERNEL
71 72 __init_begin = _stext;
arch/cris/arch-v32/vmlinux.lds.S
... ... @@ -91,10 +91,7 @@
91 91 }
92 92 SECURITY_INIT
93 93  
94   - . = ALIGN (8192);
95   - __per_cpu_start = .;
96   - .data.percpu : { *(.data.percpu) }
97   - __per_cpu_end = .;
  94 + PERCPU(8192)
98 95  
99 96 #ifdef CONFIG_BLK_DEV_INITRD
100 97 .init.ramfs : {
arch/frv/kernel/vmlinux.lds.S
... ... @@ -57,10 +57,7 @@
57 57 __alt_instructions_end = .;
58 58 .altinstr_replacement : { *(.altinstr_replacement) }
59 59  
60   - . = ALIGN(4096);
61   - __per_cpu_start = .;
62   - .data.percpu : { *(.data.percpu) }
63   - __per_cpu_end = .;
  60 + PERCPU(4096)
64 61  
65 62 #ifdef CONFIG_BLK_DEV_INITRD
66 63 . = ALIGN(4096);
arch/i386/kernel/vmlinux.lds.S
... ... @@ -181,6 +181,7 @@
181 181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
182 182 __per_cpu_start = .;
183 183 *(.data.percpu)
  184 + *(.data.percpu.shared_aligned)
184 185 __per_cpu_end = .;
185 186 }
186 187 . = ALIGN(4096);
arch/ia64/kernel/vmlinux.lds.S
... ... @@ -206,6 +206,7 @@
206 206 {
207 207 __per_cpu_start = .;
208 208 *(.data.percpu)
  209 + *(.data.percpu.shared_aligned)
209 210 __per_cpu_end = .;
210 211 }
211 212 . = __phys_per_cpu_start + PERCPU_PAGE_SIZE; /* ensure percpu data fits
arch/m32r/kernel/vmlinux.lds.S
... ... @@ -110,10 +110,7 @@
110 110 __initramfs_end = .;
111 111 #endif
112 112  
113   - . = ALIGN(4096);
114   - __per_cpu_start = .;
115   - .data.percpu : { *(.data.percpu) }
116   - __per_cpu_end = .;
  113 + PERCPU(4096)
117 114 . = ALIGN(4096);
118 115 __init_end = .;
119 116 /* freed after init ends here */
arch/mips/kernel/vmlinux.lds.S
... ... @@ -119,10 +119,7 @@
119 119 .init.ramfs : { *(.init.ramfs) }
120 120 __initramfs_end = .;
121 121 #endif
122   - . = ALIGN(_PAGE_SIZE);
123   - __per_cpu_start = .;
124   - .data.percpu : { *(.data.percpu) }
125   - __per_cpu_end = .;
  122 + PERCPU(_PAGE_SIZE)
126 123 . = ALIGN(_PAGE_SIZE);
127 124 __init_end = .;
128 125 /* freed after init ends here */
arch/parisc/kernel/vmlinux.lds.S
... ... @@ -181,10 +181,9 @@
181 181 .init.ramfs : { *(.init.ramfs) }
182 182 __initramfs_end = .;
183 183 #endif
184   - . = ALIGN(ASM_PAGE_SIZE);
185   - __per_cpu_start = .;
186   - .data.percpu : { *(.data.percpu) }
187   - __per_cpu_end = .;
  184 +
  185 + PERCPU(ASM_PAGE_SIZE)
  186 +
188 187 . = ALIGN(ASM_PAGE_SIZE);
189 188 __init_end = .;
190 189 /* freed after init ends here */
arch/powerpc/kernel/vmlinux.lds.S
... ... @@ -144,6 +144,7 @@
144 144 .data.percpu : {
145 145 __per_cpu_start = .;
146 146 *(.data.percpu)
  147 + *(.data.percpu.shared_aligned)
147 148 __per_cpu_end = .;
148 149 }
149 150  
arch/ppc/kernel/vmlinux.lds.S
... ... @@ -130,10 +130,7 @@
130 130 __ftr_fixup : { *(__ftr_fixup) }
131 131 __stop___ftr_fixup = .;
132 132  
133   - . = ALIGN(4096);
134   - __per_cpu_start = .;
135   - .data.percpu : { *(.data.percpu) }
136   - __per_cpu_end = .;
  133 + PERCPU(4096)
137 134  
138 135 #ifdef CONFIG_BLK_DEV_INITRD
139 136 . = ALIGN(4096);
arch/s390/kernel/vmlinux.lds.S
... ... @@ -107,10 +107,7 @@
107 107 . = ALIGN(2);
108 108 __initramfs_end = .;
109 109 #endif
110   - . = ALIGN(4096);
111   - __per_cpu_start = .;
112   - .data.percpu : { *(.data.percpu) }
113   - __per_cpu_end = .;
  110 + PERCPU(4096)
114 111 . = ALIGN(4096);
115 112 __init_end = .;
116 113 /* freed after init ends here */
arch/sh/kernel/vmlinux.lds.S
... ... @@ -60,10 +60,7 @@
60 60 . = ALIGN(PAGE_SIZE);
61 61 __nosave_end = .;
62 62  
63   - . = ALIGN(PAGE_SIZE);
64   - __per_cpu_start = .;
65   - .data.percpu : { *(.data.percpu) }
66   - __per_cpu_end = .;
  63 + PERCPU(PAGE_SIZE)
67 64 .data.cacheline_aligned : { *(.data.cacheline_aligned) }
68 65  
69 66 _edata = .; /* End of data section */
arch/sh64/kernel/vmlinux.lds.S
... ... @@ -87,7 +87,10 @@
87 87  
88 88 . = ALIGN(PAGE_SIZE);
89 89 __per_cpu_start = .;
90   - .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) }
  90 + .data.percpu : C_PHYS(.data.percpu) {
  91 + *(.data.percpu)
  92 + *(.data.percpu.shared_aligned)
  93 + }
91 94 __per_cpu_end = . ;
92 95 .data.cacheline_aligned : C_PHYS(.data.cacheline_aligned) { *(.data.cacheline_aligned) }
93 96  
arch/sparc/kernel/vmlinux.lds.S
... ... @@ -65,10 +65,7 @@
65 65 __initramfs_end = .;
66 66 #endif
67 67  
68   - . = ALIGN(4096);
69   - __per_cpu_start = .;
70   - .data.percpu : { *(.data.percpu) }
71   - __per_cpu_end = .;
  68 + PERCPU(4096)
72 69 . = ALIGN(4096);
73 70 __init_end = .;
74 71 . = ALIGN(32);
arch/sparc64/kernel/vmlinux.lds.S
... ... @@ -90,10 +90,8 @@
90 90 __initramfs_end = .;
91 91 #endif
92 92  
93   - . = ALIGN(PAGE_SIZE);
94   - __per_cpu_start = .;
95   - .data.percpu : { *(.data.percpu) }
96   - __per_cpu_end = .;
  93 + PERCPU(PAGE_SIZE)
  94 +
97 95 . = ALIGN(PAGE_SIZE);
98 96 __init_end = .;
99 97 __bss_start = .;
arch/x86_64/kernel/vmlinux.lds.S
... ... @@ -194,10 +194,8 @@
194 194 __initramfs_end = .;
195 195 #endif
196 196  
197   - . = ALIGN(4096);
198   - __per_cpu_start = .;
199   - .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
200   - __per_cpu_end = .;
  197 + PERCPU(4096)
  198 +
201 199 . = ALIGN(4096);
202 200 __init_end = .;
203 201  
arch/xtensa/kernel/vmlinux.lds.S
... ... @@ -190,10 +190,7 @@
190 190 __initramfs_end = .;
191 191 #endif
192 192  
193   - . = ALIGN(4096);
194   - __per_cpu_start = .;
195   - .data.percpu : { *(.data.percpu) }
196   - __per_cpu_end = .;
  193 + PERCPU(4096)
197 194  
198 195  
199 196 /* We need this dummy segment here */
include/asm-generic/percpu.h
... ... @@ -14,6 +14,11 @@
14 14 #define DEFINE_PER_CPU(type, name) \
15 15 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
16 16  
  17 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  18 + __attribute__((__section__(".data.percpu.shared_aligned"))) \
  19 + __typeof__(type) per_cpu__##name \
  20 + ____cacheline_aligned_in_smp
  21 +
17 22 /* var is in discarded region: offset to particular copy we want */
18 23 #define per_cpu(var, cpu) (*({ \
19 24 extern int simple_identifier_##var(void); \
... ... @@ -33,6 +38,9 @@
33 38  
34 39 #define DEFINE_PER_CPU(type, name) \
35 40 __typeof__(type) per_cpu__##name
  41 +
  42 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  43 + DEFINE_PER_CPU(type, name)
36 44  
37 45 #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var))
38 46 #define __get_cpu_var(var) per_cpu__##var
include/asm-generic/vmlinux.lds.h
... ... @@ -244,4 +244,13 @@
244 244 *(.initcall6s.init) \
245 245 *(.initcall7.init) \
246 246 *(.initcall7s.init)
  247 +
  248 +#define PERCPU(align) \
  249 + . = ALIGN(align); \
  250 + __per_cpu_start = .; \
  251 + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \
  252 + *(.data.percpu) \
  253 + *(.data.percpu.shared_aligned) \
  254 + } \
  255 + __per_cpu_end = .;
include/asm-i386/percpu.h
... ... @@ -54,6 +54,11 @@
54 54 #define DEFINE_PER_CPU(type, name) \
55 55 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
56 56  
  57 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  58 + __attribute__((__section__(".data.percpu.shared_aligned"))) \
  59 + __typeof__(type) per_cpu__##name \
  60 + ____cacheline_aligned_in_smp
  61 +
57 62 /* We can use this directly for local CPU (faster). */
58 63 DECLARE_PER_CPU(unsigned long, this_cpu_off);
59 64  
include/asm-ia64/percpu.h
... ... @@ -29,6 +29,16 @@
29 29 __attribute__((__section__(".data.percpu"))) \
30 30 __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name
31 31  
  32 +#ifdef CONFIG_SMP
  33 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  34 + __attribute__((__section__(".data.percpu.shared_aligned"))) \
  35 + __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name \
  36 + ____cacheline_aligned_in_smp
  37 +#else
  38 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  39 + DEFINE_PER_CPU(type, name)
  40 +#endif
  41 +
32 42 /*
33 43 * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an
34 44 * external routine, to avoid include-hell.
include/asm-powerpc/percpu.h
... ... @@ -20,6 +20,11 @@
20 20 #define DEFINE_PER_CPU(type, name) \
21 21 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
22 22  
  23 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  24 + __attribute__((__section__(".data.percpu.shared_aligned"))) \
  25 + __typeof__(type) per_cpu__##name \
  26 + ____cacheline_aligned_in_smp
  27 +
23 28 /* var is in discarded region: offset to particular copy we want */
24 29 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
25 30 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
... ... @@ -40,6 +45,8 @@
40 45  
41 46 #define DEFINE_PER_CPU(type, name) \
42 47 __typeof__(type) per_cpu__##name
  48 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  49 + DEFINE_PER_CPU(type, name)
43 50  
44 51 #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var))
45 52 #define __get_cpu_var(var) per_cpu__##var
include/asm-s390/percpu.h
... ... @@ -41,6 +41,11 @@
41 41 __attribute__((__section__(".data.percpu"))) \
42 42 __typeof__(type) per_cpu__##name
43 43  
  44 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  45 + __attribute__((__section__(".data.percpu.shared_aligned"))) \
  46 + __typeof__(type) per_cpu__##name \
  47 + ____cacheline_aligned_in_smp
  48 +
44 49 #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
45 50 #define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
46 51 #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu])
... ... @@ -59,6 +64,8 @@
59 64  
60 65 #define DEFINE_PER_CPU(type, name) \
61 66 __typeof__(type) per_cpu__##name
  67 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  68 + DEFINE_PER_CPU(type, name)
62 69  
63 70 #define __get_cpu_var(var) __reloc_hide(var,0)
64 71 #define __raw_get_cpu_var(var) __reloc_hide(var,0)
include/asm-sparc64/percpu.h
... ... @@ -18,6 +18,11 @@
18 18 #define DEFINE_PER_CPU(type, name) \
19 19 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
20 20  
  21 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  22 + __attribute__((__section__(".data.percpu.shared_aligned"))) \
  23 + __typeof__(type) per_cpu__##name \
  24 + ____cacheline_aligned_in_smp
  25 +
21 26 register unsigned long __local_per_cpu_offset asm("g5");
22 27  
23 28 /* var is in discarded region: offset to particular copy we want */
... ... @@ -38,6 +43,8 @@
38 43 #define real_setup_per_cpu_areas() do { } while (0)
39 44 #define DEFINE_PER_CPU(type, name) \
40 45 __typeof__(type) per_cpu__##name
  46 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  47 + DEFINE_PER_CPU(type, name)
41 48  
42 49 #define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var))
43 50 #define __get_cpu_var(var) per_cpu__##var
include/asm-x86_64/percpu.h
... ... @@ -20,6 +20,11 @@
20 20 #define DEFINE_PER_CPU(type, name) \
21 21 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
22 22  
  23 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  24 + __attribute__((__section__(".data.percpu.shared_aligned"))) \
  25 + __typeof__(type) per_cpu__##name \
  26 + ____cacheline_internodealigned_in_smp
  27 +
23 28 /* var is in discarded region: offset to particular copy we want */
24 29 #define per_cpu(var, cpu) (*({ \
25 30 extern int simple_identifier_##var(void); \
... ... @@ -46,6 +51,8 @@
46 51  
47 52 #define DEFINE_PER_CPU(type, name) \
48 53 __typeof__(type) per_cpu__##name
  54 +#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
  55 + DEFINE_PER_CPU(type, name)
49 56  
50 57 #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var))
51 58 #define __get_cpu_var(var) per_cpu__##var