Commit fa8a7094ba1679b4b9b443e0ac9f5e046c79ee8d

Authored by Tejun Heo
1 parent e59a1bb2fd

x86: implement percpu_alloc kernel parameter

According to Andi, it isn't clear whether the lpage allocator is worth
the trouble, as there are many processors where the PMD TLB is far
scarcer than the PTE TLB.  The advantage or disadvantage probably
depends on the actual size of the percpu area and the specific
processor.  As performance degradation due to TLB pressure tends to be
highly workload-specific and subtle, it is difficult to decide which
way to go without more data.

This patch implements the percpu_alloc kernel parameter, which allows
selecting which first chunk allocator to use, to ease debugging and
testing.

While at it, make sure all the failure paths report why something
failed, to help determine why a certain allocator isn't working.  Also,
kill the "Great future plan" comment, which had already been realized
quite some time ago.

[ Impact: allow explicit percpu first chunk allocator selection ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jan Beulich <JBeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>

Showing 3 changed files with 65 additions and 23 deletions

Documentation/kernel-parameters.txt
... ... @@ -1882,6 +1882,12 @@
1882 1882 Format: { 0 | 1 }
1883 1883 See arch/parisc/kernel/pdc_chassis.c
1884 1884  
  1885 + percpu_alloc= [X86] Select which percpu first chunk allocator to use.
  1886 + Allowed values are one of "lpage", "embed" and "4k".
  1887 + See comments in arch/x86/kernel/setup_percpu.c for
  1888 + details on each allocator. This parameter is primarily
  1889 + for debugging and performance comparison.
  1890 +
1885 1891 pf. [PARIDE]
1886 1892 See Documentation/blockdev/paride.txt.
1887 1893  
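
As a usage illustration (the boot entry below is hypothetical; only the
percpu_alloc= value itself comes from this patch), the parameter goes on
the kernel command line like any other early parameter:

	# hypothetical boot loader entry -- force the embed allocator
	linux /boot/vmlinuz root=/dev/sda1 percpu_alloc=embed

Passing an unrecognized value is safe: the selection code in
setup_per_cpu_areas() warns about it and falls back to the 4k allocator.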
arch/x86/kernel/setup_percpu.c
... ... @@ -156,20 +156,23 @@
156 156 return virt_to_page(pcpul_map[cpu].ptr + off);
157 157 }
158 158  
159   -static ssize_t __init setup_pcpu_lpage(size_t static_size)
  159 +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
160 160 {
161 161 size_t map_size, dyn_size;
162 162 unsigned int cpu;
163 163 int i, j;
164 164 ssize_t ret;
165 165  
166   - /*
167   - * If large page isn't supported, there's no benefit in doing
168   - * this. Also, on non-NUMA, embedding is better.
169   - */
170   - if (!cpu_has_pse || !pcpu_need_numa())
  166 + /* on non-NUMA, embedding is better */
  167 + if (!chosen && !pcpu_need_numa())
171 168 return -EINVAL;
172 169  
  170 + /* need PSE */
  171 + if (!cpu_has_pse) {
  172 + pr_warning("PERCPU: lpage allocator requires PSE\n");
  173 + return -EINVAL;
  174 + }
  175 +
173 176 /*
174 177 * Currently supports only single page. Supporting multiple
175 178 * pages won't be too difficult if it ever becomes necessary.
176 179  
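
The chosen flag introduced above encodes a simple rule: an explicit
percpu_alloc= request skips the heuristic checks but never the hard
hardware requirements, and hard failures now say why.  A minimal sketch
of the pattern (the foo_* helper names are made up for illustration):

	static ssize_t __init setup_pcpu_foo(size_t static_size, bool chosen)
	{
		/* heuristic -- an explicit request overrides it */
		if (!chosen && !foo_looks_beneficial())
			return -EINVAL;

		/* hard requirement -- enforced even when explicitly
		 * chosen, but with a warning so the failure is visible */
		if (!foo_hw_supported()) {
			pr_warning("PERCPU: foo allocator unsupported\n");
			return -EINVAL;
		}

		return do_foo_setup(static_size);
	}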
... ... @@ -191,8 +194,11 @@
191 194 pcpul_map[cpu].cpu = cpu;
192 195 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
193 196 PMD_SIZE);
194   - if (!pcpul_map[cpu].ptr)
  197 + if (!pcpul_map[cpu].ptr) {
  198 + pr_warning("PERCPU: failed to allocate large page "
  199 + "for cpu%u\n", cpu);
195 200 goto enomem;
  201 + }
196 202  
197 203 /*
198 204 * Only use pcpul_size bytes and give back the rest.
... ... @@ -297,7 +303,7 @@
297 303 return NULL;
298 304 }
299 305 #else
300   -static ssize_t __init setup_pcpu_lpage(size_t static_size)
  306 +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
301 307 {
302 308 return -EINVAL;
303 309 }
... ... @@ -311,7 +317,7 @@
311 317 * mapping so that it can use PMD mapping without additional TLB
312 318 * pressure.
313 319 */
314   -static ssize_t __init setup_pcpu_embed(size_t static_size)
  320 +static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
315 321 {
316 322 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
317 323  
... ... @@ -320,7 +326,7 @@
320 326 * this. Also, embedding allocation doesn't play well with
321 327 * NUMA.
322 328 */
323   - if (!cpu_has_pse || pcpu_need_numa())
  329 + if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
324 330 return -EINVAL;
325 331  
326 332 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
327 333  
... ... @@ -370,8 +376,11 @@
370 376 void *ptr;
371 377  
372 378 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
373   - if (!ptr)
  379 + if (!ptr) {
  380 + pr_warning("PERCPU: failed to allocate "
  381 + "4k page for cpu%u\n", cpu);
374 382 goto enomem;
  383 + }
375 384  
376 385 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
377 386 pcpu4k_pages[j++] = virt_to_page(ptr);
... ... @@ -395,6 +404,16 @@
395 404 return ret;
396 405 }
397 406  
  407 +/* for explicit first chunk allocator selection */
  408 +static char pcpu_chosen_alloc[16] __initdata;
  409 +
  410 +static int __init percpu_alloc_setup(char *str)
  411 +{
  412 + strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
  413 + return 0;
  414 +}
  415 +early_param("percpu_alloc", percpu_alloc_setup);
  416 +
398 417 static inline void setup_percpu_segment(int cpu)
399 418 {
400 419 #ifdef CONFIG_X86_32
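
Two properties of the parser added above are worth spelling out.
early_param() makes the handler run from parse_early_param() during
setup_arch(), well before setup_per_cpu_areas() looks at the buffer.
And because pcpu_chosen_alloc is __initdata (zero-initialized) while
strncpy() copies at most sizeof() - 1 bytes, the buffer always stays
NUL-terminated; an overlong value is merely truncated and later fails
the strcmp() tests.  Illustrative snippet with an assumed value:

	static char buf[16];	/* zero-filled, like __initdata above */
	strncpy(buf, "some-overlong-allocator-name", sizeof(buf) - 1);
	/* buf now holds "some-overlong-a", buf[15] == '\0', and the
	 * strcmp()s would report it as an unknown allocator */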
... ... @@ -408,11 +427,6 @@
408 427 #endif
409 428 }
410 429  
411   -/*
412   - * Great future plan:
413   - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
414   - * Always point %gs to its beginning
415   - */
416 430 void __init setup_per_cpu_areas(void)
417 431 {
418 432 size_t static_size = __per_cpu_end - __per_cpu_start;
... ... @@ -429,9 +443,26 @@
429 443 * of large page mappings. Please read comments on top of
430 444 * each allocator for details.
431 445 */
432   - ret = setup_pcpu_lpage(static_size);
433   - if (ret < 0)
434   - ret = setup_pcpu_embed(static_size);
  446 + ret = -EINVAL;
  447 + if (strlen(pcpu_chosen_alloc)) {
  448 + if (strcmp(pcpu_chosen_alloc, "4k")) {
  449 + if (!strcmp(pcpu_chosen_alloc, "lpage"))
  450 + ret = setup_pcpu_lpage(static_size, true);
  451 + else if (!strcmp(pcpu_chosen_alloc, "embed"))
  452 + ret = setup_pcpu_embed(static_size, true);
  453 + else
  454 + pr_warning("PERCPU: unknown allocator %s "
  455 + "specified\n", pcpu_chosen_alloc);
  456 + if (ret < 0)
  457 + pr_warning("PERCPU: %s allocator failed (%zd), "
  458 + "falling back to 4k\n",
  459 + pcpu_chosen_alloc, ret);
  460 + }
  461 + } else {
  462 + ret = setup_pcpu_lpage(static_size, false);
  463 + if (ret < 0)
  464 + ret = setup_pcpu_embed(static_size, false);
  465 + }
435 466 if (ret < 0)
436 467 ret = setup_pcpu_4k(static_size);
437 468 if (ret < 0)
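
Putting the new selection logic together, the effective decision order
is (summary sketch, not literal kernel code):

	/*
	 * percpu_alloc=lpage  ->  try lpage, else fall back to 4k
	 * percpu_alloc=embed  ->  try embed, else fall back to 4k
	 * percpu_alloc=4k     ->  use 4k directly
	 * unknown value       ->  warn, then use 4k
	 * no parameter        ->  try lpage, then embed, then 4k
	 */

Note that an explicitly chosen allocator which fails falls straight back
to 4k rather than to the next allocator in the default chain, which
keeps test results unambiguous.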
mm/percpu.c
... ... @@ -1233,6 +1233,7 @@
1233 1233 ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1234 1234 ssize_t dyn_size, ssize_t unit_size)
1235 1235 {
  1236 + size_t chunk_size;
1236 1237 unsigned int cpu;
1237 1238  
1238 1239 /* determine parameters and allocate */
1239 1240  
... ... @@ -1247,11 +1248,15 @@
1247 1248 } else
1248 1249 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
1249 1250  
1250   - pcpue_ptr = __alloc_bootmem_nopanic(
1251   - num_possible_cpus() * pcpue_unit_size,
1252   - PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1253   - if (!pcpue_ptr)
  1251 + chunk_size = pcpue_unit_size * num_possible_cpus();
  1252 +
  1253 + pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
  1254 + __pa(MAX_DMA_ADDRESS));
  1255 + if (!pcpue_ptr) {
  1256 + pr_warning("PERCPU: failed to allocate %zu bytes for "
  1257 + "embedding\n", chunk_size);
1254 1258 return -ENOMEM;
  1259 + }
1255 1260  
1256 1261 /* return the leftover and copy */
1257 1262 for_each_possible_cpu(cpu) {