Commit fa8a7094ba1679b4b9b443e0ac9f5e046c79ee8d
1 parent
e59a1bb2fd
Exists in
master
and in
4 other branches
x86: implement percpu_alloc kernel parameter
According to Andi, it isn't clear whether the lpage allocator is worth the trouble, as there are many processors where PMD TLB entries are far scarcer than PTE TLB entries. The advantage or disadvantage probably depends on the actual size of the percpu area and the specific processor. As performance degradation due to TLB pressure tends to be highly workload-specific and subtle, it is difficult to decide which way to go without more data. This patch implements the percpu_alloc kernel parameter, which allows selecting which first chunk allocator to use, to ease debugging and testing. While at it, make sure all the failure paths report why something failed, to help determine why a certain allocator isn't working. Also, kill the "Great future plan" comment, which had already been realized quite some time ago. [ Impact: allow explicit percpu first chunk allocator selection ] Signed-off-by: Tejun Heo <tj@kernel.org> Reported-by: Jan Beulich <JBeulich@novell.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@elte.hu>
Showing 3 changed files with 65 additions and 23 deletions (side-by-side diff)
Documentation/kernel-parameters.txt
... | ... | @@ -1882,6 +1882,12 @@ |
1882 | 1882 | Format: { 0 | 1 } |
1883 | 1883 | See arch/parisc/kernel/pdc_chassis.c |
1884 | 1884 | |
1885 | + percpu_alloc= [X86] Select which percpu first chunk allocator to use. | |
1886 | + Allowed values are one of "lpage", "embed" and "4k". | |
1887 | + See comments in arch/x86/kernel/setup_percpu.c for | |
1888 | + details on each allocator. This parameter is primarily | |
1889 | + for debugging and performance comparison. | |
1890 | + | |
1885 | 1891 | pf. [PARIDE] |
1886 | 1892 | See Documentation/blockdev/paride.txt. |
1887 | 1893 |
arch/x86/kernel/setup_percpu.c
... | ... | @@ -156,20 +156,23 @@ |
156 | 156 | return virt_to_page(pcpul_map[cpu].ptr + off); |
157 | 157 | } |
158 | 158 | |
159 | -static ssize_t __init setup_pcpu_lpage(size_t static_size) | |
159 | +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) | |
160 | 160 | { |
161 | 161 | size_t map_size, dyn_size; |
162 | 162 | unsigned int cpu; |
163 | 163 | int i, j; |
164 | 164 | ssize_t ret; |
165 | 165 | |
166 | - /* | |
167 | - * If large page isn't supported, there's no benefit in doing | |
168 | - * this. Also, on non-NUMA, embedding is better. | |
169 | - */ | |
170 | - if (!cpu_has_pse || !pcpu_need_numa()) | |
166 | + /* on non-NUMA, embedding is better */ | |
167 | + if (!chosen && !pcpu_need_numa()) | |
171 | 168 | return -EINVAL; |
172 | 169 | |
170 | + /* need PSE */ | |
171 | + if (!cpu_has_pse) { | |
172 | + pr_warning("PERCPU: lpage allocator requires PSE\n"); | |
173 | + return -EINVAL; | |
174 | + } | |
175 | + | |
173 | 176 | /* |
174 | 177 | * Currently supports only single page. Supporting multiple |
175 | 178 | * pages won't be too difficult if it ever becomes necessary. |
176 | 179 | |
... | ... | @@ -191,8 +194,11 @@ |
191 | 194 | pcpul_map[cpu].cpu = cpu; |
192 | 195 | pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, |
193 | 196 | PMD_SIZE); |
194 | - if (!pcpul_map[cpu].ptr) | |
197 | + if (!pcpul_map[cpu].ptr) { | |
198 | + pr_warning("PERCPU: failed to allocate large page " | |
199 | + "for cpu%u\n", cpu); | |
195 | 200 | goto enomem; |
201 | + } | |
196 | 202 | |
197 | 203 | /* |
198 | 204 | * Only use pcpul_size bytes and give back the rest. |
... | ... | @@ -297,7 +303,7 @@ |
297 | 303 | return NULL; |
298 | 304 | } |
299 | 305 | #else |
300 | -static ssize_t __init setup_pcpu_lpage(size_t static_size) | |
306 | +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) | |
301 | 307 | { |
302 | 308 | return -EINVAL; |
303 | 309 | } |
... | ... | @@ -311,7 +317,7 @@ |
311 | 317 | * mapping so that it can use PMD mapping without additional TLB |
312 | 318 | * pressure. |
313 | 319 | */ |
314 | -static ssize_t __init setup_pcpu_embed(size_t static_size) | |
320 | +static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) | |
315 | 321 | { |
316 | 322 | size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; |
317 | 323 | |
... | ... | @@ -320,7 +326,7 @@ |
320 | 326 | * this. Also, embedding allocation doesn't play well with |
321 | 327 | * NUMA. |
322 | 328 | */ |
323 | - if (!cpu_has_pse || pcpu_need_numa()) | |
329 | + if (!chosen && (!cpu_has_pse || pcpu_need_numa())) | |
324 | 330 | return -EINVAL; |
325 | 331 | |
326 | 332 | return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, |
327 | 333 | |
... | ... | @@ -370,8 +376,11 @@ |
370 | 376 | void *ptr; |
371 | 377 | |
372 | 378 | ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); |
373 | - if (!ptr) | |
379 | + if (!ptr) { | |
380 | + pr_warning("PERCPU: failed to allocate " | |
381 | + "4k page for cpu%u\n", cpu); | |
374 | 382 | goto enomem; |
383 | + } | |
375 | 384 | |
376 | 385 | memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); |
377 | 386 | pcpu4k_pages[j++] = virt_to_page(ptr); |
... | ... | @@ -395,6 +404,16 @@ |
395 | 404 | return ret; |
396 | 405 | } |
397 | 406 | |
407 | +/* for explicit first chunk allocator selection */ | |
408 | +static char pcpu_chosen_alloc[16] __initdata; | |
409 | + | |
410 | +static int __init percpu_alloc_setup(char *str) | |
411 | +{ | |
412 | + strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); | |
413 | + return 0; | |
414 | +} | |
415 | +early_param("percpu_alloc", percpu_alloc_setup); | |
416 | + | |
398 | 417 | static inline void setup_percpu_segment(int cpu) |
399 | 418 | { |
400 | 419 | #ifdef CONFIG_X86_32 |
... | ... | @@ -408,11 +427,6 @@ |
408 | 427 | #endif |
409 | 428 | } |
410 | 429 | |
411 | -/* | |
412 | - * Great future plan: | |
413 | - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. | |
414 | - * Always point %gs to its beginning | |
415 | - */ | |
416 | 430 | void __init setup_per_cpu_areas(void) |
417 | 431 | { |
418 | 432 | size_t static_size = __per_cpu_end - __per_cpu_start; |
... | ... | @@ -429,9 +443,26 @@ |
429 | 443 | * of large page mappings. Please read comments on top of |
430 | 444 | * each allocator for details. |
431 | 445 | */ |
432 | - ret = setup_pcpu_lpage(static_size); | |
433 | - if (ret < 0) | |
434 | - ret = setup_pcpu_embed(static_size); | |
446 | + ret = -EINVAL; | |
447 | + if (strlen(pcpu_chosen_alloc)) { | |
448 | + if (strcmp(pcpu_chosen_alloc, "4k")) { | |
449 | + if (!strcmp(pcpu_chosen_alloc, "lpage")) | |
450 | + ret = setup_pcpu_lpage(static_size, true); | |
451 | + else if (!strcmp(pcpu_chosen_alloc, "embed")) | |
452 | + ret = setup_pcpu_embed(static_size, true); | |
453 | + else | |
454 | + pr_warning("PERCPU: unknown allocator %s " | |
455 | + "specified\n", pcpu_chosen_alloc); | |
456 | + if (ret < 0) | |
457 | + pr_warning("PERCPU: %s allocator failed (%zd), " | |
458 | + "falling back to 4k\n", | |
459 | + pcpu_chosen_alloc, ret); | |
460 | + } | |
461 | + } else { | |
462 | + ret = setup_pcpu_lpage(static_size, false); | |
463 | + if (ret < 0) | |
464 | + ret = setup_pcpu_embed(static_size, false); | |
465 | + } | |
435 | 466 | if (ret < 0) |
436 | 467 | ret = setup_pcpu_4k(static_size); |
437 | 468 | if (ret < 0) |
mm/percpu.c
... | ... | @@ -1233,6 +1233,7 @@ |
1233 | 1233 | ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, |
1234 | 1234 | ssize_t dyn_size, ssize_t unit_size) |
1235 | 1235 | { |
1236 | + size_t chunk_size; | |
1236 | 1237 | unsigned int cpu; |
1237 | 1238 | |
1238 | 1239 | /* determine parameters and allocate */ |
1239 | 1240 | |
... | ... | @@ -1247,11 +1248,15 @@ |
1247 | 1248 | } else |
1248 | 1249 | pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); |
1249 | 1250 | |
1250 | - pcpue_ptr = __alloc_bootmem_nopanic( | |
1251 | - num_possible_cpus() * pcpue_unit_size, | |
1252 | - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | |
1253 | - if (!pcpue_ptr) | |
1251 | + chunk_size = pcpue_unit_size * num_possible_cpus(); | |
1252 | + | |
1253 | + pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, | |
1254 | + __pa(MAX_DMA_ADDRESS)); | |
1255 | + if (!pcpue_ptr) { | |
1256 | + pr_warning("PERCPU: failed to allocate %zu bytes for " | |
1257 | + "embedding\n", chunk_size); | |
1254 | 1258 | return -ENOMEM; |
1259 | + } | |
1255 | 1260 | |
1256 | 1261 | /* return the leftover and copy */ |
1257 | 1262 | for_each_possible_cpu(cpu) { |