Commit 1f522509c77a5dea8dc384b735314f03908a6415
Committed by
Linus Torvalds
1 parent
319774e25f
Exists in
master
and in
20 other branches
mem-hotplug: avoid multiple zones sharing same boot strapping boot_pageset
For each newly populated zone of a hot-added node, we need to update its pagesets with dynamically allocated per_cpu_pageset struct for all possible CPUs: 1) Detach zone->pageset from the shared boot_pageset at end of __build_all_zonelists(). 2) Use mutex to protect zone->pageset when it's still shared in onlined_pages() Otherwise, multiple zones of different nodes would share the same boot strapping boot_pageset for the same CPU, which would finally cause the kernel panic below: ------------[ cut here ]------------ kernel BUG at mm/page_alloc.c:1239! invalid opcode: 0000 [#1] SMP ... Call Trace: [<ffffffff811300c1>] __alloc_pages_nodemask+0x131/0x7b0 [<ffffffff81162e67>] alloc_pages_current+0x87/0xd0 [<ffffffff81128407>] __page_cache_alloc+0x67/0x70 [<ffffffff811325f0>] __do_page_cache_readahead+0x120/0x260 [<ffffffff81132751>] ra_submit+0x21/0x30 [<ffffffff811329c6>] ondemand_readahead+0x166/0x2c0 [<ffffffff81132ba0>] page_cache_async_readahead+0x80/0xa0 [<ffffffff8112a0e4>] generic_file_aio_read+0x364/0x670 [<ffffffff81266cfa>] nfs_file_read+0xca/0x130 [<ffffffff8117b20a>] do_sync_read+0xfa/0x140 [<ffffffff8117bf75>] vfs_read+0xb5/0x1a0 [<ffffffff8117c151>] sys_read+0x51/0x80 [<ffffffff8103c032>] system_call_fastpath+0x16/0x1b RIP [<ffffffff8112ff13>] get_page_from_freelist+0x883/0x900 RSP <ffff88000d1e78a8> ---[ end trace 4bda28328b9990db ] [akpm@linux-foundation.org: merge fix] Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com> Reviewed-by: Andi Kleen <andi.kleen@intel.com> Reviewed-by: Christoph Lameter <cl@linux-foundation.org> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 5 changed files with 29 additions and 12 deletions Side-by-side Diff
include/linux/mmzone.h
... | ... | @@ -652,7 +652,7 @@ |
652 | 652 | |
653 | 653 | void get_zone_counts(unsigned long *active, unsigned long *inactive, |
654 | 654 | unsigned long *free); |
655 | -void build_all_zonelists(void); | |
655 | +void build_all_zonelists(void *data); | |
656 | 656 | void wakeup_kswapd(struct zone *zone, int order); |
657 | 657 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
658 | 658 | int classzone_idx, int alloc_flags); |
init/main.c
... | ... | @@ -567,7 +567,7 @@ |
567 | 567 | setup_per_cpu_areas(); |
568 | 568 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ |
569 | 569 | |
570 | - build_all_zonelists(); | |
570 | + build_all_zonelists(NULL); | |
571 | 571 | page_alloc_init(); |
572 | 572 | |
573 | 573 | printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); |
kernel/cpu.c
mm/memory_hotplug.c
... | ... | @@ -389,6 +389,11 @@ |
389 | 389 | int nid; |
390 | 390 | int ret; |
391 | 391 | struct memory_notify arg; |
392 | + /* | |
393 | + * mutex to protect zone->pageset when it's still shared | |
394 | + * in onlined_pages() | |
395 | + */ | |
396 | + static DEFINE_MUTEX(zone_pageset_mutex); | |
392 | 397 | |
393 | 398 | arg.start_pfn = pfn; |
394 | 399 | arg.nr_pages = nr_pages; |
395 | 400 | |
... | ... | @@ -415,12 +420,14 @@ |
415 | 420 | * This means the page allocator ignores this zone. |
416 | 421 | * So, zonelist must be updated after online. |
417 | 422 | */ |
423 | + mutex_lock(&zone_pageset_mutex); | |
418 | 424 | if (!populated_zone(zone)) |
419 | 425 | need_zonelists_rebuild = 1; |
420 | 426 | |
421 | 427 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
422 | 428 | online_pages_range); |
423 | 429 | if (ret) { |
430 | + mutex_unlock(&zone_pageset_mutex); | |
424 | 431 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", |
425 | 432 | nr_pages, pfn); |
426 | 433 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
427 | 434 | |
... | ... | @@ -429,8 +436,12 @@ |
429 | 436 | |
430 | 437 | zone->present_pages += onlined_pages; |
431 | 438 | zone->zone_pgdat->node_present_pages += onlined_pages; |
439 | + if (need_zonelists_rebuild) | |
440 | + build_all_zonelists(zone); | |
441 | + else | |
442 | + zone_pcp_update(zone); | |
432 | 443 | |
433 | - zone_pcp_update(zone); | |
444 | + mutex_unlock(&zone_pageset_mutex); | |
434 | 445 | setup_per_zone_wmarks(); |
435 | 446 | calculate_zone_inactive_ratio(zone); |
436 | 447 | if (onlined_pages) { |
... | ... | @@ -438,10 +449,7 @@ |
438 | 449 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
439 | 450 | } |
440 | 451 | |
441 | - if (need_zonelists_rebuild) | |
442 | - build_all_zonelists(); | |
443 | - else | |
444 | - vm_total_pages = nr_free_pagecache_pages(); | |
452 | + vm_total_pages = nr_free_pagecache_pages(); | |
445 | 453 | |
446 | 454 | writeback_set_ratelimit(); |
447 | 455 |
mm/page_alloc.c
... | ... | @@ -2572,7 +2572,7 @@ |
2572 | 2572 | NUMA_ZONELIST_ORDER_LEN); |
2573 | 2573 | user_zonelist_order = oldval; |
2574 | 2574 | } else if (oldval != user_zonelist_order) |
2575 | - build_all_zonelists(); | |
2575 | + build_all_zonelists(NULL); | |
2576 | 2576 | } |
2577 | 2577 | out: |
2578 | 2578 | mutex_unlock(&zl_order_mutex); |
2579 | 2579 | |
... | ... | @@ -2922,9 +2922,10 @@ |
2922 | 2922 | */ |
2923 | 2923 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); |
2924 | 2924 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); |
2925 | +static void setup_zone_pageset(struct zone *zone); | |
2925 | 2926 | |
2926 | 2927 | /* return values int ....just for stop_machine() */ |
2927 | -static int __build_all_zonelists(void *dummy) | |
2928 | +static __init_refok int __build_all_zonelists(void *data) | |
2928 | 2929 | { |
2929 | 2930 | int nid; |
2930 | 2931 | int cpu; |
... | ... | @@ -2939,6 +2940,14 @@ |
2939 | 2940 | build_zonelist_cache(pgdat); |
2940 | 2941 | } |
2941 | 2942 | |
2943 | +#ifdef CONFIG_MEMORY_HOTPLUG | |
2944 | + /* Setup real pagesets for the new zone */ | |
2945 | + if (data) { | |
2946 | + struct zone *zone = data; | |
2947 | + setup_zone_pageset(zone); | |
2948 | + } | |
2949 | +#endif | |
2950 | + | |
2942 | 2951 | /* |
2943 | 2952 | * Initialize the boot_pagesets that are going to be used |
2944 | 2953 | * for bootstrapping processors. The real pagesets for |
... | ... | @@ -2958,7 +2967,7 @@ |
2958 | 2967 | return 0; |
2959 | 2968 | } |
2960 | 2969 | |
2961 | -void build_all_zonelists(void) | |
2970 | +void build_all_zonelists(void *data) | |
2962 | 2971 | { |
2963 | 2972 | set_zonelist_order(); |
2964 | 2973 | |
... | ... | @@ -2969,7 +2978,7 @@ |
2969 | 2978 | } else { |
2970 | 2979 | /* we have to stop all cpus to guarantee there is no user |
2971 | 2980 | of zonelist */ |
2972 | - stop_machine(__build_all_zonelists, NULL, NULL); | |
2981 | + stop_machine(__build_all_zonelists, data, NULL); | |
2973 | 2982 | /* cpuset refresh routine should be here */ |
2974 | 2983 | } |
2975 | 2984 | vm_total_pages = nr_free_pagecache_pages(); |