Commit 1f522509c77a5dea8dc384b735314f03908a6415

Authored by Haicheng Li
Committed by Linus Torvalds
1 parent 319774e25f

mem-hotplug: avoid multiple zones sharing same boot strapping boot_pageset

For each newly populated zone of a hotadded node, its pagesets need to be
updated with dynamically allocated per_cpu_pageset structs for all possible
CPUs:

    1) Detach zone->pageset from the shared boot_pageset
       at the end of __build_all_zonelists().

    2) Use a mutex to protect zone->pageset when it is still
       shared in onlined_pages().

Otherwise, multiple zones of different nodes would share the same bootstrapping
boot_pageset for the same CPU, which will eventually cause the kernel panic
below:

  ------------[ cut here ]------------
  kernel BUG at mm/page_alloc.c:1239!
  invalid opcode: 0000 [#1] SMP
  ...
  Call Trace:
   [<ffffffff811300c1>] __alloc_pages_nodemask+0x131/0x7b0
   [<ffffffff81162e67>] alloc_pages_current+0x87/0xd0
   [<ffffffff81128407>] __page_cache_alloc+0x67/0x70
   [<ffffffff811325f0>] __do_page_cache_readahead+0x120/0x260
   [<ffffffff81132751>] ra_submit+0x21/0x30
   [<ffffffff811329c6>] ondemand_readahead+0x166/0x2c0
   [<ffffffff81132ba0>] page_cache_async_readahead+0x80/0xa0
   [<ffffffff8112a0e4>] generic_file_aio_read+0x364/0x670
   [<ffffffff81266cfa>] nfs_file_read+0xca/0x130
   [<ffffffff8117b20a>] do_sync_read+0xfa/0x140
   [<ffffffff8117bf75>] vfs_read+0xb5/0x1a0
   [<ffffffff8117c151>] sys_read+0x51/0x80
   [<ffffffff8103c032>] system_call_fastpath+0x16/0x1b
  RIP  [<ffffffff8112ff13>] get_page_from_freelist+0x883/0x900
   RSP <ffff88000d1e78a8>
  ---[ end trace 4bda28328b9990db ]---

[akpm@linux-foundation.org: merge fix]
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Reviewed-by: Andi Kleen <andi.kleen@intel.com>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
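
For context, a condensed sketch of how the sharing arises (zone_pcp_init() is
not part of this diff): every zone's pageset pointer is bootstrapped to the
same per-CPU boot_pageset, so until the real pagesets are allocated, pages
freed to one hot-added zone's per-CPU lists can be handed out for another
zone, tripping the BUG above in get_page_from_freelist():

  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);

  static __meminit void zone_pcp_init(struct zone *zone)
  {
          /* Every zone starts out on the same per-CPU bootstrap pageset. */
          zone->pageset = &boot_pageset;
  }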

Showing 5 changed files with 29 additions and 12 deletions

include/linux/mmzone.h
... ... @@ -652,7 +652,7 @@
652 652  
653 653 void get_zone_counts(unsigned long *active, unsigned long *inactive,
654 654 unsigned long *free);
655   -void build_all_zonelists(void);
  655 +void build_all_zonelists(void *data);
656 656 void wakeup_kswapd(struct zone *zone, int order);
657 657 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
658 658 int classzone_idx, int alloc_flags);
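
The new void *data argument threads an optional struct zone * down to
__build_all_zonelists(). Condensed from the callers in this diff, the two
calling conventions are:

  build_all_zonelists(NULL);   /* boot or zonelist-order change: rebuild only */
  build_all_zonelists(zone);   /* memory hotplug: also set up the zone's pagesets */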

init/main.c
... ... @@ -567,7 +567,7 @@
567 567 setup_per_cpu_areas();
568 568 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
569 569  
570   - build_all_zonelists();
  570 + build_all_zonelists(NULL);
571 571 page_alloc_init();
572 572  
573 573 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);

kernel/cpu.c
... ... @@ -358,7 +358,7 @@
358 358 }
359 359  
360 360 if (pgdat->node_zonelists->_zonerefs->zone == NULL)
361   - build_all_zonelists();
  361 + build_all_zonelists(NULL);
362 362 #endif
363 363  
364 364 cpu_maps_update_begin();

mm/memory_hotplug.c
... ... @@ -389,6 +389,11 @@
389 389 int nid;
390 390 int ret;
391 391 struct memory_notify arg;
  392 + /*
  393 + * mutex to protect zone->pageset when it's still shared
  394 + * in onlined_pages()
  395 + */
  396 + static DEFINE_MUTEX(zone_pageset_mutex);
392 397  
393 398 arg.start_pfn = pfn;
394 399 arg.nr_pages = nr_pages;
395 400  
... ... @@ -415,12 +420,14 @@
415 420 * This means the page allocator ignores this zone.
416 421 * So, zonelist must be updated after online.
417 422 */
  423 + mutex_lock(&zone_pageset_mutex);
418 424 if (!populated_zone(zone))
419 425 need_zonelists_rebuild = 1;
420 426  
421 427 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
422 428 online_pages_range);
423 429 if (ret) {
  430 + mutex_unlock(&zone_pageset_mutex);
424 431 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
425 432 nr_pages, pfn);
426 433 memory_notify(MEM_CANCEL_ONLINE, &arg);
427 434  
... ... @@ -429,8 +436,12 @@
429 436  
430 437 zone->present_pages += onlined_pages;
431 438 zone->zone_pgdat->node_present_pages += onlined_pages;
  439 + if (need_zonelists_rebuild)
  440 + build_all_zonelists(zone);
  441 + else
  442 + zone_pcp_update(zone);
432 443  
433   - zone_pcp_update(zone);
  444 + mutex_unlock(&zone_pageset_mutex);
434 445 setup_per_zone_wmarks();
435 446 calculate_zone_inactive_ratio(zone);
436 447 if (onlined_pages) {
... ... @@ -438,10 +449,7 @@
438 449 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
439 450 }
440 451  
441   - if (need_zonelists_rebuild)
442   - build_all_zonelists();
443   - else
444   - vm_total_pages = nr_free_pagecache_pages();
  452 + vm_total_pages = nr_free_pagecache_pages();
445 453  
446 454 writeback_set_ratelimit();
447 455  
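
Taken together, the memory_hotplug.c hunks give online_pages() this shape
(condensed from the hunks above; notifier calls and most error handling
elided):

  mutex_lock(&zone_pageset_mutex);
  if (!populated_zone(zone))
          need_zonelists_rebuild = 1;

  ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
                              online_pages_range);
  if (ret)
          mutex_unlock(&zone_pageset_mutex);   /* cancel the online and bail out */

  zone->present_pages += onlined_pages;
  zone->zone_pgdat->node_present_pages += onlined_pages;
  if (need_zonelists_rebuild)
          build_all_zonelists(zone);   /* allocates the real pagesets too */
  else
          zone_pcp_update(zone);
  mutex_unlock(&zone_pageset_mutex);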

mm/page_alloc.c
... ... @@ -2572,7 +2572,7 @@
2572 2572 NUMA_ZONELIST_ORDER_LEN);
2573 2573 user_zonelist_order = oldval;
2574 2574 } else if (oldval != user_zonelist_order)
2575   - build_all_zonelists();
  2575 + build_all_zonelists(NULL);
2576 2576 }
2577 2577 out:
2578 2578 mutex_unlock(&zl_order_mutex);
2579 2579  
... ... @@ -2922,9 +2922,10 @@
2922 2922 */
2923 2923 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2924 2924 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
  2925 +static void setup_zone_pageset(struct zone *zone);
2925 2926  
2926 2927 /* return values int ....just for stop_machine() */
2927   -static int __build_all_zonelists(void *dummy)
  2928 +static __init_refok int __build_all_zonelists(void *data)
2928 2929 {
2929 2930 int nid;
2930 2931 int cpu;
... ... @@ -2939,6 +2940,14 @@
2939 2940 build_zonelist_cache(pgdat);
2940 2941 }
2941 2942  
  2943 +#ifdef CONFIG_MEMORY_HOTPLUG
  2944 + /* Setup real pagesets for the new zone */
  2945 + if (data) {
  2946 + struct zone *zone = data;
  2947 + setup_zone_pageset(zone);
  2948 + }
  2949 +#endif
  2950 +
2942 2951 /*
2943 2952 * Initialize the boot_pagesets that are going to be used
2944 2953 * for bootstrapping processors. The real pagesets for
... ... @@ -2958,7 +2967,7 @@
2958 2967 return 0;
2959 2968 }
2960 2969  
2961   -void build_all_zonelists(void)
  2970 +void build_all_zonelists(void *data)
2962 2971 {
2963 2972 set_zonelist_order();
2964 2973  
... ... @@ -2969,7 +2978,7 @@
2969 2978 } else {
2970 2979 /* we have to stop all cpus to guarantee there is no user
2971 2980 of zonelist */
2972   - stop_machine(__build_all_zonelists, NULL, NULL);
  2981 + stop_machine(__build_all_zonelists, data, NULL);
2973 2982 /* cpuset refresh routine should be here */
2974 2983 }
2975 2984 vm_total_pages = nr_free_pagecache_pages();
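
setup_zone_pageset() itself is outside this diff; in this kernel it amounts
to roughly the following sketch (the percpu_pagelist_fraction handling is
elided). It replaces the shared boot_pageset with dynamically allocated
per-CPU pagesets, which is what step 1) of the changelog refers to:

  static void setup_zone_pageset(struct zone *zone)
  {
          int cpu;

          /* Detach from the shared boot_pageset: allocate real per-CPU storage. */
          zone->pageset = alloc_percpu(struct per_cpu_pageset);
          for_each_possible_cpu(cpu)
                  setup_pageset(per_cpu_ptr(zone->pageset, cpu),
                                zone_batchsize(zone));
  }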