Commit 4eaf3f64397c3db3c5785eee508270d62a9fabd9
Committed by
Linus Torvalds
1 parent
1f522509c7
Exists in
master
and in
4 other branches
mem-hotplug: fix potential race while building zonelist for new populated zone
Add global mutex zonelists_mutex to fix the possible race: CPU0 CPU1 CPU2 (1) zone->present_pages += online_pages; (2) build_all_zonelists(); (3) alloc_page(); (4) free_page(); (5) build_all_zonelists(); (6) __build_all_zonelists(); (7) zone->pageset = alloc_percpu(); In step (3,4), zone->pageset still points to boot_pageset, so bad things may happen if 2+ nodes are in this state. Even if only 1 node is accessing the boot_pageset, (3) may still consume too much memory to fail the memory allocations in step (7). Besides, atomic operation ensures alloc_percpu() in step (7) will never fail since there is a new fresh memory block added in step(6). [haicheng.li@linux.intel.com: hold zonelists_mutex when build_all_zonelists] Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com> Reviewed-by: Andi Kleen <andi.kleen@intel.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 4 changed files with 22 additions and 10 deletions Side-by-side Diff
include/linux/mmzone.h
kernel/cpu.c
... | ... | @@ -357,8 +357,11 @@ |
357 | 357 | return -ENOMEM; |
358 | 358 | } |
359 | 359 | |
360 | - if (pgdat->node_zonelists->_zonerefs->zone == NULL) | |
360 | + if (pgdat->node_zonelists->_zonerefs->zone == NULL) { | |
361 | + mutex_lock(&zonelists_mutex); | |
361 | 362 | build_all_zonelists(NULL); |
363 | + mutex_unlock(&zonelists_mutex); | |
364 | + } | |
362 | 365 | #endif |
363 | 366 | |
364 | 367 | cpu_maps_update_begin(); |
mm/memory_hotplug.c
... | ... | @@ -389,11 +389,6 @@ |
389 | 389 | int nid; |
390 | 390 | int ret; |
391 | 391 | struct memory_notify arg; |
392 | - /* | |
393 | - * mutex to protect zone->pageset when it's still shared | |
394 | - * in onlined_pages() | |
395 | - */ | |
396 | - static DEFINE_MUTEX(zone_pageset_mutex); | |
397 | 392 | |
398 | 393 | arg.start_pfn = pfn; |
399 | 394 | arg.nr_pages = nr_pages; |
400 | 395 | |
... | ... | @@ -420,14 +415,14 @@ |
420 | 415 | * This means the page allocator ignores this zone. |
421 | 416 | * So, zonelist must be updated after online. |
422 | 417 | */ |
423 | - mutex_lock(&zone_pageset_mutex); | |
418 | + mutex_lock(&zonelists_mutex); | |
424 | 419 | if (!populated_zone(zone)) |
425 | 420 | need_zonelists_rebuild = 1; |
426 | 421 | |
427 | 422 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
428 | 423 | online_pages_range); |
429 | 424 | if (ret) { |
430 | - mutex_unlock(&zone_pageset_mutex); | |
425 | + mutex_unlock(&zonelists_mutex); | |
431 | 426 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", |
432 | 427 | nr_pages, pfn); |
433 | 428 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
... | ... | @@ -441,7 +436,7 @@ |
441 | 436 | else |
442 | 437 | zone_pcp_update(zone); |
443 | 438 | |
444 | - mutex_unlock(&zone_pageset_mutex); | |
439 | + mutex_unlock(&zonelists_mutex); | |
445 | 440 | setup_per_zone_wmarks(); |
446 | 441 | calculate_zone_inactive_ratio(zone); |
447 | 442 | if (onlined_pages) { |
mm/page_alloc.c
... | ... | @@ -2571,8 +2571,11 @@ |
2571 | 2571 | strncpy((char*)table->data, saved_string, |
2572 | 2572 | NUMA_ZONELIST_ORDER_LEN); |
2573 | 2573 | user_zonelist_order = oldval; |
2574 | - } else if (oldval != user_zonelist_order) | |
2574 | + } else if (oldval != user_zonelist_order) { | |
2575 | + mutex_lock(&zonelists_mutex); | |
2575 | 2576 | build_all_zonelists(NULL); |
2577 | + mutex_unlock(&zonelists_mutex); | |
2578 | + } | |
2576 | 2579 | } |
2577 | 2580 | out: |
2578 | 2581 | mutex_unlock(&zl_order_mutex); |
... | ... | @@ -2924,6 +2927,12 @@ |
2924 | 2927 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); |
2925 | 2928 | static void setup_zone_pageset(struct zone *zone); |
2926 | 2929 | |
2930 | +/* | |
2931 | + * Global mutex to protect against size modification of zonelists | |
2932 | + * as well as to serialize pageset setup for the new populated zone. | |
2933 | + */ | |
2934 | +DEFINE_MUTEX(zonelists_mutex); | |
2935 | + | |
2927 | 2936 | /* return values int ....just for stop_machine() */ |
2928 | 2937 | static __init_refok int __build_all_zonelists(void *data) |
2929 | 2938 | { |
... | ... | @@ -2967,6 +2976,10 @@ |
2967 | 2976 | return 0; |
2968 | 2977 | } |
2969 | 2978 | |
2979 | +/* | |
2980 | + * Called with zonelists_mutex held always | |
2981 | + * unless system_state == SYSTEM_BOOTING. | |
2982 | + */ | |
2970 | 2983 | void build_all_zonelists(void *data) |
2971 | 2984 | { |
2972 | 2985 | set_zonelist_order(); |