Commit 7f1290f2f2a4d2c3f1b7ce8e87256e052ca23125

Authored by Jianguo Wu
Committed by Linus Torvalds
1 parent 05106e6a54

mm: fix-up zone present pages

I think zone->present_pages indicates the pages that the buddy system can
manage; it should be:

	zone->present_pages = spanned pages - absent pages - bootmem pages,

but it is now:
	zone->present_pages = spanned pages - absent pages - memmap pages.

spanned pages: total size, including holes.
absent pages: holes.
bootmem pages: pages used during system boot, managed by the bootmem allocator.
memmap pages: pages used by page structs.
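
To make the difference concrete, here is a purely illustrative calculation
(the numbers are made up): for a zone spanning 262144 pages, of which 4096
are holes and 2048 were handed out by the bootmem allocator, the intended
accounting is

	present_pages = 262144 - 4096 - 2048 = 256000

whereas subtracting the memmap pages instead undercounts whenever the
memmap actually lives in a different zone.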

This may cause zone->present_pages to be less than it should be.  For
example, when NUMA node 1 has ZONE_NORMAL and ZONE_MOVABLE, its memmap and
other bootmem allocations are taken from ZONE_MOVABLE, so ZONE_NORMAL's
present_pages should be spanned pages - absent pages; but currently the
memmap pages are subtracted as well (in free_area_init_core()), even though
they are actually allocated from ZONE_MOVABLE.  When offlining all memory
of such a zone, zone->present_pages drops below 0, and because
present_pages is an unsigned long it wraps around to a very large integer.
That indirectly makes zone->watermark[WMARK_MIN] a large integer
(setup_per_zone_wmarks()), which in turn makes totalreserve_pages a large
integer (calculate_totalreserve_pages()), and finally causes memory
allocation to fail when forking a process (__vm_enough_memory()):

[root@localhost ~]# dmesg
-bash: fork: Cannot allocate memory
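
The wraparound itself is ordinary unsigned arithmetic; a minimal userspace
sketch (illustrative only, with made-up page counts, not kernel code) shows
the effect:

	#include <stdio.h>

	int main(void)
	{
		/* pretend accounting: fewer pages recorded than offlined */
		unsigned long present_pages = 512;

		present_pages -= 1024;	/* wraps instead of going negative */
		printf("present_pages = %lu\n", present_pages);
		/* prints 18446744073709551104 on a 64-bit system */
		return 0;
	}

Any watermark derived from a value like that is absurdly high, which is
what makes __vm_enough_memory() refuse the fork.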

I think the bug described in

  http://marc.info/?l=linux-mm&m=134502182714186&w=2

is also caused by wrong zone->present_pages accounting.

This patch fixes up zone->present_pages when memory is freed to the buddy
system, on the x86_64 and IA64 platforms.

Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Reported-by: Petr Tesarik <ptesarik@suse.cz>
Tested-by: Petr Tesarik <ptesarik@suse.cz>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

6 files changed, 58 insertions(+), 1 deletion(-)

--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -637,6 +637,7 @@
 
 	high_memory = __va(max_low_pfn * PAGE_SIZE);
 
+	reset_zone_present_pages();
 	for_each_online_pgdat(pgdat)
 		if (pgdat->bdata->node_bootmem_map)
 			totalram_pages += free_all_bootmem_node(pgdat);
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1684,6 +1684,10 @@
 static inline bool page_is_guard(struct page *page) { return false; }
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
+extern void reset_zone_present_pages(void);
+extern void fixup_zone_present_pages(int nid, unsigned long start_pfn,
+				unsigned long end_pfn);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@
 			int order = ilog2(BITS_PER_LONG);
 
 			__free_pages_bootmem(pfn_to_page(start), order);
+			fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
+					start, start + BITS_PER_LONG);
 			count += BITS_PER_LONG;
 			start += BITS_PER_LONG;
 		} else {
@@ -208,6 +210,9 @@
 				if (vec & 1) {
 					page = pfn_to_page(start + off);
 					__free_pages_bootmem(page, 0);
+					fixup_zone_present_pages(
+						page_to_nid(page),
+						start + off, start + off + 1);
 					count++;
 				}
 				vec >>= 1;
@@ -221,8 +226,11 @@
 	pages = bdata->node_low_pfn - bdata->node_min_pfn;
 	pages = bootmem_bootmap_pages(pages);
 	count += pages;
-	while (pages--)
+	while (pages--) {
+		fixup_zone_present_pages(page_to_nid(page),
+				page_to_pfn(page), page_to_pfn(page) + 1);
 		__free_pages_bootmem(page++, 0);
+	}
 
 	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@
 void __ref put_page_bootmem(struct page *page)
 {
 	unsigned long type;
+	struct zone *zone;
 
 	type = (unsigned long) page->lru.next;
 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,6 +117,12 @@
 		set_page_private(page, 0);
 		INIT_LIST_HEAD(&page->lru);
 		__free_pages_bootmem(page, 0);
+
+		zone = page_zone(page);
+		zone_span_writelock(zone);
+		zone->present_pages++;
+		zone_span_writeunlock(zone);
+		totalram_pages++;
 	}
 
 }
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,6 +116,8 @@
 		return 0;
 
 	__free_pages_memory(start_pfn, end_pfn);
+	fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
+			start_pfn, end_pfn);
 
 	return end_pfn - start_pfn;
 }
@@ -126,6 +128,7 @@
 	phys_addr_t start, end, size;
 	u64 i;
 
+	reset_zone_present_pages();
 	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
 		count += __free_memory_core(start, end);
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6087,4 +6087,38 @@
 	dump_page_flags(page->flags);
 	mem_cgroup_print_bad_page(page);
 }
+
+/* reset zone->present_pages */
+void reset_zone_present_pages(void)
+{
+	struct zone *z;
+	int i, nid;
+
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			z = NODE_DATA(nid)->node_zones + i;
+			z->present_pages = 0;
+		}
+	}
+}
+
+/* calculate zone's present pages in buddy system */
+void fixup_zone_present_pages(int nid, unsigned long start_pfn,
+			      unsigned long end_pfn)
+{
+	struct zone *z;
+	unsigned long zone_start_pfn, zone_end_pfn;
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		z = NODE_DATA(nid)->node_zones + i;
+		zone_start_pfn = z->zone_start_pfn;
+		zone_end_pfn = zone_start_pfn + z->spanned_pages;
+
+		/* if the two regions intersect */
+		if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
+			z->present_pages += min(end_pfn, zone_end_pfn) -
+				max(start_pfn, zone_start_pfn);
+	}
+}
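
For reference, the interval-overlap arithmetic used by
fixup_zone_present_pages() above can be checked in isolation.  The
following standalone userspace sketch mirrors that logic; the overlap()
helper and the PFN values are hypothetical, for illustration only:

	#include <stdio.h>

	/*
	 * Count how many PFNs of [start_pfn, end_pfn) fall inside the
	 * zone span [zone_start_pfn, zone_end_pfn) -- the same disjoint
	 * test and clamping as fixup_zone_present_pages().
	 */
	static unsigned long overlap(unsigned long start_pfn,
				     unsigned long end_pfn,
				     unsigned long zone_start_pfn,
				     unsigned long zone_end_pfn)
	{
		if (zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)
			return 0;	/* disjoint: nothing to account */
		return (end_pfn < zone_end_pfn ? end_pfn : zone_end_pfn) -
		       (start_pfn > zone_start_pfn ? start_pfn : zone_start_pfn);
	}

	int main(void)
	{
		/* freed range [100, 300) vs. a zone spanning [200, 1000) */
		printf("%lu\n", overlap(100, 300, 200, 1000));	/* 100 */
		return 0;
	}

Only the clamped intersection is credited to the zone, so a freed range
straddling a zone boundary bumps each zone's present_pages by exactly the
pages that lie inside its span.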